Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
traversc committed Jan 26, 2024
1 parent c1f0572 commit e5f5ace
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 16 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ SystemRequirements: GNU make
LinkingTo:
Rcpp, RcppParallel, BH
Imports:
Rcpp (>= 0.12.18.3), RcppParallel (>= 5.1.3), R6
Rcpp (>= 0.12.18.3), RcppParallel (>= 5.1.3), R6, rlang, dplyr, stringi
Suggests:
knitr, rmarkdown, stringdist, qs, dplyr, Biostrings, igraph, ggplot2, stringi
knitr, rmarkdown, stringdist, qs, Biostrings, igraph, ggplot2
VignetteBuilder: knitr
RoxygenNote: 7.2.3
Roxygen: list(markdown = TRUE)
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
importFrom(Rcpp,sourceCpp)
importFrom(RcppParallel, RcppParallelLibs)
importFrom(R6, R6Class)
importFrom(rlang, .data)
useDynLib(seqtrie, .registration=TRUE)
export("RadixTree")
export("RadixForest")
export("dist_matrix")
export("dist_pairwise")
export("dist_search")
export("split_search")
export("generate_cost_matrix")
21 changes: 11 additions & 10 deletions R/RadixTree_search_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@ dist_search <- function(query, target, max_distance = NULL, max_fraction = NULL,
#' # target1 AAGACCTAA CC
#' # query2 GGGTGTAA CCACCC
#' # target2 GGTGTAA CCAC
#' # Despite having different frames, query1 and query2 and clearly match to target1 and target2, respectively.
#' # One could consider splitting based on a common core sequence, e.g. a common TAA stop codon.
#' # Despite having different frames, query1 and query2 clearly
#' # match target1 and target2, respectively.
#' # One could consider splitting based on a common core sequence,
#' # e.g. a common TAA stop codon.
#' split_search(query=c( "AGACCTAACCC", "GGGTGTAACCACCC"),
#' target=c("AAGACCTAACC", "GGTGTAACCAC"),
#' query_split=c(8, 8),
Expand All @@ -89,9 +91,9 @@ split_search <- function(query, target, query_split, target_split, edge_trim = 0

# Search for similar sequences between lefts and rights
left_matches <- left_tree$search(unique(query_left), max_distance = max_distance, mode = "anchored", ...)
left_matches <- dplyr::rename(left_matches, query_left=query, target_left=target)
left_matches <- dplyr::rename(left_matches, query_left=.data$query, target_left=.data$target)
right_matches <- right_tree$search(unique(query_right), max_distance = max_distance, mode = "anchored", ...)
right_matches <- dplyr::rename(right_matches, query_right=query, target_right=target)
right_matches <- dplyr::rename(right_matches, query_right=.data$query, target_right=.data$target)

# If either left or right finds no matches, return empty dataframe
if(nrow(left_matches) == 0 || nrow(right_matches) == 0) {
Expand All @@ -101,9 +103,9 @@ split_search <- function(query, target, query_split, target_split, edge_trim = 0
# construct map of full sequence to left and right
# keep only potential matches, i.e. queries or targets whose left and right halves appear in both the left_matches and right_matches data frames
df_query <- data.frame(query, query_left, query_right)
df_query <- dplyr::filter(df_query, query_left %in% left_matches$query_left, query_right %in% right_matches$query_right)
df_query <- dplyr::filter(df_query, .data$query_left %in% left_matches$query_left, .data$query_right %in% right_matches$query_right)
df_target <- data.frame(target, target_left, target_right)
df_target <- dplyr::filter(df_target, target_left %in% left_matches$target_left, target_right %in% right_matches$target_right)
df_target <- dplyr::filter(df_target, .data$target_left %in% left_matches$target_left, .data$target_right %in% right_matches$target_right)

# Join results together, append full query and target sequences to left and right matches
left_matches <- dplyr::inner_join(left_matches, df_query, by = "query_left")
Expand All @@ -112,8 +114,7 @@ split_search <- function(query, target, query_split, target_split, edge_trim = 0
right_matches <- dplyr::inner_join(right_matches, df_target, by = "target_right")

results <- dplyr::inner_join(left_matches, right_matches, by = c("query", "target"), suffix=c(".left", ".right"))
results <- dplyr::mutate(results, distance = distance.left + distance.right)
results <- dplyr::filter(results, distance <= max_distance)
results <- dplyr::select(results, query, target, distance)
as.data.frame(results)
results <- dplyr::mutate(results, distance = .data$distance.left + .data$distance.right)
results <- dplyr::filter(results, .data$distance <= max_distance)
as.data.frame(results[c("query", "target", "distance")])
}
6 changes: 4 additions & 2 deletions man/split_search.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions vignettes/vignette.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ knitr::opts_chunk$set(dpi=96,fig.width=6.5)
### Basic usage

```{r, basic_usage, eval=FALSE}
results <- dist_search(strings1, strings2, max_distance=2, nthreads = 1)
results <- dist_search(x, y, max_distance = 2, nthreads = 1)
```

The above code will find all similar sequences between `strings1` and `strings2`.
The above code will find all similar sequences/strings between `x` and `y`.
This will generally be significantly faster than calculating pairwise distance or
pairwise alignment.

Expand Down

0 comments on commit e5f5ace

Please sign in to comment.