From c6ed007e4108759323cd43e520214c64de6f1b09 Mon Sep 17 00:00:00 2001 From: Zena Lapp Date: Mon, 20 Nov 2023 11:17:15 -0500 Subject: [PATCH] Add step-by-step guide and additional examples and checks resolves #7 --- R/bistro.R | 16 +++-- R/calc_allele_freqs.R | 17 +++-- R/calc_log10_lrs.R | 59 +++++++++++++++++- R/checks.R | 5 +- R/create_db_from_bloodmeals.R | 23 +++++-- R/identify_matches.R | 29 ++++++++- R/match_exact.R | 7 ++- R/match_similarity.R | 6 +- R/preprocess_data.R | 30 +++++++-- _pkgdown.yml | 2 + man/calc_allele_freqs.Rd | 9 ++- man/calc_log10_lrs.Rd | 18 +++++- man/create_db_from_bloodmeals.Rd | 9 ++- man/identify_matches.Rd | 18 +++++- man/prep_bloodmeal_profiles.Rd | 10 ++- man/prep_human_profiles.Rd | 10 ++- tests/testthat/_snaps/calc_log10_lrs.md | 2 +- tests/testthat/test-calc_log10_lrs.R | 3 +- vignettes/bistro.Rmd | 6 ++ vignettes/step-by-step.Rmd | 83 +++++++++++++++++++++++++ 20 files changed, 319 insertions(+), 43 deletions(-) create mode 100644 vignettes/step-by-step.Rmd diff --git a/R/bistro.R b/R/bistro.R index ab921b9..5a3c9d4 100644 --- a/R/bistro.R +++ b/R/bistro.R @@ -128,7 +128,10 @@ bistro <- ) if (calc_allele_freqs) { - pop_allele_freqs <- calc_allele_freqs(human_profiles, rm_markers) + pop_allele_freqs <- calc_allele_freqs(human_profiles, + rm_markers, + check_inputs = FALSE + ) } else if (!is.null(rm_markers)) { pop_allele_freqs <- pop_allele_freqs |> dplyr::select(-dplyr::matches(paste0("^", toupper(rm_markers), "$"))) @@ -139,7 +142,8 @@ bistro <- bloodmeal_profiles, bloodmeal_ids, peak_thresh, - rm_markers + rm_markers, + check_inputs = FALSE ) message("Formatting human profiles") @@ -147,7 +151,8 @@ bistro <- human_profiles, human_ids, rm_twins, - rm_markers + rm_markers, + check_inputs = FALSE ) bm_markers <- bloodmeal_profiles$Marker @@ -180,12 +185,13 @@ bistro <- difftol, threads, seed, - time_limit + time_limit, + check_inputs = FALSE ) message("Identifying matches") - matches <- identify_matches(log10_lrs, bloodmeal_ids) + matches <- identify_matches(log10_lrs, bloodmeal_ids, check_inputs = FALSE) if (return_lrs) { matches <- list( diff --git a/R/calc_allele_freqs.R b/R/calc_allele_freqs.R index 7a66360..1e4813c 100644 --- a/R/calc_allele_freqs.R +++ b/R/calc_allele_freqs.R @@ -2,6 +2,8 @@ #' #' @description Calculate allele frequencies for a (generally human) population. #' +#' @param check_inputs A boolean indicating whether or not to check the +#' inputs to the function. Default: TRUE #' @inheritParams bistro #' #' @return A tibble where the first column is the STR allele and the following @@ -9,11 +11,16 @@ #' that do not exist for a given marker are coded as NA. #' #' @export -#' @keywords internal -calc_allele_freqs <- function(human_profiles, rm_markers = NULL) { - # check if expected columns are present - check_colnames(human_profiles, c("SampleName", "Marker", "Allele")) - check_ids(rm_markers, "rm_markers") +#' @examples +#' calc_allele_freqs(human_profiles) +calc_allele_freqs <- function(human_profiles, + rm_markers = NULL, + check_inputs = TRUE) { + if (check_inputs) { + # check if expected columns are present + check_colnames(human_profiles, c("SampleName", "Marker", "Allele")) + check_ids(rm_markers, "rm_markers") + } if (!is.null(rm_markers)) { human_profiles <- human_profiles |> diff --git a/R/calc_log10_lrs.R b/R/calc_log10_lrs.R index a3e8bb9..41721bb 100644 --- a/R/calc_log10_lrs.R +++ b/R/calc_log10_lrs.R @@ -122,6 +122,7 @@ calc_one_log10_lr <- #' `prep_bloodmeal_profiles()` and `prep_human_profiles()` first. #' #' @inheritParams bistro +#' @inheritParams calc_allele_freqs #' #' @return A tibble with the same output as for [bistro()], except there is no #' match column and every bloodmeal-human pair with the calculated log10_lr is @@ -130,7 +131,16 @@ calc_one_log10_lr <- #' value of 3. #' #' @export -#' @keywords internal +#' @examples +#' bm_profs <- prep_bloodmeal_profiles(bloodmeal_profiles, peak_thresh = 200) +#' hu_profs <- prep_human_profiles(human_profiles) +#' log10_lrs <- calc_log10_lrs(bm_profs, +#' hu_profs, +#' bloodmeal_ids = "evid1", +#' pop_allele_freqs = pop_allele_freqs, +#' kit = "ESX17", +#' peak_thresh = 200 +#' ) calc_log10_lrs <- function(bloodmeal_profiles, human_profiles, @@ -145,7 +155,52 @@ calc_log10_lrs <- difftol = 1, threads = 4, seed = 1, - time_limit = 3) { + time_limit = 3, + check_inputs = TRUE) { + if (check_inputs) { + check_colnames( + bloodmeal_profiles, + c("SampleName", "Marker", "Allele", "Height") + ) + check_colnames(human_profiles, c("SampleName", "Marker", "Allele")) + check_ids(bloodmeal_ids, "bloodmeal_ids") + check_ids(human_ids, "human_ids") + + kit_df <- check_kit(kit) + + bm_prof_markers <- bloodmeal_profiles$Marker |> + unique() |> + toupper() + hu_prof_markers <- human_profiles$Marker |> + unique() |> + toupper() + kit_markers <- kit_df$Marker |> + unique() |> + toupper() + + check_setdiff_markers( + bm_prof_markers, + kit_markers, + "bloodmeal_profiles", + "kit" + ) + check_setdiff_markers( + hu_prof_markers, + kit_markers, + "human_profiles", + "kit" + ) + + check_peak_thresh(peak_thresh) + check_is_bool(model_degrad, "model_degrad") + check_is_bool(model_bw_stutt, "model_bw_stutt") + check_is_bool(model_fw_stutt, "model_fw_stutt") + check_is_numeric(difftol, "difftol", pos = TRUE) + check_is_numeric(threads, "threads", pos = TRUE) + check_is_numeric(seed, "seed") + check_is_numeric(time_limit, "time_limit", pos = TRUE) + } + if (is.null(bloodmeal_ids)) { bloodmeal_ids <- unique(bloodmeal_profiles$SampleName) } diff --git a/R/checks.R b/R/checks.R index f895969..4e69a6f 100644 --- a/R/checks.R +++ b/R/checks.R @@ -299,7 +299,10 @@ check_heights <- function(heights, peak_thresh) { #' @return number of alleles in a complete profile #' #' @keywords internal -check_create_db_input <- function(bloodmeal_profiles, kit, peak_thresh, rm_markers) { +check_create_db_input <- function(bloodmeal_profiles, + kit, + peak_thresh, + rm_markers) { check_colnames( bloodmeal_profiles, c("SampleName", "Marker", "Allele") diff --git a/R/create_db_from_bloodmeals.R b/R/create_db_from_bloodmeals.R index 3c1fde9..07f01d5 100644 --- a/R/create_db_from_bloodmeals.R +++ b/R/create_db_from_bloodmeals.R @@ -3,14 +3,17 @@ #' @inheritParams bistro #' #' @return Human database created from complete single-source bloodmeals. -#' Complete is defined as the number of markers in the kit minus the number of markers in `rm_markers`. +#' Complete is defined as the number of markers in the kit minus the +#' number of markers in `rm_markers`. #' @export #' #' @examples #' \dontrun{ #' # load example data -#' path_to_data <- paste0("https://raw.githubusercontent.com/duke-malaria-collaboratory/", -#' "bistro_validation/main/data/provedit/provedit_samples_mass200thresh.csv") +#' path_to_data <- paste0( +#' "https://raw.githubusercontent.com/duke-malaria-collaboratory/", +#' "bistro_validation/main/data/provedit/provedit_samples_mass200thresh.csv" +#' ) #' samples <- readr::read_csv(path_to_data) #' create_db_from_bloodmeals(samples, kit = "identifiler", peak_thresh = 200) #' } @@ -20,7 +23,12 @@ create_db_from_bloodmeals <- function(bloodmeal_profiles, peak_thresh, rm_markers = c("AMEL")) { check_pkg_version("tidyr", utils::packageVersion("tidyr"), "1.3.0") - n_markers_in_kit <- check_create_db_input(bloodmeal_profiles, kit, peak_thresh, rm_markers) + n_markers_in_kit <- check_create_db_input( + bloodmeal_profiles, + kit, + peak_thresh, + rm_markers + ) bloodmeal_profiles |> filter_peaks(peak_thresh = peak_thresh) |> rm_markers(markers = rm_markers) |> @@ -37,7 +45,12 @@ create_db_from_bloodmeals <- function(bloodmeal_profiles, dplyr::mutate(SampleName = paste0("H", as.numeric(factor(SampleName)))) |> dplyr::arrange(Marker, Allele) |> dplyr::group_by(SampleName) |> - dplyr::summarize(all_alleles = stringr::str_c(paste0(Marker, "__", Allele), collapse = ";")) |> + dplyr::summarize( + all_alleles = + stringr::str_c(paste0(Marker, "__", Allele), + collapse = ";" + ) + ) |> dplyr::group_by(all_alleles) |> dplyr::summarize() |> dplyr::mutate(SampleName = paste0("H", dplyr::row_number()), .before = 1) |> diff --git a/R/identify_matches.R b/R/identify_matches.R index bcaa3ed..5151e6e 100644 --- a/R/identify_matches.R +++ b/R/identify_matches.R @@ -133,12 +133,37 @@ identify_one_match_set <- function(log10_lrs, bloodmeal_id) { #' @param log10_lrs Output from [calc_log10_lrs()] or from `bistro` with #' `return_lrs = TRUE` (`lrs`: the second element in the list) #' @inheritParams bistro +#' @inheritParams calc_allele_freqs #' #' @return A tibble with the same output as for [bistro()]. #' #' @export -#' @keywords internal -identify_matches <- function(log10_lrs, bloodmeal_ids = NULL) { +#' @examples +#' bm_profs <- prep_bloodmeal_profiles(bloodmeal_profiles, peak_thresh = 200) +#' hu_profs <- prep_human_profiles(human_profiles) +#' log10_lrs <- calc_log10_lrs(bm_profs, +#' hu_profs, +#' bloodmeal_ids = "evid1", +#' pop_allele_freqs = pop_allele_freqs, +#' kit = "ESX17", +#' peak_thresh = 200 +#' ) +#' matches <- identify_matches(log10_lrs) +identify_matches <- function(log10_lrs, + bloodmeal_ids = NULL, + check_inputs = TRUE) { + if (check_inputs) { + check_colnames( + log10_lrs, + c( + "bloodmeal_id", "human_id", + "locus_count", "est_noc", "efm_noc", + "log10_lr", "notes" + ) + ) + check_ids(bloodmeal_ids, "bloodmeal_ids") + } + if (is.null(bloodmeal_ids)) { bloodmeal_ids <- unique(log10_lrs$bloodmeal_id) } diff --git a/R/match_exact.R b/R/match_exact.R index deafe22..0d31efa 100644 --- a/R/match_exact.R +++ b/R/match_exact.R @@ -41,14 +41,17 @@ match_exact <- function(bloodmeal_profiles, bloodmeal_profiles, bloodmeal_ids, peak_thresh, - rm_markers = NULL + rm_markers = NULL, + check_heights = FALSE, + check_inputs = FALSE ) human_profiles <- prep_human_profiles( human_profiles, human_ids, rm_twins, - rm_markers = NULL + rm_markers = NULL, + check_inputs = FALSE ) # human profiles diff --git a/R/match_similarity.R b/R/match_similarity.R index 80d6beb..3b10ba1 100644 --- a/R/match_similarity.R +++ b/R/match_similarity.R @@ -56,7 +56,8 @@ match_similarity <- function(bloodmeal_profiles, bloodmeal_ids, peak_thresh, rm_markers = rm_markers, - check_heights = FALSE + check_heights = FALSE, + check_inputs = FALSE ) all_bm_ids <- unique(bloodmeal_profiles$SampleName) @@ -68,7 +69,8 @@ match_similarity <- function(bloodmeal_profiles, human_profiles, human_ids, rm_twins, - rm_markers = rm_markers + rm_markers = rm_markers, + check_inputs = FALSE ) |> tidyr::drop_na() diff --git a/R/preprocess_data.R b/R/preprocess_data.R index 98140a2..ba4d842 100644 --- a/R/preprocess_data.R +++ b/R/preprocess_data.R @@ -5,15 +5,28 @@ #' @param check_heights A boolean indicating whether to check if all peak #' heights are below the threshold. Default: TRUE #' @inheritParams bistro +#' @inheritParams calc_allele_freqs #' #' @return Dataframe with preprocessed bloodmeal profiles #' @export -#' @keywords internal +#' @examples +#' prep_bloodmeal_profiles(bloodmeal_profiles, peak_thresh = 200) prep_bloodmeal_profiles <- function(bloodmeal_profiles, bloodmeal_ids = NULL, peak_thresh = NULL, rm_markers = c("AMEL"), - check_heights = TRUE) { + check_heights = TRUE, + check_inputs = TRUE) { + if (check_inputs) { + check_colnames( + bloodmeal_profiles, + c("SampleName", "Marker", "Allele", "Height") + ) + check_ids(bloodmeal_ids, "bloodmeal_ids") + check_peak_thresh(peak_thresh) + check_is_bool(check_heights, "check_heights") + } + if (is.null(bloodmeal_ids)) { bloodmeal_ids <- unique(bloodmeal_profiles$SampleName) } else { @@ -41,15 +54,24 @@ prep_bloodmeal_profiles <- function(bloodmeal_profiles, #' #' Removes duplicates and optionally twins, subsets ids #' +#' @inheritParams calc_allele_freqs #' @inheritParams bistro #' #' @return Dataframe with preprocessed human profiles #' @export -#' @keywords internal +#' @examples +#' prep_human_profiles(human_profiles) prep_human_profiles <- function(human_profiles, human_ids = NULL, rm_twins = TRUE, - rm_markers = c("AMEL")) { + rm_markers = c("AMEL"), + check_inputs = TRUE) { + if (check_inputs) { + check_colnames(human_profiles, c("SampleName", "Marker", "Allele")) + check_ids(human_ids, "bloodmeal_ids") + check_is_bool(rm_twins, "rm_twins") + } + if (rm_twins) { human_profiles <- rm_twins(human_profiles) } diff --git a/_pkgdown.yml b/_pkgdown.yml index 3da2588..49f91cb 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -32,6 +32,8 @@ reference: Sub-functions used by `bistro()` contents: - calc_allele_freqs + - prep_bloodmeal_profiles + - prep_human_profiles - rm_dups - rm_twins - filter_peaks diff --git a/man/calc_allele_freqs.Rd b/man/calc_allele_freqs.Rd index ad907ee..8bd666b 100644 --- a/man/calc_allele_freqs.Rd +++ b/man/calc_allele_freqs.Rd @@ -4,7 +4,7 @@ \alias{calc_allele_freqs} \title{Calculate allele frequencies} \usage{ -calc_allele_freqs(human_profiles, rm_markers = NULL) +calc_allele_freqs(human_profiles, rm_markers = NULL, check_inputs = TRUE) } \arguments{ \item{human_profiles}{Tibble or data frame with alleles for all humans in @@ -14,6 +14,9 @@ reference database including three columns: SampleName, Marker, Allele.} calculating log10LRs. NULL to include all markers. By default, for the bistro function AMEL is removed as it is not standard to include it in LR calculations.} + +\item{check_inputs}{A boolean indicating whether or not to check the +inputs to the function. Default: TRUE} } \value{ A tibble where the first column is the STR allele and the following @@ -23,4 +26,6 @@ that do not exist for a given marker are coded as NA. \description{ Calculate allele frequencies for a (generally human) population. } -\keyword{internal} +\examples{ +calc_allele_freqs(human_profiles) +} diff --git a/man/calc_log10_lrs.Rd b/man/calc_log10_lrs.Rd index 941fdf9..c165bca 100644 --- a/man/calc_log10_lrs.Rd +++ b/man/calc_log10_lrs.Rd @@ -18,7 +18,8 @@ calc_log10_lrs( difftol = 1, threads = 4, seed = 1, - time_limit = 3 + time_limit = 3, + check_inputs = TRUE ) } \arguments{ @@ -76,6 +77,9 @@ argument. Default: 1} \item{time_limit}{Time limit in minutes to run the \code{\link[euroformix:contLikSearch]{euroformix::contLikSearch()}} function on 1 bloodmeal-human pair. Default: 3} + +\item{check_inputs}{A boolean indicating whether or not to check the +inputs to the function. Default: TRUE} } \value{ A tibble with the same output as for \code{\link[=bistro]{bistro()}}, except there is no @@ -90,4 +94,14 @@ data. If you would like to preprocess it in the same way as is performed internally in the \code{bistro()} function, you must run \code{prep_bloodmeal_profiles()} and \code{prep_human_profiles()} first. } -\keyword{internal} +\examples{ +bm_profs <- prep_bloodmeal_profiles(bloodmeal_profiles, peak_thresh = 200) +hu_profs <- prep_human_profiles(human_profiles) +log10_lrs <- calc_log10_lrs(bm_profs, + hu_profs, + bloodmeal_ids = "evid1", + pop_allele_freqs = pop_allele_freqs, + kit = "ESX17", + peak_thresh = 200 +) +} diff --git a/man/create_db_from_bloodmeals.Rd b/man/create_db_from_bloodmeals.Rd index bebccfd..c3c6ae4 100644 --- a/man/create_db_from_bloodmeals.Rd +++ b/man/create_db_from_bloodmeals.Rd @@ -32,7 +32,8 @@ calculations.} } \value{ Human database created from complete single-source bloodmeals. -Complete is defined as the number of markers in the kit minus the number of markers in \code{rm_markers}. +Complete is defined as the number of markers in the kit minus the +number of markers in \code{rm_markers}. } \description{ Create human profile database from bloodmeal profiles @@ -40,8 +41,10 @@ Create human profile database from bloodmeal profiles \examples{ \dontrun{ # load example data -path_to_data <- paste0("https://raw.githubusercontent.com/duke-malaria-collaboratory/", -"bistro_validation/main/data/provedit/provedit_samples_mass200thresh.csv") +path_to_data <- paste0( + "https://raw.githubusercontent.com/duke-malaria-collaboratory/", + "bistro_validation/main/data/provedit/provedit_samples_mass200thresh.csv" +) samples <- readr::read_csv(path_to_data) create_db_from_bloodmeals(samples, kit = "identifiler", peak_thresh = 200) } diff --git a/man/identify_matches.Rd b/man/identify_matches.Rd index c29d9b2..949e022 100644 --- a/man/identify_matches.Rd +++ b/man/identify_matches.Rd @@ -4,7 +4,7 @@ \alias{identify_matches} \title{Identify matches for multiple bloodmeal-human pairs} \usage{ -identify_matches(log10_lrs, bloodmeal_ids = NULL) +identify_matches(log10_lrs, bloodmeal_ids = NULL, check_inputs = TRUE) } \arguments{ \item{log10_lrs}{Output from \code{\link[=calc_log10_lrs]{calc_log10_lrs()}} or from \code{bistro} with @@ -13,6 +13,9 @@ identify_matches(log10_lrs, bloodmeal_ids = NULL) \item{bloodmeal_ids}{Vector of bloodmeal ids from the SampleName column in \code{bloodmeal_profiles} for which to compute log10_lrs. If NULL, all ids in the input dataframe will be used. Default: NULL} + +\item{check_inputs}{A boolean indicating whether or not to check the +inputs to the function. Default: TRUE} } \value{ A tibble with the same output as for \code{\link[=bistro]{bistro()}}. @@ -20,4 +23,15 @@ A tibble with the same output as for \code{\link[=bistro]{bistro()}}. \description{ Identify matches for multiple bloodmeal-human pairs } -\keyword{internal} +\examples{ +bm_profs <- prep_bloodmeal_profiles(bloodmeal_profiles, peak_thresh = 200) +hu_profs <- prep_human_profiles(human_profiles) +log10_lrs <- calc_log10_lrs(bm_profs, + hu_profs, + bloodmeal_ids = "evid1", + pop_allele_freqs = pop_allele_freqs, + kit = "ESX17", + peak_thresh = 200 +) +matches <- identify_matches(log10_lrs) +} diff --git a/man/prep_bloodmeal_profiles.Rd b/man/prep_bloodmeal_profiles.Rd index 7e41eb0..1c6c35d 100644 --- a/man/prep_bloodmeal_profiles.Rd +++ b/man/prep_bloodmeal_profiles.Rd @@ -9,7 +9,8 @@ prep_bloodmeal_profiles( bloodmeal_ids = NULL, peak_thresh = NULL, rm_markers = c("AMEL"), - check_heights = TRUE + check_heights = TRUE, + check_inputs = TRUE ) } \arguments{ @@ -33,6 +34,9 @@ calculations.} \item{check_heights}{A boolean indicating whether to check if all peak heights are below the threshold. Default: TRUE} + +\item{check_inputs}{A boolean indicating whether or not to check the +inputs to the function. Default: TRUE} } \value{ Dataframe with preprocessed bloodmeal profiles @@ -40,4 +44,6 @@ Dataframe with preprocessed bloodmeal profiles \description{ Removes duplicates and peaks below threshold, subsets ids } -\keyword{internal} +\examples{ +prep_bloodmeal_profiles(bloodmeal_profiles, peak_thresh = 200) +} diff --git a/man/prep_human_profiles.Rd b/man/prep_human_profiles.Rd index e8e1ce0..0dfa93e 100644 --- a/man/prep_human_profiles.Rd +++ b/man/prep_human_profiles.Rd @@ -8,7 +8,8 @@ prep_human_profiles( human_profiles, human_ids = NULL, rm_twins = TRUE, - rm_markers = c("AMEL") + rm_markers = c("AMEL"), + check_inputs = TRUE ) } \arguments{ @@ -27,6 +28,9 @@ matches. Default: TRUE} calculating log10LRs. NULL to include all markers. By default, for the bistro function AMEL is removed as it is not standard to include it in LR calculations.} + +\item{check_inputs}{A boolean indicating whether or not to check the +inputs to the function. Default: TRUE} } \value{ Dataframe with preprocessed human profiles @@ -34,4 +38,6 @@ Dataframe with preprocessed human profiles \description{ Removes duplicates and optionally twins, subsets ids } -\keyword{internal} +\examples{ +prep_human_profiles(human_profiles) +} diff --git a/tests/testthat/_snaps/calc_log10_lrs.md b/tests/testthat/_snaps/calc_log10_lrs.md index bdc9da3..f2fba57 100644 --- a/tests/testthat/_snaps/calc_log10_lrs.md +++ b/tests/testthat/_snaps/calc_log10_lrs.md @@ -61,7 +61,7 @@ Code calc_log10_lrs(bm_profs, hu_profs, pop_allele_freqs = pop_allele_freqs, kit = "ESX17", - peak_thresh = 200) + peak_thresh = 200, check_inputs = FALSE) Message # bloodmeal ids: 2 # human ids: 1 diff --git a/tests/testthat/test-calc_log10_lrs.R b/tests/testthat/test-calc_log10_lrs.R index 0716183..5803db6 100644 --- a/tests/testthat/test-calc_log10_lrs.R +++ b/tests/testthat/test-calc_log10_lrs.R @@ -76,7 +76,8 @@ test_that("calc_log10_lrs works", { hu_profs, pop_allele_freqs = pop_allele_freqs, kit = "ESX17", - peak_thresh = 200 + peak_thresh = 200, + check_inputs = FALSE ) ) }) diff --git a/vignettes/bistro.Rmd b/vignettes/bistro.Rmd index 2568d2f..ac36b3c 100644 --- a/vignettes/bistro.Rmd +++ b/vignettes/bistro.Rmd @@ -25,6 +25,10 @@ A core part of the algorithm is the [`contLikSearch()`](https://github.com/oyvbl bistro uses the log10LRs to identify bloodmeal-human matches using a per-bloodmeal dynamic threshold approach. +## Running bistro in parallel + +First, if you want to run bistro on many samples and would prefer to run it in parallel, potentially on a cluster, check out our template [bistro Snakemake pipeline](https://github.com/duke-malaria-collaboratory/bistro_pipeline). + ## Installing the `bistro` R package To install `bistro`, run the following commands: @@ -262,4 +266,6 @@ bistro(samples, hu_db, "identifiler", 200, ``` +## Running bistro step-by-step +If you're interested in learning how to run sub-functions of `bistro()` individually, head over to `vignette("step-by-step")`. diff --git a/vignettes/step-by-step.Rmd b/vignettes/step-by-step.Rmd new file mode 100644 index 0000000..ab637c2 --- /dev/null +++ b/vignettes/step-by-step.Rmd @@ -0,0 +1,83 @@ +--- +title: "Run bistro step-by-step" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Run bistro step-by-step} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(bistro) +``` + +If you haven't yet, please check out `vignette("bistro")` before reading this vignette. It will provide you with more background and context, and take you through the outputs in more detail than we do here. Then if you want to learn how to run sub-functions of bistro individually, or run bistro step-by-step, you're in the right place. + +We will go through the following steps to run bistro step-by-step using the example data provided in the `bistro` package: + +- `calc_allele_freqs()` +- `prep_bloodmeal_profiles()` +- `prep_human_profiles()` +- `calc_log10_lrs()` +- `identify_matches()` + +## Calculate human population allele frequencies + +If needed, you can first calculate population allele frequencies from your human profile database: + +```{r} +pop_freqs_computed <- calc_allele_freqs(human_profiles) +``` + +We will use the built-in population allele frequencies here since they are more accurate than computing allele frequencies for only the three example profiles in `human_profiles`. + +## Prepare input STR profiles + +Before calculating log10LRs, you must ensure that the bloodmeal and human profiles are prepared correctly. You can do this using `prep_bloodmeal_profiles()` and `prep_human_profiles()`, respectively. + +Both functions: + +- Removes markers the user does not want to use +- Removes duplicate rows to ensure that homozygous alleles are only included once + +`prep_bloodmeal_profiles()` also optionally filters peakes below a user-defined threshold + +`prep_human_profiles()` also optionally removes identical twins (which cannot be resolved) + +```{r} +bm_profs <- prep_bloodmeal_profiles(bloodmeal_profiles, peak_thresh = 200) +hu_profs <- prep_human_profiles(human_profiles) +``` + + +## Calculate log10LRs for bloodmeal-human pairs + +Next, you can use the prepared profiles to compute log10LRs for each bloodmeal-human pair (or a subset of pairs). Note that to identify matches, you will have to have the log10LRs between each bloodmeal and all human profiles in the reference database. + +Here is an example of how to compute the log10LRs for one bloodmeal and all humans in the database: + +```{r} +log10_lrs <- calc_log10_lrs(bm_profs, + hu_profs, + bloodmeal_ids = "evid1", + pop_allele_freqs = pop_allele_freqs, + kit = "ESX17", + peak_thresh = 200 +) +``` + +## Identify bloodmeal-human matches + +Next, we can identify matches to the bloodmeal: + +```{r} +matches <- identify_matches(log10_lrs) +``` +