Skip to content

Commit

Permalink
add dlabel and bc_contamination
Browse files Browse the repository at this point in the history
  • Loading branch information
Elias authored and Elias committed Mar 19, 2024
1 parent ee4acf4 commit 7aa5188
Show file tree
Hide file tree
Showing 5 changed files with 288 additions and 2 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: kungfu
Title: kungfu - kicks stubborn data into shape
Version: 0.2.2
Version: 0.2.3
Authors@R:
person(given = "Johannes",
family = "Elias",
Expand Down Expand Up @@ -28,4 +28,4 @@ Imports: DBI (>= 1.1.1),
tidyr (>= 1.1.3),
utils (>= 4.0.4)
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(bc_contamination)
export(cleaned)
export(cleaner)
export(common_join)
export(df_pattern_locate)
export(df_pattern_subset)
export(dfilter)
export(dlabel)
export(is_clean)
export(is_seamless)
export(killConnections)
Expand All @@ -20,6 +22,7 @@ import(dplyr)
import(fuzzyjoin)
import(lubridate)
import(nycflights13)
import(rlang)
import(utils)
importFrom(dplyr,"%>%")
importFrom(dplyr,arrange)
Expand Down
165 changes: 165 additions & 0 deletions R/dlabel.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#' @name dlabel
#' @title dlabel
#'
#' @description
#' This function adds a label to a data frame based on the distance between records as
#' defined by the `distance` column and calculated by \link[kungfu]{dfilter}. The label
#' is a boolean stating whether the distance between records is within the specified
#' range.
#'
#' @param df A data frame.
#' @param id A character string specifying the column name of the id.
#' @param category A character string specifying the column name of the category.
#' @param distance A character string specifying the column name of the distance column; see \link[kungfu]{dfilter}
#' for details.
#' @param min.dist A numeric value specifying the minimum distance between records; see \link[kungfu]{dfilter}
#' for details.
#' @param max.dist A numeric value specifying the maximum distance between records; see \link[kungfu]{dfilter}
#' for details.
#' @param temporal.unit A character string specifying the temporal unit of the distance; see \link[kungfu]{dfilter}
#' for details.
#' @param label_name A character string specifying the name of the label column.
#' @param invert A logical value specifying whether to invert the label.
#' @param df_filter A character string specifying the filter expression to be applied to the data frame before
#' the distance label is calculated.
#'
#' @import dplyr
#' @import rlang
#'
#' @return A data frame with the distance label added.
#'
#' @export
#'
#' @examples
#' # create test data
#' set.seed(123)
#' dl.test <- data.frame(id = sample(1:10, 30, replace = TRUE),
#' category = sample(letters[1:4], 30, replace = TRUE),
#' timestamp = as.POSIXct(runif(size, 1704063600, 1711922400),
#' origin = "1970-01-01"))
#'
#' # test: dlabel will reveal three id-category combinations with temporal
#' # distances within the range of 2 to 40 days pertaining to category 'a'
#' test <- dlabel(dl.test, id = "id", category = "category",
#' distance = "timestamp",
#' min.dist = 2, max.dist = 40,
#' temporal.unit = "days",
#' label_name = "within_range",
#' df_filter = "category == 'a'")
#'

dlabel <- function(df, id, category, distance, min.dist, max.dist, temporal.unit,
label_name = "dlabel",
invert = FALSE, df_filter = NULL) {
# save original df for debugging
df0 <- df
# add row number to df
df <- df %>%
arrange(pick(!!id, !!category, !!distance)) %>%
mutate(dlabel_row_number = row_number())

# if df_filter is not NULL, filter df
if (!is.null(df_filter)) {
df_filtered <- df %>% filter(eval(parse(text = df_filter)))
# df_filter <- rlang::expr(filter(data, !!rlang::enquo(df_filter)))
# df_filtered <- eval(df_filter, envir = list(data = df))
} else {
df_filtered <- df
}

# list of id-category combinations with more than one record
id_category_list <- df_filtered %>%
group_by(pick(!!id, !!category)) %>%
summarize(n = n(), .groups = "drop") %>%
filter(n > 1)

# filter df according to id_category_list
df_records <- df %>%
inner_join(id_category_list,
by = c(id, category)) %>%
select(-n) %>%
arrange(pick(!!id, !!category))

# apply dfilter to df_records
df_records_dfltr <- df_records %>%
group_by(pick(!!id, !!category)) %>%
group_modify(~ dfilter(d = .x[, distance], which.chunk = "all", min.dist = min.dist, max.dist = max.dist, temporal.unit = temporal.unit)) %>%
# add row number to df_records_dfltr
left_join(df_filtered %>% select(all_of(c(!!id, !!category, !!distance, "dlabel_row_number"))),
by = c(id, category, distance)) %>%
# apply ruthless kungfu power to enforce unique row numbers
cleaner(ufn = c(id, category, distance, "dlabel_row_number"))

# modifier according to invert
modf <- function(x) !x # the default; !is.na(chunk), see below
if (invert) modf <- function(x) x # leave as is; is.na(chunk), see below

# add dlabel column to df_filtered
df_dlabel_0 <- df_filtered %>%
left_join(df_records_dfltr, by = "dlabel_row_number") %>%
mutate(!!label_name := modf(is.na(chunk))) %>%
select(-chunk) %>% # remove chunk from dfilter
select(-starts_with(distance)) # remove distance from dfilter (not needed anymore

# rejoin to unfiltered df
df_dlabel <- df %>%
left_join(select(df_dlabel_0, all_of(c("dlabel_row_number", label_name))), by = "dlabel_row_number") %>%
select(-dlabel_row_number) %>% # remove dlabel_row_number
mutate(!!sym(label_name) := replace_na(!!sym(label_name), FALSE))

# return bc_contam
return(df_dlabel)
}

#' @rdname dlabel
#' @description `bc_contamination` calls \code{dlabel} with default parameters for the
#' calculation of contamination in blood cultures.
#'
#' @import dplyr
#' @import rlang
#'
#' @export
#'
#' @examples
#' set.seed(123)
#'
#' bugs <- data.frame(species = c("S. epidermidis", "C. acnes", "S. aureus", "E. coli"),
#' category = c("skin flora", "skin flora", "pathogen", "pathogen"))
#'
#' samples <- data.frame(lab_no = 1:50,
#' patient = sample(1:10, 50, replace = TRUE),
#' species = sample(bugs$species, 50, replace = TRUE),
#' timestamp = as.POSIXct(runif(50, 1704063600, 1711000000),
#' origin = "1970-01-01"))
#'
#' samples <- samples %>% left_join(bugs, by = "species")
#'
#' bc_conta <- bc_contamination(samples,
#' id = "patient",
#' category = "species",
#' distance = "timestamp",
#' df_filter = "category == 'skin flora'")
#'
#' # Patient 9 has 5 cultures with skin flora, which, despite revealing
#' # skin flora, could correspond to infection (field contamination equals
#' # to FALSE), as these cultures satisfy the temporal distance criterion
#' # given by min.dist and max.dist.
#'
#' # check:
#'
#' # bc_conta %>% filter(category=="skin flora" & !contamination)
#'
#' # The remaining samples yielding skin flora likely represent
#' # contamination, as their temporal occurrence is outside the range
#' # given by min.dist and max.dist.
#'

bc_contamination <- function(..., min.dist = 5,
max.dist = 7 * 60 * 24,
temporal.unit = "minutes",
label_name = "contamination",
invert = TRUE) {
if (missing(...)) stop("Please supply arguments to dlabel()!")
dlabel(..., min.dist = min.dist, max.dist = max.dist,
temporal.unit = temporal.unit, label_name = label_name, invert = invert)
}
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Presently, the package contains the following functions (in alphabetic order):
- `cleaner`: removes duplicates in a `data.frame`
- `df_pattern_subset`: subsets a `data.frame` given two regex patterns marking the upper left and lower right corners of the returned `data.frame`.
- `dfilter`: filters a vector of type *numeric*, *integer*, *Date*, or *POSIXt*, in a fashion that removes entries exceeding a user-specified distance from other values; e.g. "dfiltering" `c(1,2,10)` would remove `10`, if argument `max.dist` is `5` (as `10 - 2 > 5`); similarly, an argument `min.dist` can be specified to enforce a minimal distance between entries.
- `dlabel`: wraps `dfilter` to label occurrences satisfying specified distance criteria (see function `dfilter`); function `bc_contamination` is merely a customized call of `dlabel`, designed to look scan blood culture results for possible contamination.
- `pattern_join`: joins two tables based on regex patterns; it is similar to function `regex_join` in package [fuzzyjoin](https://github.com/dgrtwo/fuzzyjoin), which I discovered only after writing `pattern_join`
- `postgresql_uploader`: uploads a `data.frame` into an existing PostgreSQL table
- `rbinder`: function for importing and joining of multiple csv-like files with identical headers
Expand Down
117 changes: 117 additions & 0 deletions man/dlabel.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 7aa5188

Please sign in to comment.