add dlabel and bc_contamination

joheli · Mar 19, 2024 · 7aa5188 · 7aa5188
1 parent ee4acf4
commit 7aa5188
Show file tree

Hide file tree

Showing 5 changed files with 288 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: kungfu
 Title: kungfu - kicks stubborn data into shape
-Version: 0.2.2
+Version: 0.2.3
 Authors@R: 
     person(given = "Johannes",
            family = "Elias",
@@ -28,4 +28,4 @@ Imports: DBI (>= 1.1.1),
          tidyr (>= 1.1.3),
          utils (>= 4.0.4)
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,11 +1,13 @@
 # Generated by roxygen2: do not edit by hand
 
+export(bc_contamination)
 export(cleaned)
 export(cleaner)
 export(common_join)
 export(df_pattern_locate)
 export(df_pattern_subset)
 export(dfilter)
+export(dlabel)
 export(is_clean)
 export(is_seamless)
 export(killConnections)
@@ -20,6 +22,7 @@ import(dplyr)
 import(fuzzyjoin)
 import(lubridate)
 import(nycflights13)
+import(rlang)
 import(utils)
 importFrom(dplyr,"%>%")
 importFrom(dplyr,arrange)

diff --git a/R/dlabel.R b/R/dlabel.R
@@ -0,0 +1,165 @@
+#' @name dlabel
+#' @title dlabel
+#' 
+#' @description 
+#' This function adds a label to a data frame based on the distance between records as 
+#' defined by the `distance` column and calculated by \link[kungfu]{dfilter}. The label
+#' is a boolean stating whether the distance between records is within the specified 
+#' range.
+#' 
+#' @param df A data frame.
+#' @param id A character string specifying the column name of the id.
+#' @param category A character string specifying the column name of the category.
+#' @param distance A character string specifying the column name of the distance column; see \link[kungfu]{dfilter} 
+#' for details.
+#' @param min.dist A numeric value specifying the minimum distance between records; see \link[kungfu]{dfilter}
+#' for details.
+#' @param max.dist A numeric value specifying the maximum distance between records; see \link[kungfu]{dfilter}
+#' for details.
+#' @param temporal.unit A character string specifying the temporal unit of the distance; see \link[kungfu]{dfilter}
+#' for details.
+#' @param label_name A character string specifying the name of the label column.
+#' @param invert A logical value specifying whether to invert the label.
+#' @param df_filter A character string specifying the filter expression to be applied to the data frame before
+#' the distance label is calculated.
+#' 
+#' @import dplyr
+#' @import rlang
+#' 
+#' @return A data frame with the distance label added.
+#' 
+#' @export 
+#' 
+#' @examples
+#' # create test data
+#' set.seed(123)
+#' dl.test <- data.frame(id = sample(1:10, 30, replace = TRUE), 
+#'            category = sample(letters[1:4], 30, replace = TRUE), 
+#'            timestamp = as.POSIXct(runif(size, 1704063600, 1711922400), 
+#'                                         origin = "1970-01-01"))
+#'
+#' # test: dlabel will reveal three id-category combinations with temporal 
+#' # distances within the range of 2 to 40 days pertaining to category 'a'
+#' test <- dlabel(dl.test, id = "id", category = "category", 
+#'         distance = "timestamp", 
+#'         min.dist = 2, max.dist = 40,
+#'         temporal.unit = "days",
+#'         label_name = "within_range",
+#'         df_filter = "category == 'a'")
+#' 
+
+dlabel <- function(df, id, category, distance, min.dist, max.dist, temporal.unit, 
+            label_name = "dlabel",
+            invert = FALSE, df_filter = NULL) {
+  # save original df for debugging
+  df0 <- df
+  # add row number to df
+  df <- df %>% 
+    arrange(pick(!!id, !!category, !!distance)) %>% 
+    mutate(dlabel_row_number = row_number())
+
+  # if df_filter is not NULL, filter df
+  if (!is.null(df_filter)) {
+    df_filtered <- df %>% filter(eval(parse(text = df_filter)))
+    # df_filter <- rlang::expr(filter(data, !!rlang::enquo(df_filter)))
+    # df_filtered <- eval(df_filter, envir = list(data = df))
+  } else {
+    df_filtered <- df
+  }
+
+  # list of id-category combinations with more than one record
+  id_category_list <- df_filtered %>%
+    group_by(pick(!!id, !!category)) %>%
+    summarize(n = n(), .groups = "drop") %>%
+    filter(n > 1)
+
+  # filter df according to id_category_list
+  df_records <- df %>%
+    inner_join(id_category_list,
+               by = c(id, category)) %>%
+    select(-n) %>%
+    arrange(pick(!!id, !!category))
+
+  # apply dfilter to df_records
+  df_records_dfltr <- df_records %>%
+    group_by(pick(!!id, !!category)) %>%
+    group_modify(~ dfilter(d = .x[, distance], which.chunk = "all", min.dist = min.dist, max.dist = max.dist, temporal.unit = temporal.unit)) %>%
+    # add row number to df_records_dfltr
+    left_join(df_filtered %>% select(all_of(c(!!id, !!category, !!distance, "dlabel_row_number"))), 
+              by = c(id, category, distance)) %>%
+    # apply ruthless kungfu power to enforce unique row numbers
+    cleaner(ufn = c(id, category, distance, "dlabel_row_number"))
+
+  # modifier according to invert
+  modf <- function(x) !x # the default; !is.na(chunk), see below 
+  if (invert) modf <- function(x) x # leave as is; is.na(chunk), see below
+
+  # add dlabel column to df_filtered
+  df_dlabel_0 <- df_filtered %>%
+    left_join(df_records_dfltr, by = "dlabel_row_number") %>%
+    mutate(!!label_name := modf(is.na(chunk))) %>%
+    select(-chunk) %>% # remove chunk from dfilter
+    select(-starts_with(distance)) # remove distance from dfilter (not needed anymore
+
+  # rejoin to unfiltered df
+  df_dlabel <- df %>%
+    left_join(select(df_dlabel_0, all_of(c("dlabel_row_number", label_name))), by = "dlabel_row_number") %>%
+    select(-dlabel_row_number) %>% # remove dlabel_row_number
+    mutate(!!sym(label_name) := replace_na(!!sym(label_name), FALSE)) 
+
+  # return bc_contam
+  return(df_dlabel)
+}
+
+#' @rdname dlabel
+#' @description `bc_contamination` calls \code{dlabel} with default parameters for the
+#' calculation of contamination in blood cultures.
+#'
+#' @import dplyr
+#' @import rlang
+#' 
+#' @export 
+#' 
+#' @examples 
+#' set.seed(123)
+#'
+#' bugs <- data.frame(species = c("S. epidermidis", "C. acnes", "S. aureus", "E. coli"),
+#'                   category = c("skin flora", "skin flora", "pathogen", "pathogen"))
+#' 
+#' samples <- data.frame(lab_no = 1:50,
+#'                        patient = sample(1:10, 50, replace = TRUE),
+#'                        species = sample(bugs$species, 50, replace = TRUE),
+#'                        timestamp = as.POSIXct(runif(50, 1704063600, 1711000000), 
+#'                                                origin = "1970-01-01"))
+#' 
+#' samples <- samples %>% left_join(bugs, by = "species")
+#' 
+#' bc_conta <- bc_contamination(samples, 
+#'                             id = "patient", 
+#'                             category = "species", 
+#'                             distance = "timestamp",
+#'                             df_filter = "category == 'skin flora'")
+#' 
+#' # Patient 9 has 5 cultures with skin flora, which, despite revealing
+#' # skin flora, could correspond to infection (field contamination equals
+#' # to FALSE), as these cultures satisfy the temporal distance criterion
+#' # given by min.dist and max.dist.
+#'
+#' # check:
+#'
+#' # bc_conta %>% filter(category=="skin flora" & !contamination)
+#' 
+#' # The remaining samples yielding skin flora likely represent 
+#' # contamination, as their temporal occurrence is outside the range
+#' # given by min.dist and max.dist.
+#' 
+
+bc_contamination <- function(..., min.dist = 5, 
+                            max.dist = 7 * 60 * 24, 
+                            temporal.unit = "minutes",
+                            label_name = "contamination",
+                            invert = TRUE) {
+    if (missing(...)) stop("Please supply arguments to dlabel()!")
+    dlabel(..., min.dist = min.dist, max.dist = max.dist, 
+           temporal.unit = temporal.unit, label_name = label_name, invert = invert)
+}
diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@ Presently, the package contains the following functions (in alphabetic order):
   - `cleaner`: removes duplicates in a `data.frame`
   - `df_pattern_subset`: subsets a `data.frame` given two regex patterns marking the upper left and lower right corners of the returned `data.frame`.
   - `dfilter`: filters a vector of type *numeric*, *integer*, *Date*, or *POSIXt*, in a fashion that removes entries exceeding a user-specified distance from other values; e.g. "dfiltering" `c(1,2,10)` would remove `10`, if argument `max.dist` is `5` (as `10 - 2 > 5`); similarly, an argument `min.dist` can be specified to enforce a minimal distance between entries.
+  - `dlabel`: wraps `dfilter` to label occurrences satisfying specified distance criteria (see function `dfilter`); function `bc_contamination` is merely a customized call of `dlabel`, designed to look scan blood culture results for possible contamination.
   - `pattern_join`: joins two tables based on regex patterns; it is similar to function `regex_join` in package [fuzzyjoin](https://github.com/dgrtwo/fuzzyjoin), which I discovered only after writing `pattern_join`
   - `postgresql_uploader`: uploads a `data.frame` into an existing PostgreSQL table
   - `rbinder`: function for importing and joining of multiple csv-like files with identical headers

diff --git a/man/dlabel.Rd b/man/dlabel.Rd