From 81934560fa494eb64e99fb63e1530d55e3d086b8 Mon Sep 17 00:00:00 2001 From: Charlton Callender Date: Mon, 29 Mar 2021 11:20:29 -0700 Subject: [PATCH 1/3] add initial 'prioritize_dt' helper function --- DESCRIPTION | 1 + NAMESPACE | 1 + R/prioritize_dt.R | 128 ++++++++++++++++++++++++++++ R/utils-data-table.R | 3 +- man/prioritize_dt.Rd | 77 +++++++++++++++++ tests/testthat/test-prioritize_dt.R | 69 +++++++++++++++ 6 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 R/prioritize_dt.R create mode 100644 man/prioritize_dt.Rd create mode 100644 tests/testthat/test-prioritize_dt.R diff --git a/DESCRIPTION b/DESCRIPTION index d606bdc..d97fc65 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,7 @@ Imports: assertthat, assertive, assertable (>= 0.2.7), + checkmate, methods, fs, reticulate diff --git a/NAMESPACE b/NAMESPACE index 556aa6b..bbde593 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ export(interpolate) export(invlogit) export(logit) export(pct_change) +export(prioritize_dt) export(rinvgamma) export(summarize_dt) import(data.table) diff --git a/R/prioritize_dt.R b/R/prioritize_dt.R new file mode 100644 index 0000000..51e2b07 --- /dev/null +++ b/R/prioritize_dt.R @@ -0,0 +1,128 @@ +#' @title Rank non-unique rows in a data.table using defined priority orders +#' +#' @param dt \[`data.table()`\]\cr +#' Data to determine rank priority for. +#' @param rank_by_cols \[`character()`\]\cr +#' Apply `rank_order` priorities to each unique combination of `rank_by_cols` +#' in `dt`. +#' @param rank_order \[`list()`\]\cr +#' Priority order to use when ranking non-unique rows. Each element of +#' `rank_order` corresponds to a column in `dt`. Possible values for each +#' column are '1' (ascending), '-1' (descending) or ordered factor levels when +#' the column is not a numeric. See details for more information. +#' @param quiet \[`logical(1)`\]\cr +#' Whether to print out detailed messages/warnings about possible issues with +#' `rank_order`. Default is 'FALSE'. +#' +#' @return `dt` with a new 'priority' column generated using the rules specified +#' in `rank_order`. 'priority' equal to 1 is the highest priority +#' +#' @details +#' `prioritize_dt` uses `data.table::setorderv` to order `dt` according to +#' `rank_order`. `prioritize_dt` takes three possible values to specify the +#' order of a column in `dt`. +#' 1. '1', order a numeric column in ascending order (smaller values have higher +#' priority). +#' 2. '-1', order a numeric column in descending order (larger values have +#' higher priority). +#' 3. `factor` levels, to order a categorical column in a custom order with the +#' first level having highest priority. When not all present values of the +#' column are defined in the levels, the priority will be NA and a warning +#' printed if `quiet = FALSE`. +#' +#' @examples +#' # preliminary data with only total population +#' dt_total <- data.table::CJ( +#' location = "USA", year = 2000, age_start = 0, age_end = Inf, +#' method = c("de facto", "de jure"), +#' status = c("preliminary") +#' ) +#' # final data in 10 year age groups +#' dt_10_yr_groups <- data.table::CJ( +#' location = "USA", year = 2000, age_start = seq(0, 80, 10), +#' method = c("de facto", "de jure"), +#' status = c("final") +#' ) +#' dt_10_yr_groups[, age_end := age_start + 10] +#' dt_10_yr_groups[age_start == 80, age_end := Inf] +#' +#' input_dt <- rbind(dt_total, dt_10_yr_groups) +#' input_dt[, n_age_groups := .N, by = setdiff(names(input_dt), c("age_start", "age_end"))] +#' +#' output_dt <- prioritize_dt( +#' dt = input_dt, +#' rank_by_cols = c("location", "year"), +#' rank_order = list( +#' method = c("de facto", "de jure"), # prioritize 'de facto' sources highest +#' n_age_groups = -1 # prioritize sources with more age groups +#' ) +#' ) +#' +#' @export +prioritize_dt <- function(dt, rank_by_cols, rank_order, quiet = FALSE) { + + # validate inputs --------------------------------------------------------- + + checkmate::assert_data_table(dt) + checkmate::assert_logical(quiet, len = 1) + + checkmate::assert_character(rank_by_cols) + checkmate::assert_names(names(dt), must.include = rank_by_cols) + + checkmate::assert_list(rank_order) + checkmate::assert_names(names(dt), must.include = names(rank_order)) + + rank_order <- copy(rank_order) + original_col_order <- names(dt) + original_keys <- key(dt) + + # Prioritize dataset ------------------------------------------------------ + + priority_dt <- unique(dt[, c(rank_by_cols, names(rank_order)), with = F]) + + # reformat categorical columns as factors with defined order in + for (col in names(rank_order)) { + col_levels <- rank_order[[col]] + + if (!checkmate::test_choice(col_levels, choices = c(1, -1))) { + + checkmate::assert_vector(col_levels) + + # check for non-defined levels for the categorical column + other_levels <- setdiff(unique(priority_dt[[col]]), col_levels) + if (!quiet) { + warning( + "'", col, "' `rank_order` is missing levels, the priority for these levels will be 'NA'\n", + "\t- defined levels: ", paste(col_levels, collapse = ","), "\n", + "\t- missing levels: ", paste(other_levels, collapse = ",") + ) + } + + priority_dt[, c(col) := factor(get(col), levels = col_levels)] + # assumes 'col_levels' is already sorted with highest priority first + rank_order[[col]] <- 1 + } + } + + # order based on specified rank order + data.table::setorderv( + x = priority_dt, cols = names(rank_order), + order = unlist(rank_order), + na.last = TRUE + ) + priority_dt[, priority := seq(.N), by = rank_by_cols] + + # add priority rank back onto original dataset + dt <- merge(dt, priority_dt, by = setdiff(names(priority_dt), "priority"), all.x = TRUE) + + # format output + data.table::setcolorder(dt, c(original_col_order, "priority")) + if (is.null(original_keys)) { + original_keys <- c(rank_by_cols, "priority") + } else { + original_keys <- c(original_keys, "priority") + } + data.table::setkeyv(dt, original_keys) + + return(dt) +} diff --git a/R/utils-data-table.R b/R/utils-data-table.R index bca2b23..d378feb 100644 --- a/R/utils-data-table.R +++ b/R/utils-data-table.R @@ -18,4 +18,5 @@ NULL # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when # https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887 utils::globalVariables(c("check", "combine_pdfs_py", - "interpolate_col", "interval", "x", "y", "y_new")) + "interpolate_col", "interval", "x", "y", "y_new", + "priority")) diff --git a/man/prioritize_dt.Rd b/man/prioritize_dt.Rd new file mode 100644 index 0000000..9a9d1b4 --- /dev/null +++ b/man/prioritize_dt.Rd @@ -0,0 +1,77 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prioritize_dt.R +\name{prioritize_dt} +\alias{prioritize_dt} +\title{Rank non-unique rows in a data.table using defined priority orders} +\usage{ +prioritize_dt(dt, rank_by_cols, rank_order, quiet = FALSE) +} +\arguments{ +\item{dt}{[\code{data.table()}]\cr +Data to determine rank priority for.} + +\item{rank_by_cols}{[\code{character()}]\cr +Apply \code{rank_order} priorities to each unique combination of \code{rank_by_cols} +in \code{dt}.} + +\item{rank_order}{[\code{list()}]\cr +Priority order to use when ranking non-unique rows. Each element of +\code{rank_order} corresponds to a column in \code{dt}. Possible values for each +column are '1' (ascending), '-1' (descending) or ordered factor levels when +the column is not a numeric. See details for more information.} + +\item{quiet}{[\code{logical(1)}]\cr +Whether to print out detailed messages/warnings about possible issues with +\code{rank_order}. Default is 'FALSE'.} +} +\value{ +\code{dt} with a new 'priority' column generated using the rules specified +in \code{rank_order}. 'priority' equal to 1 is the highest priority +} +\description{ +Rank non-unique rows in a data.table using defined priority orders +} +\details{ +\code{prioritize_dt} uses \code{data.table::setorderv} to order \code{dt} according to +\code{rank_order}. \code{prioritize_dt} takes three possible values to specify the +order of a column in \code{dt}. +\enumerate{ +\item '1', order a numeric column in ascending order (smaller values have higher +priority). +\item '-1', order a numeric column in descending order (larger values have +higher priority). +\item \code{factor} levels, to order a categorical column in a custom order with the +first level having highest priority. When not all present values of the +column are defined in the levels, the priority will be NA and a warning +printed if \code{quiet = FALSE}. +} +} +\examples{ +# preliminary data with only total population +dt_total <- data.table::CJ( + location = "USA", year = 2000, age_start = 0, age_end = Inf, + method = c("de facto", "de jure"), + status = c("preliminary") +) +# final data in 10 year age groups +dt_10_yr_groups <- data.table::CJ( + location = "USA", year = 2000, age_start = seq(0, 80, 10), + method = c("de facto", "de jure"), + status = c("final") +) +dt_10_yr_groups[, age_end := age_start + 10] +dt_10_yr_groups[age_start == 80, age_end := Inf] + +input_dt <- rbind(dt_total, dt_10_yr_groups) +input_dt[, n_age_groups := .N, by = setdiff(names(input_dt), c("age_start", "age_end"))] + +output_dt <- prioritize_dt( + dt = input_dt, + rank_by_cols = c("location", "year"), + rank_order = list( + method = c("de facto", "de jure"), # prioritize 'de facto' sources highest + n_age_groups = -1 # prioritize sources with more age groups + ) +) + +} diff --git a/tests/testthat/test-prioritize_dt.R b/tests/testthat/test-prioritize_dt.R new file mode 100644 index 0000000..23cf711 --- /dev/null +++ b/tests/testthat/test-prioritize_dt.R @@ -0,0 +1,69 @@ +library(data.table) +library(testthat) + + +# Test basic scenario ----------------------------------------------------- + +# set up test input data.table +input_dt <- CJ( + location = "USA", year = c(2000, 2010), age_start = seq(0, 80, 5), + report = c(2015, 2020), # numeric variable to prioritize with + method = c("A", "B"), # categorical variable to prioritize with + value = 1 +) +id_cols <- setdiff(names(input_dt), "value") +setkeyv(input_dt, id_cols) + +rank_order <- list( + method = c("B", "A"), + report = -1 +) + +expected_dt <- copy(input_dt) +expected_dt[method == "B" & report == 2020, priority := 1L] +expected_dt[method == "B" & report == 2015, priority := 2L] +expected_dt[method == "A" & report == 2020, priority := 3L] +expected_dt[method == "A" & report == 2015, priority := 4L] +setkeyv(expected_dt, c(id_cols, "priority")) + +testthat::test_that("prioritization of data works", { + output_dt <- prioritize_dt( + dt = input_dt, + rank_by_cols = c("location", "year"), + rank_order = rank_order + ) + testthat::expect_identical(output_dt, expected_dt) +}) + +# Rank order missing some categorical factor levels ----------------------- + +# same input as above but with method 'C' & 'D' added to input + +# set up test input data.table +input_dt <- CJ( + location = "USA", year = c(2000, 2010), age_start = seq(0, 80, 5), + report = c(2015, 2020), # numeric variable to prioritize with + method = c("A", "B", "C", "D"), # categorical variable to prioritize with + value = 1 +) +id_cols <- setdiff(names(input_dt), "value") +setkeyv(input_dt, id_cols) + +expected_dt <- copy(input_dt) +expected_dt[method == "B" & report == 2020, priority := 1L] +expected_dt[method == "B" & report == 2015, priority := 2L] +expected_dt[method == "A" & report == 2020, priority := 3L] +expected_dt[method == "A" & report == 2015, priority := 4L] +expected_dt[method %in% c("C", "D"), priority := NA] +setkeyv(expected_dt, c(id_cols, "priority")) + + +testthat::test_that("prioritization of data works", { + output_dt <- prioritize_dt( + dt = input_dt, + rank_by_cols = c("location", "year"), + rank_order = rank_order, + quiet = TRUE + ) + testthat::expect_identical(output_dt, expected_dt) +}) From 2e8a93d023428a5c49c928911f1220790316d89c Mon Sep 17 00:00:00 2001 From: Charlton Callender Date: Mon, 29 Mar 2021 13:57:55 -0700 Subject: [PATCH 2/3] clarify 'rank_order' documentation --- R/prioritize_dt.R | 13 +++++++++---- man/prioritize_dt.Rd | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/R/prioritize_dt.R b/R/prioritize_dt.R index 51e2b07..0ccf5b6 100644 --- a/R/prioritize_dt.R +++ b/R/prioritize_dt.R @@ -6,10 +6,12 @@ #' Apply `rank_order` priorities to each unique combination of `rank_by_cols` #' in `dt`. #' @param rank_order \[`list()`\]\cr -#' Priority order to use when ranking non-unique rows. Each element of -#' `rank_order` corresponds to a column in `dt`. Possible values for each -#' column are '1' (ascending), '-1' (descending) or ordered factor levels when -#' the column is not a numeric. See details for more information. +#' Named \[`list()`\] defining the priority order to use when ranking +#' non-unique rows. Each element of `rank_order` corresponds to a column in +#' `dt`, the prioritization is applied according to the order of elements in +#' `rank_order`. Possible values for each column are '1' (ascending), '-1' +#' (descending) or ordered factor levels when the column is not a numeric. See +#' details for more information. #' @param quiet \[`logical(1)`\]\cr #' Whether to print out detailed messages/warnings about possible issues with #' `rank_order`. Default is 'FALSE'. @@ -30,6 +32,9 @@ #' column are defined in the levels, the priority will be NA and a warning #' printed if `quiet = FALSE`. #' +#' The order of elements in `rank_order` matters. The more important rules +#' should be placed earlier in `rank_order` so that they are applied first. +#' #' @examples #' # preliminary data with only total population #' dt_total <- data.table::CJ( diff --git a/man/prioritize_dt.Rd b/man/prioritize_dt.Rd index 9a9d1b4..a998c5f 100644 --- a/man/prioritize_dt.Rd +++ b/man/prioritize_dt.Rd @@ -15,10 +15,12 @@ Apply \code{rank_order} priorities to each unique combination of \code{rank_by_c in \code{dt}.} \item{rank_order}{[\code{list()}]\cr -Priority order to use when ranking non-unique rows. Each element of -\code{rank_order} corresponds to a column in \code{dt}. Possible values for each -column are '1' (ascending), '-1' (descending) or ordered factor levels when -the column is not a numeric. See details for more information.} +Named [\code{list()}] defining the priority order to use when ranking +non-unique rows. Each element of \code{rank_order} corresponds to a column in +\code{dt}, the prioritization is applied according to the order of elements in +\code{rank_order}. Possible values for each column are '1' (ascending), '-1' +(descending) or ordered factor levels when the column is not a numeric. See +details for more information.} \item{quiet}{[\code{logical(1)}]\cr Whether to print out detailed messages/warnings about possible issues with @@ -45,6 +47,9 @@ first level having highest priority. When not all present values of the column are defined in the levels, the priority will be NA and a warning printed if \code{quiet = FALSE}. } + +The order of elements in \code{rank_order} matters. The more important rules +should be placed earlier in \code{rank_order} so that they are applied first. } \examples{ # preliminary data with only total population From ff964534d19257b0d64c6beb936e63ff04d7ab80 Mon Sep 17 00:00:00 2001 From: Charlton Callender Date: Mon, 29 Mar 2021 17:13:15 -0700 Subject: [PATCH 3/3] Add 'unique_id_cols' argument to help with assertions --- R/prioritize_dt.R | 76 ++++++++++++++++++++++++----- man/prioritize_dt.Rd | 36 ++++++++++++-- tests/testthat/test-prioritize_dt.R | 45 ++++++++++++++--- 3 files changed, 134 insertions(+), 23 deletions(-) diff --git a/R/prioritize_dt.R b/R/prioritize_dt.R index 0ccf5b6..6ad6a26 100644 --- a/R/prioritize_dt.R +++ b/R/prioritize_dt.R @@ -4,7 +4,11 @@ #' Data to determine rank priority for. #' @param rank_by_cols \[`character()`\]\cr #' Apply `rank_order` priorities to each unique combination of `rank_by_cols` -#' in `dt`. +#' in `dt`. This should be equal to or a subset of `unique_id_cols`. +#' @param unique_id_cols \[`character()`\]\cr +#' ID columns that once ranked by priority will uniquely identify rows of `dt` +#' in combination with the priority column. This should be a superset of +#' `rank_by_cols`. Default is equal to `rank_by_cols`. #' @param rank_order \[`list()`\]\cr #' Named \[`list()`\] defining the priority order to use when ranking #' non-unique rows. Each element of `rank_order` corresponds to a column in @@ -12,9 +16,19 @@ #' `rank_order`. Possible values for each column are '1' (ascending), '-1' #' (descending) or ordered factor levels when the column is not a numeric. See #' details for more information. -#' @param quiet \[`logical(1)`\]\cr -#' Whether to print out detailed messages/warnings about possible issues with -#' `rank_order`. Default is 'FALSE'. +#' @param warn_missing_levels \[`logical(1)`\]\cr +#' Whether to warn about missing levels for elements of `rank_order` or throw +#' error. Default is 'FALSE' and errors out if there are missing levels. +#' @param warn_non_unique_priority \[`logical(1)`\]\cr +#' Whether to warn about specified `rank_by_cols` & `rank_order` leading to +#' non-unique rows of `dt` after generating 'priority' column. Default is +#' 'FALSE' and errors out if there are non-unique rows. +#' @param check_top_priority_unique_only \[`logical(1)`\]\cr +#' When checking for non-unique rows of `dt` after generating the 'priority' +#' column with the `rank_by_cols` & names of `rank_order`, only check the +#' priority=1 rows. This is useful when specified `rank_order` levels are not +#' exhaustive leading to 'NA' priorities for some rows. Default if 'FALSE' and +#' errors out if there are any non-unique rows. #' #' @return `dt` with a new 'priority' column generated using the rules specified #' in `rank_order`. 'priority' equal to 1 is the highest priority @@ -57,6 +71,7 @@ #' output_dt <- prioritize_dt( #' dt = input_dt, #' rank_by_cols = c("location", "year"), +#' unique_id_cols = c("location", "year", "age_start", "age_end"), #' rank_order = list( #' method = c("de facto", "de jure"), # prioritize 'de facto' sources highest #' n_age_groups = -1 # prioritize sources with more age groups @@ -64,19 +79,34 @@ #' ) #' #' @export -prioritize_dt <- function(dt, rank_by_cols, rank_order, quiet = FALSE) { +prioritize_dt <- function(dt, + rank_by_cols, + unique_id_cols = rank_by_cols, + rank_order, + warn_missing_levels = FALSE, + warn_non_unique_priority = FALSE, + check_top_priority_unique_only = FALSE) { # validate inputs --------------------------------------------------------- + checkmate::assert_logical(warn_missing_levels, len = 1) + checkmate::assert_logical(warn_non_unique_priority, len = 1) + checkmate::assert_logical(check_top_priority_unique_only, len = 1) + checkmate::assert_data_table(dt) - checkmate::assert_logical(quiet, len = 1) + + checkmate::assert_character(unique_id_cols) + checkmate::assert_names(names(dt), must.include = unique_id_cols) checkmate::assert_character(rank_by_cols) checkmate::assert_names(names(dt), must.include = rank_by_cols) + checkmate::assert_subset(rank_by_cols, choices = unique_id_cols) checkmate::assert_list(rank_order) checkmate::assert_names(names(dt), must.include = names(rank_order)) + checkmate::assert_disjunct(names(rank_order), rank_by_cols) + rank_order <- copy(rank_order) original_col_order <- names(dt) original_keys <- key(dt) @@ -95,12 +125,17 @@ prioritize_dt <- function(dt, rank_by_cols, rank_order, quiet = FALSE) { # check for non-defined levels for the categorical column other_levels <- setdiff(unique(priority_dt[[col]]), col_levels) - if (!quiet) { - warning( - "'", col, "' `rank_order` is missing levels, the priority for these levels will be 'NA'\n", - "\t- defined levels: ", paste(col_levels, collapse = ","), "\n", - "\t- missing levels: ", paste(other_levels, collapse = ",") + if (length(other_levels) > 0) { + msg <- paste0( + "'", col, "' `rank_order` is missing levels, the priority for these levels will be 'NA'\n", + "\t- defined levels: ", paste(col_levels, collapse = ","), "\n", + "\t- missing levels: ", paste(other_levels, collapse = ",") ) + if (warn_missing_levels) { + warning(msg) + } else { + stop(msg) + } } priority_dt[, c(col) := factor(get(col), levels = col_levels)] @@ -120,6 +155,25 @@ prioritize_dt <- function(dt, rank_by_cols, rank_order, quiet = FALSE) { # add priority rank back onto original dataset dt <- merge(dt, priority_dt, by = setdiff(names(priority_dt), "priority"), all.x = TRUE) + # check + check_id_cols <- c(unique_id_cols, "priority") + check_dt <- dt + if (check_top_priority_unique_only) check_dt <- dt[priority == 1] + non_unique_dt <- demUtils::identify_non_unique_dt(check_dt, check_id_cols) + if (nrow(non_unique_dt) > 0) { + msg <- paste0( + "Specified `rank_by_cols`, `rank_order` & returned `priority` do not uniquely identify each row of `dt`.\n", + "\t- use `warn_non_unique_priority=TRUE` to return `dt` and run demUtils::identify_non_unique_dt\n", + "\t with `id_cols = c('", paste(check_id_cols, collapse = "', '"), "')`\n", + paste0(capture.output(non_unique_dt), collapse = "\n") + ) + if (warn_non_unique_priority) { + warning(msg) + } else { + stop(msg) + } + } + # format output data.table::setcolorder(dt, c(original_col_order, "priority")) if (is.null(original_keys)) { diff --git a/man/prioritize_dt.Rd b/man/prioritize_dt.Rd index a998c5f..22543a4 100644 --- a/man/prioritize_dt.Rd +++ b/man/prioritize_dt.Rd @@ -4,7 +4,15 @@ \alias{prioritize_dt} \title{Rank non-unique rows in a data.table using defined priority orders} \usage{ -prioritize_dt(dt, rank_by_cols, rank_order, quiet = FALSE) +prioritize_dt( + dt, + rank_by_cols, + unique_id_cols = rank_by_cols, + rank_order, + warn_missing_levels = FALSE, + warn_non_unique_priority = FALSE, + check_top_priority_unique_only = FALSE +) } \arguments{ \item{dt}{[\code{data.table()}]\cr @@ -12,7 +20,12 @@ Data to determine rank priority for.} \item{rank_by_cols}{[\code{character()}]\cr Apply \code{rank_order} priorities to each unique combination of \code{rank_by_cols} -in \code{dt}.} +in \code{dt}. This should be equal to or a subset of \code{unique_id_cols}.} + +\item{unique_id_cols}{[\code{character()}]\cr +ID columns that once ranked by priority will uniquely identify rows of \code{dt} +in combination with the priority column. This should be a superset of +\code{rank_by_cols}. Default is equal to \code{rank_by_cols}.} \item{rank_order}{[\code{list()}]\cr Named [\code{list()}] defining the priority order to use when ranking @@ -22,9 +35,21 @@ non-unique rows. Each element of \code{rank_order} corresponds to a column in (descending) or ordered factor levels when the column is not a numeric. See details for more information.} -\item{quiet}{[\code{logical(1)}]\cr -Whether to print out detailed messages/warnings about possible issues with -\code{rank_order}. Default is 'FALSE'.} +\item{warn_missing_levels}{[\code{logical(1)}]\cr +Whether to warn about missing levels for elements of \code{rank_order} or throw +error. Default is 'FALSE' and errors out if there are missing levels.} + +\item{warn_non_unique_priority}{[\code{logical(1)}]\cr +Whether to warn about specified \code{rank_by_cols} & \code{rank_order} leading to +non-unique rows of \code{dt} after generating 'priority' column. Default is +'FALSE' and errors out if there are non-unique rows.} + +\item{check_top_priority_unique_only}{[\code{logical(1)}]\cr +When checking for non-unique rows of \code{dt} after generating the 'priority' +column with the \code{rank_by_cols} & names of \code{rank_order}, only check the +priority=1 rows. This is useful when specified \code{rank_order} levels are not +exhaustive leading to 'NA' priorities for some rows. Default if 'FALSE' and +errors out if there are any non-unique rows.} } \value{ \code{dt} with a new 'priority' column generated using the rules specified @@ -73,6 +98,7 @@ input_dt[, n_age_groups := .N, by = setdiff(names(input_dt), c("age_start", "age output_dt <- prioritize_dt( dt = input_dt, rank_by_cols = c("location", "year"), + unique_id_cols = c("location", "year", "age_start", "age_end"), rank_order = list( method = c("de facto", "de jure"), # prioritize 'de facto' sources highest n_age_groups = -1 # prioritize sources with more age groups diff --git a/tests/testthat/test-prioritize_dt.R b/tests/testthat/test-prioritize_dt.R index 23cf711..59119bb 100644 --- a/tests/testthat/test-prioritize_dt.R +++ b/tests/testthat/test-prioritize_dt.R @@ -30,11 +30,24 @@ testthat::test_that("prioritization of data works", { output_dt <- prioritize_dt( dt = input_dt, rank_by_cols = c("location", "year"), - rank_order = rank_order + unique_id_cols = c("location", "year", "age_start"), + rank_order = rank_order, ) testthat::expect_identical(output_dt, expected_dt) }) +testthat::test_that("'prioritize_dt' catches missing rank order specification", { + testthat::expect_error( + prioritize_dt( + dt = input_dt, + rank_by_cols = c("location", "year"), + unique_id_cols = c("location", "year", "age_start"), + rank_order = rank_order["method"] + ), + regexp = "do not uniquely identify each row of" + ) +}) + # Rank order missing some categorical factor levels ----------------------- # same input as above but with method 'C' & 'D' added to input @@ -57,13 +70,31 @@ expected_dt[method == "A" & report == 2015, priority := 4L] expected_dt[method %in% c("C", "D"), priority := NA] setkeyv(expected_dt, c(id_cols, "priority")) +testthat::test_that("prioritization of data works when missing levels", { -testthat::test_that("prioritization of data works", { - output_dt <- prioritize_dt( - dt = input_dt, - rank_by_cols = c("location", "year"), - rank_order = rank_order, - quiet = TRUE + # expect error since multiple method/reports will have report NA + testthat::expect_error( + suppressWarnings(prioritize_dt( + dt = input_dt, + rank_by_cols = c("location", "year"), + unique_id_cols = c("location", "year", "age_start"), + rank_order = rank_order, + warn_missing_levels = TRUE, + )), + regexp = "do not uniquely identify each row of" + ) + + # now no error since only checking that top priority is unique + output_dt <- testthat::expect_warning( + prioritize_dt( + dt = input_dt, + rank_by_cols = c("location", "year"), + unique_id_cols = c("location", "year", "age_start"), + rank_order = rank_order, + warn_missing_levels = TRUE, + check_top_priority_unique_only = TRUE + ), + regexp = "'method' `rank_order` is missing levels" ) testthat::expect_identical(output_dt, expected_dt) })