From 81934560fa494eb64e99fb63e1530d55e3d086b8 Mon Sep 17 00:00:00 2001
From: Charlton Callender <chacalle@uw.edu>
Date: Mon, 29 Mar 2021 11:20:29 -0700
Subject: [PATCH 1/3] add initial 'prioritize_dt' helper function

---
 DESCRIPTION                         |   1 +
 NAMESPACE                           |   1 +
 R/prioritize_dt.R                   | 128 ++++++++++++++++++++++++++++
 R/utils-data-table.R                |   3 +-
 man/prioritize_dt.Rd                |  77 +++++++++++++++++
 tests/testthat/test-prioritize_dt.R |  69 +++++++++++++++
 6 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 R/prioritize_dt.R
 create mode 100644 man/prioritize_dt.Rd
 create mode 100644 tests/testthat/test-prioritize_dt.R

diff --git a/DESCRIPTION b/DESCRIPTION
index d606bdc..d97fc65 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -26,6 +26,7 @@ Imports:
     assertthat,
     assertive,
     assertable (>= 0.2.7),
+    checkmate,
     methods,
     fs,
     reticulate
diff --git a/NAMESPACE b/NAMESPACE
index 556aa6b..bbde593 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -11,6 +11,7 @@ export(interpolate)
 export(invlogit)
 export(logit)
 export(pct_change)
+export(prioritize_dt)
 export(rinvgamma)
 export(summarize_dt)
 import(data.table)
diff --git a/R/prioritize_dt.R b/R/prioritize_dt.R
new file mode 100644
index 0000000..51e2b07
--- /dev/null
+++ b/R/prioritize_dt.R
@@ -0,0 +1,128 @@
+#' @title Rank non-unique rows in a data.table using defined priority orders
+#'
+#' @param dt \[`data.table()`\]\cr
+#'   Data to determine rank priority for.
+#' @param rank_by_cols \[`character()`\]\cr
+#'   Apply `rank_order` priorities to each unique combination of `rank_by_cols`
+#'   in `dt`.
+#' @param rank_order \[`list()`\]\cr
+#'   Priority order to use when ranking non-unique rows. Each element of
+#'   `rank_order` corresponds to a column in `dt`. Possible values for each
+#'   column are '1' (ascending), '-1' (descending) or ordered factor levels when
+#'   the column is not a numeric. See details for more information.
+#' @param quiet \[`logical(1)`\]\cr
+#'   Whether to print out detailed messages/warnings about possible issues with
+#'   `rank_order`. Default is 'FALSE'.
+#'
+#' @return `dt` with a new 'priority' column generated using the rules specified
+#'   in `rank_order`. 'priority' equal to 1 is the highest priority
+#'
+#' @details
+#' `prioritize_dt` uses `data.table::setorderv` to order `dt` according to
+#' `rank_order`. `prioritize_dt` takes three possible values to specify the
+#' order of a column in `dt`.
+#' 1. '1', order a numeric column in ascending order (smaller values have higher
+#' priority).
+#' 2. '-1', order a numeric column in descending order (larger values have
+#' higher priority).
+#' 3. `factor` levels, to order a categorical column in a custom order with the
+#' first level having highest priority. When not all present values of the
+#' column are defined in the levels, the priority will be NA and a warning
+#' printed if `quiet = FALSE`.
+#'
+#' @examples
+#' # preliminary data with only total population
+#' dt_total <- data.table::CJ(
+#'   location = "USA", year = 2000, age_start = 0, age_end = Inf,
+#'   method = c("de facto", "de jure"),
+#'   status = c("preliminary")
+#' )
+#' # final data in 10 year age groups
+#' dt_10_yr_groups <- data.table::CJ(
+#'   location = "USA", year = 2000, age_start = seq(0, 80, 10),
+#'   method = c("de facto", "de jure"),
+#'   status = c("final")
+#' )
+#' dt_10_yr_groups[, age_end := age_start + 10]
+#' dt_10_yr_groups[age_start == 80, age_end := Inf]
+#'
+#' input_dt <- rbind(dt_total, dt_10_yr_groups)
+#' input_dt[, n_age_groups := .N, by = setdiff(names(input_dt), c("age_start", "age_end"))]
+#'
+#' output_dt <- prioritize_dt(
+#'   dt = input_dt,
+#'   rank_by_cols = c("location", "year"),
+#'   rank_order = list(
+#'     method = c("de facto", "de jure"), # prioritize 'de facto' sources highest
+#'     n_age_groups = -1 # prioritize sources with more age groups
+#'   )
+#' )
+#'
+#' @export
+prioritize_dt <- function(dt, rank_by_cols, rank_order, quiet = FALSE) {
+
+  # validate inputs ---------------------------------------------------------
+
+  checkmate::assert_data_table(dt)
+  checkmate::assert_logical(quiet, len = 1)
+
+  checkmate::assert_character(rank_by_cols)
+  checkmate::assert_names(names(dt), must.include = rank_by_cols)
+
+  checkmate::assert_list(rank_order)
+  checkmate::assert_names(names(dt), must.include = names(rank_order))
+
+  rank_order <- copy(rank_order)
+  original_col_order <- names(dt)
+  original_keys <- key(dt)
+
+  # Prioritize dataset ------------------------------------------------------
+
+  priority_dt <- unique(dt[, c(rank_by_cols, names(rank_order)), with = F])
+
+  # reformat categorical columns as factors with defined order in
+  for (col in names(rank_order)) {
+    col_levels <- rank_order[[col]]
+
+    if (!checkmate::test_choice(col_levels, choices = c(1, -1))) {
+
+      checkmate::assert_vector(col_levels)
+
+      # check for non-defined levels for the categorical column
+      other_levels <- setdiff(unique(priority_dt[[col]]), col_levels)
+      if (!quiet) {
+        warning(
+        "'", col, "' `rank_order` is missing levels, the priority for these levels will be 'NA'\n",
+        "\t- defined levels: ", paste(col_levels, collapse = ","), "\n",
+        "\t- missing levels: ", paste(other_levels, collapse = ",")
+        )
+      }
+
+      priority_dt[, c(col) := factor(get(col), levels = col_levels)]
+      # assumes 'col_levels' is already sorted with highest priority first
+      rank_order[[col]] <- 1
+    }
+  }
+
+  # order based on specified rank order
+  data.table::setorderv(
+    x = priority_dt, cols = names(rank_order),
+    order = unlist(rank_order),
+    na.last = TRUE
+  )
+  priority_dt[, priority := seq(.N), by = rank_by_cols]
+
+  # add priority rank back onto original dataset
+  dt <- merge(dt, priority_dt, by = setdiff(names(priority_dt), "priority"), all.x = TRUE)
+
+  # format output
+  data.table::setcolorder(dt, c(original_col_order, "priority"))
+  if (is.null(original_keys)) {
+    original_keys <- c(rank_by_cols, "priority")
+  } else {
+    original_keys <- c(original_keys, "priority")
+  }
+  data.table::setkeyv(dt, original_keys)
+
+  return(dt)
+}
diff --git a/R/utils-data-table.R b/R/utils-data-table.R
index bca2b23..d378feb 100644
--- a/R/utils-data-table.R
+++ b/R/utils-data-table.R
@@ -18,4 +18,5 @@ NULL
 # https://stackoverflow.com/questions/9439256/how-can-i-handle-r-cmd-check-no-visible-binding-for-global-variable-notes-when
 # https://community.rstudio.com/t/how-to-solve-no-visible-binding-for-global-variable-note/28887
 utils::globalVariables(c("check", "combine_pdfs_py",
-                         "interpolate_col", "interval", "x", "y", "y_new"))
+                         "interpolate_col", "interval", "x", "y", "y_new",
+                         "priority"))
diff --git a/man/prioritize_dt.Rd b/man/prioritize_dt.Rd
new file mode 100644
index 0000000..9a9d1b4
--- /dev/null
+++ b/man/prioritize_dt.Rd
@@ -0,0 +1,77 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/prioritize_dt.R
+\name{prioritize_dt}
+\alias{prioritize_dt}
+\title{Rank non-unique rows in a data.table using defined priority orders}
+\usage{
+prioritize_dt(dt, rank_by_cols, rank_order, quiet = FALSE)
+}
+\arguments{
+\item{dt}{[\code{data.table()}]\cr
+Data to determine rank priority for.}
+
+\item{rank_by_cols}{[\code{character()}]\cr
+Apply \code{rank_order} priorities to each unique combination of \code{rank_by_cols}
+in \code{dt}.}
+
+\item{rank_order}{[\code{list()}]\cr
+Priority order to use when ranking non-unique rows. Each element of
+\code{rank_order} corresponds to a column in \code{dt}. Possible values for each
+column are '1' (ascending), '-1' (descending) or ordered factor levels when
+the column is not a numeric. See details for more information.}
+
+\item{quiet}{[\code{logical(1)}]\cr
+Whether to print out detailed messages/warnings about possible issues with
+\code{rank_order}. Default is 'FALSE'.}
+}
+\value{
+\code{dt} with a new 'priority' column generated using the rules specified
+in \code{rank_order}. 'priority' equal to 1 is the highest priority
+}
+\description{
+Rank non-unique rows in a data.table using defined priority orders
+}
+\details{
+\code{prioritize_dt} uses \code{data.table::setorderv} to order \code{dt} according to
+\code{rank_order}. \code{prioritize_dt} takes three possible values to specify the
+order of a column in \code{dt}.
+\enumerate{
+\item '1', order a numeric column in ascending order (smaller values have higher
+priority).
+\item '-1', order a numeric column in descending order (larger values have
+higher priority).
+\item \code{factor} levels, to order a categorical column in a custom order with the
+first level having highest priority. When not all present values of the
+column are defined in the levels, the priority will be NA and a warning
+printed if \code{quiet = FALSE}.
+}
+}
+\examples{
+# preliminary data with only total population
+dt_total <- data.table::CJ(
+  location = "USA", year = 2000, age_start = 0, age_end = Inf,
+  method = c("de facto", "de jure"),
+  status = c("preliminary")
+)
+# final data in 10 year age groups
+dt_10_yr_groups <- data.table::CJ(
+  location = "USA", year = 2000, age_start = seq(0, 80, 10),
+  method = c("de facto", "de jure"),
+  status = c("final")
+)
+dt_10_yr_groups[, age_end := age_start + 10]
+dt_10_yr_groups[age_start == 80, age_end := Inf]
+
+input_dt <- rbind(dt_total, dt_10_yr_groups)
+input_dt[, n_age_groups := .N, by = setdiff(names(input_dt), c("age_start", "age_end"))]
+
+output_dt <- prioritize_dt(
+  dt = input_dt,
+  rank_by_cols = c("location", "year"),
+  rank_order = list(
+    method = c("de facto", "de jure"), # prioritize 'de facto' sources highest
+    n_age_groups = -1 # prioritize sources with more age groups
+  )
+)
+
+}
diff --git a/tests/testthat/test-prioritize_dt.R b/tests/testthat/test-prioritize_dt.R
new file mode 100644
index 0000000..23cf711
--- /dev/null
+++ b/tests/testthat/test-prioritize_dt.R
@@ -0,0 +1,69 @@
+library(data.table)
+library(testthat)
+
+
+# Test basic scenario -----------------------------------------------------
+
+# set up test input data.table
+input_dt <- CJ(
+  location = "USA", year = c(2000, 2010), age_start = seq(0, 80, 5),
+  report = c(2015, 2020), # numeric variable to prioritize with
+  method = c("A", "B"), # categorical variable to prioritize with
+  value = 1
+)
+id_cols <- setdiff(names(input_dt), "value")
+setkeyv(input_dt, id_cols)
+
+rank_order <- list(
+  method = c("B", "A"),
+  report = -1
+)
+
+expected_dt <- copy(input_dt)
+expected_dt[method == "B" & report == 2020, priority := 1L]
+expected_dt[method == "B" & report == 2015, priority := 2L]
+expected_dt[method == "A" & report == 2020, priority := 3L]
+expected_dt[method == "A" & report == 2015, priority := 4L]
+setkeyv(expected_dt, c(id_cols, "priority"))
+
+testthat::test_that("prioritization of data works", {
+  output_dt <- prioritize_dt(
+    dt = input_dt,
+    rank_by_cols = c("location", "year"),
+    rank_order = rank_order
+  )
+  testthat::expect_identical(output_dt, expected_dt)
+})
+
+# Rank order missing some categorical factor levels -----------------------
+
+# same input as above but with method 'C' & 'D' added to input
+
+# set up test input data.table
+input_dt <- CJ(
+  location = "USA", year = c(2000, 2010), age_start = seq(0, 80, 5),
+  report = c(2015, 2020), # numeric variable to prioritize with
+  method = c("A", "B", "C", "D"), # categorical variable to prioritize with
+  value = 1
+)
+id_cols <- setdiff(names(input_dt), "value")
+setkeyv(input_dt, id_cols)
+
+expected_dt <- copy(input_dt)
+expected_dt[method == "B" & report == 2020, priority := 1L]
+expected_dt[method == "B" & report == 2015, priority := 2L]
+expected_dt[method == "A" & report == 2020, priority := 3L]
+expected_dt[method == "A" & report == 2015, priority := 4L]
+expected_dt[method %in% c("C", "D"), priority := NA]
+setkeyv(expected_dt, c(id_cols, "priority"))
+
+
+testthat::test_that("prioritization of data works", {
+  output_dt <- prioritize_dt(
+    dt = input_dt,
+    rank_by_cols = c("location", "year"),
+    rank_order = rank_order,
+    quiet = TRUE
+  )
+  testthat::expect_identical(output_dt, expected_dt)
+})

From 2e8a93d023428a5c49c928911f1220790316d89c Mon Sep 17 00:00:00 2001
From: Charlton Callender <chacalle@uw.edu>
Date: Mon, 29 Mar 2021 13:57:55 -0700
Subject: [PATCH 2/3] clarify 'rank_order' documentation

---
 R/prioritize_dt.R    | 13 +++++++++----
 man/prioritize_dt.Rd | 13 +++++++++----
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/R/prioritize_dt.R b/R/prioritize_dt.R
index 51e2b07..0ccf5b6 100644
--- a/R/prioritize_dt.R
+++ b/R/prioritize_dt.R
@@ -6,10 +6,12 @@
 #'   Apply `rank_order` priorities to each unique combination of `rank_by_cols`
 #'   in `dt`.
 #' @param rank_order \[`list()`\]\cr
-#'   Priority order to use when ranking non-unique rows. Each element of
-#'   `rank_order` corresponds to a column in `dt`. Possible values for each
-#'   column are '1' (ascending), '-1' (descending) or ordered factor levels when
-#'   the column is not a numeric. See details for more information.
+#'   Named \[`list()`\] defining the priority order to use when ranking
+#'   non-unique rows. Each element of `rank_order` corresponds to a column in
+#'   `dt`, the prioritization is applied according to the order of elements in
+#'   `rank_order`. Possible values for each column are '1' (ascending), '-1'
+#'   (descending) or ordered factor levels when the column is not a numeric. See
+#'   details for more information.
 #' @param quiet \[`logical(1)`\]\cr
 #'   Whether to print out detailed messages/warnings about possible issues with
 #'   `rank_order`. Default is 'FALSE'.
@@ -30,6 +32,9 @@
 #' column are defined in the levels, the priority will be NA and a warning
 #' printed if `quiet = FALSE`.
 #'
+#' The order of elements in `rank_order` matters. The more important rules
+#' should be placed earlier in `rank_order` so that they are applied first.
+#'
 #' @examples
 #' # preliminary data with only total population
 #' dt_total <- data.table::CJ(
diff --git a/man/prioritize_dt.Rd b/man/prioritize_dt.Rd
index 9a9d1b4..a998c5f 100644
--- a/man/prioritize_dt.Rd
+++ b/man/prioritize_dt.Rd
@@ -15,10 +15,12 @@ Apply \code{rank_order} priorities to each unique combination of \code{rank_by_c
 in \code{dt}.}
 
 \item{rank_order}{[\code{list()}]\cr
-Priority order to use when ranking non-unique rows. Each element of
-\code{rank_order} corresponds to a column in \code{dt}. Possible values for each
-column are '1' (ascending), '-1' (descending) or ordered factor levels when
-the column is not a numeric. See details for more information.}
+Named [\code{list()}] defining the priority order to use when ranking
+non-unique rows. Each element of \code{rank_order} corresponds to a column in
+\code{dt}, the prioritization is applied according to the order of elements in
+\code{rank_order}. Possible values for each column are '1' (ascending), '-1'
+(descending) or ordered factor levels when the column is not a numeric. See
+details for more information.}
 
 \item{quiet}{[\code{logical(1)}]\cr
 Whether to print out detailed messages/warnings about possible issues with
@@ -45,6 +47,9 @@ first level having highest priority. When not all present values of the
 column are defined in the levels, the priority will be NA and a warning
 printed if \code{quiet = FALSE}.
 }
+
+The order of elements in \code{rank_order} matters. The more important rules
+should be placed earlier in \code{rank_order} so that they are applied first.
 }
 \examples{
 # preliminary data with only total population

From ff964534d19257b0d64c6beb936e63ff04d7ab80 Mon Sep 17 00:00:00 2001
From: Charlton Callender <chacalle@uw.edu>
Date: Mon, 29 Mar 2021 17:13:15 -0700
Subject: [PATCH 3/3] Add 'unique_id_cols' argument to help with assertions

---
 R/prioritize_dt.R                   | 76 ++++++++++++++++++++++++-----
 man/prioritize_dt.Rd                | 36 ++++++++++++--
 tests/testthat/test-prioritize_dt.R | 45 ++++++++++++++---
 3 files changed, 134 insertions(+), 23 deletions(-)

diff --git a/R/prioritize_dt.R b/R/prioritize_dt.R
index 0ccf5b6..6ad6a26 100644
--- a/R/prioritize_dt.R
+++ b/R/prioritize_dt.R
@@ -4,7 +4,11 @@
 #'   Data to determine rank priority for.
 #' @param rank_by_cols \[`character()`\]\cr
 #'   Apply `rank_order` priorities to each unique combination of `rank_by_cols`
-#'   in `dt`.
+#'   in `dt`. This should be equal to or a subset of `unique_id_cols`.
+#' @param unique_id_cols \[`character()`\]\cr
+#'   ID columns that once ranked by priority will uniquely identify rows of `dt`
+#'   in combination with the priority column. This should be a superset of
+#'   `rank_by_cols`. Default is equal to `rank_by_cols`.
 #' @param rank_order \[`list()`\]\cr
 #'   Named \[`list()`\] defining the priority order to use when ranking
 #'   non-unique rows. Each element of `rank_order` corresponds to a column in
@@ -12,9 +16,19 @@
 #'   `rank_order`. Possible values for each column are '1' (ascending), '-1'
 #'   (descending) or ordered factor levels when the column is not a numeric. See
 #'   details for more information.
-#' @param quiet \[`logical(1)`\]\cr
-#'   Whether to print out detailed messages/warnings about possible issues with
-#'   `rank_order`. Default is 'FALSE'.
+#' @param warn_missing_levels \[`logical(1)`\]\cr
+#'   Whether to warn about missing levels for elements of `rank_order` or throw
+#'   error. Default is 'FALSE' and errors out if there are missing levels.
+#' @param warn_non_unique_priority \[`logical(1)`\]\cr
+#'   Whether to warn about specified `rank_by_cols` & `rank_order` leading to
+#'   non-unique rows of `dt` after generating 'priority' column. Default is
+#'   'FALSE' and errors out if there are non-unique rows.
+#' @param check_top_priority_unique_only \[`logical(1)`\]\cr
+#'   When checking for non-unique rows of `dt` after generating the 'priority'
+#'   column with the `rank_by_cols` & names of `rank_order`, only check the
+#'   priority=1 rows. This is useful when specified `rank_order` levels are not
+#'   exhaustive leading to 'NA' priorities for some rows. Default if 'FALSE' and
+#'   errors out if there are any non-unique rows.
 #'
 #' @return `dt` with a new 'priority' column generated using the rules specified
 #'   in `rank_order`. 'priority' equal to 1 is the highest priority
@@ -57,6 +71,7 @@
 #' output_dt <- prioritize_dt(
 #'   dt = input_dt,
 #'   rank_by_cols = c("location", "year"),
+#'   unique_id_cols = c("location", "year", "age_start", "age_end"),
 #'   rank_order = list(
 #'     method = c("de facto", "de jure"), # prioritize 'de facto' sources highest
 #'     n_age_groups = -1 # prioritize sources with more age groups
@@ -64,19 +79,34 @@
 #' )
 #'
 #' @export
-prioritize_dt <- function(dt, rank_by_cols, rank_order, quiet = FALSE) {
+prioritize_dt <- function(dt,
+                          rank_by_cols,
+                          unique_id_cols = rank_by_cols,
+                          rank_order,
+                          warn_missing_levels = FALSE,
+                          warn_non_unique_priority = FALSE,
+                          check_top_priority_unique_only = FALSE) {
 
   # validate inputs ---------------------------------------------------------
 
+  checkmate::assert_logical(warn_missing_levels, len = 1)
+  checkmate::assert_logical(warn_non_unique_priority, len = 1)
+  checkmate::assert_logical(check_top_priority_unique_only, len = 1)
+
   checkmate::assert_data_table(dt)
-  checkmate::assert_logical(quiet, len = 1)
+
+  checkmate::assert_character(unique_id_cols)
+  checkmate::assert_names(names(dt), must.include = unique_id_cols)
 
   checkmate::assert_character(rank_by_cols)
   checkmate::assert_names(names(dt), must.include = rank_by_cols)
+  checkmate::assert_subset(rank_by_cols, choices = unique_id_cols)
 
   checkmate::assert_list(rank_order)
   checkmate::assert_names(names(dt), must.include = names(rank_order))
 
+  checkmate::assert_disjunct(names(rank_order), rank_by_cols)
+
   rank_order <- copy(rank_order)
   original_col_order <- names(dt)
   original_keys <- key(dt)
@@ -95,12 +125,17 @@ prioritize_dt <- function(dt, rank_by_cols, rank_order, quiet = FALSE) {
 
       # check for non-defined levels for the categorical column
       other_levels <- setdiff(unique(priority_dt[[col]]), col_levels)
-      if (!quiet) {
-        warning(
-        "'", col, "' `rank_order` is missing levels, the priority for these levels will be 'NA'\n",
-        "\t- defined levels: ", paste(col_levels, collapse = ","), "\n",
-        "\t- missing levels: ", paste(other_levels, collapse = ",")
+      if (length(other_levels) > 0) {
+        msg <- paste0(
+          "'", col, "' `rank_order` is missing levels, the priority for these levels will be 'NA'\n",
+          "\t- defined levels: ", paste(col_levels, collapse = ","), "\n",
+          "\t- missing levels: ", paste(other_levels, collapse = ",")
         )
+        if (warn_missing_levels) {
+          warning(msg)
+        } else {
+          stop(msg)
+        }
       }
 
       priority_dt[, c(col) := factor(get(col), levels = col_levels)]
@@ -120,6 +155,25 @@ prioritize_dt <- function(dt, rank_by_cols, rank_order, quiet = FALSE) {
   # add priority rank back onto original dataset
   dt <- merge(dt, priority_dt, by = setdiff(names(priority_dt), "priority"), all.x = TRUE)
 
+  # check
+  check_id_cols <- c(unique_id_cols, "priority")
+  check_dt <- dt
+  if (check_top_priority_unique_only) check_dt <- dt[priority == 1]
+  non_unique_dt <- demUtils::identify_non_unique_dt(check_dt, check_id_cols)
+  if (nrow(non_unique_dt) > 0) {
+    msg <- paste0(
+      "Specified `rank_by_cols`, `rank_order` & returned `priority` do not uniquely identify each row of `dt`.\n",
+      "\t- use `warn_non_unique_priority=TRUE` to return `dt` and run demUtils::identify_non_unique_dt\n",
+      "\t with `id_cols = c('", paste(check_id_cols, collapse = "', '"), "')`\n",
+      paste0(capture.output(non_unique_dt), collapse = "\n")
+    )
+    if (warn_non_unique_priority) {
+      warning(msg)
+    } else {
+      stop(msg)
+    }
+  }
+
   # format output
   data.table::setcolorder(dt, c(original_col_order, "priority"))
   if (is.null(original_keys)) {
diff --git a/man/prioritize_dt.Rd b/man/prioritize_dt.Rd
index a998c5f..22543a4 100644
--- a/man/prioritize_dt.Rd
+++ b/man/prioritize_dt.Rd
@@ -4,7 +4,15 @@
 \alias{prioritize_dt}
 \title{Rank non-unique rows in a data.table using defined priority orders}
 \usage{
-prioritize_dt(dt, rank_by_cols, rank_order, quiet = FALSE)
+prioritize_dt(
+  dt,
+  rank_by_cols,
+  unique_id_cols = rank_by_cols,
+  rank_order,
+  warn_missing_levels = FALSE,
+  warn_non_unique_priority = FALSE,
+  check_top_priority_unique_only = FALSE
+)
 }
 \arguments{
 \item{dt}{[\code{data.table()}]\cr
@@ -12,7 +20,12 @@ Data to determine rank priority for.}
 
 \item{rank_by_cols}{[\code{character()}]\cr
 Apply \code{rank_order} priorities to each unique combination of \code{rank_by_cols}
-in \code{dt}.}
+in \code{dt}. This should be equal to or a subset of \code{unique_id_cols}.}
+
+\item{unique_id_cols}{[\code{character()}]\cr
+ID columns that once ranked by priority will uniquely identify rows of \code{dt}
+in combination with the priority column. This should be a superset of
+\code{rank_by_cols}. Default is equal to \code{rank_by_cols}.}
 
 \item{rank_order}{[\code{list()}]\cr
 Named [\code{list()}] defining the priority order to use when ranking
@@ -22,9 +35,21 @@ non-unique rows. Each element of \code{rank_order} corresponds to a column in
 (descending) or ordered factor levels when the column is not a numeric. See
 details for more information.}
 
-\item{quiet}{[\code{logical(1)}]\cr
-Whether to print out detailed messages/warnings about possible issues with
-\code{rank_order}. Default is 'FALSE'.}
+\item{warn_missing_levels}{[\code{logical(1)}]\cr
+Whether to warn about missing levels for elements of \code{rank_order} or throw
+error. Default is 'FALSE' and errors out if there are missing levels.}
+
+\item{warn_non_unique_priority}{[\code{logical(1)}]\cr
+Whether to warn about specified \code{rank_by_cols} & \code{rank_order} leading to
+non-unique rows of \code{dt} after generating 'priority' column. Default is
+'FALSE' and errors out if there are non-unique rows.}
+
+\item{check_top_priority_unique_only}{[\code{logical(1)}]\cr
+When checking for non-unique rows of \code{dt} after generating the 'priority'
+column with the \code{rank_by_cols} & names of \code{rank_order}, only check the
+priority=1 rows. This is useful when specified \code{rank_order} levels are not
+exhaustive leading to 'NA' priorities for some rows. Default if 'FALSE' and
+errors out if there are any non-unique rows.}
 }
 \value{
 \code{dt} with a new 'priority' column generated using the rules specified
@@ -73,6 +98,7 @@ input_dt[, n_age_groups := .N, by = setdiff(names(input_dt), c("age_start", "age
 output_dt <- prioritize_dt(
   dt = input_dt,
   rank_by_cols = c("location", "year"),
+  unique_id_cols = c("location", "year", "age_start", "age_end"),
   rank_order = list(
     method = c("de facto", "de jure"), # prioritize 'de facto' sources highest
     n_age_groups = -1 # prioritize sources with more age groups
diff --git a/tests/testthat/test-prioritize_dt.R b/tests/testthat/test-prioritize_dt.R
index 23cf711..59119bb 100644
--- a/tests/testthat/test-prioritize_dt.R
+++ b/tests/testthat/test-prioritize_dt.R
@@ -30,11 +30,24 @@ testthat::test_that("prioritization of data works", {
   output_dt <- prioritize_dt(
     dt = input_dt,
     rank_by_cols = c("location", "year"),
-    rank_order = rank_order
+    unique_id_cols = c("location", "year", "age_start"),
+    rank_order = rank_order,
   )
   testthat::expect_identical(output_dt, expected_dt)
 })
 
+testthat::test_that("'prioritize_dt' catches missing rank order specification", {
+  testthat::expect_error(
+    prioritize_dt(
+      dt = input_dt,
+      rank_by_cols = c("location", "year"),
+      unique_id_cols = c("location", "year", "age_start"),
+      rank_order = rank_order["method"]
+    ),
+    regexp = "do not uniquely identify each row of"
+  )
+})
+
 # Rank order missing some categorical factor levels -----------------------
 
 # same input as above but with method 'C' & 'D' added to input
@@ -57,13 +70,31 @@ expected_dt[method == "A" & report == 2015, priority := 4L]
 expected_dt[method %in% c("C", "D"), priority := NA]
 setkeyv(expected_dt, c(id_cols, "priority"))
 
+testthat::test_that("prioritization of data works when missing levels", {
 
-testthat::test_that("prioritization of data works", {
-  output_dt <- prioritize_dt(
-    dt = input_dt,
-    rank_by_cols = c("location", "year"),
-    rank_order = rank_order,
-    quiet = TRUE
+  # expect error since multiple method/reports will have report NA
+  testthat::expect_error(
+    suppressWarnings(prioritize_dt(
+      dt = input_dt,
+      rank_by_cols = c("location", "year"),
+      unique_id_cols = c("location", "year", "age_start"),
+      rank_order = rank_order,
+      warn_missing_levels = TRUE,
+    )),
+    regexp = "do not uniquely identify each row of"
+  )
+
+  # now no error since only checking that top priority is unique
+  output_dt <- testthat::expect_warning(
+    prioritize_dt(
+      dt = input_dt,
+      rank_by_cols = c("location", "year"),
+      unique_id_cols = c("location", "year", "age_start"),
+      rank_order = rank_order,
+      warn_missing_levels = TRUE,
+      check_top_priority_unique_only = TRUE
+    ),
+    regexp = "'method' `rank_order` is missing levels"
   )
   testthat::expect_identical(output_dt, expected_dt)
 })