Add create_ functions to generate identifiers within use_ functions

new functions are `create_composite_id()`, `create_random_id()`, `create_sequential_id()`
AtlasOfLivingAustralia · Dec 9, 2024 · 48e5df4 · 48e5df4
1 parent 29eac6b
commit 48e5df4
Show file tree

Hide file tree

Showing 17 changed files with 370 additions and 184 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,13 +1,15 @@
 # Generated by roxygen2: do not edit by hand
 
 export(check_dataset)
+export(create_composite_id)
+export(create_random_id)
+export(create_sequential_id)
 export(suggest_workflow)
 export(use_abundance)
 export(use_collection)
 export(use_coordinates)
 export(use_datetime)
 export(use_events)
-export(use_id_random)
 export(use_individual_traits)
 export(use_locality)
 export(use_measurements)
@@ -57,7 +59,6 @@ importFrom(dplyr,group_by)
 importFrom(dplyr,group_split)
 importFrom(dplyr,join_by)
 importFrom(dplyr,mutate)
-importFrom(dplyr,n)
 importFrom(dplyr,pull)
 importFrom(dplyr,select)
 importFrom(dplyr,slice_head)
@@ -78,14 +79,21 @@ importFrom(purrr,keep)
 importFrom(purrr,map)
 importFrom(purrr,pluck)
 importFrom(rlang,abort)
+importFrom(rlang,as_label)
 importFrom(rlang,caller_env)
 importFrom(rlang,cnd_muffle)
 importFrom(rlang,enquos)
+importFrom(rlang,eval_tidy)
 importFrom(rlang,exec)
 importFrom(rlang,get_expr)
 importFrom(rlang,inform)
 importFrom(rlang,is_empty)
+importFrom(rlang,is_syntactic_literal)
+importFrom(rlang,quo_get_env)
+importFrom(rlang,quo_get_expr)
+importFrom(rlang,quo_is_call)
 importFrom(rlang,quo_is_null)
+importFrom(rlang,quo_is_symbol)
 importFrom(rlang,try_fetch)
 importFrom(rlang,warn)
 importFrom(rlang,zap)
@@ -99,5 +107,3 @@ importFrom(stringr,str_remove_all)
 importFrom(tibble,lst)
 importFrom(tidyr,replace_na)
 importFrom(tidyr,unnest)
-importFrom(uuid,UUIDgenerate)
-importFrom(uuid,as.UUID)
diff --git a/R/corella-package.R b/R/corella-package.R
@@ -51,9 +51,12 @@
 #'  The wrapper function for checking tibbles for Darwin Core compliance is
 #'  [check_dataset()]. It calls all internal check functions for checking data in columns with matching Darwin Core terms.
 #'
-#'  ** Add occurrence ID **
+#'  **Helper functions**
+#'  These functions are called within `use_` (or `mutate()` functions), and assist in common problems
 #'
-#'   * [use_id_random()] adds a valid uuid column to a tibble
+#'   * [create_composite_id()] Supply a combination of variables to concatenate into a unique identifier
+#'   * [create_sequential_id()] Create a unique identifier of sequential numbers
+#'   * [create_random_id()] Create a unique identifier using `UUID()`
 #'
 #' @keywords internal
 "_PACKAGE"
diff --git a/R/create_id.R b/R/create_id.R
@@ -0,0 +1,126 @@
+#' Create identifier columns
+#'
+#' Identifiers are columns that uniquely identify a single record within a
+#' dataset. These are helper functions, designed to make it easier to
+#' generate such columns from a given dataset. They are designed to be called
+#' within [use_events()], [use_occurrences()], or (equivalently)
+#' [dplyr::mutate()]. Generally speaking, it is best practice to use existing
+#' information from a dataset to generate identifiers; for this reason we
+#' recomment using `create_composite_id()` to aggregate existing fields, if no
+#' such composite is already present within the dataset. It is possible to call
+#' `create_sequential_id()` or `create_random_id()` within
+#' `create_composite_id()` to combine existing and new columns.
+#' @rdname create_id
+#' @param ... Zero or more variable names from the tibble being
+#' mutated (unquoted), and/or zero or more `create_` functions, separated by
+#' commas.
+#' @param sep Character used to separate field values. Defaults to `"-"`
+#' @returns An amended tibble, containing a field with the requested information.
+#' @examples
+#' library(tibble)
+#' df <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"),
+#'              basisOfRecord = "humanObservation",
+#'              site = rep(c("A01", "A02", "A03"), each = 5))
+#' df |>
+#'     use_occurrences(occurrenceID = create_composite_id(create_sequential_id(),
+#'                                                        site,
+#'                                                        eventDate))
+#' @order 1
+#' @export
+create_composite_id <- function(...,
+                              sep = "-"){
+  x <- enquos(...)
+  string_result <- purrr::map(x, switch_expr_type)
+  names(string_result) <- glue("V{seq_along(string_result)}")
+  string_result <- c(string_result, sep = sep)
+  do.call(paste, string_result)
+}
+
+#' Switch functions for quosures
+#' @param x A (single) quosure
+#' @importFrom rlang abort
+#' @importFrom rlang quo_get_expr
+#' @noRd
+#' @keywords internal
+switch_expr_type <- function(x){
+  switch(expr_type(x),
+         "symbol" = {parse_symbol(x)},
+         "call" = {eval_tidy(x)},
+         "literal" = {quo_get_expr(x)},
+         abort("Quosure type not recognised.")
+  )
+}
+
+#' Get type from quosures
+#' @param x A (single) quosure
+#' @importFrom rlang quo_is_symbol
+#' @importFrom rlang quo_is_call
+#' @importFrom rlang quo_get_expr
+#' @importFrom rlang is_syntactic_literal
+#' @noRd
+#' @keywords internal
+expr_type <- function(x){
+  if(quo_is_symbol(x)){
+    "symbol"
+  }else if(quo_is_call(x)){
+    "call"
+  }else if(is_syntactic_literal(quo_get_expr(x))){
+    "literal"
+  }else{
+    typeof(x)
+  }
+}
+
+#' Check whether symbols exist before they are parsed
+#' @param x A (single) quosure
+#' @importFrom rlang quo_get_expr
+#' @importFrom rlang quo_get_env
+#' @importFrom rlang eval_tidy
+#' @importFrom rlang as_label
+#' @noRd
+#' @keywords internal
+parse_symbol <- function(x){
+  if(exists(quo_get_expr(x), where = quo_get_env(x))){
+    result <- eval_tidy(x)
+    if(inherits(result, "function")){ # special case for functions like 'data'
+      as_label(x)                     # which exist in Global
+    }else{
+      result
+    }
+  }else{
+    as_label(x)
+  }
+}
+
+#' Internal function to parse a call
+#' @importFrom rlang eval_tidy
+#' @noRd
+#' @keywords internal
+parse_call <- function(x, ...){
+  eval_tidy(x)
+}
+
+#' @rdname create_id
+#' @param width (Integer) how many characters should the resulting string be?
+#' Defaults to one plus the order of magnitude of the largest number.
+#' @order 2
+#' @export
+create_sequential_id <- function(width){
+  row_count <- dplyr::n()
+  result <- seq_len(row_count)
+  max_digits <- max(floor(log10(result)) + 1)
+  if(missing(width)){
+    width <- max_digits + 1
+  }
+  formatC(result,
+          width = width,
+          format = "d",
+          flag = "0")
+}
+
+#' @rdname create_id
+#' @order 3
+#' @export
+create_random_id <- function(){
+  uuid::UUIDgenerate(use.time = TRUE, dplyr::n())
+}
diff --git a/R/data_functions.R b/R/data_functions.R
@@ -0,0 +1,17 @@
+#' Darwin Core terms
+#'
+#' Return a tibble of Darwin Core terms
+#' @noRd
+#' @keywords Internal
+darwin_core_terms <- function(){
+  dwc_terms
+}
+
+#' Country codes
+#'
+#' Return a tibble of valid country codes
+#' @noRd
+#' @keywords Internal
+country_codes <- function(){
+  country_codes
+}
diff --git a/R/use_events.R b/R/use_events.R
@@ -22,9 +22,11 @@
 #' @param parentEventID The parent event under which one or more Events sit
 #' within.
 #' @param .keep Control which columns from .data are retained in the output.
-#' Note that unlike `dplyr::mutate`, which defaults to `"all"` this defaults to
-#' `"unused"`; i.e. only keeps Darwin Core fields, and not those fields used to
-#' generate them.
+#' Note that unlike most other `use_` functions in `corella`, this defaults to
+#' `"all"` (i.e. same behavior as `dplyr::mutate`). This is because it is common
+#' to create composite indicators from other columns (via
+#' `create_composite_id()`), and deleting these columns by default is typically
+#' unwise.
 #' @returns A tibble with the requested fields added.
 #' @details
 #' Each Event requires a unique `eventID` and `eventType` (because there can
@@ -62,7 +64,7 @@ use_events <- function(
     eventID = NULL,
     eventType = NULL,
     parentEventID = NULL,
-    .keep = "unused"
+    .keep = "all"
 ){
   if(missing(.df)){
     abort(".df is missing, with no default")

diff --git a/R/use_occurrences.R b/R/use_occurrences.R
@@ -20,9 +20,11 @@
 #' * `"humanObservation"`, `"machineObservation"`, `"livingSpecimen"`,
 #' `"preservedSpecimen"`, `"fossilSpecimen"`, `"materialCitation"`
 #' @param .keep Control which columns from .data are retained in the output.
-#' Note that unlike `dplyr::mutate`, which defaults to `"all"` this defaults to
-#' `"unused"`, which only keeps Darwin Core fields and not those fields used to
-#' generate them.
+#' Note that unlike most other `use_` functions in `corella`, this defaults to
+#' `"all"` (i.e. same behavior as `dplyr::mutate`). This is because it is common
+#' to create composite indicators from other columns (via
+#' `create_composite_id()`), and deleting these columns by default is typically
+#' unwise.
 #' @returns A tibble with the requested fields added.
 #' @details
 #' Examples of `occurrenceID` values:
@@ -43,7 +45,7 @@ use_occurrences <- function(
     basisOfRecord = NULL,
     occurrenceStatus = NULL,
     # recordNumber = NULL, # keep?
-    .keep = "unused",
+    .keep = "all",
     .messages = TRUE
 ){
   if(missing(.df)){
@@ -72,21 +74,6 @@ use_occurrences <- function(
       purrr::keep(!names(fn_quos) %in% names(which(null_col_exists_in_df)))
   }
 
-  # if used in occurrenceID, run `use_id_random()`
-  mc <- match.call(expand.dots = FALSE)
-
-  if(!is.null(mc$occurrenceID)) {
-    if(mc$occurrenceID == "use_id_random()") {
-
-      check_uuid_exists(.df)
-
-      result <- .df |>
-        mutate(
-          occurrenceID = use_id_random()
-        )
-    }
-  }
-
   # Update df
   result <- .df |>
     mutate(!!!fn_quos,
@@ -106,88 +93,13 @@ use_occurrences <- function(
   }
 
   # run column checks
-  check_basisOfRecord(result, level = "abort")
   check_occurrenceID(result, level = "abort")
+  check_basisOfRecord(result, level = "abort")
   check_occurrenceStatus(result, level = "abort")
 
   return(result)
 }
 
-#' Check whether a UUID column is already present in a dataset
-#'
-#' @param df Data frame or tibble passed by user
-#' @param level what action should the function take for non-conformance?
-#' Defaults to `"inform"`.
-#' @importFrom uuid as.UUID
-#' @importFrom purrr map
-#' @keywords Internal
-#' @noRd
-check_uuid_exists <- function(df,
-                              # level = c("inform", "warn", "abort"),
-                              call = caller_env()
-){
-  # get first sample of df values, test whether any are UUIDs
-  df_test_uuid <- purrr::map(head(df, 10L), uuid::as.UUID) |> bind_rows()
-
-
-  if(any(!is.na(df_test_uuid))) {
-
-    uuid_cols <- df_test_uuid[sapply(df_test_uuid, function(x) any(!is.na(x)))] |>
-      names()
-
-    bullets <- c(
-      "Column {.field {uuid_cols}} contains UUID values.",
-      i = "Existing unique ID values should be used if they have already been generated.",
-      i = "Use `use_occurrences(occurrenceID = {.field {uuid_cols}})` instead."
-    ) |>
-      cli::cli_bullets() |>
-      cli::cli_fmt()
-
-    cli::cli_abort(bullets, call = call)
-  }
-}
-
-#' Create a random identifier column
-#'
-#' @description
-#' Uses `uuid::UUIDgenerate()` to create a random UUID code without the possible
-#' shortfalls of being influenced by R's internal random number generators
-#' (i.e., set.seed).
-#'
-#' @param x A vector
-#' @importFrom uuid UUIDgenerate
-#' @importFrom dplyr n
-#' @export
-use_id_random <- function(x) {
-  if(missing(x)) {
-    uuid::UUIDgenerate(use.time = TRUE, dplyr::n())
-  } else {
-    cli_abort("{.code use_id_random()} must be used in `use_occurrences()`.")
-    # vctrs::vec_rank(x, ties = "sequential", incomplete = "na")
-  }
-}
-
-#' Create a composite identifier from two or more columns, separated by a colon.
-#' FIXME: This function doesn't not work at the moment.
-#' TODO: Question:: Would a composite identifier column be called occurrenceID?
-#' Should be globally unique - it may be necessary to still add a random number?
-#' Depends on what columns are used to build it.
-#' @param cols character vector of columns to use
-#' @keywords Internal
-#' @noRd
-use_id_composite <- function(data,
-                             cols = NULL) {
-  # TODO: check if the columns specified are contained / found in the data
-  # if not, warning and check for spelling etc.
-  # This method assumes string values - unchecked with other column types
-  concatenated_values <- apply(data[cols], 1, function(row) {
-    gsub(" ", "", tolower(paste(row, collapse = ":")))
-  })
-  # TODO: For now just adding a sequential numeric value
-  data$occurrenceID <- paste0(concatenated_values, ":", 1:nrow(data))
-  return(invisible(data))
-}
-
 #' check basisOfRecord
 #' @param level what action should the function take for non-conformance?
 #' Defaults to `"inform"`.

diff --git a/README.Rmd b/README.Rmd
@@ -25,7 +25,7 @@ knitr::opts_chunk$set(
 `corella` is an R package that helps users standardize their data using the 
 [*Darwin Core*](https://dwc.tdwg.org) data standard, used for biodiversity data like species occurrences. `corella` provides tools to prepare, manipulate and validate data against the standard's criteria. Once standardized, data can be subsequently shared as a [*Darwin Core Archive*](https://ipt.gbif.org/manual/en/ipt/latest/dwca-guide#what-is-darwin-core-archive-dwc-a) and published to open data infrastructures like the [Atlas of Living Australia](https://www.ala.org.au) and [GBIF](https://www.gbif.org/). 
 
-`corella` was built, and is maintained, by the [Science & Decision Support Team](https://labs.ala.org.au) at the [Atlas of Living Australia](https://www.ala.org.au) (ALA). It is named for the Little Corella ([_Cacatua sanguinea_](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)). The logo was designed by [Dax Kellie](https://daxkellie.com/)
+`corella` was built, and is maintained, by the [Science & Decision Support Team](https://labs.ala.org.au) at the [Atlas of Living Australia](https://www.ala.org.au) (ALA). It is named for the Little Corella ([_Cacatua sanguinea_](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)). The logo was designed by [Dax Kellie](https://daxkellie.com/).
 
 If you have any comments, questions or suggestions, please [contact us](mailto:[email protected]).
 

diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@ Support Team](https://labs.ala.org.au) at the [Atlas of Living
 Australia](https://www.ala.org.au) (ALA). It is named for the Little
 Corella ([*Cacatua
 sanguinea*](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)).
-The logo was designed by [Dax Kellie](https://daxkellie.com/)
+The logo was designed by [Dax Kellie](https://daxkellie.com/).
 
 If you have any comments, questions or suggestions, please [contact
 us](mailto:[email protected]).
@@ -149,7 +149,7 @@ df |>
 #> ℹ Testing data
 #> ✔ | E P | Column
 #> ⠙ | 0 eventDate
-#> ✔ | 1 ✖ | eventDate  [207ms]
+#> ⠹ | 1 ✖ | eventDate ✔ | 1 ✖ | eventDate  [71ms]
 #> ══ Results ═════════════════════════════════════════════════════════════════════
 #> 
 #> [ Errors: 1 | Pass: 0 ]