Skip to content

Commit

Permalink
Add create_ functions to generate identifiers within use_ functions
Browse files Browse the repository at this point in the history
new functions are `create_composite_id()`, `create_random_id()`, `create_sequential_id()`
  • Loading branch information
mjwestgate committed Dec 9, 2024
1 parent 29eac6b commit 48e5df4
Show file tree
Hide file tree
Showing 17 changed files with 370 additions and 184 deletions.
14 changes: 10 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# Generated by roxygen2: do not edit by hand

export(check_dataset)
export(create_composite_id)
export(create_random_id)
export(create_sequential_id)
export(suggest_workflow)
export(use_abundance)
export(use_collection)
export(use_coordinates)
export(use_datetime)
export(use_events)
export(use_id_random)
export(use_individual_traits)
export(use_locality)
export(use_measurements)
Expand Down Expand Up @@ -57,7 +59,6 @@ importFrom(dplyr,group_by)
importFrom(dplyr,group_split)
importFrom(dplyr,join_by)
importFrom(dplyr,mutate)
importFrom(dplyr,n)
importFrom(dplyr,pull)
importFrom(dplyr,select)
importFrom(dplyr,slice_head)
Expand All @@ -78,14 +79,21 @@ importFrom(purrr,keep)
importFrom(purrr,map)
importFrom(purrr,pluck)
importFrom(rlang,abort)
importFrom(rlang,as_label)
importFrom(rlang,caller_env)
importFrom(rlang,cnd_muffle)
importFrom(rlang,enquos)
importFrom(rlang,eval_tidy)
importFrom(rlang,exec)
importFrom(rlang,get_expr)
importFrom(rlang,inform)
importFrom(rlang,is_empty)
importFrom(rlang,is_syntactic_literal)
importFrom(rlang,quo_get_env)
importFrom(rlang,quo_get_expr)
importFrom(rlang,quo_is_call)
importFrom(rlang,quo_is_null)
importFrom(rlang,quo_is_symbol)
importFrom(rlang,try_fetch)
importFrom(rlang,warn)
importFrom(rlang,zap)
Expand All @@ -99,5 +107,3 @@ importFrom(stringr,str_remove_all)
importFrom(tibble,lst)
importFrom(tidyr,replace_na)
importFrom(tidyr,unnest)
importFrom(uuid,UUIDgenerate)
importFrom(uuid,as.UUID)
7 changes: 5 additions & 2 deletions R/corella-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,12 @@
#' The wrapper function for checking tibbles for Darwin Core compliance is
#' [check_dataset()]. It calls all internal check functions for checking data in columns with matching Darwin Core terms.
#'
#' ** Add occurrence ID **
#' **Helper functions**
#' These functions are called within `use_` (or `mutate()` functions), and assist in common problems
#'
#' * [use_id_random()] adds a valid uuid column to a tibble
#' * [create_composite_id()] Supply a combination of variables to concatenate into a unique identifier
#' * [create_sequential_id()] Create a unique identifier of sequential numbers
#' * [create_random_id()] Create a unique identifier using `UUID()`
#'
#' @keywords internal
"_PACKAGE"
126 changes: 126 additions & 0 deletions R/create_id.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#' Create identifier columns
#'
#' Identifiers are columns that uniquely identify a single record within a
#' dataset. These are helper functions, designed to make it easier to
#' generate such columns from a given dataset. They are designed to be called
#' within [use_events()], [use_occurrences()], or (equivalently)
#' [dplyr::mutate()]. Generally speaking, it is best practice to use existing
#' information from a dataset to generate identifiers; for this reason we
#' recomment using `create_composite_id()` to aggregate existing fields, if no
#' such composite is already present within the dataset. It is possible to call
#' `create_sequential_id()` or `create_random_id()` within
#' `create_composite_id()` to combine existing and new columns.
#' @rdname create_id
#' @param ... Zero or more variable names from the tibble being
#' mutated (unquoted), and/or zero or more `create_` functions, separated by
#' commas.
#' @param sep Character used to separate field values. Defaults to `"-"`
#' @returns An amended tibble, containing a field with the requested information.
#' @examples
#' library(tibble)
#' df <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"),
#' basisOfRecord = "humanObservation",
#' site = rep(c("A01", "A02", "A03"), each = 5))
#' df |>
#' use_occurrences(occurrenceID = create_composite_id(create_sequential_id(),
#' site,
#' eventDate))
#' @order 1
#' @export
create_composite_id <- function(...,
sep = "-"){
x <- enquos(...)
string_result <- purrr::map(x, switch_expr_type)
names(string_result) <- glue("V{seq_along(string_result)}")
string_result <- c(string_result, sep = sep)
do.call(paste, string_result)
}

#' Switch functions for quosures
#' @param x A (single) quosure
#' @importFrom rlang abort
#' @importFrom rlang quo_get_expr
#' @noRd
#' @keywords internal
switch_expr_type <- function(x){
switch(expr_type(x),
"symbol" = {parse_symbol(x)},
"call" = {eval_tidy(x)},
"literal" = {quo_get_expr(x)},
abort("Quosure type not recognised.")
)
}

#' Get type from quosures
#' @param x A (single) quosure
#' @importFrom rlang quo_is_symbol
#' @importFrom rlang quo_is_call
#' @importFrom rlang quo_get_expr
#' @importFrom rlang is_syntactic_literal
#' @noRd
#' @keywords internal
expr_type <- function(x){
if(quo_is_symbol(x)){
"symbol"
}else if(quo_is_call(x)){
"call"
}else if(is_syntactic_literal(quo_get_expr(x))){
"literal"
}else{
typeof(x)
}
}

#' Check whether symbols exist before they are parsed
#' @param x A (single) quosure
#' @importFrom rlang quo_get_expr
#' @importFrom rlang quo_get_env
#' @importFrom rlang eval_tidy
#' @importFrom rlang as_label
#' @noRd
#' @keywords internal
parse_symbol <- function(x){
if(exists(quo_get_expr(x), where = quo_get_env(x))){
result <- eval_tidy(x)
if(inherits(result, "function")){ # special case for functions like 'data'
as_label(x) # which exist in Global
}else{
result
}
}else{
as_label(x)
}
}

#' Internal function to parse a call
#' @importFrom rlang eval_tidy
#' @noRd
#' @keywords internal
parse_call <- function(x, ...){
eval_tidy(x)
}

#' @rdname create_id
#' @param width (Integer) how many characters should the resulting string be?
#' Defaults to one plus the order of magnitude of the largest number.
#' @order 2
#' @export
create_sequential_id <- function(width){
row_count <- dplyr::n()
result <- seq_len(row_count)
max_digits <- max(floor(log10(result)) + 1)
if(missing(width)){
width <- max_digits + 1
}
formatC(result,
width = width,
format = "d",
flag = "0")
}

#' @rdname create_id
#' @order 3
#' @export
create_random_id <- function(){
uuid::UUIDgenerate(use.time = TRUE, dplyr::n())
}
17 changes: 17 additions & 0 deletions R/data_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#' Darwin Core terms
#'
#' Return a tibble of Darwin Core terms
#' @noRd
#' @keywords Internal
darwin_core_terms <- function(){
dwc_terms
}

#' Country codes
#'
#' Return a tibble of valid country codes
#' @noRd
#' @keywords Internal
country_codes <- function(){
country_codes
}
10 changes: 6 additions & 4 deletions R/use_events.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
#' @param parentEventID The parent event under which one or more Events sit
#' within.
#' @param .keep Control which columns from .data are retained in the output.
#' Note that unlike `dplyr::mutate`, which defaults to `"all"` this defaults to
#' `"unused"`; i.e. only keeps Darwin Core fields, and not those fields used to
#' generate them.
#' Note that unlike most other `use_` functions in `corella`, this defaults to
#' `"all"` (i.e. same behavior as `dplyr::mutate`). This is because it is common
#' to create composite indicators from other columns (via
#' `create_composite_id()`), and deleting these columns by default is typically
#' unwise.
#' @returns A tibble with the requested fields added.
#' @details
#' Each Event requires a unique `eventID` and `eventType` (because there can
Expand Down Expand Up @@ -62,7 +64,7 @@ use_events <- function(
eventID = NULL,
eventType = NULL,
parentEventID = NULL,
.keep = "unused"
.keep = "all"
){
if(missing(.df)){
abort(".df is missing, with no default")
Expand Down
102 changes: 7 additions & 95 deletions R/use_occurrences.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@
#' * `"humanObservation"`, `"machineObservation"`, `"livingSpecimen"`,
#' `"preservedSpecimen"`, `"fossilSpecimen"`, `"materialCitation"`
#' @param .keep Control which columns from .data are retained in the output.
#' Note that unlike `dplyr::mutate`, which defaults to `"all"` this defaults to
#' `"unused"`, which only keeps Darwin Core fields and not those fields used to
#' generate them.
#' Note that unlike most other `use_` functions in `corella`, this defaults to
#' `"all"` (i.e. same behavior as `dplyr::mutate`). This is because it is common
#' to create composite indicators from other columns (via
#' `create_composite_id()`), and deleting these columns by default is typically
#' unwise.
#' @returns A tibble with the requested fields added.
#' @details
#' Examples of `occurrenceID` values:
Expand All @@ -43,7 +45,7 @@ use_occurrences <- function(
basisOfRecord = NULL,
occurrenceStatus = NULL,
# recordNumber = NULL, # keep?
.keep = "unused",
.keep = "all",
.messages = TRUE
){
if(missing(.df)){
Expand Down Expand Up @@ -72,21 +74,6 @@ use_occurrences <- function(
purrr::keep(!names(fn_quos) %in% names(which(null_col_exists_in_df)))
}

# if used in occurrenceID, run `use_id_random()`
mc <- match.call(expand.dots = FALSE)

if(!is.null(mc$occurrenceID)) {
if(mc$occurrenceID == "use_id_random()") {

check_uuid_exists(.df)

result <- .df |>
mutate(
occurrenceID = use_id_random()
)
}
}

# Update df
result <- .df |>
mutate(!!!fn_quos,
Expand All @@ -106,88 +93,13 @@ use_occurrences <- function(
}

# run column checks
check_basisOfRecord(result, level = "abort")
check_occurrenceID(result, level = "abort")
check_basisOfRecord(result, level = "abort")
check_occurrenceStatus(result, level = "abort")

return(result)
}

#' Check whether a UUID column is already present in a dataset
#'
#' @param df Data frame or tibble passed by user
#' @param level what action should the function take for non-conformance?
#' Defaults to `"inform"`.
#' @importFrom uuid as.UUID
#' @importFrom purrr map
#' @keywords Internal
#' @noRd
check_uuid_exists <- function(df,
# level = c("inform", "warn", "abort"),
call = caller_env()
){
# get first sample of df values, test whether any are UUIDs
df_test_uuid <- purrr::map(head(df, 10L), uuid::as.UUID) |> bind_rows()


if(any(!is.na(df_test_uuid))) {

uuid_cols <- df_test_uuid[sapply(df_test_uuid, function(x) any(!is.na(x)))] |>
names()

bullets <- c(
"Column {.field {uuid_cols}} contains UUID values.",
i = "Existing unique ID values should be used if they have already been generated.",
i = "Use `use_occurrences(occurrenceID = {.field {uuid_cols}})` instead."
) |>
cli::cli_bullets() |>
cli::cli_fmt()

cli::cli_abort(bullets, call = call)
}
}

#' Create a random identifier column
#'
#' @description
#' Uses `uuid::UUIDgenerate()` to create a random UUID code without the possible
#' shortfalls of being influenced by R's internal random number generators
#' (i.e., set.seed).
#'
#' @param x A vector
#' @importFrom uuid UUIDgenerate
#' @importFrom dplyr n
#' @export
use_id_random <- function(x) {
if(missing(x)) {
uuid::UUIDgenerate(use.time = TRUE, dplyr::n())
} else {
cli_abort("{.code use_id_random()} must be used in `use_occurrences()`.")
# vctrs::vec_rank(x, ties = "sequential", incomplete = "na")
}
}

#' Create a composite identifier from two or more columns, separated by a colon.
#' FIXME: This function doesn't not work at the moment.
#' TODO: Question:: Would a composite identifier column be called occurrenceID?
#' Should be globally unique - it may be necessary to still add a random number?
#' Depends on what columns are used to build it.
#' @param cols character vector of columns to use
#' @keywords Internal
#' @noRd
use_id_composite <- function(data,
cols = NULL) {
# TODO: check if the columns specified are contained / found in the data
# if not, warning and check for spelling etc.
# This method assumes string values - unchecked with other column types
concatenated_values <- apply(data[cols], 1, function(row) {
gsub(" ", "", tolower(paste(row, collapse = ":")))
})
# TODO: For now just adding a sequential numeric value
data$occurrenceID <- paste0(concatenated_values, ":", 1:nrow(data))
return(invisible(data))
}

#' check basisOfRecord
#' @param level what action should the function take for non-conformance?
#' Defaults to `"inform"`.
Expand Down
2 changes: 1 addition & 1 deletion README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ knitr::opts_chunk$set(
`corella` is an R package that helps users standardize their data using the
[*Darwin Core*](https://dwc.tdwg.org) data standard, used for biodiversity data like species occurrences. `corella` provides tools to prepare, manipulate and validate data against the standard's criteria. Once standardized, data can be subsequently shared as a [*Darwin Core Archive*](https://ipt.gbif.org/manual/en/ipt/latest/dwca-guide#what-is-darwin-core-archive-dwc-a) and published to open data infrastructures like the [Atlas of Living Australia](https://www.ala.org.au) and [GBIF](https://www.gbif.org/).

`corella` was built, and is maintained, by the [Science & Decision Support Team](https://labs.ala.org.au) at the [Atlas of Living Australia](https://www.ala.org.au) (ALA). It is named for the Little Corella ([_Cacatua sanguinea_](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)). The logo was designed by [Dax Kellie](https://daxkellie.com/)
`corella` was built, and is maintained, by the [Science & Decision Support Team](https://labs.ala.org.au) at the [Atlas of Living Australia](https://www.ala.org.au) (ALA). It is named for the Little Corella ([_Cacatua sanguinea_](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)). The logo was designed by [Dax Kellie](https://daxkellie.com/).

If you have any comments, questions or suggestions, please [contact us](mailto:[email protected]).

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Support Team](https://labs.ala.org.au) at the [Atlas of Living
Australia](https://www.ala.org.au) (ALA). It is named for the Little
Corella ([*Cacatua
sanguinea*](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)).
The logo was designed by [Dax Kellie](https://daxkellie.com/)
The logo was designed by [Dax Kellie](https://daxkellie.com/).

If you have any comments, questions or suggestions, please [contact
us](mailto:[email protected]).
Expand Down Expand Up @@ -149,7 +149,7 @@ df |>
#> ℹ Testing data
#> ✔ | E P | Column
#> ⠙ | 0 eventDate
#> ✔ | 1 ✖ | eventDate [207ms]
#> ⠹ | 1 ✖ | eventDate ✔ | 1 ✖ | eventDate [71ms]
#> ══ Results ═════════════════════════════════════════════════════════════════════
#>
#> [ Errors: 1 | Pass: 0 ]
Expand Down
Loading

0 comments on commit 48e5df4

Please sign in to comment.