diff --git a/NAMESPACE b/NAMESPACE index 1f404e1..fad4662 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,15 @@ # Generated by roxygen2: do not edit by hand export(check_dataset) +export(create_composite_id) +export(create_random_id) +export(create_sequential_id) export(suggest_workflow) export(use_abundance) export(use_collection) export(use_coordinates) export(use_datetime) export(use_events) -export(use_id_random) export(use_individual_traits) export(use_locality) export(use_measurements) @@ -57,7 +59,6 @@ importFrom(dplyr,group_by) importFrom(dplyr,group_split) importFrom(dplyr,join_by) importFrom(dplyr,mutate) -importFrom(dplyr,n) importFrom(dplyr,pull) importFrom(dplyr,select) importFrom(dplyr,slice_head) @@ -78,14 +79,21 @@ importFrom(purrr,keep) importFrom(purrr,map) importFrom(purrr,pluck) importFrom(rlang,abort) +importFrom(rlang,as_label) importFrom(rlang,caller_env) importFrom(rlang,cnd_muffle) importFrom(rlang,enquos) +importFrom(rlang,eval_tidy) importFrom(rlang,exec) importFrom(rlang,get_expr) importFrom(rlang,inform) importFrom(rlang,is_empty) +importFrom(rlang,is_syntactic_literal) +importFrom(rlang,quo_get_env) +importFrom(rlang,quo_get_expr) +importFrom(rlang,quo_is_call) importFrom(rlang,quo_is_null) +importFrom(rlang,quo_is_symbol) importFrom(rlang,try_fetch) importFrom(rlang,warn) importFrom(rlang,zap) @@ -99,5 +107,3 @@ importFrom(stringr,str_remove_all) importFrom(tibble,lst) importFrom(tidyr,replace_na) importFrom(tidyr,unnest) -importFrom(uuid,UUIDgenerate) -importFrom(uuid,as.UUID) diff --git a/R/corella-package.R b/R/corella-package.R index cc1e4f9..7072f9a 100644 --- a/R/corella-package.R +++ b/R/corella-package.R @@ -51,9 +51,12 @@ #' The wrapper function for checking tibbles for Darwin Core compliance is #' [check_dataset()]. It calls all internal check functions for checking data in columns with matching Darwin Core terms. #' -#' ** Add occurrence ID ** +#' **Helper functions** +#' These functions are called within `use_` (or `mutate()` functions), and assist in common problems #' -#' * [use_id_random()] adds a valid uuid column to a tibble +#' * [create_composite_id()] Supply a combination of variables to concatenate into a unique identifier +#' * [create_sequential_id()] Create a unique identifier of sequential numbers +#' * [create_random_id()] Create a unique identifier using `UUID()` #' #' @keywords internal "_PACKAGE" diff --git a/R/create_id.R b/R/create_id.R new file mode 100644 index 0000000..516de69 --- /dev/null +++ b/R/create_id.R @@ -0,0 +1,126 @@ +#' Create identifier columns +#' +#' Identifiers are columns that uniquely identify a single record within a +#' dataset. These are helper functions, designed to make it easier to +#' generate such columns from a given dataset. They are designed to be called +#' within [use_events()], [use_occurrences()], or (equivalently) +#' [dplyr::mutate()]. Generally speaking, it is best practice to use existing +#' information from a dataset to generate identifiers; for this reason we +#' recomment using `create_composite_id()` to aggregate existing fields, if no +#' such composite is already present within the dataset. It is possible to call +#' `create_sequential_id()` or `create_random_id()` within +#' `create_composite_id()` to combine existing and new columns. +#' @rdname create_id +#' @param ... Zero or more variable names from the tibble being +#' mutated (unquoted), and/or zero or more `create_` functions, separated by +#' commas. +#' @param sep Character used to separate field values. Defaults to `"-"` +#' @returns An amended tibble, containing a field with the requested information. +#' @examples +#' library(tibble) +#' df <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), +#' basisOfRecord = "humanObservation", +#' site = rep(c("A01", "A02", "A03"), each = 5)) +#' df |> +#' use_occurrences(occurrenceID = create_composite_id(create_sequential_id(), +#' site, +#' eventDate)) +#' @order 1 +#' @export +create_composite_id <- function(..., + sep = "-"){ + x <- enquos(...) + string_result <- purrr::map(x, switch_expr_type) + names(string_result) <- glue("V{seq_along(string_result)}") + string_result <- c(string_result, sep = sep) + do.call(paste, string_result) +} + +#' Switch functions for quosures +#' @param x A (single) quosure +#' @importFrom rlang abort +#' @importFrom rlang quo_get_expr +#' @noRd +#' @keywords internal +switch_expr_type <- function(x){ + switch(expr_type(x), + "symbol" = {parse_symbol(x)}, + "call" = {eval_tidy(x)}, + "literal" = {quo_get_expr(x)}, + abort("Quosure type not recognised.") + ) +} + +#' Get type from quosures +#' @param x A (single) quosure +#' @importFrom rlang quo_is_symbol +#' @importFrom rlang quo_is_call +#' @importFrom rlang quo_get_expr +#' @importFrom rlang is_syntactic_literal +#' @noRd +#' @keywords internal +expr_type <- function(x){ + if(quo_is_symbol(x)){ + "symbol" + }else if(quo_is_call(x)){ + "call" + }else if(is_syntactic_literal(quo_get_expr(x))){ + "literal" + }else{ + typeof(x) + } +} + +#' Check whether symbols exist before they are parsed +#' @param x A (single) quosure +#' @importFrom rlang quo_get_expr +#' @importFrom rlang quo_get_env +#' @importFrom rlang eval_tidy +#' @importFrom rlang as_label +#' @noRd +#' @keywords internal +parse_symbol <- function(x){ + if(exists(quo_get_expr(x), where = quo_get_env(x))){ + result <- eval_tidy(x) + if(inherits(result, "function")){ # special case for functions like 'data' + as_label(x) # which exist in Global + }else{ + result + } + }else{ + as_label(x) + } +} + +#' Internal function to parse a call +#' @importFrom rlang eval_tidy +#' @noRd +#' @keywords internal +parse_call <- function(x, ...){ + eval_tidy(x) +} + +#' @rdname create_id +#' @param width (Integer) how many characters should the resulting string be? +#' Defaults to one plus the order of magnitude of the largest number. +#' @order 2 +#' @export +create_sequential_id <- function(width){ + row_count <- dplyr::n() + result <- seq_len(row_count) + max_digits <- max(floor(log10(result)) + 1) + if(missing(width)){ + width <- max_digits + 1 + } + formatC(result, + width = width, + format = "d", + flag = "0") +} + +#' @rdname create_id +#' @order 3 +#' @export +create_random_id <- function(){ + uuid::UUIDgenerate(use.time = TRUE, dplyr::n()) +} diff --git a/R/data_functions.R b/R/data_functions.R new file mode 100644 index 0000000..c476d18 --- /dev/null +++ b/R/data_functions.R @@ -0,0 +1,17 @@ +#' Darwin Core terms +#' +#' Return a tibble of Darwin Core terms +#' @noRd +#' @keywords Internal +darwin_core_terms <- function(){ + dwc_terms +} + +#' Country codes +#' +#' Return a tibble of valid country codes +#' @noRd +#' @keywords Internal +country_codes <- function(){ + country_codes +} diff --git a/R/use_events.R b/R/use_events.R index 074ced7..3ba4ed1 100644 --- a/R/use_events.R +++ b/R/use_events.R @@ -22,9 +22,11 @@ #' @param parentEventID The parent event under which one or more Events sit #' within. #' @param .keep Control which columns from .data are retained in the output. -#' Note that unlike `dplyr::mutate`, which defaults to `"all"` this defaults to -#' `"unused"`; i.e. only keeps Darwin Core fields, and not those fields used to -#' generate them. +#' Note that unlike most other `use_` functions in `corella`, this defaults to +#' `"all"` (i.e. same behavior as `dplyr::mutate`). This is because it is common +#' to create composite indicators from other columns (via +#' `create_composite_id()`), and deleting these columns by default is typically +#' unwise. #' @returns A tibble with the requested fields added. #' @details #' Each Event requires a unique `eventID` and `eventType` (because there can @@ -62,7 +64,7 @@ use_events <- function( eventID = NULL, eventType = NULL, parentEventID = NULL, - .keep = "unused" + .keep = "all" ){ if(missing(.df)){ abort(".df is missing, with no default") diff --git a/R/use_occurrences.R b/R/use_occurrences.R index 5d85552..bce170e 100644 --- a/R/use_occurrences.R +++ b/R/use_occurrences.R @@ -20,9 +20,11 @@ #' * `"humanObservation"`, `"machineObservation"`, `"livingSpecimen"`, #' `"preservedSpecimen"`, `"fossilSpecimen"`, `"materialCitation"` #' @param .keep Control which columns from .data are retained in the output. -#' Note that unlike `dplyr::mutate`, which defaults to `"all"` this defaults to -#' `"unused"`, which only keeps Darwin Core fields and not those fields used to -#' generate them. +#' Note that unlike most other `use_` functions in `corella`, this defaults to +#' `"all"` (i.e. same behavior as `dplyr::mutate`). This is because it is common +#' to create composite indicators from other columns (via +#' `create_composite_id()`), and deleting these columns by default is typically +#' unwise. #' @returns A tibble with the requested fields added. #' @details #' Examples of `occurrenceID` values: @@ -43,7 +45,7 @@ use_occurrences <- function( basisOfRecord = NULL, occurrenceStatus = NULL, # recordNumber = NULL, # keep? - .keep = "unused", + .keep = "all", .messages = TRUE ){ if(missing(.df)){ @@ -72,21 +74,6 @@ use_occurrences <- function( purrr::keep(!names(fn_quos) %in% names(which(null_col_exists_in_df))) } - # if used in occurrenceID, run `use_id_random()` - mc <- match.call(expand.dots = FALSE) - - if(!is.null(mc$occurrenceID)) { - if(mc$occurrenceID == "use_id_random()") { - - check_uuid_exists(.df) - - result <- .df |> - mutate( - occurrenceID = use_id_random() - ) - } - } - # Update df result <- .df |> mutate(!!!fn_quos, @@ -106,88 +93,13 @@ use_occurrences <- function( } # run column checks - check_basisOfRecord(result, level = "abort") check_occurrenceID(result, level = "abort") + check_basisOfRecord(result, level = "abort") check_occurrenceStatus(result, level = "abort") return(result) } -#' Check whether a UUID column is already present in a dataset -#' -#' @param df Data frame or tibble passed by user -#' @param level what action should the function take for non-conformance? -#' Defaults to `"inform"`. -#' @importFrom uuid as.UUID -#' @importFrom purrr map -#' @keywords Internal -#' @noRd -check_uuid_exists <- function(df, - # level = c("inform", "warn", "abort"), - call = caller_env() -){ - # get first sample of df values, test whether any are UUIDs - df_test_uuid <- purrr::map(head(df, 10L), uuid::as.UUID) |> bind_rows() - - - if(any(!is.na(df_test_uuid))) { - - uuid_cols <- df_test_uuid[sapply(df_test_uuid, function(x) any(!is.na(x)))] |> - names() - - bullets <- c( - "Column {.field {uuid_cols}} contains UUID values.", - i = "Existing unique ID values should be used if they have already been generated.", - i = "Use `use_occurrences(occurrenceID = {.field {uuid_cols}})` instead." - ) |> - cli::cli_bullets() |> - cli::cli_fmt() - - cli::cli_abort(bullets, call = call) - } -} - -#' Create a random identifier column -#' -#' @description -#' Uses `uuid::UUIDgenerate()` to create a random UUID code without the possible -#' shortfalls of being influenced by R's internal random number generators -#' (i.e., set.seed). -#' -#' @param x A vector -#' @importFrom uuid UUIDgenerate -#' @importFrom dplyr n -#' @export -use_id_random <- function(x) { - if(missing(x)) { - uuid::UUIDgenerate(use.time = TRUE, dplyr::n()) - } else { - cli_abort("{.code use_id_random()} must be used in `use_occurrences()`.") - # vctrs::vec_rank(x, ties = "sequential", incomplete = "na") - } -} - -#' Create a composite identifier from two or more columns, separated by a colon. -#' FIXME: This function doesn't not work at the moment. -#' TODO: Question:: Would a composite identifier column be called occurrenceID? -#' Should be globally unique - it may be necessary to still add a random number? -#' Depends on what columns are used to build it. -#' @param cols character vector of columns to use -#' @keywords Internal -#' @noRd -use_id_composite <- function(data, - cols = NULL) { - # TODO: check if the columns specified are contained / found in the data - # if not, warning and check for spelling etc. - # This method assumes string values - unchecked with other column types - concatenated_values <- apply(data[cols], 1, function(row) { - gsub(" ", "", tolower(paste(row, collapse = ":"))) - }) - # TODO: For now just adding a sequential numeric value - data$occurrenceID <- paste0(concatenated_values, ":", 1:nrow(data)) - return(invisible(data)) -} - #' check basisOfRecord #' @param level what action should the function take for non-conformance? #' Defaults to `"inform"`. diff --git a/README.Rmd b/README.Rmd index 7eee354..eadf622 100644 --- a/README.Rmd +++ b/README.Rmd @@ -25,7 +25,7 @@ knitr::opts_chunk$set( `corella` is an R package that helps users standardize their data using the [*Darwin Core*](https://dwc.tdwg.org) data standard, used for biodiversity data like species occurrences. `corella` provides tools to prepare, manipulate and validate data against the standard's criteria. Once standardized, data can be subsequently shared as a [*Darwin Core Archive*](https://ipt.gbif.org/manual/en/ipt/latest/dwca-guide#what-is-darwin-core-archive-dwc-a) and published to open data infrastructures like the [Atlas of Living Australia](https://www.ala.org.au) and [GBIF](https://www.gbif.org/). -`corella` was built, and is maintained, by the [Science & Decision Support Team](https://labs.ala.org.au) at the [Atlas of Living Australia](https://www.ala.org.au) (ALA). It is named for the Little Corella ([_Cacatua sanguinea_](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)). The logo was designed by [Dax Kellie](https://daxkellie.com/) +`corella` was built, and is maintained, by the [Science & Decision Support Team](https://labs.ala.org.au) at the [Atlas of Living Australia](https://www.ala.org.au) (ALA). It is named for the Little Corella ([_Cacatua sanguinea_](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)). The logo was designed by [Dax Kellie](https://daxkellie.com/). If you have any comments, questions or suggestions, please [contact us](mailto:support@ala.org.au). diff --git a/README.md b/README.md index 54fb0ee..18cc79e 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Support Team](https://labs.ala.org.au) at the [Atlas of Living Australia](https://www.ala.org.au) (ALA). It is named for the Little Corella ([*Cacatua sanguinea*](https://bie.ala.org.au/species/https%3A//biodiversity.org.au/afd/taxa/34b31e86-7ade-4cba-960f-82a6ae586206)). -The logo was designed by [Dax Kellie](https://daxkellie.com/) +The logo was designed by [Dax Kellie](https://daxkellie.com/). If you have any comments, questions or suggestions, please [contact us](mailto:support@ala.org.au). @@ -149,7 +149,7 @@ df |> #> ℹ Testing data #> ✔ | E P | Column #> ⠙ | 0 eventDate -#> ✔ | 1 ✖ | eventDate [207ms] +#> ⠹ | 1 ✖ | eventDate ✔ | 1 ✖ | eventDate [71ms] #> ══ Results ═════════════════════════════════════════════════════════════════════ #> #> [ Errors: 1 | Pass: 0 ] diff --git a/_pkgdown.yml b/_pkgdown.yml index debd750..ff2a0aa 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -49,7 +49,7 @@ reference: - title: Run checks on a dataset for Darwin Core conformance contents: - check_dataset -- title: Add a unique ID +- title: Helper functions contents: - - use_id_random + - create_composite_id diff --git a/man/corella-package.Rd b/man/corella-package.Rd index 0a9956f..f6daf9c 100644 --- a/man/corella-package.Rd +++ b/man/corella-package.Rd @@ -56,9 +56,12 @@ Proposed: The wrapper function for checking tibbles for Darwin Core compliance is \code{\link[=check_dataset]{check_dataset()}}. It calls all internal check functions for checking data in columns with matching Darwin Core terms. -** Add occurrence ID ** +\strong{Helper functions} +These functions are called within \code{use_} (or \code{mutate()} functions), and assist in common problems \itemize{ -\item \code{\link[=use_id_random]{use_id_random()}} adds a valid uuid column to a tibble +\item \code{\link[=create_composite_id]{create_composite_id()}} Supply a combination of variables to concatenate into a unique identifier +\item \code{\link[=create_sequential_id]{create_sequential_id()}} Create a unique identifier of sequential numbers +\item \code{\link[=create_random_id]{create_random_id()}} Create a unique identifier using \code{UUID()} } } diff --git a/man/create_id.Rd b/man/create_id.Rd new file mode 100644 index 0000000..a9636fb --- /dev/null +++ b/man/create_id.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_id.R +\name{create_composite_id} +\alias{create_composite_id} +\alias{create_sequential_id} +\alias{create_random_id} +\title{Create identifier columns} +\usage{ +create_composite_id(..., sep = "-") + +create_sequential_id(width) + +create_random_id() +} +\arguments{ +\item{...}{Zero or more variable names from the tibble being +mutated (unquoted), and/or zero or more \code{create_} functions, separated by +commas.} + +\item{sep}{Character used to separate field values. Defaults to \code{"-"}} + +\item{width}{(Integer) how many characters should the resulting string be? +Defaults to one plus the order of magnitude of the largest number.} +} +\value{ +An amended tibble, containing a field with the requested information. +} +\description{ +Identifiers are columns that uniquely identify a single record within a +dataset. These are helper functions, designed to make it easier to +generate such columns from a given dataset. They are designed to be called +within \code{\link[=use_events]{use_events()}}, \code{\link[=use_occurrences]{use_occurrences()}}, or (equivalently) +\code{\link[dplyr:mutate]{dplyr::mutate()}}. Generally speaking, it is best practice to use existing +information from a dataset to generate identifiers; for this reason we +recomment using \code{create_composite_id()} to aggregate existing fields, if no +such composite is already present within the dataset. It is possible to call +\code{create_sequential_id()} or \code{create_random_id()} within +\code{create_composite_id()} to combine existing and new columns. +} +\examples{ +library(tibble) +df <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) +df |> + use_occurrences(occurrenceID = create_composite_id(create_sequential_id(), + site, + eventDate)) +} diff --git a/man/use_events.Rd b/man/use_events.Rd index 193dec9..c651b91 100644 --- a/man/use_events.Rd +++ b/man/use_events.Rd @@ -9,7 +9,7 @@ use_events( eventID = NULL, eventType = NULL, parentEventID = NULL, - .keep = "unused" + .keep = "all" ) } \arguments{ @@ -21,9 +21,11 @@ use_events( within.} \item{.keep}{Control which columns from .data are retained in the output. -Note that unlike \code{dplyr::mutate}, which defaults to \code{"all"} this defaults to -\code{"unused"}; i.e. only keeps Darwin Core fields, and not those fields used to -generate them.} +Note that unlike most other \code{use_} functions in \code{corella}, this defaults to +\code{"all"} (i.e. same behavior as \code{dplyr::mutate}). This is because it is common +to create composite indicators from other columns (via +\code{create_composite_id()}), and deleting these columns by default is typically +unwise.} \item{df}{a \code{data.frame} or \code{tibble} that the column should be appended to.} } diff --git a/man/use_id_random.Rd b/man/use_id_random.Rd deleted file mode 100644 index 51f1838..0000000 --- a/man/use_id_random.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/use_occurrences.R -\name{use_id_random} -\alias{use_id_random} -\title{Create a random identifier column} -\usage{ -use_id_random(x) -} -\arguments{ -\item{x}{A vector} -} -\description{ -Uses \code{uuid::UUIDgenerate()} to create a random UUID code without the possible -shortfalls of being influenced by R's internal random number generators -(i.e., set.seed). -} diff --git a/man/use_occurrences.Rd b/man/use_occurrences.Rd index b08efe8..b20d4d3 100644 --- a/man/use_occurrences.Rd +++ b/man/use_occurrences.Rd @@ -9,7 +9,7 @@ use_occurrences( occurrenceID = NULL, basisOfRecord = NULL, occurrenceStatus = NULL, - .keep = "unused", + .keep = "all", .messages = TRUE ) } @@ -28,9 +28,11 @@ Accepted \code{basisOfRecord} values are one of: }} \item{.keep}{Control which columns from .data are retained in the output. -Note that unlike \code{dplyr::mutate}, which defaults to \code{"all"} this defaults to -\code{"unused"}, which only keeps Darwin Core fields and not those fields used to -generate them.} +Note that unlike most other \code{use_} functions in \code{corella}, this defaults to +\code{"all"} (i.e. same behavior as \code{dplyr::mutate}). This is because it is common +to create composite indicators from other columns (via +\code{create_composite_id()}), and deleting these columns by default is typically +unwise.} \item{df}{a \code{data.frame} or \code{tibble} that the column should be appended to.} } diff --git a/tests/testthat/test-use_events.R b/tests/testthat/test-use_events.R index 8849056..f4c594f 100644 --- a/tests/testthat/test-use_events.R +++ b/tests/testthat/test-use_events.R @@ -1,3 +1,64 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) +test_that("create_sequential_id() works with use_events()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages( + result <- input |> + use_events(eventID = create_sequential_id()) + ) + expect_equal(colnames(result), + c("eventDate", "basisOfRecord", "site", "eventID")) + expect_equal(as.integer(result$eventID), + seq_len(15)) + expect_true(all(nchar(result$eventID) == 3)) +}) + +test_that("create_sequential_id() accepts `width` argument works with use_events()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_events(eventID = create_sequential_id(width = 10)) + ) + expect_true(all(nchar(result$eventID) == 10)) +}) + +test_that("create_random_id() works with use_events()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_events(eventID = create_random_id()) + ) + expect_equal(colnames(result), + c("eventDate", "basisOfRecord", "site", "eventID")) + expect_equal(length(unique(result$eventID)), + nrow(result)) +}) + +test_that("create_composite_id() works with use_events()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_events(eventID = create_composite_id(site, eventDate)) + ) + expect_equal(colnames(result), + c("eventDate", "basisOfRecord", "site", "eventID")) + expect_equal(paste0(result$site, "-", result$eventDate), + result$eventID) +}) + +test_that("create_sequential_id() works within create_composite_id()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_events(eventID = create_composite_id(create_sequential_id(), + site, + eventDate)) + ) + expect_equal(colnames(result), + c("eventDate", "basisOfRecord", "site", "eventID")) + expect_true(all(grepl("^[[:digit:]]{3}-", result$eventID))) }) diff --git a/tests/testthat/test-use_occurrences.R b/tests/testthat/test-use_occurrences.R index 4c83edd..a15fdf5 100644 --- a/tests/testthat/test-use_occurrences.R +++ b/tests/testthat/test-use_occurrences.R @@ -53,51 +53,6 @@ test_that("use_occurrences has progress messages", { }) -test_that("use_occurrences handles `use_id_random()`", { - quiet_use_occurrences <- purrr::quietly(use_occurrences) - df <- tibble(basisOfRecord = "humanObservation", - col2 = 1:2) - - result <- df |> - quiet_use_occurrences(occurrenceID = use_id_random()) - - expect_s3_class(result$result, c("tbl_df", "tbl", "data.frame")) - expect_equal(colnames(result$result), c("basisOfRecord", "col2", "occurrenceID")) - expect_type(result$result$occurrenceID, "character") - expect_equal(nchar(result$result$occurrenceID), c(36, 36)) -}) - -test_that("use_id_random() generates unique UUID", { - quiet_use_occurrences <- purrr::quietly(use_occurrences) - df <- tibble( - basisOfRecord = "humanObservation", - col2 = 1:2 - ) - - result <- df |> - quiet_use_occurrences(occurrenceID = use_id_random()) - - # if any aren't UUIDs, they will return NA - uuid_check <- result |> - purrr::pluck("result") |> - select(occurrenceID) |> - purrr::map_dfr(uuid::as.UUID) - - expect_type(result$result$occurrenceID, "character") - expect_equal(nchar(result$result$occurrenceID), c(36, 36)) - expect_true(all(!is.na(uuid_check))) - expect_equal(length(unique(result$result$occurrenceID)), nrow(result$result)) -}) - -test_that("use_occurrences errors when UUID is already present in df", { - df <- tibble(basisOfRecord = "humanObservation", - id_col = uuid::UUIDgenerate()) - - expect_error(suppressMessages( - use_occurrences(df, occurrenceID = use_id_random())), - "Column id_col contains UUID values") -}) - test_that("use_occurrences only accepts valid values for basisOfRecord", { valid_values <- c("humanObservation", "machineObservation", "livingSpecimen", "preservedSpecimen", "fossilSpecimen", "materialCitation") @@ -146,3 +101,67 @@ test_that("use_occurrences checks occurrenceStatus format", { "Unexpected value in occurrenceStatus" ) }) + +test_that("create_sequential_id() works with use_occurrences()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_occurrences(occurrenceID = create_sequential_id()) + ) + expect_equal(colnames(result), + c("eventDate", "basisOfRecord", "site", "occurrenceID")) + expect_equal(as.integer(result$occurrenceID), + seq_len(15)) + expect_true(all(nchar(result$occurrenceID) == 3)) +}) + +test_that("create_sequential_id() accepts `width` argument with use_occurrences()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_occurrences(occurrenceID = create_sequential_id(width = 10)) + ) + expect_true(all(nchar(result$occurrenceID) == 10)) +}) + +test_that("create_random_id() works with use_occurrences()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_occurrences(occurrenceID = create_random_id()) + ) + expect_equal(colnames(result), + c("eventDate", "basisOfRecord", "site", "occurrenceID")) + expect_equal(length(unique(result$occurrenceID)), + nrow(result)) +}) + +test_that("create_composite_id() works with use_occurrences()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_occurrences(occurrenceID = create_composite_id(site, eventDate)) + ) + expect_equal(colnames(result), + c("eventDate", "basisOfRecord", "site", "occurrenceID")) + expect_equal(paste0(result$site, "-", result$eventDate), + result$occurrenceID) +}) + +test_that("create_sequential_id() works within create_composite_id()", { + input <- tibble(eventDate = paste0(rep(c(2020:2024), 3), "-01-01"), + basisOfRecord = "humanObservation", + site = rep(c("A01", "A02", "A03"), each = 5)) + suppressMessages(result <- input |> + use_occurrences(occurrenceID = create_composite_id(create_sequential_id(), + site, + eventDate)) + ) + expect_equal(colnames(result), + c("eventDate", "basisOfRecord", "site", "occurrenceID")) + expect_true(all(grepl("^[[:digit:]]{3}-", result$occurrenceID))) +}) diff --git a/vignettes/quick_start_guide.Rmd b/vignettes/quick_start_guide.Rmd index 24c1a61..534c332 100644 --- a/vignettes/quick_start_guide.Rmd +++ b/vignettes/quick_start_guide.Rmd @@ -162,7 +162,7 @@ For example, after using one of the suggested functions `use_occurrences()`, if #| warning: false df_edited <- df |> use_occurrences( - occurrenceID = use_id_random(), + occurrenceID = create_random_id(), basisOfRecord = "humanObservation" ) ```