diff --git a/DESCRIPTION b/DESCRIPTION index 960a263..4860374 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,9 +17,9 @@ Authors@R: role = c("aut"))) Description: galaxias helps users describe, package and share biodiversity information using the 'Darwin Core' data standard, which is the format used - and accepted by the Global Biodiversity Information Facility (GBIF) and it's + and accepted by the Global Biodiversity Information Facility (GBIF) and its partner nodes. It is functionally similar to `devtools`, but with a focus on - building Darwin Core Archives (DwCA's) rather than R packages. + building 'Darwin Core Archives' rather than R packages. Depends: R (>= 4.3.0), corella diff --git a/R/build_archive.R b/R/build_archive.R index 97018ca..12dd0a0 100644 --- a/R/build_archive.R +++ b/R/build_archive.R @@ -4,54 +4,51 @@ #' and metadata. This function assumes that all of these file types have been #' pre-constructed, and can be found inside a single folder, with no additional #' or redundant information. This function is similar to `devtools::build()`, -#' in the sense that it takes a repository and wraps it for publication, without -#' assessing the contents in any meaningful way. It differs from -#' `devtools::build()` in that it builds a Darwin Core Archive, rather than an -#' R package. +#' in the sense that it takes a repository and wraps it for publication. It +#' differs from `devtools::build()` in that it builds a Darwin Core Archive, +#' rather than an R package. #' @details #' This function looks for three types of objects in the specified `directory`: #' #' * One or more `csv` files such as `occurrences.csv` &/or `events.csv`. #' These will be manipulated versions of the raw dataset, which have been -#' altered to use Darwin Core terms as column headers. See the `corella` -#' package for details. -#' * A metadata statement, stored in xml using the filename `eml.xml`. 
The -#' function `use_metadata()` from the `paperbark` package is a good starting -#' point here, followed by `build_metadata()` to save it in xml. +#' altered to use Darwin Core terms as column headers. See +#' [corella::corella-package()] for details. +#' * A metadata statement, stored in `EML` using the filename `eml.xml`. The +#' function [use_metadata()] is a good starting point here, followed by +#' [build_metadata()] once you have populated your metadata statement. #' * A 'schema' document, also stored in xml, called `meta.xml`. This is -#' usually constructed using `build_schema()`. +#' usually constructed using [build_schema()]. #' #' You will get an error if these files are not present. The resulting file #' shares the name of the working directory (with a .zip file extension), #' and is placed in the parent directory -#' @param x (string) A directory containing all the files to be stored in the -#' archive. Defaults to the `data` folder within the current working directory. -#' @param file (string) A file name to save the resulting zip file. +#' @param source (string) A directory containing all the files to be stored in +#' the archive. Defaults to the `data` folder within the current working +#' directory. +#' @param destination (string) A file name to save the resulting zip file. #' @return Invisibly returns the location of the built zip file; but typically #' called for the side-effect of building a 'Darwin Core Archive' (i.e. a zip #' file). 
#' @importFrom zip zip #' @export -build_archive <- function(x = "data", file) { - x <- get_default_directory(x) - - progress_update("Retrieving metadata...") - files_in <- find_data(x) +build_archive <- function(source = "data", destination) { + progress_update("Retrieving data...") + files_in <- get_default_directory(source) |> + find_data() progress_update("Creating zip folder...") - file_out <- get_default_file(file) + file_out <- get_default_file(destination) progress_update("Building Darwin Core Archive...") zip::zip(zipfile = file_out, files = files_in, mode = "cherry-pick") - cli::cli_alert_success("Darwin Core Archive successfully built. \nSaved as {.file {file_out}}.") + cli::cli_alert_success("Darwin Core Archive successfully built. \nSaved as `{.file {file_out}}`.") cli::cli_progress_done() - # invisible(return(file_out)) # might need this to save - - + invisible(file_out) } #' Simple function to specify a zip file if no arg given diff --git a/R/build_metadata.R b/R/build_metadata.R index 0a436bf..f34183a 100644 --- a/R/build_metadata.R +++ b/R/build_metadata.R @@ -1,34 +1,35 @@ #' Create a metadata statement for a Darwin Core Archive #' #' A metadata statement lists the owner of the dataset, how it was collected, -#' and how it may used (i.e. its' licence). This function simply converts -#' metadata stored in a markdown file to xml, and stores it in the folder -#' specified using the `directory` argument. +#' and how it can be used (i.e. its licence). This function simply reads +#' metadata stored in a markdown file, converts it to xml, and saves it +#' in the `destination` file. #' #' This function is a fairly shallow wrapper on top of functionality build #' in the `paperbark` package, particularly `read_md()` and `write_eml()`. You can #' use that package to gain greater control, or to debug problems, should you #' wish. -#' @param path Path to a metadata statement stored in markdown format (.md). 
-#' @param file A file where the result should be saved. Defaults to +#' @param source A metadata file stored in markdown format (`.md`). Defaults +#' to `metadata.md`, which is the same as is created by [use_metadata()]. +#' @param destination A file where the result should be saved. Defaults to #' `data/eml.xml`. #' @returns Does not return an object to the workspace; called for the side #' effect of building a file named `meta.xml` in the `data` directory. #' @importFrom paperbark read_md #' @importFrom paperbark write_eml #' @export -build_metadata <- function(x = "data", - file = "./data/eml.xml"){ - if(!file.exists(x)){ - cli::cli_abort("{.file {x}} doesn't exist in specified location.") +build_metadata <- function(source = "metadata.md", + destination = "./data/eml.xml"){ + if(!file.exists(source)){ + cli::cli_abort("`{source}` doesn't exist in specified location.") } # import file, ensure EML metadata is added, convert to XML progress_update("Reading file...") - metadata_file <- read_md(x) + metadata_tbl <- read_md(source) progress_update("Writing file...") - write_eml(built_file, file = file) + write_eml(metadata_tbl, file = destination) - cli::cli_alert_success("Metadata successfully built. Saved as {.file /data/eml.xml}.") + cli::cli_alert_success("Metadata successfully built. Saved as `{destination}`.") cli::cli_progress_done() } diff --git a/R/build_schema.R b/R/build_schema.R index a87c3aa..5d6b78d 100644 --- a/R/build_schema.R +++ b/R/build_schema.R @@ -4,27 +4,26 @@ #' It works by detecting column names on csv files in a specified directory; #' these should all be Darwin Core terms for this function to produce reliable #' results. -#' @param x (string) A directory containing all the files to be stored in the -#' archive. Defaults to the `data` folder within the current working directory. -#' @param file (string) A file name for the resulting schema document. 
+#' @param source A directory (**not** a file) containing files to be documented +#' in the schema document. Defaults to the `data` folder within the current +#' working directory. Note that files that do not match the Darwin Core naming +#' convention and/or do not end in `.csv` are ignored. +#' @param destination A file name for the resulting schema document. Defaults +#' to `./data/meta.xml` for consistency with the Darwin Core standard. #' @returns Does not return an object to the workspace; called for the side #' effect of building a file named `meta.xml` in the specified directory. #' @importFrom paperbark write_eml #' @importFrom glue glue #' @importFrom rlang abort #' @export -build_schema <- function(x = "data", - file = "./data/meta.xml") { - x <- get_default_directory(x) - - files <- detect_dwc_files(x) - fields <- detect_dwc_fields(files) - result <- add_front_matter(fields) - - progress_update("Writing file...") - write_eml(result, file = file) - - cli::cli_alert_success("Schema successfully built. Saved as {.file /data/meta.xml}.") +build_schema <- function(source = "data", + destination = "./data/meta.xml") { + get_default_directory(source) |> + detect_dwc_files() |> + detect_dwc_fields() |> + add_front_matter() |> + write_eml(file = destination) + cli::cli_alert_success("Schema successfully built. 
Saved as {destination}.") cli::cli_progress_done() } @@ -195,7 +194,17 @@ create_field_rows <- function(x){ index_list <- as.list(seq_along(field_names)) names(index_list) <- rep("index", n_fields) # get sequence of urls - term_list <- as.list(glue("http://rs.tdwg.org/dwc/terms/{field_names}")) + dwc_df <- corella::darwin_core_terms + term_list <- map(field_names, + .f = \(a){ + term_lookup <- dwc_df$term == a + if(any(term_lookup)){ + dwc_df$url[which(term_lookup)[1]] + }else{ + "no-dwc-term-found" + } + }) + # term_list <- as.list(glue("http://rs.tdwg.org/dwc/terms/{field_names}")) # obsolete names(term_list) <- rep("term", n_fields) # combine tibble(level = 3, diff --git a/man/build_archive.Rd b/man/build_archive.Rd index f8aa0c6..a2eae51 100644 --- a/man/build_archive.Rd +++ b/man/build_archive.Rd @@ -4,13 +4,14 @@ \alias{build_archive} \title{Build a Darwin Core Archive from a folder} \usage{ -build_archive(x = "data", file) +build_archive(source = "data", destination) } \arguments{ -\item{x}{(string) A directory containing all the files to be stored in the -archive. Defaults to the \code{data} folder within the current working directory.} +\item{source}{(string) A directory containing all the files to be stored in +the archive. Defaults to the \code{data} folder within the current working +directory.} -\item{file}{(string) A file name to save the resulting zip file.} +\item{destination}{(string) A file name to save the resulting zip file.} } \value{ Invisibly returns the location of the built zip file; but typically @@ -22,23 +23,22 @@ A Darwin Core archive is a zip file with a specified combination of data and metadata. This function assumes that all of these file types have been pre-constructed, and can be found inside a single folder, with no additional or redundant information. This function is similar to \code{devtools::build()}, -in the sense that it takes a repository and wraps it for publication, without -assessing the contents in any meaningful way. 
It differs from -\code{devtools::build()} in that it builds a Darwin Core Archive, rather than an -R package. +in the sense that it takes a repository and wraps it for publication. It +differs from \code{devtools::build()} in that it builds a Darwin Core Archive, +rather than an R package. } \details{ This function looks for three types of objects in the specified \code{directory}: \itemize{ \item One or more \code{csv} files such as \code{occurrences.csv} &/or \code{events.csv}. These will be manipulated versions of the raw dataset, which have been -altered to use Darwin Core terms as column headers. See the \code{corella} -package for details. -\item A metadata statement, stored in xml using the filename \code{eml.xml}. The -function \code{use_metadata()} from the \code{paperbark} package is a good starting -point here, followed by \code{build_metadata()} to save it in xml. +altered to use Darwin Core terms as column headers. See +\code{\link[corella:corella-package]{corella::corella-package()}} for details. +\item A metadata statement, stored in \code{EML} using the filename \code{eml.xml}. The +function \code{\link[=use_metadata]{use_metadata()}} is a good starting point here, followed by +\code{\link[=build_metadata]{build_metadata()}} once you have populated your metadata statement. \item A 'schema' document, also stored in xml, called \code{meta.xml}. This is -usually constructed using \code{build_schema()}. +usually constructed using \code{\link[=build_schema]{build_schema()}}. } You will get an error if these files are not present. 
The resulting file shares the name of the working directory (with a .zip file extension), and is placed in the parent directory } diff --git a/man/build_metadata.Rd b/man/build_metadata.Rd index cb6b96d..5713c3a 100644 --- a/man/build_metadata.Rd +++ b/man/build_metadata.Rd @@ -4,13 +4,14 @@ \alias{build_metadata} \title{Create a metadata statement for a Darwin Core Archive} \usage{ -build_metadata(x = "data", file = "./data/eml.xml") +build_metadata(source = "metadata.md", destination = "./data/eml.xml") } \arguments{ -\item{file}{A file where the result should be saved. Defaults to -\code{data/eml.xml}.} +\item{source}{A metadata file stored in markdown format (\code{.md}). Defaults +to \code{metadata.md}, which is the same as is created by \code{\link[=use_metadata]{use_metadata()}}} -\item{path}{Path to a metadata statement stored in markdown format (.md).} +\item{destination}{A file where the result should be saved. Defaults to +\code{data/eml.xml}.} } \value{ Does not return an object to the workspace; called for the side @@ -18,9 +19,9 @@ effect of building a file named \code{meta.xml} in the \code{data} directory. } \description{ A metadata statement lists the owner of the dataset, how it was collected, -and how it may used (i.e. its' licence). This function simply converts -metadata stored in a markdown file to xml, and stores it in the folder -specified using the \code{directory} argument. +and how it can be used (i.e. its licence). This function simply reads +metadata stored in a markdown file, converts it to xml, and saves it +in the \code{destination} file. 
} \details{ This function is a fairly shallow wrapper on top of functionality build diff --git a/man/build_schema.Rd b/man/build_schema.Rd index a381174..f5200cf 100644 --- a/man/build_schema.Rd +++ b/man/build_schema.Rd @@ -4,13 +4,16 @@ \alias{build_schema} \title{Create a \code{schema} for a Darwin Core Archive} \usage{ -build_schema(x = "data", file = "./data/meta.xml") +build_schema(source = "data", destination = "./data/meta.xml") } \arguments{ -\item{x}{(string) A directory containing all the files to be stored in the -archive. Defaults to the \code{data} folder within the current working directory.} +\item{source}{A directory (\strong{not} a file) containing files to be documented +in the schema document. Defaults to the \code{data} folder within the current +working directory. Note that files that do not match the Darwin Core naming +convention and/or do not end in \code{.csv} are ignored.} -\item{file}{(string) A file name for the resulting schema document.} +\item{destination}{A file name for the resulting schema document. 
Defaults +to \code{./data/meta.xml} for consistency with the Darwin Core standard.} } \value{ Does not return an object to the workspace; called for the side diff --git a/tests/testthat/test-build.R b/tests/testthat/test-build.R index b6e3ff2..8ef2272 100644 --- a/tests/testthat/test-build.R +++ b/tests/testthat/test-build.R @@ -8,11 +8,14 @@ test_that("build_ functions work correctly in sequence", { # add data # add events.csv - tibble(eventID = 1, eventDate = "2024-01-01") |> + tibble(eventID = 1, + eventDate = "2024-01-01") |> write.csv(file = "data/events.csv", row.names = FALSE) # add occurrences.csv - tibble(basisOfRecord = "humanObservation", individualCount = 1) |> + tibble(basisOfRecord = "humanObservation", + individualCount = 1, + scientificName = "Litoria peronii") |> write.csv(file = "data/occurrences.csv", row.names = FALSE) # expect_error(build_archive()) # no schema or metadata @@ -22,7 +25,7 @@ test_that("build_ functions work correctly in sequence", { build_schema() expect_true(file.exists("data/meta.xml")) result <- readLines("data/meta.xml") - expect_equal(length(result), 15) # correct number of entries + expect_equal(length(result), 16) # correct number of entries expect_true(all(grepl("^\\s*<", result))) # all open with `<` # NOTE: still has problems with attributes containing `amp` instead of `&` # expect_error(build_archive()) # no metadata yet diff --git a/vignettes/quick_start_guide.Rmd b/vignettes/quick_start_guide.Rmd index 55c781c..2e47b3a 100644 --- a/vignettes/quick_start_guide.Rmd +++ b/vignettes/quick_start_guide.Rmd @@ -135,7 +135,7 @@ Darwin Core may be an unfamiliar format, so it can be useful to 'check' your data for common issues. We suggest first using `check_archive()`: -Alternatively, you can use the GBIF 'validate' API to check your data (not functional!) +Alternatively, you can use the GBIF 'validate' API to check your data: ```{r, eval=FALSE} validate_archive()