Merge pull request #29 from Teal-Insights/26-add-ids_bulk_download-fu…

…nction-as-alternative-to-ids_get Add ids_bulk* functions
Teal-Insights · Nov 6, 2024 · 4ab319f · 4ab319f
2 parents fdc14d9 + 57c931e
commit 4ab319f
Show file tree

Hide file tree

Showing 21 changed files with 1,230 additions and 387 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,16 +17,20 @@ Depends:
     R (>= 4.1)
 Imports:
     cli,
+    withr,
     httr2 (>= 1.0.0),
     dplyr (>= 1.0.0),
     purrr (>= 1.0.0),
     tidyr (>= 1.0.0),
     rlang (>= 1.0.0)
 Suggests: 
     curl,
+    tibble,
     devtools (>= 2.4.5),
+    jsonlite (>= 1.0.0),
     lintr (>= 3.1.2),
     quarto,
+    readxl (>= 1.0.0),
     rmarkdown,
     testthat (>= 3.0.0),
     languageserver (>= 0.3.16)

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
+export(ids_bulk)
+export(ids_bulk_files)
+export(ids_bulk_series)
 export(ids_get)
 export(ids_list_counterparts)
 export(ids_list_geographies)
@@ -12,5 +15,6 @@ importFrom(purrr,map)
 importFrom(rlang,eval_tidy)
 importFrom(rlang,parse_expr)
 importFrom(tidyr,crossing)
+importFrom(tidyr,pivot_longer)
 importFrom(tidyr,unnest)
 importFrom(tidyr,unnest_wider)
diff --git a/R/ids_bulk.R b/R/ids_bulk.R
@@ -0,0 +1,217 @@
+#' Download and Process Bulk Data File for International Debt Statistics
+#'
+#' This function downloads a data file from the World Bank International Debt
+#' Statistics (IDS), reads and processes the data into a tidy format.
+#'
+#' @param file_url A character string specifying the URL of the Excel file to
+#' download. This parameter is required (see \link{ids_bulk_files}).
+#' @param file_path An optional character string specifying the file path where
+#' the downloaded file will be saved. Defaults to a temporary file with `.xlsx`
+#' extension. The file will automatically be deleted after processing.
+#' @param quiet A logical parameter indicating whether messages should be
+#' printed to the console.
+#' @param timeout An integer specifying the timeout in seconds for downloading
+#' the file. Defaults to the current R timeout setting.
+#' @param warn_size A logical parameter indicating whether to warn about large
+#' downloads. Defaults to TRUE.
+#'
+#' @return A tibble containing processed debt statistics data with the following
+#' columns:
+#' \describe{
+#'   \item{geography_id}{The unique identifier for the geography (e.g., "ZMB").}
+#'   \item{series_id}{The unique identifier for the series (e.g.,
+#'                    "DT.DOD.DPPG.CD").}
+#'   \item{counterpart_id}{The unique identifier for the counterpart series.}
+#'   \item{year}{The year corresponding to the data (as an integer).}
+#'   \item{value}{The numeric value representing the statistic for the given
+#'                geography, series, counterpart, and year.}
+#' }
+#'
+#' @export
+#' @examples
+#' \dontrun{
+#' available_files <- ids_bulk_files()
+#' data <- ids_bulk(
+#'   available_files$file_url[1]
+#' )
+#' }
+#'
+ids_bulk <- function(
+  file_url,
+  file_path = tempfile(fileext = ".xlsx"),
+  quiet = FALSE,
+  timeout = getOption("timeout", 60),
+  warn_size = TRUE
+) {
+  rlang::check_installed("readxl", reason = "to download bulk files.")
+
+  # Register cleanup immediately after creating temporary file
+  on.exit(unlink(file_path))
+
+  # Download file with size checks and validation
+  download_bulk_file(file_url, file_path, timeout, warn_size, quiet)
+
+  # Read and process the data
+  if (!quiet) message("Reading in file.")
+  bulk_data <- read_bulk_file(file_path)
+
+  if (!quiet) message("Processing file.")
+  bulk_data <- process_bulk_data(bulk_data)
+
+  # Return the processed data
+  return(bulk_data)
+}
+
+#' Get response headers from a URL
+#'
+#' @param file_url URL to request headers from
+#' @return List of response headers
+get_response_headers <- function(file_url) {
+  httr2::request(file_url) |>
+    httr2::req_headers("Accept" = "*/*") |>
+    httr2::req_perform() |>
+    httr2::resp_headers()
+}
+
+#' Download bulk data file with validation
+#'
+#' @param file_url URL of the file to download
+#' @param file_path Path where file should be saved
+#' @param timeout Timeout in seconds
+#' @param warn_size Whether to warn about large files
+#' @param quiet Whether to suppress messages
+download_bulk_file <- function(file_url, file_path, timeout, warn_size, quiet) {
+  # Get file size before downloading
+  response_headers <- get_response_headers(file_url)
+  size_mb <- as.numeric(response_headers$`content-length`) / 1024^2
+  formatted_size <- format(round(size_mb, 1), nsmall = 1)
+
+  if (warn_size && size_mb > 100) {
+    warning(
+      sprintf(
+        paste(
+          "This file is %s MB and may take several minutes to download.",
+          "Current timeout setting: %s seconds.",
+          "Use warn_size=FALSE to disable this warning.",
+          sep = "\n"
+        ),
+        formatted_size,
+        timeout
+      ),
+      call. = FALSE
+    )
+
+    # Interactive confirmation
+    if (warn_size && check_interactive()) {
+      response <- readline("Do you want to continue with the download? (y/N): ")
+      if (!tolower(response) %in% c("y", "yes")) {
+        stop("Download cancelled by user", call. = FALSE)
+      }
+    }
+  }
+
+  # Print message about file download
+  if (!quiet) message("Downloading file to: {file_path}")
+
+  # Download with timeout handling
+  withr::with_options(
+    list(timeout = timeout),
+    tryCatch({
+      download_file(file_url, destfile = file_path, quiet = quiet)
+    },
+    error = function(e) {
+      if (grepl("timeout|cannot open URL", e$message, ignore.case = TRUE)) {
+        stop(
+          paste0(
+            "Download timed out after ", timeout, " seconds.\n",
+            "Try increasing the timeout parameter",
+            " (e.g., timeout=600 for 10 minutes)"
+          ),
+          call. = FALSE
+        )
+      }
+      stop(e$message, call. = FALSE)
+    })
+  )
+
+  # Validate downloaded file
+  validate_file(file_path)
+}
+
+#' Validate that downloaded file exists and is not empty
+#'
+#' @param file_path Path to file to validate
+validate_file <- function(file_path) {
+  if (!file.exists(file_path)) {
+    stop("Download failed: File not created")
+  }
+  if (file.size(file_path) == 0) {
+    unlink(file_path)
+    stop("Download failed: Empty file")
+  }
+}
+
+#' Read bulk file and determine column types
+#'
+#' @param file_path Path to Excel file
+#' @return Raw data frame from Excel file
+read_bulk_file <- function(file_path) {
+  available_columns <- readxl::read_excel(path = file_path, n_max = 0) |>
+    colnames()
+  relevant_columns <- tibble(names = available_columns) |>
+    mutate(
+      type = if_else(grepl(pattern = "[0:9]", .data$names), "numeric", "text")
+    ) |>
+    filter(!grepl("column", names, ignore.case = TRUE))
+
+  readxl::read_excel(
+    path = file_path,
+    range = readxl::cell_cols(seq_len(nrow(relevant_columns))),
+    col_types = relevant_columns$type
+  )
+}
+
+#' Process bulk data into tidy format
+#'
+#' @param bulk_raw Raw data frame from Excel file
+#' @return Processed tibble in tidy format
+process_bulk_data <- function(bulk_raw) {
+  bulk_raw |>
+    select(-c("Country Name", "Classification Name")) |>
+    select(
+      geography_id = "Country Code",
+      series_id = "Series Code",
+      counterpart_id = "Series Name",
+      everything()
+    ) |>
+    tidyr::pivot_longer(
+      cols = -c("geography_id", "series_id", "counterpart_id"),
+      names_to = "year"
+    ) |>
+    mutate(year = as.integer(.data$year)) |>
+    tidyr::drop_na()
+}
+
+#' Check if R is running interactively
+#'
+#' Wrapper around base::interactive() to make the function testable.
+#' This function exists primarily to facilitate testing of interactive features.
+#'
+#' @return Logical indicating whether R is running interactively
+#' @keywords internal
+check_interactive <- function() {
+  interactive()
+}
+
+#' Download a file using utils::download.file
+#'
+#' Wrapper around utils::download.file to facilitate testing.
+#'
+#' @param url URL of file to download
+#' @param destfile Destination file path
+#' @param quiet Whether to suppress messages
+#' @return Invisibly returns the status code from download.file
+#' @keywords internal
+download_file <- function(url, destfile, quiet) {
+  utils::download.file(url, destfile = destfile, quiet = quiet, mode = "wb")
+}
diff --git a/R/ids_bulk_files.R b/R/ids_bulk_files.R
@@ -0,0 +1,43 @@
+#' Retrieve Available Bulk Download Files for International Debt Statistics
+#'
+#' This function returns a tibble with metadata for files available for bulk
+#' download via the World Bank International Debt Statistics (IDS). It includes
+#' information such as file names, URLs, and the last update dates for each file
+#' in Excel (xlsx) format.
+#'
+#' @return A tibble containing the available files and their metadata:
+#' \describe{
+#'   \item{file_name}{The name of the file available for download.}
+#'   \item{file_url}{The URL to download the file in Excel format.}
+#'   \item{last_updated_date}{The date when the file was last updated.}
+#' }
+#'
+#' @export
+#'
+#' @examplesIf curl::has_internet()
+#' ids_bulk_files()
+#'
+ids_bulk_files <- function() {
+
+  rlang::check_installed(
+    "jsonlite", reason = "to download bulk files."
+  )
+
+  ids_meta <- jsonlite::fromJSON(
+    txt = paste0(
+      "https://datacatalogapi.worldbank.org/ddhxext/DatasetDownload",
+      "?dataset_unique_id=0038015&version_id="
+    )
+  )
+
+  bulk_files <- ids_meta$resources |>
+    as_tibble() |>
+    select("name", "distribution", "last_updated_date") |>
+    tidyr::unnest("distribution") |>
+    filter(.data$distribution_format == "xlsx") |>
+    select(file_name = "name", file_url = "url", "last_updated_date") |>
+    mutate(last_updated_date = as.Date(.data$last_updated_date))
+
+  bulk_files
+
+}
diff --git a/R/ids_bulk_series.R b/R/ids_bulk_series.R
@@ -0,0 +1,51 @@
+#' Retrieve Bulk Series Metadata for International Debt Statistics
+#'
+#' This function retrieves a tibble with metadata for series available via
+#' bulk download of the World Bank International Debt Statistics (IDS).
+#'
+#' @return A tibble containing the available series and their metadata:
+#' \describe{
+#'   \item{series_id}{The unique identifier for the series (e.g.,
+#'                    "BN.CAB.XOKA.CD").}
+#'   \item{series_name}{The name of the series (e.g., "Current account balance
+#'                      (current US$)").}
+#'   \item{source_id}{The ID of the data source providing the indicator.}
+#'   \item{source_name}{The name or description of the source of the indicator
+#'                      data.}
+#'   \item{source_note}{Additional notes or descriptions about the data source.}
+#'   \item{source_organization}{The organization responsible for the data
+#'                              source.}
+#' }
+#'
+#' @export
+#'
+#' @examplesIf curl::has_internet()
+#' ids_bulk_series()
+#'
+ids_bulk_series <- function() {
+
+  rlang::check_installed(
+    "jsonlite", reason = "to retrieve available series via bulk download."
+  )
+
+  ids_meta <- jsonlite::fromJSON(
+    txt = paste0(
+      "https://datacatalogapi.worldbank.org/ddhxext/DatasetDownload",
+      "?dataset_unique_id=0038015&version_id="
+    )
+  )
+
+  bulk_series <- ids_meta$indicators |>
+    as_tibble() |>
+    select("lineage") |>
+    tidyr::unnest("lineage") |>
+    select(series_id = "harvest_system_reference")
+
+  api_series <- ids_list_series()
+
+  bulk_series <- bulk_series |>
+    left_join(api_series, join_by("series_id"))
+
+  bulk_series
+
+}
diff --git a/R/wbids-package.R b/R/wbids-package.R
@@ -6,7 +6,7 @@
 
 ## usethis namespace: start
 #' @importFrom purrr map
-#' @importFrom tidyr unnest unnest_wider crossing
+#' @importFrom tidyr unnest unnest_wider crossing pivot_longer
 #' @importFrom rlang parse_expr eval_tidy
 ## usethis namespace: end
 NULL
diff --git a/man/check_interactive.Rd b/man/check_interactive.Rd
diff --git a/man/download_bulk_file.Rd b/man/download_bulk_file.Rd