Skip to content

Commit

Permalink
Merge pull request #29 from Teal-Insights/26-add-ids_bulk_download-fu…
Browse files Browse the repository at this point in the history
…nction-as-alternative-to-ids_get

Add ids_bulk* functions
  • Loading branch information
christophscheuch authored Nov 6, 2024
2 parents fdc14d9 + 57c931e commit 4ab319f
Show file tree
Hide file tree
Showing 21 changed files with 1,230 additions and 387 deletions.
4 changes: 4 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,20 @@ Depends:
R (>= 4.1)
Imports:
cli,
withr,
httr2 (>= 1.0.0),
dplyr (>= 1.0.0),
purrr (>= 1.0.0),
tidyr (>= 1.0.0),
rlang (>= 1.0.0)
Suggests:
curl,
tibble,
devtools (>= 2.4.5),
jsonlite (>= 1.0.0),
lintr (>= 3.1.2),
quarto,
readxl (>= 1.0.0),
rmarkdown,
testthat (>= 3.0.0),
languageserver (>= 0.3.16)
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Generated by roxygen2: do not edit by hand

export(ids_bulk)
export(ids_bulk_files)
export(ids_bulk_series)
export(ids_get)
export(ids_list_counterparts)
export(ids_list_geographies)
Expand All @@ -12,5 +15,6 @@ importFrom(purrr,map)
importFrom(rlang,eval_tidy)
importFrom(rlang,parse_expr)
importFrom(tidyr,crossing)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,unnest)
importFrom(tidyr,unnest_wider)
217 changes: 217 additions & 0 deletions R/ids_bulk.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
#' Download and Process Bulk Data File for International Debt Statistics
#'
#' This function downloads a data file from the World Bank International Debt
#' Statistics (IDS), reads and processes the data into a tidy format.
#'
#' @param file_url A character string specifying the URL of the Excel file to
#' download. This parameter is required (see \link{ids_bulk_files}).
#' @param file_path An optional character string specifying the file path where
#' the downloaded file will be saved. Defaults to a temporary file with `.xlsx`
#' extension. The file will automatically be deleted after processing.
#' @param quiet A logical parameter indicating whether messages should be
#' printed to the console.
#' @param timeout An integer specifying the timeout in seconds for downloading
#' the file. Defaults to the current R timeout setting.
#' @param warn_size A logical parameter indicating whether to warn about large
#' downloads. Defaults to TRUE.
#'
#' @return A tibble containing processed debt statistics data with the following
#' columns:
#' \describe{
#' \item{geography_id}{The unique identifier for the geography (e.g., "ZMB").}
#' \item{series_id}{The unique identifier for the series (e.g.,
#' "DT.DOD.DPPG.CD").}
#' \item{counterpart_id}{The unique identifier for the counterpart series.}
#' \item{year}{The year corresponding to the data (as an integer).}
#' \item{value}{The numeric value representing the statistic for the given
#' geography, series, counterpart, and year.}
#' }
#'
#' @export
#' @examples
#' \dontrun{
#' available_files <- ids_bulk_files()
#' data <- ids_bulk(
#' available_files$file_url[1]
#' )
#' }
#'
ids_bulk <- function(
file_url,
file_path = tempfile(fileext = ".xlsx"),
quiet = FALSE,
timeout = getOption("timeout", 60),
warn_size = TRUE
) {
rlang::check_installed("readxl", reason = "to download bulk files.")

# Register cleanup immediately after creating temporary file
on.exit(unlink(file_path))

# Download file with size checks and validation
download_bulk_file(file_url, file_path, timeout, warn_size, quiet)

# Read and process the data
if (!quiet) message("Reading in file.")
bulk_data <- read_bulk_file(file_path)

if (!quiet) message("Processing file.")
bulk_data <- process_bulk_data(bulk_data)

# Return the processed data
return(bulk_data)
}

#' Get response headers from a URL
#'
#' @param file_url URL to request headers from
#' @return List of response headers
get_response_headers <- function(file_url) {
httr2::request(file_url) |>
httr2::req_headers("Accept" = "*/*") |>
httr2::req_perform() |>
httr2::resp_headers()
}

#' Download bulk data file with validation
#'
#' @param file_url URL of the file to download
#' @param file_path Path where file should be saved
#' @param timeout Timeout in seconds
#' @param warn_size Whether to warn about large files
#' @param quiet Whether to suppress messages
download_bulk_file <- function(file_url, file_path, timeout, warn_size, quiet) {
# Get file size before downloading
response_headers <- get_response_headers(file_url)
size_mb <- as.numeric(response_headers$`content-length`) / 1024^2
formatted_size <- format(round(size_mb, 1), nsmall = 1)

if (warn_size && size_mb > 100) {
warning(
sprintf(
paste(
"This file is %s MB and may take several minutes to download.",
"Current timeout setting: %s seconds.",
"Use warn_size=FALSE to disable this warning.",
sep = "\n"
),
formatted_size,
timeout
),
call. = FALSE
)

# Interactive confirmation
if (warn_size && check_interactive()) {
response <- readline("Do you want to continue with the download? (y/N): ")
if (!tolower(response) %in% c("y", "yes")) {
stop("Download cancelled by user", call. = FALSE)
}
}
}

# Print message about file download
if (!quiet) message("Downloading file to: {file_path}")

# Download with timeout handling
withr::with_options(
list(timeout = timeout),
tryCatch({
download_file(file_url, destfile = file_path, quiet = quiet)
},
error = function(e) {
if (grepl("timeout|cannot open URL", e$message, ignore.case = TRUE)) {
stop(
paste0(
"Download timed out after ", timeout, " seconds.\n",
"Try increasing the timeout parameter",
" (e.g., timeout=600 for 10 minutes)"
),
call. = FALSE
)
}
stop(e$message, call. = FALSE)
})
)

# Validate downloaded file
validate_file(file_path)
}

#' Validate that downloaded file exists and is not empty
#'
#' @param file_path Path to file to validate
validate_file <- function(file_path) {
if (!file.exists(file_path)) {
stop("Download failed: File not created")
}
if (file.size(file_path) == 0) {
unlink(file_path)
stop("Download failed: Empty file")
}
}

#' Read bulk file and determine column types
#'
#' @param file_path Path to Excel file
#' @return Raw data frame from Excel file
read_bulk_file <- function(file_path) {
available_columns <- readxl::read_excel(path = file_path, n_max = 0) |>
colnames()
relevant_columns <- tibble(names = available_columns) |>
mutate(
type = if_else(grepl(pattern = "[0:9]", .data$names), "numeric", "text")
) |>
filter(!grepl("column", names, ignore.case = TRUE))

readxl::read_excel(
path = file_path,
range = readxl::cell_cols(seq_len(nrow(relevant_columns))),
col_types = relevant_columns$type
)
}

#' Process bulk data into tidy format
#'
#' @param bulk_raw Raw data frame from Excel file
#' @return Processed tibble in tidy format
process_bulk_data <- function(bulk_raw) {
bulk_raw |>
select(-c("Country Name", "Classification Name")) |>
select(
geography_id = "Country Code",
series_id = "Series Code",
counterpart_id = "Series Name",
everything()
) |>
tidyr::pivot_longer(
cols = -c("geography_id", "series_id", "counterpart_id"),
names_to = "year"
) |>
mutate(year = as.integer(.data$year)) |>
tidyr::drop_na()
}

#' Check if R is running interactively
#'
#' Wrapper around base::interactive() to make the function testable.
#' This function exists primarily to facilitate testing of interactive features.
#'
#' @return Logical indicating whether R is running interactively
#' @keywords internal
check_interactive <- function() {
interactive()
}

#' Download a file using utils::download.file
#'
#' Wrapper around utils::download.file to facilitate testing.
#'
#' @param url URL of file to download
#' @param destfile Destination file path
#' @param quiet Whether to suppress messages
#' @return Invisibly returns the status code from download.file
#' @keywords internal
download_file <- function(url, destfile, quiet) {
utils::download.file(url, destfile = destfile, quiet = quiet, mode = "wb")
}
43 changes: 43 additions & 0 deletions R/ids_bulk_files.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#' Retrieve Available Bulk Download Files for International Debt Statistics
#'
#' This function returns a tibble with metadata for files available for bulk
#' download via the World Bank International Debt Statistics (IDS). It includes
#' information such as file names, URLs, and the last update dates for each file
#' in Excel (xlsx) format.
#'
#' @return A tibble containing the available files and their metadata:
#' \describe{
#' \item{file_name}{The name of the file available for download.}
#' \item{file_url}{The URL to download the file in Excel format.}
#' \item{last_updated_date}{The date when the file was last updated.}
#' }
#'
#' @export
#'
#' @examplesIf curl::has_internet()
#' ids_bulk_files()
#'
ids_bulk_files <- function() {

rlang::check_installed(
"jsonlite", reason = "to download bulk files."
)

ids_meta <- jsonlite::fromJSON(
txt = paste0(
"https://datacatalogapi.worldbank.org/ddhxext/DatasetDownload",
"?dataset_unique_id=0038015&version_id="
)
)

bulk_files <- ids_meta$resources |>
as_tibble() |>
select("name", "distribution", "last_updated_date") |>
tidyr::unnest("distribution") |>
filter(.data$distribution_format == "xlsx") |>
select(file_name = "name", file_url = "url", "last_updated_date") |>
mutate(last_updated_date = as.Date(.data$last_updated_date))

bulk_files

}
51 changes: 51 additions & 0 deletions R/ids_bulk_series.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#' Retrieve Bulk Series Metadata for International Debt Statistics
#'
#' This function retrieves a tibble with metadata for series available via
#' bulk download of the World Bank International Debt Statistics (IDS).
#'
#' @return A tibble containing the available series and their metadata:
#' \describe{
#' \item{series_id}{The unique identifier for the series (e.g.,
#' "BN.CAB.XOKA.CD").}
#' \item{series_name}{The name of the series (e.g., "Current account balance
#' (current US$)").}
#' \item{source_id}{The ID of the data source providing the indicator.}
#' \item{source_name}{The name or description of the source of the indicator
#' data.}
#' \item{source_note}{Additional notes or descriptions about the data source.}
#' \item{source_organization}{The organization responsible for the data
#' source.}
#' }
#'
#' @export
#'
#' @examplesIf curl::has_internet()
#' ids_bulk_series()
#'
ids_bulk_series <- function() {

rlang::check_installed(
"jsonlite", reason = "to retrieve available series via bulk download."
)

ids_meta <- jsonlite::fromJSON(
txt = paste0(
"https://datacatalogapi.worldbank.org/ddhxext/DatasetDownload",
"?dataset_unique_id=0038015&version_id="
)
)

bulk_series <- ids_meta$indicators |>
as_tibble() |>
select("lineage") |>
tidyr::unnest("lineage") |>
select(series_id = "harvest_system_reference")

api_series <- ids_list_series()

bulk_series <- bulk_series |>
left_join(api_series, join_by("series_id"))

bulk_series

}
2 changes: 1 addition & 1 deletion R/wbids-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

## usethis namespace: start
#' @importFrom purrr map
#' @importFrom tidyr unnest unnest_wider crossing
#' @importFrom tidyr unnest unnest_wider crossing pivot_longer
#' @importFrom rlang parse_expr eval_tidy
## usethis namespace: end
NULL
16 changes: 16 additions & 0 deletions man/check_interactive.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/download_bulk_file.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 4ab319f

Please sign in to comment.