-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #29 from Teal-Insights/26-add-ids_bulk_download-fu…
…nction-as-alternative-to-ids_get Add ids_bulk* functions
- Loading branch information
Showing
21 changed files
with
1,230 additions
and
387 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
#' Download and Process Bulk Data File for International Debt Statistics | ||
#' | ||
#' This function downloads a data file from the World Bank International Debt | ||
#' Statistics (IDS), reads and processes the data into a tidy format. | ||
#' | ||
#' @param file_url A character string specifying the URL of the Excel file to | ||
#' download. This parameter is required (see \link{ids_bulk_files}). | ||
#' @param file_path An optional character string specifying the file path where | ||
#' the downloaded file will be saved. Defaults to a temporary file with `.xlsx` | ||
#' extension. The file will automatically be deleted after processing. | ||
#' @param quiet A logical parameter indicating whether messages should be | ||
#' printed to the console. | ||
#' @param timeout An integer specifying the timeout in seconds for downloading | ||
#' the file. Defaults to the current R timeout setting. | ||
#' @param warn_size A logical parameter indicating whether to warn about large | ||
#' downloads. Defaults to TRUE. | ||
#' | ||
#' @return A tibble containing processed debt statistics data with the following | ||
#' columns: | ||
#' \describe{ | ||
#' \item{geography_id}{The unique identifier for the geography (e.g., "ZMB").} | ||
#' \item{series_id}{The unique identifier for the series (e.g., | ||
#' "DT.DOD.DPPG.CD").} | ||
#' \item{counterpart_id}{The unique identifier for the counterpart series.} | ||
#' \item{year}{The year corresponding to the data (as an integer).} | ||
#' \item{value}{The numeric value representing the statistic for the given | ||
#' geography, series, counterpart, and year.} | ||
#' } | ||
#' | ||
#' @export | ||
#' @examples | ||
#' \dontrun{ | ||
#' available_files <- ids_bulk_files() | ||
#' data <- ids_bulk( | ||
#' available_files$file_url[1] | ||
#' ) | ||
#' } | ||
#' | ||
ids_bulk <- function( | ||
file_url, | ||
file_path = tempfile(fileext = ".xlsx"), | ||
quiet = FALSE, | ||
timeout = getOption("timeout", 60), | ||
warn_size = TRUE | ||
) { | ||
rlang::check_installed("readxl", reason = "to download bulk files.") | ||
|
||
# Register cleanup immediately after creating temporary file | ||
on.exit(unlink(file_path)) | ||
|
||
# Download file with size checks and validation | ||
download_bulk_file(file_url, file_path, timeout, warn_size, quiet) | ||
|
||
# Read and process the data | ||
if (!quiet) message("Reading in file.") | ||
bulk_data <- read_bulk_file(file_path) | ||
|
||
if (!quiet) message("Processing file.") | ||
bulk_data <- process_bulk_data(bulk_data) | ||
|
||
# Return the processed data | ||
return(bulk_data) | ||
} | ||
|
||
#' Get response headers from a URL | ||
#' | ||
#' @param file_url URL to request headers from | ||
#' @return List of response headers | ||
get_response_headers <- function(file_url) { | ||
httr2::request(file_url) |> | ||
httr2::req_headers("Accept" = "*/*") |> | ||
httr2::req_perform() |> | ||
httr2::resp_headers() | ||
} | ||
|
||
#' Download bulk data file with validation | ||
#' | ||
#' @param file_url URL of the file to download | ||
#' @param file_path Path where file should be saved | ||
#' @param timeout Timeout in seconds | ||
#' @param warn_size Whether to warn about large files | ||
#' @param quiet Whether to suppress messages | ||
download_bulk_file <- function(file_url, file_path, timeout, warn_size, quiet) { | ||
# Get file size before downloading | ||
response_headers <- get_response_headers(file_url) | ||
size_mb <- as.numeric(response_headers$`content-length`) / 1024^2 | ||
formatted_size <- format(round(size_mb, 1), nsmall = 1) | ||
|
||
if (warn_size && size_mb > 100) { | ||
warning( | ||
sprintf( | ||
paste( | ||
"This file is %s MB and may take several minutes to download.", | ||
"Current timeout setting: %s seconds.", | ||
"Use warn_size=FALSE to disable this warning.", | ||
sep = "\n" | ||
), | ||
formatted_size, | ||
timeout | ||
), | ||
call. = FALSE | ||
) | ||
|
||
# Interactive confirmation | ||
if (warn_size && check_interactive()) { | ||
response <- readline("Do you want to continue with the download? (y/N): ") | ||
if (!tolower(response) %in% c("y", "yes")) { | ||
stop("Download cancelled by user", call. = FALSE) | ||
} | ||
} | ||
} | ||
|
||
# Print message about file download | ||
if (!quiet) message("Downloading file to: {file_path}") | ||
|
||
# Download with timeout handling | ||
withr::with_options( | ||
list(timeout = timeout), | ||
tryCatch({ | ||
download_file(file_url, destfile = file_path, quiet = quiet) | ||
}, | ||
error = function(e) { | ||
if (grepl("timeout|cannot open URL", e$message, ignore.case = TRUE)) { | ||
stop( | ||
paste0( | ||
"Download timed out after ", timeout, " seconds.\n", | ||
"Try increasing the timeout parameter", | ||
" (e.g., timeout=600 for 10 minutes)" | ||
), | ||
call. = FALSE | ||
) | ||
} | ||
stop(e$message, call. = FALSE) | ||
}) | ||
) | ||
|
||
# Validate downloaded file | ||
validate_file(file_path) | ||
} | ||
|
||
#' Validate that downloaded file exists and is not empty | ||
#' | ||
#' @param file_path Path to file to validate | ||
validate_file <- function(file_path) { | ||
if (!file.exists(file_path)) { | ||
stop("Download failed: File not created") | ||
} | ||
if (file.size(file_path) == 0) { | ||
unlink(file_path) | ||
stop("Download failed: Empty file") | ||
} | ||
} | ||
|
||
#' Read bulk file and determine column types | ||
#' | ||
#' @param file_path Path to Excel file | ||
#' @return Raw data frame from Excel file | ||
read_bulk_file <- function(file_path) { | ||
available_columns <- readxl::read_excel(path = file_path, n_max = 0) |> | ||
colnames() | ||
relevant_columns <- tibble(names = available_columns) |> | ||
mutate( | ||
type = if_else(grepl(pattern = "[0:9]", .data$names), "numeric", "text") | ||
) |> | ||
filter(!grepl("column", names, ignore.case = TRUE)) | ||
|
||
readxl::read_excel( | ||
path = file_path, | ||
range = readxl::cell_cols(seq_len(nrow(relevant_columns))), | ||
col_types = relevant_columns$type | ||
) | ||
} | ||
|
||
#' Process bulk data into tidy format | ||
#' | ||
#' @param bulk_raw Raw data frame from Excel file | ||
#' @return Processed tibble in tidy format | ||
process_bulk_data <- function(bulk_raw) { | ||
bulk_raw |> | ||
select(-c("Country Name", "Classification Name")) |> | ||
select( | ||
geography_id = "Country Code", | ||
series_id = "Series Code", | ||
counterpart_id = "Series Name", | ||
everything() | ||
) |> | ||
tidyr::pivot_longer( | ||
cols = -c("geography_id", "series_id", "counterpart_id"), | ||
names_to = "year" | ||
) |> | ||
mutate(year = as.integer(.data$year)) |> | ||
tidyr::drop_na() | ||
} | ||
|
||
#' Check if R is running interactively | ||
#' | ||
#' Wrapper around base::interactive() to make the function testable. | ||
#' This function exists primarily to facilitate testing of interactive features. | ||
#' | ||
#' @return Logical indicating whether R is running interactively | ||
#' @keywords internal | ||
check_interactive <- function() { | ||
interactive() | ||
} | ||
|
||
#' Download a file using utils::download.file | ||
#' | ||
#' Wrapper around utils::download.file to facilitate testing. | ||
#' | ||
#' @param url URL of file to download | ||
#' @param destfile Destination file path | ||
#' @param quiet Whether to suppress messages | ||
#' @return Invisibly returns the status code from download.file | ||
#' @keywords internal | ||
download_file <- function(url, destfile, quiet) { | ||
utils::download.file(url, destfile = destfile, quiet = quiet, mode = "wb") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#' Retrieve Available Bulk Download Files for International Debt Statistics | ||
#' | ||
#' This function returns a tibble with metadata for files available for bulk | ||
#' download via the World Bank International Debt Statistics (IDS). It includes | ||
#' information such as file names, URLs, and the last update dates for each file | ||
#' in Excel (xlsx) format. | ||
#' | ||
#' @return A tibble containing the available files and their metadata: | ||
#' \describe{ | ||
#' \item{file_name}{The name of the file available for download.} | ||
#' \item{file_url}{The URL to download the file in Excel format.} | ||
#' \item{last_updated_date}{The date when the file was last updated.} | ||
#' } | ||
#' | ||
#' @export | ||
#' | ||
#' @examplesIf curl::has_internet() | ||
#' ids_bulk_files() | ||
#' | ||
ids_bulk_files <- function() { | ||
|
||
rlang::check_installed( | ||
"jsonlite", reason = "to download bulk files." | ||
) | ||
|
||
ids_meta <- jsonlite::fromJSON( | ||
txt = paste0( | ||
"https://datacatalogapi.worldbank.org/ddhxext/DatasetDownload", | ||
"?dataset_unique_id=0038015&version_id=" | ||
) | ||
) | ||
|
||
bulk_files <- ids_meta$resources |> | ||
as_tibble() |> | ||
select("name", "distribution", "last_updated_date") |> | ||
tidyr::unnest("distribution") |> | ||
filter(.data$distribution_format == "xlsx") |> | ||
select(file_name = "name", file_url = "url", "last_updated_date") |> | ||
mutate(last_updated_date = as.Date(.data$last_updated_date)) | ||
|
||
bulk_files | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#' Retrieve Bulk Series Metadata for International Debt Statistics | ||
#' | ||
#' This function retrieves a tibble with metadata for series available via | ||
#' bulk download of the World Bank International Debt Statistics (IDS). | ||
#' | ||
#' @return A tibble containing the available series and their metadata: | ||
#' \describe{ | ||
#' \item{series_id}{The unique identifier for the series (e.g., | ||
#' "BN.CAB.XOKA.CD").} | ||
#' \item{series_name}{The name of the series (e.g., "Current account balance | ||
#' (current US$)").} | ||
#' \item{source_id}{The ID of the data source providing the indicator.} | ||
#' \item{source_name}{The name or description of the source of the indicator | ||
#' data.} | ||
#' \item{source_note}{Additional notes or descriptions about the data source.} | ||
#' \item{source_organization}{The organization responsible for the data | ||
#' source.} | ||
#' } | ||
#' | ||
#' @export | ||
#' | ||
#' @examplesIf curl::has_internet() | ||
#' ids_bulk_series() | ||
#' | ||
ids_bulk_series <- function() { | ||
|
||
rlang::check_installed( | ||
"jsonlite", reason = "to retrieve available series via bulk download." | ||
) | ||
|
||
ids_meta <- jsonlite::fromJSON( | ||
txt = paste0( | ||
"https://datacatalogapi.worldbank.org/ddhxext/DatasetDownload", | ||
"?dataset_unique_id=0038015&version_id=" | ||
) | ||
) | ||
|
||
bulk_series <- ids_meta$indicators |> | ||
as_tibble() |> | ||
select("lineage") |> | ||
tidyr::unnest("lineage") |> | ||
select(series_id = "harvest_system_reference") | ||
|
||
api_series <- ids_list_series() | ||
|
||
bulk_series <- bulk_series |> | ||
left_join(api_series, join_by("series_id")) | ||
|
||
bulk_series | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.