Skip to content

Commit

Permalink
Merge pull request #55 from Teal-Insights/54-fix-warning-error-in-ids…
Browse files Browse the repository at this point in the history
…_bulk-download-example

Fix warning & error in ids bulk download example
  • Loading branch information
chriscarrollsmith authored Dec 6, 2024
2 parents aa48566 + ea7addf commit b6f1b9e
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 14 deletions.
32 changes: 24 additions & 8 deletions R/ids_bulk.R
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,26 @@ validate_file <- function(file_path) {
#' @noRd
#'
read_bulk_file <- function(file_path) {
# Purpose: read the Excel file at `file_path`, keeping only the columns whose
# header does not contain "column", with an explicit type per column.
# NOTE(review): this listing looks like a rendered diff, so superseded and
# replacement lines appear side by side (e.g. `type =` and `types =`, two
# `filter()` calls); presumably only one of each pair is in the live code.
available_columns <- readxl::read_excel(path = file_path, n_max = 0) |>
colnames()
# Read in first row of Excel file to get column names
# (n_max = 0 asks for zero data rows, so only the header is parsed)
header_row <- readxl::read_excel(path = file_path, n_max = 0)

# Get all column names
available_columns <- header_row |> colnames()

# Initialize a helper tibble mapping relevant column names to types
relevant_columns <- tibble(names = available_columns) |>
# Drop column names that contain "column" (which are empty in the bulk file)
filter(!grepl("column", .data$names, ignore.case = TRUE)) |>
# Year columns are numeric, all others are text
# NOTE(review): "[0:9]" is a character class matching "0", ":" or "9", not
# the digit range "[0-9]"; a name is only typed numeric if it contains one
# of those three characters -- confirm whether "[0-9]" was intended.
mutate(
type = if_else(grepl(pattern = "[0:9]", .data$names), "numeric", "text")
) |>
filter(!grepl("column", names, ignore.case = TRUE))
types = if_else(grepl(pattern = "[0:9]", .data$names), "numeric", "text")
)

# Read in the data from the Excel file for only the relevant columns
# The range is positional: it selects the first N sheet columns, where N is
# the number of rows left in `relevant_columns`.
# NOTE(review): assumes the dropped "column" headers are all trailing, so the
# first N columns line up with `relevant_columns` -- TODO confirm.
readxl::read_excel(
path = file_path,
range = readxl::cell_cols(seq_len(nrow(relevant_columns))),
col_types = relevant_columns$type
col_types = relevant_columns$types
)
}

Expand All @@ -179,18 +187,26 @@ read_bulk_file <- function(file_path) {
#'
process_bulk_data <- function(bulk_raw) {
# Purpose: reshape the wide table `bulk_raw` (one column per year) into long
# format with columns geography_id, series_id, counterpart_id, year and the
# pivoted values, then drop incomplete rows.
# NOTE(review): this listing looks like a rendered diff, so superseded and
# replacement lines appear together (the `select(-c(...))` call and the two
# `counterpart_id =` entries); presumably only one of each is in the live code.
bulk_raw |>
select(-c("Country Name", "Classification Name")) |>
# Select only the relevant columns
# (the three code columns plus every column whose name is exactly four digits)
select(
"Country Code", "Series Code", "Counterpart-Area Code",
matches("^\\d{4}$")
) |>
# Rename columns to match the package data model
# (select() with `new = "old"` pairs renames and reorders in one step;
# everything() keeps the remaining columns after the renamed ones)
select(
geography_id = "Country Code",
series_id = "Series Code",
counterpart_id = "Series Name",
counterpart_id = "Counterpart-Area Code",
everything()
) |>
# Pivot to long (tidy) format
# (values_to is not set, so the value column keeps pivot_longer()'s default
# name)
tidyr::pivot_longer(
cols = -c("geography_id", "series_id", "counterpart_id"),
names_to = "year"
) |>
# Convert year to integer
mutate(year = as.integer(.data$year)) |>
# Drop rows with NA values
# (drop_na() with no column arguments checks every column)
tidyr::drop_na()
}

Expand Down
Binary file not shown.
Binary file added tests/testthat/data/read_bulk_file_output.rds
Binary file not shown.
Binary file added tests/testthat/data/read_bulk_info_output.rds
Binary file not shown.
Binary file removed tests/testthat/data/sample.rds
Binary file not shown.
Binary file removed tests/testthat/data/sample.xlsx
Binary file not shown.
33 changes: 27 additions & 6 deletions tests/testthat/test-ids_bulk.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ test_that("ids_bulk handles message parameter correctly", {
mock_data <- tibble::tibble(
"Country Code" = "ABC",
"Country Name" = "Test Country",
"Classification Name" = "Test Class",
"Counterpart-Area Code" = "Test Counterpart",
"Series Code" = "TEST.1",
"Series Name" = "Test Series",
"2020" = 100
Expand Down Expand Up @@ -227,33 +227,52 @@ test_that("download_bulk_file downloads files correctly", {
test_that("read_bulk_file reads files correctly", {
skip_on_cran()

# NOTE(review): two fixture paths are assigned back to back; this listing
# looks like a rendered diff, so presumably only one of them is in the live
# test. As written, the second assignment is the one that takes effect.
test_path <- test_path("data/sample.xlsx")
test_path <- test_path("data/download_bulk_file_output.xlsx")
# Only the class of the result is asserted here, not its contents.
result <- read_bulk_file(test_path)
expect_s3_class(result, "tbl_df")
})

test_that("process_bulk_data processes data correctly", {
# NOTE(review): the fixture is loaded twice below (sample.rds, then
# read_bulk_file_output.rds); this listing looks like a rendered diff, so
# presumably only one variant is in the live test. As written, the later
# `result` assignment is the one the expectations see.
test_path <- test_path("data/sample.rds")
result <- process_bulk_data(readRDS(test_path))
test_path <- test_path("data/read_bulk_file_output.rds")
test_data <- readRDS(test_path)

result <- process_bulk_data(test_data)

# Test structure
expect_s3_class(result, "tbl_df")
expect_named(
result,
c("geography_id", "series_id", "counterpart_id", "year", "value")
)

# Test data types
expect_type(result$geography_id, "character")
expect_type(result$series_id, "character")
expect_type(result$counterpart_id, "character")
expect_type(result$year, "integer")
expect_type(result$value, "double")

expect_gt(nrow(result), 0)
# Each code in test data should occur 17 times (the number of non-NA years)
# (`each = 17` expects every input row's 17 long-format rows to be adjacent)
expected_country_codes <- rep(test_data$`Country Code`, each = 17)
expect_equal(result$geography_id, expected_country_codes)

expected_counterpart_codes <- rep(
test_data$`Counterpart-Area Code`, each = 17
)
expect_equal(result$counterpart_id, expected_counterpart_codes)

expected_series_codes <- rep(test_data$`Series Code`, each = 17)
expect_equal(result$series_id, expected_series_codes)

# Years should span from 2006 to 2022 (non-NA years)
# (`times =` repeats the whole 2006:2022 run once per fixture row)
expect_equal(result$year, rep(2006:2022, times = nrow(test_data)))

# No NAs should be present
expect_false(any(is.na(result$geography_id)))
expect_false(any(is.na(result$series_id)))
expect_false(any(is.na(result$counterpart_id)))
expect_false(any(is.na(result$year)))
expect_false(any(is.na(result$value)))
})

test_that("ids_bulk downloads and processes data correctly", {
Expand All @@ -271,7 +290,9 @@ test_that("ids_bulk downloads and processes data correctly", {
local_mocked_bindings(
check_interactive = function() FALSE,
download_bulk_file = function(...) TRUE,
read_bulk_file = function(...) readRDS(test_path("data/sample.rds"))
read_bulk_file = function(...) {
readRDS(test_path("data/read_bulk_file_output.rds"))
}
)

result <- ids_bulk(
Expand Down
6 changes: 6 additions & 0 deletions tests/testthat/test-ids_bulk_files.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
test_that("ids_bulk_files returns a tibble with expected columns", {
skip_if_not_installed("jsonlite")

local_mocked_bindings(
read_bulk_info = function() {
readRDS(test_path("data/read_bulk_info_output.rds"))
}
)

result <- ids_bulk_files()
expected_columns <- c("file_name", "file_url", "last_updated_date")

Expand Down

0 comments on commit b6f1b9e

Please sign in to comment.