Merge pull request #55 from Teal-Insights/54-fix-warning-error-in-ids_bulk-download-example

Fix warning & error in ids bulk download example
Teal-Insights · Dec 6, 2024 · b6f1b9e · b6f1b9e
2 parents aa48566 + ea7addf
commit b6f1b9e
Show file tree

Hide file tree

Showing 8 changed files with 57 additions and 14 deletions.
diff --git a/R/ids_bulk.R b/R/ids_bulk.R
@@ -156,18 +156,26 @@ validate_file <- function(file_path) {
 #' @noRd
 #'
 read_bulk_file <- function(file_path) {
-  available_columns <- readxl::read_excel(path = file_path, n_max = 0) |>
-    colnames()
+  # Read only the header of the Excel file (n_max = 0) to get column names
+  header_row <- readxl::read_excel(path = file_path, n_max = 0)
+
+  # Get all column names
+  available_columns <- header_row |> colnames()
+
+  # Initialize a helper tibble mapping relevant column names to types
   relevant_columns <- tibble(names = available_columns) |>
+    # Drop column names that contain "column" (which are empty in the bulk file)
+    filter(!grepl("column", .data$names, ignore.case = TRUE)) |>
+    # Columns whose name contains a digit (the year columns) are numeric, all
+    # others are text. The character class must be the range "[0-9]";
+    # "[0:9]" would only match the literal characters "0", ":" and "9".
     mutate(
-      type = if_else(grepl(pattern = "[0:9]", .data$names), "numeric", "text")
-    ) |>
-    filter(!grepl("column", names, ignore.case = TRUE))
+      types = if_else(grepl(pattern = "[0-9]", .data$names), "numeric", "text")
+    )
 
+  # Read in the data from the Excel file for only the relevant columns.
+  # The range covers the first nrow(relevant_columns) sheet columns, so this
+  # relies on the dropped "column" headers all sitting to the right of the
+  # retained ones; otherwise col_types would not line up with the range.
   readxl::read_excel(
     path = file_path,
     range = readxl::cell_cols(seq_len(nrow(relevant_columns))),
-    col_types = relevant_columns$type
+    col_types = relevant_columns$types
   )
 }
 
@@ -179,18 +187,26 @@ read_bulk_file <- function(file_path) {
 #'
 process_bulk_data <- function(bulk_raw) {
   bulk_raw |>
-    select(-c("Country Name", "Classification Name")) |>
+    # Keep only the three identifier code columns plus every column whose name
+    # is exactly four digits (the year columns); all other columns are dropped
+    select(
+      "Country Code", "Series Code", "Counterpart-Area Code",
+      matches("^\\d{4}$")
+    ) |>
+    # Rename the identifier columns to match the package data model;
+    # everything() carries the remaining (year) columns along unchanged
     select(
       geography_id = "Country Code",
       series_id = "Series Code",
-      counterpart_id = "Series Name",
+      counterpart_id = "Counterpart-Area Code",
       everything()
     ) |>
+    # Pivot to long (tidy) format: one output row per input row and year column.
+    # No values_to is given, so the cell values land in pivot_longer()'s
+    # default "value" column
     tidyr::pivot_longer(
       cols = -c("geography_id", "series_id", "counterpart_id"),
       names_to = "year"
     ) |>
+    # Convert year (taken from the column names, hence character) to integer
     mutate(year = as.integer(.data$year)) |>
+    # Drop rows with an NA in any column (no columns are passed to drop_na())
     tidyr::drop_na()
 }
 

diff --git a/tests/testthat/data/download_bulk_file_output.xlsx b/tests/testthat/data/download_bulk_file_output.xlsx
diff --git a/tests/testthat/data/read_bulk_file_output.rds b/tests/testthat/data/read_bulk_file_output.rds
diff --git a/tests/testthat/data/read_bulk_info_output.rds b/tests/testthat/data/read_bulk_info_output.rds
diff --git a/tests/testthat/data/sample.rds b/tests/testthat/data/sample.rds
diff --git a/tests/testthat/data/sample.xlsx b/tests/testthat/data/sample.xlsx
diff --git a/tests/testthat/test-ids_bulk.R b/tests/testthat/test-ids_bulk.R
@@ -65,7 +65,7 @@ test_that("ids_bulk handles message parameter correctly", {
   mock_data <- tibble::tibble(
     "Country Code" = "ABC",
     "Country Name" = "Test Country",
-    "Classification Name" = "Test Class",
+    "Counterpart-Area Code" = "Test Counterpart",
     "Series Code" = "TEST.1",
     "Series Name" = "Test Series",
     "2020" = 100
@@ -227,33 +227,52 @@ test_that("download_bulk_file downloads files correctly", {
 test_that("read_bulk_file reads files correctly", {
   skip_on_cran()
 
-  test_path <- test_path("data/sample.xlsx")
+  # NOTE(review): this local variable shadows the test_path() helper it is
+  # assigned from; the call below receives the resulting path value
+  test_path <- test_path("data/download_bulk_file_output.xlsx")
   result <- read_bulk_file(test_path)
+  # Only the class of the result is checked here, not its contents
   expect_s3_class(result, "tbl_df")
 })
 
 test_that("process_bulk_data processes data correctly", {
-  test_path <- test_path("data/sample.rds")
-  result <- process_bulk_data(readRDS(test_path))
+  # Load the stored fixture; it is both the input to process_bulk_data() and
+  # the source of the expected identifier values below
+  test_path <- test_path("data/read_bulk_file_output.rds")
+  test_data <- readRDS(test_path)
 
+  result <- process_bulk_data(test_data)
+
+  # Test structure
   expect_s3_class(result, "tbl_df")
   expect_named(
     result,
     c("geography_id", "series_id", "counterpart_id", "year", "value")
   )
 
+  # Test data types
   expect_type(result$geography_id, "character")
   expect_type(result$series_id, "character")
   expect_type(result$counterpart_id, "character")
   expect_type(result$year, "integer")
   expect_type(result$value, "double")
 
-  expect_gt(nrow(result), 0)
+  # Each code in test data should occur 17 times (the number of non-NA years).
+  # rep(each = 17) repeats every input code 17 times consecutively, so these
+  # checks also pin the output to input row order.
+  expected_country_codes <- rep(test_data$`Country Code`, each = 17)
+  expect_equal(result$geography_id, expected_country_codes)
+
+  expected_counterpart_codes <- rep(
+    test_data$`Counterpart-Area Code`, each = 17
+  )
+  expect_equal(result$counterpart_id, expected_counterpart_codes)
+
+  expected_series_codes <- rep(test_data$`Series Code`, each = 17)
+  expect_equal(result$series_id, expected_series_codes)
 
+  # Years should span from 2006 to 2022 (non-NA years); rep(times = ...)
+  # cycles the whole 2006:2022 sequence once per input row
+  expect_equal(result$year, rep(2006:2022, times = nrow(test_data)))
+
+  # No NAs should be present
   expect_false(any(is.na(result$geography_id)))
   expect_false(any(is.na(result$series_id)))
   expect_false(any(is.na(result$counterpart_id)))
   expect_false(any(is.na(result$year)))
+  expect_false(any(is.na(result$value)))
 })
 
 test_that("ids_bulk downloads and processes data correctly", {
@@ -271,7 +290,9 @@ test_that("ids_bulk downloads and processes data correctly", {
   local_mocked_bindings(
     check_interactive = function() FALSE,
     download_bulk_file = function(...) TRUE,
-    read_bulk_file = function(...) readRDS(test_path("data/sample.rds"))
+    read_bulk_file = function(...) {
+      readRDS(test_path("data/read_bulk_file_output.rds"))
+    }
   )
 
   result <- ids_bulk(

diff --git a/tests/testthat/test-ids_bulk_files.R b/tests/testthat/test-ids_bulk_files.R
@@ -1,6 +1,12 @@
 test_that("ids_bulk_files returns a tibble with expected columns", {
   skip_if_not_installed("jsonlite")
 
+  local_mocked_bindings(
+    read_bulk_info = function() {
+      readRDS(test_path("data/read_bulk_info_output.rds"))
+    }
+  )
+
   result <- ids_bulk_files()
   expected_columns <- c("file_name", "file_url", "last_updated_date")