Skip to content

Commit

Permalink
Add Azure file download
Browse files Browse the repository at this point in the history
  • Loading branch information
zsusswein committed Aug 7, 2024
1 parent d3b008f commit 7182a99
Show file tree
Hide file tree
Showing 8 changed files with 289 additions and 1 deletion.
8 changes: 7 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
Suggests:
testthat (>= 3.0.0)
testthat (>= 3.0.0),
withr
Config/testthat/edition: 3
Imports:
AzureRMR,
AzureStor,
cli,
rlang
URL: https://cdcgov.github.io/cfa-epinow2-pipeline/
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(add_two_numbers)
export(download_from_azure_blob)
export(fetch_blob_container)
export(fetch_credential_from_env_var)
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# CFAEpiNow2Pipeline (development version)

* Azure Blob file download utilities
* CI running on Ubuntu only & working pkgdown deploy to GitHub Pages
* Initial R package with checks running in CI
145 changes: 145 additions & 0 deletions R/azure.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#' Download specified blobs from Blob Storage and save them in a local dir
#'
#' Connects to `container_name` once, then downloads each blob in `blob_names`
#' in turn to `file.path(local_dest, blob)`. A failure to connect, or to
#' download any single blob, is re-thrown with extra context and the original
#' error attached as its parent.
#'
#' Note: it might be wise to instead specify a blob prefix, list the blobs, and
#' download all the listed blobs. This would give more flexibility when
#' downloading whole remote directories (like delta tables).
#'
#' @param blob_names A vector of blobs to download from `container_name`
#' @param local_dest The path to the local directory to save the files in
#' @param container_name The Azure Blob Storage container with `blob_names`
#'
#' @return `NULL`, invisibly, on success
#' @export
download_from_azure_blob <- function(blob_names, local_dest, container_name) {
  blob_container <- rlang::try_fetch(
    fetch_blob_container(container_name),
    error = function(con) {
      cli::cli_abort(
        c(
          "Unable to authenticate connection to Blob endpoint",
          "!" = "Check correct credentials are present as env variables",
          "!" = "Check container {.var {container_name}} is correct"
        ),
        parent = con
      )
    }
  )

  for (blob in blob_names) {
    local_file_path <- file.path(local_dest, blob)
    rlang::try_fetch(
      download_file_from_container(
        blob,
        blob_container,
        local_file_path
      ),
      error = function(con) {
        cli::cli_abort(
          c(
            "Error downloading blob {.path {blob}}",
            "Using container {.path {container_name}}",
            # The inner braces are required for cli to interpolate the value;
            # without them the literal text "local_file_path" is printed.
            "Writing to local file path {.path {local_file_path}}"
          ),
          parent = con
        )
      }
    )
  }
  cli::cli_alert_success("Blobs {.path {blob_names}} downloaded successfully")
  invisible(NULL)
}

# Internal helper: download a single blob from an already-connected container.
#
# Announces the download, copies `blob_storage_path` out of `container` into
# `local_file_path` (replacing any file already at that path), reports success,
# and invisibly returns `local_file_path`.
download_file_from_container <- function(blob_storage_path, container, local_file_path) {
  cli::cli_alert_info(
    "Downloading blob {.path {blob_storage_path}} to {.path {local_file_path}}"
  )

  # `overwrite = TRUE` so that re-running a download replaces the local copy
  AzureStor::download_blob(
    container,
    src = blob_storage_path,
    dest = local_file_path,
    overwrite = TRUE
  )

  cli::cli_alert_success(
    "Blob {.path {blob_storage_path}} downloaded successfully"
  )

  invisible(local_file_path)
}

#' Connect to an Azure Blob Storage container using env var credentials
#'
#' This **impure** function reads the following environment variables:
#' * TENANT_ID
#' * SUBSCRIPTION
#' * RESOURCE_GROUP
#' * STORAGE_ACCOUNT
#'
#' Each one is read with [fetch_credential_from_env_var()], so the function
#' errors out if any of them is unset or empty. The values are then used to
#' log in, walk down to the storage account, and open `container_name` on that
#' account's blob endpoint using the account's `key1` access key.
#'
#' @param container_name The Azure Blob Storage container associated with the
#'   credentials
#' @return The container object returned by `AzureStor::storage_container()`
#' @export
fetch_blob_container <- function(container_name) {
  cli::cli_alert_info(
    "Attempting to connect to container {.var {container_name}}"
  )

  # Step 1: read every credential up front so a missing one fails early
  cli::cli_alert_info("Loading Azure credentials from env vars")
  tenant_id <- fetch_credential_from_env_var("TENANT_ID")
  subscription_id <- fetch_credential_from_env_var("SUBSCRIPTION")
  resource_group_name <- fetch_credential_from_env_var("RESOURCE_GROUP")
  storage_account_name <- fetch_credential_from_env_var("STORAGE_ACCOUNT")
  cli::cli_alert_success("Credentials loaded successfully")

  # Step 2: login -> subscription -> resource group -> storage account
  cli::cli_alert_info("Authenticating with loaded credentials")
  storage_account <- AzureRMR::get_azure_login(tenant_id)$
    get_subscription(subscription_id)$
    get_resource_group(resource_group_name)$
    get_storage_account(storage_account_name)

  # Step 3: use the account's first access key to open its blob endpoint
  access_key <- storage_account$list_keys()[["key1"]]
  blob_endpoint <- AzureStor::blob_endpoint(
    storage_account$properties$primaryEndpoints$blob,
    key = access_key
  )

  # Step 4: open the requested container on that endpoint
  container <- AzureStor::storage_container(blob_endpoint, container_name)
  cli::cli_alert_success("Authenticated connection to {.var {container_name}}")

  container
}

#' Fetch Azure credential from environment variable
#'
#' Reads `env_var` with [Sys.getenv()] and throws an informative error of
#' class `CFA_Rt` if the variable is unset or set to the empty string.
#'
#' @param env_var A character, the name of the environment variable holding
#'   the credential to fetch
#'
#' @return The value of the environment variable
#' @export
fetch_credential_from_env_var <- function(env_var) {
  # `Sys.getenv()` returns "" for an unset variable, so unset and empty are
  # handled by the same check.
  credential <- Sys.getenv(env_var)

  if (credential == "") {
    # No `parent` is passed: this error originates here, there is no
    # underlying condition to chain to.
    cli::cli_abort(
      c(
        "Error loading Azure credentials from environment variables",
        "!" = "Environment variable {.envvar {env_var}} not specified or empty"
      ),
      class = "CFA_Rt"
    )
  }

  credential
}
23 changes: 23 additions & 0 deletions man/download_from_azure_blob.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions man/fetch_blob_container.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/fetch_credential_from_env_var.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

66 changes: 66 additions & 0 deletions tests/testthat/test-azure.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# NOTE: these tests don't test the happy path because they don't interact with
# Azure resources and mocking a full Azure Blob interface is hard. Instead, they
# test that expected errors are thrown and that if Azure access is mocked, the
# core download function runs all the way through. The function
# `download_file_from_container` isn't tested because it's a simple wrapper
# around `AzureStor::download_blob()` and `testthat::with_mocked_bindings()`
# advises mocking wrappers for tests rather than injecting the mock into the
# external lib.
test_that("Downloading file smoke test", {
  blob_name <- "not_a_real_file.ext"

  # Stand-ins for the two package functions that would otherwise talk to Azure
  mock_fetch_container <- function(...) "test-container"
  mock_download_file <- function(...) blob_name

  # Run inside a throwaway working directory so `local_dest = "."` never
  # touches the real one
  result <- testthat::with_mocked_bindings(
    code = withr::with_tempdir(
      download_from_azure_blob(
        blob_names = blob_name,
        local_dest = ".",
        container_name = "test_container"
      )
    ),
    fetch_blob_container = mock_fetch_container,
    download_file_from_container = mock_download_file
  )

  expect_null(result)
})

test_that("Download fail throws informative error", {
  # Errors on fetching credentials. The Azure env vars are explicitly unset so
  # the outcome does not depend on the machine running the tests: if they
  # happened to be set, this call would go on to attempt a real Azure login.
  withr::with_envvar(
    c(
      TENANT_ID = NA_character_,
      SUBSCRIPTION = NA_character_,
      RESOURCE_GROUP = NA_character_,
      STORAGE_ACCOUNT = NA_character_
    ),
    expect_error(
      download_from_azure_blob(
        blob_names = c("test.json"),
        local_dest = "./",
        container_name = "test_container"
      )
    )
  )

  # Credentials mocked, errors on downloading file: only the container fetch
  # is mocked (with a plain string), so the real download helper runs and is
  # expected to fail.
  testthat::with_mocked_bindings(
    {
      withr::with_tempdir({
        expect_error(
          download_from_azure_blob(
            blob_names = c("not_a_real_file.ext"),
            local_dest = ".",
            container_name = "test_container"
          )
        )
      })
    },
    fetch_blob_container = function(...) "test-container"
  )
})

test_that("Credential fetched successfully from env var", {
  # Set the variable only for the duration of the lookup
  fetched <- withr::with_envvar(
    new = c(KEY = "VALUE"),
    code = fetch_credential_from_env_var("KEY")
  )

  expect_equal(fetched, "VALUE")
})

test_that("Missing credential fails", {
  # A variable name that is not expected to exist in the environment
  expect_error(fetch_credential_from_env_var("NOT_A_REAL_KEY"))

  # A variable that exists but is set to the empty string
  withr::with_envvar(
    new = c(MISSING_KEY = ""),
    code = expect_error(fetch_credential_from_env_var("MISSING_KEY"))
  )
})

0 comments on commit 7182a99

Please sign in to comment.