Skip to content

Commit

Permalink
Support GCS service account impersonation et al. (#4799)
Browse files Browse the repository at this point in the history
[SC-42590](https://app.shortcut.com/tiledb-inc/story/42590)
[SC-42830](https://app.shortcut.com/tiledb-inc/story/42830)

This PR adds support for [Google Cloud service account
impersonation](https://cloud.google.com/docs/authentication/use-service-account-impersonation)
to the GCS VFS. This is enabled with a new config option
`vfs.gcs.impersonate_service_account` that also supports chained
impersonation if given a comma-separated list of service accounts.

Because we were using some older APIs to configure the GCS client and
these don't support impersonation, I also switched to the newer APIs.
The migration process was straightforward.

For ease of review, I suggest looking at each commit
individually.

---
TYPE: CONFIG
DESC: Add `vfs.gcs.impersonate_service_account` option that specifies a
service account to impersonate, or a comma-separated list for chained
impersonation.

---
TYPE: IMPROVEMENT
DESC: Stop using deprecated Google Cloud SDK APIs.

---------

Co-authored-by: Isaiah Norton <[email protected]>
  • Loading branch information
teo-tsirpanis and ihnorton committed Apr 10, 2024
1 parent fa516e3 commit 88f3d79
Show file tree
Hide file tree
Showing 9 changed files with 178 additions and 34 deletions.
3 changes: 3 additions & 0 deletions test/src/unit-capi-config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,7 @@ TEST_CASE("C API: Test config iter", "[capi][config]") {
all_param_values["vfs.read_logging_mode"] = "";
all_param_values["vfs.gcs.endpoint"] = "";
all_param_values["vfs.gcs.project_id"] = "";
all_param_values["vfs.gcs.impersonate_service_account"] = "";
all_param_values["vfs.gcs.max_parallel_ops"] =
std::to_string(std::thread::hardware_concurrency());
all_param_values["vfs.gcs.multi_part_size"] = "5242880";
Expand Down Expand Up @@ -760,6 +761,7 @@ TEST_CASE("C API: Test config iter", "[capi][config]") {
vfs_param_values["read_logging_mode"] = "";
vfs_param_values["gcs.endpoint"] = "";
vfs_param_values["gcs.project_id"] = "";
vfs_param_values["gcs.impersonate_service_account"] = "";
vfs_param_values["gcs.max_parallel_ops"] =
std::to_string(std::thread::hardware_concurrency());
vfs_param_values["gcs.multi_part_size"] = "5242880";
Expand Down Expand Up @@ -823,6 +825,7 @@ TEST_CASE("C API: Test config iter", "[capi][config]") {
std::map<std::string, std::string> gcs_param_values;
gcs_param_values["endpoint"] = "";
gcs_param_values["project_id"] = "";
gcs_param_values["impersonate_service_account"] = "";
gcs_param_values["max_parallel_ops"] =
std::to_string(std::thread::hardware_concurrency());
gcs_param_values["multi_part_size"] = "5242880";
Expand Down
2 changes: 1 addition & 1 deletion test/src/unit-cppapi-config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ TEST_CASE("C++ API: Config iterator", "[cppapi][config]") {
names.push_back(it->first);
}
// Check number of VFS params in default config object.
CHECK(names.size() == 65);
CHECK(names.size() == 66);
}

TEST_CASE("C++ API: Config Environment Variables", "[cppapi][config]") {
Expand Down
53 changes: 53 additions & 0 deletions test/src/unit-vfs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@
#include <azure/storage/blobs.hpp>
#include "tiledb/sm/filesystem/azure.h"
#endif
#ifdef HAVE_GCS
#include <google/cloud/internal/credentials_impl.h>
#include <google/cloud/storage/client.h>
#include "tiledb/sm/filesystem/gcs.h"
#endif
#include "test/support/src/vfs_helpers.h"
#include "tiledb/sm/filesystem/vfs.h"
#include "tiledb/sm/global_state/unit_test_config.h"
Expand Down Expand Up @@ -750,3 +755,51 @@ TEST_CASE("Validate vfs.s3.custom_headers.*", "[s3][custom-headers]") {
REQUIRE_THROWS_WITH(s3.flush_object(uri), matcher);
}
#endif

#ifdef HAVE_GCS
TEST_CASE(
    "Validate GCS service account impersonation", "[gcs][impersonation]") {
  ThreadPool thread_pool(2);
  Config cfg = set_config_params(true);
  GCS gcs;

  // Value written to the config option, and the target account / delegate
  // list we expect the resulting credentials to carry. Each SECTION below
  // fills these in for one scenario.
  std::string configured_accounts;
  std::string expected_target;
  std::vector<std::string> expected_delegates;

  SECTION("Simple") {
    // A single account: it is the target and there are no delegates.
    configured_accounts = "account1";
    expected_target = "account1";
    expected_delegates.clear();
  }

  SECTION("Delegated") {
    // Several accounts: the last one is the target, the ones before it are
    // the delegates, in order.
    configured_accounts = "account1,account2,account3";
    expected_target = "account3";
    expected_delegates = {"account1", "account2"};
  }

  // Test parsing an edge case.
  SECTION("Invalid") {
    // A lone comma yields an empty target and a single empty delegate.
    configured_accounts = ",";
    expected_target = "";
    expected_delegates = {""};
  }

  require_tiledb_ok(
      cfg.set("vfs.gcs.impersonate_service_account", configured_accounts));
  require_tiledb_ok(gcs.init(cfg, &thread_pool));

  auto credentials = gcs.make_credentials({});

  // We are using an internal class only for inspection purposes.
  auto impersonation_config =
      dynamic_cast<google::cloud::internal::ImpersonateServiceAccountConfig*>(
          credentials.get());

  REQUIRE(impersonation_config != nullptr);
  REQUIRE(impersonation_config->target_service_account() == expected_target);
  REQUIRE(impersonation_config->delegates() == expected_delegates);
}
#endif
5 changes: 5 additions & 0 deletions tiledb/api/c_api/config/config_api_external.h
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,11 @@ TILEDB_EXPORT void tiledb_config_free(tiledb_config_t** config) TILEDB_NOEXCEPT;
* - `vfs.gcs.project_id` <br>
* Set the GCS project id. <br>
* **Default**: ""
* - `vfs.gcs.impersonate_service_account` <br>
* Set the GCS service account to impersonate. A chain of impersonated
* accounts can be formed by specifying multiple service accounts, separated
* by commas. <br>
* **Default**: ""
* - `vfs.gcs.multi_part_size` <br>
* The part size (in bytes) used in GCS multi part writes.
* Any `uint64_t` value is acceptable. Note:
Expand Down
5 changes: 5 additions & 0 deletions tiledb/sm/config/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ const std::string Config::VFS_AZURE_RETRY_DELAY_MS = "800";
const std::string Config::VFS_AZURE_MAX_RETRY_DELAY_MS = "60000";
const std::string Config::VFS_GCS_ENDPOINT = "";
const std::string Config::VFS_GCS_PROJECT_ID = "";
const std::string Config::VFS_GCS_IMPERSONATE_SERVICE_ACCOUNT = "";
const std::string Config::VFS_GCS_MAX_PARALLEL_OPS =
Config::SM_IO_CONCURRENCY_LEVEL;
const std::string Config::VFS_GCS_MULTI_PART_SIZE = "5242880";
Expand Down Expand Up @@ -420,6 +421,9 @@ const std::map<std::string, std::string> default_config_values = {
"vfs.azure.max_retry_delay_ms", Config::VFS_AZURE_MAX_RETRY_DELAY_MS),
std::make_pair("vfs.gcs.endpoint", Config::VFS_GCS_ENDPOINT),
std::make_pair("vfs.gcs.project_id", Config::VFS_GCS_PROJECT_ID),
std::make_pair(
"vfs.gcs.impersonate_service_account",
Config::VFS_GCS_IMPERSONATE_SERVICE_ACCOUNT),
std::make_pair(
"vfs.gcs.max_parallel_ops", Config::VFS_GCS_MAX_PARALLEL_OPS),
std::make_pair("vfs.gcs.multi_part_size", Config::VFS_GCS_MULTI_PART_SIZE),
Expand Down Expand Up @@ -509,6 +513,7 @@ const std::set<std::string> Config::unserialized_params_ = {
"vfs.s3.aws_external_id",
"vfs.s3.aws_load_frequency",
"vfs.s3.aws_session_name",
"vfs.gcs.impersonate_service_account",
"rest.username",
"rest.password",
"rest.token",
Expand Down
3 changes: 3 additions & 0 deletions tiledb/sm/config/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,9 @@ class Config {
/** GCS project id. */
static const std::string VFS_GCS_PROJECT_ID;

/** GCS service account(s) to impersonate. */
static const std::string VFS_GCS_IMPERSONATE_SERVICE_ACCOUNT;

/** GCS max parallel ops. */
static const std::string VFS_GCS_MAX_PARALLEL_OPS;

Expand Down
5 changes: 5 additions & 0 deletions tiledb/sm/cpp_api/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,11 @@ class Config {
* - `vfs.gcs.project_id` <br>
* Set the GCS project id. <br>
* **Default**: ""
* - `vfs.gcs.impersonate_service_account` <br>
* Set the GCS service account to impersonate. A chain of impersonated
* accounts can be formed by specifying multiple service accounts, separated
* by commas. <br>
* **Default**: ""
* - `vfs.gcs.multi_part_size` <br>
* The part size (in bytes) used in GCS multi part writes.
* Any `uint64_t` value is acceptable. Note:
Expand Down
112 changes: 81 additions & 31 deletions tiledb/sm/filesystem/gcs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ Status GCS::init(const Config& config, ThreadPool* const thread_pool) {
}
project_id_ = config.get("vfs.gcs.project_id", &found);
assert(found);
impersonate_service_account_ =
config.get("vfs.gcs.impersonate_service_account", &found);
assert(found);
RETURN_NOT_OK(config.get<uint64_t>(
"vfs.gcs.max_parallel_ops", &max_parallel_ops_, &found));
assert(found);
Expand All @@ -127,20 +130,83 @@ Status GCS::init(const Config& config, ThreadPool* const thread_pool) {
return Status::Ok();
}

/**
 * Wraps a set of credentials in service account impersonation credentials.
 *
 * If `service_accounts` is empty, `credentials` is returned unchanged.
 * Otherwise the text after the last comma (or the whole string, if there is
 * no comma) is the account passed to
 * `MakeImpersonateServiceAccountCredentials`, and every comma-separated item
 * before it is forwarded, in order, through `DelegatesOption`. Items are
 * taken verbatim: they are not trimmed, and empty items are kept.
 *
 * @param credentials The set of credentials to start the chain.
 * @param service_accounts A comma-separated list of service accounts, where
 *     each account will be used to impersonate the next.
 * @param options Options to set to the credentials.
 * @return The new set of credentials.
 */
static shared_ptr<google::cloud::Credentials> apply_impersonation(
    shared_ptr<google::cloud::Credentials> credentials,
    std::string service_accounts,
    google::cloud::Options options) {
  // No impersonation requested.
  if (service_accounts.empty()) {
    return credentials;
  }

  const auto target_separator = service_accounts.rfind(',');
  if (target_separator != std::string::npos) {
    // Everything before the last comma is the list of intermediate accounts.
    const std::string_view chain(service_accounts.data(), target_separator);

    std::vector<std::string> delegates;
    for (std::string_view::size_type item_begin = 0;;) {
      const auto item_end = chain.find(',', item_begin);
      if (item_end == std::string_view::npos) {
        // Last intermediate account: take the rest of the view and stop.
        delegates.emplace_back(chain.substr(item_begin));
        break;
      }
      delegates.emplace_back(chain.substr(item_begin, item_end - item_begin));
      // Step over the comma to the start of the next item.
      item_begin = item_end + 1;
    }
    options.set<google::cloud::DelegatesOption>(std::move(delegates));

    // Keep only the final account. `chain` must not be used past this point,
    // because it views the buffer that is modified here.
    service_accounts.erase(0, target_separator + 1);
  }

  // Sanity check: whatever is left must be a single account with no commas.
  if (service_accounts.find(',') != std::string::npos) {
    throw std::logic_error(
        "Internal error: service_accounts string was not decomposed.");
  }

  // Create the credential.
  return google::cloud::MakeImpersonateServiceAccountCredentials(
      std::move(credentials), std::move(service_accounts), std::move(options));
}

// Builds the credentials for the GCS client: picks the base credentials, then
// applies the configured service account impersonation (if any) on top.
std::shared_ptr<google::cloud::Credentials> GCS::make_credentials(
    const google::cloud::Options& options) const {
  // A configured endpoint, or the CLOUD_STORAGE_EMULATOR_ENDPOINT environment
  // variable, selects MakeInsecureCredentials; otherwise the Google default
  // credentials are created with the given options. The environment is only
  // consulted when no endpoint is configured.
  const bool use_insecure_credentials =
      !endpoint_.empty() ||
      getenv("CLOUD_STORAGE_EMULATOR_ENDPOINT") != nullptr;

  shared_ptr<google::cloud::Credentials> base_credentials =
      use_insecure_credentials ?
          google::cloud::MakeInsecureCredentials() :
          google::cloud::MakeGoogleDefaultCredentials(options);

  return apply_impersonation(
      base_credentials, impersonate_service_account_, options);
}

Status GCS::init_client() const {
assert(state_ == State::INITIALIZED);

std::lock_guard<std::mutex> lck(client_init_mtx_);

// Client is a google::cloud::storage::StatusOr which compares (in)valid as
// bool
if (client_) {
return Status::Ok();
}

google::cloud::storage::ChannelOptions channel_options;
google::cloud::Options ca_options;
if (!ssl_cfg_.ca_file().empty()) {
channel_options.set_ssl_root_path(ssl_cfg_.ca_file());
ca_options.set<google::cloud::CARootsFilePathOption>(ssl_cfg_.ca_file());
}

if (!ssl_cfg_.ca_path().empty()) {
Expand All @@ -150,43 +216,27 @@ Status GCS::init_client() const {
}

// Note that the order here is *extremely important*
// We must call ::GoogleDefaultCredentials *with* a channel_options
// We must call make_credentials *with* a ca_options
// argument, or else the Curl handle pool will be default-initialized
// with no root dir (CURLOPT_CAINFO), defaulting to build host path.
// Later initializations of ClientOptions/Client with the channel_options
// Later initializations of ClientOptions/Client with the ca_options
// do not appear to sufficiently reset the internal option, leading to
// CA verification failures when using lib from systemA on systemB.
// Ideally we could use CreateDefaultClientOptions(channel_options)
// signature, but that function is header-only/unimplemented
// (as of GCS 1.15).

// Creates the client using the credentials file pointed to by the
// env variable GOOGLE_APPLICATION_CREDENTIALS
try {
shared_ptr<google::cloud::storage::oauth2::Credentials> creds = nullptr;
if (!endpoint_.empty() || getenv("CLOUD_STORAGE_EMULATOR_ENDPOINT")) {
creds = google::cloud::storage::oauth2::CreateAnonymousCredentials();
} else {
auto status_or_creds =
google::cloud::storage::oauth2::GoogleDefaultCredentials(
channel_options);
if (!status_or_creds) {
return LOG_STATUS(Status_GCSError(
"Failed to initialize GCS credentials: " +
status_or_creds.status().message()));
}
creds = *status_or_creds;
}
google::cloud::storage::ClientOptions client_options(
creds, channel_options);
auto client_options = ca_options;
client_options.set<google::cloud::UnifiedCredentialsOption>(
make_credentials(ca_options));
if (!endpoint_.empty()) {
client_options.set_endpoint(endpoint_);
client_options.set<google::cloud::storage::RestEndpointOption>(endpoint_);
}
client_ = tdb_unique_ptr<google::cloud::storage::Client>(tdb_new(
google::cloud::storage::Client,
client_options,
google::cloud::storage::LimitedTimeRetryPolicy(
std::chrono::milliseconds(request_timeout_ms_))));
client_options.set<google::cloud::storage::RetryPolicyOption>(
make_shared<google::cloud::storage::LimitedTimeRetryPolicy>(
HERE(), std::chrono::milliseconds(request_timeout_ms_)));
client_ = tdb_unique_ptr<google::cloud::storage::Client>(
tdb_new(google::cloud::storage::Client, client_options));
} catch (const std::exception& e) {
return LOG_STATUS(
Status_GCSError("Failed to initialize GCS: " + std::string(e.what())));
Expand Down
24 changes: 22 additions & 2 deletions tiledb/sm/filesystem/gcs.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,17 @@

using namespace tiledb::common;

namespace google::cloud::storage {
namespace google::cloud {
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_BEGIN
class Credentials;
class Options;
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_END
namespace storage {
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_BEGIN
class Client;
GOOGLE_CLOUD_CPP_INLINE_NAMESPACE_END
} // namespace google::cloud::storage
} // namespace storage
} // namespace google::cloud

namespace tiledb {

Expand Down Expand Up @@ -309,6 +315,17 @@ class GCS {
*/
Status flush_object(const URI& uri);

/**
* Creates a GCS credentials object.
*
* The result reflects the `vfs.gcs.impersonate_service_account` setting read
* by init(): when that setting is non-empty, the returned credentials are
* service account impersonation credentials.
*
* This method is public so that testing code can inspect the credentials;
* other external callers are not expected. GCS::init_client() also calls it
* when building the client.
*
* @param options Options to configure the credentials.
* @return shared pointer to credentials
*/
std::shared_ptr<google::cloud::Credentials> make_credentials(
const google::cloud::Options& options) const;

private:
/* ********************************* */
/* PRIVATE DATATYPES */
Expand Down Expand Up @@ -426,6 +443,9 @@ class GCS {
// The GCS project id.
std::string project_id_;

// A comma-separated list with the GCS service accounts to impersonate.
std::string impersonate_service_account_;

// The GCS REST client.
mutable tdb_unique_ptr<google::cloud::storage::Client> client_;

Expand Down

0 comments on commit 88f3d79

Please sign in to comment.