From 4d72f1dc6ac2c2881dc4bdda4070736f887e54f3 Mon Sep 17 00:00:00 2001
From: "Sean P. Kelly"
Date: Mon, 19 Aug 2024 18:24:28 +0000
Subject: [PATCH] pubsys: retry SSM validation on any failure

SSM parameter validation is retried up to 3 times with jittered
exponential backoff. tokio-retry's ExponentialBackoff::from_millis(base)
yields base^n milliseconds, so a base of 2 combined with a factor of
1000 is used to obtain delays of 2s, 4s and 8s before jitter.
---
Note: hunk offsets and the diffstat below account for the backoff fix; the
post-image blob id on the ssm.rs index line was not regenerated.

 Cargo.lock                      | 12 +++++++++
 Cargo.toml                      |  1 +
 tools/pubsys/Cargo.toml         |  1 +
 tools/pubsys/src/aws/ssm/ssm.rs | 45 +++++++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+)

diff --git a/Cargo.lock b/Cargo.lock
index 4a405a185..db56152da 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2486,6 +2486,7 @@ dependencies = [
  "tempfile",
  "tinytemplate",
  "tokio",
+ "tokio-retry",
  "tokio-stream",
  "toml",
  "tough",
@@ -3497,6 +3498,17 @@ dependencies = [
  "syn 2.0.72",
 ]
 
+[[package]]
+name = "tokio-retry"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f57eb36ecbe0fc510036adff84824dd3c24bb781e21bfa67b69d556aa85214f"
+dependencies = [
+ "pin-project",
+ "rand",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-rustls"
 version = "0.24.1"
diff --git a/Cargo.toml b/Cargo.toml
index bb01407f1..ab57470ea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -112,6 +112,7 @@ term_size = "0.3"
 tinytemplate = "1"
 tokio = "1"
 tokio-stream = "0.1"
+tokio-retry = "0.3"
 toml = "0.8"
 tough = "0.17"
 tough-kms = "0.9"
diff --git a/tools/pubsys/Cargo.toml b/tools/pubsys/Cargo.toml
index 705b638b7..0f265526e 100644
--- a/tools/pubsys/Cargo.toml
+++ b/tools/pubsys/Cargo.toml
@@ -43,6 +43,7 @@ tabled.workspace = true
 tempfile.workspace = true
 tinytemplate.workspace = true
 tokio = { workspace = true, features = ["full"] }
+tokio-retry.workspace = true
 tokio-stream = { workspace = true, features = ["time"] }
 toml.workspace = true
 tough = { workspace = true, features = ["http"] }
diff --git a/tools/pubsys/src/aws/ssm/ssm.rs b/tools/pubsys/src/aws/ssm/ssm.rs
index f064766d4..a1bd5c4d8 100644
--- a/tools/pubsys/src/aws/ssm/ssm.rs
+++ b/tools/pubsys/src/aws/ssm/ssm.rs
@@ -22,6 +22,19 @@ use nonzero_ext::nonzero;
 use snafu::{ensure, OptionExt, ResultExt};
 use std::collections::{HashMap, HashSet};
 use std::time::Duration;
+use tokio_retry::{
+    strategy::{jitter, ExponentialBackoff},
+    RetryIf,
+};
+
+// SSM validation may retry if it fails for any reason.
+// These parameters control the exponential backoff and number of retries.
+//
+// `ExponentialBackoff::from_millis(base)` yields `base^n` milliseconds for the n-th retry, so the
+// base is kept small and the factor scales the delays to 2s, 4s and 8s (before jitter).
+const SSM_VALIDATION_RETRY_EXP_BASE_MILLIS: u64 = 2;
+const SSM_VALIDATION_RETRY_FACTOR: u64 = 1_000;
+const SSM_VALIDATION_NUM_RETRIES: usize = 3;
 
 // Configures the rate limit used for SSM parameter fetching.
 // SSM service quotas are provided on https://docs.aws.amazon.com/general/latest/gr/ssm.html
@@ -430,9 +443,41 @@ pub(crate) async fn set_parameters(
 }
 
 /// Fetch the given parameters, and ensure the live values match the given values
+///
+/// Retries validation up to 3 times on any failure, using exponential backoff.
 pub(crate) async fn validate_parameters(
     expected_parameters: &SsmParameters,
     ssm_clients: &HashMap,
+) -> Result<()> {
+    // Delays are base^n * factor milliseconds, i.e. 2s, 4s and 8s before jitter is applied.
+    let retry_strategy = ExponentialBackoff::from_millis(SSM_VALIDATION_RETRY_EXP_BASE_MILLIS)
+        .factor(SSM_VALIDATION_RETRY_FACTOR)
+        .map(jitter)
+        .enumerate()
+        .map(|(attempt, d)| {
+            if attempt > 0 {
+                error!("Retrying: attempt = {}", attempt + 1,);
+            }
+            d
+        })
+        .take(SSM_VALIDATION_NUM_RETRIES);
+
+    RetryIf::spawn(
+        retry_strategy,
+        || async { validate_parameters_inner(expected_parameters, ssm_clients).await },
+        // Treat every error as retryable, logging it first.
+        |e: &'_ Error| {
+            error!("Failed to validate SSM parameters: {}", e);
+            true
+        },
+    )
+    .await
+}
+
+/// Fetch the given parameters, and ensure the live values match the given values
+async fn validate_parameters_inner(
+    expected_parameters: &SsmParameters,
+    ssm_clients: &HashMap,
 ) -> Result<()> {
     // Fetch the given parameter names
     let expected_parameter_names: Vec<&SsmKey> = expected_parameters.keys().collect();