Do not retry indefinitely if service is gone #5789

Merged: 12 commits, May 30, 2024
17 changes: 15 additions & 2 deletions common/src/api/external/error.rs
@@ -85,6 +85,11 @@ pub enum Error {
/// ObjectNotFound instead.
#[error("Not found: {}", .message.display_internal())]
NotFound { message: MessagePair },

/// Access to the target resource is no longer available, and this condition
/// is likely to be permanent.
#[error("Gone")]
Gone,
}

/// Represents an error message which has an external component, along with
@@ -214,7 +219,8 @@ impl Error {
| Error::InternalError { .. }
| Error::TypeVersionMismatch { .. }
| Error::NotFound { .. }
-| Error::Conflict { .. } => false,
+| Error::Conflict { .. }
+| Error::Gone => false,
}
}

@@ -335,7 +341,8 @@ impl Error {
match self {
Error::ObjectNotFound { .. }
| Error::ObjectAlreadyExists { .. }
-| Error::Forbidden => self,
+| Error::Forbidden
+| Error::Gone => self,
Error::InvalidRequest { message } => Error::InvalidRequest {
message: message.with_internal_context(context),
},
@@ -513,6 +520,12 @@ impl From<Error> for HttpError {
internal_message,
}
}

Error::Gone => HttpError::for_client_error(
Some(String::from("Gone")),
http::StatusCode::GONE,
String::from("Gone"),
),
}
}
}
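
For illustration only (not part of this diff), a minimal sketch of what the new arm produces, assuming `HttpError` here is dropshot's error type with a public `status_code` field:

#[test]
fn gone_maps_to_http_410() {
    // Error::Gone should surface to clients as HTTP 410 Gone via the
    // From<Error> impl above.
    let http_err: HttpError = Error::Gone.into();
    assert_eq!(http_err.status_code, http::StatusCode::GONE);
}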
86 changes: 22 additions & 64 deletions common/src/lib.rs

@@ -26,6 +26,7 @@ pub mod backoff;
pub mod cmd;
pub mod disk;
pub mod ledger;
pub mod progenitor_operation_retry;
pub mod update;
pub mod vlan;
pub mod zpool_name;
@@ -79,83 +80,40 @@ impl slog::KV for FileKv {

pub const OMICRON_DPD_TAG: &str = "omicron";

-use futures::Future;
-use slog::warn;
+use crate::api::external::Error;
+use crate::progenitor_operation_retry::ProgenitorOperationRetry;
+use crate::progenitor_operation_retry::ProgenitorOperationRetryError;
+use std::future::Future;

/// Retry a progenitor client operation until a known result is returned.
///
-/// Saga execution relies on the outcome of an external call being known: since
-/// they are idempotent, reissue the external call until a known result comes
-/// back. Retry if a communication error is seen, or if another retryable error
-/// is seen.
-///
-/// Note that retrying is only valid if the call itself is idempotent.
+/// See [`ProgenitorOperationRetry`] for more information.
+// TODO mark this deprecated, `never_bail` is a bad idea
pub async fn retry_until_known_result<F, T, E, Fut>(
    log: &slog::Logger,
-    mut f: F,
+    f: F,
) -> Result<T, progenitor_client::Error<E>>
where
    F: FnMut() -> Fut,
    Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
    E: std::fmt::Debug,
{
-    backoff::retry_notify(
-        backoff::retry_policy_internal_service(),
-        move || {
-            let fut = f();
-            async move {
-                match fut.await {
-                    Err(progenitor_client::Error::CommunicationError(e)) => {
-                        warn!(
-                            log,
-                            "saw transient communication error {}, retrying...",
-                            e,
-                        );
-
-                        Err(backoff::BackoffError::transient(
-                            progenitor_client::Error::CommunicationError(e),
-                        ))
-                    }
-
-                    Err(progenitor_client::Error::ErrorResponse(
-                        response_value,
-                    )) => {
-                        match response_value.status() {
-                            // Retry on 503 or 429
-                            http::StatusCode::SERVICE_UNAVAILABLE
-                            | http::StatusCode::TOO_MANY_REQUESTS => {
-                                Err(backoff::BackoffError::transient(
-                                    progenitor_client::Error::ErrorResponse(
-                                        response_value,
-                                    ),
-                                ))
-                            }
-
-                            // Anything else is a permanent error
-                            _ => Err(backoff::BackoffError::Permanent(
-                                progenitor_client::Error::ErrorResponse(
-                                    response_value,
-                                ),
-                            )),
-                        }
-                    }
-
-                    Err(e) => {
-                        warn!(log, "saw permanent error {}, aborting", e,);
-
-                        Err(backoff::BackoffError::Permanent(e))
-                    }
-
-                    Ok(v) => Ok(v),
-                }
-            }
-        },
-        |error: progenitor_client::Error<_>, delay| {
-            warn!(
-                log,
-                "failed external call ({:?}), will retry in {:?}", error, delay,
-            );
-        },
-    )
-    .await
+    match ProgenitorOperationRetry::new(f, never_bail).run(log).await {
+        Ok(v) => Ok(v),
+
+        Err(e) => match e {
+            ProgenitorOperationRetryError::ProgenitorError(e) => Err(e),
+
+            ProgenitorOperationRetryError::Gone
+            | ProgenitorOperationRetryError::GoneCheckError(_) => {
+                // ProgenitorOperationRetry::new called with `never_bail` as the
+                // bail check should never return these variants!
+                unreachable!();
+            }
+        },
+    }
}

async fn never_bail() -> Result<bool, Error> {
Ok(false)
}
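
For contrast with `never_bail`, a caller that does want to stop retrying constructs the helper directly with a real bail check. A minimal hypothetical sketch: `Widget`, `fetch_widget`, and `backend_is_expunged` do not exist in this change and stand in for a progenitor call and a datastore lookup.

async fn fetch_widget_until_known_result(
    log: &slog::Logger,
) -> Result<Widget, Error> {
    let result = ProgenitorOperationRetry::new(
        // Idempotent progenitor operation to retry across transient errors.
        || fetch_widget(),
        // Returns Ok(true) once the backing service can never answer.
        || backend_is_expunged(),
    )
    .run(log)
    .await;

    match result {
        Ok(widget) => Ok(widget),

        // The target is gone: return the new Error::Gone instead of retrying
        // forever.
        Err(ProgenitorOperationRetryError::Gone) => Err(Error::Gone),

        // The bail check itself failed; surface that error.
        Err(ProgenitorOperationRetryError::GoneCheckError(e)) => Err(e),

        // A permanent client error from the operation.
        Err(ProgenitorOperationRetryError::ProgenitorError(e)) => {
            Err(Error::internal_error(&format!("operation failed: {e:?}")))
        }
    }
}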
158 changes: 158 additions & 0 deletions common/src/progenitor_operation_retry.rs
@@ -0,0 +1,158 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use futures::Future;
use slog::warn;
use slog::Logger;

use crate::api::external::Error;
use crate::backoff::retry_notify;
use crate::backoff::retry_policy_internal_service;
use crate::backoff::BackoffError;

#[derive(Debug)]
pub enum ProgenitorOperationRetryError<E> {
/// Nexus determined that the operation will never return a known result
/// because the remote server is gone.
Gone,

/// Attempting to check if the retry loop should be stopped failed
GoneCheckError(Error),

/// The retry loop progenitor operation saw a permanent client error
ProgenitorError(progenitor_client::Error<E>),
}

/// Retry a progenitor client operation until a known result is returned, or
/// until something tells us that we should stop trying.
///
/// Saga execution relies on the outcome of an external call being known: since
/// they are idempotent, reissue the external call until a known result comes
/// back. Retry if a communication error is seen, or if another retryable error
/// is seen.
///
/// During the retry loop, call the supplied `gone_check` function to see if the
/// retry loop should be aborted: in the cases where Nexus can _know_ that a
/// request will never complete, the retry loop must be aborted. Otherwise,
/// Nexus will indefinitely retry until some known result is returned.
///
/// Note that retrying is only valid if the `operation` itself is idempotent.
pub struct ProgenitorOperationRetry<
T,
E: std::fmt::Debug,
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
BF: FnMut() -> BFut,
BFut: Future<Output = Result<bool, Error>>,
> {
operation: F,

/// If Nexus knows that the supplied operation will never successfully
/// complete, then `gone_check` should return true.
gone_check: BF,
}

impl<T, E, F, Fut, BF, BFut> ProgenitorOperationRetry<T, E, F, Fut, BF, BFut>
where
E: std::fmt::Debug,
F: FnMut() -> Fut,
Fut: Future<Output = Result<T, progenitor_client::Error<E>>>,
BF: FnMut() -> BFut,
BFut: Future<Output = Result<bool, Error>>,
{
pub fn new(operation: F, gone_check: BF) -> Self {
Self { operation, gone_check }
}

pub async fn run(
mut self,
log: &Logger,
) -> Result<T, ProgenitorOperationRetryError<E>> {
retry_notify(
Contributor (review comment on `retry_notify`):

It looks like when you call run inside the create_region future inside ensure_region_in_dataset, there is a retry_notify call around the create_region future. This creates two backoff loops rather than one, and seems like a really easy mistake to make. Instead, what if we didn't actually call retry inside of run here, but relied on the outer retry loop? I think that would simplify this a lot as well, as now you wouldn't have to worry about the progenitor errors. The outer loop would call a future that only does the gone check once and returns an error, rather than looping.

Contributor Author:

In this case, the double backoff looping was intentional: the outer retry (in nexus/src/app/crucible.rs) is to poll until a desired state change, and the inner retry (in common/src/progenitor_operation_retry.rs) is to paper over transient network problems. I think I still like this separation, though I don't grok the simplification you're referring to yet. Going to hack around with this.

Contributor:

If this makes sense, and your hacking didn't yield anything, I'm fine leaving it as is for now.

retry_policy_internal_service(),
move || {
let gone_check = (self.gone_check)();
let f = (self.operation)();

async move {
match gone_check.await {
Ok(dest_is_gone) => {
if dest_is_gone {
return Err(BackoffError::Permanent(
ProgenitorOperationRetryError::Gone
));
}
}
Member (comment on lines +104 to +110):

style nit, take it or leave it: this could be a bit more concise as:

Suggested change:
- Ok(dest_is_gone) => {
-     if dest_is_gone {
-         return Err(BackoffError::Permanent(
-             ProgenitorOperationRetryError::Gone
-         ));
-     }
- }
+ Ok(true) => return Err(BackoffError::Permanent(
+     ProgenitorOperationRetryError::Gone
+ )),
+ Ok(false) => {},

Contributor Author:

In this case, I prefer the more explicit style, but thanks :)

Err(e) => {
return Err(BackoffError::Permanent(
ProgenitorOperationRetryError::GoneCheckError(e)
));
}
}

match f.await {
Err(progenitor_client::Error::CommunicationError(e)) => {
warn!(
log,
"saw transient communication error {}, retrying...",
e,
);

Err(BackoffError::transient(
ProgenitorOperationRetryError::ProgenitorError(
progenitor_client::Error::CommunicationError(e)
)
))
}

Err(progenitor_client::Error::ErrorResponse(
response_value,
)) => {
match response_value.status() {
// Retry on 503 or 429
http::StatusCode::SERVICE_UNAVAILABLE
| http::StatusCode::TOO_MANY_REQUESTS => {
Err(BackoffError::transient(
ProgenitorOperationRetryError::ProgenitorError(
progenitor_client::Error::ErrorResponse(
response_value
)
)
))
}

// Anything else is a permanent error
_ => Err(BackoffError::Permanent(
ProgenitorOperationRetryError::ProgenitorError(
progenitor_client::Error::ErrorResponse(
response_value
)
)
))
}
}

Err(e) => {
warn!(log, "saw permanent error {}, aborting", e,);

Err(BackoffError::Permanent(
ProgenitorOperationRetryError::ProgenitorError(e)
))
}

Ok(v) => Ok(v),
}
}
},
|error: ProgenitorOperationRetryError<E>, delay| {
warn!(
log,
"failed external call ({:?}), will retry in {:?}", error, delay,
);
},
)
.await
}
}
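
To make the two-loop structure from the review thread above concrete: the outer `retry_notify` loop polls until a desired state change, while the inner `ProgenitorOperationRetry` only papers over transient network failures. A rough sketch under assumptions: `get_region_state`, `region_backend_is_gone`, and the `ready` field are hypothetical stand-ins, and the real call sites live in nexus/src/app/crucible.rs.

async fn wait_for_region_ready(log: &Logger) -> Result<(), Error> {
    retry_notify(
        retry_policy_internal_service(),
        || async {
            // Inner loop: retry the progenitor call across transient network
            // errors, bailing out permanently if the backend is gone.
            let region = ProgenitorOperationRetry::new(
                || get_region_state(),
                || region_backend_is_gone(),
            )
            .run(log)
            .await
            .map_err(|e| {
                BackoffError::Permanent(Error::internal_error(&format!(
                    "failed to query region state: {e:?}"
                )))
            })?;

            // Outer loop: keep polling until the region reaches the state
            // this saga step is waiting for.
            if region.ready {
                Ok(())
            } else {
                Err(BackoffError::transient(Error::unavail(
                    "region not ready yet",
                )))
            }
        },
        |error: Error, delay| {
            warn!(log, "region not ready ({:?}), will retry in {:?}", error, delay);
        },
    )
    .await
}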
52 changes: 52 additions & 0 deletions nexus/db-queries/src/db/datastore/dataset.rs
@@ -15,6 +15,8 @@ use crate::db::error::public_error_from_diesel;
use crate::db::error::ErrorHandler;
use crate::db::identity::Asset;
use crate::db::model::Dataset;
use crate::db::model::PhysicalDisk;
use crate::db::model::PhysicalDiskPolicy;
use crate::db::model::Zpool;
use crate::db::pagination::paginated;
use crate::db::pagination::Paginator;
@@ -180,6 +182,56 @@ impl DataStore {

Ok(all_datasets)
}

pub async fn dataset_on_in_service_physical_disk(
&self,
// opctx: &OpContext,
dataset_id: Uuid,
) -> LookupResult<bool> {
//let conn = self.pool_connection_authorized(opctx).await?;
let conn = self.pool_connection_unauthorized().await?;

let dataset = {
use db::schema::dataset::dsl;

dsl::dataset
.filter(dsl::id.eq(dataset_id))
.select(Dataset::as_select())
.first_async::<Dataset>(&*conn)
.await
.map_err(|e| {
public_error_from_diesel(e, ErrorHandler::Server)
})?
};

let zpool = {
use db::schema::zpool::dsl;

dsl::zpool
.filter(dsl::id.eq(dataset.pool_id))
.select(Zpool::as_select())
.first_async::<Zpool>(&*conn)
.await
.map_err(|e| {
public_error_from_diesel(e, ErrorHandler::Server)
})?
};

let physical_disk = {
use db::schema::physical_disk::dsl;

dsl::physical_disk
.filter(dsl::id.eq(zpool.physical_disk_id))
.select(PhysicalDisk::as_select())
.first_async::<PhysicalDisk>(&*conn)
.await
.map_err(|e| {
public_error_from_diesel(e, ErrorHandler::Server)
})?
};

Ok(physical_disk.disk_policy == PhysicalDiskPolicy::InService)
}
}
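
Saga callers are expected to negate this lookup into the `gone_check` for the retry helper: once the dataset's physical disk is no longer in service, the operation can never succeed. A small hedged sketch of that wiring; the `datastore` and `dataset_id` bindings are assumed, and the actual call sites are in nexus/src/app/crucible.rs, not in this hunk.

let gone_check = || async {
    // LookupResult<bool> is Result<bool, Error>; the target is "gone" when
    // the dataset's disk is not in service.
    let in_service: bool = datastore
        .dataset_on_in_service_physical_disk(dataset_id)
        .await?;
    Ok::<bool, Error>(!in_service)
};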

#[cfg(test)]