[nexus] Mark VMMs on Expunged sleds as Failed (#6519)
hawkw authored Sep 9, 2024
1 parent 69af861 commit ec0d88d
Showing 4 changed files with 229 additions and 111 deletions.
23 changes: 10 additions & 13 deletions nexus/db-queries/src/db/datastore/instance.rs
@@ -36,9 +36,7 @@ use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use nexus_db_model::ApplySledFilterExt;
use nexus_db_model::Disk;
use nexus_types::deployment::SledFilter;
use omicron_common::api;
use omicron_common::api::external;
use omicron_common::api::external::http_pagination::PaginatedBy;
@@ -801,12 +799,12 @@ impl DataStore {
Ok(updated)
}

/// Lists all instances on in-service sleds with active Propolis VMM
/// processes, returning the instance along with the VMM on which it's
/// running, the sled on which the VMM is running, and the project that owns
/// the instance.
/// Lists all instances with active Propolis VMM processes, returning the
/// instance along with the VMM on which it's running, the sled on which the
/// VMM is running, and the project that owns the instance.
///
/// The query performed by this function is paginated by the sled's UUID.
/// The query performed by this function is paginated by the sled and
/// instance UUIDs, in that order.
pub async fn instance_and_vmm_list_by_sled_agent(
&self,
opctx: &OpContext,
@@ -821,9 +819,9 @@

// We're going to build the query in stages.
//
// First, select all active sleds, and join the `sled` table with the
// `vmm` table on the VMM's sled_id`, filtering out VMMs which are not
// actually incarnated on a sled.
// First, select all non-deleted sled records, and join the `sled` table
// with the `vmm` table on the VMM's `sled_id`, filtering out VMMs which
// are not actually incarnated on a sled.
let query = sled_dsl::sled
.inner_join(vmm_dsl::vmm.on(vmm_dsl::sled_id.eq(sled_dsl::id)));
// Next, paginate the results, ordering by the sled ID first, so that we
@@ -844,11 +842,10 @@
(sled_dsl::id, vmm_dsl::id),
pagparams,
);
// Filter out sleds that aren't in service, and VMM states in which the
// VMM isn't incarnated on a sled.

let query = query
// Filter out sled and VMM records which have been deleted.
.filter(sled_dsl::time_deleted.is_null())
.sled_filter(SledFilter::InService)
.filter(vmm_dsl::time_deleted.is_null())
// Ignore VMMs which are in states that are not known to exist on a
// sled. Since this query drives instance-watcher health checking,
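Two things about this file's changes are easy to miss in the flattened view: dropping the `SledFilter::InService` filter means the query now returns VMMs on expunged sleds as well (which is what lets the instance-watcher see and fail them), and the pagination key is the pair (sled ID, VMM ID), since a sled ID alone is not unique across rows once a sled hosts several VMMs. Below is a minimal sketch of that two-column keyset pagination; the `RowKey` type and `next_page` helper are illustrative stand-ins, not the Diesel-based pagination machinery the real query uses.

```rust
/// Stand-in for one row of the joined query, keyed by (sled ID, VMM ID).
/// Deriving `Ord` gives lexicographic ordering over the fields in
/// declaration order: sled ID first, then VMM ID within each sled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct RowKey {
    sled_id: u128, // stand-in for a sled UUID
    vmm_id: u128,  // stand-in for a VMM UUID
}

/// Returns the next page of up to `limit` keys strictly after `cursor`,
/// assuming `rows` is sorted by (sled_id, vmm_id), the same ordering the
/// paginated query imposes.
fn next_page(rows: &[RowKey], cursor: Option<RowKey>, limit: usize) -> Vec<RowKey> {
    rows.iter()
        .copied()
        // Keep only keys strictly greater than the cursor, comparing the
        // whole (sled_id, vmm_id) pair at once.
        .filter(|k| cursor.map_or(true, |c| *k > c))
        .take(limit)
        .collect()
}

fn main() {
    let rows = vec![
        RowKey { sled_id: 1, vmm_id: 10 },
        RowKey { sled_id: 1, vmm_id: 11 },
        RowKey { sled_id: 2, vmm_id: 7 },
    ];
    // Resume after (1, 11): only sled 2's VMM remains.
    let page = next_page(&rows, Some(RowKey { sled_id: 1, vmm_id: 11 }), 2);
    assert_eq!(page, vec![RowKey { sled_id: 2, vmm_id: 7 }]);
}
```

Comparing the whole pair rather than the sled ID alone is what keeps resumption exact: the page picks up strictly after the last (sled, VMM) seen, so rows are never skipped or repeated even when many VMMs share a sled.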
224 changes: 127 additions & 97 deletions nexus/src/app/background/tasks/instance_watcher.rs
@@ -15,6 +15,7 @@ use nexus_db_model::Vmm;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::pagination::Paginator;
use nexus_db_queries::db::DataStore;
use nexus_types::external_api::views::SledPolicy;
use nexus_types::identity::Asset;
use nexus_types::identity::Resource;
use omicron_common::api::external::Error;
@@ -87,6 +88,7 @@ impl InstanceWatcher {
client: SledAgentClient,
target: VirtualMachine,
vmm: Vmm,
sled: Sled,
) -> impl Future<Output = Check> + Send + 'static {
let datastore = self.datastore.clone();
let sagas = self.sagas.clone();
@@ -99,6 +101,7 @@
target.instance_id.to_string(),
);
meta.insert("vmm_id".to_string(), vmm_id.to_string());
meta.insert("sled_id".to_string(), sled.id().to_string());
opctx.child(meta)
};

@@ -107,96 +110,123 @@
opctx.log, "checking on VMM"; "propolis_id" => %vmm_id
);

let rsp = client
.vmm_get_state(&vmm_id)
.await
.map_err(SledAgentInstanceError);

let mut check = Check {
target,
outcome: Default::default(),
result: Ok(()),
update_saga_queued: false,
};
let state: SledVmmState = match rsp {
Ok(rsp) => rsp.into_inner().into(),
// Oh, this error indicates that the VMM should transition to
// `Failed`. Let's synthesize a `SledInstanceState` that does
// that.
Err(e) if e.vmm_gone() => {
slog::info!(
opctx.log,
"sled-agent error indicates that this instance's \
VMM has failed!";
"error" => %e,
);
check.outcome =
CheckOutcome::Failure(Failure::NoSuchInstance);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}

let state = if sled.policy() == SledPolicy::Expunged {
// If the sled has been expunged, any VMMs still on that sled
// should be marked as `Failed`.
slog::info!(
opctx.log,
"instance is assigned to a VMM on an Expunged sled; \
marking it as Failed";
);
check.outcome = CheckOutcome::Failure(Failure::SledExpunged);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}
Err(SledAgentInstanceError(ClientError::ErrorResponse(
rsp,
))) => {
let status = rsp.status();
if status.is_client_error() {
slog::warn!(opctx.log, "check failed due to client error";
} else {
// Otherwise, ask the sled-agent what it has to say for itself.
let rsp = client
.vmm_get_state(&vmm_id)
.await
.map_err(SledAgentInstanceError);
match rsp {
Ok(rsp) => {
let state: SledVmmState = rsp.into_inner().into();
check.outcome =
CheckOutcome::Success(state.vmm_state.state.into());
state
}
// Oh, this error indicates that the VMM should transition to
// `Failed`. Let's synthesize a `SledInstanceState` that does
// that.
Err(e) if e.vmm_gone() => {
slog::info!(
opctx.log,
"sled-agent error indicates that this instance's \
VMM has failed!";
"error" => %e,
);
check.outcome =
CheckOutcome::Failure(Failure::NoSuchInstance);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}
}
Err(SledAgentInstanceError(
ClientError::ErrorResponse(rsp),
)) => {
let status = rsp.status();
if status.is_client_error() {
slog::warn!(opctx.log, "check incomplete due to client error";
"status" => ?status, "error" => ?rsp.into_inner());
check.result =
Err(Incomplete::ClientHttpError(status.as_u16()));
} else {
slog::info!(opctx.log, "check failed due to server error";
} else {
slog::info!(opctx.log, "check incomplete due to server error";
"status" => ?status, "error" => ?rsp.into_inner());
}
}

check.outcome = CheckOutcome::Failure(
Failure::SledAgentResponse(status.as_u16()),
);
return check;
}
Err(SledAgentInstanceError(
ClientError::CommunicationError(e),
)) => {
// TODO(eliza): eventually, we may want to transition the
// instance to the `Failed` state if the sled-agent has been
// unreachable for a while. We may also want to take other
// corrective actions or alert an operator in this case.
//
// TODO(eliza): because we have the preported IP address
// of the instance's VMM from our databse query, we could
// also ask the VMM directly when the sled-agent is
// unreachable. We should start doing that here at some
// point.
slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e);
check.outcome =
CheckOutcome::Failure(Failure::SledAgentUnreachable);
return check;
}
Err(SledAgentInstanceError(e)) => {
slog::warn!(
opctx.log,
"error checking up on instance";
"error" => ?e,
"status" => ?e.status(),
);
check.result = Err(Incomplete::ClientError);
return check;
check.result = Err(Incomplete::SledAgentHttpError(
status.as_u16(),
));
return check;
}
Err(SledAgentInstanceError(
ClientError::CommunicationError(e),
)) => {
// TODO(eliza): eventually, we may want to transition the
// instance to the `Failed` state if the sled-agent has been
// unreachable for a while. We may also want to take other
// corrective actions or alert an operator in this case.
//
// TODO(eliza): because we have the purported IP address
// of the instance's VMM from our database query, we could
// also ask the VMM directly when the sled-agent is
// unreachable. We should start doing that here at some
// point.
slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e);
check.result = Err(Incomplete::SledAgentUnreachable);
return check;
}
Err(SledAgentInstanceError(e)) => {
slog::warn!(
opctx.log,
"error checking up on instance";
"error" => ?e,
"status" => ?e.status(),
);
check.result = Err(Incomplete::ClientError);
return check;
}
}
};

check.outcome = CheckOutcome::Success(state.vmm_state.state.into());
debug!(
opctx.log,
"updating instance state";
@@ -328,26 +358,20 @@ impl Check {
Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize,
)]
enum Failure {
/// The sled-agent for the sled on which the instance is running was
/// unreachable.
///
/// This may indicate a network partition between us and that sled, that
/// the sled-agent process has crashed, or that the sled is down.
SledAgentUnreachable,
/// The sled-agent responded with an unexpected HTTP error.
SledAgentResponse(u16),
/// The sled-agent indicated that it doesn't know about an instance ID that
/// we believe it *should* know about. This probably means the sled-agent,
/// and potentially the whole sled, has been restarted.
NoSuchInstance,
/// The instance was assigned to a sled that was expunged. Its VMM has been
/// marked as `Failed`, since the sled is no longer present.
SledExpunged,
}

impl Failure {
fn as_str(&self) -> Cow<'static, str> {
match self {
Self::SledAgentUnreachable => "unreachable".into(),
Self::SledAgentResponse(status) => status.to_string().into(),
Self::NoSuchInstance => "no_such_instance".into(),
Self::SledExpunged => "sled_expunged".into(),
}
}
}
@@ -356,11 +380,16 @@ impl Failure {
Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize,
)]
enum Incomplete {
/// The sled-agent responded with an HTTP client error, indicating that our
/// request was somehow malformed.
ClientHttpError(u16),
/// Something else went wrong while making an HTTP request.
ClientError,
/// The sled-agent for the sled on which the instance is running was
/// unreachable.
///
/// This may indicate a network partition between us and that sled, that
/// the sled-agent process has crashed, or that the sled is down.
SledAgentUnreachable,
/// The sled-agent responded with an unexpected HTTP error.
SledAgentHttpError(u16),
/// We attempted to update the instance state in the database, but no
/// instance with that UUID existed.
///
@@ -377,8 +406,9 @@ enum Incomplete {
impl Incomplete {
fn as_str(&self) -> Cow<'static, str> {
match self {
Self::ClientHttpError(status) => status.to_string().into(),
Self::ClientError => "client_error".into(),
Self::SledAgentHttpError(status) => status.to_string().into(),
Self::SledAgentUnreachable => "unreachable".into(),
Self::InstanceNotFound => "instance_not_found".into(),
Self::UpdateFailed => "update_failed".into(),
}
@@ -463,9 +493,8 @@ impl BackgroundTask for InstanceWatcher {
},
};


let target = VirtualMachine::new(self.id, &sled, &instance, &vmm, &project);
tasks.spawn(self.check_instance(opctx, client, target, vmm));
tasks.spawn(self.check_instance(opctx, client, target, vmm, sled));
}

// Now, wait for the check results to come back.
@@ -487,10 +516,11 @@
CheckOutcome::Failure(reason) => {
*check_failures.entry(reason.as_str().into_owned()).or_default() += 1;
}
CheckOutcome::Unknown => {}
}
if let Err(ref reason) = check.result {
*check_errors.entry(reason.as_str().into_owned()).or_default() += 1;
CheckOutcome::Unknown => {
if let Err(reason) = check.result {
*check_errors.entry(reason.as_str().into_owned()).or_default() += 1;
}
}
}
if check.update_saga_queued {
update_sagas_queued += 1;
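Note how the tallying changed: `check_errors` is now bumped only inside the `CheckOutcome::Unknown` arm, so a check is counted either as a definite failure or as an incomplete check, never both, and the two enums' `as_str` labels keep those tallies under separate, stable metric names. A minimal sketch of that label-based counting, pared down to the two `Failure` variants this file now has; the surrounding types are illustrative:

```rust
use std::collections::BTreeMap;

#[derive(Debug, Clone, Copy)]
enum Failure { NoSuchInstance, SledExpunged }

impl Failure {
    // Labels match the `as_str` impl in the diff above.
    fn as_str(&self) -> &'static str {
        match self {
            Self::NoSuchInstance => "no_such_instance",
            Self::SledExpunged => "sled_expunged",
        }
    }
}

fn main() {
    let outcomes =
        [Failure::SledExpunged, Failure::SledExpunged, Failure::NoSuchInstance];
    let mut check_failures: BTreeMap<String, usize> = BTreeMap::new();
    for f in outcomes {
        // Same pattern as the task's tallying loop: bump a per-label counter.
        *check_failures.entry(f.as_str().to_string()).or_default() += 1;
    }
    assert_eq!(check_failures["sled_expunged"], 2);
    assert_eq!(check_failures["no_such_instance"], 1);
}
```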
14 changes: 13 additions & 1 deletion nexus/src/app/sled.rs
@@ -102,7 +102,19 @@ impl super::Nexus {
let sled_lookup = self.sled_lookup(opctx, &sled_id)?;
let (authz_sled,) =
sled_lookup.lookup_for(authz::Action::Modify).await?;
self.db_datastore.sled_set_policy_to_expunged(opctx, &authz_sled).await
let prev_policy = self
.db_datastore
.sled_set_policy_to_expunged(opctx, &authz_sled)
.await?;

// The instance-watcher background task is responsible for marking any
// VMMs running on `Expunged` sleds as `Failed`, so that their instances
// can transition to `Failed` and be deleted or restarted. Let's go
// ahead and activate it now so that those instances don't need to wait
// for the next periodic activation before they can be cleaned up.
self.background_tasks.task_instance_watcher.activate();

Ok(prev_policy)
}

pub(crate) async fn sled_request_firewall_rules(
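The comment in the hunk above carries the design: expunging a sled only flips its policy, and the instance-watcher is what actually fails the stranded VMMs, so the handler activates that task immediately rather than letting affected instances wait out the rest of the watcher's period. Below is a sketch of that periodic-task-with-an-explicit-kick pattern using `tokio::sync::Notify`. This assumes the `tokio` crate with its full feature set; the real machinery behind `task_instance_watcher.activate()` differs in detail, and the 30-second period here is made up for illustration.

```rust
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Notify;

#[tokio::main]
async fn main() {
    let activator = Arc::new(Notify::new());
    let for_task = Arc::clone(&activator);

    tokio::spawn(async move {
        loop {
            // One pass of the background task would run here, e.g. scanning
            // for VMMs on expunged sleds and marking them Failed.
            println!("instance-watcher pass");
            // Wait for the periodic tick *or* an explicit activation,
            // whichever comes first.
            tokio::select! {
                _ = tokio::time::sleep(Duration::from_secs(30)) => {}
                _ = for_task.notified() => {}
            }
        }
    });

    // After expunging a sled, poke the task so affected instances don't
    // wait out the rest of the period. `Notify` stores the permit, so this
    // works even if the task isn't parked in `notified()` yet.
    activator.notify_one();

    // Give the spawned task a moment to run in this toy example.
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```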