[nexus] Mark VMMs on Expunged sleds as Failed #6519

Merged · 6 commits · Sep 9, 2024
Changes from all commits
23 changes: 10 additions & 13 deletions nexus/db-queries/src/db/datastore/instance.rs
@@ -36,9 +36,7 @@ use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use nexus_db_model::ApplySledFilterExt;
use nexus_db_model::Disk;
use nexus_types::deployment::SledFilter;
use omicron_common::api;
use omicron_common::api::external;
use omicron_common::api::external::http_pagination::PaginatedBy;
@@ -801,12 +799,12 @@ impl DataStore {
Ok(updated)
}

/// Lists all instances on in-service sleds with active Propolis VMM
/// processes, returning the instance along with the VMM on which it's
/// running, the sled on which the VMM is running, and the project that owns
/// the instance.
/// Lists all instances with active Propolis VMM processes, returning the
/// instance along with the VMM on which it's running, the sled on which the
/// VMM is running, and the project that owns the instance.
///
/// The query performed by this function is paginated by the sled's UUID.
/// The query performed by this function is paginated by the sled and
/// instance UUIDs, in that order.
pub async fn instance_and_vmm_list_by_sled_agent(
&self,
opctx: &OpContext,
@@ -821,9 +819,9 @@

// We're going to build the query in stages.
//
// First, select all active sleds, and join the `sled` table with the
// `vmm` table on the VMM's sled_id`, filtering out VMMs which are not
// actually incarnated on a sled.
// First, select all non-deleted sled records, and join the `sled` table
// with the `vmm` table on the VMM's `sled_id`, filtering out VMMs which
// are not actually incarnated on a sled.
let query = sled_dsl::sled
.inner_join(vmm_dsl::vmm.on(vmm_dsl::sled_id.eq(sled_dsl::id)));
// Next, paginate the results, ordering by the sled ID first, so that we
@@ -844,11 +842,10 @@
(sled_dsl::id, vmm_dsl::id),
pagparams,
);
// Filter out sleds that aren't in service, and VMM states in which the
// VMM isn't incarnated on a sled.

let query = query
// Filter out sled and VMM records which have been deleted.
.filter(sled_dsl::time_deleted.is_null())
.sled_filter(SledFilter::InService)
.filter(vmm_dsl::time_deleted.is_null())
// Ignore VMMs which are in states that are not known to exist on a
// sled. Since this query drives instance-watcher health checking,
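The substantive change in this file is the dropped `SledFilter::InService` filter: the listing now excludes only deleted sled and VMM records, so VMMs stranded on expunged sleds stay visible to the `instance_watcher` task this query drives. A minimal plain-Rust sketch of the before/after predicates; the `SledRecord`/`VmmRecord` types and `listed_*` helpers are hypothetical stand-ins for the Diesel query, not code from this PR:

```rust
#[derive(Clone, Copy, PartialEq, Eq)]
enum SledPolicy {
    InService,
    Expunged,
}

/// Hypothetical stand-in for a joined `sled` row.
struct SledRecord {
    policy: SledPolicy,
    time_deleted: Option<u64>, // stand-in for a real deletion timestamp
}

/// Hypothetical stand-in for a joined `vmm` row.
struct VmmRecord {
    time_deleted: Option<u64>,
}

/// Old behavior: only VMMs on in-service, non-deleted sleds were listed.
fn listed_before(sled: &SledRecord, vmm: &VmmRecord) -> bool {
    sled.time_deleted.is_none()
        && sled.policy == SledPolicy::InService
        && vmm.time_deleted.is_none()
}

/// New behavior: any non-deleted sled qualifies, so a VMM stranded on an
/// expunged sled is still returned to the instance-watcher.
fn listed_after(sled: &SledRecord, vmm: &VmmRecord) -> bool {
    sled.time_deleted.is_none() && vmm.time_deleted.is_none()
}

fn main() {
    let expunged = SledRecord { policy: SledPolicy::Expunged, time_deleted: None };
    let vmm = VmmRecord { time_deleted: None };
    // Previously invisible to health checks; now visible, so it can be failed.
    assert!(!listed_before(&expunged, &vmm));
    assert!(listed_after(&expunged, &vmm));
}
```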
224 changes: 127 additions & 97 deletions nexus/src/app/background/tasks/instance_watcher.rs
@@ -15,6 +15,7 @@ use nexus_db_model::Vmm;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::pagination::Paginator;
use nexus_db_queries::db::DataStore;
use nexus_types::external_api::views::SledPolicy;
use nexus_types::identity::Asset;
use nexus_types::identity::Resource;
use omicron_common::api::external::Error;
@@ -87,6 +88,7 @@ impl InstanceWatcher {
client: SledAgentClient,
target: VirtualMachine,
vmm: Vmm,
sled: Sled,
) -> impl Future<Output = Check> + Send + 'static {
let datastore = self.datastore.clone();
let sagas = self.sagas.clone();
@@ -99,6 +101,7 @@
target.instance_id.to_string(),
);
meta.insert("vmm_id".to_string(), vmm_id.to_string());
meta.insert("sled_id".to_string(), sled.id().to_string());
opctx.child(meta)
};

@@ -107,96 +110,123 @@
opctx.log, "checking on VMM"; "propolis_id" => %vmm_id
);

let rsp = client
.vmm_get_state(&vmm_id)
.await
.map_err(SledAgentInstanceError);

let mut check = Check {
target,
outcome: Default::default(),
result: Ok(()),
update_saga_queued: false,
};
let state: SledVmmState = match rsp {
Ok(rsp) => rsp.into_inner().into(),
// Oh, this error indicates that the VMM should transition to
// `Failed`. Let's synthesize a `SledInstanceState` that does
// that.
Err(e) if e.vmm_gone() => {
slog::info!(
opctx.log,
"sled-agent error indicates that this instance's \
VMM has failed!";
"error" => %e,
);
check.outcome =
CheckOutcome::Failure(Failure::NoSuchInstance);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}

let state = if sled.policy() == SledPolicy::Expunged {
// If the sled has been expunged, any VMMs still on that sled
// should be marked as `Failed`.
slog::info!(
opctx.log,
"instance is assigned to a VMM on an Expunged sled; \
marking it as Failed";
);
check.outcome = CheckOutcome::Failure(Failure::SledExpunged);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}
Err(SledAgentInstanceError(ClientError::ErrorResponse(
rsp,
))) => {
let status = rsp.status();
if status.is_client_error() {
slog::warn!(opctx.log, "check failed due to client error";
} else {
// Otherwise, ask the sled-agent what it has to say for itself.
let rsp = client
.vmm_get_state(&vmm_id)
.await
.map_err(SledAgentInstanceError);
match rsp {
Ok(rsp) => {
let state: SledVmmState = rsp.into_inner().into();
check.outcome =
CheckOutcome::Success(state.vmm_state.state.into());
state
}
// Oh, this error indicates that the VMM should transition to
// `Failed`. Let's synthesize a `SledInstanceState` that does
// that.
Err(e) if e.vmm_gone() => {
slog::info!(
opctx.log,
"sled-agent error indicates that this instance's \
VMM has failed!";
"error" => %e,
);
check.outcome =
CheckOutcome::Failure(Failure::NoSuchInstance);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}
}
Err(SledAgentInstanceError(
ClientError::ErrorResponse(rsp),
)) => {
let status = rsp.status();
if status.is_client_error() {
slog::warn!(opctx.log, "check incomplete due to client error";
"status" => ?status, "error" => ?rsp.into_inner());
check.result =
Err(Incomplete::ClientHttpError(status.as_u16()));
} else {
slog::info!(opctx.log, "check failed due to server error";
} else {
slog::info!(opctx.log, "check incomplete due to server error";
"status" => ?status, "error" => ?rsp.into_inner());
}
}

check.outcome = CheckOutcome::Failure(
Failure::SledAgentResponse(status.as_u16()),
);
return check;
}
Err(SledAgentInstanceError(
ClientError::CommunicationError(e),
)) => {
// TODO(eliza): eventually, we may want to transition the
// instance to the `Failed` state if the sled-agent has been
// unreachable for a while. We may also want to take other
// corrective actions or alert an operator in this case.
//
// TODO(eliza): because we have the preported IP address
// of the instance's VMM from our databse query, we could
// also ask the VMM directly when the sled-agent is
// unreachable. We should start doing that here at some
// point.
slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e);
check.outcome =
CheckOutcome::Failure(Failure::SledAgentUnreachable);
return check;
}
Err(SledAgentInstanceError(e)) => {
slog::warn!(
opctx.log,
"error checking up on instance";
"error" => ?e,
"status" => ?e.status(),
);
check.result = Err(Incomplete::ClientError);
return check;
check.result = Err(Incomplete::SledAgentHttpError(
status.as_u16(),
));
return check;
}
Err(SledAgentInstanceError(
ClientError::CommunicationError(e),
)) => {
// TODO(eliza): eventually, we may want to transition the
// instance to the `Failed` state if the sled-agent has been
// unreachable for a while. We may also want to take other
// corrective actions or alert an operator in this case.
//
// TODO(eliza): because we have the purported IP address
// of the instance's VMM from our database query, we could
// also ask the VMM directly when the sled-agent is
// unreachable. We should start doing that here at some
// point.
slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e);
check.result = Err(Incomplete::SledAgentUnreachable);
return check;
}
Err(SledAgentInstanceError(e)) => {
slog::warn!(
opctx.log,
"error checking up on instance";
"error" => ?e,
"status" => ?e.status(),
);
check.result = Err(Incomplete::ClientError);
return check;
}
}
};

check.outcome = CheckOutcome::Success(state.vmm_state.state.into());
debug!(
opctx.log,
"updating instance state";
Expand Down Expand Up @@ -328,26 +358,20 @@ impl Check {
Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize,
)]
enum Failure {
/// The sled-agent for the sled on which the instance is running was
/// unreachable.
///
/// This may indicate a network partition between us and that sled, that
/// the sled-agent process has crashed, or that the sled is down.
SledAgentUnreachable,
/// The sled-agent responded with an unexpected HTTP error.
SledAgentResponse(u16),
/// The sled-agent indicated that it doesn't know about an instance ID that
/// we believe it *should* know about. This probably means the sled-agent,
/// and potentially the whole sled, has been restarted.
NoSuchInstance,
/// The instance was assigned to a sled that was expunged. Its VMM has been
/// marked as `Failed`, since the sled is no longer present.
SledExpunged,
}

impl Failure {
fn as_str(&self) -> Cow<'static, str> {
match self {
Self::SledAgentUnreachable => "unreachable".into(),
Self::SledAgentResponse(status) => status.to_string().into(),
Self::NoSuchInstance => "no_such_instance".into(),
Self::SledExpunged => "sled_expunged".into(),
}
}
}
@@ -356,11 +380,16 @@
Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize,
)]
enum Incomplete {
/// The sled-agent responded with an HTTP client error, indicating that our
/// request as somehow malformed.
ClientHttpError(u16),
/// Something else went wrong while making an HTTP request.
ClientError,
/// The sled-agent for the sled on which the instance is running was
/// unreachable.
///
/// This may indicate a network partition between us and that sled, that
/// the sled-agent process has crashed, or that the sled is down.
SledAgentUnreachable,
/// The sled-agent responded with an unexpected HTTP error.
SledAgentHttpError(u16),
/// We attempted to update the instance state in the database, but no
/// instance with that UUID existed.
///
Expand All @@ -377,8 +406,9 @@ enum Incomplete {
impl Incomplete {
fn as_str(&self) -> Cow<'static, str> {
match self {
Self::ClientHttpError(status) => status.to_string().into(),
Self::ClientError => "client_error".into(),
Self::SledAgentHttpError(status) => status.to_string().into(),
Self::SledAgentUnreachable => "unreachable".into(),
Self::InstanceNotFound => "instance_not_found".into(),
Self::UpdateFailed => "update_failed".into(),
}
Expand Down Expand Up @@ -463,9 +493,8 @@ impl BackgroundTask for InstanceWatcher {
},
};


let target = VirtualMachine::new(self.id, &sled, &instance, &vmm, &project);
tasks.spawn(self.check_instance(opctx, client, target, vmm));
tasks.spawn(self.check_instance(opctx, client, target, vmm, sled));
}

// Now, wait for the check results to come back.
@@ -487,10 +516,11 @@
CheckOutcome::Failure(reason) => {
*check_failures.entry(reason.as_str().into_owned()).or_default() += 1;
}
CheckOutcome::Unknown => {}
}
if let Err(ref reason) = check.result {
*check_errors.entry(reason.as_str().into_owned()).or_default() += 1;
CheckOutcome::Unknown => {
if let Err(reason) = check.result {
*check_errors.entry(reason.as_str().into_owned()).or_default() += 1;
}
}
}
if check.update_saga_queued {
update_sagas_queued += 1;
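The net effect of the restructuring above: `check_instance` now branches on sled policy before contacting the sled-agent at all, and transport or HTTP errors are reclassified from definitive check failures (`Failure`) to inconclusive checks (`Incomplete`). A simplified, self-contained sketch of that control flow; the enum shapes mirror the diff, but `AgentResponse`, `CheckResult`, and `check_vmm` are hypothetical stand-ins for the sled-agent client and the real method:

```rust
#[derive(Clone, Copy, PartialEq, Eq)]
enum SledPolicy {
    InService,
    Expunged,
}

#[derive(Debug, PartialEq)]
enum Failure {
    NoSuchInstance,
    SledExpunged,
}

#[derive(Debug, PartialEq)]
enum Incomplete {
    SledAgentUnreachable,
    SledAgentHttpError(u16),
}

/// Hypothetical stand-in for the possible `vmm_get_state` outcomes.
enum AgentResponse {
    State(&'static str),
    VmmGone,
    HttpError(u16),
    Unreachable,
}

#[derive(Debug, PartialEq)]
enum CheckResult {
    /// The check completed and reported the VMM's state.
    Healthy(&'static str),
    /// The check completed and determined the VMM has failed.
    Failed(Failure),
    /// The check could not complete; no conclusion about the VMM.
    Incomplete(Incomplete),
}

/// Sketch of the new decision order: sled policy first, sled-agent second.
fn check_vmm(
    policy: SledPolicy,
    response: impl FnOnce() -> AgentResponse,
) -> CheckResult {
    // New: an expunged sled means the VMM is gone; don't even call the agent.
    if policy == SledPolicy::Expunged {
        return CheckResult::Failed(Failure::SledExpunged);
    }
    match response() {
        AgentResponse::State(s) => CheckResult::Healthy(s),
        // The agent no longer knows the VMM: a definitive failure.
        AgentResponse::VmmGone => CheckResult::Failed(Failure::NoSuchInstance),
        // Transport and HTTP problems leave the check inconclusive, where
        // they were previously counted as `Failure` variants.
        AgentResponse::HttpError(code) => {
            CheckResult::Incomplete(Incomplete::SledAgentHttpError(code))
        }
        AgentResponse::Unreachable => {
            CheckResult::Incomplete(Incomplete::SledAgentUnreachable)
        }
    }
}

fn main() {
    // An expunged sled short-circuits: the agent closure is never invoked.
    let res = check_vmm(SledPolicy::Expunged, || unreachable!());
    assert_eq!(res, CheckResult::Failed(Failure::SledExpunged));

    let res = check_vmm(SledPolicy::InService, || AgentResponse::Unreachable);
    assert_eq!(res, CheckResult::Incomplete(Incomplete::SledAgentUnreachable));

    let res = check_vmm(SledPolicy::InService, || AgentResponse::HttpError(500));
    assert_eq!(res, CheckResult::Incomplete(Incomplete::SledAgentHttpError(500)));

    let res = check_vmm(SledPolicy::InService, || AgentResponse::State("running"));
    assert_eq!(res, CheckResult::Healthy("running"));
}
```

The short-circuit matters for correctness as well as latency: an expunged sled's agent may be gone entirely, so asking it first would leave every such check merely `Incomplete` instead of producing the definitive `Failed` state the diff's comments say these instances need.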
14 changes: 13 additions & 1 deletion nexus/src/app/sled.rs
@@ -102,7 +102,19 @@ impl super::Nexus {
let sled_lookup = self.sled_lookup(opctx, &sled_id)?;
let (authz_sled,) =
sled_lookup.lookup_for(authz::Action::Modify).await?;
self.db_datastore.sled_set_policy_to_expunged(opctx, &authz_sled).await
let prev_policy = self
.db_datastore
.sled_set_policy_to_expunged(opctx, &authz_sled)
.await?;

// The instance-watcher background task is responsible for marking any
// VMMs running on `Expunged` sleds as `Failed`, so that their instances
// can transition to `Failed` and be deleted or restarted. Let's go
// ahead and activate it now so that those instances don't need to wait
// for the next periodic activation before they can be cleaned up.
self.background_tasks.task_instance_watcher.activate();

Ok(prev_policy)
}

pub(crate) async fn sled_request_firewall_rules(
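This hunk closes the loop: after a sled's policy flips to `Expunged`, Nexus immediately activates the instance-watcher rather than letting affected instances wait out the task's periodic interval. A sketch of that hand-off; the condition-variable `Activator` below is illustrative only and is not the PR's `background_tasks` API, which wakes an async task:

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;

/// Minimal stand-in for a background task's activation handle.
struct Activator {
    pending: Mutex<bool>,
    cv: Condvar,
}

impl Activator {
    fn new() -> Arc<Self> {
        Arc::new(Self { pending: Mutex::new(false), cv: Condvar::new() })
    }

    /// Request an immediate run, as `sled_expunge` now does for the
    /// instance-watcher after flipping the sled's policy.
    fn activate(&self) {
        *self.pending.lock().unwrap() = true;
        self.cv.notify_one();
    }

    /// Wait for either an explicit activation or the periodic interval.
    fn wait(&self, period: Duration) {
        let guard = self.pending.lock().unwrap();
        let (mut guard, _timed_out) = self
            .cv
            .wait_timeout_while(guard, period, |pending| !*pending)
            .unwrap();
        *guard = false;
    }
}

fn main() {
    let activator = Activator::new();
    let watcher = Arc::clone(&activator);

    let handle = std::thread::spawn(move || {
        // Stand-in for one instance-watcher pass: wakes early on activate().
        watcher.wait(Duration::from_secs(300));
        println!("watcher pass: would mark VMMs on expunged sleds as Failed");
    });

    // Expunge path: flip the sled policy, then poke the watcher so stranded
    // instances don't wait out the full periodic interval.
    activator.activate();
    handle.join().unwrap();
}
```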