diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs
index 50f174e497..c452fee173 100644
--- a/nexus/db-queries/src/db/datastore/instance.rs
+++ b/nexus/db-queries/src/db/datastore/instance.rs
@@ -36,9 +36,7 @@ use crate::db::update_and_check::UpdateStatus;
 use async_bb8_diesel::AsyncRunQueryDsl;
 use chrono::Utc;
 use diesel::prelude::*;
-use nexus_db_model::ApplySledFilterExt;
 use nexus_db_model::Disk;
-use nexus_types::deployment::SledFilter;
 use omicron_common::api;
 use omicron_common::api::external;
 use omicron_common::api::external::http_pagination::PaginatedBy;
@@ -801,12 +799,12 @@ impl DataStore {
         Ok(updated)
     }
 
-    /// Lists all instances on in-service sleds with active Propolis VMM
-    /// processes, returning the instance along with the VMM on which it's
-    /// running, the sled on which the VMM is running, and the project that owns
-    /// the instance.
+    /// Lists all instances with active Propolis VMM processes, returning the
+    /// instance along with the VMM on which it's running, the sled on which the
+    /// VMM is running, and the project that owns the instance.
     ///
-    /// The query performed by this function is paginated by the sled's UUID.
+    /// The query performed by this function is paginated by the sled and
+    /// instance UUIDs, in that order.
     pub async fn instance_and_vmm_list_by_sled_agent(
         &self,
         opctx: &OpContext,
@@ -821,9 +819,9 @@ impl DataStore {
 
         // We're going to build the query in stages.
         //
-        // First, select all active sleds, and join the `sled` table with the
-        // `vmm` table on the VMM's sled_id`, filtering out VMMs which are not
-        // actually incarnated on a sled.
+        // First, select all non-deleted sled records, and join the `sled` table
+        // with the `vmm` table on the VMM's `sled_id`, filtering out VMMs which
+        // are not actually incarnated on a sled.
         let query = sled_dsl::sled
             .inner_join(vmm_dsl::vmm.on(vmm_dsl::sled_id.eq(sled_dsl::id)));
         // Next, paginate the results, ordering by the sled ID first, so that we
@@ -844,11 +842,10 @@
             (sled_dsl::id, vmm_dsl::id),
             pagparams,
         );
-        // Filter out sleds that aren't in service, and VMM states in which the
-        // VMM isn't incarnated on a sled.
+        let query = query
+            // Filter out sled and VMM records which have been deleted.
             .filter(sled_dsl::time_deleted.is_null())
-            .sled_filter(SledFilter::InService)
             .filter(vmm_dsl::time_deleted.is_null())
             // Ignore VMMs which are in states that are not known to exist on a
             // sled.
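As an aside on the new pagination scheme: the query pages by the `(sled_id, vmm_id)` pair, resuming each page strictly after the last pair returned. The sketch below illustrates that keyset-pagination shape in isolation; `VmmRow`, the in-memory `Vec`, and the `uuid` dependency are hypothetical stand-ins for the real diesel query, not the datastore's implementation.

```rust
// Hypothetical sketch of (sled_id, vmm_id) keyset pagination over in-memory
// rows; the real query does this in SQL via `paginated_multicolumn`.
// Assumes the `uuid` crate (with the "v4" feature) is available.
use uuid::Uuid;

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct VmmRow {
    sled_id: Uuid,
    vmm_id: Uuid,
}

/// Returns the next page of at most `limit` rows strictly after `marker`.
fn next_page(
    rows: &[VmmRow], // assumed sorted by (sled_id, vmm_id)
    marker: Option<(Uuid, Uuid)>,
    limit: usize,
) -> Vec<VmmRow> {
    rows.iter()
        .filter(|row| match marker {
            Some(m) => (row.sled_id, row.vmm_id) > m,
            None => true,
        })
        .take(limit)
        .copied()
        .collect()
}

fn main() {
    let (sled_a, sled_b) = (Uuid::new_v4(), Uuid::new_v4());
    let mut rows: Vec<VmmRow> = (0..5)
        .map(|i| VmmRow {
            sled_id: if i < 3 { sled_a } else { sled_b },
            vmm_id: Uuid::new_v4(),
        })
        .collect();
    rows.sort();

    // Walk every VMM on every sled, two rows at a time, in a stable order.
    let mut marker = None;
    loop {
        let page = next_page(&rows, marker, 2);
        let Some(last) = page.last() else { break };
        println!("page of {} row(s): {page:?}", page.len());
        marker = Some((last.sled_id, last.vmm_id));
    }
}
```

Ordering by both columns keeps iteration stable even when one sled hosts more VMMs than fit in a single batch, which matters now that expunged sleds (and any VMMs stranded on them) are included in the results.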
Since this query drives instance-watcher health checking, dropping the `InService` sled filter means the watcher now sees VMMs on expunged sleds as well; the watcher itself is updated below to mark those VMMs as `Failed` instead of trying to contact a sled-agent that no longer exists.

diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs
index 09ddac192e..c01997ffba 100644
--- a/nexus/src/app/background/tasks/instance_watcher.rs
+++ b/nexus/src/app/background/tasks/instance_watcher.rs
@@ -15,6 +15,7 @@ use nexus_db_model::Vmm;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db::pagination::Paginator;
 use nexus_db_queries::db::DataStore;
+use nexus_types::external_api::views::SledPolicy;
 use nexus_types::identity::Asset;
 use nexus_types::identity::Resource;
 use omicron_common::api::external::Error;
@@ -87,6 +88,7 @@ impl InstanceWatcher {
         client: SledAgentClient,
         target: VirtualMachine,
         vmm: Vmm,
+        sled: Sled,
     ) -> impl Future + Send + 'static {
         let datastore = self.datastore.clone();
         let sagas = self.sagas.clone();
@@ -99,6 +101,7 @@ impl InstanceWatcher {
                 target.instance_id.to_string(),
             );
             meta.insert("vmm_id".to_string(), vmm_id.to_string());
+            meta.insert("sled_id".to_string(), sled.id().to_string());
             opctx.child(meta)
         };
 
@@ -107,96 +110,123 @@ impl InstanceWatcher {
                 opctx.log,
                 "checking on VMM";
                 "propolis_id" => %vmm_id
             );
-            let rsp = client
-                .vmm_get_state(&vmm_id)
-                .await
-                .map_err(SledAgentInstanceError);
-
             let mut check = Check {
                 target,
                 outcome: Default::default(),
                 result: Ok(()),
                 update_saga_queued: false,
             };
-            let state: SledVmmState = match rsp {
-                Ok(rsp) => rsp.into_inner().into(),
-                // Oh, this error indicates that the VMM should transition to
-                // `Failed`. Let's synthesize a `SledInstanceState` that does
-                // that.
-                Err(e) if e.vmm_gone() => {
-                    slog::info!(
-                        opctx.log,
-                        "sled-agent error indicates that this instance's \
-                         VMM has failed!";
-                        "error" => %e,
-                    );
-                    check.outcome =
-                        CheckOutcome::Failure(Failure::NoSuchInstance);
-                    // TODO(eliza): it would be nicer if this used the same
-                    // code path as `mark_instance_failed`...
-                    SledVmmState {
-                        vmm_state: nexus::VmmRuntimeState {
-                            r#gen: vmm.runtime.r#gen.0.next(),
-                            state: nexus::VmmState::Failed,
-                            time_updated: chrono::Utc::now(),
-                        },
-                        // It's fine to synthesize `None`s here because a `None`
-                        // just means "don't update the migration state", not
-                        // "there is no migration".
-                        migration_in: None,
-                        migration_out: None,
-                    }
+
+            let state = if sled.policy() == SledPolicy::Expunged {
+                // If the sled has been expunged, any VMMs still on that sled
+                // should be marked as `Failed`.
+                slog::info!(
+                    opctx.log,
+                    "instance is assigned to a VMM on an Expunged sled; \
+                     marking it as Failed";
+                );
+                check.outcome = CheckOutcome::Failure(Failure::SledExpunged);
+                // TODO(eliza): it would be nicer if this used the same
+                // code path as `mark_instance_failed`...
+                SledVmmState {
+                    vmm_state: nexus::VmmRuntimeState {
+                        r#gen: vmm.runtime.r#gen.0.next(),
+                        state: nexus::VmmState::Failed,
+                        time_updated: chrono::Utc::now(),
+                    },
+                    // It's fine to synthesize `None`s here because a `None`
+                    // just means "don't update the migration state", not
+                    // "there is no migration".
+                    migration_in: None,
+                    migration_out: None,
                 }
-                Err(SledAgentInstanceError(ClientError::ErrorResponse(
-                    rsp,
-                ))) => {
-                    let status = rsp.status();
-                    if status.is_client_error() {
-                        slog::warn!(opctx.log, "check failed due to client error";
+            } else {
+                // Otherwise, ask the sled-agent what it has to say for itself.
+                let rsp = client
+                    .vmm_get_state(&vmm_id)
+                    .await
+                    .map_err(SledAgentInstanceError);
+                match rsp {
+                    Ok(rsp) => {
+                        let state: SledVmmState = rsp.into_inner().into();
+                        check.outcome =
+                            CheckOutcome::Success(state.vmm_state.state.into());
+                        state
+                    }
+                    // Oh, this error indicates that the VMM should transition to
+                    // `Failed`. Let's synthesize a `SledInstanceState` that does
+                    // that.
+                    Err(e) if e.vmm_gone() => {
+                        slog::info!(
+                            opctx.log,
+                            "sled-agent error indicates that this instance's \
+                             VMM has failed!";
+                            "error" => %e,
+                        );
+                        check.outcome =
+                            CheckOutcome::Failure(Failure::NoSuchInstance);
+                        // TODO(eliza): it would be nicer if this used the same
+                        // code path as `mark_instance_failed`...
+                        SledVmmState {
+                            vmm_state: nexus::VmmRuntimeState {
+                                r#gen: vmm.runtime.r#gen.0.next(),
+                                state: nexus::VmmState::Failed,
+                                time_updated: chrono::Utc::now(),
+                            },
+                            // It's fine to synthesize `None`s here because a `None`
+                            // just means "don't update the migration state", not
+                            // "there is no migration".
+                            migration_in: None,
+                            migration_out: None,
+                        }
+                    }
+                    Err(SledAgentInstanceError(
+                        ClientError::ErrorResponse(rsp),
+                    )) => {
+                        let status = rsp.status();
+                        if status.is_client_error() {
+                            slog::warn!(opctx.log, "check incomplete due to client error";
                             "status" => ?status, "error" => ?rsp.into_inner());
-                        check.result =
-                            Err(Incomplete::ClientHttpError(status.as_u16()));
-                    } else {
-                        slog::info!(opctx.log, "check failed due to server error";
+                        } else {
+                            slog::info!(opctx.log, "check incomplete due to server error";
                             "status" => ?status, "error" => ?rsp.into_inner());
-                    }
+                        }
 
-                    check.outcome = CheckOutcome::Failure(
-                        Failure::SledAgentResponse(status.as_u16()),
-                    );
-                    return check;
-                }
-                Err(SledAgentInstanceError(
-                    ClientError::CommunicationError(e),
-                )) => {
-                    // TODO(eliza): eventually, we may want to transition the
-                    // instance to the `Failed` state if the sled-agent has been
-                    // unreachable for a while. We may also want to take other
-                    // corrective actions or alert an operator in this case.
-                    //
-                    // TODO(eliza): because we have the preported IP address
-                    // of the instance's VMM from our databse query, we could
-                    // also ask the VMM directly when the sled-agent is
-                    // unreachable. We should start doing that here at some
-                    // point.
+                        check.result = Err(Incomplete::SledAgentHttpError(
+                            status.as_u16(),
+                        ));
+                        return check;
+                    }
+                    Err(SledAgentInstanceError(
+                        ClientError::CommunicationError(e),
+                    )) => {
+                        // TODO(eliza): eventually, we may want to transition the
+                        // instance to the `Failed` state if the sled-agent has been
+                        // unreachable for a while. We may also want to take other
+                        // corrective actions or alert an operator in this case.
+                        //
+                        // TODO(eliza): because we have the purported IP address
+                        // of the instance's VMM from our database query, we could
+                        // also ask the VMM directly when the sled-agent is
+                        // unreachable. We should start doing that here at some
+                        // point.
+                        slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e);
+                        check.result = Err(Incomplete::SledAgentUnreachable);
+                        return check;
+                    }
+                    Err(SledAgentInstanceError(e)) => {
+                        slog::warn!(
+                            opctx.log,
+                            "error checking up on instance";
+                            "error" => ?e,
+                            "status" => ?e.status(),
+                        );
+                        check.result = Err(Incomplete::ClientError);
+                        return check;
+                    }
                 }
             };
-            check.outcome = CheckOutcome::Success(state.vmm_state.state.into());
             debug!(
                 opctx.log,
                 "updating instance state";
@@ -328,26 +358,20 @@ impl Check {
     Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize,
 )]
 enum Failure {
-    /// The sled-agent for the sled on which the instance is running was
-    /// unreachable.
-    ///
-    /// This may indicate a network partition between us and that sled, that
-    /// the sled-agent process has crashed, or that the sled is down.
-    SledAgentUnreachable,
-    /// The sled-agent responded with an unexpected HTTP error.
-    SledAgentResponse(u16),
     /// The sled-agent indicated that it doesn't know about an instance ID that
     /// we believe it *should* know about. This probably means the sled-agent,
    /// and potentially the whole sled, has been restarted.
     NoSuchInstance,
+    /// The instance was assigned to a sled that was expunged. Its VMM has been
+    /// marked as `Failed`, since the sled is no longer present.
+    SledExpunged,
 }
 
 impl Failure {
     fn as_str(&self) -> Cow<'static, str> {
         match self {
-            Self::SledAgentUnreachable => "unreachable".into(),
-            Self::SledAgentResponse(status) => status.to_string().into(),
             Self::NoSuchInstance => "no_such_instance".into(),
+            Self::SledExpunged => "sled_expunged".into(),
         }
     }
 }
@@ -356,11 +380,16 @@ impl Failure {
     Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize,
 )]
 enum Incomplete {
-    /// The sled-agent responded with an HTTP client error, indicating that our
-    /// request as somehow malformed.
-    ClientHttpError(u16),
     /// Something else went wrong while making an HTTP request.
     ClientError,
+    /// The sled-agent for the sled on which the instance is running was
+    /// unreachable.
+    ///
+    /// This may indicate a network partition between us and that sled, that
+    /// the sled-agent process has crashed, or that the sled is down.
+    SledAgentUnreachable,
+    /// The sled-agent responded with an unexpected HTTP error.
+    SledAgentHttpError(u16),
     /// We attempted to update the instance state in the database, but no
     /// instance with that UUID existed.
     ///
@@ -377,8 +406,9 @@ enum Incomplete {
 impl Incomplete {
     fn as_str(&self) -> Cow<'static, str> {
         match self {
-            Self::ClientHttpError(status) => status.to_string().into(),
             Self::ClientError => "client_error".into(),
+            Self::SledAgentHttpError(status) => status.to_string().into(),
+            Self::SledAgentUnreachable => "unreachable".into(),
             Self::InstanceNotFound => "instance_not_found".into(),
             Self::UpdateFailed => "update_failed".into(),
         }
@@ -463,9 +493,8 @@ impl BackgroundTask for InstanceWatcher {
                 },
             };
 
-            let target = VirtualMachine::new(self.id, &sled, &instance, &vmm, &project);
-            tasks.spawn(self.check_instance(opctx, client, target, vmm));
+            tasks.spawn(self.check_instance(opctx, client, target, vmm, sled));
         }
 
         // Now, wait for the check results to come back.
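Condensed, the new `check_instance` control flow is: if the sled's policy is `Expunged`, synthesize a `Failed` VMM state with a bumped generation and never contact the sled-agent; otherwise fall through to the existing sled-agent query. The sketch below captures that branch with simplified, hypothetical stand-in types (these are not the real `nexus-types` definitions, and the generation is modeled as a plain integer).

```rust
// Hypothetical, simplified model of the expunged-sled branch above. The real
// code uses nexus-types / nexus-db-model types and a typed generation number;
// here they are replaced with local stand-ins so the sketch is self-contained.
// Assumes the `chrono` crate is available.
use chrono::{DateTime, Utc};

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SledPolicy {
    InService,
    Expunged,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum VmmState {
    Running,
    Failed,
}

#[derive(Debug, Clone)]
struct VmmRuntimeState {
    generation: u64,
    state: VmmState,
    time_updated: DateTime<Utc>,
}

/// Decide the VMM state recorded by one health-check pass.
/// `fetch_from_sled_agent` stands in for the `vmm_get_state` call.
fn check_vmm(
    sled_policy: SledPolicy,
    current: &VmmRuntimeState,
    fetch_from_sled_agent: impl FnOnce() -> VmmRuntimeState,
) -> VmmRuntimeState {
    if sled_policy == SledPolicy::Expunged {
        // The sled is gone: don't contact its sled-agent, just synthesize a
        // `Failed` state with a newer generation so it supersedes the stale
        // record.
        VmmRuntimeState {
            generation: current.generation + 1,
            state: VmmState::Failed,
            time_updated: Utc::now(),
        }
    } else {
        // Otherwise, ask the sled-agent what it has to say for itself.
        fetch_from_sled_agent()
    }
}

fn main() {
    let current = VmmRuntimeState {
        generation: 3,
        state: VmmState::Running,
        time_updated: Utc::now(),
    };

    let healthy = check_vmm(SledPolicy::InService, &current, || current.clone());
    assert_eq!(healthy.state, VmmState::Running);

    let expunged = check_vmm(SledPolicy::Expunged, &current, || current.clone());
    assert_eq!(expunged.state, VmmState::Failed);
    assert_eq!(expunged.generation, current.generation + 1);
    println!("synthesized state: {expunged:?}");
}
```

The bumped generation is what lets the synthesized `Failed` record supersede the last state reported for the VMM, mirroring what the pre-existing `vmm_gone` path already does.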
@@ -487,10 +516,11 @@ impl BackgroundTask for InstanceWatcher {
                     CheckOutcome::Failure(reason) => {
                         *check_failures.entry(reason.as_str().into_owned()).or_default() += 1;
                     }
-                    CheckOutcome::Unknown => {}
-                }
-                if let Err(ref reason) = check.result {
-                    *check_errors.entry(reason.as_str().into_owned()).or_default() += 1;
+                    CheckOutcome::Unknown => {
+                        if let Err(reason) = check.result {
+                            *check_errors.entry(reason.as_str().into_owned()).or_default() += 1;
+                        }
+                    }
                 }
                 if check.update_saga_queued {
                     update_sagas_queued += 1;
diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs
index 9c21ca73a1..5c8b991043 100644
--- a/nexus/src/app/sled.rs
+++ b/nexus/src/app/sled.rs
@@ -102,7 +102,19 @@ impl super::Nexus {
         let sled_lookup = self.sled_lookup(opctx, &sled_id)?;
         let (authz_sled,) =
             sled_lookup.lookup_for(authz::Action::Modify).await?;
-        self.db_datastore.sled_set_policy_to_expunged(opctx, &authz_sled).await
+        let prev_policy = self
+            .db_datastore
+            .sled_set_policy_to_expunged(opctx, &authz_sled)
+            .await?;
+
+        // The instance-watcher background task is responsible for marking any
+        // VMMs running on `Expunged` sleds as `Failed`, so that their instances
+        // can transition to `Failed` and be deleted or restarted. Let's go
+        // ahead and activate it now so that those instances don't need to wait
+        // for the next periodic activation before they can be cleaned up.
+        self.background_tasks.task_instance_watcher.activate();
+
+        Ok(prev_policy)
     }
 
     pub(crate) async fn sled_request_firewall_rules(
diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs
index 25c8056b22..a3f193ce16 100644
--- a/nexus/tests/integration_tests/instances.rs
+++ b/nexus/tests/integration_tests/instances.rs
@@ -1209,6 +1209,85 @@ async fn test_instance_failed_by_instance_watcher_can_be_restarted(
     expect_instance_delete_ok(&client, instance_name).await;
 }
 
+// Verifies that instances currently on Expunged sleds are marked as failed.
+#[nexus_test]
+async fn test_instance_failed_when_on_expunged_sled(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let client = &cptestctx.external_client;
+    let apictx = &cptestctx.server.server_context();
+    let nexus = &apictx.nexus;
+    let instance1_name = "romeo";
+    let instance2_name = "juliet";
+
+    create_project_and_pool(&client).await;
+
+    // Create and start the test instances.
+    let mk_instance = |name: &'static str| async move {
+        let instance_url = get_instance_url(name);
+        let instance = create_instance(client, PROJECT_NAME, name).await;
+        let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id);
+        instance_simulate(nexus, &instance_id).await;
+        let instance_next = instance_get(&client, &instance_url).await;
+        assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
+
+        slog::info!(
+            &cptestctx.logctx.log,
+            "test instance is running";
+            "instance_name" => %name,
+            "instance_id" => %instance_id,
+        );
+
+        instance_id
+    };
+    let instance1_id = mk_instance(instance1_name).await;
+    let instance2_id = mk_instance(instance2_name).await;
+
+    // Create a second sled for instances on the Expunged sled to be assigned
+    // to.
+    let default_sled_id: SledUuid =
+        nexus_test_utils::SLED_AGENT_UUID.parse().unwrap();
+    let update_dir = Utf8Path::new("/should/be/unused");
+    let other_sled_id = SledUuid::new_v4();
+    let _other_sa = nexus_test_utils::start_sled_agent(
+        cptestctx.logctx.log.new(o!("sled_id" => other_sled_id.to_string())),
+        cptestctx.server.get_http_server_internal_address().await,
+        other_sled_id,
+        &update_dir,
+        sim::SimMode::Explicit,
+    )
+    .await
+    .unwrap();
+
+    // Expunge the sled
+    slog::info!(
+        &cptestctx.logctx.log,
+        "expunging sled";
+        "sled_id" => %default_sled_id,
+    );
+    let int_client = &cptestctx.internal_client;
+    int_client
+        .make_request(
+            Method::POST,
+            "/sleds/expunge",
+            Some(params::SledSelector {
+                sled: default_sled_id.into_untyped_uuid(),
+            }),
+            StatusCode::OK,
+        )
+        .await
+        .unwrap();
+
+    // Wait for both instances to transition to Failed.
+    instance_wait_for_state(client, instance1_id, InstanceState::Failed).await;
+    instance_wait_for_state(client, instance2_id, InstanceState::Failed).await;
+
+    // Now, the instances should be deletable...
+    expect_instance_delete_ok(&client, instance1_name).await;
+    // ...or restartable.
+    expect_instance_start_ok(client, instance2_name).await;
+}
+
 #[nexus_test]
 async fn test_instances_are_not_marked_failed_on_other_sled_agent_errors(
     cptestctx: &ControlPlaneTestContext,
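For context on the expunge test's final assertions: `instance_wait_for_state` polls until the instance reports the expected state or a timeout elapses. A rough, hypothetical sketch of that polling pattern (not the actual test-helper implementation, which talks to the external API asynchronously) looks like this:

```rust
// Hypothetical sketch of the wait-for-state pattern used by the test above;
// the real helper queries the instance over the external API with backoff.
use std::time::{Duration, Instant};

/// Polls `poll` until it returns `expected` or `timeout` elapses.
fn wait_for<T: PartialEq + std::fmt::Debug>(
    mut poll: impl FnMut() -> T,
    expected: T,
    timeout: Duration,
    interval: Duration,
) -> Result<(), String> {
    let deadline = Instant::now() + timeout;
    loop {
        let observed = poll();
        if observed == expected {
            return Ok(());
        }
        if Instant::now() >= deadline {
            return Err(format!(
                "timed out waiting for {expected:?}; last observed {observed:?}"
            ));
        }
        std::thread::sleep(interval);
    }
}

fn main() {
    // Simulate an instance that reaches the expected state after a few polls,
    // the way an expunged sled's instances move to Failed once the watcher runs.
    let mut polls = 0;
    let result = wait_for(
        || {
            polls += 1;
            if polls >= 3 { "Failed" } else { "Running" }
        },
        "Failed",
        Duration::from_secs(5),
        Duration::from_millis(10),
    );
    assert!(result.is_ok());
    println!("reached the expected state after {polls} polls");
}
```

Because `sled_expunge` now activates the instance-watcher task immediately, this wait should complete after a single checking pass rather than waiting out the task's periodic activation interval.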