From 46eb2b4b57d5da51ac22bf0b0ad47f936b06b4ab Mon Sep 17 00:00:00 2001
From: Eliza Weisman
Date: Mon, 2 Sep 2024 11:41:45 -0700
Subject: [PATCH] [nexus] handle sled-agent errors as described in RFD 486 (#6455)

[RFD 486] describes a principled design for how to use the Failed state for instances and VMMs. In particular, it proposes that:

- Instances and VMMs should move to the `Failed` state independently.
- A VMM goes to `Failed` if either:
  + Nexus observes that a VMM it thought was resident on a sled is no longer resident there, or,
  + it observes through some other means (sled expungement, sled reboot) that all VMMs that were resident on that sled must now be gone.
- Instances go to `Failed` when an `instance-update` saga observes that an instance's active VMM has been marked `Failed`. An update saga transitioning an instance to `Failed` cleans up the instance's resource allocations, similarly to how update sagas handle transitioning an instance to `Destroyed` when its active VMM has been destroyed.
- For purposes of the external API:
  + An instance that is itself in the `Failed` state appears externally to be `Failed`.
  + If an instance is in the "has-VMM" internal state and points to a `Failed` VMM, the instance is `Stopping` and then transitions to `Failed`.
  + Once the actual instance (not just its active VMM) is `Failed`, it can be started or destroyed.

This branch implements the behavior described above. In particular, I've added code to the `instance-update` saga to handle instances whose active or migration target VMMs have transitioned to `Failed`, similarly to how it handles VMMs transitioning to `Destroyed`. I've changed the code that detects whether a sled-agent error indicates that an instance has failed so that it only does so when the error is a 404 with the specific error code indicating that the sled-agent has forgotten about a VMM it previously knew about, and changed the Nexus functions that ask sled-agents to update an instance's state, as well as the `instance_watcher` background task, to mark instances as `Failed` only when they encounter the appropriate sled-agent error. And, I've changed the `mark_instance_failed` function in Nexus to instead mark the VMM record as failed and trigger an instance-update saga (it's now named `mark_vmm_failed`), and made similar changes to the `instance_watcher` background task. Finally, I've also made sure that `Failed` instances can be destroyed and restarted, and that instances with `Failed` active VMMs appear to be `Stopping` until the VMM is cleaned up by an update saga.

In addition to this, it was necessary to (re)introduce a `VmmState::Creating` variant. The purpose of this variant is to differentiate VMMs which have been created in the database but do not yet exist on a sled from those which are known to sled-agent and are actually starting/migrating. This is necessary because, presently, the `instance_watcher` background task will attempt to perform health checks for `Starting` and `Migrating` VMMs, and --- if the sled-agent does not actually know about the VMM yet --- will move them to `Failed`. But a newly created VMM is not known to the sled-agent until it has been registered, so if the `instance_watcher` task activates while an `instance-start` or `instance-migrate` saga is in flight, it may incorrectly move the saga's VMM to `Failed` if the VMM record has been created but not yet registered.
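To make the externally-visible mapping above concrete, here is a minimal, self-contained sketch. The types and the `effective_state` function below are simplified stand-ins for illustration only (they are not the actual Nexus definitions); they show how a `Failed` active VMM surfaces as `Stopping` until it is unlinked, and how the internal `Creating` state surfaces as `Starting`:

```rust
// Simplified stand-ins for Nexus' instance/VMM state types; illustrative only.
#[allow(dead_code)]
#[derive(Debug, PartialEq)]
enum VmmState {
    Creating,
    Starting,
    Running,
    Stopping,
    Failed,
    Destroyed,
}

#[allow(dead_code)]
#[derive(Debug, PartialEq)]
enum InstanceState {
    NoVmm,
    Vmm,
    Failed,
    Destroyed,
}

#[derive(Debug, PartialEq)]
enum ExternalState {
    Starting,
    Running,
    Stopping,
    Stopped,
    Failed,
    Destroyed,
}

/// Maps an instance record plus its (optional) active VMM to the state shown
/// in the external API, following the determinations described above.
fn effective_state(instance: InstanceState, vmm: Option<VmmState>) -> ExternalState {
    match vmm {
        // A Failed active VMM keeps the instance looking like it is Stopping
        // until an instance-update saga unlinks the VMM and marks the
        // instance record itself as Failed.
        Some(VmmState::Failed) => ExternalState::Stopping,
        // Creating is internal to Nexus; externally it appears as Starting.
        Some(VmmState::Creating) | Some(VmmState::Starting) => ExternalState::Starting,
        Some(VmmState::Running) => ExternalState::Running,
        Some(VmmState::Stopping) => ExternalState::Stopping,
        // A Destroyed VMM is transient; an update saga will soon unlink it.
        Some(VmmState::Destroyed) => ExternalState::Destroyed,
        // With no active VMM, the instance record is authoritative.
        None => match instance {
            InstanceState::Failed => ExternalState::Failed,
            InstanceState::Destroyed => ExternalState::Destroyed,
            InstanceState::NoVmm | InstanceState::Vmm => ExternalState::Stopped,
        },
    }
}

fn main() {
    // An instance whose active VMM has failed, but which has not yet been
    // cleaned up by an update saga, appears to be Stopping...
    assert_eq!(
        effective_state(InstanceState::Vmm, Some(VmmState::Failed)),
        ExternalState::Stopping
    );
    // ...and only once the VMM is unlinked and the instance itself is marked
    // Failed does it appear Failed (at which point it may be started again or
    // destroyed).
    assert_eq!(effective_state(InstanceState::Failed, None), ExternalState::Failed);
}
```

Hiding a `Failed` VMM behind `Stopping` until the update saga has unlinked it means that a start saga only ever has to deal with an instance that is fully in the `Failed` state, rather than one whose failed VMM is still being torn down.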
Now, after adding the `Creating` variant, we can avoid performing health checks for those VMMs until the sled-agent actually acknowledges a request to register the VMM. Besides the behavior implemented here, RFD 486 also proposes that the `boot_on_fault` field in the instance record be used by Nexus to determine whether to automatically attempt to restart a `Failed` instance. This is described in issues #6491and #4872. I'm planning to implement this in a separate branch, PR #6503. [RFD 486]: https://rfd.shared.oxide.computer/rfd/0486#_determinations --- common/src/api/internal/nexus.rs | 11 + dev-tools/omdb/src/bin/omdb/nexus.rs | 10 +- dev-tools/omdb/tests/successes.out | 6 +- nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/vmm.rs | 16 +- nexus/db-model/src/vmm_state.rs | 61 ++- nexus/db-queries/src/db/datastore/instance.rs | 29 +- nexus/db-queries/src/db/datastore/vmm.rs | 49 +- .../app/background/tasks/instance_updater.rs | 26 +- .../app/background/tasks/instance_watcher.rs | 90 ++-- nexus/src/app/instance.rs | 468 +++++++++++------- nexus/src/app/sagas/instance_common.rs | 46 +- nexus/src/app/sagas/instance_migrate.rs | 53 +- nexus/src/app/sagas/instance_start.rs | 65 ++- nexus/src/app/sagas/instance_update/mod.rs | 106 ++-- .../src/app/sagas/region_replacement_drive.rs | 26 +- .../sagas/region_snapshot_replacement_step.rs | 3 +- nexus/tests/integration_tests/instances.rs | 248 ++++++++-- schema/crdb/dbinit.sql | 10 +- .../crdb/put-back-creating-vmm-state/up.sql | 2 + sled-agent/src/sim/sled_agent.rs | 64 +-- 21 files changed, 963 insertions(+), 429 deletions(-) create mode 100644 schema/crdb/put-back-creating-vmm-state/up.sql diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 191716d017..cec44d3f43 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -102,6 +102,17 @@ pub enum VmmState { Destroyed, } +impl VmmState { + /// States in which the VMM no longer exists and must be cleaned up. + pub const TERMINAL_STATES: &'static [Self] = + &[Self::Failed, Self::Destroyed]; + + /// Returns `true` if this VMM is in a terminal state. + pub fn is_terminal(&self) -> bool { + Self::TERMINAL_STATES.contains(self) + } +} + /// The dynamic runtime properties of an individual VMM process. 
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)] pub struct VmmRuntimeState { diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 6d7152d9f7..5fd9543cc0 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1394,6 +1394,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { /// number of instances found with destroyed active VMMs destroyed_active_vmms: usize, + /// number of instances found with failed active VMMs + failed_active_vmms: usize, + /// number of instances found with terminated active migrations terminated_active_migrations: usize, @@ -1419,6 +1422,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { ), Ok(UpdaterStatus { destroyed_active_vmms, + failed_active_vmms, terminated_active_migrations, sagas_started, sagas_completed, @@ -1436,9 +1440,13 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { destroyed_active_vmms + terminated_active_migrations ); println!( - " instances with destroyed active VMMs: {}", + " instances with Destroyed active VMMs: {}", destroyed_active_vmms, ); + println!( + " instances with Failed active VMMs: {}", + failed_active_vmms, + ); println!( " instances with terminated active migrations: {}", terminated_active_migrations, diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index e7720dd12c..fd5f393ffc 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -524,7 +524,8 @@ task: "instance_updater" last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms total instances in need of updates: 0 - instances with destroyed active VMMs: 0 + instances with Destroyed active VMMs: 0 + instances with Failed active VMMs: 0 instances with terminated active migrations: 0 update sagas started: 0 update sagas completed successfully: 0 @@ -950,7 +951,8 @@ task: "instance_updater" last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms total instances in need of updates: 0 - instances with destroyed active VMMs: 0 + instances with Destroyed active VMMs: 0 + instances with Failed active VMMs: 0 instances with terminated active migrations: 0 update sagas started: 0 update sagas completed successfully: 0 diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 2438f37fba..54718bbddb 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(93, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(94, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(94, "put-back-creating-vmm-state"), KnownVersion::new(93, "dataset-kinds-zone-and-debug"), KnownVersion::new(92, "lldp-link-config-nullable"), KnownVersion::new(91, "add-management-gateway-producer-kind"), diff --git a/nexus/db-model/src/vmm.rs b/nexus/db-model/src/vmm.rs index 4cc6235e1d..45411f587a 100644 --- a/nexus/db-model/src/vmm.rs +++ b/nexus/db-model/src/vmm.rs @@ -60,27 +60,19 @@ pub struct Vmm { pub runtime: VmmRuntimeState, } -/// The set of states that a VMM can have when it is created. -pub enum VmmInitialState { - Starting, - Migrating, -} - impl Vmm { /// Creates a new VMM record. + /// + /// The new VMM record will be in [`VmmState::Creating`] until it is + /// registered with a sled-agent. pub fn new( id: PropolisUuid, instance_id: InstanceUuid, sled_id: SledUuid, propolis_ip: ipnetwork::IpNetwork, propolis_port: u16, - initial_state: VmmInitialState, ) -> Self { let now = Utc::now(); - let state = match initial_state { - VmmInitialState::Starting => VmmState::Starting, - VmmInitialState::Migrating => VmmState::Migrating, - }; Self { id: id.into_untyped_uuid(), @@ -91,7 +83,7 @@ impl Vmm { propolis_ip, propolis_port: SqlU16(propolis_port), runtime: VmmRuntimeState { - state, + state: VmmState::Creating, time_state_updated: now, gen: Generation::new(), }, diff --git a/nexus/db-model/src/vmm_state.rs b/nexus/db-model/src/vmm_state.rs index 7d44bbedbd..24724c7988 100644 --- a/nexus/db-model/src/vmm_state.rs +++ b/nexus/db-model/src/vmm_state.rs @@ -3,6 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::impl_enum_type; +use omicron_common::api::internal::nexus::VmmState as ApiState; use serde::Deserialize; use serde::Serialize; use std::fmt; @@ -16,6 +17,7 @@ impl_enum_type!( #[diesel(sql_type = VmmStateEnum)] pub enum VmmState; + Creating => b"creating" Starting => b"starting" Running => b"running" Stopping => b"stopping" @@ -30,6 +32,7 @@ impl_enum_type!( impl VmmState { pub fn label(&self) -> &'static str { match self { + VmmState::Creating => "creating", VmmState::Starting => "starting", VmmState::Running => "running", VmmState::Stopping => "stopping", @@ -45,7 +48,26 @@ impl VmmState { /// States in which it is safe to deallocate a VMM's sled resources and mark /// it as deleted. pub const DESTROYABLE_STATES: &'static [Self] = - &[Self::Destroyed, Self::SagaUnwound]; + &[Self::Destroyed, Self::Failed, Self::SagaUnwound]; + + pub const TERMINAL_STATES: &'static [Self] = + &[Self::Destroyed, Self::Failed]; + + /// States in which a VMM record is present in the database but is not + /// resident on a sled, either because it does not yet exist, was produced + /// by an unwound update saga and will never exist, or has already been + /// destroyed. + pub const NONEXISTENT_STATES: &'static [Self] = + &[Self::Creating, Self::SagaUnwound, Self::Destroyed]; + + pub fn is_terminal(&self) -> bool { + Self::TERMINAL_STATES.contains(self) + } + /// Returns `true` if the instance is in a state in which it exists on a + /// sled. 
+ pub fn exists_on_sled(&self) -> bool { + !Self::NONEXISTENT_STATES.contains(self) + } } impl fmt::Display for VmmState { @@ -58,7 +80,9 @@ impl From for omicron_common::api::internal::nexus::VmmState { fn from(value: VmmState) -> Self { use omicron_common::api::internal::nexus::VmmState as Output; match value { - VmmState::Starting => Output::Starting, + // The `Creating` state is internal to Nexus; the outside world + // should treat it as equivalent to `Starting`. + VmmState::Creating | VmmState::Starting => Output::Starting, VmmState::Running => Output::Running, VmmState::Stopping => Output::Stopping, VmmState::Stopped => Output::Stopped, @@ -74,7 +98,9 @@ impl From for sled_agent_client::types::VmmState { fn from(value: VmmState) -> Self { use sled_agent_client::types::VmmState as Output; match value { - VmmState::Starting => Output::Starting, + // The `Creating` state is internal to Nexus; the outside world + // should treat it as equivalent to `Starting`. + VmmState::Creating | VmmState::Starting => Output::Starting, VmmState::Running => Output::Running, VmmState::Stopping => Output::Stopping, VmmState::Stopped => Output::Stopped, @@ -86,9 +112,8 @@ impl From for sled_agent_client::types::VmmState { } } -impl From for VmmState { - fn from(value: omicron_common::api::internal::nexus::VmmState) -> Self { - use omicron_common::api::internal::nexus::VmmState as ApiState; +impl From for VmmState { + fn from(value: ApiState) -> Self { use VmmState as Output; match value { ApiState::Starting => Output::Starting, @@ -108,7 +133,12 @@ impl From for omicron_common::api::external::InstanceState { use omicron_common::api::external::InstanceState as Output; match value { - VmmState::Starting => Output::Starting, + // An instance with a VMM which is in the `Creating` state maps to + // `InstanceState::Starting`, rather than `InstanceState::Creating`. + // If we are still creating the VMM, this is because we are + // attempting to *start* the instance; instances may be created + // without creating a VMM to run them, and then started later. + VmmState::Creating | VmmState::Starting => Output::Starting, VmmState::Running => Output::Running, VmmState::Stopping => Output::Stopping, // `SagaUnwound` should map to `Stopped` so that an `instance_view` @@ -129,3 +159,20 @@ impl diesel::query_builder::QueryId for VmmStateEnum { type QueryId = (); const HAS_STATIC_QUERY_ID: bool = false; } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_all_terminal_api_states_are_terminal_db_states() { + for &api_state in ApiState::TERMINAL_STATES { + let db_state = VmmState::from(api_state); + assert!( + db_state.is_terminal(), + "API VmmState::{api_state:?} is considered terminal, but its \ + corresponding DB state ({db_state:?}) is not!" + ); + } + } +} diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 455aa62192..34579ad21f 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -156,6 +156,13 @@ impl InstanceAndActiveVmm { (InstanceState::Vmm, Some(VmmState::SagaUnwound)) => { external::InstanceState::Stopped } + // - An instance with a "failed" VMM should *not* be counted as + // failed until the VMM is unlinked, because a start saga must be + // able to run "failed" instance. Until then, it will continue to + // appear "stopping". 
+ (InstanceState::Vmm, Some(VmmState::Failed)) => { + external::InstanceState::Stopping + } // - An instance with no VMM is always "stopped" (as long as it's // not "starting" etc.) (InstanceState::NoVmm, _vmm_state) => { @@ -360,21 +367,21 @@ impl DataStore { .collect()) } - /// List all instances with active VMMs in the `Destroyed` state that don't - /// have currently-running instance-updater sagas. + /// List all instances with active VMMs in the provided [`VmmState`] which + /// don't have currently-running instance-updater sagas. /// /// This is used by the `instance_updater` background task to ensure that /// update sagas are scheduled for these instances. - pub async fn find_instances_with_destroyed_active_vmms( + pub async fn find_instances_by_active_vmm_state( &self, opctx: &OpContext, + vmm_state: VmmState, ) -> ListResultVec { - use db::model::VmmState; use db::schema::instance::dsl; use db::schema::vmm::dsl as vmm_dsl; vmm_dsl::vmm - .filter(vmm_dsl::state.eq(VmmState::Destroyed)) + .filter(vmm_dsl::state.eq(vmm_state)) // If the VMM record has already been deleted, we don't need to do // anything about it --- someone already has. .filter(vmm_dsl::time_deleted.is_null()) @@ -734,7 +741,7 @@ impl DataStore { format!( "cannot set migration ID {migration_id} for instance \ {instance_id} (perhaps another migration ID is \ - already present): {error:#}" + already present): {error:#?}" ), ), }) @@ -818,7 +825,15 @@ impl DataStore { vmm_dsl::vmm .on(vmm_dsl::sled_id .eq(sled_dsl::id) - .and(vmm_dsl::time_deleted.is_null())) + .and(vmm_dsl::time_deleted.is_null()) + // Ignore instances which are in states that are not + // known to exist on a sled. Since this query drives + // instance-watcher health checking, it is not necessary + // to perform health checks for VMMs that don't actually + // exist in real life. + .and( + vmm_dsl::state.ne_all(VmmState::NONEXISTENT_STATES), + )) .inner_join( instance_dsl::instance .on(instance_dsl::id diff --git a/nexus/db-queries/src/db/datastore/vmm.rs b/nexus/db-queries/src/db/datastore/vmm.rs index 089a2914be..77b8069a36 100644 --- a/nexus/db-queries/src/db/datastore/vmm.rs +++ b/nexus/db-queries/src/db/datastore/vmm.rs @@ -82,11 +82,9 @@ impl DataStore { opctx: &OpContext, vmm_id: &PropolisUuid, ) -> UpdateResult { - let valid_states = vec![DbVmmState::Destroyed, DbVmmState::Failed]; - let updated = diesel::update(dsl::vmm) .filter(dsl::id.eq(vmm_id.into_untyped_uuid())) - .filter(dsl::state.eq_any(valid_states)) + .filter(dsl::state.eq_any(DbVmmState::DESTROYABLE_STATES)) .filter(dsl::time_deleted.is_null()) .set(dsl::time_deleted.eq(Utc::now())) .check_if_exists::(vmm_id.into_untyped_uuid()) @@ -307,6 +305,49 @@ impl DataStore { }) } + /// Transitions a VMM to the `SagaUnwound` state. + /// + /// # Warning + /// + /// This may *only* be called by the saga that created a VMM record, as it + /// unconditionally increments the generation number and advances the VMM to + /// the `SagaUnwound` state. + /// + /// This is necessary as it is executed in compensating actions for + /// unwinding saga nodes which cannot easily determine whether other + /// actions, which advance the VMM's generation, have executed before the + /// saga unwound. 
+ pub async fn vmm_mark_saga_unwound( + &self, + opctx: &OpContext, + vmm_id: &PropolisUuid, + ) -> Result { + diesel::update(dsl::vmm) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(vmm_id.into_untyped_uuid())) + .set(( + dsl::state.eq(DbVmmState::SagaUnwound), + dsl::time_state_updated.eq(chrono::Utc::now()), + dsl::state_generation.eq(dsl::state_generation + 1), + )) + .check_if_exists::(vmm_id.into_untyped_uuid()) + .execute_and_check(&*self.pool_connection_authorized(opctx).await?) + .await + .map(|r| match r.status { + UpdateStatus::Updated => true, + UpdateStatus::NotUpdatedButExists => false, + }) + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByLookup( + ResourceType::Vmm, + LookupType::ById(vmm_id.into_untyped_uuid()), + ), + ) + }) + } + /// Forcibly overwrites the Propolis IP/Port in the supplied VMM's record with /// the supplied Propolis IP. /// @@ -357,7 +398,7 @@ impl DataStore { paginated(dsl::vmm, dsl::id, pagparams) // In order to be considered "abandoned", a VMM must be: - // - in the `Destroyed` or `SagaUnwound` state + // - in the `Destroyed`, `SagaUnwound`, or `Failed` states .filter(dsl::state.eq_any(DbVmmState::DESTROYABLE_STATES)) // - not deleted yet .filter(dsl::time_deleted.is_null()) diff --git a/nexus/src/app/background/tasks/instance_updater.rs b/nexus/src/app/background/tasks/instance_updater.rs index 46a3bead21..eea07f90a5 100644 --- a/nexus/src/app/background/tasks/instance_updater.rs +++ b/nexus/src/app/background/tasks/instance_updater.rs @@ -12,6 +12,7 @@ use anyhow::Context; use futures::future::BoxFuture; use futures::FutureExt; use nexus_db_model::Instance; +use nexus_db_model::VmmState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::lookup::LookupPath; use nexus_db_queries::db::DataStore; @@ -84,7 +85,8 @@ impl InstanceUpdater { "destroyed active VMMs", &opctx.log, &mut last_err, - self.datastore.find_instances_with_destroyed_active_vmms(opctx), + self.datastore + .find_instances_by_active_vmm_state(opctx, VmmState::Destroyed), ) .await; stats.destroyed_active_vmms = destroyed_active_vmms.len(); @@ -97,6 +99,24 @@ impl InstanceUpdater { ) .await; + let failed_active_vmms = find_instances( + "failed active VMMs", + &opctx.log, + &mut last_err, + self.datastore + .find_instances_by_active_vmm_state(opctx, VmmState::Failed), + ) + .await; + stats.failed_active_vmms = failed_active_vmms.len(); + self.start_sagas( + &opctx, + stats, + &mut last_err, + &mut sagas, + failed_active_vmms, + ) + .await; + let terminated_active_migrations = find_instances( "terminated active migrations", &opctx.log, @@ -196,6 +216,7 @@ impl InstanceUpdater { #[derive(Default)] struct ActivationStats { destroyed_active_vmms: usize, + failed_active_vmms: usize, terminated_active_migrations: usize, sagas_started: usize, sagas_completed: usize, @@ -221,6 +242,7 @@ impl BackgroundTask for InstanceUpdater { &opctx.log, "instance updater activation completed"; "destroyed_active_vmms" => stats.destroyed_active_vmms, + "failed_active_vmms" => stats.failed_active_vmms, "terminated_active_migrations" => stats.terminated_active_migrations, "update_sagas_started" => stats.sagas_started, "update_sagas_completed" => stats.sagas_completed, @@ -245,6 +267,7 @@ impl BackgroundTask for InstanceUpdater { "instance updater activation failed!"; "error" => %error, "destroyed_active_vmms" => stats.destroyed_active_vmms, + "failed_active_vmms" => stats.failed_active_vmms, "terminated_active_migrations" => stats.terminated_active_migrations, 
"update_sagas_started" => stats.sagas_started, "update_sagas_completed" => stats.sagas_completed, @@ -257,6 +280,7 @@ impl BackgroundTask for InstanceUpdater { }; json!({ "destroyed_active_vmms": stats.destroyed_active_vmms, + "failed_active_vmms": stats.failed_active_vmms, "terminated_active_migrations": stats.terminated_active_migrations, "sagas_started": stats.sagas_started, "sagas_completed": stats.sagas_completed, diff --git a/nexus/src/app/background/tasks/instance_watcher.rs b/nexus/src/app/background/tasks/instance_watcher.rs index ae78392ea3..7c90e7b1ec 100644 --- a/nexus/src/app/background/tasks/instance_watcher.rs +++ b/nexus/src/app/background/tasks/instance_watcher.rs @@ -5,9 +5,9 @@ //! Background task for pulling instance state from sled-agents. use crate::app::background::BackgroundTask; +use crate::app::instance::SledAgentInstanceError; use crate::app::saga::StartSaga; use futures::{future::BoxFuture, FutureExt}; -use http::StatusCode; use nexus_db_model::Instance; use nexus_db_model::Project; use nexus_db_model::Sled; @@ -19,6 +19,7 @@ use nexus_types::identity::Asset; use nexus_types::identity::Resource; use omicron_common::api::external::Error; use omicron_common::api::external::InstanceState; +use omicron_common::api::internal::nexus; use omicron_common::api::internal::nexus::SledVmmState; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::PropolisUuid; @@ -67,47 +68,72 @@ impl InstanceWatcher { opctx: &OpContext, client: &SledAgentClient, target: VirtualMachine, + vmm: Vmm, ) -> impl Future + Send + 'static { let datastore = self.datastore.clone(); let sagas = self.sagas.clone(); - let opctx = opctx.child( - std::iter::once(( + let vmm_id = PropolisUuid::from_untyped_uuid(target.vmm_id); + let opctx = { + let mut meta = std::collections::BTreeMap::new(); + meta.insert( "instance_id".to_string(), target.instance_id.to_string(), - )) - .collect(), - ); + ); + meta.insert("vmm_id".to_string(), vmm_id.to_string()); + opctx.child(meta) + }; let client = client.clone(); async move { - let vmm_id = PropolisUuid::from_untyped_uuid(target.vmm_id); slog::trace!( opctx.log, "checking on VMM"; "propolis_id" => %vmm_id ); - let rsp = client.vmm_get_state(&vmm_id).await; + slog::trace!(opctx.log, "checking on instance..."); + let rsp = client + .vmm_get_state(&vmm_id) + .await + .map_err(SledAgentInstanceError); let mut check = Check { target, outcome: Default::default(), result: Ok(()), update_saga_queued: false, }; - let state = match rsp { - Ok(rsp) => rsp.into_inner(), - Err(ClientError::ErrorResponse(rsp)) => { - let status = rsp.status(); - if status == StatusCode::NOT_FOUND - && rsp.as_ref().error_code.as_deref() - == Some("NO_SUCH_INSTANCE") - { - slog::info!(opctx.log, "instance is wayyyyy gone"); - // TODO(eliza): eventually, we should attempt to put the - // instance in the `Failed` state here. - check.outcome = - CheckOutcome::Failure(Failure::NoSuchInstance); - return check; + let state: SledVmmState = match rsp { + Ok(rsp) => rsp.into_inner().into(), + // Oh, this error indicates that the VMM should transition to + // `Failed`. Let's synthesize a `SledInstanceState` that does + // that. + Err(e) if e.vmm_gone() => { + slog::info!( + opctx.log, + "sled-agent error indicates that this instance's \ + VMM has failed!"; + "error" => %e, + ); + check.outcome = + CheckOutcome::Failure(Failure::NoSuchInstance); + // TODO(eliza): it would be nicer if this used the same + // code path as `mark_instance_failed`... 
+ SledVmmState { + vmm_state: nexus::VmmRuntimeState { + r#gen: vmm.runtime.r#gen.0.next(), + state: nexus::VmmState::Failed, + time_updated: chrono::Utc::now(), + }, + // It's fine to synthesize `None`s here because a `None` + // just means "don't update the migration state", not + // "there is no migration". + migration_in: None, + migration_out: None, } + } + Err(SledAgentInstanceError(ClientError::ErrorResponse( + rsp, + ))) => { + let status = rsp.status(); if status.is_client_error() { slog::warn!(opctx.log, "check failed due to client error"; "status" => ?status, "error" => ?rsp.into_inner()); @@ -123,13 +149,15 @@ impl InstanceWatcher { ); return check; } - Err(ClientError::CommunicationError(e)) => { + Err(SledAgentInstanceError( + ClientError::CommunicationError(e), + )) => { // TODO(eliza): eventually, we may want to transition the // instance to the `Failed` state if the sled-agent has been // unreachable for a while. We may also want to take other // corrective actions or alert an operator in this case. // - // TODO(eliza): because we have the preported IP address + // TODO(eliza): because we have the preported IP address // of the instance's VMM from our databse query, we could // also ask the VMM directly when the sled-agent is // unreachable. We should start doing that here at some @@ -139,7 +167,7 @@ impl InstanceWatcher { CheckOutcome::Failure(Failure::SledAgentUnreachable); return check; } - Err(e) => { + Err(SledAgentInstanceError(e)) => { slog::warn!( opctx.log, "error checking up on instance"; @@ -151,19 +179,17 @@ impl InstanceWatcher { } }; - let new_runtime_state: SledVmmState = state.into(); - check.outcome = - CheckOutcome::Success(new_runtime_state.vmm_state.state.into()); + check.outcome = CheckOutcome::Success(state.vmm_state.state.into()); debug!( opctx.log, "updating instance state"; - "state" => ?new_runtime_state.vmm_state.state, + "state" => ?state, ); match crate::app::instance::process_vmm_update( &datastore, &opctx, PropolisUuid::from_untyped_uuid(target.vmm_id), - &new_runtime_state, + &state, ) .await { @@ -392,7 +418,7 @@ impl BackgroundTask for InstanceWatcher { if let Some((mut curr_sled, instance, vmm, project)) = batch.next() { let mut client = mk_client(&curr_sled); let target = VirtualMachine::new(self.id, &curr_sled, &instance, &vmm, &project); - tasks.spawn(self.check_instance(opctx, &client, target)); + tasks.spawn(self.check_instance(opctx, &client, target, vmm)); for (sled, instance, vmm, project) in batch { // We're now talking to a new sled agent; update the client. 
@@ -402,7 +428,7 @@ impl BackgroundTask for InstanceWatcher { } let target = VirtualMachine::new(self.id, &curr_sled, &instance, &vmm, &project); - tasks.spawn(self.check_instance(opctx, &client, target)); + tasks.spawn(self.check_instance(opctx, &client, target, vmm)); } } } diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index b715b6bbd3..fd63768fc7 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -22,6 +22,7 @@ use futures::{FutureExt, SinkExt, StreamExt}; use nexus_db_model::IpAttachState; use nexus_db_model::IpKind; use nexus_db_model::Vmm as DbVmm; +use nexus_db_model::VmmRuntimeState; use nexus_db_model::VmmState as DbVmmState; use nexus_db_queries::authn; use nexus_db_queries::authz; @@ -58,6 +59,7 @@ use propolis_client::support::InstanceSerialConsoleHelper; use propolis_client::support::WSClientOffset; use propolis_client::support::WebSocketStream; use sagas::instance_common::ExternalIpAttach; +use sagas::instance_update; use sled_agent_client::types::InstanceMigrationTargetParams; use sled_agent_client::types::InstanceProperties; use sled_agent_client::types::VmmPutStateBody; @@ -72,48 +74,92 @@ type SledAgentClientError = // Newtype wrapper to avoid the orphan type rule. #[derive(Debug, thiserror::Error)] -pub struct SledAgentInstancePutError(pub SledAgentClientError); +pub struct SledAgentInstanceError(pub SledAgentClientError); -impl std::fmt::Display for SledAgentInstancePutError { +impl std::fmt::Display for SledAgentInstanceError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } -impl From for SledAgentInstancePutError { +impl From for SledAgentInstanceError { fn from(value: SledAgentClientError) -> Self { Self(value) } } -impl From for omicron_common::api::external::Error { - fn from(value: SledAgentInstancePutError) -> Self { +impl From for omicron_common::api::external::Error { + fn from(value: SledAgentInstanceError) -> Self { value.0.into() } } -impl SledAgentInstancePutError { - /// Returns `true` if this error is of a class that indicates that Nexus - /// cannot assume anything about the health of the instance or its sled. - pub fn instance_unhealthy(&self) -> bool { - // TODO(#3238) TODO(#4226) For compatibility, this logic is lifted from - // the From impl that converts Progenitor client errors to - // `omicron_common::api::external::Error`s and from previous logic in - // this module that inferred instance health from those converted - // errors. In particular, some of the outer Progenitor client error - // variants (e.g. CommunicationError) can indicate transient conditions - // that aren't really fatal to an instance and don't otherwise indicate - // that it's unhealthy. - // - // To match old behavior until this is revisited, however, treat all - // Progenitor errors except for explicit error responses as signs of an - // unhealthy instance, and then only treat an instance as healthy if its - // sled returned a 400-level status code. +impl From for dropshot::HttpError { + fn from(value: SledAgentInstanceError) -> Self { + use dropshot::HttpError; + use progenitor_client::Error as ClientError; + // See RFD 486 §6.3: + // https://rfd.shared.oxide.computer/rfd/486#_stopping_or_rebooting_a_running_instance + match value { + // Errors communicating with the sled-agent should return 502 Bad + // Gateway to the caller. 
+ SledAgentInstanceError(ClientError::CommunicationError(e)) => { + // N.B.: it sure *seems* like we should be using the + // `HttpError::for_status` constructor here, but that + // constructor secretly calls the `for_client_error` + // constructor, which panics if we pass a status code that isn't + // a 4xx error. So, instead, we construct an internal error and + // then munge its status code. + // See https://github.com/oxidecomputer/dropshot/issues/693 + let mut error = HttpError::for_internal_error(e.to_string()); + error.status_code = if e.is_timeout() { + http::StatusCode::GATEWAY_TIMEOUT + } else { + http::StatusCode::BAD_GATEWAY + }; + error + } + // Invalid request errors from the sled-agent should return 500 + // Internal Server Errors. + SledAgentInstanceError(ClientError::InvalidRequest(s)) => { + HttpError::for_internal_error(s) + } + // Error responses from sled-agent that indicate the instance is + // unhealthy should be mapped to a 503 error. + e if e.vmm_gone() => { + let mut error = HttpError::for_unavail(None, e.to_string()); + error.external_message = "The instance was running but is no \ + longer reachable. It is being moved to the Failed state." + .to_string(); + error + } + // Other client errors can be handled by the normal + // `external::Error` to `HttpError` conversions. + SledAgentInstanceError(e) => HttpError::from(Error::from(e)), + } + } +} + +impl SledAgentInstanceError { + /// Returns `true` if this error is of a class that indicates that the + /// VMM is no longer known to the sled-agent. + pub fn vmm_gone(&self) -> bool { match &self.0 { + // See RFD 486 §6.3: + // https://rfd.shared.oxide.computer/rfd/486#_stopping_or_rebooting_a_running_instance + // + // > If a sled agent call returns 404 Not Found with a + // > NO_SUCH_INSTANCE error string, the VMM of interest is missing. + // > In this case, sled agent has disowned the VMM, so Nexus can + // > take control of its state machine and drive the VMM to the + // > Failed state and trigger an instance update saga that will + // > remove the dangling VMM ID from the instance and clean up the + // > VMM’s resources. progenitor_client::Error::ErrorResponse(rv) => { - !rv.status().is_client_error() + rv.status() == http::StatusCode::NOT_FOUND + && rv.error_code.as_deref() == Some("NO_SUCH_INSTANCE") } - _ => true, + _ => false, } } } @@ -124,7 +170,7 @@ impl SledAgentInstancePutError { pub enum InstanceStateChangeError { /// Sled agent returned an error from one of its instance endpoints. #[error("sled agent client error")] - SledAgent(#[source] SledAgentInstancePutError), + SledAgent(#[source] SledAgentInstanceError), /// Some other error occurred outside of the attempt to communicate with /// sled agent. @@ -144,6 +190,15 @@ impl From for omicron_common::api::external::Error { } } +impl From for dropshot::HttpError { + fn from(value: InstanceStateChangeError) -> Self { + match value { + InstanceStateChangeError::SledAgent(e) => e.into(), + InstanceStateChangeError::Other(e) => e.into(), + } + } +} + /// The kinds of state changes that can be requested of an instance's current /// VMM (i.e. the VMM pointed to be the instance's `propolis_id` field). 
pub(crate) enum InstanceStateChangeRequest { @@ -541,7 +596,7 @@ impl super::Nexus { &self, opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, - ) -> UpdateResult { + ) -> Result { let (.., authz_instance) = instance_lookup.lookup_for(authz::Action::Modify).await?; @@ -559,24 +614,23 @@ impl super::Nexus { ) .await { - if let InstanceStateChangeError::SledAgent(inner) = &e { - if inner.instance_unhealthy() { + if let (InstanceStateChangeError::SledAgent(inner), Some(vmm)) = + (&e, state.vmm()) + { + if inner.vmm_gone() { let _ = self - .mark_instance_failed( - &InstanceUuid::from_untyped_uuid( - authz_instance.id(), - ), - state.instance().runtime(), - inner, - ) + .mark_vmm_failed(opctx, authz_instance, vmm, inner) .await; } } - return Err(e.into()); + return Err(e); } - self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await + self.db_datastore + .instance_fetch_with_vmm(opctx, &authz_instance) + .await + .map_err(Into::into) } /// Attempts to start an instance if it is currently stopped. @@ -584,7 +638,7 @@ impl super::Nexus { self: &Arc, opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, - ) -> UpdateResult { + ) -> Result { let (.., authz_instance) = instance_lookup.lookup_for(authz::Action::Modify).await?; @@ -610,6 +664,7 @@ impl super::Nexus { self.db_datastore .instance_fetch_with_vmm(opctx, &authz_instance) .await + .map_err(Into::into) } } } @@ -619,7 +674,7 @@ impl super::Nexus { &self, opctx: &OpContext, instance_lookup: &lookup::Instance<'_>, - ) -> UpdateResult { + ) -> Result { let (.., authz_instance) = instance_lookup.lookup_for(authz::Action::Modify).await?; @@ -637,24 +692,23 @@ impl super::Nexus { ) .await { - if let InstanceStateChangeError::SledAgent(inner) = &e { - if inner.instance_unhealthy() { + if let (InstanceStateChangeError::SledAgent(inner), Some(vmm)) = + (&e, state.vmm()) + { + if inner.vmm_gone() { let _ = self - .mark_instance_failed( - &InstanceUuid::from_untyped_uuid( - authz_instance.id(), - ), - state.instance().runtime(), - inner, - ) + .mark_vmm_failed(opctx, authz_instance, vmm, inner) .await; } } - return Err(e.into()); + return Err(e); } - self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await + self.db_datastore + .instance_fetch_with_vmm(opctx, &authz_instance) + .await + .map_err(Into::into) } /// Idempotently ensures that the sled specified in `db_instance` does not @@ -670,9 +724,7 @@ impl super::Nexus { .await .map(|res| res.into_inner().updated_runtime.map(Into::into)) .map_err(|e| { - InstanceStateChangeError::SledAgent(SledAgentInstancePutError( - e, - )) + InstanceStateChangeError::SledAgent(SledAgentInstanceError(e)) }) } @@ -849,7 +901,7 @@ impl super::Nexus { ) .await .map(|res| res.into_inner().updated_runtime.map(Into::into)) - .map_err(|e| SledAgentInstancePutError(e)); + .map_err(|e| SledAgentInstanceError(e)); // If the operation succeeded, write the instance state back, // returning any subsequent errors that occurred during that @@ -886,7 +938,7 @@ impl super::Nexus { propolis_id: &PropolisUuid, initial_vmm: &db::model::Vmm, operation: InstanceRegisterReason, - ) -> Result<(), Error> { + ) -> Result { opctx.authorize(authz::Action::Modify, authz_instance).await?; // Check that the hostname is valid. 
@@ -903,7 +955,7 @@ impl super::Nexus { at that time.", db_instance.hostname, ); - return Err(Error::invalid_request(&msg)); + return Err(Error::invalid_request(&msg).into()); }; // Gather disk information and turn that into DiskRequests @@ -937,7 +989,8 @@ impl super::Nexus { return Err(Error::internal_error(&format!( "disk {} is attached but has no PCI slot assignment", disk.id() - ))); + )) + .into()); } }; @@ -961,7 +1014,8 @@ impl super::Nexus { device: "nvme".to_string(), volume_construction_request: serde_json::from_str( &volume.data(), - )?, + ) + .map_err(Error::from)?, }); } @@ -991,7 +1045,8 @@ impl super::Nexus { external_ips.len(), ) .as_str(), - )); + ) + .into()); } // If there are any external IPs not yet fully attached/detached,then @@ -1000,7 +1055,7 @@ impl super::Nexus { if external_ips.iter().any(|v| v.state != IpAttachState::Attached) { return Err(Error::unavail( "External IP attach/detach is in progress during instance_ensure_registered" - )); + ).into()); } // Partition remaining external IPs by class: we can have at most @@ -1017,7 +1072,8 @@ impl super::Nexus { ephemeral_ips.len() ) .as_str(), - )); + ) + .into()); } let ephemeral_ip = ephemeral_ips.get(0).map(|model| model.ip.ip()); @@ -1027,7 +1083,8 @@ impl super::Nexus { if snat_ip.len() != 1 { return Err(Error::internal_error( "Expected exactly one SNAT IP address for an instance", - )); + ) + .into()); } let source_nat = SourceNatConfig::try_from(snat_ip.into_iter().next().unwrap()) @@ -1119,13 +1176,31 @@ impl super::Nexus { let sa = self .sled_client(&SledUuid::from_untyped_uuid(initial_vmm.sled_id)) .await?; + // The state of a freshly-created VMM record in the database is always + // `VmmState::Creating`. Based on whether this VMM is created in order + // to start or migrate an instance, determine the VMM state we will want + // the sled-agent to create the VMM in. Once sled-agent acknowledges our + // registration request, we will advance the database record to the new + // state. + let vmm_runtime = sled_agent_client::types::VmmRuntimeState { + time_updated: chrono::Utc::now(), + r#gen: initial_vmm.runtime.gen.next(), + state: match operation { + InstanceRegisterReason::Migrate { .. } => { + sled_agent_client::types::VmmState::Migrating + } + InstanceRegisterReason::Start { .. 
} => { + sled_agent_client::types::VmmState::Starting + } + }, + }; let instance_register_result = sa .vmm_register( propolis_id, &sled_agent_client::types::InstanceEnsureBody { hardware: instance_hardware, instance_runtime: db_instance.runtime().clone().into(), - vmm_runtime: initial_vmm.clone().into(), + vmm_runtime, instance_id, propolis_addr: SocketAddr::new( initial_vmm.propolis_ip.ip(), @@ -1137,65 +1212,103 @@ impl super::Nexus { ) .await .map(|res| res.into_inner().into()) - .map_err(|e| SledAgentInstancePutError(e)); + .map_err(|e| SledAgentInstanceError(e)); match instance_register_result { Ok(state) => { self.notify_vmm_updated(opctx, *propolis_id, &state).await?; + Ok(db::model::Vmm { + runtime: state.vmm_state.into(), + ..initial_vmm.clone() + }) } Err(e) => { - if e.instance_unhealthy() { + if e.vmm_gone() { let _ = self - .mark_instance_failed( - &instance_id, - db_instance.runtime(), + .mark_vmm_failed( + &opctx, + authz_instance.clone(), + &initial_vmm, &e, ) .await; } - return Err(e.into()); + Err(InstanceStateChangeError::SledAgent(e)) } } - - Ok(()) } - /// Attempts to move an instance from `prev_instance_runtime` to the - /// `Failed` state in response to an error returned from a call to a sled + /// Attempts to transition an instance to to the `Failed` state in + /// response to an error returned from a call to a sled /// agent instance API, supplied in `reason`. - pub(crate) async fn mark_instance_failed( + /// + /// This is done by first marking the associated `vmm` as `Failed`, and then + /// running an `instance-update` saga which transitions the instance record + /// to `Failed` in response to the active VMM being marked as `Failed`. If + /// the update saga cannot complete the instance state transition, the + /// `instance-updater` RPW will be activated to ensure another update saga + /// is attempted. + /// + /// This method returns an error if the VMM record could not be marked as + /// failed. No error is returned if the instance-update saga could not + /// execute, as this may just mean that another saga is already updating the + /// instance. The update will be performed eventually even if this method + /// could not update the instance. + pub(crate) async fn mark_vmm_failed( &self, - instance_id: &InstanceUuid, - prev_instance_runtime: &db::model::InstanceRuntimeState, - reason: &SledAgentInstancePutError, + opctx: &OpContext, + authz_instance: authz::Instance, + vmm: &db::model::Vmm, + reason: &SledAgentInstanceError, ) -> Result<(), Error> { - error!(self.log, "marking instance failed due to sled agent API error"; + let instance_id = InstanceUuid::from_untyped_uuid(authz_instance.id()); + let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); + error!(self.log, "marking VMM failed due to sled agent API error"; "instance_id" => %instance_id, + "vmm_id" => %vmm_id, "error" => ?reason); - let new_runtime = db::model::InstanceRuntimeState { - nexus_state: db::model::InstanceState::Failed, - - // TODO(#4226): Clearing the Propolis ID is required to allow the - // instance to be deleted, but this doesn't actually terminate the - // VMM. 
- propolis_id: None, - gen: prev_instance_runtime.gen.next().into(), - ..prev_instance_runtime.clone() + let new_runtime = VmmRuntimeState { + state: db::model::VmmState::Failed, + time_state_updated: chrono::Utc::now(), + r#gen: db::model::Generation(vmm.runtime.r#gen.next()), }; - match self - .db_datastore - .instance_update_runtime(&instance_id, &new_runtime) - .await + match self.db_datastore.vmm_update_runtime(&vmm_id, &new_runtime).await { - Ok(_) => info!(self.log, "marked instance as Failed"; - "instance_id" => %instance_id), + Ok(_) => { + info!(self.log, "marked VMM as Failed, preparing update saga"; + "instance_id" => %instance_id, + "vmm_id" => %vmm_id, + "reason" => ?reason, + ); + let saga = instance_update::SagaInstanceUpdate::prepare( + &instance_update::Params { + serialized_authn: authn::saga::Serialized::for_opctx( + opctx, + ), + authz_instance, + }, + )?; + // Unlike `notify_vmm_updated`, which spawns the update + // saga in a "fire and forget" fashion so that it can return a + // HTTP 200 OK as soon as the changed VMM records are persisted + // to the database, `mark_vmm_failed` performs the instance + // update "synchronously". This is so that we can make a + // best-effort attempt to ensure that the instance record will + // be in the failed state prior to returning an error to the + // caller. That way, the caller will not see an error when + // attempting to transition their instance's state, and then, + // upon fetching the instance, transiently see an instance state + // suggesting it's "doing fine" until the update saga completes. + self.update_instance(&opctx.log, saga, instance_id).await; + } // XXX: It's not clear what to do with this error; should it be // bubbled back up to the caller? Err(e) => error!(self.log, "failed to write Failed instance state to DB"; "instance_id" => %instance_id, + "vmm_id" => %vmm_id, "error" => ?e), } @@ -1326,71 +1439,30 @@ impl super::Nexus { propolis_id: PropolisUuid, new_runtime_state: &nexus::SledVmmState, ) -> Result<(), Error> { - let Some((instance_id, saga)) = process_vmm_update( + if let Some((instance_id, saga)) = process_vmm_update( &self.db_datastore, opctx, propolis_id, new_runtime_state, ) .await? - else { - return Ok(()); - }; - - // We don't need to wait for the instance update saga to run to - // completion to return OK to the sled-agent --- all it needs to care - // about is that the VMM/migration state in the database was updated. - // Even if we fail to successfully start an update saga, the - // instance-updater background task will eventually see that the - // instance is in a state which requires an update saga, and ensure that - // one is eventually executed. - // - // Therefore, just spawn the update saga in a new task, and return. - info!(opctx.log, "starting update saga for {instance_id}"; - "instance_id" => %instance_id, - "vmm_state" => ?new_runtime_state.vmm_state, - "migration_state" => ?new_runtime_state.migrations(), - ); - let sagas = self.sagas.clone(); - let task_instance_updater = - self.background_tasks.task_instance_updater.clone(); - let log = opctx.log.clone(); - tokio::spawn(async move { - // TODO(eliza): maybe we should use the lower level saga API so - // we can see if the saga failed due to the lock being held and - // retry it immediately? 
- let running_saga = async move { - let runnable_saga = sagas.saga_prepare(saga).await?; - runnable_saga.start().await - } - .await; - let result = match running_saga { - Err(error) => { - error!(&log, "failed to start update saga for {instance_id}"; - "instance_id" => %instance_id, - "error" => %error, - ); - // If we couldn't start the update saga for this - // instance, kick the instance-updater background task - // to try and start it again in a timely manner. - task_instance_updater.activate(); - return; - } - Ok(saga) => { - saga.wait_until_stopped().await.into_omicron_result() - } - }; - if let Err(error) = result { - error!(&log, "update saga for {instance_id} failed"; - "instance_id" => %instance_id, - "error" => %error, - ); - // If we couldn't complete the update saga for this - // instance, kick the instance-updater background task - // to try and start it again in a timely manner. - task_instance_updater.activate(); - } - }); + { + // We don't need to wait for the instance update saga to run to + // completion to return OK to the sled-agent --- all it needs to care + // about is that the VMM/migration state in the database was updated. + // Even if we fail to successfully start an update saga, the + // instance-updater background task will eventually see that the + // instance is in a state which requires an update saga, and ensure that + // one is eventually executed. + // + // Therefore, just spawn the update saga in a new task, and return. + info!(opctx.log, "starting update saga for {instance_id}"; + "instance_id" => %instance_id, + "vmm_state" => ?new_runtime_state.vmm_state, + "migration_state" => ?new_runtime_state.migrations(), + ); + tokio::spawn(self.update_instance(&opctx.log, saga, instance_id)); + } Ok(()) } @@ -1514,8 +1586,7 @@ impl super::Nexus { .instance_fetch_with_vmm(opctx, &authz_instance) .await?; - let (instance, vmm) = (state.instance(), state.vmm()); - if let Some(vmm) = vmm { + if let Some(vmm) = state.vmm() { match vmm.runtime.state { DbVmmState::Running | DbVmmState::Rebooting @@ -1526,10 +1597,11 @@ impl super::Nexus { DbVmmState::Starting | DbVmmState::Stopping | DbVmmState::Stopped - | DbVmmState::Failed => { + | DbVmmState::Failed + | DbVmmState::Creating => { Err(Error::invalid_request(format!( "cannot connect to serial console of instance in state \"{}\"", - vmm.runtime.state, + state.effective_state(), ))) } @@ -1541,7 +1613,7 @@ impl super::Nexus { Err(Error::invalid_request(format!( "instance is {} and has no active serial console \ server", - instance.runtime().nexus_state + state.effective_state(), ))) } } @@ -1828,6 +1900,67 @@ impl super::Nexus { }) }) } + + /// Returns a future that executes the [instance-update saga] represented by + /// the provided [`steno::SagaDag`]. + /// + /// The instance-update saga may be executed in the calling function by + /// `await`ing the returned future, or spawned to run in the background + /// using `tokio::spawn`. 
+ /// + /// [instance-update saga]: crate::app::sagas::instance_update + fn update_instance( + &self, + log: &slog::Logger, + saga: steno::SagaDag, + instance_id: InstanceUuid, + ) -> impl std::future::Future + Send { + let sagas = self.sagas.clone(); + let task_instance_updater = + self.background_tasks.task_instance_updater.clone(); + let log = log.clone(); + async move { + debug!( + &log, + "preparing instance-update saga for {instance_id}..."; + "instance_id" => %instance_id, + ); + // TODO(eliza): maybe we should use the lower level saga API so + // we can see if the saga failed due to the lock being held and + // retry it immediately? + let running_saga = async move { + let runnable_saga = sagas.saga_prepare(saga).await?; + runnable_saga.start().await + } + .await; + let result = match running_saga { + Err(error) => { + error!(&log, "failed to start update saga for {instance_id}"; + "instance_id" => %instance_id, + "error" => %error, + ); + // If we couldn't start the update saga for this + // instance, kick the instance-updater background task + // to try and start it again in a timely manner. + task_instance_updater.activate(); + return; + } + Ok(saga) => { + saga.wait_until_stopped().await.into_omicron_result() + } + }; + if let Err(error) = result { + error!(&log, "update saga for {instance_id} failed"; + "instance_id" => %instance_id, + "error" => %error, + ); + // If we couldn't complete the update saga for this + // instance, kick the instance-updater background task + // to try and start it again in a timely manner. + task_instance_updater.activate(); + } + } + } } /// Writes the VMM and migration state supplied in `new_runtime_state` to the @@ -1847,8 +1980,6 @@ pub(crate) async fn process_vmm_update( propolis_id: PropolisUuid, new_runtime_state: &nexus::SledVmmState, ) -> Result, Error> { - use sagas::instance_update; - let migrations = new_runtime_state.migrations(); info!(opctx.log, "received new VMM runtime state from sled agent"; "propolis_id" => %propolis_id, @@ -1930,7 +2061,7 @@ fn instance_start_allowed( Ok(InstanceStartDisposition::AlreadyStarted) } - InstanceState::Stopped => { + s @ InstanceState::Stopped | s @ InstanceState::Failed => { match vmm.as_ref() { // If a previous start saga failed and left behind a VMM in the // SagaUnwound state, allow a new start saga to try to overwrite @@ -1945,18 +2076,20 @@ fn instance_start_allowed( Ok(InstanceStartDisposition::Start) } // This shouldn't happen: `InstanceAndVmm::effective_state` should - // only return `Stopped` if there is no active VMM or if the VMM is - // `SagaUnwound`. + // only return `Stopped` or `Failed` if there is no active VMM + // or if the VMM is `SagaUnwound`. Some(vmm) => { error!(log, - "instance is stopped but still has an active VMM"; + "instance is {s:?} but still has an active VMM"; "instance_id" => %instance.id(), "propolis_id" => %vmm.id, "propolis_state" => ?vmm.runtime.state); - Err(Error::internal_error( - "instance is stopped but still has an active VMM", - )) + Err(Error::InternalError { + internal_message: format!( + "instance is {s:?} but still has an active VMM" + ), + }) } // Ah, it's actually stopped. We can restart it. 
None => Ok(InstanceStartDisposition::Start), @@ -1992,7 +2125,7 @@ mod tests { use futures::{SinkExt, StreamExt}; use nexus_db_model::{ Instance as DbInstance, InstanceState as DbInstanceState, - VmmInitialState, VmmState as DbVmmState, + VmmState as DbVmmState, }; use omicron_common::api::external::{ Hostname, IdentityMetadataCreateParams, InstanceCpuCount, Name, @@ -2130,7 +2263,6 @@ mod tests { ipnetwork::IpNetwork::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0) .unwrap(), 0, - VmmInitialState::Starting, ); (instance, vmm) diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 049673d2ee..9f9fefea62 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -7,7 +7,6 @@ use std::net::{IpAddr, Ipv6Addr}; use crate::Nexus; -use chrono::Utc; use nexus_db_model::{ ByteCount, ExternalIp, InstanceState, IpAttachState, Ipv4NatEntry, SledReservationConstraints, SledResource, VmmState, @@ -98,7 +97,6 @@ pub async fn create_and_insert_vmm_record( propolis_id: PropolisUuid, sled_id: SledUuid, propolis_ip: Ipv6Addr, - initial_state: nexus_db_model::VmmInitialState, ) -> Result { let vmm = db::model::Vmm::new( propolis_id, @@ -106,7 +104,6 @@ pub async fn create_and_insert_vmm_record( sled_id, IpAddr::V6(propolis_ip).into(), DEFAULT_PROPOLIS_PORT, - initial_state, ); let vmm = datastore @@ -117,32 +114,6 @@ pub async fn create_and_insert_vmm_record( Ok(vmm) } -/// Given a previously-inserted VMM record, set its state to `SagaUnwound` and -/// then delete it. -/// -/// This function succeeds idempotently if called with the same parameters, -/// provided that the VMM record was not changed by some other actor after the -/// calling saga inserted it. -/// -/// This function is intended to be called when a saga which created a VMM -/// record unwinds. -pub async fn unwind_vmm_record( - datastore: &DataStore, - opctx: &OpContext, - prev_record: &db::model::Vmm, -) -> Result<(), anyhow::Error> { - let new_runtime = db::model::VmmRuntimeState { - state: db::model::VmmState::SagaUnwound, - time_state_updated: Utc::now(), - gen: prev_record.runtime.gen.next().into(), - }; - - let prev_id = PropolisUuid::from_untyped_uuid(prev_record.id); - datastore.vmm_update_runtime(&prev_id, &new_runtime).await?; - datastore.vmm_mark_deleted(&opctx, &prev_id).await?; - Ok(()) -} - /// Allocates a new IPv6 address for a propolis instance that will run on the /// supplied sled. pub(super) async fn allocate_vmm_ipv6( @@ -277,14 +248,14 @@ pub(super) async fn instance_ip_get_instance_state( Some(VmmState::Running) | Some(VmmState::Rebooting), ) => {} - // If the VMM is in the Stopping, Migrating, or Starting states, its - // sled assignment is in doubt, so report a transient state error and - // ask the caller to retry. + // If the VMM is in the Creating, Stopping, Migrating, or Starting + // states, its sled assignment is in doubt, so report a transient state + // error and ask the caller to retry. 
// - // Although an instance with a Starting VMM has a sled assignment, - // there's no way to tell at this point whether or not there's a - // concurrent instance-start saga that has passed the point where it - // sends IP assignments to the instance's new sled: + // Although an instance with a Starting (or Creating) VMM has a sled + // assignment, there's no way to tell at this point whether or not + // there's a concurrent instance-start saga that has passed the point + // where it sends IP assignments to the instance's new sled: // // - If the start saga is still in progress and hasn't pushed any IP // information to the instance's new sled yet, then either of two @@ -306,7 +277,8 @@ pub(super) async fn instance_ip_get_instance_state( Some(state @ VmmState::Starting) | Some(state @ VmmState::Migrating) | Some(state @ VmmState::Stopping) - | Some(state @ VmmState::Stopped), + | Some(state @ VmmState::Stopped) + | Some(state @ VmmState::Creating), ) => { return Err(ActionError::action_failed(Error::unavail(&format!( "can't {verb} in transient state {state}" diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 24d11fcae2..5e5b141c11 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -13,6 +13,7 @@ use crate::app::sagas::{ use nexus_db_queries::db::{identity::Resource, lookup::LookupPath}; use nexus_db_queries::{authn, authz, db}; use nexus_types::internal_api::params::InstanceMigrateRequest; +use omicron_common::api::external::Error; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid, SledUuid}; use serde::Deserialize; use serde::Serialize; @@ -319,7 +320,6 @@ async fn sim_create_vmm_record( propolis_id, sled_id, propolis_ip, - nexus_db_model::VmmInitialState::Migrating, ) .await } @@ -334,16 +334,15 @@ async fn sim_destroy_vmm_record( ¶ms.serialized_authn, ); - let vmm = sagactx.lookup::("dst_vmm_record")?; - info!(osagactx.log(), "destroying vmm record for migration unwind"; - "propolis_id" => %vmm.id); + let propolis_id = sagactx.lookup::("dst_propolis_id")?; + info!( + osagactx.log(), + "destroying vmm record for migration unwind"; + "propolis_id" => %propolis_id, + ); - super::instance_common::unwind_vmm_record( - osagactx.datastore(), - &opctx, - &vmm, - ) - .await + osagactx.datastore().vmm_mark_saga_unwound(&opctx, &propolis_id).await?; + Ok(()) } async fn sim_set_migration_ids( @@ -384,7 +383,7 @@ async fn sim_set_migration_ids( async fn sim_ensure_destination_propolis( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action( @@ -428,9 +427,31 @@ async fn sim_ensure_destination_propolis( }, ) .await - .map_err(ActionError::action_failed)?; - - Ok(()) + .map_err(|err| match err { + InstanceStateChangeError::SledAgent(inner) => { + info!( + osagactx.log(), + "migration saga: sled agent failed to register instance"; + "instance_id" => %db_instance.id(), + "dst_propolis_id" => %dst_propolis_id, + "error" => ?inner, + ); + + // Don't set the instance to Failed in this case. Instead, allow + // the saga to unwind, marking the VMM as SagaUnwound. 
+ ActionError::action_failed(Error::from(inner))
+ }
+ InstanceStateChangeError::Other(inner) => {
+ info!(
+ osagactx.log(),
+ "migration saga: internal error registering instance";
+ "instance_id" => %db_instance.id(),
+ "dst_propolis_id" => %dst_propolis_id,
+ "error" => ?inner,
+ );
+ ActionError::action_failed(inner)
+ }
+ })
}
async fn sim_ensure_destination_propolis_undo(
@@ -460,7 +481,7 @@ async fn sim_ensure_destination_propolis_undo(
{
Ok(_) => Ok(()),
Err(InstanceStateChangeError::SledAgent(inner)) => {
- if !inner.instance_unhealthy() {
+ if !inner.vmm_gone() {
Ok(())
} else {
Err(inner.0.into())
@@ -489,7 +510,7 @@ async fn sim_instance_migrate(
);
let src_propolis_id = db_instance.runtime().propolis_id.unwrap();
- let dst_vmm = sagactx.lookup::<db::model::Vmm>("dst_vmm_record")?;
+ let dst_vmm = sagactx.lookup::<db::model::Vmm>("ensure_destination")?;
info!(osagactx.log(), "initiating migration from destination sled";
"instance_id" => %db_instance.id(),
"dst_vmm_record" => ?dst_vmm,
diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs
index b6b78bd43c..3325cd72f4 100644
--- a/nexus/src/app/sagas/instance_start.rs
+++ b/nexus/src/app/sagas/instance_start.rs
@@ -93,6 +93,12 @@ declare_saga_actions! {
}
}
+/// Node name for looking up the VMM record once it has been registered with the
+/// sled-agent by `sis_ensure_registered`. This is necessary as registering the
+/// VMM transitions it from the `Creating` state to the `Starting` state,
+/// changing its generation.
+const REGISTERED_VMM_RECORD: &'static str = "ensure_registered";
+
#[derive(Debug)]
pub(crate) struct SagaInstanceStart;
impl NexusSaga for SagaInstanceStart {
@@ -194,7 +200,6 @@ async fn sis_create_vmm_record(
propolis_id,
sled_id,
propolis_ip,
- nexus_db_model::VmmInitialState::Starting,
)
.await
}
@@ -209,13 +214,15 @@ async fn sis_destroy_vmm_record(
&params.serialized_authn,
);
- let vmm = sagactx.lookup::<db::model::Vmm>("vmm_record")?;
- super::instance_common::unwind_vmm_record(
- osagactx.datastore(),
- &opctx,
- &vmm,
- )
- .await
+ let propolis_id = sagactx.lookup::<PropolisUuid>("propolis_id")?;
+ info!(
+ osagactx.log(),
+ "destroying vmm record for start saga unwind";
+ "propolis_id" => %propolis_id,
+ );
+
+ osagactx.datastore().vmm_mark_saga_unwound(&opctx, &propolis_id).await?;
+ Ok(())
}
async fn sis_move_to_starting(
@@ -486,7 +493,7 @@ async fn sis_v2p_ensure_undo(
async fn sis_ensure_registered(
sagactx: NexusActionContext,
-) -> Result<(), ActionError> {
+) -> Result<db::model::Vmm, ActionError> {
let params = sagactx.saga_params::<Params>()?;
let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);
@@ -526,9 +533,28 @@ async fn sis_ensure_registered(
InstanceRegisterReason::Start { vmm_id: propolis_id },
)
.await
- .map_err(ActionError::action_failed)?;
-
- Ok(())
+ .map_err(|err| match err {
+ InstanceStateChangeError::SledAgent(inner) => {
+ info!(osagactx.log(),
+ "start saga: sled agent failed to register instance";
+ "instance_id" => %instance_id,
+ "sled_id" => %sled_id,
+ "error" => ?inner);
+
+ // Don't set the instance to Failed in this case. Instead, allow
+ // the saga to unwind and restore the instance to the Stopped
+ // state (matching what would happen if there were a failure
+ // prior to this point).
+ ActionError::action_failed(Error::from(inner))
+ }
+ InstanceStateChangeError::Other(inner) => {
+ info!(osagactx.log(),
+ "start saga: internal error registering instance";
+ "instance_id" => %instance_id,
+ "error" => ?inner);
+ ActionError::action_failed(inner)
+ }
+ })
}
async fn sis_ensure_registered_undo(
@@ -540,6 +566,7 @@ async fn sis_ensure_registered_undo(
let instance_id = InstanceUuid::from_untyped_uuid(params.db_instance.id());
let propolis_id = sagactx.lookup::<PropolisUuid>("propolis_id")?;
let sled_id = sagactx.lookup::<SledUuid>("sled_id")?;
+ let db_vmm = sagactx.lookup::<db::model::Vmm>(REGISTERED_VMM_RECORD)?;
let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);
@@ -552,7 +579,7 @@ async fn sis_ensure_registered_undo(
// Fetch the latest record so that this callee can drive the instance into
// a Failed state if the unregister call fails.
- let (.., db_instance) = LookupPath::new(&opctx, &datastore)
+ let (.., authz_instance, _) = LookupPath::new(&opctx, &datastore)
.instance_id(instance_id.into_untyped_uuid())
.fetch()
.await
@@ -593,9 +620,7 @@ async fn sis_ensure_registered_undo(
// be a bit of a stretch. See the definition of `instance_unhealthy` for
// more details.
match e {
- InstanceStateChangeError::SledAgent(inner)
- if inner.instance_unhealthy() =>
- {
+ InstanceStateChangeError::SledAgent(inner) if inner.vmm_gone() => {
error!(osagactx.log(),
"start saga: failing instance after unregister failure";
"instance_id" => %instance_id,
@@ -603,11 +628,7 @@
if let Err(set_failed_error) = osagactx
.nexus()
- .mark_instance_failed(
- &instance_id,
- db_instance.runtime(),
- &inner,
- )
+ .mark_vmm_failed(&opctx, authz_instance, &db_vmm, &inner)
.await
{
error!(osagactx.log(),
@@ -653,7 +674,7 @@ async fn sis_ensure_running(
let db_instance =
sagactx.lookup::<db::model::Instance>("started_record")?;
- let db_vmm = sagactx.lookup::<db::model::Vmm>("vmm_record")?;
+ let db_vmm = sagactx.lookup::<db::model::Vmm>(REGISTERED_VMM_RECORD)?;
let instance_id = InstanceUuid::from_untyped_uuid(params.db_instance.id());
let sled_id = sagactx.lookup::<SledUuid>("sled_id")?;
info!(osagactx.log(), "start saga: ensuring instance is running";
diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs
index 4c4c4deff2..0038a594d5 100644
--- a/nexus/src/app/sagas/instance_update/mod.rs
+++ b/nexus/src/app/sagas/instance_update/mod.rs
@@ -361,7 +361,6 @@ use chrono::Utc;
use nexus_db_queries::{authn, authz};
use nexus_types::identity::Resource;
use omicron_common::api::external::Error;
-use omicron_common::api::internal::nexus;
use omicron_common::api::internal::nexus::SledVmmState;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::InstanceUuid;
@@ -413,12 +412,12 @@ pub fn update_saga_needed(
) -> bool {
// Currently, an instance-update saga is required if (and only if):
//
- // - The instance's active VMM has transitioned to `Destroyed`. We don't
+ // - The instance's active VMM has transitioned to `Destroyed` or `Failed`. We don't
// actually know whether the VMM whose state was updated here was the
// active VMM or not, so we will always attempt to run an instance-update
- // saga if the VMM was `Destroyed`.
- let vmm_needs_update = result.vmm_updated
- && state.vmm_state.state == nexus::VmmState::Destroyed;
+ // saga if the VMM was `Destroyed` (or `Failed`).
+ let vmm_needs_update =
+ result.vmm_updated && state.vmm_state.state.is_terminal();
// - A migration in to this VMM has transitioned to a terminal state
// (`Failed` or `Completed`).
let migrations = state.migrations(); @@ -514,12 +513,13 @@ impl UpdatesRequired { let instance_id = snapshot.instance.id(); let mut update_required = false; + let mut active_vmm_failed = false; let mut network_config = None; // Has the active VMM been destroyed? let destroy_active_vmm = snapshot.active_vmm.as_ref().and_then(|active_vmm| { - if active_vmm.runtime.state == VmmState::Destroyed { + if active_vmm.runtime.state.is_terminal() { let id = PropolisUuid::from_untyped_uuid(active_vmm.id); // Unlink the active VMM ID. If the active VMM was destroyed // because a migration out completed, the next block, which @@ -527,6 +527,12 @@ impl UpdatesRequired { // instead. new_runtime.propolis_id = None; update_required = true; + + // If the active VMM's state is `Failed`, move the + // instance's new state to `Failed` rather than to `NoVmm`. + if active_vmm.runtime.state == VmmState::Failed { + active_vmm_failed = true; + } Some(id) } else { None @@ -536,7 +542,9 @@ impl UpdatesRequired { // Okay, what about the target? let destroy_target_vmm = snapshot.target_vmm.as_ref().and_then(|target_vmm| { - if target_vmm.runtime.state == VmmState::Destroyed { + // XXX(eliza): AFAIK, target VMMs don't go to `Failed` until + // they become active, but...IDK. double-check that. + if target_vmm.runtime.state.is_terminal() { // Unlink the target VMM ID. new_runtime.dst_propolis_id = None; update_required = true; @@ -672,7 +680,11 @@ impl UpdatesRequired { // was any actual state change. // We no longer have a VMM. - new_runtime.nexus_state = InstanceState::NoVmm; + new_runtime.nexus_state = if active_vmm_failed { + InstanceState::Failed + } else { + InstanceState::NoVmm + }; // If the active VMM was destroyed and the instance has not migrated // out of it, we must delete the instance's network configuration. // @@ -2345,11 +2357,36 @@ mod test { let instance = create_instance(client).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + let log = &cptestctx.logctx.log; + info!( + &log, + "created test instance"; + "instance_id" => %instance_id, + "instance" => ?instance + ); // Poke the instance to get it into the Running state. 
- let state =
- test_helpers::instance_fetch(cptestctx, instance_id).await;
+
+ info!(
+ &log,
+ "advancing test instance to Running...";
+ "instance_id" => %instance_id,
+ );
test_helpers::instance_simulate(cptestctx, &instance_id).await;
+ let state = test_helpers::instance_wait_for_state(
+ cptestctx,
+ instance_id,
+ InstanceState::Vmm,
+ )
+ .await;
+
+ info!(
+ &log,
+ "simulated test instance";
+ "instance_id" => %instance_id,
+ "instance_state" => ?state.instance(),
+ "vmm_state" => ?state.vmm(),
+ );
let vmm = state.vmm().as_ref().unwrap();
let dst_sled_id =
@@ -2593,12 +2630,12 @@ mod test {
assert_instance_unlocked(instance);
assert_instance_record_is_consistent(instance);
- let target_destroyed = self
+ let target_vmm_state = self
.outcome
.target
.as_ref()
- .map(|(_, state)| state == &VmmState::Destroyed)
- .unwrap_or(false);
+ .map(|&(_, state)| state)
+ .unwrap_or(VmmState::Migrating);
if self.outcome.failed {
assert_eq!(
@@ -2610,11 +2647,11 @@ mod test {
"target VMM ID must be unset when a migration has failed"
);
} else {
- if dbg!(target_destroyed) {
+ if dbg!(target_vmm_state).is_terminal() {
assert_eq!(
active_vmm_id, None,
- "if the target VMM was destroyed, it should be unset, \
- even if a migration succeeded",
+ "if the target VMM was {target_vmm_state}, it should \
+ be unset, even if a migration succeeded",
);
assert_eq!(
instance_runtime.nexus_state,
@@ -2662,37 +2699,48 @@ mod test {
}
}
- let src_destroyed = self
+ let src_vmm_state = self
.outcome
.source
.as_ref()
- .map(|(_, state)| state == &VmmState::Destroyed)
- .unwrap_or(false);
+ .map(|&(_, state)| state)
+ .unwrap_or(VmmState::Migrating);
assert_eq!(
self.src_resource_records_exist(cptestctx).await,
- !src_destroyed,
- "source VMM should exist if and only if the source hasn't been destroyed",
+ !dbg!(src_vmm_state).is_terminal(),
+ "source VMM resource records should exist if and only if the \
+ source VMM is not in a terminal state (Destroyed/Failed)",
);
assert_eq!(
self.target_resource_records_exist(cptestctx).await,
- !target_destroyed,
- "target VMM should exist if and only if the target hasn't been destroyed",
+ !target_vmm_state.is_terminal(),
+ "target VMM resource records should exist if and only if the \
+ target VMM is not in a terminal state (Destroyed/Failed)",
);
- // VThe instance has a VMM if (and only if):
- let has_vmm = if self.outcome.failed {
+ fn expected_instance_state(vmm: VmmState) -> InstanceState {
+ match vmm {
+ VmmState::Destroyed => InstanceState::NoVmm,
+ VmmState::Failed => InstanceState::Failed,
+ _ => InstanceState::Vmm,
+ }
+ }
+
+ // The instance has a VMM if (and only if):
+ let instance_state = if self.outcome.failed {
// If the migration failed, the instance should have a VMM if
// and only if the source VMM is still okay. It doesn't matter
// whether the target is still there or not, because we didn't
// migrate to it successfully.
- !src_destroyed
+ expected_instance_state(src_vmm_state)
} else {
// Otherwise, if the migration succeeded, the instance should be
// on the target VMM, and virtual provisioning records should
- // exist as long as the
+ // exist as long as the target is okay.
+ expected_instance_state(target_vmm_state)
};
+ let has_vmm = instance_state == InstanceState::Vmm;
assert_eq!(
no_virtual_provisioning_resource_records_exist(cptestctx).await,
@@ -2710,8 +2758,6 @@ mod test {
as the instance has a VMM",
);
- let instance_state =
- if has_vmm { InstanceState::Vmm } else { InstanceState::NoVmm };
assert_eq!(instance_runtime.nexus_state, instance_state);
}
diff --git a/nexus/src/app/sagas/region_replacement_drive.rs b/nexus/src/app/sagas/region_replacement_drive.rs
index e2f7620178..d95591848e 100644
--- a/nexus/src/app/sagas/region_replacement_drive.rs
+++ b/nexus/src/app/sagas/region_replacement_drive.rs
@@ -577,26 +577,17 @@ async fn check_from_previous_propolis_step(
Ok(DriveCheck::LastStepStillRunning)
}
- VmmState::Starting => {
- // This state is unexpected, considering Nexus previously sent a
- // target replacement request to this propolis!
+ // These states are unexpected, considering Nexus previously sent a
+ // target replacement request to this propolis!
+ VmmState::Starting | VmmState::Creating
+ // This state is unexpected because we should have already
+ // returned `DriveCheck::Wait` above.
+ | VmmState::Migrating => {
return Err(ActionError::action_failed(format!(
- "vmm {} propolis is Starting",
- step_vmm_id,
+ "vmm {step_vmm_id} propolis is {state}",
)));
}
-
- VmmState::Migrating => {
- // This state is unexpected because we should have already
- // returned `DriveCheck::Wait` above.
-
- return Err(ActionError::action_failed(format!(
- "vmm {} propolis is Migrating!",
- step_vmm_id,
- )));
- }
-
VmmState::Stopping
| VmmState::Stopped
| VmmState::Failed
@@ -923,7 +914,8 @@ async fn srrd_drive_region_replacement_prepare(
| VmmState::Migrating
| VmmState::Failed
| VmmState::Destroyed
- | VmmState::SagaUnwound => {
+ | VmmState::SagaUnwound
+ | VmmState::Creating => {
// Propolis server is not ok to receive volume
// replacement requests, bail out
return Err(ActionError::action_failed(format!(
diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs
index 600bb155bf..507904736a 100644
--- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs
+++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs
@@ -462,7 +462,8 @@ async fn rsrss_notify_upstairs(
| VmmState::Migrating
| VmmState::Failed
| VmmState::Destroyed
- | VmmState::SagaUnwound => {
+ | VmmState::SagaUnwound
+ | VmmState::Creating => {
// Propolis server is not ok to receive volume replacement requests
// - unwind so that this saga can run again.
return Err(ActionError::action_failed(format!(
diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs
index a7228e0841..25c8056b22 100644
--- a/nexus/tests/integration_tests/instances.rs
+++ b/nexus/tests/integration_tests/instances.rs
@@ -1106,15 +1106,117 @@ async fn test_instance_migrate_v2p_and_routes(
}
// Verifies that if a request to reboot or stop an instance fails because of a
-// 500-level error from sled agent, then the instance moves to the Failed state.
+// 404 error from sled agent, then the instance moves to the Failed state, and
+// can be restarted once it has transitioned to that state.
#[nexus_test]
-async fn test_instance_failed_after_sled_agent_error(
+async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_restarted(
+ cptestctx: &ControlPlaneTestContext,
+) {
+ let client = &cptestctx.external_client;
+
+ let instance_name = "losing-is-fun";
+ let instance_id = make_forgotten_instance(&cptestctx, instance_name).await;
+
+ // Attempting to reboot the forgotten instance will result in a 404
+ // NO_SUCH_INSTANCE from the sled-agent, which Nexus turns into a 503.
+ expect_instance_reboot_fail(
+ client,
+ instance_name,
+ http::StatusCode::SERVICE_UNAVAILABLE,
+ )
+ .await;
+
+ // Wait for the instance to transition to Failed.
+ instance_wait_for_state(client, instance_id, InstanceState::Failed).await;
+
+ // Now, the instance should be restartable.
+ expect_instance_start_ok(client, instance_name).await;
+}
+
+// Verifies that if a request to reboot or stop an instance fails because of a
+// 404 error from sled agent, then the instance moves to the Failed state, and
+// can be deleted once it has transitioned to that state.
+#[nexus_test]
+async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_deleted(
+ cptestctx: &ControlPlaneTestContext,
+) {
+ let client = &cptestctx.external_client;
+
+ let instance_name = "losing-is-fun";
+ let instance_id = make_forgotten_instance(&cptestctx, instance_name).await;
+
+ // Attempting to reboot the forgotten instance will result in a 404
+ // NO_SUCH_INSTANCE from the sled-agent, which Nexus turns into a 503.
+ expect_instance_reboot_fail(
+ client,
+ instance_name,
+ http::StatusCode::SERVICE_UNAVAILABLE,
+ )
+ .await;
+
+ // Wait for the instance to transition to Failed.
+ instance_wait_for_state(client, instance_id, InstanceState::Failed).await;
+
+ // Now, the instance should be deleteable.
+ expect_instance_delete_ok(client, instance_name).await;
+}
+
+// Verifies that the instance-watcher background task transitions an instance
+// to Failed when the sled-agent returns a 404, and that the instance can be
+// deleted after it transitions to Failed.
+#[nexus_test]
+async fn test_instance_failed_by_instance_watcher_can_be_deleted(
+ cptestctx: &ControlPlaneTestContext,
+) {
+ let client = &cptestctx.external_client;
+ let instance_name = "losing-is-fun";
+ let instance_id = make_forgotten_instance(&cptestctx, instance_name).await;
+
+ nexus_test_utils::background::activate_background_task(
+ &cptestctx.internal_client,
+ "instance_watcher",
+ )
+ .await;
+
+ // Wait for the instance to transition to Failed.
+ instance_wait_for_state(client, instance_id, InstanceState::Failed).await;
+
+ // Now, the instance should be deleteable.
+ expect_instance_delete_ok(&client, instance_name).await;
+}
+
+// Verifies that the instance-watcher background task transitions an instance
+// to Failed when the sled-agent returns a 404, and that the instance can be
+// restarted after it transitions to Failed.
+#[nexus_test]
+async fn test_instance_failed_by_instance_watcher_can_be_restarted(
+ cptestctx: &ControlPlaneTestContext,
+) {
+ let client = &cptestctx.external_client;
+ let instance_name = "losing-is-fun";
+ let instance_id = make_forgotten_instance(&cptestctx, instance_name).await;
+
+ nexus_test_utils::background::activate_background_task(
+ &cptestctx.internal_client,
+ "instance_watcher",
+ )
+ .await;
+
+ // Wait for the instance to transition to Failed.
+ instance_wait_for_state(client, instance_id, InstanceState::Failed).await;
+
+ // Now, the instance should be restartable.
+ expect_instance_start_ok(client, instance_name).await;
+}
+
+#[nexus_test]
+async fn test_instances_are_not_marked_failed_on_other_sled_agent_errors(
cptestctx: &ControlPlaneTestContext,
) {
let client = &cptestctx.external_client;
let apictx = &cptestctx.server.server_context();
let nexus = &apictx.nexus;
- let instance_name = "losing-is-fun";
+ let instance_name = "my-cool-instance";
// Create and start the test instance.
create_project_and_pool(&client).await;
@@ -1129,62 +1231,142 @@ async fn test_instance_failed_after_sled_agent_error(
sled_agent
.set_instance_ensure_state_error(Some(
omicron_common::api::external::Error::internal_error(
- "injected by test_instance_failed_after_sled_agent_error",
+ "somebody set up us the bomb",
),
))
.await;
- let url = get_instance_url(format!("{}/reboot", instance_name).as_str());
- NexusRequest::new(
- RequestBuilder::new(client, Method::POST, &url)
- .body(None as Option<&serde_json::Value>),
+ // Attempting to reboot the instance will fail, but it should *not* go to
+ // Failed.
+ expect_instance_reboot_fail(
+ client,
+ instance_name,
+ http::StatusCode::INTERNAL_SERVER_ERROR,
)
- .authn_as(AuthnMode::PrivilegedUser)
- .execute()
- .await
- .unwrap()
- .parsed_body::()
- .expect_err("expected injected failure");
+ .await;
let instance_next = instance_get(&client, &instance_url).await;
- assert_eq!(instance_next.runtime.run_state, InstanceState::Failed);
-
- NexusRequest::object_delete(client, &get_instance_url(instance_name))
- .authn_as(AuthnMode::PrivilegedUser)
- .execute()
- .await
- .unwrap();
+ assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
sled_agent.set_instance_ensure_state_error(None).await;
+ expect_instance_reboot_ok(client, instance_name).await;
+}
+
+#[nexus_test]
+async fn test_instances_are_not_marked_failed_on_other_sled_agent_errors_by_instance_watcher(
+ cptestctx: &ControlPlaneTestContext,
+) {
+ let client = &cptestctx.external_client;
+ let apictx = &cptestctx.server.server_context();
+ let nexus = &apictx.nexus;
+ let instance_name = "my-cool-instance";
+
+ // Create and start the test instance.
+ create_project_and_pool(&client).await;
+ let instance_url = get_instance_url(instance_name);
let instance = create_instance(client, PROJECT_NAME, instance_name).await;
let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id);
instance_simulate(nexus, &instance_id).await;
let instance_next = instance_get(&client, &instance_url).await;
assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
+ let sled_agent = &cptestctx.sled_agent.sled_agent;
sled_agent
.set_instance_ensure_state_error(Some(
omicron_common::api::external::Error::internal_error(
- "injected by test_instance_failed_after_sled_agent_error",
+ "you have no chance to survive, make your time",
),
))
.await;
- let url = get_instance_url(format!("{}/stop", instance_name).as_str());
- NexusRequest::new(
- RequestBuilder::new(client, Method::POST, &url)
- .body(None as Option<&serde_json::Value>),
+ nexus_test_utils::background::activate_background_task(
+ &cptestctx.internal_client,
+ "instance_watcher",
)
- .authn_as(AuthnMode::PrivilegedUser)
- .execute()
- .await
- .unwrap()
- .parsed_body::()
- .expect_err("expected injected failure");
+ .await;
+
+ let instance_next = instance_get(&client, &instance_url).await;
+ assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
+}
+
+/// Prepare an instance and sled-agent for one of the "instance fails after
+/// sled-agent forgets VMM" tests.
+async fn make_forgotten_instance( + cptestctx: &ControlPlaneTestContext, + instance_name: &str, +) -> InstanceUuid { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.server_context(); + let nexus = &apictx.nexus; + // Create and start the test instance. + create_project_and_pool(&client).await; + let instance_url = get_instance_url(instance_name); + let instance = create_instance(client, PROJECT_NAME, instance_name).await; + let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + instance_simulate(nexus, &instance_id).await; let instance_next = instance_get(&client, &instance_url).await; - assert_eq!(instance_next.runtime.run_state, InstanceState::Failed); + assert_eq!(instance_next.runtime.run_state, InstanceState::Running); + + // Forcibly unregister the instance from the sled-agent without telling + // Nexus. It will now behave as though it has forgotten the instance and + // return a 404 error with the "NO_SUCH_INSTANCE" error code + let vmm_id = nexus + .active_instance_info(&instance_id, None) + .await + .unwrap() + .expect("running instance must be on a sled") + .propolis_id; + let sled_agent = &cptestctx.sled_agent.sled_agent; + sled_agent + .instance_unregister(vmm_id) + .await + .expect("instance_unregister must succeed"); + + instance_id +} + +async fn expect_instance_delete_ok( + client: &ClientTestContext, + instance_name: &str, +) { + NexusRequest::object_delete(&client, &get_instance_url(instance_name)) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("instance should be deleted"); +} + +async fn expect_instance_reboot_fail( + client: &ClientTestContext, + instance_name: &str, + status: http::StatusCode, +) { + let url = get_instance_url(format!("{instance_name}/reboot").as_str()); + let builder = RequestBuilder::new(client, Method::POST, &url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(status)); + NexusRequest::new(builder) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("expected instance reboot to fail"); +} + +async fn expect_instance_reboot_ok( + client: &ClientTestContext, + instance_name: &str, +) { + let url = get_instance_url(format!("{instance_name}/reboot").as_str()); + let builder = RequestBuilder::new(client, Method::POST, &url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(http::StatusCode::ACCEPTED)); + NexusRequest::new(builder) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("expected instance reboot to succeed"); } /// Assert values for fleet, silo, and project using both system and silo diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index e851d2ed6b..9906a94ac6 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1001,6 +1001,14 @@ CREATE TYPE IF NOT EXISTS omicron.public.instance_state_v2 AS ENUM ( ); CREATE TYPE IF NOT EXISTS omicron.public.vmm_state AS ENUM ( + /* + * The VMM is known to Nexus, but may not yet exist on a sled. + * + * VMM records are always inserted into the database in this state, and + * then transition to 'starting' or 'migrating' once a sled-agent reports + * that the VMM has been registered. 
+ */ + 'creating', 'starting', 'running', 'stopping', @@ -4225,7 +4233,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '93.0.0', NULL) + (TRUE, NOW(), NOW(), '94.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/put-back-creating-vmm-state/up.sql b/schema/crdb/put-back-creating-vmm-state/up.sql new file mode 100644 index 0000000000..d183a15eac --- /dev/null +++ b/schema/crdb/put-back-creating-vmm-state/up.sql @@ -0,0 +1,2 @@ +ALTER TYPE omicron.public.vmm_state + ADD VALUE IF NOT EXISTS 'creating' BEFORE 'starting'; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index aaac7f63d0..cf3c022551 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -439,30 +439,14 @@ impl SledAgent { self: &Arc, propolis_id: PropolisUuid, state: VmmStateRequested, - ) -> Result { + ) -> Result { if let Some(e) = self.instance_ensure_state_error.lock().await.as_ref() { - return Err(e.clone()); + return Err(e.clone().into()); } - let current = match self - .vmms - .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) - .await - { - Ok(i) => i.current().clone(), - Err(_) => match state { - VmmStateRequested::Stopped => { - return Ok(VmmPutStateResponse { updated_runtime: None }); - } - _ => { - return Err(Error::invalid_request(&format!( - "Propolis {} not registered on sled", - propolis_id, - ))); - } - }, - }; + let current = + self.get_sim_instance(propolis_id).await?.current().clone(); let mock_lock = self.mock_propolis.lock().await; if let Some((_srv, client)) = mock_lock.as_ref() { @@ -470,7 +454,8 @@ impl SledAgent { VmmStateRequested::MigrationTarget(_) => { return Err(Error::internal_error( "migration not implemented for mock Propolis", - )); + ) + .into()); } VmmStateRequested::Running => { let vmms = self.vmms.clone(); @@ -506,7 +491,13 @@ impl SledAgent { } }; client.instance_state_put().body(body).send().await.map_err( - |e| Error::internal_error(&format!("propolis-client: {}", e)), + |e| { + crate::sled_agent::Error::Instance( + crate::instance_manager::Error::Instance( + crate::instance::Error::Propolis(e), // whew! + ), + ) + }, )?; } @@ -518,20 +509,27 @@ impl SledAgent { Ok(VmmPutStateResponse { updated_runtime: Some(new_state) }) } - pub async fn instance_get_state( + /// Wrapper around `sim_get_cloned_object` that returns the same error as + /// the real sled-agent on an unknown VMM. + async fn get_sim_instance( &self, propolis_id: PropolisUuid, - ) -> Result { - let instance = self - .vmms + ) -> Result { + self.vmms .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) .await .map_err(|_| { crate::sled_agent::Error::Instance( crate::instance_manager::Error::NoSuchVmm(propolis_id), ) - })?; - Ok(instance.current()) + }) + } + + pub async fn instance_get_state( + &self, + propolis_id: PropolisUuid, + ) -> Result { + Ok(self.get_sim_instance(propolis_id).await?.current()) } pub async fn instance_simulate_migration_source( @@ -539,15 +537,7 @@ impl SledAgent { propolis_id: PropolisUuid, migration: instance::SimulateMigrationSource, ) -> Result<(), HttpError> { - let instance = self - .vmms - .sim_get_cloned_object(&propolis_id.into_untyped_uuid()) - .await - .map_err(|_| { - crate::sled_agent::Error::Instance( - crate::instance_manager::Error::NoSuchVmm(propolis_id), - ) - })?; + let instance = self.get_sim_instance(propolis_id).await?; instance.set_simulated_migration_source(migration); Ok(()) }
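The terminal-state rule that ties these changes together can be restated compactly. The following is a minimal sketch only, using simplified stand-in enums and a free function rather than the actual `nexus_db_model` types or the real `instance-update` saga code: a VMM in `Failed` or `Destroyed` is terminal, and when an instance's active VMM reaches a terminal state, the update saga unlinks it and moves the instance to `Failed` for a `Failed` VMM or to `NoVmm` for a `Destroyed` one.

// Sketch only: local stand-ins for the Nexus types touched in this patch,
// not the real definitions.
#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum VmmState {
    Creating,
    Starting,
    Running,
    Rebooting,
    Migrating,
    Stopping,
    Stopped,
    Failed,
    Destroyed,
    SagaUnwound,
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum InstanceState {
    Vmm,
    NoVmm,
    Failed,
}

impl VmmState {
    // Mirrors the `is_terminal()` helper: a terminal VMM no longer exists on
    // a sled and must be cleaned up by an instance-update saga.
    fn is_terminal(self) -> bool {
        matches!(self, VmmState::Failed | VmmState::Destroyed)
    }
}

// The rule applied to the instance record once its active VMM is terminal:
// `Failed` propagates to the instance, `Destroyed` leaves it with no VMM.
fn instance_state_after_active_vmm(vmm: VmmState) -> Option<InstanceState> {
    if !vmm.is_terminal() {
        return None; // no update saga is needed for this VMM transition
    }
    Some(if vmm == VmmState::Failed {
        InstanceState::Failed
    } else {
        InstanceState::NoVmm
    })
}

fn main() {
    assert_eq!(instance_state_after_active_vmm(VmmState::Running), None);
    assert_eq!(
        instance_state_after_active_vmm(VmmState::Destroyed),
        Some(InstanceState::NoVmm)
    );
    assert_eq!(
        instance_state_after_active_vmm(VmmState::Failed),
        Some(InstanceState::Failed)
    );
}

The assertions double as a usage example; in the patch itself the equivalent decisions are made by `VmmState::is_terminal()`, `update_saga_needed`, and `UpdatesRequired`.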