From c656a4033953ae417fa811490e606e8808e0616f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 28 Aug 2024 12:30:39 -0700 Subject: [PATCH] add `VmmState::Creating` This commit introduces a new `VmmState` variant, `Creating`. The purpose of this variant is to differentiate between VMMs which have been created in the database but do not yet exist on a sled, from those which are known to sled-agent and are actually starting/migrating. This is necessary because presently, the `instance-watcher` background task will attempt to perform health checks for `Starting` and `Migrating` VMMs, and --- if the sled-agent does not actually know about the VMM yet --- will move it to `Failed`. But, when a VMM is newly created, the sled-agent won't know about it yet until it has been registered with the sled-agent, so if the `instance-watcher` task activates while an `instance-start` or `instance-migrate` saga is in flight, the `instance-watcher` task may incorrectly move its VMM to `Failed` if the VMM record has been created but not yet registered. Now, after adding the `Creating` variant, we can avoid performing health checks for those VMMs until the sled-agent actually acknowledges a request to register the VMM. 
--- clients/nexus-client/src/lib.rs | 2 + clients/sled-agent-client/src/lib.rs | 2 + common/src/api/external/mod.rs | 9 ++- common/src/api/internal/nexus.rs | 6 ++ nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-model/src/vmm.rs | 16 +--- nexus/db-model/src/vmm_state.rs | 24 +++++- nexus/db-queries/src/db/datastore/instance.rs | 12 ++- nexus/src/app/instance.rs | 43 ++++++++--- nexus/src/app/sagas/instance_common.rs | 13 ++-- nexus/src/app/sagas/instance_migrate.rs | 25 +++++-- nexus/src/app/sagas/instance_start.rs | 26 ++++--- nexus/src/app/sagas/instance_update/mod.rs | 74 +++++++++++++------ .../src/app/sagas/region_replacement_drive.rs | 26 +++---- .../sagas/region_snapshot_replacement_step.rs | 3 +- openapi/nexus-internal.json | 7 ++ openapi/sled-agent.json | 7 ++ schema/crdb/dbinit.sql | 10 ++- .../crdb/put-back-creating-vmm-state/up.sql | 2 + sled-agent/src/sim/instance.rs | 1 + 20 files changed, 219 insertions(+), 92 deletions(-) create mode 100644 schema/crdb/put-back-creating-vmm-state/up.sql diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 97f6373e29c..48184406ce2 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -95,6 +95,7 @@ impl From for types::VmmState { fn from(s: omicron_common::api::internal::nexus::VmmState) -> Self { use omicron_common::api::internal::nexus::VmmState as Input; match s { + Input::Creating => types::VmmState::Creating, Input::Starting => types::VmmState::Starting, Input::Running => types::VmmState::Running, Input::Stopping => types::VmmState::Stopping, @@ -111,6 +112,7 @@ impl From for omicron_common::api::internal::nexus::VmmState { fn from(s: types::VmmState) -> Self { use omicron_common::api::internal::nexus::VmmState as Output; match s { + types::VmmState::Creating => Output::Creating, types::VmmState::Starting => Output::Starting, types::VmmState::Running => Output::Running, types::VmmState::Stopping => Output::Stopping, diff --git 
a/clients/sled-agent-client/src/lib.rs b/clients/sled-agent-client/src/lib.rs index be19659c69d..eb0cb59dc09 100644 --- a/clients/sled-agent-client/src/lib.rs +++ b/clients/sled-agent-client/src/lib.rs @@ -105,6 +105,7 @@ impl From for types::VmmState { fn from(s: omicron_common::api::internal::nexus::VmmState) -> Self { use omicron_common::api::internal::nexus::VmmState as Input; match s { + Input::Creating => types::VmmState::Creating, Input::Starting => types::VmmState::Starting, Input::Running => types::VmmState::Running, Input::Stopping => types::VmmState::Stopping, @@ -143,6 +144,7 @@ impl From for omicron_common::api::internal::nexus::VmmState { fn from(s: types::VmmState) -> Self { use omicron_common::api::internal::nexus::VmmState as Output; match s { + types::VmmState::Creating => Output::Creating, types::VmmState::Starting => Output::Starting, types::VmmState::Running => Output::Running, types::VmmState::Stopping => Output::Stopping, diff --git a/common/src/api/external/mod.rs b/common/src/api/external/mod.rs index 58cace30323..0b58a04bd43 100644 --- a/common/src/api/external/mod.rs +++ b/common/src/api/external/mod.rs @@ -1064,7 +1064,14 @@ impl From for InstanceState { fn from(state: crate::api::internal::nexus::VmmState) -> Self { use crate::api::internal::nexus::VmmState as InternalVmmState; match state { - InternalVmmState::Starting => Self::Starting, + // An instance with a VMM which is in the `Creating` state maps to + // `InstanceState::Starting`, rather than `InstanceState::Creating`. + // If we are still creating the VMM, this is because we are + // attempting to *start* the instance; instances may be created + // without creating a VMM to run them, and then started later. 
+ InternalVmmState::Creating | InternalVmmState::Starting => { + Self::Starting + } InternalVmmState::Running => Self::Running, InternalVmmState::Stopping => Self::Stopping, InternalVmmState::Stopped => Self::Stopped, diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index 47f0d7c59d2..6338d67b03b 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -83,6 +83,12 @@ pub struct InstanceRuntimeState { )] #[serde(rename_all = "snake_case")] pub enum VmmState { + /// The VMM is known to Nexus, but may not yet exist on a sled. + /// + /// VMM records are always inserted into the database in this state, and + /// then transition to 'starting' or 'migrating' once a sled-agent reports + /// that the VMM has been registered. + Creating, /// The VMM is initializing and has not started running guest CPUs yet. Starting, /// The VMM has finished initializing and may be running guest CPUs. diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 2438f37fba8..54718bbddb8 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(93, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(94, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. 
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(94, "put-back-creating-vmm-state"), KnownVersion::new(93, "dataset-kinds-zone-and-debug"), KnownVersion::new(92, "lldp-link-config-nullable"), KnownVersion::new(91, "add-management-gateway-producer-kind"), diff --git a/nexus/db-model/src/vmm.rs b/nexus/db-model/src/vmm.rs index 4cc6235e1d6..45411f587ac 100644 --- a/nexus/db-model/src/vmm.rs +++ b/nexus/db-model/src/vmm.rs @@ -60,27 +60,19 @@ pub struct Vmm { pub runtime: VmmRuntimeState, } -/// The set of states that a VMM can have when it is created. -pub enum VmmInitialState { - Starting, - Migrating, -} - impl Vmm { /// Creates a new VMM record. + /// + /// The new VMM record will be in [`VmmState::Creating`] until it is + /// registered with a sled-agent. pub fn new( id: PropolisUuid, instance_id: InstanceUuid, sled_id: SledUuid, propolis_ip: ipnetwork::IpNetwork, propolis_port: u16, - initial_state: VmmInitialState, ) -> Self { let now = Utc::now(); - let state = match initial_state { - VmmInitialState::Starting => VmmState::Starting, - VmmInitialState::Migrating => VmmState::Migrating, - }; Self { id: id.into_untyped_uuid(), @@ -91,7 +83,7 @@ impl Vmm { propolis_ip, propolis_port: SqlU16(propolis_port), runtime: VmmRuntimeState { - state, + state: VmmState::Creating, time_state_updated: now, gen: Generation::new(), }, diff --git a/nexus/db-model/src/vmm_state.rs b/nexus/db-model/src/vmm_state.rs index b07d82a4110..8d3c3f2d9c4 100644 --- a/nexus/db-model/src/vmm_state.rs +++ b/nexus/db-model/src/vmm_state.rs @@ -17,6 +17,7 @@ impl_enum_type!( #[diesel(sql_type = VmmStateEnum)] pub enum VmmState; + Creating => b"creating" Starting => b"starting" Running => b"running" Stopping => b"stopping" @@ -31,6 +32,7 @@ impl_enum_type!( impl VmmState { pub fn label(&self) -> &'static str { match self { + VmmState::Creating => "creating", VmmState::Starting => "starting", VmmState::Running => "running", VmmState::Stopping => 
"stopping", @@ -51,9 +53,21 @@ impl VmmState { pub const TERMINAL_STATES: &'static [Self] = &[Self::Destroyed, Self::Failed]; + /// States in which a VMM record is present in the database but is not + /// resident on a sled, either because it does not yet exist, was produced + /// by an unwound update saga and will never exist, or has already been + /// destroyed. + pub const NONEXISTENT_STATES: &'static [Self] = + &[Self::Creating, Self::SagaUnwound, Self::Destroyed]; + pub fn is_terminal(&self) -> bool { Self::TERMINAL_STATES.contains(self) } + /// Returns `true` if the instance is in a state in which it exists on a + /// sled. + pub fn exists_on_sled(&self) -> bool { + !Self::NONEXISTENT_STATES.contains(self) + } } impl fmt::Display for VmmState { @@ -66,6 +80,7 @@ impl From for omicron_common::api::internal::nexus::VmmState { fn from(value: VmmState) -> Self { use omicron_common::api::internal::nexus::VmmState as Output; match value { + VmmState::Creating => Output::Creating, VmmState::Starting => Output::Starting, VmmState::Running => Output::Running, VmmState::Stopping => Output::Stopping, @@ -82,6 +97,7 @@ impl From for sled_agent_client::types::VmmState { fn from(value: VmmState) -> Self { use sled_agent_client::types::VmmState as Output; match value { + VmmState::Creating => Output::Creating, VmmState::Starting => Output::Starting, VmmState::Running => Output::Running, VmmState::Stopping => Output::Stopping, @@ -98,6 +114,7 @@ impl From for VmmState { fn from(value: ApiState) -> Self { use VmmState as Output; match value { + ApiState::Creating => Output::Creating, ApiState::Starting => Output::Starting, ApiState::Running => Output::Running, ApiState::Stopping => Output::Stopping, @@ -115,7 +132,12 @@ impl From for omicron_common::api::external::InstanceState { use omicron_common::api::external::InstanceState as Output; match value { - VmmState::Starting => Output::Starting, + // An instance with a VMM which is in the `Creating` state maps to + // 
`InstanceState::Starting`, rather than `InstanceState::Creating`. + // If we are still creating the VMM, this is because we are + // attempting to *start* the instance; instances may be created + // without creating a VMM to run them, and then started later. + VmmState::Creating | VmmState::Starting => Output::Starting, VmmState::Running => Output::Running, VmmState::Stopping => Output::Stopping, // `SagaUnwound` should map to `Stopped` so that an `instance_view` diff --git a/nexus/db-queries/src/db/datastore/instance.rs b/nexus/db-queries/src/db/datastore/instance.rs index 474014fb864..0dda1ce4ef4 100644 --- a/nexus/db-queries/src/db/datastore/instance.rs +++ b/nexus/db-queries/src/db/datastore/instance.rs @@ -741,7 +741,7 @@ impl DataStore { format!( "cannot set migration ID {migration_id} for instance \ {instance_id} (perhaps another migration ID is \ - already present): {error:#}" + already present): {error:#?}" ), ), }) @@ -825,7 +825,15 @@ impl DataStore { vmm_dsl::vmm .on(vmm_dsl::sled_id .eq(sled_dsl::id) - .and(vmm_dsl::time_deleted.is_null())) + .and(vmm_dsl::time_deleted.is_null()) + // Ignore instances which are in states that are not + // known to exist on a sled. Since this query drives + // instance-watcher health checking, it is not necessary + // to perform health checks for VMMs that don't actually + // exist in real life. + .and( + vmm_dsl::state.ne_all(VmmState::NONEXISTENT_STATES), + )) .inner_join( instance_dsl::instance .on(instance_dsl::id diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 906c10f2dde..a00a37a9c9a 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -914,7 +914,7 @@ impl super::Nexus { propolis_id: &PropolisUuid, initial_vmm: &db::model::Vmm, operation: InstanceRegisterReason, - ) -> Result<(), Error> { + ) -> Result { opctx.authorize(authz::Action::Modify, authz_instance).await?; // Check that the hostname is valid. 
@@ -1147,13 +1147,31 @@ impl super::Nexus { let sa = self .sled_client(&SledUuid::from_untyped_uuid(initial_vmm.sled_id)) .await?; + // The state of a freshly-created VMM record in the database is always + // `VmmState::Creating`. Based on whether this VMM is created in order + // to start or migrate an instance, determine the VMM state we will want + // the sled-agent to create the VMM in. Once sled-agent acknoledges our + // registration request, we will advance the database record to the new + // state. + let vmm_runtime = sled_agent_client::types::VmmRuntimeState { + time_updated: chrono::Utc::now(), + r#gen: initial_vmm.runtime.gen.next(), + state: match operation { + InstanceRegisterReason::Migrate { .. } => { + sled_agent_client::types::VmmState::Migrating + } + InstanceRegisterReason::Start { .. } => { + sled_agent_client::types::VmmState::Starting + } + }, + }; let instance_register_result = sa .vmm_register( propolis_id, &sled_agent_client::types::InstanceEnsureBody { hardware: instance_hardware, instance_runtime: db_instance.runtime().clone().into(), - vmm_runtime: initial_vmm.clone().into(), + vmm_runtime, instance_id, propolis_addr: SocketAddr::new( initial_vmm.propolis_ip.ip(), @@ -1170,6 +1188,10 @@ impl super::Nexus { match instance_register_result { Ok(state) => { self.notify_vmm_updated(opctx, *propolis_id, &state).await?; + Ok(db::model::Vmm { + runtime: state.vmm_state.into(), + ..initial_vmm.clone() + }) } Err(e) => { if e.instance_unhealthy() { @@ -1182,11 +1204,9 @@ impl super::Nexus { ) .await; } - return Err(e.into()); + Err(e.into()) } } - - Ok(()) } /// Attempts to move an instance from `prev_instance_runtime` to the @@ -1529,8 +1549,7 @@ impl super::Nexus { .instance_fetch_with_vmm(opctx, &authz_instance) .await?; - let (instance, vmm) = (state.instance(), state.vmm()); - if let Some(vmm) = vmm { + if let Some(vmm) = state.vmm() { match vmm.runtime.state { DbVmmState::Running | DbVmmState::Rebooting @@ -1541,10 +1560,11 @@ impl 
super::Nexus { DbVmmState::Starting | DbVmmState::Stopping | DbVmmState::Stopped - | DbVmmState::Failed => { + | DbVmmState::Failed + | DbVmmState::Creating => { Err(Error::invalid_request(format!( "cannot connect to serial console of instance in state \"{}\"", - vmm.runtime.state, + state.effective_state(), ))) } @@ -1556,7 +1576,7 @@ impl super::Nexus { Err(Error::invalid_request(format!( "instance is {} and has no active serial console \ server", - instance.runtime().nexus_state + state.effective_state(), ))) } } @@ -2068,7 +2088,7 @@ mod tests { use futures::{SinkExt, StreamExt}; use nexus_db_model::{ Instance as DbInstance, InstanceState as DbInstanceState, - VmmInitialState, VmmState as DbVmmState, + VmmState as DbVmmState, }; use omicron_common::api::external::{ Hostname, IdentityMetadataCreateParams, InstanceCpuCount, Name, @@ -2206,7 +2226,6 @@ mod tests { ipnetwork::IpNetwork::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0) .unwrap(), 0, - VmmInitialState::Starting, ); (instance, vmm) diff --git a/nexus/src/app/sagas/instance_common.rs b/nexus/src/app/sagas/instance_common.rs index 049673d2ee5..ca793fc1238 100644 --- a/nexus/src/app/sagas/instance_common.rs +++ b/nexus/src/app/sagas/instance_common.rs @@ -98,7 +98,6 @@ pub async fn create_and_insert_vmm_record( propolis_id: PropolisUuid, sled_id: SledUuid, propolis_ip: Ipv6Addr, - initial_state: nexus_db_model::VmmInitialState, ) -> Result { let vmm = db::model::Vmm::new( propolis_id, @@ -106,7 +105,6 @@ pub async fn create_and_insert_vmm_record( sled_id, IpAddr::V6(propolis_ip).into(), DEFAULT_PROPOLIS_PORT, - initial_state, ); let vmm = datastore @@ -277,9 +275,9 @@ pub(super) async fn instance_ip_get_instance_state( Some(VmmState::Running) | Some(VmmState::Rebooting), ) => {} - // If the VMM is in the Stopping, Migrating, or Starting states, its - // sled assignment is in doubt, so report a transient state error and - // ask the caller to retry. 
+ // If the VMM is in the Creating, Stopping, Migrating, or Starting + // states, its sled assignment is in doubt, so report a transient state + // error and ask the caller to retry. // // Although an instance with a Starting VMM has a sled assignment, // there's no way to tell at this point whether or not there's a @@ -304,9 +302,12 @@ pub(super) async fn instance_ip_get_instance_state( ( InstanceState::Vmm, Some(state @ VmmState::Starting) + // TODO(eliza): now that a `Creating` state exists that's separate + // from `Starting`, perhaps we can remove `Starting` from this list? | Some(state @ VmmState::Migrating) | Some(state @ VmmState::Stopping) - | Some(state @ VmmState::Stopped), + | Some(state @ VmmState::Stopped) + | Some(state @ VmmState::Creating), ) => { return Err(ActionError::action_failed(Error::unavail(&format!( "can't {verb} in transient state {state}" diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 24d11fcae27..14d289a77c3 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -130,6 +130,16 @@ declare_saga_actions! { } } + +/// Node for looking up the initial VMM record output by +/// `sim_create_vmm_record`. +const INITIAL_VMM_RECORD: &'static str = "dst_vmm_record"; +/// Node name for looking up the VMM record once it has been registered with the +/// sled-agent by `sim_ensure_destination_propolis`. This is necessary as +/// registering the VMM transitions it from the `Creating` state to the +/// `Migrating` state, changing its generation. 
+const REGISTERED_VMM_RECORD: &'static str = "ensure_destination"; + #[derive(Debug)] pub struct SagaInstanceMigrate; impl NexusSaga for SagaInstanceMigrate { @@ -319,7 +329,6 @@ async fn sim_create_vmm_record( propolis_id, sled_id, propolis_ip, - nexus_db_model::VmmInitialState::Migrating, ) .await } @@ -334,7 +343,9 @@ async fn sim_destroy_vmm_record( ¶ms.serialized_authn, ); - let vmm = sagactx.lookup::("dst_vmm_record")?; + let vmm = sagactx + .lookup::(REGISTERED_VMM_RECORD) + .or_else(|_| sagactx.lookup::(INITIAL_VMM_RECORD))?; info!(osagactx.log(), "destroying vmm record for migration unwind"; "propolis_id" => %vmm.id); @@ -384,7 +395,7 @@ async fn sim_set_migration_ids( async fn sim_ensure_destination_propolis( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action( @@ -392,7 +403,7 @@ async fn sim_ensure_destination_propolis( ¶ms.serialized_authn, ); - let vmm = sagactx.lookup::("dst_vmm_record")?; + let vmm = sagactx.lookup::(INITIAL_VMM_RECORD)?; let db_instance = sagactx.lookup::("set_migration_ids")?; @@ -428,9 +439,7 @@ async fn sim_ensure_destination_propolis( }, ) .await - .map_err(ActionError::action_failed)?; - - Ok(()) + .map_err(ActionError::action_failed) } async fn sim_ensure_destination_propolis_undo( @@ -489,7 +498,7 @@ async fn sim_instance_migrate( ); let src_propolis_id = db_instance.runtime().propolis_id.unwrap(); - let dst_vmm = sagactx.lookup::("dst_vmm_record")?; + let dst_vmm = sagactx.lookup::(REGISTERED_VMM_RECORD)?; info!(osagactx.log(), "initiating migration from destination sled"; "instance_id" => %db_instance.id(), "dst_vmm_record" => ?dst_vmm, diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs index e342746c1b0..a67ca58e701 100644 --- a/nexus/src/app/sagas/instance_start.rs +++ b/nexus/src/app/sagas/instance_start.rs @@ -93,6 +93,15 @@ 
declare_saga_actions! { } } +/// Node for looking up the initial VMM record output by +/// `sis_create_vmm_record`. +const INITIAL_VMM_RECORD: &'static str = "vmm_record"; +/// Node name for looking up the VMM record once it has been registered with the +/// sled-agent by `sis_ensure_registered`. This is necessary as registering the +/// VMM transitions it from the `Creating` state to the `Starting` state, +/// changing its generation. +const REGISTERED_VMM_RECORD: &'static str = "ensure_registered"; + #[derive(Debug)] pub(crate) struct SagaInstanceStart; impl NexusSaga for SagaInstanceStart { @@ -194,7 +203,6 @@ async fn sis_create_vmm_record( propolis_id, sled_id, propolis_ip, - nexus_db_model::VmmInitialState::Starting, ) .await } @@ -209,7 +217,9 @@ async fn sis_destroy_vmm_record( ¶ms.serialized_authn, ); - let vmm = sagactx.lookup::("vmm_record")?; + let vmm = sagactx + .lookup::(REGISTERED_VMM_RECORD) + .or_else(|_| sagactx.lookup::(INITIAL_VMM_RECORD))?; super::instance_common::unwind_vmm_record( osagactx.datastore(), &opctx, @@ -486,7 +496,7 @@ async fn sis_v2p_ensure_undo( async fn sis_ensure_registered( sagactx: NexusActionContext, -) -> Result<(), ActionError> { +) -> Result { let params = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action( &sagactx, @@ -497,7 +507,7 @@ async fn sis_ensure_registered( sagactx.lookup::("started_record")?; let instance_id = db_instance.id(); let sled_id = sagactx.lookup::("sled_id")?; - let vmm_record = sagactx.lookup::("vmm_record")?; + let vmm_record = sagactx.lookup::(INITIAL_VMM_RECORD)?; let propolis_id = sagactx.lookup::("propolis_id")?; info!(osagactx.log(), "start saga: ensuring instance is registered on sled"; @@ -526,9 +536,7 @@ async fn sis_ensure_registered( InstanceRegisterReason::Start { vmm_id: propolis_id }, ) .await - .map_err(ActionError::action_failed)?; - - Ok(()) + .map_err(ActionError::action_failed) } async fn sis_ensure_registered_undo( @@ -540,7 +548,7 @@ async fn 
sis_ensure_registered_undo( let instance_id = InstanceUuid::from_untyped_uuid(params.db_instance.id()); let propolis_id = sagactx.lookup::("propolis_id")?; let sled_id = sagactx.lookup::("sled_id")?; - let db_vmm = sagactx.lookup::("vmm_record")?; + let db_vmm = sagactx.lookup::(REGISTERED_VMM_RECORD)?; let opctx = crate::context::op_context_for_saga_action( &sagactx, ¶ms.serialized_authn, @@ -650,7 +658,7 @@ async fn sis_ensure_running( let db_instance = sagactx.lookup::("started_record")?; - let db_vmm = sagactx.lookup::("vmm_record")?; + let db_vmm = sagactx.lookup::(REGISTERED_VMM_RECORD)?; let instance_id = InstanceUuid::from_untyped_uuid(params.db_instance.id()); let sled_id = sagactx.lookup::("sled_id")?; info!(osagactx.log(), "start saga: ensuring instance is running"; diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs index b599bc0d308..0038a594d59 100644 --- a/nexus/src/app/sagas/instance_update/mod.rs +++ b/nexus/src/app/sagas/instance_update/mod.rs @@ -2357,11 +2357,36 @@ mod test { let instance = create_instance(client).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + let log = &cptestctx.logctx.log; + info!( + &log, + "created test instance"; + "instance_id" => %instance_id, + "instance" => ?instance + ); // Poke the instance to get it into the Running state. 
- let state = - test_helpers::instance_fetch(cptestctx, instance_id).await; + + info!( + &log, + "advancing test instance to Running..."; + "instance_id" => %instance_id, + ); test_helpers::instance_simulate(cptestctx, &instance_id).await; + let state = test_helpers::instance_wait_for_state( + cptestctx, + instance_id, + InstanceState::Vmm, + ) + .await; + + info!( + &log, + "simulated test instance"; + "instance_id" => %instance_id, + "instance_state" => ?state.instance(), + "vmm_state" => ?state.vmm(), + ); let vmm = state.vmm().as_ref().unwrap(); let dst_sled_id = @@ -2605,12 +2630,12 @@ mod test { assert_instance_unlocked(instance); assert_instance_record_is_consistent(instance); - let target_terminated = self + let target_vmm_state = self .outcome .target .as_ref() - .map(|(_, state)| state.is_terminal()) - .unwrap_or(false); + .map(|&(_, state)| state) + .unwrap_or(VmmState::Migrating); if self.outcome.failed { assert_eq!( @@ -2622,11 +2647,11 @@ mod test { "target VMM ID must be unset when a migration has failed" ); } else { - if dbg!(target_terminated) { + if dbg!(target_vmm_state).is_terminal() { assert_eq!( active_vmm_id, None, - "if the target VMM was destroyed, it should be unset, \ - even if a migration succeeded", + "if the target VMM was {target_vmm_state}, it should \ + be unset, even if a migration succeeded", ); assert_eq!( instance_runtime.nexus_state, @@ -2674,39 +2699,48 @@ mod test { } } - let src_terminated = self + let src_vmm_state = self .outcome .source .as_ref() - .map(|(_, state)| state.is_terminal()) - .unwrap_or(false); + .map(|&(_, state)| state) + .unwrap_or(VmmState::Migrating); assert_eq!( self.src_resource_records_exist(cptestctx).await, - !src_terminated, + !dbg!(src_vmm_state).is_terminal(), "source VMM resource records should exist if and only if the \ source VMM is not in a terminal state (Destroyed/Failed)", ); assert_eq!( self.target_resource_records_exist(cptestctx).await, - !target_terminated, - "target VMM resource 
records should exist if and only if the \ - target VMM is not in a terminal state (Destroyed/Failed)", + !target_vmm_state.is_terminal(), + "source VMM resource records should exist if and only if the \ + source VMM is not in a terminal state (Destroyed/Failed)", ); + fn expected_instance_state(vmm: VmmState) -> InstanceState { + match vmm { + VmmState::Destroyed => InstanceState::NoVmm, + VmmState::Failed => InstanceState::Failed, + _ => InstanceState::Vmm, + } + } + // The instance has a VMM if (and only if): - let has_vmm = if self.outcome.failed { + let instance_state = if self.outcome.failed { // If the migration failed, the instance should have a VMM if // and only if the source VMM is still okay. It doesn't matter // whether the target is still there or not, because we didn't // migrate to it successfully. - !src_terminated + expected_instance_state(src_vmm_state) } else { // Otherwise, if the migration succeeded, the instance should be // on the target VMM, and virtual provisioning records should - // exist as long as the - !target_terminated + // exist as long as the target is okay. 
+ expected_instance_state(target_vmm_state) }; + let has_vmm = instance_state == InstanceState::Vmm; assert_eq!( no_virtual_provisioning_resource_records_exist(cptestctx).await, @@ -2724,8 +2758,6 @@ mod test { as the instance has a VMM", ); - let instance_state = - if has_vmm { InstanceState::Vmm } else { InstanceState::NoVmm }; assert_eq!(instance_runtime.nexus_state, instance_state); } diff --git a/nexus/src/app/sagas/region_replacement_drive.rs b/nexus/src/app/sagas/region_replacement_drive.rs index e2f76201780..d95591848ed 100644 --- a/nexus/src/app/sagas/region_replacement_drive.rs +++ b/nexus/src/app/sagas/region_replacement_drive.rs @@ -577,26 +577,17 @@ async fn check_from_previous_propolis_step( Ok(DriveCheck::LastStepStillRunning) } - VmmState::Starting => { - // This state is unexpected, considering Nexus previously sent a - // target replacement request to this propolis! + // These states are unexpected, considering Nexus previously sent a + // target replacement request to this propolis! + VmmState::Starting | VmmState::Creating + // This state is unexpected because we should have already + // returned `DriveCheck::Wait` above. + | VmmState::Migrating => { return Err(ActionError::action_failed(format!( - "vmm {} propolis is Starting", - step_vmm_id, + "vmm {step_vmm_id} propolis is {state}", ))); } - - VmmState::Migrating => { - // This state is unexpected because we should have already - // returned `DriveCheck::Wait` above. 
- - return Err(ActionError::action_failed(format!( - "vmm {} propolis is Migrating!", - step_vmm_id, - ))); - } - VmmState::Stopping | VmmState::Stopped | VmmState::Failed @@ -923,7 +914,8 @@ async fn srrd_drive_region_replacement_prepare( | VmmState::Migrating | VmmState::Failed | VmmState::Destroyed - | VmmState::SagaUnwound => { + | VmmState::SagaUnwound + | VmmState::Creating => { // Propolis server is not ok to receive volume // replacement requests, bail out return Err(ActionError::action_failed(format!( diff --git a/nexus/src/app/sagas/region_snapshot_replacement_step.rs b/nexus/src/app/sagas/region_snapshot_replacement_step.rs index 600bb155bff..507904736ae 100644 --- a/nexus/src/app/sagas/region_snapshot_replacement_step.rs +++ b/nexus/src/app/sagas/region_snapshot_replacement_step.rs @@ -462,7 +462,8 @@ async fn rsrss_notify_upstairs( | VmmState::Migrating | VmmState::Failed | VmmState::Destroyed - | VmmState::SagaUnwound => { + | VmmState::SagaUnwound + | VmmState::Creating => { // Propolis server is not ok to receive volume replacement requests // - unwind so that this saga can run again. 
return Err(ActionError::action_failed(format!( diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index da8bbacf8bf..9390b0caa16 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -5436,6 +5436,13 @@ "VmmState": { "description": "One of the states that a VMM can be in.", "oneOf": [ + { + "description": "The VMM is known to Nexus, but may not yet exist on a sled.\n\nVMM records are always inserted into the database in this state, and then transition to 'starting' or 'migrating' once a sled-agent reports that the VMM has been registered.", + "type": "string", + "enum": [ + "creating" + ] + }, { "description": "The VMM is initializing and has not started running guest CPUs yet.", "type": "string", diff --git a/openapi/sled-agent.json b/openapi/sled-agent.json index bb8e4e0b87e..dcdcc6ca80b 100644 --- a/openapi/sled-agent.json +++ b/openapi/sled-agent.json @@ -5234,6 +5234,13 @@ "VmmState": { "description": "One of the states that a VMM can be in.", "oneOf": [ + { + "description": "The VMM is known to Nexus, but may not yet exist on a sled.\n\nVMM records are always inserted into the database in this state, and then transition to 'starting' or 'migrating' once a sled-agent reports that the VMM has been registered.", + "type": "string", + "enum": [ + "creating" + ] + }, { "description": "The VMM is initializing and has not started running guest CPUs yet.", "type": "string", diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index e851d2ed6bb..9906a94ac63 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1001,6 +1001,14 @@ CREATE TYPE IF NOT EXISTS omicron.public.instance_state_v2 AS ENUM ( ); CREATE TYPE IF NOT EXISTS omicron.public.vmm_state AS ENUM ( + /* + * The VMM is known to Nexus, but may not yet exist on a sled. 
+ * + * VMM records are always inserted into the database in this state, and + * then transition to 'starting' or 'migrating' once a sled-agent reports + * that the VMM has been registered. + */ + 'creating', 'starting', 'running', 'stopping', @@ -4225,7 +4233,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '93.0.0', NULL) + (TRUE, NOW(), NOW(), '94.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/put-back-creating-vmm-state/up.sql b/schema/crdb/put-back-creating-vmm-state/up.sql new file mode 100644 index 00000000000..d183a15eac7 --- /dev/null +++ b/schema/crdb/put-back-creating-vmm-state/up.sql @@ -0,0 +1,2 @@ +ALTER TYPE omicron.public.vmm_state + ADD VALUE IF NOT EXISTS 'creating' BEFORE 'starting'; diff --git a/sled-agent/src/sim/instance.rs b/sled-agent/src/sim/instance.rs index eb7ea0ca794..a3caf77b532 100644 --- a/sled-agent/src/sim/instance.rs +++ b/sled-agent/src/sim/instance.rs @@ -210,6 +210,7 @@ impl SimInstanceInner { } VmmStateRequested::Running => { match self.next_resting_state() { + VmmState::Creating => todo!("eliza: should sled-agents even know about this state??"), VmmState::Starting => { self.queue_propolis_state( PropolisInstanceState::Running,