Skip to content

Commit

Permalink
add VmmState::Creating
Browse files Browse the repository at this point in the history
This commit introduces a new `VmmState` variant, `Creating`. The purpose
of this variant is to differentiate between VMMs which have been created
in the database but do not yet exist on a sled, from those which are
known to sled-agent and are actually starting/migrating.

This is necessary because presently, the `instance-watcher` background
task will attempt to perform health checks for `Starting` and
`Migrating` VMMs, and --- if the sled-agent does not actually know about
the VMM yet --- will move it to `Failed`. But, when a VMM is newly
created, the sled-agent won't know about it yet until it has been
registered with the sled-agent, so if the `instance-watcher` task
activates while an `instance-start` or `instance-migrate` saga is in
flight, the `instance-watcher`` task may incorrectly move its VMM to
`Failed` if the VMM record has been created but not yet registered. Now,
after adding the `Creating` variant, we can avoid performing health
checks for those VMMs until the sled-agent actually acknowledges a
request to register the VMM.
  • Loading branch information
hawkw committed Aug 29, 2024
1 parent ce1159b commit c656a40
Show file tree
Hide file tree
Showing 20 changed files with 219 additions and 92 deletions.
2 changes: 2 additions & 0 deletions clients/nexus-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ impl From<omicron_common::api::internal::nexus::VmmState> for types::VmmState {
fn from(s: omicron_common::api::internal::nexus::VmmState) -> Self {
use omicron_common::api::internal::nexus::VmmState as Input;
match s {
Input::Creating => types::VmmState::Creating,
Input::Starting => types::VmmState::Starting,
Input::Running => types::VmmState::Running,
Input::Stopping => types::VmmState::Stopping,
Expand All @@ -111,6 +112,7 @@ impl From<types::VmmState> for omicron_common::api::internal::nexus::VmmState {
fn from(s: types::VmmState) -> Self {
use omicron_common::api::internal::nexus::VmmState as Output;
match s {
types::VmmState::Creating => Output::Creating,
types::VmmState::Starting => Output::Starting,
types::VmmState::Running => Output::Running,
types::VmmState::Stopping => Output::Stopping,
Expand Down
2 changes: 2 additions & 0 deletions clients/sled-agent-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ impl From<omicron_common::api::internal::nexus::VmmState> for types::VmmState {
fn from(s: omicron_common::api::internal::nexus::VmmState) -> Self {
use omicron_common::api::internal::nexus::VmmState as Input;
match s {
Input::Creating => types::VmmState::Creating,
Input::Starting => types::VmmState::Starting,
Input::Running => types::VmmState::Running,
Input::Stopping => types::VmmState::Stopping,
Expand Down Expand Up @@ -143,6 +144,7 @@ impl From<types::VmmState> for omicron_common::api::internal::nexus::VmmState {
fn from(s: types::VmmState) -> Self {
use omicron_common::api::internal::nexus::VmmState as Output;
match s {
types::VmmState::Creating => Output::Creating,
types::VmmState::Starting => Output::Starting,
types::VmmState::Running => Output::Running,
types::VmmState::Stopping => Output::Stopping,
Expand Down
9 changes: 8 additions & 1 deletion common/src/api/external/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1064,7 +1064,14 @@ impl From<crate::api::internal::nexus::VmmState> for InstanceState {
fn from(state: crate::api::internal::nexus::VmmState) -> Self {
use crate::api::internal::nexus::VmmState as InternalVmmState;
match state {
InternalVmmState::Starting => Self::Starting,
// An instance with a VMM which is in the `Creating` state maps to
// `InstanceState::Starting`, rather than `InstanceState::Creating`.
// If we are still creating the VMM, this is because we are
// attempting to *start* the instance; instances may be created
// without creating a VMM to run them, and then started later.
InternalVmmState::Creating | InternalVmmState::Starting => {
Self::Starting
}
InternalVmmState::Running => Self::Running,
InternalVmmState::Stopping => Self::Stopping,
InternalVmmState::Stopped => Self::Stopped,
Expand Down
6 changes: 6 additions & 0 deletions common/src/api/internal/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ pub struct InstanceRuntimeState {
)]
#[serde(rename_all = "snake_case")]
pub enum VmmState {
/// The VMM is known to Nexus, but may not yet exist on a sled.
///
/// VMM records are always inserted into the database in this state, and
/// then transition to 'starting' or 'migrating' once a sled-agent reports
/// that the VMM has been registered.
Creating,
/// The VMM is initializing and has not started running guest CPUs yet.
Starting,
/// The VMM has finished initializing and may be running guest CPUs.
Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(93, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(94, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(94, "put-back-creating-vmm-state"),
KnownVersion::new(93, "dataset-kinds-zone-and-debug"),
KnownVersion::new(92, "lldp-link-config-nullable"),
KnownVersion::new(91, "add-management-gateway-producer-kind"),
Expand Down
16 changes: 4 additions & 12 deletions nexus/db-model/src/vmm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,27 +60,19 @@ pub struct Vmm {
pub runtime: VmmRuntimeState,
}

/// The set of states that a VMM can have when it is created.
pub enum VmmInitialState {
Starting,
Migrating,
}

impl Vmm {
/// Creates a new VMM record.
///
/// The new VMM record will be in [`VmmState::Creating`] until it is
/// registered with a sled-agent.
pub fn new(
id: PropolisUuid,
instance_id: InstanceUuid,
sled_id: SledUuid,
propolis_ip: ipnetwork::IpNetwork,
propolis_port: u16,
initial_state: VmmInitialState,
) -> Self {
let now = Utc::now();
let state = match initial_state {
VmmInitialState::Starting => VmmState::Starting,
VmmInitialState::Migrating => VmmState::Migrating,
};

Self {
id: id.into_untyped_uuid(),
Expand All @@ -91,7 +83,7 @@ impl Vmm {
propolis_ip,
propolis_port: SqlU16(propolis_port),
runtime: VmmRuntimeState {
state,
state: VmmState::Creating,
time_state_updated: now,
gen: Generation::new(),
},
Expand Down
24 changes: 23 additions & 1 deletion nexus/db-model/src/vmm_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ impl_enum_type!(
#[diesel(sql_type = VmmStateEnum)]
pub enum VmmState;

Creating => b"creating"
Starting => b"starting"
Running => b"running"
Stopping => b"stopping"
Expand All @@ -31,6 +32,7 @@ impl_enum_type!(
impl VmmState {
pub fn label(&self) -> &'static str {
match self {
VmmState::Creating => "creating",
VmmState::Starting => "starting",
VmmState::Running => "running",
VmmState::Stopping => "stopping",
Expand All @@ -51,9 +53,21 @@ impl VmmState {
pub const TERMINAL_STATES: &'static [Self] =
&[Self::Destroyed, Self::Failed];

/// States in which a VMM record is present in the database but is not
/// resident on a sled, either because it does not yet exist, was produced
/// by an unwound update saga and will never exist, or has already been
/// destroyed.
pub const NONEXISTENT_STATES: &'static [Self] =
&[Self::Creating, Self::SagaUnwound, Self::Destroyed];

pub fn is_terminal(&self) -> bool {
Self::TERMINAL_STATES.contains(self)
}
/// Returns `true` if the instance is in a state in which it exists on a
/// sled.
pub fn exists_on_sled(&self) -> bool {
!Self::NONEXISTENT_STATES.contains(self)
}
}

impl fmt::Display for VmmState {
Expand All @@ -66,6 +80,7 @@ impl From<VmmState> for omicron_common::api::internal::nexus::VmmState {
fn from(value: VmmState) -> Self {
use omicron_common::api::internal::nexus::VmmState as Output;
match value {
VmmState::Creating => Output::Creating,
VmmState::Starting => Output::Starting,
VmmState::Running => Output::Running,
VmmState::Stopping => Output::Stopping,
Expand All @@ -82,6 +97,7 @@ impl From<VmmState> for sled_agent_client::types::VmmState {
fn from(value: VmmState) -> Self {
use sled_agent_client::types::VmmState as Output;
match value {
VmmState::Creating => Output::Creating,
VmmState::Starting => Output::Starting,
VmmState::Running => Output::Running,
VmmState::Stopping => Output::Stopping,
Expand All @@ -98,6 +114,7 @@ impl From<ApiState> for VmmState {
fn from(value: ApiState) -> Self {
use VmmState as Output;
match value {
ApiState::Creating => Output::Creating,
ApiState::Starting => Output::Starting,
ApiState::Running => Output::Running,
ApiState::Stopping => Output::Stopping,
Expand All @@ -115,7 +132,12 @@ impl From<VmmState> for omicron_common::api::external::InstanceState {
use omicron_common::api::external::InstanceState as Output;

match value {
VmmState::Starting => Output::Starting,
// An instance with a VMM which is in the `Creating` state maps to
// `InstanceState::Starting`, rather than `InstanceState::Creating`.
// If we are still creating the VMM, this is because we are
// attempting to *start* the instance; instances may be created
// without creating a VMM to run them, and then started later.
VmmState::Creating | VmmState::Starting => Output::Starting,
VmmState::Running => Output::Running,
VmmState::Stopping => Output::Stopping,
// `SagaUnwound` should map to `Stopped` so that an `instance_view`
Expand Down
12 changes: 10 additions & 2 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,7 @@ impl DataStore {
format!(
"cannot set migration ID {migration_id} for instance \
{instance_id} (perhaps another migration ID is \
already present): {error:#}"
already present): {error:#?}"
),
),
})
Expand Down Expand Up @@ -825,7 +825,15 @@ impl DataStore {
vmm_dsl::vmm
.on(vmm_dsl::sled_id
.eq(sled_dsl::id)
.and(vmm_dsl::time_deleted.is_null()))
.and(vmm_dsl::time_deleted.is_null())
// Ignore instances which are in states that are not
// known to exist on a sled. Since this query drives
// instance-watcher health checking, it is not necessary
// to perform health checks for VMMs that don't actually
// exist in real life.
.and(
vmm_dsl::state.ne_all(VmmState::NONEXISTENT_STATES),
))
.inner_join(
instance_dsl::instance
.on(instance_dsl::id
Expand Down
43 changes: 31 additions & 12 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -914,7 +914,7 @@ impl super::Nexus {
propolis_id: &PropolisUuid,
initial_vmm: &db::model::Vmm,
operation: InstanceRegisterReason,
) -> Result<(), Error> {
) -> Result<db::model::Vmm, Error> {
opctx.authorize(authz::Action::Modify, authz_instance).await?;

// Check that the hostname is valid.
Expand Down Expand Up @@ -1147,13 +1147,31 @@ impl super::Nexus {
let sa = self
.sled_client(&SledUuid::from_untyped_uuid(initial_vmm.sled_id))
.await?;
// The state of a freshly-created VMM record in the database is always
// `VmmState::Creating`. Based on whether this VMM is created in order
// to start or migrate an instance, determine the VMM state we will want
// the sled-agent to create the VMM in. Once sled-agent acknoledges our
// registration request, we will advance the database record to the new
// state.
let vmm_runtime = sled_agent_client::types::VmmRuntimeState {
time_updated: chrono::Utc::now(),
r#gen: initial_vmm.runtime.gen.next(),
state: match operation {
InstanceRegisterReason::Migrate { .. } => {
sled_agent_client::types::VmmState::Migrating
}
InstanceRegisterReason::Start { .. } => {
sled_agent_client::types::VmmState::Starting
}
},
};
let instance_register_result = sa
.vmm_register(
propolis_id,
&sled_agent_client::types::InstanceEnsureBody {
hardware: instance_hardware,
instance_runtime: db_instance.runtime().clone().into(),
vmm_runtime: initial_vmm.clone().into(),
vmm_runtime,
instance_id,
propolis_addr: SocketAddr::new(
initial_vmm.propolis_ip.ip(),
Expand All @@ -1170,6 +1188,10 @@ impl super::Nexus {
match instance_register_result {
Ok(state) => {
self.notify_vmm_updated(opctx, *propolis_id, &state).await?;
Ok(db::model::Vmm {
runtime: state.vmm_state.into(),
..initial_vmm.clone()
})
}
Err(e) => {
if e.instance_unhealthy() {
Expand All @@ -1182,11 +1204,9 @@ impl super::Nexus {
)
.await;
}
return Err(e.into());
Err(e.into())
}
}

Ok(())
}

/// Attempts to move an instance from `prev_instance_runtime` to the
Expand Down Expand Up @@ -1529,8 +1549,7 @@ impl super::Nexus {
.instance_fetch_with_vmm(opctx, &authz_instance)
.await?;

let (instance, vmm) = (state.instance(), state.vmm());
if let Some(vmm) = vmm {
if let Some(vmm) = state.vmm() {
match vmm.runtime.state {
DbVmmState::Running
| DbVmmState::Rebooting
Expand All @@ -1541,10 +1560,11 @@ impl super::Nexus {
DbVmmState::Starting
| DbVmmState::Stopping
| DbVmmState::Stopped
| DbVmmState::Failed => {
| DbVmmState::Failed
| DbVmmState::Creating => {
Err(Error::invalid_request(format!(
"cannot connect to serial console of instance in state \"{}\"",
vmm.runtime.state,
state.effective_state(),
)))
}

Expand All @@ -1556,7 +1576,7 @@ impl super::Nexus {
Err(Error::invalid_request(format!(
"instance is {} and has no active serial console \
server",
instance.runtime().nexus_state
state.effective_state(),
)))
}
}
Expand Down Expand Up @@ -2068,7 +2088,7 @@ mod tests {
use futures::{SinkExt, StreamExt};
use nexus_db_model::{
Instance as DbInstance, InstanceState as DbInstanceState,
VmmInitialState, VmmState as DbVmmState,
VmmState as DbVmmState,
};
use omicron_common::api::external::{
Hostname, IdentityMetadataCreateParams, InstanceCpuCount, Name,
Expand Down Expand Up @@ -2206,7 +2226,6 @@ mod tests {
ipnetwork::IpNetwork::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0)
.unwrap(),
0,
VmmInitialState::Starting,
);

(instance, vmm)
Expand Down
13 changes: 7 additions & 6 deletions nexus/src/app/sagas/instance_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,13 @@ pub async fn create_and_insert_vmm_record(
propolis_id: PropolisUuid,
sled_id: SledUuid,
propolis_ip: Ipv6Addr,
initial_state: nexus_db_model::VmmInitialState,
) -> Result<db::model::Vmm, ActionError> {
let vmm = db::model::Vmm::new(
propolis_id,
instance_id,
sled_id,
IpAddr::V6(propolis_ip).into(),
DEFAULT_PROPOLIS_PORT,
initial_state,
);

let vmm = datastore
Expand Down Expand Up @@ -277,9 +275,9 @@ pub(super) async fn instance_ip_get_instance_state(
Some(VmmState::Running) | Some(VmmState::Rebooting),
) => {}

// If the VMM is in the Stopping, Migrating, or Starting states, its
// sled assignment is in doubt, so report a transient state error and
// ask the caller to retry.
// If the VMM is in the Creating, Stopping, Migrating, or Starting
// states, its sled assignment is in doubt, so report a transient state
// error and ask the caller to retry.
//
// Although an instance with a Starting VMM has a sled assignment,
// there's no way to tell at this point whether or not there's a
Expand All @@ -304,9 +302,12 @@ pub(super) async fn instance_ip_get_instance_state(
(
InstanceState::Vmm,
Some(state @ VmmState::Starting)
// TODO(eliza): now that a `Creating` state exists that's separate
// from `Starting`, perhaps we can remove `Starting` from this list?
| Some(state @ VmmState::Migrating)
| Some(state @ VmmState::Stopping)
| Some(state @ VmmState::Stopped),
| Some(state @ VmmState::Stopped)
| Some(state @ VmmState::Creating),
) => {
return Err(ActionError::action_failed(Error::unavail(&format!(
"can't {verb} in transient state {state}"
Expand Down
Loading

0 comments on commit c656a40

Please sign in to comment.