Properly check prior instance/vmm states when starting an instance (#6278)

Fix the following instance start dispositions:

- an instance with no active VMM can only be started if it's in the
"NoVmm" instance state
- an instance with a migrating active VMM has been started, so allow
start to succeed for idempotency
- an instance with a SagaUnwound active VMM is allowed to be started
again (the new start saga's VMM supplants the old VMM)

Add some unit tests to cover these cases.

Fixes #6274.
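
For reference, the rules above can be condensed into a rough sketch of the decision table. This is illustrative only: it assumes the module's existing types and imports (`DbVmmState`, `DbInstanceState`, `InstanceStartDisposition`, `Error`, `InstanceState`), the helper name and signature are hypothetical, and the authoritative logic, with logging and exact error handling, is the new `instance_start_allowed` function in the diff below.

// Hypothetical condensed sketch of the start-disposition rules; the real
// implementation is instance_start_allowed() in the diff below.
fn start_disposition_sketch(
    vmm_state: Option<DbVmmState>,
    instance_state: DbInstanceState,
) -> Result<InstanceStartDisposition, Error> {
    match vmm_state {
        // An active VMM that is starting, running, rebooting, or migrating
        // means the instance has already been started; succeed so that
        // repeated start calls are idempotent.
        Some(
            DbVmmState::Starting
            | DbVmmState::Running
            | DbVmmState::Rebooting
            | DbVmmState::Migrating,
        ) => Ok(InstanceStartDisposition::AlreadyStarted),
        // A VMM left behind by an unwound start saga may be supplanted by a
        // new start saga's VMM.
        Some(DbVmmState::SagaUnwound) => Ok(InstanceStartDisposition::Start),
        // A Stopped VMM should already have been unlinked from the instance,
        // so finding one here is an internal error.
        Some(DbVmmState::Stopped) => Err(Error::internal_error(
            "instance is stopped but still has an active VMM",
        )),
        // Any other active-VMM state conflicts with starting.
        Some(other) => Err(Error::conflict(&format!(
            "instance is in state {} but must be {} to be started",
            other,
            InstanceState::Stopped,
        ))),
        // With no active VMM, only the NoVmm instance state permits a start.
        None => match instance_state {
            DbInstanceState::NoVmm => Ok(InstanceStartDisposition::Start),
            DbInstanceState::Vmm => Err(Error::internal_error(
                "instance is in state Vmm but has no active VMM",
            )),
            other => Err(Error::conflict(&format!(
                "instance is in state {} but must be {} to be started",
                other,
                InstanceState::Stopped,
            ))),
        },
    }
}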
gjcolombo authored Aug 9, 2024
1 parent 6e829a2 commit 04fdbcd
Showing 1 changed file with 228 additions and 49 deletions.
277 changes: 228 additions & 49 deletions nexus/src/app/instance.rs
@@ -18,9 +18,10 @@ use crate::external_api::params;
use cancel_safe_futures::prelude::*;
use futures::future::Fuse;
use futures::{FutureExt, SinkExt, StreamExt};
use nexus_db_model::InstanceState as DbInstanceState;
use nexus_db_model::IpAttachState;
use nexus_db_model::IpKind;
use nexus_db_model::Vmm;
use nexus_db_model::Vmm as DbVmm;
use nexus_db_model::VmmState as DbVmmState;
use nexus_db_queries::authn;
use nexus_db_queries::authz;
@@ -189,6 +190,11 @@ pub(crate) enum InstanceRegisterReason {
Migrate { vmm_id: PropolisUuid, target_vmm_id: PropolisUuid },
}

enum InstanceStartDisposition {
Start,
AlreadyStarted,
}

impl super::Nexus {
pub fn instance_lookup<'a>(
&'a self,
@@ -719,54 +725,26 @@ impl super::Nexus {
.db_datastore
.instance_fetch_with_vmm(opctx, &authz_instance)
.await?;
let (instance, vmm) = (state.instance(), state.vmm());

if let Some(vmm) = vmm {
match vmm.runtime.state {
DbVmmState::Starting
| DbVmmState::Running
| DbVmmState::Rebooting => {
debug!(self.log, "asked to start an active instance";
"instance_id" => %authz_instance.id());

return Ok(state);
}
DbVmmState::Stopped => {
let propolis_id = instance
.runtime()
.propolis_id
.expect("needed a VMM ID to fetch a VMM record");
error!(self.log,
"instance is stopped but still has an active VMM";
"instance_id" => %authz_instance.id(),
"propolis_id" => %propolis_id);
match instance_start_allowed(&self.log, &state)? {
InstanceStartDisposition::AlreadyStarted => Ok(state),
InstanceStartDisposition::Start => {
let saga_params = sagas::instance_start::Params {
serialized_authn: authn::saga::Serialized::for_opctx(opctx),
db_instance: state.instance().clone(),
};

self.sagas
.saga_execute::<sagas::instance_start::SagaInstanceStart>(
saga_params,
)
.await?;

return Err(Error::internal_error(
"instance is stopped but still has an active VMM",
));
}
_ => {
return Err(Error::conflict(&format!(
"instance is in state {} but must be {} to be started",
vmm.runtime.state,
InstanceState::Stopped
)));
}
self.db_datastore
.instance_fetch_with_vmm(opctx, &authz_instance)
.await
}
}

let saga_params = sagas::instance_start::Params {
serialized_authn: authn::saga::Serialized::for_opctx(opctx),
db_instance: instance.clone(),
};

self.sagas
.saga_execute::<sagas::instance_start::SagaInstanceStart>(
saga_params,
)
.await?;

self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await
}

/// Make sure the given Instance is stopped.
@@ -1670,7 +1648,7 @@ impl super::Nexus {
opctx: &OpContext,
instance_lookup: &lookup::Instance<'_>,
action: authz::Action,
) -> Result<(Vmm, SocketAddr), Error> {
) -> Result<(DbVmm, SocketAddr), Error> {
let (.., authz_instance) = instance_lookup.lookup_for(action).await?;

let state = self
@@ -1717,7 +1695,7 @@ impl super::Nexus {
opctx: &OpContext,
instance_lookup: &lookup::Instance<'_>,
action: authz::Action,
) -> Result<(Vmm, propolis_client::Client), Error> {
) -> Result<(DbVmm, propolis_client::Client), Error> {
let (vmm, client_addr) = self
.propolis_addr_for_instance(opctx, instance_lookup, action)
.await?;
@@ -2186,18 +2164,118 @@ pub(crate) async fn notify_instance_updated(
}
}

/// Determines the disposition of a request to start an instance given its state
/// (and its current VMM's state, if it has one) in the database.
fn instance_start_allowed(
log: &slog::Logger,
state: &InstanceAndActiveVmm,
) -> Result<InstanceStartDisposition, Error> {
let (instance, vmm) = (state.instance(), state.vmm());

// If the instance has an active VMM, there's nothing to start, but the
// disposition of this call (succeed for idempotency vs. fail with an
// error describing the conflict) depends on the state that VMM is in.
//
// If the instance doesn't have an active VMM, see if the instance state
// permits it to start.
if let Some(vmm) = vmm {
match vmm.runtime.state {
// If the VMM is already starting or is in another "active"
// state, succeed to make successful start attempts idempotent.
DbVmmState::Starting
| DbVmmState::Running
| DbVmmState::Rebooting
| DbVmmState::Migrating => {
debug!(log, "asked to start an active instance";
"instance_id" => %instance.id());

Ok(InstanceStartDisposition::AlreadyStarted)
}
// If a previous start saga failed and left behind a VMM in the
// SagaUnwound state, allow a new start saga to try to overwrite
// it.
DbVmmState::SagaUnwound => {
debug!(
log,
"instance's last VMM's start saga unwound, OK to start";
"instance_id" => %instance.id()
);

Ok(InstanceStartDisposition::Start)
}
// When sled agent publishes a Stopped state, Nexus should clean
// up the instance/VMM pointer.
DbVmmState::Stopped => {
let propolis_id = instance
.runtime()
.propolis_id
.expect("needed a VMM ID to fetch a VMM record");
error!(log,
"instance is stopped but still has an active VMM";
"instance_id" => %instance.id(),
"propolis_id" => %propolis_id);

Err(Error::internal_error(
"instance is stopped but still has an active VMM",
))
}
_ => Err(Error::conflict(&format!(
"instance is in state {} but must be {} to be started",
vmm.runtime.state,
InstanceState::Stopped
))),
}
} else {
match instance.runtime_state.nexus_state {
// If the instance is in a known-good no-VMM state, it can
// start.
DbInstanceState::NoVmm => {
debug!(log, "instance has no VMM, OK to start";
"instance_id" => %instance.id());

Ok(InstanceStartDisposition::Start)
}
// If the instance isn't ready yet or has been destroyed, it
// can't start.
//
// TODO(#2825): If the "Failed" state could be interpreted to
// mean "stopped abnormally" and not just "Nexus doesn't know
// what state the instance is in," it would be fine to start the
// instance here. See RFD 486.
DbInstanceState::Creating
| DbInstanceState::Failed
| DbInstanceState::Destroyed => Err(Error::conflict(&format!(
"instance is in state {} but must be {} to be started",
instance.runtime_state.nexus_state,
InstanceState::Stopped
))),
// If the instance is in the Vmm state, there should have been
// an active Propolis ID and a VMM record to read, so this
// branch shouldn't have been reached.
DbInstanceState::Vmm => Err(Error::internal_error(
"instance is in state Vmm but has no active VMM",
)),
}
}
}

#[cfg(test)]
mod tests {
use super::super::Nexus;
use super::{CloseCode, CloseFrame, WebSocketMessage, WebSocketStream};
use super::*;
use core::time::Duration;
use futures::{SinkExt, StreamExt};
use nexus_db_model::{Instance as DbInstance, VmmInitialState};
use omicron_common::api::external::{
Hostname, IdentityMetadataCreateParams, InstanceCpuCount, Name,
};
use omicron_test_utils::dev::test_setup_log;
use params::InstanceNetworkInterfaceAttachment;
use propolis_client::support::tungstenite::protocol::Role;
use propolis_client::support::{
InstanceSerialConsoleHelper, WSClientOffset,
};
use std::net::{Ipv4Addr, SocketAddr, SocketAddrV4};
use std::net::{IpAddr, Ipv4Addr, SocketAddr, SocketAddrV4};

#[tokio::test]
async fn test_serial_console_stream_proxying() {
@@ -2290,4 +2368,105 @@ mod tests {
.expect("proxy task exited successfully");
logctx.cleanup_successful();
}

/// Creates an instance record and a VMM record that points back to it. Note
/// that the VMM is *not* installed in the instance's `active_propolis_id`
/// field.
fn make_instance_and_vmm() -> (DbInstance, DbVmm) {
let params = params::InstanceCreate {
identity: IdentityMetadataCreateParams {
name: Name::try_from("elysium".to_owned()).unwrap(),
description: "this instance is disco".to_owned(),
},
ncpus: InstanceCpuCount(1),
memory: ByteCount::from_gibibytes_u32(1),
hostname: Hostname::try_from("elysium").unwrap(),
user_data: vec![],
network_interfaces: InstanceNetworkInterfaceAttachment::None,
external_ips: vec![],
disks: vec![],
ssh_public_keys: None,
start: false,
};

let instance_id = InstanceUuid::from_untyped_uuid(Uuid::new_v4());
let project_id = Uuid::new_v4();
let instance = DbInstance::new(instance_id, project_id, &params);

let propolis_id = PropolisUuid::from_untyped_uuid(Uuid::new_v4());
let sled_id = SledUuid::from_untyped_uuid(Uuid::new_v4());
let vmm = DbVmm::new(
propolis_id,
instance_id,
sled_id,
ipnetwork::IpNetwork::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0)
.unwrap(),
0,
VmmInitialState::Starting,
);

(instance, vmm)
}

#[test]
fn test_instance_start_allowed_when_no_vmm() {
let logctx = test_setup_log("test_instance_start_allowed_when_no_vmm");
let (mut instance, _vmm) = make_instance_and_vmm();
instance.runtime_state.nexus_state = DbInstanceState::NoVmm;
let state = InstanceAndActiveVmm::from((instance, None));
assert!(instance_start_allowed(&logctx.log, &state).is_ok());
logctx.cleanup_successful();
}

#[test]
fn test_instance_start_allowed_when_vmm_in_saga_unwound() {
let logctx = test_setup_log(
"test_instance_start_allowed_when_vmm_in_saga_unwound",
);
let (mut instance, mut vmm) = make_instance_and_vmm();
instance.runtime_state.nexus_state = DbInstanceState::Vmm;
instance.runtime_state.propolis_id = Some(vmm.id);
vmm.runtime.state = DbVmmState::SagaUnwound;
let state = InstanceAndActiveVmm::from((instance, Some(vmm)));
assert!(instance_start_allowed(&logctx.log, &state).is_ok());
logctx.cleanup_successful();
}

#[test]
fn test_instance_start_forbidden_while_creating() {
let logctx =
test_setup_log("test_instance_start_forbidden_while_creating");
let (mut instance, _vmm) = make_instance_and_vmm();
instance.runtime_state.nexus_state = DbInstanceState::Creating;
let state = InstanceAndActiveVmm::from((instance, None));
assert!(instance_start_allowed(&logctx.log, &state).is_err());
logctx.cleanup_successful();
}

#[test]
fn test_instance_start_idempotent_if_active() {
let logctx = test_setup_log("test_instance_start_idempotent_if_active");
let (mut instance, mut vmm) = make_instance_and_vmm();
instance.runtime_state.nexus_state = DbInstanceState::Vmm;
instance.runtime_state.propolis_id = Some(vmm.id);
vmm.runtime.state = DbVmmState::Starting;
let state =
InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone())));
assert!(instance_start_allowed(&logctx.log, &state).is_ok());

vmm.runtime.state = DbVmmState::Running;
let state =
InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone())));
assert!(instance_start_allowed(&logctx.log, &state).is_ok());

vmm.runtime.state = DbVmmState::Rebooting;
let state =
InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone())));
assert!(instance_start_allowed(&logctx.log, &state).is_ok());

vmm.runtime.state = DbVmmState::Migrating;
let state = InstanceAndActiveVmm::from((instance, Some(vmm)));
assert!(instance_start_allowed(&logctx.log, &state).is_ok());
logctx.cleanup_successful();
}
}
