Skip to content

Commit

Permalink
[nexus] Allow stopping Failed instances
Browse files Browse the repository at this point in the history
PR #6503 changed Nexus to attempt to automatically restart instances
which are in the `Failed` state. Now that we do this, we should probably
change the allowable instance state transitions to permit a user to stop
an instance that is `Failed`, as a way to say "stop trying to restart
this instance" (as `Stopped` instances are not restarted). This branch
changes `Nexus::instance_request_state` and
`select_instance_change_action` to permit stopping a `Failed` instance.

Fixes #6640

I believe this also fixes #2825, along with #6455 (which allowed
restarting `Failed` instances).
  • Loading branch information
hawkw committed Sep 24, 2024
1 parent 45813be commit c7d739e
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 4 deletions.
53 changes: 49 additions & 4 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,10 @@ enum InstanceStateChangeRequestAction {
/// Request the appropriate state change from the sled with the specified
/// UUID.
SendToSled { sled_id: SledUuid, propolis_id: PropolisUuid },

/// The instance is not currently incarnated on a sled, so just update its
/// runtime state in the database without communicating with a sled-agent.
UpdateRuntime(db::model::InstanceRuntimeState),
}

/// What is the higher level operation that is calling
Expand Down Expand Up @@ -811,13 +815,36 @@ impl super::Nexus {
still being created"
))
}
// Failed instances may transition to Stopped by just changing
// the Nexus state in the database to NoVmm.
//
// An instance's effective state will never be Failed while it
// is linked with a VMM. If the instance has an active VMM which
// is Failed, the instance's effective state will be Stopping,
// rather than Failed, until an instance-update saga has
// unlinked the Failed VMM. Therefore, we know that a Failed
// instance is not incarnated on a sled, so all we need to do to
// "stop" it is to update its state in the database.
InstanceState::Failed if matches!(requested, InstanceStateChangeRequest::Stop) => {
// As discussed above, this shouldn't happen, so return an
// internal error and complain about it in the logs.
if vmm_state.is_some() {
return Err(Error::internal_error(
"an instance should not be in the Failed \
effective state if it has an active VMM"
));
}

let prev_runtime = instance_state.runtime();
return Ok(InstanceStateChangeRequestAction::UpdateRuntime(db::model::InstanceRuntimeState {
time_updated: chrono::Utc::now(),
r#gen: prev_runtime.r#gen.0.next().into(),
nexus_state: db::model::InstanceState::NoVmm,
..prev_runtime.clone()
}));
}
// If the instance has no sled beacuse it's been destroyed or
// has fallen over, reject the state change.
//
// TODO(#2825): Failed instances should be allowed to stop, but
// this requires a special action because there is no sled to
// send the request to.
InstanceState::Failed | InstanceState::Destroyed => {
return Err(Error::invalid_request(&format!(
"instance state cannot be changed from {}",
Expand Down Expand Up @@ -898,6 +925,24 @@ impl super::Nexus {
&requested,
)? {
InstanceStateChangeRequestAction::AlreadyDone => Ok(()),
InstanceStateChangeRequestAction::UpdateRuntime(new_runtime) => {
let instance_id =
InstanceUuid::from_untyped_uuid(prev_instance_state.id());
let changed = self
.datastore()
.instance_update_runtime(&instance_id, &new_runtime)
.await
.map_err(InstanceStateChangeError::Other)?;
if !changed {
// TODO(eliza): perhaps we should refetch the instance here
// and return Ok if it was in the desired state...
Err(InstanceStateChangeError::Other(Error::conflict(
"instance changed state before it could be updated",
)))
} else {
Ok(())
}
}
InstanceStateChangeRequestAction::SendToSled {
sled_id,
propolis_id,
Expand Down
38 changes: 38 additions & 0 deletions nexus/tests/integration_tests/instances.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1177,6 +1177,44 @@ async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_deleted(
expect_instance_delete_ok(client, instance_name).await;
}

// Verifies that if a request to reboot or stop an instance fails because of a
// 404 error from sled agent, then the instance moves to the Failed state, and
// can then be Stopped once it has transitioned to Failed.
#[nexus_test]
async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_stopped(
cptestctx: &ControlPlaneTestContext,
) {
let client = &cptestctx.external_client;

let instance_name = "losing-is-fun";
let instance_id = make_forgotten_instance(
&cptestctx,
instance_name,
InstanceAutoRestartPolicy::Never,
)
.await;

// Attempting to reboot the forgotten instance will result in a 404
// NO_SUCH_INSTANCE from the sled-agent, which Nexus turns into a 503.
expect_instance_reboot_fail(
client,
instance_name,
http::StatusCode::SERVICE_UNAVAILABLE,
)
.await;

// Wait for the instance to transition to Failed.
instance_wait_for_state(client, instance_id, InstanceState::Failed).await;

// Now, it should be possible to stop the instance.
let instance_next =
instance_post(&client, instance_name, InstanceOp::Stop).await;
assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped);

// Now, the Stopped nstance should be deleteable..
expect_instance_delete_ok(client, instance_name).await;
}

// Verifies that the instance-watcher background task transitions an instance
// to Failed when the sled-agent returns a 404, and that the instance can be
// deleted after it transitions to Failed.
Expand Down

0 comments on commit c7d739e

Please sign in to comment.