From c7d739ed40288e7bdddb3f8be5f5c978b8db97cc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 24 Sep 2024 09:51:02 -0700 Subject: [PATCH] [nexus] Allow stopping `Failed` instances PR #6503 changed Nexus to attempt to automatically restart instances which are in the `Failed` state. Now that we do this, we should probably change the allowable instance state transitions to permit a user to stop an instance that is `Failed`, as a way to say "stop trying to restart this instance" (as `Stopped` instances are not restarted). This branch changes `Nexus::instance_request_state` and `select_instance_change_action` to permit stopping a `Failed` instance. Fixes #6640 I believe this also fixes #2825, along with #6455 (which allowed restarting `Failed` instances). --- nexus/src/app/instance.rs | 53 ++++++++++++++++++++-- nexus/tests/integration_tests/instances.rs | 38 ++++++++++++++++ 2 files changed, 87 insertions(+), 4 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 09fe364aaa..daea4171f8 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -233,6 +233,10 @@ enum InstanceStateChangeRequestAction { /// Request the appropriate state change from the sled with the specified /// UUID. SendToSled { sled_id: SledUuid, propolis_id: PropolisUuid }, + + /// The instance is not currently incarnated on a sled, so just update its + /// runtime state in the database without communicating with a sled-agent. + UpdateRuntime(db::model::InstanceRuntimeState), } /// What is the higher level operation that is calling @@ -811,13 +815,36 @@ impl super::Nexus { still being created" )) } + // Failed instances may transition to Stopped by just changing + // the Nexus state in the database to NoVmm. + // + // An instance's effective state will never be Failed while it + // is linked with a VMM. If the instance has an active VMM which + // is Failed, the instance's effective state will be Stopping, + // rather than Failed, until an instance-update saga has + // unlinked the Failed VMM. Therefore, we know that a Failed + // instance is not incarnated on a sled, so all we need to do to + // "stop" it is to update its state in the database. + InstanceState::Failed if matches!(requested, InstanceStateChangeRequest::Stop) => { + // As discussed above, this shouldn't happen, so return an + // internal error and complain about it in the logs. + if vmm_state.is_some() { + return Err(Error::internal_error( + "an instance should not be in the Failed \ + effective state if it has an active VMM" + )); + } + let prev_runtime = instance_state.runtime(); + return Ok(InstanceStateChangeRequestAction::UpdateRuntime(db::model::InstanceRuntimeState { + time_updated: chrono::Utc::now(), + r#gen: prev_runtime.r#gen.0.next().into(), + nexus_state: db::model::InstanceState::NoVmm, + ..prev_runtime.clone() + })); + } // If the instance has no sled beacuse it's been destroyed or // has fallen over, reject the state change. - // - // TODO(#2825): Failed instances should be allowed to stop, but - // this requires a special action because there is no sled to - // send the request to. InstanceState::Failed | InstanceState::Destroyed => { return Err(Error::invalid_request(&format!( "instance state cannot be changed from {}", @@ -898,6 +925,24 @@ impl super::Nexus { &requested, )? { InstanceStateChangeRequestAction::AlreadyDone => Ok(()), + InstanceStateChangeRequestAction::UpdateRuntime(new_runtime) => { + let instance_id = + InstanceUuid::from_untyped_uuid(prev_instance_state.id()); + let changed = self + .datastore() + .instance_update_runtime(&instance_id, &new_runtime) + .await + .map_err(InstanceStateChangeError::Other)?; + if !changed { + // TODO(eliza): perhaps we should refetch the instance here + // and return Ok if it was in the desired state... + Err(InstanceStateChangeError::Other(Error::conflict( + "instance changed state before it could be updated", + ))) + } else { + Ok(()) + } + } InstanceStateChangeRequestAction::SendToSled { sled_id, propolis_id, diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 96f3a759e9..1af4052dc3 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1177,6 +1177,44 @@ async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_deleted( expect_instance_delete_ok(client, instance_name).await; } +// Verifies that if a request to reboot or stop an instance fails because of a +// 404 error from sled agent, then the instance moves to the Failed state, and +// can then be Stopped once it has transitioned to Failed. +#[nexus_test] +async fn test_instance_failed_after_sled_agent_forgets_vmm_can_be_stopped( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + + let instance_name = "losing-is-fun"; + let instance_id = make_forgotten_instance( + &cptestctx, + instance_name, + InstanceAutoRestartPolicy::Never, + ) + .await; + + // Attempting to reboot the forgotten instance will result in a 404 + // NO_SUCH_INSTANCE from the sled-agent, which Nexus turns into a 503. + expect_instance_reboot_fail( + client, + instance_name, + http::StatusCode::SERVICE_UNAVAILABLE, + ) + .await; + + // Wait for the instance to transition to Failed. + instance_wait_for_state(client, instance_id, InstanceState::Failed).await; + + // Now, it should be possible to stop the instance. + let instance_next = + instance_post(&client, instance_name, InstanceOp::Stop).await; + assert_eq!(instance_next.runtime.run_state, InstanceState::Stopped); + + // Now, the Stopped nstance should be deleteable.. + expect_instance_delete_ok(client, instance_name).await; +} + // Verifies that the instance-watcher background task transitions an instance // to Failed when the sled-agent returns a 404, and that the instance can be // deleted after it transitions to Failed.