From b07dec639fbc15ca2d13caead1e2b83e18fa0a07 Mon Sep 17 00:00:00 2001
From: Eliza Weisman <eliza@elizas.website>
Date: Tue, 24 Sep 2024 09:51:02 -0700
Subject: [PATCH] [nexus] Allow stopping `Failed` instances

PR #6503 changed Nexus to attempt to automatically restart instances
which are in the `Failed` state. Now that we do this, we should probably
change the allowable instance state transitions to permit a user to stop
an instance that is `Failed`, as a way to say "stop trying to restart
this instance" (as `Stopped` instances are not restarted). This branch
changes `Nexus::instance_request_state` and
`select_instance_change_action` to permit stopping a `Failed` instance.

Fixes #6640

I believe this also fixes #2825, along with #6455 (which allowed
restarting `Failed` instances).
---
 nexus/src/app/instance.rs | 54 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 09fe364aaab..5123246d71e 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -202,6 +202,7 @@ impl From<InstanceStateChangeError> for dropshot::HttpError {
 
 /// The kinds of state changes that can be requested of an instance's current
 /// VMM (i.e. the VMM pointed to be the instance's `propolis_id` field).
+#[derive(PartialEq, Eq)]
 pub(crate) enum InstanceStateChangeRequest {
     Run,
     Reboot,
@@ -233,6 +234,10 @@ enum InstanceStateChangeRequestAction {
     /// Request the appropriate state change from the sled with the specified
     /// UUID.
     SendToSled { sled_id: SledUuid, propolis_id: PropolisUuid },
+
+    /// The instance is not currently incarnated on a sled, so just update its
+    /// runtime state in the database without communicating with a sled-agent.
+    UpdateRuntime(db::model::InstanceRuntimeState),
 }
 
 /// What is the higher level operation that is calling
@@ -811,13 +816,36 @@ impl super::Nexus {
                                 still being created"
                                 ))
                 }
+                // Failed instances may transition to Stopped by just changing
+                // the Nexus state in the database to NoVmm.
+                //
+                // An instance's effective state will never be Failed while it
+                // is linked with a VMM. If the instance has an active VMM which
+                // is Failed, the instance's effective state will be Stopping,
+                // rather than Failed, until an instance-update saga has
+                // unlinked the Failed VMM. Therefore, we know that a Failed
+                // instance is not incarnated on a sled, so all we need to do to
+                // "stop" it is to update its state in the database.
+                InstanceState::Failed if requested == &InstanceStateChangeRequest::Stop => {
+                    // As discussed above, this shouldn't happen, so return an
+                    // internal error and complain about it in the logs.
+                    if vmm_state.is_some() {
+                        return Err(Error::internal_error(
+                            "an instance should not be in the Failed \
+                             effective state if it has an active VMM"
+                        ));
+                    }
 
+                    let prev_runtime = instance_state.runtime();
+                    return Ok(InstanceStateChangeRequestAction::UpdateRuntime(db::model::InstanceRuntimeState {
+                        time_updated: chrono::Utc::now(),
+                        r#gen: prev_runtime.r#gen.0.next().into(),
+                        nexus_state: db::model::InstanceState::NoVmm,
+                        ..prev_runtime.clone()
+                    }));
+                }
                 // If the instance has no sled beacuse it's been destroyed or
                 // has fallen over, reject the state change.
-                //
-                // TODO(#2825): Failed instances should be allowed to stop, but
-                // this requires a special action because there is no sled to
-                // send the request to.
                 InstanceState::Failed | InstanceState::Destroyed => {
                     return Err(Error::invalid_request(&format!(
                         "instance state cannot be changed from {}",
@@ -898,6 +926,24 @@ impl super::Nexus {
             &requested,
         )? {
             InstanceStateChangeRequestAction::AlreadyDone => Ok(()),
+            InstanceStateChangeRequestAction::UpdateRuntime(new_runtime) => {
+                let instance_id =
+                    InstanceUuid::from_untyped_uuid(prev_instance_state.id());
+                let changed = self
+                    .datastore()
+                    .instance_update_runtime(&instance_id, &new_runtime)
+                    .await
+                    .map_err(InstanceStateChangeError::Other)?;
+                if !changed {
+                    // TODO(eliza): perhaps we should refetch the instance here
+                    // and return Ok if it was in the desired state...
+                    Err(InstanceStateChangeError::Other(Error::conflict(
+                        "instance changed state before it could be updated",
+                    )))
+                } else {
+                    Ok(())
+                }
+            }
             InstanceStateChangeRequestAction::SendToSled {
                 sled_id,
                 propolis_id,