From f4b29bb6165ac38f8206c759921b480acc98de87 Mon Sep 17 00:00:00 2001
From: Eliza Weisman
Date: Mon, 23 Sep 2024 13:02:51 -0700
Subject: [PATCH] [nexus] Separate action to chain instance-updates (#6630)

Currently, the `instance-update` saga's `siu_commit_instance_updates` action
attempts to check whether an additional update saga is needed and, if so, to
start one. This is done in the same action that writes back the updated
instance record. In [this comment][1] on PR #6503, @davepacheco pointed out
that conflating these two operations makes the idempotency tests for the
update saga less effective, since the action performs multiple operations
(even if some of them are permitted to fail).

This commit refactors the instance-update saga so that the chaining operation
is performed in a separate action from the rest of the saga. There should be
no functional change to the saga's behavior as a result of this, beyond
making the idempotency tests more correct.

[1]: https://github.com/oxidecomputer/omicron/pull/6503#discussion_r1765499555
---
 nexus/src/app/sagas/instance_update/mod.rs | 268 +++++++++++----------
 1 file changed, 147 insertions(+), 121 deletions(-)

diff --git a/nexus/src/app/sagas/instance_update/mod.rs b/nexus/src/app/sagas/instance_update/mod.rs
index 5464a214dc..9d2fc58512 100644
--- a/nexus/src/app/sagas/instance_update/mod.rs
+++ b/nexus/src/app/sagas/instance_update/mod.rs
@@ -250,10 +250,10 @@
 //! required.
 //!
 //! The second mechanism for ensuring updates are performed in a timely manner
-//! is what I'm calling _saga chaining_. When the final action in an
-//! instance-update saga writes back the instance record and releases the
-//! updater lock, it will then perform a second query to read the instance, VMM,
-//! and migration records. If the current state of the instance indicates that
+//! is what I'm calling _saga chaining_. After the instance-update saga writes
+//! back the instance record and releases the updater lock, the saga's final
+//! action will then perform a second query to read the instance, VMM, and
+//! migration records. If the current state of the instance indicates that
 //! another update saga is needed, then the completing saga will execute a new
 //! start saga as its final action.
 //!
@@ -805,6 +805,21 @@ declare_saga_actions! {
         + siu_commit_instance_updates
     }
 
+    // Check if the VMM or migration state has changed while the update saga was
+    // running and whether an additional update saga is now required. If one is
+    // required, try to start it.
+    //
+    // Unlike other actions, this step is optional and allowed to fail without
+    // unwinding the saga. We attempt to start a successor saga only as an
+    // optimization to improve the latency with which instance states are
+    // reconciled with VMM and migration states. If another update saga is
+    // required, it will always be started eventually by other mechanisms
+    // regardless of whether this action succeeds, and a failure here does not
+    // invalidate any of the other work performed by this saga.
+    CHAIN_SUCCESSOR_SAGA -> "chain_successor_saga" {
+        + siu_chain_successor_saga
+    }
+
 }
 
 // instance update saga: definition
@@ -907,6 +922,11 @@ impl NexusSaga for SagaDoActualInstanceUpdate {
             append_destroyed_vmm_subsaga(vmm_id, "target")?;
         }
 
+        // Finally, check if any additional updates are required to reconcile
+        // the instance, VMM, and migration states, and attempt to start a new
+        // saga if needed.
+        builder.append(chain_successor_saga_action());
+
         Ok(builder.build()?)
     }
 }
@@ -1180,138 +1200,144 @@ async fn siu_commit_instance_updates(
         nexus.vpc_needed_notify_sleds();
     }
 
-    // Check if the VMM or migration state has changed while the update saga was
-    // running and whether an additional update saga is now required. If one is
-    // required, try to start it.
-    //
-    // TODO(eliza): it would be nice if we didn't release the lock, determine
-    // the needed updates, and then start a new start-instance-update saga that
-    // re-locks the instance --- instead, perhaps we could keep the lock, and
-    // try to start a new "actual" instance update saga that inherits our lock.
-    // This way, we could also avoid computing updates required twice.
-    // But, I'm a bit sketched out by the implications of not committing update
-    // and dropping the lock in the same operation. This deserves more
-    // thought...
-
-    // Fetch the state from the database again to see if we should immediately
-    // run a new saga.
-    let new_state = match osagactx
-        .datastore()
-        .instance_fetch_all(&opctx, &authz_instance)
-        .await
-    {
-        Ok(s) => s,
-        // Do NOT unwind the rest of the saga if this fails, since we've
-        // already committed the update.
-        Err(e) => {
-            warn!(log,
-                "instance update: failed to fetch latest state after update";
-                "instance_id" => %instance_id,
-                "error" => %e,
-            );
-            nexus.background_tasks.task_instance_updater.activate();
-            return Ok(());
-        }
-    };
+    Ok(())
+}
 
-    if let Err(error) = chain_update_saga(
-        &sagactx,
-        authz_instance,
-        serialized_authn,
-        &new_state,
-    )
-    .await
-    {
-        // If starting the new update saga failed, DO NOT unwind this saga and
-        // undo all the work we've done successfully! Instead, just kick the
-        // instance-updater background task to try and start a new saga
-        // eventually, and log a warning.
-        warn!(
-            log,
-            "instance update: failed to start successor saga!";
-            "instance_id" => %instance_id,
-            "error" => %error,
-        );
-        nexus.background_tasks.task_instance_updater.activate();
-        return Ok(());
-    }
+/// Check if the VMM or migration state has changed while the update saga was
+/// running and whether an additional update saga is now required. If one is
+/// required, try to start it.
+///
+/// TODO(eliza): it would be nice if we didn't release the lock, determine
+/// the needed updates, and then start a new start-instance-update saga that
+/// re-locks the instance --- instead, perhaps we could keep the lock, and
+/// try to start a new "actual" instance update saga that inherits our lock.
+/// This way, we could also avoid computing updates required twice.
+/// But, I'm a bit sketched out by the implications of not committing update
+/// and dropping the lock in the same operation. This deserves more thought...
+async fn siu_chain_successor_saga(
+    sagactx: NexusActionContext,
+) -> Result<(), ActionError> {
+    let osagactx = sagactx.user_data();
+    let RealParams { serialized_authn, authz_instance, .. } =
+        sagactx.saga_params::<RealParams>()?;
 
-    // If the instance has transitioned to the `Failed` state, no additional
-    // update saga is required, and the instance's auto-restart policy allows it
-    // to be automatically restarted, activate the instance-reincarnation
-    // background task to automatically restart it.
-    let auto_restart = new_state.instance.auto_restart;
-    match auto_restart.status(&new_state.instance.runtime_state) {
-        InstanceKarmicStatus::Ready => {
-            info!(
+    let opctx =
+        crate::context::op_context_for_saga_action(&sagactx, &serialized_authn);
+    let log = osagactx.log();
+    let nexus = osagactx.nexus();
+
+    let instance_id = authz_instance.id();
+    // Note: unlike most other saga actions, we DO NOT return an error and
+    // unwind the update saga if this operation fails. The update saga has
+    // already committed the necessary changes to the instance record, so
+    // everything we needed to do here has already been done. Attempting to
+    // start a subsequent update saga if additional updates are required is only
+    // done as an optimization to ensure that updates are performed in a timely
+    // manner; the `instance-updater` background task will always eventually
+    // ensure that an instance's state is reconciled.
+    //
+    // Therefore, we wrap the code that starts the successor saga in an `async`
+    // block to catch any errors it returns so that we can just log a warning
+    // and kick the instance-updater background task to try and start a new saga
+    // eventually.
+    let saga_started = async {
+        // Fetch the state from the database again to see if we should
+        // immediately run a new saga.
+        let new_state = osagactx
+            .datastore()
+            .instance_fetch_all(&opctx, &authz_instance)
+            .await
+            .context("failed to fetch latest snapshot for instance")?;
+
+        if let Some(update) = UpdatesRequired::for_instance(log, &new_state) {
+            debug!(
                 log,
-                "instance update: instance transitioned to Failed, but can \
-                 be automatically restarted; activating reincarnation.";
+                "instance update: additional updates required, preparing a \
+                 successor update saga...";
                 "instance_id" => %instance_id,
-                "auto_restart" => ?auto_restart,
-                "runtime_state" => ?new_state.instance.runtime_state,
+                "update.new_runtime_state" => ?update.new_runtime,
+                "update.network_config_update" => ?update.network_config,
+                "update.destroy_active_vmm" => ?update.destroy_active_vmm,
+                "update.destroy_target_vmm" => ?update.destroy_target_vmm,
+                "update.deprovision" => ?update.deprovision,
             );
-            nexus.background_tasks.task_instance_reincarnation.activate();
-        }
-        InstanceKarmicStatus::CoolingDown(remaining) => {
+            let saga_dag = SagaInstanceUpdate::prepare(&Params {
+                serialized_authn,
+                authz_instance,
+            })
+            .context("failed to build new update saga DAG")?;
+            let saga = osagactx
+                .nexus()
+                .sagas
+                .saga_prepare(saga_dag)
+                .await
+                .context("failed to prepare new update saga")?;
+            // N.B. that we don't wait for the successor update saga to *complete*
+            // here. We just want to make sure it starts.
+            saga.start()
+                .await
+                .context("failed to start successor update saga")?;
             info!(
                 log,
-                "instance update: instance transitioned to Failed, but is \
-                 still in cooldown from a previous reincarnation";
+                "instance update: successor update saga started!";
                 "instance_id" => %instance_id,
-                "auto_restart" => ?auto_restart,
-                "cooldown_remaining" => ?remaining,
-                "runtime_state" => ?new_state.instance.runtime_state,
             );
+        } else {
+            // If the instance has transitioned to the `Failed` state and no
+            // additional update saga is required, check if the instance's
+            // auto-restart policy allows it to be automatically restarted. If
+            // it does, activate the instance-reincarnation background task to
+            // automatically restart it.
+            let auto_restart = new_state.instance.auto_restart;
+            match auto_restart.status(&new_state.instance.runtime_state) {
+                InstanceKarmicStatus::Ready => {
+                    info!(
+                        log,
+                        "instance update: instance transitioned to Failed, \
+                         but can be automatically restarted; activating \
+                         reincarnation.";
+                        "instance_id" => %instance_id,
+                        "auto_restart" => ?auto_restart,
+                        "runtime_state" => ?new_state.instance.runtime_state,
+                    );
+                    nexus
+                        .background_tasks
+                        .task_instance_reincarnation
+                        .activate();
+                }
+                InstanceKarmicStatus::CoolingDown(remaining) => {
+                    info!(
+                        log,
+                        "instance update: instance transitioned to Failed, \
+                         but is still in cooldown from a previous \
+                         reincarnation";
+                        "instance_id" => %instance_id,
+                        "auto_restart" => ?auto_restart,
+                        "cooldown_remaining" => ?remaining,
+                        "runtime_state" => ?new_state.instance.runtime_state,
+                    );
+                }
+                InstanceKarmicStatus::Forbidden
+                | InstanceKarmicStatus::NotFailed => {}
+            }
         }
-        InstanceKarmicStatus::Forbidden | InstanceKarmicStatus::NotFailed => {}
-    }
-
-    Ok(())
-}
-async fn chain_update_saga(
-    sagactx: &NexusActionContext,
-    authz_instance: authz::Instance,
-    serialized_authn: authn::saga::Serialized,
-    new_state: &InstanceGestalt,
-) -> Result<(), anyhow::Error> {
-    let osagactx = sagactx.user_data();
-    let log = osagactx.log();
-
-    let instance_id = authz_instance.id();
+        Ok::<(), anyhow::Error>(())
+    }
+    .await;
-    if let Some(update) = UpdatesRequired::for_instance(log, new_state) {
-        debug!(
-            log,
-            "instance update: additional updates required, preparing a \
-             successor update saga...";
-            "instance_id" => %instance_id,
-            "update.new_runtime_state" => ?update.new_runtime,
-            "update.network_config_update" => ?update.network_config,
-            "update.destroy_active_vmm" => ?update.destroy_active_vmm,
-            "update.destroy_target_vmm" => ?update.destroy_target_vmm,
-            "update.deprovision" => ?update.deprovision,
-        );
-        let saga_dag = SagaInstanceUpdate::prepare(&Params {
-            serialized_authn,
-            authz_instance,
-        })
-        .context("failed to build new update saga DAG")?;
-        let saga = osagactx
-            .nexus()
-            .sagas
-            .saga_prepare(saga_dag)
-            .await
-            .context("failed to prepare new update saga")?;
-        saga.start().await.context("failed to start successor update saga")?;
-        // N.B. that we don't wait for the successor update saga to *complete*
-        // here. We just want to make sure it starts.
-        info!(
+    // If we were unable to start a successor saga (or determine whether we
+    // needed to do so), log a warning, and activate the instance-updater
+    // background task in order to attempt to start a new saga again.
+    if let Err(error) = saga_started {
+        warn!(
             log,
-            "instance update: successor update saga started!";
+            "instance update: failed to start successor saga!";
            "instance_id" => %instance_id,
+            "error" => %error,
         );
+        nexus.background_tasks.task_instance_updater.activate();
+        return Ok(());
     }
 
     Ok(())
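For readers unfamiliar with the pattern introduced by `siu_chain_successor_saga`, the sketch below is a minimal, self-contained illustration of the "best-effort" structure it uses: the optional chaining work runs inside an `async` block whose error is caught, logged, and discarded, so the action itself always returns `Ok(())` and never unwinds the saga. This is not omicron code; names such as `Snapshot`, `fetch_latest_snapshot`, `start_successor`, and `chain_successor_best_effort` are hypothetical stand-ins for the real saga machinery, and the example assumes `anyhow` and `tokio` (with the `macros` and `rt-multi-thread` features) as dependencies.

```rust
// Minimal sketch (not omicron code) of a best-effort "chain a successor" step.
use anyhow::Context;

struct Snapshot {
    needs_more_work: bool,
}

async fn fetch_latest_snapshot() -> anyhow::Result<Snapshot> {
    // Stand-in for re-reading the instance, VMM, and migration records.
    Ok(Snapshot { needs_more_work: true })
}

async fn start_successor() -> anyhow::Result<()> {
    // Stand-in for preparing and starting a new update saga.
    Ok(())
}

/// Always returns `Ok(())`, even when the optional work fails, mirroring how
/// the chaining action avoids unwinding a saga whose updates are already
/// committed.
async fn chain_successor_best_effort() -> anyhow::Result<()> {
    // Run the fallible work inside an async block so its error can be caught
    // here rather than propagated to the caller.
    let saga_started = async {
        let snapshot = fetch_latest_snapshot()
            .await
            .context("failed to fetch latest snapshot")?;
        if snapshot.needs_more_work {
            start_successor().await.context("failed to start successor")?;
        }
        Ok::<(), anyhow::Error>(())
    }
    .await;

    if let Err(error) = saga_started {
        // Log a warning and rely on a periodic fallback mechanism instead of
        // failing; in the real saga this is where the background task is
        // activated.
        eprintln!("warning: failed to chain successor saga: {error:#}");
    }
    Ok(())
}

#[tokio::main]
async fn main() {
    // The caller never observes an error from the best-effort step.
    chain_successor_best_effort().await.expect("never fails");
}
```

In the real action, the logging goes through the saga's `warn!` logger and the fallback is `nexus.background_tasks.task_instance_updater.activate()`, as shown in the diff above; the sketch only demonstrates the catch-log-and-continue shape that keeps the chaining step from invalidating the saga's committed work.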