From cf6c4cab595c185f07c65563595b299cfdeb8e2d Mon Sep 17 00:00:00 2001
From: Eliza Weisman
Date: Tue, 1 Oct 2024 12:09:42 -0700
Subject: [PATCH] [sled-agent] Unbreak handling of Propolis error codes (#6726)

At present, sled-agent's handling of the error codes used by Propolis
to indicate that it has crashed and been restarted is woefully
incorrect. In particular, there are two cases where such an error may
be encountered by a sled-agent:

1. When attempting to ask the VMM to change state (e.g. reboot or stop
   the instance)
2. When hitting Propolis' `instance-state-monitor` API endpoint to
   proactively check the VM's current state

Neither of these is handled correctly today.

In the first case, if a sled-agent operation on behalf of Nexus
encounters a client error from Propolis, it will forward that error
code to Nexus...but _Nexus_ only moves instances to `Failed` in the
face of sled-agent's `410 NO_SUCH_INSTANCE`, and _not_ [if it sees the
`NoInstance` error code from Propolis][1], which means that the ghosts
left behind by crashed and restarted Propolii still won't be moved to
`Failed`. Agh. Furthermore, in that case, sled-agent itself will not
perform the necessary cleanup actions to deal with the now-defunct
Propolis zone (e.g. collecting a zone bundle and then destroying the
zone).

In the second case, where we hit the instance-state-monitor endpoint
and get back a `NoInstance` error, things are equally dire. The
`InstanceMonitorRunner` task, which is responsible for polling the
state monitor endpoint, will just bail out on receipt of any error from
Propolis:

https://github.com/oxidecomputer/omicron/blob/888f6a1eae91e5e7091f2e174dec7a8ee5bd04b5/sled-agent/src/instance.rs#L253-L289

We would _expect_ this to drop the send-side of the channel it uses to
communicate with the `InstanceRunner`, closing the channel, and hitting
this select, which would correctly mark the VMM as failed and do what
we want, despite eating the actual error message from Propolis:

https://github.com/oxidecomputer/omicron/blob/888f6a1eae91e5e7091f2e174dec7a8ee5bd04b5/sled-agent/src/instance.rs#L388-L394

HOWEVER, as the comment points out, the `InstanceRunner` is _also_
holding its own clone of the channel sender, keeping us from ever
hitting that case:

https://github.com/oxidecomputer/omicron/blob/888f6a1eae91e5e7091f2e174dec7a8ee5bd04b5/sled-agent/src/instance.rs#L308-L309

AFAICT, this means we'll just totally give up on monitoring the
instance as soon as we hit any error here, which seems...quite bad. I
*think* that this actually means that when a Propolis process crashes
unexpectedly, we'll get an error when the TCP connection closes, bail
out, and then _never try hitting the instance monitor endpoint ever
again_[^1]. So, we won't notice that the Propolis is actually out to
lunch until we try to ask it to change state. **This Is Not Great!**

This commit fixes both of these cases by making sled-agent actually
handle Propolis' error codes correctly. I've added a dependency on the
`propolis_api_types` crate, which is where the error code lives, and
some code in sled-agent to attempt to parse these codes when receiving
an error response from the Propolis client. Now, when we try to PUT a
new state to the instance, we'll look at the error code that comes
back, mark it as `Failed` if the error indicates that we should do so,
and publish the `Failed` VMM state to Nexus before tearing down the
zone.
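Roughly, the new handling boils down to a helper like the following (a
condensed sketch of the `propolis_error_code` helper in the diff below;
the real version also takes a logger and warns about codes it can't
parse):

```rust
use propolis_api_types::ErrorCode as PropolisErrorCode;

type PropolisClientError =
    propolis_client::Error<propolis_client::types::Error>;

// Sketch: pull a known Propolis error code out of a client error, if the
// response carried one. (Condensed from `propolis_error_code` in the diff
// below, minus the logging of unparseable codes.)
fn propolis_error_code(error: &PropolisClientError) -> Option<PropolisErrorCode> {
    // Only structured error responses from the Propolis server carry a code.
    let propolis_client::Error::ErrorResponse(rv) = error else {
        return None;
    };
    // A missing or unrecognized code is treated as "no code"; callers then
    // fall back to their ordinary error handling.
    rv.error_code.as_deref()?.parse().ok()
}
```

Callers on the state-PUT path treat `NoInstance`/`CreateFailed` as "this
VMM is gone": they publish `Failed` and tear down the zone rather than
bubbling the raw client error back up to Nexus.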
The `InstanceMonitorRunner` task now behaves similarly, and I've
changed it to retry all other errors with a backoff, rather than just
bailing out immediately on receipt of the first error.

I've manually tested this on `london` as discussed here:
https://github.com/oxidecomputer/omicron/pull/6726#issuecomment-2384012654

Unfortunately, it's hard to do any kind of automated testing of this
with the current test situation, as none of this code is exercised by
the simulated sled-agent.

Fixes #3209
Fixes #3238

[^1]: Although I've not actually verified that this is what happens.

[1]: https://github.com/oxidecomputer/propolis/blob/516dabe473cecdc3baea1db98a80441968c144cf/crates/propolis-api-types/src/lib.rs#L434-L441
---
 Cargo.lock                 | 29 +++++-
 Cargo.toml                 |  1 +
 sled-agent/Cargo.toml      |  1 +
 sled-agent/src/instance.rs | 191 ++++++++++++++++++++++++++++++-------
 4 files changed, 188 insertions(+), 34 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 63610802461..53e938bc11a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1720,6 +1720,19 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "crucible-client-types"
+version = "0.1.0"
+source = "git+https://github.com/oxidecomputer/crucible?rev=2b88ab88461fb06aaf2aab11c5e381a3cad25eac#2b88ab88461fb06aaf2aab11c5e381a3cad25eac"
+dependencies = [
+ "base64 0.22.1",
+ "crucible-workspace-hack",
+ "schemars",
+ "serde",
+ "serde_json",
+ "uuid",
+]
+
 [[package]]
 name = "crucible-common"
 version = "0.0.1"
@@ -3983,7 +3996,7 @@ dependencies = [
  "httpdate",
  "itoa",
  "pin-project-lite",
- "socket2 0.5.7",
+ "socket2 0.4.10",
  "tokio",
  "tower-service",
  "tracing",
@@ -6881,6 +6894,7 @@ dependencies = [
  "pretty_assertions",
  "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=11371b0f3743f8df5b047dc0edc2699f4bdf3927)",
  "propolis-mock-server",
+ "propolis_api_types",
  "rand",
  "rcgen",
  "reqwest 0.12.7",
@@ -8592,6 +8606,19 @@ dependencies = [
  "toml 0.7.8",
 ]
 
+[[package]]
+name = "propolis_api_types"
+version = "0.0.0"
+source = "git+https://github.com/oxidecomputer/propolis?rev=11371b0f3743f8df5b047dc0edc2699f4bdf3927#11371b0f3743f8df5b047dc0edc2699f4bdf3927"
+dependencies = [
+ "crucible-client-types",
+ "propolis_types",
+ "schemars",
+ "serde",
+ "thiserror",
+ "uuid",
+]
+
 [[package]]
 name = "propolis_types"
 version = "0.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index 98b0246d5f6..5411e233b2b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -512,6 +512,7 @@ proc-macro2 = "1.0"
 progenitor = "0.8.0"
 progenitor-client = "0.8.0"
 bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "11371b0f3743f8df5b047dc0edc2699f4bdf3927" }
+propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "11371b0f3743f8df5b047dc0edc2699f4bdf3927" }
 propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "11371b0f3743f8df5b047dc0edc2699f4bdf3927" }
 propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "11371b0f3743f8df5b047dc0edc2699f4bdf3927" }
 proptest = "1.5.0"
diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml
index b4b51e2f092..3208f1c031f 100644
--- a/sled-agent/Cargo.toml
+++ b/sled-agent/Cargo.toml
@@ -59,6 +59,7 @@ oximeter.workspace = true
 oximeter-instruments.workspace = true
 oximeter-producer.workspace = true
 oxnet.workspace = true
+propolis_api_types.workspace = true
 propolis-client.workspace = true
 propolis-mock-server.workspace = true # Only used by the simulated sled agent
 rand = { workspace = true, features = ["getrandom"] }
diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs
index 3d8441e5146..35380fa6665 100644
--- a/sled-agent/src/instance.rs
+++ b/sled-agent/src/instance.rs
@@ -17,7 +17,6 @@ use crate::profile::*;
 use crate::zone_bundle::BundleError;
 use crate::zone_bundle::ZoneBundler;
 use anyhow::anyhow;
-use backoff::BackoffError;
 use chrono::Utc;
 use illumos_utils::dladm::Etherstub;
 use illumos_utils::link::VnicAllocator;
@@ -30,9 +29,11 @@ use omicron_common::api::internal::shared::{
     NetworkInterface, ResolvedVpcFirewallRule, SledIdentifiers, SourceNatConfig,
 };
 use omicron_common::backoff;
+use omicron_common::backoff::BackoffError;
 use omicron_common::zpool_name::ZpoolName;
 use omicron_common::NoDebug;
 use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid};
+use propolis_api_types::ErrorCode as PropolisErrorCode;
 use propolis_client::Client as PropolisClient;
 use rand::prelude::IteratorRandom;
 use rand::SeedableRng;
@@ -64,7 +65,7 @@ pub enum Error {
     VnicCreation(#[from] illumos_utils::dladm::CreateVnicError),
 
     #[error("Failure from Propolis Client: {0}")]
-    Propolis(#[from] propolis_client::Error<propolis_client::types::Error>),
+    Propolis(#[from] PropolisClientError),
 
     // TODO: Remove this error; prefer to retry notifications.
     #[error("Notifying Nexus failed: {0}")]
@@ -130,6 +131,9 @@ pub enum Error {
     Terminating,
 }
 
+type PropolisClientError =
+    propolis_client::Error<propolis_client::types::Error>;
+
 // Issues read-only, idempotent HTTP requests at propolis until it responds with
 // an acknowledgement. This provides a hacky mechanism to "wait until the HTTP
 // server is serving requests".
@@ -248,24 +252,32 @@ enum InstanceRequest {
 struct InstanceMonitorRunner {
     client: Arc<PropolisClient>,
     tx_monitor: mpsc::Sender<InstanceMonitorRequest>,
+    log: slog::Logger,
 }
 
 impl InstanceMonitorRunner {
     async fn run(self) -> Result<(), anyhow::Error> {
-        let mut gen = 0;
+        let mut generation = 0;
         loop {
-            // State monitoring always returns the most recent state/gen pair
-            // known to Propolis.
-            let response = self
-                .client
-                .instance_state_monitor()
-                .body(propolis_client::types::InstanceStateMonitorRequest {
-                    gen,
-                })
-                .send()
-                .await?
-                .into_inner();
-            let observed_gen = response.gen;
+            let update = backoff::retry_notify(
+                backoff::retry_policy_local(),
+                || self.monitor(generation),
+                |error, delay| {
+                    warn!(
+                        self.log,
+                        "Failed to poll Propolis state";
+                        "error" => %error,
+                        "retry_in" => ?delay,
+                        "generation" => generation,
+                    );
+                },
+            )
+            .await?;
+
+            // Update the state generation for the next poll.
+            if let InstanceMonitorUpdate::State(ref state) = update {
+                generation = state.r#gen + 1;
+            }
 
             // Now that we have the response from Propolis' HTTP server, we
             // forward that to the InstanceRunner.
@@ -273,26 +285,59 @@ impl InstanceMonitorRunner {
             // It will decide the new state, provide that info to Nexus,
             // and possibly identify if we should terminate.
             let (tx, rx) = oneshot::channel();
-            self.tx_monitor
-                .send(InstanceMonitorRequest::Update { state: response, tx })
-                .await?;
+            self.tx_monitor.send(InstanceMonitorRequest { update, tx }).await?;
 
             if let Reaction::Terminate = rx.await? {
                 return Ok(());
             }
+        }
+    }
+
+    async fn monitor(
+        &self,
+        generation: u64,
+    ) -> Result<InstanceMonitorUpdate, BackoffError<PropolisClientError>> {
+        // State monitoring always returns the most recent state/gen pair
+        // known to Propolis.
+        let result = self
+            .client
+            .instance_state_monitor()
+            .body(propolis_client::types::InstanceStateMonitorRequest {
+                r#gen: generation,
+            })
+            .send()
+            .await;
+        match result {
+            Ok(response) => {
+                let state = response.into_inner();
 
-            // Update the generation number we're asking for, to ensure the
-            // Propolis will only return more recent values.
-            gen = observed_gen + 1;
+                Ok(InstanceMonitorUpdate::State(state))
+            }
+            // If the channel has closed, then there's nothing left for us to do
+            // here. Go die.
+            Err(e) if self.tx_monitor.is_closed() => {
+                Err(BackoffError::permanent(e))
+            }
+            // Otherwise, was there a known error code from Propolis?
+            Err(e) => propolis_error_code(&self.log, &e)
+                // If we were able to parse a known error code, send it along to
+                // the instance runner task.
+                .map(InstanceMonitorUpdate::Error)
+                // Otherwise, just keep trying until we see a good state or
+                // known error code.
+                .ok_or_else(|| BackoffError::transient(e)),
         }
     }
 }
 
-enum InstanceMonitorRequest {
-    Update {
-        state: propolis_client::types::InstanceStateMonitorResponse,
-        tx: oneshot::Sender<Reaction>,
-    },
+enum InstanceMonitorUpdate {
+    State(propolis_client::types::InstanceStateMonitorResponse),
+    Error(PropolisErrorCode),
+}
+
+struct InstanceMonitorRequest {
+    update: InstanceMonitorUpdate,
+    tx: oneshot::Sender<Reaction>,
 }
 
 struct InstanceRunner {
@@ -371,9 +416,9 @@ impl InstanceRunner {
 
                 // Handle messages from our own "Monitor the VMM" task.
                 request = self.rx_monitor.recv() => {
-                    use InstanceMonitorRequest::*;
+                    use InstanceMonitorUpdate::*;
                     match request {
-                        Some(Update { state, tx }) => {
+                        Some(InstanceMonitorRequest { update: State(state), tx }) => {
                             let observed = ObservedPropolisState::new(&state);
                             let reaction = self.observe_state(&observed).await;
                             self.publish_state_to_nexus().await;
@@ -385,6 +430,41 @@ impl InstanceRunner {
                             if let Err(_) = tx.send(reaction) {
                                 warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner");
                             }
+                        },
+                        Some(InstanceMonitorRequest { update: Error(code), tx }) => {
+                            let reaction = if code == PropolisErrorCode::NoInstance {
+                                // If we see a `NoInstance` error code from
+                                // Propolis after the instance has been ensured,
+                                // this means that Propolis must have crashed
+                                // and been restarted, and now no longer
+                                // remembers that it once had a VM. In that
+                                // case, this Propolis is permanently busted, so
+                                // mark it as Failed and tear down the zone.
+                                warn!(
+                                    self.log,
+                                    "Propolis has lost track of its instance! \
+                                     It must have crashed. Moving to Failed";
+                                    "error_code" => ?code,
+                                );
+                                self.terminate(true).await;
+                                Reaction::Terminate
+                            } else {
+                                // The other error codes we know of are not
+                                // expected here --- they all relate to the
+                                // instance-creation APIs. So I guess we'll just
+                                // whine about it and then keep trying to
+                                // monitor the instance.
+                                warn!(
+                                    self.log,
+                                    "Propolis state monitor returned an \
+                                     unexpected error code";
+                                    "error_code" => ?code,
+                                );
+                                Reaction::Continue
+                            };
+                            if let Err(_) = tx.send(reaction) {
+                                warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner");
+                            }
                         },
                         // NOTE: This case shouldn't really happen, as we keep a copy
                         // of the sender alive in "self.tx_monitor".
@@ -643,7 +723,7 @@ impl InstanceRunner {
     async fn propolis_state_put(
         &self,
         request: propolis_client::types::InstanceStateRequested,
-    ) -> Result<(), Error> {
+    ) -> Result<(), PropolisClientError> {
         let res = self
             .running_state
             .as_ref()
@@ -656,11 +736,10 @@ impl InstanceRunner {
 
         if let Err(e) = &res {
             error!(self.log, "Error from Propolis client: {:?}", e;
-                   "status" => ?e.status());
+                "status" => ?e.status());
         }
 
-        res?;
-        Ok(())
+        res.map(|_| ())
     }
 
     /// Sends an instance ensure request to this instance's Propolis.
@@ -774,6 +853,7 @@ impl InstanceRunner {
         let runner = InstanceMonitorRunner {
             client: client.clone(),
             tx_monitor: self.tx_monitor.clone(),
+            log: self.log.clone(),
         };
         let log = self.log.clone();
         let monitor_handle = tokio::task::spawn(async move {
@@ -962,6 +1042,29 @@ impl InstanceRunner {
     }
 }
 
+fn propolis_error_code(
+    log: &slog::Logger,
+    error: &PropolisClientError,
+) -> Option<PropolisErrorCode> {
+    // Is this a structured error response from the Propolis server?
+    let propolis_client::Error::ErrorResponse(ref rv) = &error else {
+        return None;
+    };
+
+    let code = rv.error_code.as_deref()?;
+    match code.parse::<PropolisErrorCode>() {
+        Err(parse_error) => {
+            warn!(log, "Propolis returned an unknown error code: {code:?}";
+                "status" => ?error.status(),
+                "error" => %error,
+                "code" => ?code,
+                "parse_error" => ?parse_error);
+            None
+        }
+        Ok(code) => Some(code),
+    }
+}
+
 /// Describes a single Propolis server that incarnates a specific instance.
 pub struct Instance {
     id: InstanceUuid,
@@ -1361,7 +1464,29 @@ impl InstanceRunner {
         };
 
         if let Some(p) = propolis_state {
-            self.propolis_state_put(p).await?;
+            if let Err(e) = self.propolis_state_put(p).await {
+                match propolis_error_code(&self.log, &e) {
+                    Some(
+                        code @ PropolisErrorCode::NoInstance
+                        | code @ PropolisErrorCode::CreateFailed,
+                    ) => {
+                        error!(self.log,
+                            "Propolis error code indicates VMM failure";
+                            "code" => ?code,
+                        );
+                        self.terminate(true).await;
+                        // We've transitioned to `Failed`, so just return the
+                        // failed state normally. We return early here instead
+                        // of falling through because we don't want to overwrite
+                        // `self.state` with the published VMM state determined
+                        // above.
+                        return Ok(self.state.sled_instance_state());
+                    }
+                    _ => {
+                        return Err(Error::Propolis(e));
+                    }
+                }
+            }
         }
         if let Some(s) = next_published {
             self.state.transition_vmm(s, Utc::now());