Move instances to Failed if sled agent returns an "unhealthy" error t…

…ype from calls to stop/reboot (#4711) Restore `instance_reboot` and `instance_stop` to their prior behavior: if these routines try to contact sled agent and get back a server error, mark the instance as unhealthy and move it to the Failed state. Also use `#[source]` instead of message interpolation in `InstanceStateChangeError::SledAgent`. This restores the status quo ante from #4682 in anticipation of reaching a better overall mechanism for dealing with failures to communicate about instances with sled agents. See #3206, #3238, and #4226 for more discussion. Tests: new integration test; stood up a dev cluster, started an instance, killed the zone with `zoneadm halt`, and verified that calls to reboot/stop the instance eventually marked it as Failed (due to a timeout attempting to contact the Propolis zone). Fixes #4709.
oxidecomputer · Dec 18, 2023 · 6d3b8be · 6d3b8be
1 parent 1032885
commit 6d3b8be
Show file tree

Hide file tree

Showing 2 changed files with 132 additions and 19 deletions.
diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
@@ -65,7 +65,7 @@ type SledAgentClientError =
     sled_agent_client::Error<sled_agent_client::types::Error>;
 
 // Newtype wrapper to avoid the orphan type rule.
-#[derive(Debug)]
+#[derive(Debug, thiserror::Error)]
 pub struct SledAgentInstancePutError(pub SledAgentClientError);
 
 impl std::fmt::Display for SledAgentInstancePutError {
@@ -117,8 +117,8 @@ impl SledAgentInstancePutError {
 #[derive(Debug, thiserror::Error)]
 pub enum InstanceStateChangeError {
     /// Sled agent returned an error from one of its instance endpoints.
-    #[error("sled agent client error: {0}")]
-    SledAgent(SledAgentInstancePutError),
+    #[error("sled agent client error")]
+    SledAgent(#[source] SledAgentInstancePutError),
 
     /// Some other error occurred outside of the attempt to communicate with
     /// sled agent.
@@ -624,14 +624,31 @@ impl super::Nexus {
             .instance_fetch_with_vmm(opctx, &authz_instance)
             .await?;
 
-        self.instance_request_state(
-            opctx,
-            &authz_instance,
-            state.instance(),
-            state.vmm(),
-            InstanceStateChangeRequest::Reboot,
-        )
-        .await?;
+        if let Err(e) = self
+            .instance_request_state(
+                opctx,
+                &authz_instance,
+                state.instance(),
+                state.vmm(),
+                InstanceStateChangeRequest::Reboot,
+            )
+            .await
+        {
+            if let InstanceStateChangeError::SledAgent(inner) = &e {
+                if inner.instance_unhealthy() {
+                    let _ = self
+                        .mark_instance_failed(
+                            &authz_instance.id(),
+                            state.instance().runtime(),
+                            inner,
+                        )
+                        .await;
+                }
+            }
+
+            return Err(e.into());
+        }
+
         self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await
     }
 
@@ -711,14 +728,30 @@ impl super::Nexus {
             .instance_fetch_with_vmm(opctx, &authz_instance)
             .await?;
 
-        self.instance_request_state(
-            opctx,
-            &authz_instance,
-            state.instance(),
-            state.vmm(),
-            InstanceStateChangeRequest::Stop,
-        )
-        .await?;
+        if let Err(e) = self
+            .instance_request_state(
+                opctx,
+                &authz_instance,
+                state.instance(),
+                state.vmm(),
+                InstanceStateChangeRequest::Stop,
+            )
+            .await
+        {
+            if let InstanceStateChangeError::SledAgent(inner) = &e {
+                if inner.instance_unhealthy() {
+                    let _ = self
+                        .mark_instance_failed(
+                            &authz_instance.id(),
+                            state.instance().runtime(),
+                            inner,
+                        )
+                        .await;
+                }
+            }
+
+            return Err(e.into());
+        }
 
         self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await
     }

diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs
@@ -867,6 +867,86 @@ async fn test_instance_migrate_v2p(cptestctx: &ControlPlaneTestContext) {
     }
 }
 
+// Verifies that if a request to reboot or stop an instance fails because of a
+// 500-level error from sled agent, then the instance moves to the Failed state.
+#[nexus_test]
+async fn test_instance_failed_after_sled_agent_error(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let client = &cptestctx.external_client;
+    let apictx = &cptestctx.server.apictx();
+    let nexus = &apictx.nexus;
+    let instance_name = "losing-is-fun";
+
+    // Create and start the test instance.
+    create_org_and_project(&client).await;
+    let instance_url = get_instance_url(instance_name);
+    let instance = create_instance(client, PROJECT_NAME, instance_name).await;
+    instance_simulate(nexus, &instance.identity.id).await;
+    let instance_next = instance_get(&client, &instance_url).await;
+    assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
+
+    let sled_agent = &cptestctx.sled_agent.sled_agent;
+    sled_agent
+        .set_instance_ensure_state_error(Some(
+            omicron_common::api::external::Error::internal_error(
+                "injected by test_instance_failed_after_sled_agent_error",
+            ),
+        ))
+        .await;
+
+    let url = get_instance_url(format!("{}/reboot", instance_name).as_str());
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::POST, &url)
+            .body(None as Option<&serde_json::Value>),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .unwrap()
+    .parsed_body::<Instance>()
+    .expect_err("expected injected failure");
+
+    let instance_next = instance_get(&client, &instance_url).await;
+    assert_eq!(instance_next.runtime.run_state, InstanceState::Failed);
+
+    NexusRequest::object_delete(client, &get_instance_url(instance_name))
+        .authn_as(AuthnMode::PrivilegedUser)
+        .execute()
+        .await
+        .unwrap();
+
+    sled_agent.set_instance_ensure_state_error(None).await;
+
+    let instance = create_instance(client, PROJECT_NAME, instance_name).await;
+    instance_simulate(nexus, &instance.identity.id).await;
+    let instance_next = instance_get(&client, &instance_url).await;
+    assert_eq!(instance_next.runtime.run_state, InstanceState::Running);
+
+    sled_agent
+        .set_instance_ensure_state_error(Some(
+            omicron_common::api::external::Error::internal_error(
+                "injected by test_instance_failed_after_sled_agent_error",
+            ),
+        ))
+        .await;
+
+    let url = get_instance_url(format!("{}/stop", instance_name).as_str());
+    NexusRequest::new(
+        RequestBuilder::new(client, Method::POST, &url)
+            .body(None as Option<&serde_json::Value>),
+    )
+    .authn_as(AuthnMode::PrivilegedUser)
+    .execute()
+    .await
+    .unwrap()
+    .parsed_body::<Instance>()
+    .expect_err("expected injected failure");
+
+    let instance_next = instance_get(&client, &instance_url).await;
+    assert_eq!(instance_next.runtime.run_state, InstanceState::Failed);
+}
+
 /// Assert values for fleet, silo, and project using both system and silo
 /// metrics endpoints
 async fn assert_metrics(