From 6d3b8be167f2ef328e2976586de7006bb4ddbe03 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Mon, 18 Dec 2023 11:28:07 -0800 Subject: [PATCH] Move instances to Failed if sled agent returns an "unhealthy" error type from calls to stop/reboot (#4711) Restore `instance_reboot` and `instance_stop` to their prior behavior: if these routines try to contact sled agent and get back a server error, mark the instance as unhealthy and move it to the Failed state. Also use `#[source]` instead of message interpolation in `InstanceStateChangeError::SledAgent`. This restores the status quo ante from #4682 in anticipation of reaching a better overall mechanism for dealing with failures to communicate about instances with sled agents. See #3206, #3238, and #4226 for more discussion. Tests: new integration test; stood up a dev cluster, started an instance, killed the zone with `zoneadm halt`, and verified that calls to reboot/stop the instance eventually marked it as Failed (due to a timeout attempting to contact the Propolis zone). Fixes #4709. --- nexus/src/app/instance.rs | 71 ++++++++++++++----- nexus/tests/integration_tests/instances.rs | 80 ++++++++++++++++++++++ 2 files changed, 132 insertions(+), 19 deletions(-) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 93386a66d0..4045269878 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -65,7 +65,7 @@ type SledAgentClientError = sled_agent_client::Error; // Newtype wrapper to avoid the orphan type rule. -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub struct SledAgentInstancePutError(pub SledAgentClientError); impl std::fmt::Display for SledAgentInstancePutError { @@ -117,8 +117,8 @@ impl SledAgentInstancePutError { #[derive(Debug, thiserror::Error)] pub enum InstanceStateChangeError { /// Sled agent returned an error from one of its instance endpoints. - #[error("sled agent client error: {0}")] - SledAgent(SledAgentInstancePutError), + #[error("sled agent client error")] + SledAgent(#[source] SledAgentInstancePutError), /// Some other error occurred outside of the attempt to communicate with /// sled agent. @@ -624,14 +624,31 @@ impl super::Nexus { .instance_fetch_with_vmm(opctx, &authz_instance) .await?; - self.instance_request_state( - opctx, - &authz_instance, - state.instance(), - state.vmm(), - InstanceStateChangeRequest::Reboot, - ) - .await?; + if let Err(e) = self + .instance_request_state( + opctx, + &authz_instance, + state.instance(), + state.vmm(), + InstanceStateChangeRequest::Reboot, + ) + .await + { + if let InstanceStateChangeError::SledAgent(inner) = &e { + if inner.instance_unhealthy() { + let _ = self + .mark_instance_failed( + &authz_instance.id(), + state.instance().runtime(), + inner, + ) + .await; + } + } + + return Err(e.into()); + } + self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await } @@ -711,14 +728,30 @@ impl super::Nexus { .instance_fetch_with_vmm(opctx, &authz_instance) .await?; - self.instance_request_state( - opctx, - &authz_instance, - state.instance(), - state.vmm(), - InstanceStateChangeRequest::Stop, - ) - .await?; + if let Err(e) = self + .instance_request_state( + opctx, + &authz_instance, + state.instance(), + state.vmm(), + InstanceStateChangeRequest::Stop, + ) + .await + { + if let InstanceStateChangeError::SledAgent(inner) = &e { + if inner.instance_unhealthy() { + let _ = self + .mark_instance_failed( + &authz_instance.id(), + state.instance().runtime(), + inner, + ) + .await; + } + } + + return Err(e.into()); + } self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await } diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 4acc918333..44b65fa67b 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -867,6 +867,86 @@ async fn test_instance_migrate_v2p(cptestctx: &ControlPlaneTestContext) { } } +// Verifies that if a request to reboot or stop an instance fails because of a +// 500-level error from sled agent, then the instance moves to the Failed state. +#[nexus_test] +async fn test_instance_failed_after_sled_agent_error( + cptestctx: &ControlPlaneTestContext, +) { + let client = &cptestctx.external_client; + let apictx = &cptestctx.server.apictx(); + let nexus = &apictx.nexus; + let instance_name = "losing-is-fun"; + + // Create and start the test instance. + create_org_and_project(&client).await; + let instance_url = get_instance_url(instance_name); + let instance = create_instance(client, PROJECT_NAME, instance_name).await; + instance_simulate(nexus, &instance.identity.id).await; + let instance_next = instance_get(&client, &instance_url).await; + assert_eq!(instance_next.runtime.run_state, InstanceState::Running); + + let sled_agent = &cptestctx.sled_agent.sled_agent; + sled_agent + .set_instance_ensure_state_error(Some( + omicron_common::api::external::Error::internal_error( + "injected by test_instance_failed_after_sled_agent_error", + ), + )) + .await; + + let url = get_instance_url(format!("{}/reboot", instance_name).as_str()); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(None as Option<&serde_json::Value>), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body::() + .expect_err("expected injected failure"); + + let instance_next = instance_get(&client, &instance_url).await; + assert_eq!(instance_next.runtime.run_state, InstanceState::Failed); + + NexusRequest::object_delete(client, &get_instance_url(instance_name)) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap(); + + sled_agent.set_instance_ensure_state_error(None).await; + + let instance = create_instance(client, PROJECT_NAME, instance_name).await; + instance_simulate(nexus, &instance.identity.id).await; + let instance_next = instance_get(&client, &instance_url).await; + assert_eq!(instance_next.runtime.run_state, InstanceState::Running); + + sled_agent + .set_instance_ensure_state_error(Some( + omicron_common::api::external::Error::internal_error( + "injected by test_instance_failed_after_sled_agent_error", + ), + )) + .await; + + let url = get_instance_url(format!("{}/stop", instance_name).as_str()); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &url) + .body(None as Option<&serde_json::Value>), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .unwrap() + .parsed_body::() + .expect_err("expected injected failure"); + + let instance_next = instance_get(&client, &instance_url).await; + assert_eq!(instance_next.runtime.run_state, InstanceState::Failed); +} + /// Assert values for fleet, silo, and project using both system and silo /// metrics endpoints async fn assert_metrics(