Skip to content

Commit

Permalink
Move instances to Failed if sled agent returns an "unhealthy" error t…
Browse files Browse the repository at this point in the history
…ype from calls to stop/reboot (#4711)

Restore `instance_reboot` and `instance_stop` to their prior behavior:
if these routines try to contact sled agent and get back a server error,
mark the instance as unhealthy and move it to the Failed state.

Also use `#[source]` instead of message interpolation in
`InstanceStateChangeError::SledAgent`.

This restores the status quo ante from #4682 in anticipation of reaching
a better overall mechanism for dealing with failures to communicate
about instances with sled agents. See #3206, #3238, and #4226 for more
discussion.

Tests: new integration test; stood up a dev cluster, started an
instance, killed the zone with `zoneadm halt`, and verified that calls
to reboot/stop the instance eventually marked it as Failed (due to a
timeout attempting to contact the Propolis zone).

Fixes #4709.
  • Loading branch information
gjcolombo authored Dec 18, 2023
1 parent 1032885 commit 6d3b8be
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 19 deletions.
71 changes: 52 additions & 19 deletions nexus/src/app/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ type SledAgentClientError =
sled_agent_client::Error<sled_agent_client::types::Error>;

// Newtype wrapper to avoid the orphan type rule.
#[derive(Debug)]
#[derive(Debug, thiserror::Error)]
pub struct SledAgentInstancePutError(pub SledAgentClientError);

impl std::fmt::Display for SledAgentInstancePutError {
Expand Down Expand Up @@ -117,8 +117,8 @@ impl SledAgentInstancePutError {
#[derive(Debug, thiserror::Error)]
pub enum InstanceStateChangeError {
/// Sled agent returned an error from one of its instance endpoints.
#[error("sled agent client error: {0}")]
SledAgent(SledAgentInstancePutError),
#[error("sled agent client error")]
SledAgent(#[source] SledAgentInstancePutError),

/// Some other error occurred outside of the attempt to communicate with
/// sled agent.
Expand Down Expand Up @@ -624,14 +624,31 @@ impl super::Nexus {
.instance_fetch_with_vmm(opctx, &authz_instance)
.await?;

self.instance_request_state(
opctx,
&authz_instance,
state.instance(),
state.vmm(),
InstanceStateChangeRequest::Reboot,
)
.await?;
if let Err(e) = self
.instance_request_state(
opctx,
&authz_instance,
state.instance(),
state.vmm(),
InstanceStateChangeRequest::Reboot,
)
.await
{
if let InstanceStateChangeError::SledAgent(inner) = &e {
if inner.instance_unhealthy() {
let _ = self
.mark_instance_failed(
&authz_instance.id(),
state.instance().runtime(),
inner,
)
.await;
}
}

return Err(e.into());
}

self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await
}

Expand Down Expand Up @@ -711,14 +728,30 @@ impl super::Nexus {
.instance_fetch_with_vmm(opctx, &authz_instance)
.await?;

self.instance_request_state(
opctx,
&authz_instance,
state.instance(),
state.vmm(),
InstanceStateChangeRequest::Stop,
)
.await?;
if let Err(e) = self
.instance_request_state(
opctx,
&authz_instance,
state.instance(),
state.vmm(),
InstanceStateChangeRequest::Stop,
)
.await
{
if let InstanceStateChangeError::SledAgent(inner) = &e {
if inner.instance_unhealthy() {
let _ = self
.mark_instance_failed(
&authz_instance.id(),
state.instance().runtime(),
inner,
)
.await;
}
}

return Err(e.into());
}

self.db_datastore.instance_fetch_with_vmm(opctx, &authz_instance).await
}
Expand Down
80 changes: 80 additions & 0 deletions nexus/tests/integration_tests/instances.rs
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,86 @@ async fn test_instance_migrate_v2p(cptestctx: &ControlPlaneTestContext) {
}
}

// Verifies that if a request to reboot or stop an instance fails because of a
// 500-level error from sled agent, then the instance moves to the Failed state.
#[nexus_test]
async fn test_instance_failed_after_sled_agent_error(
cptestctx: &ControlPlaneTestContext,
) {
let client = &cptestctx.external_client;
let apictx = &cptestctx.server.apictx();
let nexus = &apictx.nexus;
let instance_name = "losing-is-fun";

// Create and start the test instance.
create_org_and_project(&client).await;
let instance_url = get_instance_url(instance_name);
let instance = create_instance(client, PROJECT_NAME, instance_name).await;
instance_simulate(nexus, &instance.identity.id).await;
let instance_next = instance_get(&client, &instance_url).await;
assert_eq!(instance_next.runtime.run_state, InstanceState::Running);

let sled_agent = &cptestctx.sled_agent.sled_agent;
sled_agent
.set_instance_ensure_state_error(Some(
omicron_common::api::external::Error::internal_error(
"injected by test_instance_failed_after_sled_agent_error",
),
))
.await;

let url = get_instance_url(format!("{}/reboot", instance_name).as_str());
NexusRequest::new(
RequestBuilder::new(client, Method::POST, &url)
.body(None as Option<&serde_json::Value>),
)
.authn_as(AuthnMode::PrivilegedUser)
.execute()
.await
.unwrap()
.parsed_body::<Instance>()
.expect_err("expected injected failure");

let instance_next = instance_get(&client, &instance_url).await;
assert_eq!(instance_next.runtime.run_state, InstanceState::Failed);

NexusRequest::object_delete(client, &get_instance_url(instance_name))
.authn_as(AuthnMode::PrivilegedUser)
.execute()
.await
.unwrap();

sled_agent.set_instance_ensure_state_error(None).await;

let instance = create_instance(client, PROJECT_NAME, instance_name).await;
instance_simulate(nexus, &instance.identity.id).await;
let instance_next = instance_get(&client, &instance_url).await;
assert_eq!(instance_next.runtime.run_state, InstanceState::Running);

sled_agent
.set_instance_ensure_state_error(Some(
omicron_common::api::external::Error::internal_error(
"injected by test_instance_failed_after_sled_agent_error",
),
))
.await;

let url = get_instance_url(format!("{}/stop", instance_name).as_str());
NexusRequest::new(
RequestBuilder::new(client, Method::POST, &url)
.body(None as Option<&serde_json::Value>),
)
.authn_as(AuthnMode::PrivilegedUser)
.execute()
.await
.unwrap()
.parsed_body::<Instance>()
.expect_err("expected injected failure");

let instance_next = instance_get(&client, &instance_url).await;
assert_eq!(instance_next.runtime.run_state, InstanceState::Failed);
}

/// Assert values for fleet, silo, and project using both system and silo
/// metrics endpoints
async fn assert_metrics(
Expand Down

0 comments on commit 6d3b8be

Please sign in to comment.