From 1368d96dfca0e81d1387b51ef312a56c272d6e73 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 27 Sep 2024 11:02:28 -0700 Subject: [PATCH 1/8] ugly first draft --- Cargo.toml | 3 ++ sled-agent/Cargo.toml | 1 + sled-agent/src/instance.rs | 57 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 07765e098e..135e6f0f3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -512,6 +512,9 @@ proc-macro2 = "1.0" progenitor = "0.8.0" progenitor-client = "0.8.0" bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } +# TODO(eliza): `propolis-client` should maybe re-export the needed types from +# this crate? +propolis-api-types = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } proptest = "1.5.0" diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index b4b51e2f09..85f03244ee 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -59,6 +59,7 @@ oximeter.workspace = true oximeter-instruments.workspace = true oximeter-producer.workspace = true oxnet.workspace = true +propolis-api-types.workspace = true propolis-client.workspace = true propolis-mock-server.workspace = true # Only used by the simulated sled agent rand = { workspace = true, features = ["getrandom"] } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 071c70a497..4e3246c6d4 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -33,6 +33,7 @@ use omicron_common::backoff; use omicron_common::zpool_name::ZpoolName; use omicron_common::NoDebug; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid}; +use propolis_api_types::ErrorCode as PropolisErrorCode; use propolis_client::Client as PropolisClient; use rand::prelude::IteratorRandom; use rand::SeedableRng; @@ -64,7 +65,7 @@ pub enum Error { VnicCreation(#[from] illumos_utils::dladm::CreateVnicError), #[error("Failure from Propolis Client: {0}")] - Propolis(#[from] propolis_client::Error), + Propolis(#[from] PropolisClientError), // TODO: Remove this error; prefer to retry notifications. #[error("Notifying Nexus failed: {0}")] @@ -130,6 +131,10 @@ pub enum Error { Terminating, } +// Make this less gross. +type PropolisClientError = + propolis_client::Error; + // Issues read-only, idempotent HTTP requests at propolis until it responds with // an acknowledgement. This provides a hacky mechanism to "wait until the HTTP // server is serving requests". @@ -655,7 +660,7 @@ impl InstanceRunner { if let Err(e) = &res { error!(self.log, "Error from Propolis client: {:?}", e; - "status" => ?e.status()); + "status" => ?e.status()); } res?; @@ -960,6 +965,29 @@ impl InstanceRunner { } } +fn propolis_error_code( + log: &slog::Logger, + &error: PropolisClientError, +) -> Option { + // Is this a structured error response from the Propolis server? + let propolis_client::Error::ErrorResponse(ref rv) = &error else { + return None; + }; + + let code = rv.error_code.as_deref()?; + match code.parse::() { + Err(parse_error) => { + error!(log, "Propolis returned an unknown error code: {code:?}"; + "status" => ?error.status(), + "error" => %error, + "code" => ?code, + "parse_error" => ?parse_error); + None + } + Ok(code) => Some(code), + } +} + /// Describes a single Propolis server that incarnates a specific instance. pub struct Instance { id: InstanceUuid, @@ -1358,7 +1386,30 @@ impl InstanceRunner { }; if let Some(p) = propolis_state { - self.propolis_state_put(p).await?; + if let Err(e) = self.propolis_state_put(p).await { + match propolis_error_code(&self.log, e) { + Some( + code @ PropolisErrorCode::NoInstance + | code @ PropolisErrorCode::CreateFailed, + ) => { + error!(self.log, + "Propolis error code indicates VMM failure"; + "code" => ?code, + ); + self.terminate(true).await; + // We've transitioned to `Failed`, so just return the + // failed state normally. We return early here instead + // of falling through because we don't want to overwrite + // that with the published VMM state determined above. + return Ok(self.state.sled_instance_state()); + } + _ => { + return Err(Error::Propolis(e)); + } + } + } else { + return Err(Error::Propolis(e)); + } } if let Some(s) = next_published { self.state.transition_vmm(s, Utc::now()); From be7c5842de01d0af0c573ee8e491f9ccfbe1a2cd Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 28 Sep 2024 21:03:11 -0700 Subject: [PATCH 2/8] wip --- Cargo.toml | 2 +- sled-agent/Cargo.toml | 2 +- sled-agent/src/instance.rs | 94 ++++++++++++++++++++++++++++---------- 3 files changed, 71 insertions(+), 27 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 135e6f0f3c..09869fcdc1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -514,7 +514,7 @@ progenitor-client = "0.8.0" bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } # TODO(eliza): `propolis-client` should maybe re-export the needed types from # this crate? -propolis-api-types = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } +propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } proptest = "1.5.0" diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml index 85f03244ee..3208f1c031 100644 --- a/sled-agent/Cargo.toml +++ b/sled-agent/Cargo.toml @@ -59,7 +59,7 @@ oximeter.workspace = true oximeter-instruments.workspace = true oximeter-producer.workspace = true oxnet.workspace = true -propolis-api-types.workspace = true +propolis_api_types.workspace = true propolis-client.workspace = true propolis-mock-server.workspace = true # Only used by the simulated sled agent rand = { workspace = true, features = ["getrandom"] } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 4e3246c6d4..06d80959df 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -253,25 +253,27 @@ enum InstanceRequest { struct InstanceMonitorRunner { client: Arc, tx_monitor: mpsc::Sender, + log: slog::Logger, + generation: i32, } impl InstanceMonitorRunner { async fn run(self) -> Result<(), anyhow::Error> { let mut gen = 0; loop { - // State monitoring always returns the most recent state/gen pair - // known to Propolis. - let response = self - .client - .instance_state_monitor() - .body(propolis_client::types::InstanceStateMonitorRequest { - gen, - }) - .send() - .await? - .into_inner(); - let observed_gen = response.gen; - + let update = backoff::retry_notify( + backoff::retry_policy_local(), + || self.monitor(), + |error, delay| { + warn!( + self.log, + "failed to poll Propolis state"; + "error" => %error + "retry_in" => ?delay, + ); + }, + ) + .await?; // Now that we have the response from Propolis' HTTP server, we // forward that to the InstanceRunner. // @@ -279,25 +281,60 @@ impl InstanceMonitorRunner { // and possibly identify if we should terminate. let (tx, rx) = oneshot::channel(); self.tx_monitor - .send(InstanceMonitorRequest::Update { state: response, tx }) + .send(InstanceMonitorRequest::Update { update, tx }) .await?; if let Reaction::Terminate = rx.await? { return Ok(()); } + } + } + + async fn monitor( + &self, + ) -> Result> + { + // State monitoring always returns the most recent state/gen pair + // known to Propolis. + let result = self + .client + .instance_state_monitor() + .body(propolis_client::types::InstanceStateMonitorRequest { + r#gen: self.generation, + }) + .send() + .await; + match result { + Ok(response) => { + let state = response.into_inner(); - // Update the generation number we're asking for, to ensure the - // Propolis will only return more recent values. - gen = observed_gen + 1; + // Update the generation number we're asking for, to ensure the + // Propolis will only return more recent values. + self.generation = response.r#gen + 1; + + Ok(InstanceMonitorUpdate::State(state)) + } + // If the channel has closed, then there's nothing left for us to do + // here. Go die. + Err(e) if self.tx_monitor.is_closed() => { + Err(backoff::Error::permanent(e)) + } + Err(e) if let Some(code) = propolis_error_code(&self.log, &e) => { + Ok(InstanceMonitorUpdate::Error(code)) + } + Err(e) => Err(backoff::Error::transient(e)), } } } -enum InstanceMonitorRequest { - Update { - state: propolis_client::types::InstanceStateMonitorResponse, - tx: oneshot::Sender, - }, +enum InstanceMonitorUpdate { + State(propolis_client::types::InstanceStateMonitorResponse), + Error(PropolisErrorCode), +} + +struct InstanceMonitorRequest { + update: InstanceMonitorUpdate, + tx: oneshot::Sender, } struct InstanceRunner { @@ -375,9 +412,9 @@ impl InstanceRunner { // Handle messages from our own "Monitor the VMM" task. request = self.rx_monitor.recv() => { - use InstanceMonitorRequest::*; + use InstanceMonitorUpdate::*; match request { - Some(Update { state, tx }) => { + Some(InstanceMonitorRequest { update: State(state), tx }) => { let observed = ObservedPropolisState::new(&state); let reaction = self.observe_state(&observed).await; self.publish_state_to_nexus().await; @@ -388,6 +425,12 @@ impl InstanceRunner { // next iteration of the loop. if let Err(_) = tx.send(reaction) { warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner"); + + + + + + } }, // NOTE: This case shouldn't really happen, as we keep a copy @@ -777,6 +820,7 @@ impl InstanceRunner { let runner = InstanceMonitorRunner { client: client.clone(), tx_monitor: self.tx_monitor.clone(), + log: self.log.clone(), }; let log = self.log.clone(); let monitor_handle = tokio::task::spawn(async move { @@ -977,7 +1021,7 @@ fn propolis_error_code( let code = rv.error_code.as_deref()?; match code.parse::() { Err(parse_error) => { - error!(log, "Propolis returned an unknown error code: {code:?}"; + warn!(log, "Propolis returned an unknown error code: {code:?}"; "status" => ?error.status(), "error" => %error, "code" => ?code, From ece0dacc489670de293a3036f91eb076c8ceb5a8 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 30 Sep 2024 10:02:45 -0700 Subject: [PATCH 3/8] try to do the right thing --- sled-agent/src/instance.rs | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 06d80959df..072c231b05 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -406,6 +406,7 @@ struct InstanceRunner { impl InstanceRunner { async fn run(mut self) { use InstanceRequest::*; + let mut has_ever_seen_a_good_state = false; while !self.should_terminate { tokio::select! { biased; @@ -415,6 +416,7 @@ impl InstanceRunner { use InstanceMonitorUpdate::*; match request { Some(InstanceMonitorRequest { update: State(state), tx }) => { + has_ever_seen_a_good_state = true; let observed = ObservedPropolisState::new(&state); let reaction = self.observe_state(&observed).await; self.publish_state_to_nexus().await; @@ -425,12 +427,32 @@ impl InstanceRunner { // next iteration of the loop. if let Err(_) = tx.send(reaction) { warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner"); + } + }, + Some(InstanceMonitorRequest { update: Error(code), tx }) => { + // TODO(eliza): this is kinda jank but Propolis + // currently returns the same error when the + // instance has not been ensured and when it's still + // coming up, which is sad... + let reaction = if has_ever_seen_a_good_state { + info!( + self.log, + "Error code from Propolis after instance initialized, assuming it's gone"; + "error" => ?code, + ); + self.terminate(mark_failed).await; + Reaction::Terminate + } else { + debug!( + self.log, + "Error code from Propolis before initialization"; + "error" => ?code, + ); + Reaction::Continue + }; - - - - - + if let Err(_) = tx.send(reaction) { + warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner"); } }, // NOTE: This case shouldn't really happen, as we keep a copy @@ -821,6 +843,7 @@ impl InstanceRunner { client: client.clone(), tx_monitor: self.tx_monitor.clone(), log: self.log.clone(), + generation: 0, }; let log = self.log.clone(); let monitor_handle = tokio::task::spawn(async move { From 464e9a15d8961488bd742a1a4b54b16c569567b7 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 30 Sep 2024 10:04:55 -0700 Subject: [PATCH 4/8] blarg --- Cargo.lock | 29 ++++++++++++++++++++++++++++- sled-agent/src/instance.rs | 2 +- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4aab6fc802..73ffb1613c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1720,6 +1720,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "crucible-client-types" +version = "0.1.0" +source = "git+https://github.com/oxidecomputer/crucible?rev=2b88ab88461fb06aaf2aab11c5e381a3cad25eac#2b88ab88461fb06aaf2aab11c5e381a3cad25eac" +dependencies = [ + "base64 0.22.1", + "crucible-workspace-hack", + "schemars", + "serde", + "serde_json", + "uuid", +] + [[package]] name = "crucible-common" version = "0.0.1" @@ -3983,7 +3996,7 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "socket2 0.5.7", + "socket2 0.4.10", "tokio", "tower-service", "tracing", @@ -6881,6 +6894,7 @@ dependencies = [ "pretty_assertions", "propolis-client", "propolis-mock-server", + "propolis_api_types", "rand", "rcgen", "reqwest 0.12.7", @@ -8571,6 +8585,19 @@ dependencies = [ "toml 0.7.8", ] +[[package]] +name = "propolis_api_types" +version = "0.0.0" +source = "git+https://github.com/oxidecomputer/propolis?rev=fae5334bcad5e864794332c6fed5e6bb9ec88831#fae5334bcad5e864794332c6fed5e6bb9ec88831" +dependencies = [ + "crucible-client-types", + "propolis_types", + "schemars", + "serde", + "thiserror", + "uuid", +] + [[package]] name = "propolis_types" version = "0.0.0" diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 072c231b05..fba4e1c9b4 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -268,7 +268,7 @@ impl InstanceMonitorRunner { warn!( self.log, "failed to poll Propolis state"; - "error" => %error + "error" => %error, "retry_in" => ?delay, ); }, From 74030f0bf12c9873890ae8e30f8c20caa479ae7f Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 30 Sep 2024 10:19:18 -0700 Subject: [PATCH 5/8] okay there we go --- sled-agent/src/instance.rs | 61 +++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index fba4e1c9b4..8588793b13 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -17,7 +17,6 @@ use crate::profile::*; use crate::zone_bundle::BundleError; use crate::zone_bundle::ZoneBundler; use anyhow::anyhow; -use backoff::BackoffError; use chrono::Utc; use illumos_utils::dladm::Etherstub; use illumos_utils::link::VnicAllocator; @@ -30,6 +29,7 @@ use omicron_common::api::internal::shared::{ NetworkInterface, ResolvedVpcFirewallRule, SledIdentifiers, SourceNatConfig, }; use omicron_common::backoff; +use omicron_common::backoff::BackoffError; use omicron_common::zpool_name::ZpoolName; use omicron_common::NoDebug; use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid}; @@ -254,26 +254,32 @@ struct InstanceMonitorRunner { client: Arc, tx_monitor: mpsc::Sender, log: slog::Logger, - generation: i32, } impl InstanceMonitorRunner { async fn run(self) -> Result<(), anyhow::Error> { - let mut gen = 0; + let mut generation = 0; loop { let update = backoff::retry_notify( backoff::retry_policy_local(), - || self.monitor(), + || self.monitor(generation), |error, delay| { warn!( self.log, - "failed to poll Propolis state"; + "Failed to poll Propolis state"; "error" => %error, "retry_in" => ?delay, + "generation" => generation, ); }, ) .await?; + + // Update the state generation for the next poll. + if let InstanceMonitorUpdate::State(ref state) = update { + generation = state.r#gen + 1; + } + // Now that we have the response from Propolis' HTTP server, we // forward that to the InstanceRunner. // @@ -281,7 +287,7 @@ impl InstanceMonitorRunner { // and possibly identify if we should terminate. let (tx, rx) = oneshot::channel(); self.tx_monitor - .send(InstanceMonitorRequest::Update { update, tx }) + .send(InstanceMonitorRequest { update, tx }) .await?; if let Reaction::Terminate = rx.await? { @@ -292,7 +298,8 @@ impl InstanceMonitorRunner { async fn monitor( &self, - ) -> Result> + generation: u64, + ) -> Result> { // State monitoring always returns the most recent state/gen pair // known to Propolis. @@ -300,7 +307,7 @@ impl InstanceMonitorRunner { .client .instance_state_monitor() .body(propolis_client::types::InstanceStateMonitorRequest { - r#gen: self.generation, + r#gen: generation, }) .send() .await; @@ -308,21 +315,21 @@ impl InstanceMonitorRunner { Ok(response) => { let state = response.into_inner(); - // Update the generation number we're asking for, to ensure the - // Propolis will only return more recent values. - self.generation = response.r#gen + 1; - Ok(InstanceMonitorUpdate::State(state)) } // If the channel has closed, then there's nothing left for us to do // here. Go die. Err(e) if self.tx_monitor.is_closed() => { - Err(backoff::Error::permanent(e)) + Err(BackoffError::permanent(e)) } - Err(e) if let Some(code) = propolis_error_code(&self.log, &e) => { - Ok(InstanceMonitorUpdate::Error(code)) - } - Err(e) => Err(backoff::Error::transient(e)), + // Otherwise, was there a known error code from Propolis? + Err(e) => propolis_error_code(&self.log, &e) + // If we were able to parse a known error code, send it along to + // the instance runner task. + .map(InstanceMonitorUpdate::Error) + // Otherwise, just keep trying until we see a good state or + // known error code. + .ok_or_else(|| BackoffError::transient(e)), } } } @@ -431,16 +438,18 @@ impl InstanceRunner { }, Some(InstanceMonitorRequest { update: Error(code), tx }) => { // TODO(eliza): this is kinda jank but Propolis - // currently returns the same error when the + // currently could return the same error when the // instance has not been ensured and when it's still - // coming up, which is sad... + // coming up, which is sad. I'm not sure whether + // that happens IRL or not --- investigate this. let reaction = if has_ever_seen_a_good_state { info!( self.log, "Error code from Propolis after instance initialized, assuming it's gone"; "error" => ?code, ); - self.terminate(mark_failed).await; + self.terminate(true).await; + self.publish_state_to_nexus().await; Reaction::Terminate } else { debug!( @@ -712,7 +721,7 @@ impl InstanceRunner { async fn propolis_state_put( &self, request: propolis_client::types::InstanceStateRequested, - ) -> Result<(), Error> { + ) -> Result<(), PropolisClientError> { let res = self .running_state .as_ref() @@ -728,8 +737,7 @@ impl InstanceRunner { "status" => ?e.status()); } - res?; - Ok(()) + res.map(|_| ()) } /// Sends an instance ensure request to this instance's Propolis. @@ -843,7 +851,6 @@ impl InstanceRunner { client: client.clone(), tx_monitor: self.tx_monitor.clone(), log: self.log.clone(), - generation: 0, }; let log = self.log.clone(); let monitor_handle = tokio::task::spawn(async move { @@ -1034,7 +1041,7 @@ impl InstanceRunner { fn propolis_error_code( log: &slog::Logger, - &error: PropolisClientError, + error: &PropolisClientError, ) -> Option { // Is this a structured error response from the Propolis server? let propolis_client::Error::ErrorResponse(ref rv) = &error else { @@ -1454,7 +1461,7 @@ impl InstanceRunner { if let Some(p) = propolis_state { if let Err(e) = self.propolis_state_put(p).await { - match propolis_error_code(&self.log, e) { + match propolis_error_code(&self.log, &e) { Some( code @ PropolisErrorCode::NoInstance | code @ PropolisErrorCode::CreateFailed, @@ -1474,8 +1481,6 @@ impl InstanceRunner { return Err(Error::Propolis(e)); } } - } else { - return Err(Error::Propolis(e)); } } if let Some(s) = next_published { From b3346d917f086c7de6cdebbe723e3017a3783abc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 30 Sep 2024 10:29:14 -0700 Subject: [PATCH 6/8] rustfmt and stuff --- sled-agent/src/instance.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 8588793b13..c2da2f9832 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -286,9 +286,7 @@ impl InstanceMonitorRunner { // It will decide the new state, provide that info to Nexus, // and possibly identify if we should terminate. let (tx, rx) = oneshot::channel(); - self.tx_monitor - .send(InstanceMonitorRequest { update, tx }) - .await?; + self.tx_monitor.send(InstanceMonitorRequest { update, tx }).await?; if let Reaction::Terminate = rx.await? { return Ok(()); @@ -299,8 +297,7 @@ impl InstanceMonitorRunner { async fn monitor( &self, generation: u64, - ) -> Result> - { + ) -> Result> { // State monitoring always returns the most recent state/gen pair // known to Propolis. let result = self From b59cfc1102d1fa4e9eb40a0c996a0cae651ecd4e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 30 Sep 2024 14:03:47 -0700 Subject: [PATCH 7/8] clean things up a bit --- sled-agent/src/instance.rs | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index c2da2f9832..dc2ef07cb9 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -410,7 +410,6 @@ struct InstanceRunner { impl InstanceRunner { async fn run(mut self) { use InstanceRequest::*; - let mut has_ever_seen_a_good_state = false; while !self.should_terminate { tokio::select! { biased; @@ -420,7 +419,6 @@ impl InstanceRunner { use InstanceMonitorUpdate::*; match request { Some(InstanceMonitorRequest { update: State(state), tx }) => { - has_ever_seen_a_good_state = true; let observed = ObservedPropolisState::new(&state); let reaction = self.observe_state(&observed).await; self.publish_state_to_nexus().await; @@ -434,29 +432,36 @@ impl InstanceRunner { } }, Some(InstanceMonitorRequest { update: Error(code), tx }) => { - // TODO(eliza): this is kinda jank but Propolis - // currently could return the same error when the - // instance has not been ensured and when it's still - // coming up, which is sad. I'm not sure whether - // that happens IRL or not --- investigate this. - let reaction = if has_ever_seen_a_good_state { - info!( + let reaction = if code == PropolisErrorCode::NoInstance { + // If we see a `NoInstance` error code from + // Propolis after the instance has been ensured, + // this means that Propolis must have crashed + // and been restarted, and now no longer + // remembers that it once had a VM. In that + // case, this Propolis is permanently busted, so + // mark it as Failed and tear down the zone. + warn!( self.log, - "Error code from Propolis after instance initialized, assuming it's gone"; - "error" => ?code, + "Propolis has lost track of its instance! \ + It must have crashed. Moving to Failed"; + "error_code" => ?code, ); self.terminate(true).await; - self.publish_state_to_nexus().await; Reaction::Terminate } else { - debug!( + // The other error codes we know of are not + // expected here --- they all relate to the + // instance-creation APIs. So I guess we'll just + // whine about it and then keep trying to + // monitor the instance. + warn!( self.log, - "Error code from Propolis before initialization"; - "error" => ?code, + "Propolis state monitor returned an \ + unexpected error code"; + "error_code" => ?code, ); Reaction::Continue }; - if let Err(_) = tx.send(reaction) { warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner"); } From 2758f9769ebfa528c150113b75273a4bb888ea87 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 1 Oct 2024 09:37:51 -0700 Subject: [PATCH 8/8] comment fixup suggested by @smklein --- Cargo.toml | 2 -- sled-agent/src/instance.rs | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 09869fcdc1..fdd5936540 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -512,8 +512,6 @@ proc-macro2 = "1.0" progenitor = "0.8.0" progenitor-client = "0.8.0" bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } -# TODO(eliza): `propolis-client` should maybe re-export the needed types from -# this crate? propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index dc2ef07cb9..594474b370 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -131,7 +131,6 @@ pub enum Error { Terminating, } -// Make this less gross. type PropolisClientError = propolis_client::Error; @@ -1476,7 +1475,8 @@ impl InstanceRunner { // We've transitioned to `Failed`, so just return the // failed state normally. We return early here instead // of falling through because we don't want to overwrite - // that with the published VMM state determined above. + // `self.state` with the published VMM state determined + // above. return Ok(self.state.sled_instance_state()); } _ => {