diff --git a/Cargo.lock b/Cargo.lock
index 6361080246..53e938bc11 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1720,6 +1720,19 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "crucible-client-types"
+version = "0.1.0"
+source = "git+https://github.com/oxidecomputer/crucible?rev=2b88ab88461fb06aaf2aab11c5e381a3cad25eac#2b88ab88461fb06aaf2aab11c5e381a3cad25eac"
+dependencies = [
+ "base64 0.22.1",
+ "crucible-workspace-hack",
+ "schemars",
+ "serde",
+ "serde_json",
+ "uuid",
+]
+
 [[package]]
 name = "crucible-common"
 version = "0.0.1"
@@ -3983,7 +3996,7 @@ dependencies = [
  "httpdate",
  "itoa",
  "pin-project-lite",
- "socket2 0.5.7",
+ "socket2 0.4.10",
  "tokio",
  "tower-service",
  "tracing",
@@ -6881,6 +6894,7 @@ dependencies = [
  "pretty_assertions",
  "propolis-client 0.1.0 (git+https://github.com/oxidecomputer/propolis?rev=11371b0f3743f8df5b047dc0edc2699f4bdf3927)",
  "propolis-mock-server",
+ "propolis_api_types",
  "rand",
  "rcgen",
  "reqwest 0.12.7",
@@ -8592,6 +8606,19 @@ dependencies = [
  "toml 0.7.8",
 ]
 
+[[package]]
+name = "propolis_api_types"
+version = "0.0.0"
+source = "git+https://github.com/oxidecomputer/propolis?rev=11371b0f3743f8df5b047dc0edc2699f4bdf3927#11371b0f3743f8df5b047dc0edc2699f4bdf3927"
+dependencies = [
+ "crucible-client-types",
+ "propolis_types",
+ "schemars",
+ "serde",
+ "thiserror",
+ "uuid",
+]
+
 [[package]]
 name = "propolis_types"
 version = "0.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index 98b0246d5f..5411e233b2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -512,6 +512,7 @@ proc-macro2 = "1.0"
 progenitor = "0.8.0"
 progenitor-client = "0.8.0"
 bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "11371b0f3743f8df5b047dc0edc2699f4bdf3927" }
+propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "11371b0f3743f8df5b047dc0edc2699f4bdf3927" }
 propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "11371b0f3743f8df5b047dc0edc2699f4bdf3927" }
 propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "11371b0f3743f8df5b047dc0edc2699f4bdf3927" }
 proptest = "1.5.0"
diff --git a/sled-agent/Cargo.toml b/sled-agent/Cargo.toml
index b4b51e2f09..3208f1c031 100644
--- a/sled-agent/Cargo.toml
+++ b/sled-agent/Cargo.toml
@@ -59,6 +59,7 @@ oximeter.workspace = true
 oximeter-instruments.workspace = true
 oximeter-producer.workspace = true
 oxnet.workspace = true
+propolis_api_types.workspace = true
 propolis-client.workspace = true
 propolis-mock-server.workspace = true # Only used by the simulated sled agent
 rand = { workspace = true, features = ["getrandom"] }
diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs
index 3d8441e514..35380fa666 100644
--- a/sled-agent/src/instance.rs
+++ b/sled-agent/src/instance.rs
@@ -17,7 +17,6 @@ use crate::profile::*;
 use crate::zone_bundle::BundleError;
 use crate::zone_bundle::ZoneBundler;
 use anyhow::anyhow;
-use backoff::BackoffError;
 use chrono::Utc;
 use illumos_utils::dladm::Etherstub;
 use illumos_utils::link::VnicAllocator;
@@ -30,9 +29,11 @@ use omicron_common::api::internal::shared::{
     NetworkInterface, ResolvedVpcFirewallRule, SledIdentifiers, SourceNatConfig,
 };
 use omicron_common::backoff;
+use omicron_common::backoff::BackoffError;
 use omicron_common::zpool_name::ZpoolName;
 use omicron_common::NoDebug;
 use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid};
+use propolis_api_types::ErrorCode as PropolisErrorCode;
 use propolis_client::Client as PropolisClient;
 use rand::prelude::IteratorRandom;
 use rand::SeedableRng;
@@ -64,7 +65,7 @@ pub enum Error {
     VnicCreation(#[from] illumos_utils::dladm::CreateVnicError),
 
     #[error("Failure from Propolis Client: {0}")]
-    Propolis(#[from] propolis_client::Error<propolis_client::types::Error>),
+    Propolis(#[from] PropolisClientError),
 
     // TODO: Remove this error; prefer to retry notifications.
     #[error("Notifying Nexus failed: {0}")]
@@ -130,6 +131,9 @@ pub enum Error {
     Terminating,
 }
 
+type PropolisClientError =
+    propolis_client::Error<propolis_client::types::Error>;
+
 // Issues read-only, idempotent HTTP requests at propolis until it responds with
 // an acknowledgement. This provides a hacky mechanism to "wait until the HTTP
 // server is serving requests".
@@ -248,24 +252,32 @@ enum InstanceRequest {
 struct InstanceMonitorRunner {
     client: Arc<PropolisClient>,
     tx_monitor: mpsc::Sender<InstanceMonitorRequest>,
+    log: slog::Logger,
 }
 
 impl InstanceMonitorRunner {
     async fn run(self) -> Result<(), anyhow::Error> {
-        let mut gen = 0;
+        let mut generation = 0;
         loop {
-            // State monitoring always returns the most recent state/gen pair
-            // known to Propolis.
-            let response = self
-                .client
-                .instance_state_monitor()
-                .body(propolis_client::types::InstanceStateMonitorRequest {
-                    gen,
-                })
-                .send()
-                .await?
-                .into_inner();
-            let observed_gen = response.gen;
+            let update = backoff::retry_notify(
+                backoff::retry_policy_local(),
+                || self.monitor(generation),
+                |error, delay| {
+                    warn!(
+                        self.log,
+                        "Failed to poll Propolis state";
+                        "error" => %error,
+                        "retry_in" => ?delay,
+                        "generation" => generation,
+                    );
+                },
+            )
+            .await?;
+
+            // Update the state generation for the next poll.
+            if let InstanceMonitorUpdate::State(ref state) = update {
+                generation = state.r#gen + 1;
+            }
 
             // Now that we have the response from Propolis' HTTP server, we
             // forward that to the InstanceRunner.
@@ -273,26 +285,59 @@ impl InstanceMonitorRunner {
             // It will decide the new state, provide that info to Nexus,
             // and possibly identify if we should terminate.
             let (tx, rx) = oneshot::channel();
-            self.tx_monitor
-                .send(InstanceMonitorRequest::Update { state: response, tx })
-                .await?;
+            self.tx_monitor.send(InstanceMonitorRequest { update, tx }).await?;
 
             if let Reaction::Terminate = rx.await? {
                 return Ok(());
             }
+        }
+    }
+
+    async fn monitor(
+        &self,
+        generation: u64,
+    ) -> Result<InstanceMonitorUpdate, BackoffError<PropolisClientError>> {
+        // State monitoring always returns the most recent state/gen pair
+        // known to Propolis.
+        let result = self
+            .client
+            .instance_state_monitor()
+            .body(propolis_client::types::InstanceStateMonitorRequest {
+                r#gen: generation,
+            })
+            .send()
+            .await;
+        match result {
+            Ok(response) => {
+                let state = response.into_inner();
 
-            // Update the generation number we're asking for, to ensure the
-            // Propolis will only return more recent values.
-            gen = observed_gen + 1;
+                Ok(InstanceMonitorUpdate::State(state))
+            }
+            // If the channel has closed, then there's nothing left for us to do
+            // here. Go die.
+            Err(e) if self.tx_monitor.is_closed() => {
+                Err(BackoffError::permanent(e))
+            }
+            // Otherwise, was there a known error code from Propolis?
+            Err(e) => propolis_error_code(&self.log, &e)
+                // If we were able to parse a known error code, send it along to
+                // the instance runner task.
+                .map(InstanceMonitorUpdate::Error)
+                // Otherwise, just keep trying until we see a good state or
+                // known error code.
+                .ok_or_else(|| BackoffError::transient(e)),
         }
     }
 }
 
-enum InstanceMonitorRequest {
-    Update {
-        state: propolis_client::types::InstanceStateMonitorResponse,
-        tx: oneshot::Sender<Reaction>,
-    },
+enum InstanceMonitorUpdate {
+    State(propolis_client::types::InstanceStateMonitorResponse),
+    Error(PropolisErrorCode),
+}
+
+struct InstanceMonitorRequest {
+    update: InstanceMonitorUpdate,
+    tx: oneshot::Sender<Reaction>,
 }
 
 struct InstanceRunner {
@@ -371,9 +416,9 @@ impl InstanceRunner {
 
                 // Handle messages from our own "Monitor the VMM" task.
                 request = self.rx_monitor.recv() => {
-                    use InstanceMonitorRequest::*;
+                    use InstanceMonitorUpdate::*;
                     match request {
-                        Some(Update { state, tx }) => {
+                        Some(InstanceMonitorRequest { update: State(state), tx }) => {
                             let observed = ObservedPropolisState::new(&state);
                             let reaction = self.observe_state(&observed).await;
                             self.publish_state_to_nexus().await;
@@ -385,6 +430,41 @@ impl InstanceRunner {
                             if let Err(_) = tx.send(reaction) {
                                 warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner");
                             }
+                        },
+                        Some(InstanceMonitorRequest { update: Error(code), tx }) => {
+                            let reaction = if code == PropolisErrorCode::NoInstance {
+                                // If we see a `NoInstance` error code from
+                                // Propolis after the instance has been ensured,
+                                // this means that Propolis must have crashed
+                                // and been restarted, and now no longer
+                                // remembers that it once had a VM. In that
+                                // case, this Propolis is permanently busted, so
+                                // mark it as Failed and tear down the zone.
+                                warn!(
+                                    self.log,
+                                    "Propolis has lost track of its instance! \
+                                     It must have crashed. Moving to Failed";
+                                    "error_code" => ?code,
+                                );
+                                self.terminate(true).await;
+                                Reaction::Terminate
+                            } else {
+                                // The other error codes we know of are not
+                                // expected here --- they all relate to the
+                                // instance-creation APIs. So I guess we'll just
+                                // whine about it and then keep trying to
+                                // monitor the instance.
+                                warn!(
+                                    self.log,
+                                    "Propolis state monitor returned an \
+                                     unexpected error code";
+                                    "error_code" => ?code,
+                                );
+                                Reaction::Continue
+                            };
+                            if let Err(_) = tx.send(reaction) {
+                                warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner");
+                            }
                         },
                         // NOTE: This case shouldn't really happen, as we keep a copy
                         // of the sender alive in "self.tx_monitor".
@@ -643,7 +723,7 @@ impl InstanceRunner {
     async fn propolis_state_put(
        &self,
        request: propolis_client::types::InstanceStateRequested,
-    ) -> Result<(), Error> {
+    ) -> Result<(), PropolisClientError> {
        let res = self
            .running_state
            .as_ref()
@@ -656,11 +736,10 @@ impl InstanceRunner {
 
        if let Err(e) = &res {
            error!(self.log, "Error from Propolis client: {:?}", e;
-                  "status" => ?e.status());
+               "status" => ?e.status());
        }
 
-       res?;
-       Ok(())
+       res.map(|_| ())
     }
 
     /// Sends an instance ensure request to this instance's Propolis.
@@ -774,6 +853,7 @@ impl InstanceRunner {
         let runner = InstanceMonitorRunner {
             client: client.clone(),
             tx_monitor: self.tx_monitor.clone(),
+            log: self.log.clone(),
         };
         let log = self.log.clone();
         let monitor_handle = tokio::task::spawn(async move {
@@ -962,6 +1042,29 @@ impl InstanceRunner {
     }
 }
 
+fn propolis_error_code(
+    log: &slog::Logger,
+    error: &PropolisClientError,
+) -> Option<PropolisErrorCode> {
+    // Is this a structured error response from the Propolis server?
+    let propolis_client::Error::ErrorResponse(ref rv) = &error else {
+        return None;
+    };
+
+    let code = rv.error_code.as_deref()?;
+    match code.parse::<PropolisErrorCode>() {
+        Err(parse_error) => {
+            warn!(log, "Propolis returned an unknown error code: {code:?}";
+                "status" => ?error.status(),
+                "error" => %error,
+                "code" => ?code,
+                "parse_error" => ?parse_error);
+            None
+        }
+        Ok(code) => Some(code),
+    }
+}
+
 /// Describes a single Propolis server that incarnates a specific instance.
 pub struct Instance {
     id: InstanceUuid,
@@ -1361,7 +1464,29 @@ impl InstanceRunner {
         };
 
         if let Some(p) = propolis_state {
-            self.propolis_state_put(p).await?;
+            if let Err(e) = self.propolis_state_put(p).await {
+                match propolis_error_code(&self.log, &e) {
+                    Some(
+                        code @ PropolisErrorCode::NoInstance
+                        | code @ PropolisErrorCode::CreateFailed,
+                    ) => {
+                        error!(self.log,
+                            "Propolis error code indicates VMM failure";
+                            "code" => ?code,
+                        );
+                        self.terminate(true).await;
+                        // We've transitioned to `Failed`, so just return the
+                        // failed state normally. We return early here instead
+                        // of falling through because we don't want to overwrite
+                        // `self.state` with the published VMM state determined
+                        // above.
+                        return Ok(self.state.sled_instance_state());
+                    }
+                    _ => {
+                        return Err(Error::Propolis(e));
+                    }
+                }
+            }
         }
         if let Some(s) = next_published {
             self.state.transition_vmm(s, Utc::now());
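
A note on the `generation` bookkeeping in `InstanceMonitorRunner::run` (`generation = state.r#gen + 1`): the Propolis instance-state-monitor endpoint is a long poll keyed by a generation number. Each response reports the generation of the state it describes, and the next request asks only for something newer. The toy below imitates that contract; `MockPropolis` is invented for illustration, and the real server blocks until a newer generation exists rather than clamping to the latest.

```rust
// Sketch of the generation-based long-poll protocol the monitor task speaks.
// `MockPropolis` is a hypothetical stand-in for the Propolis server.
struct MonitorResponse {
    generation: u64,
    state: &'static str,
}

struct MockPropolis {
    history: Vec<&'static str>,
}

impl MockPropolis {
    // Return the state whose generation is >= the requested one. The real
    // endpoint long-polls until such a state exists; this mock just clamps
    // to the newest known state for brevity.
    fn instance_state_monitor(&self, generation: u64) -> MonitorResponse {
        let idx = (generation as usize).min(self.history.len() - 1);
        MonitorResponse { generation: idx as u64, state: self.history[idx] }
    }
}

fn main() {
    let server = MockPropolis {
        history: vec!["creating", "starting", "running", "stopping", "stopped"],
    };
    let mut generation = 0;
    for _ in 0..server.history.len() {
        let resp = server.instance_state_monitor(generation);
        println!("gen {} -> {}", resp.generation, resp.state);
        // Ask only for states newer than the one we just observed, mirroring
        // `generation = state.r#gen + 1` in the monitor loop.
        generation = resp.generation + 1;
    }
}
```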
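
On the retry structure: `InstanceMonitorRunner::monitor` classifies every failure as either permanent (the monitor channel is closed, or a known error code was parsed and is handed back as an update) or transient (anything else), and `retry_notify` re-polls transient failures after a backoff delay while logging each retry. Below is a minimal sketch of that transient-vs-permanent split against the public `backoff` crate (0.4 with the `tokio` feature), which `omicron_common::backoff` builds on; the flaky operation and its attempt counter are invented for the example, not code from this change.

```rust
use std::time::Duration;

use backoff::{future::retry_notify, Error as BackoffError, ExponentialBackoff};

#[tokio::main]
async fn main() {
    let policy = ExponentialBackoff {
        max_elapsed_time: Some(Duration::from_secs(5)),
        ..Default::default()
    };

    let mut attempts = 0u32;
    let generation = retry_notify(
        policy,
        // The operation plays the role of `InstanceMonitorRunner::monitor`:
        // fail transiently twice, then "observe" a state generation.
        || {
            attempts += 1;
            let n = attempts;
            async move {
                if n <= 2 {
                    // An unrecognized failure: retried after a backoff delay,
                    // like an unparseable Propolis error.
                    Err(BackoffError::transient(format!("poll {n} failed")))
                } else {
                    // A permanent error (e.g. the monitor channel closing)
                    // would be `BackoffError::permanent(..)`, stopping retries.
                    Ok(u64::from(n))
                }
            }
        },
        // The notify callback corresponds to the `warn!` in `run()`.
        |err, delay| eprintln!("transient failure: {err}; retrying in {delay:?}"),
    )
    .await
    .expect("gave up retrying");

    println!("observed generation {generation}");
}
```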
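
Finally, `propolis_error_code` is deliberately forgiving: only a structured `ErrorResponse` carrying a parseable `error_code` yields a `PropolisErrorCode`, and an unrecognized code is logged and dropped rather than treated as fatal, so a newer Propolis can introduce codes without breaking the monitor. The sketch below shows that parse-or-ignore shape in isolation; the `ErrorCode` enum and its string spellings are hypothetical stand-ins, not the actual `propolis_api_types` definitions.

```rust
use std::str::FromStr;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ErrorCode {
    NoInstance,
    AlreadyInitialized,
    CreateFailed,
}

impl FromStr for ErrorCode {
    type Err = String;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Hypothetical spellings; the real wire format may differ.
        match s {
            "NO_INSTANCE" => Ok(Self::NoInstance),
            "ALREADY_INITIALIZED" => Ok(Self::AlreadyInitialized),
            "CREATE_FAILED" => Ok(Self::CreateFailed),
            other => Err(format!("unknown error code: {other:?}")),
        }
    }
}

fn parse_error_code(raw: Option<&str>) -> Option<ErrorCode> {
    // No structured code at all: nothing to classify.
    let code = raw?;
    match code.parse::<ErrorCode>() {
        Ok(code) => Some(code),
        Err(parse_error) => {
            // An unrecognized code is logged and ignored rather than treated
            // as fatal, so newer Propolis versions can add codes safely.
            eprintln!("unrecognized Propolis error code: {parse_error}");
            None
        }
    }
}

fn main() {
    assert_eq!(parse_error_code(Some("NO_INSTANCE")), Some(ErrorCode::NoInstance));
    assert_eq!(parse_error_code(Some("SOMETHING_NEW")), None);
    assert_eq!(parse_error_code(None), None);
}
```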