[sled-agent] Unbreak handling of Propolis error codes #6726

Merged Oct 1, 2024 · 9 commits · Changes from 6 commits
29 changes: 28 additions & 1 deletion Cargo.lock


3 changes: 3 additions & 0 deletions Cargo.toml
@@ -512,6 +512,9 @@ proc-macro2 = "1.0"
progenitor = "0.8.0"
progenitor-client = "0.8.0"
bhyve_api = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" }
# TODO(eliza): `propolis-client` should maybe re-export the needed types from
# this crate?
propolis_api_types = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" }
propolis-client = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" }
propolis-mock-server = { git = "https://github.com/oxidecomputer/propolis", rev = "fae5334bcad5e864794332c6fed5e6bb9ec88831" }
proptest = "1.5.0"
1 change: 1 addition & 0 deletions sled-agent/Cargo.toml
@@ -59,6 +59,7 @@ oximeter.workspace = true
oximeter-instruments.workspace = true
oximeter-producer.workspace = true
oxnet.workspace = true
propolis_api_types.workspace = true
propolis-client.workspace = true
propolis-mock-server.workspace = true # Only used by the simulated sled agent
rand = { workspace = true, features = ["getrandom"] }
186 changes: 153 additions & 33 deletions sled-agent/src/instance.rs
@@ -17,7 +17,6 @@ use crate::profile::*;
use crate::zone_bundle::BundleError;
use crate::zone_bundle::ZoneBundler;
use anyhow::anyhow;
use backoff::BackoffError;
use chrono::Utc;
use illumos_utils::dladm::Etherstub;
use illumos_utils::link::VnicAllocator;
@@ -30,9 +29,11 @@ use omicron_common::api::internal::shared::{
NetworkInterface, ResolvedVpcFirewallRule, SledIdentifiers, SourceNatConfig,
};
use omicron_common::backoff;
use omicron_common::backoff::BackoffError;
use omicron_common::zpool_name::ZpoolName;
use omicron_common::NoDebug;
use omicron_uuid_kinds::{GenericUuid, InstanceUuid, PropolisUuid};
use propolis_api_types::ErrorCode as PropolisErrorCode;
use propolis_client::Client as PropolisClient;
use rand::prelude::IteratorRandom;
use rand::SeedableRng;
@@ -64,7 +65,7 @@ pub enum Error {
VnicCreation(#[from] illumos_utils::dladm::CreateVnicError),

#[error("Failure from Propolis Client: {0}")]
Propolis(#[from] propolis_client::Error<propolis_client::types::Error>),
Propolis(#[from] PropolisClientError),

// TODO: Remove this error; prefer to retry notifications.
#[error("Notifying Nexus failed: {0}")]
@@ -130,6 +131,10 @@ pub enum Error {
Terminating,
}

// Make this less gross.
type PropolisClientError =
propolis_client::Error<propolis_client::types::Error>;

// Issues read-only, idempotent HTTP requests at propolis until it responds with
// an acknowledgement. This provides a hacky mechanism to "wait until the HTTP
// server is serving requests".
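
The body of that helper is collapsed in this view. A minimal sketch of the pattern it describes, reusing the `omicron_common::backoff` helpers that appear later in this file, might look like the following; the function name and the use of the generated `instance_get` call are illustrative assumptions rather than the actual implementation:

```rust
// Hypothetical sketch only: the helper's real name, body, and error handling
// live in the collapsed code above.
async fn wait_for_propolis_server(
    client: &PropolisClient,
    log: &slog::Logger,
) -> Result<(), PropolisClientError> {
    backoff::retry_notify(
        backoff::retry_policy_local(),
        || async {
            // Any read-only, idempotent request will do; we only care whether
            // the HTTP server answers at all. (`instance_get` is assumed here.)
            match client.instance_get().send().await {
                Ok(_) => Ok(()),
                // Any HTTP-level response, even an error status, means the
                // server is up; only connection-level failures are retried.
                Err(e) if e.status().is_some() => Ok(()),
                Err(e) => Err(BackoffError::transient(e)),
            }
        },
        |error, delay| {
            warn!(
                log,
                "Propolis HTTP server not ready yet";
                "error" => %error,
                "retry_in" => ?delay,
            );
        },
    )
    .await
}
```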
@@ -248,51 +253,92 @@ enum InstanceRequest {
struct InstanceMonitorRunner {
client: Arc<PropolisClient>,
tx_monitor: mpsc::Sender<InstanceMonitorRequest>,
log: slog::Logger,
}

impl InstanceMonitorRunner {
async fn run(self) -> Result<(), anyhow::Error> {
let mut gen = 0;
let mut generation = 0;
loop {
// State monitoring always returns the most recent state/gen pair
// known to Propolis.
let response = self
.client
.instance_state_monitor()
.body(propolis_client::types::InstanceStateMonitorRequest {
gen,
})
.send()
.await?
.into_inner();
let observed_gen = response.gen;
let update = backoff::retry_notify(
backoff::retry_policy_local(),
|| self.monitor(generation),
|error, delay| {
warn!(
self.log,
"Failed to poll Propolis state";
"error" => %error,
"retry_in" => ?delay,
"generation" => generation,
);
},
)
.await?;

// Update the state generation for the next poll.
if let InstanceMonitorUpdate::State(ref state) = update {
generation = state.r#gen + 1;
}

// Now that we have the response from Propolis' HTTP server, we
// forward that to the InstanceRunner.
//
// It will decide the new state, provide that info to Nexus,
// and possibly identify if we should terminate.
let (tx, rx) = oneshot::channel();
self.tx_monitor
.send(InstanceMonitorRequest::Update { state: response, tx })
.await?;
self.tx_monitor.send(InstanceMonitorRequest { update, tx }).await?;

if let Reaction::Terminate = rx.await? {
return Ok(());
}
}
}

async fn monitor(
&self,
generation: u64,
) -> Result<InstanceMonitorUpdate, BackoffError<PropolisClientError>> {
// State monitoring always returns the most recent state/gen pair
// known to Propolis.
let result = self
.client
.instance_state_monitor()
.body(propolis_client::types::InstanceStateMonitorRequest {
r#gen: generation,
})
.send()
.await;
match result {
Ok(response) => {
let state = response.into_inner();

// Update the generation number we're asking for, to ensure the
// Propolis will only return more recent values.
gen = observed_gen + 1;
Ok(InstanceMonitorUpdate::State(state))
}
// If the channel has closed, then there's nothing left for us to do
// here. Go die.
Err(e) if self.tx_monitor.is_closed() => {
Err(BackoffError::permanent(e))
}
// Otherwise, was there a known error code from Propolis?
Err(e) => propolis_error_code(&self.log, &e)
// If we were able to parse a known error code, send it along to
// the instance runner task.
.map(InstanceMonitorUpdate::Error)
// Otherwise, just keep trying until we see a good state or
// known error code.
.ok_or_else(|| BackoffError::transient(e)),
}
}
}

enum InstanceMonitorRequest {
Update {
state: propolis_client::types::InstanceStateMonitorResponse,
tx: oneshot::Sender<Reaction>,
},
enum InstanceMonitorUpdate {
State(propolis_client::types::InstanceStateMonitorResponse),
Error(PropolisErrorCode),
}

struct InstanceMonitorRequest {
update: InstanceMonitorUpdate,
tx: oneshot::Sender<Reaction>,
}
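
The monitor task and the runner now talk over a small request/reply handshake: each `InstanceMonitorRequest` carries an update plus a oneshot sender on which the runner answers with a `Reaction`. A self-contained sketch of that handshake, with stand-in types rather than the real ones, looks roughly like this:

```rust
use tokio::sync::{mpsc, oneshot};

// Stand-ins for the real types, just to illustrate the handshake.
#[derive(Debug)]
enum Reaction {
    Continue,
    Terminate,
}

struct Request {
    update: String,
    tx: oneshot::Sender<Reaction>,
}

#[tokio::main]
async fn main() {
    let (req_tx, mut req_rx) = mpsc::channel::<Request>(1);

    // "Runner": decides how to react to each update and replies on the
    // per-request oneshot channel.
    let runner = tokio::spawn(async move {
        while let Some(Request { update, tx }) = req_rx.recv().await {
            let reaction = if update == "gone" {
                Reaction::Terminate
            } else {
                Reaction::Continue
            };
            let _ = tx.send(reaction);
        }
    });

    // "Monitor": sends an update and waits for the runner's verdict before
    // deciding whether to keep polling.
    for update in ["running", "gone"] {
        let (tx, rx) = oneshot::channel();
        req_tx.send(Request { update: update.into(), tx }).await.unwrap();
        if let Reaction::Terminate = rx.await.unwrap() {
            println!("runner asked us to stop");
            break;
        }
    }

    drop(req_tx);
    runner.await.unwrap();
}
```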

struct InstanceRunner {
@@ -364,15 +410,17 @@ struct InstanceRunner {
impl InstanceRunner {
async fn run(mut self) {
use InstanceRequest::*;
let mut has_ever_seen_a_good_state = false;
while !self.should_terminate {
tokio::select! {
biased;

// Handle messages from our own "Monitor the VMM" task.
request = self.rx_monitor.recv() => {
use InstanceMonitorRequest::*;
use InstanceMonitorUpdate::*;
match request {
Some(Update { state, tx }) => {
Some(InstanceMonitorRequest { update: State(state), tx }) => {
has_ever_seen_a_good_state = true;
let observed = ObservedPropolisState::new(&state);
let reaction = self.observe_state(&observed).await;
self.publish_state_to_nexus().await;
Expand All @@ -385,6 +433,34 @@ impl InstanceRunner {
warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner");
}
},
Some(InstanceMonitorRequest { update: Error(code), tx }) => {
// TODO(eliza): this is kinda jank but Propolis
// currently could return the same error when the
// instance has not been ensured and when it's still
// coming up, which is sad. I'm not sure whether
// that happens IRL or not --- investigate this.
let reaction = if has_ever_seen_a_good_state {

[hawkw (Member, Author)] This extra junk is because I noticed that Propolis' instance-state-monitor endpoint will presently return the NoInstance error on both VmError::NotCreated and VmError::WaitingToInitialize:

https://github.com/oxidecomputer/propolis/blob/11371b0f3743f8df5b047dc0edc2699f4bdf3927/bin/propolis-server/src/lib/server.rs#L397-L399

This gave me some anxiety about sled-agent potentially deciding that a Propolis has "died" while it's actually still waiting to be born, although I'm not sure whether we would actually see the WaitingToInitialize error there in practice or not. I think the proper thing to do would be to change that behavior in Propolis, but I wanted to get something working in sled-agent regardless.

[hawkw (Member, Author)] After actually looking in Propolis, it looks like the Vm::state_watcher() function will never actually return VmError::WaitingToInitialize; it will only return VmError::NotCreated:

https://github.com/oxidecomputer/propolis/blob/11371b0f3743f8df5b047dc0edc2699f4bdf3927/bin/propolis-server/src/lib/vm/mod.rs#L353-L364

So, hitting the state-monitor endpoint should just return a Starting VMM state in that case, which is good. I still don't like that the method can return any of the error types, though, even though it doesn't currently; I'll go try to clean that up on the Propolis side.

[hawkw (Member, Author)] Okay, oxidecomputer/propolis#771 will make the Propolis side way less scary. But I think this code can also just be removed, as Propolis' current behavior is also correct.

info!(
self.log,
"Error code from Propolis after instance initialized, assuming it's gone";
"error" => ?code,
);
self.terminate(true).await;
self.publish_state_to_nexus().await;

[hawkw (Member, Author)] This is probably why we sent the failed state to nexus twice in my manual testing, so I think it can be removed.

Reaction::Terminate
} else {
debug!(
self.log,
"Error code from Propolis before initialization";
"error" => ?code,
);
Reaction::Continue
};

if let Err(_) = tx.send(reaction) {
warn!(self.log, "InstanceRunner failed to send to InstanceMonitorRunner");
}
},
// NOTE: This case shouldn't really happen, as we keep a copy
// of the sender alive in "self.tx_monitor".
None => {
@@ -642,7 +718,7 @@ impl InstanceRunner {
async fn propolis_state_put(
&self,
request: propolis_client::types::InstanceStateRequested,
) -> Result<(), Error> {
) -> Result<(), PropolisClientError> {
let res = self
.running_state
.as_ref()
Expand All @@ -655,11 +731,10 @@ impl InstanceRunner {

if let Err(e) = &res {
error!(self.log, "Error from Propolis client: {:?}", e;
"status" => ?e.status());
"status" => ?e.status());
}

res?;
Ok(())
res.map(|_| ())
}

/// Sends an instance ensure request to this instance's Propolis.
Expand Down Expand Up @@ -772,6 +847,7 @@ impl InstanceRunner {
let runner = InstanceMonitorRunner {
client: client.clone(),
tx_monitor: self.tx_monitor.clone(),
log: self.log.clone(),
};
let log = self.log.clone();
let monitor_handle = tokio::task::spawn(async move {
@@ -960,6 +1036,29 @@ impl InstanceRunner {
}
}

fn propolis_error_code(
log: &slog::Logger,
error: &PropolisClientError,
) -> Option<PropolisErrorCode> {
// Is this a structured error response from the Propolis server?
let propolis_client::Error::ErrorResponse(ref rv) = &error else {
return None;
};

let code = rv.error_code.as_deref()?;
match code.parse::<PropolisErrorCode>() {
Err(parse_error) => {
warn!(log, "Propolis returned an unknown error code: {code:?}";
"status" => ?error.status(),
"error" => %error,
"code" => ?code,
"parse_error" => ?parse_error);
None
}
Ok(code) => Some(code),
}
}
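
To make the forward-compatibility behavior of `propolis_error_code` concrete, here is a self-contained sketch using a stand-in enum (the real `PropolisErrorCode` and its `FromStr` impl come from `propolis_api_types`): codes the agent recognizes become typed values, while unknown codes fall back to `None` so a newer Propolis can add error codes without breaking an older sled-agent.

```rust
use std::str::FromStr;

// Stand-in for `propolis_api_types::ErrorCode`, for illustration only.
#[derive(Debug, PartialEq)]
enum ErrorCodeStandIn {
    NoInstance,
    CreateFailed,
}

impl FromStr for ErrorCodeStandIn {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "NoInstance" => Ok(Self::NoInstance),
            "CreateFailed" => Ok(Self::CreateFailed),
            other => Err(format!("unknown error code: {other:?}")),
        }
    }
}

fn classify(code: Option<&str>) -> Option<ErrorCodeStandIn> {
    // Mirrors `propolis_error_code`: no structured code, or a code we don't
    // recognize, means "treat this as an ordinary client error".
    code?.parse().ok()
}

fn main() {
    assert_eq!(classify(Some("NoInstance")), Some(ErrorCodeStandIn::NoInstance));
    assert_eq!(classify(Some("SomeFutureCode")), None);
    assert_eq!(classify(None), None);
    println!("ok");
}
```

Treating an unrecognized code as "no structured code" keeps the caller's behavior identical to a plain HTTP error, which the retry and error-propagation paths above already handle.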

/// Describes a single Propolis server that incarnates a specific instance.
pub struct Instance {
id: InstanceUuid,
@@ -1358,7 +1457,28 @@ impl InstanceRunner {
};

if let Some(p) = propolis_state {
self.propolis_state_put(p).await?;
if let Err(e) = self.propolis_state_put(p).await {
match propolis_error_code(&self.log, &e) {
Some(
code @ PropolisErrorCode::NoInstance
| code @ PropolisErrorCode::CreateFailed,
) => {
error!(self.log,
"Propolis error code indicates VMM failure";
"code" => ?code,
);
self.terminate(true).await;
// We've transitioned to `Failed`, so just return the
// failed state normally. We return early here instead
// of falling through because we don't want to overwrite
// that with the published VMM state determined above.
return Ok(self.state.sled_instance_state());
}
_ => {
return Err(Error::Propolis(e));
}
}
}
}
if let Some(s) = next_published {
self.state.transition_vmm(s, Utc::now());