From 83592e9924a2ab35c4072d02ac96d39cb335d600 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 2 Sep 2024 11:58:36 -0700 Subject: [PATCH] update to track `SledFailuresOnly` policy 2db6eff96c0b2d7f94999b2a6332908fce09aa7e added a `SledFailuresOnly` auto-restart policy in addition to `Never` and `AllFailures`. I discussed the rationale for that in [this comment][1]. Currently, there isn't a mechanism to detect whether an instance is `Failed` because the individual instance crashed or because the whole sled was restarted, so for now, we assume all failures are instance-level. But, we still need to handle the new variant. [1]: https://github.com/oxidecomputer/omicron/pull/6499#issuecomment-2325131201 --- nexus/db-model/src/instance.rs | 11 ++++++++--- .../app/background/tasks/instance_reincarnation.rs | 10 ++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index 98195411b7c..4b1c85a90f9 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -242,9 +242,14 @@ impl InstanceRuntimeState { match policy { InstanceAutoRestart::Never => false, InstanceAutoRestart::AllFailures => true, - // TODO(eliza): future auto-restart policies may - // require additional checks here, such as a limited restart - // budget... + // TODO(eliza): currently, we don't have the ability to determine + // whether an instance is failed because the sled it was on has + // rebooted, or because the individual Propolis VMM crashed. For + // now, we assume all failures are VMM failures rather than sled + // failures. In the future, we will need to determine if a failure + // was a sled-level or VMM-level failure, and use that here to + // determine whether or not the instance is restartable. + InstanceAutoRestart::SledFailuresOnly => false, } } } diff --git a/nexus/src/app/background/tasks/instance_reincarnation.rs b/nexus/src/app/background/tasks/instance_reincarnation.rs index 05e0a22e1af..a3cf3321173 100644 --- a/nexus/src/app/background/tasks/instance_reincarnation.rs +++ b/nexus/src/app/background/tasks/instance_reincarnation.rs @@ -408,12 +408,14 @@ mod test { let mut will_not_reincarnate = std::collections::BTreeSet::new(); // Some instances which are `Failed`` but don't have policies permitting // them to be reincarnated. - for _ in 0..3 { + for policy in + [InstanceAutoRestart::Never, InstanceAutoRestart::SledFailuresOnly] + { let id = create_instance( &cptestctx, &opctx, &authz_project, - InstanceAutoRestart::Never, + policy, InstanceState::Failed, ) .await; @@ -422,12 +424,12 @@ mod test { // Some instances with policies permitting them to be reincarnated, but // which are not `Failed`. - for _ in 0..3 { + for _ in 0..2 { let id = create_instance( &cptestctx, &opctx, &authz_project, - InstanceAutoRestart::Never, + InstanceAutoRestart::AllFailures, InstanceState::NoVmm, ) .await;