Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nexus] Reincarnate instances with SagaUnwound VMMs #6669

Merged
merged 13 commits into from
Sep 27, 2024
35 changes: 24 additions & 11 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2882,7 +2882,8 @@ async fn cmd_db_instance_info(
vmm::dsl as vmm_dsl,
};
use nexus_db_model::{
Instance, InstanceKarmicStatus, InstanceRuntimeState, Migration, Vmm,
Instance, InstanceKarmicStatus, InstanceRuntimeState, Migration,
Reincarnatability, Vmm,
};
let InstanceInfoArgs { id } = args;

Expand Down Expand Up @@ -2943,8 +2944,9 @@ async fn cmd_db_instance_info(
const STATE: &'static str = "nexus state";
const LAST_MODIFIED: &'static str = "last modified at";
const LAST_UPDATED: &'static str = "last updated at";
const LAST_AUTO_RESTART: &'static str = "last auto-restarted at";
const KARMIC_STATUS: &'static str = "karmic status";
const LAST_AUTO_RESTART: &'static str = " last reincarnated at";
const KARMIC_STATUS: &'static str = " karmic status";
const NEEDS_REINCARNATION: &'static str = "needs reincarnation";
const ACTIVE_VMM: &'static str = "active VMM ID";
const TARGET_VMM: &'static str = "target VMM ID";
const MIGRATION_ID: &'static str = "migration ID";
Expand All @@ -2968,6 +2970,7 @@ async fn cmd_db_instance_info(
LAST_MODIFIED,
LAST_AUTO_RESTART,
KARMIC_STATUS,
NEEDS_REINCARNATION,
ACTIVE_VMM,
TARGET_VMM,
MIGRATION_ID,
Expand Down Expand Up @@ -3026,22 +3029,32 @@ async fn cmd_db_instance_info(
" {LAST_UPDATED:>WIDTH$}: {time_updated:?} (generation {})",
r#gen.0
);
println!(" {LAST_AUTO_RESTART:>WIDTH$}: {time_last_auto_restarted:?}");
match instance.auto_restart.status(&instance.runtime_state) {
InstanceKarmicStatus::NotFailed => {}
InstanceKarmicStatus::Ready => {
println!("(i) {KARMIC_STATUS:>WIDTH$}: ready to reincarnate!");

// Reincarnation status
let InstanceKarmicStatus { needs_reincarnation, can_reincarnate } =
instance
.auto_restart
.status(&instance.runtime_state, active_vmm.as_ref());
println!(
"{} {NEEDS_REINCARNATION:>WIDTH$}: {needs_reincarnation}",
if needs_reincarnation { "(i)" } else { " " }
);
match can_reincarnate {
Reincarnatability::WillReincarnate => {
println!(" {KARMIC_STATUS:>WIDTH$}: bound to saṃsāra");
}
InstanceKarmicStatus::Forbidden => {
println!("(i) {KARMIC_STATUS:>WIDTH$}: reincarnation forbidden");
Reincarnatability::Nirvana => {
println!(" {KARMIC_STATUS:>WIDTH$}: attained nirvāṇa");
}
InstanceKarmicStatus::CoolingDown(remaining) => {
Reincarnatability::CoolingDown(remaining) => {
println!(
"/!\\ {KARMIC_STATUS:>WIDTH$}: cooling down \
({remaining:?} remaining)"
);
}
}
println!(" {LAST_AUTO_RESTART:>WIDTH$}: {time_last_auto_restarted:?}");

println!(" {ACTIVE_VMM:>WIDTH$}: {propolis_id:?}");
println!(" {TARGET_VMM:>WIDTH$}: {dst_propolis_id:?}");
println!(
Expand Down
62 changes: 36 additions & 26 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1790,57 +1790,67 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(InstanceReincarnationStatus {
instances_found,
instances_reincarnated,
changed_state,
query_error,
restart_errors,
}) => {
Ok(status) => {
const FOUND: &'static str =
"instances eligible for reincarnation:";
const REINCARNATED: &'static str = " instances reincarnated:";
const REINCARNATED: &'static str =
"instances reincarnated successfully:";
const CHANGED_STATE: &'static str =
" instances which changed state before they could be reincarnated:";
"instances which changed state before they could reincarnate:";
const ERRORS: &'static str =
" instances which failed to be reincarnated:";
const COOLDOWN_PERIOD: &'static str =
"default cooldown period:";
"instances which failed to reincarnate:";
const WIDTH: usize = const_max_len(&[
FOUND,
REINCARNATED,
CHANGED_STATE,
ERRORS,
COOLDOWN_PERIOD,
]);
let n_restart_errors = restart_errors.len();
let n_restarted = instances_reincarnated.len();
let n_changed_state = changed_state.len();
println!(" {FOUND:<WIDTH$} {instances_found:>3}");
println!(" {REINCARNATED:<WIDTH$} {n_restarted:>3}");
println!(" {CHANGED_STATE:<WIDTH$} {n_changed_state:>3}",);
println!(" {ERRORS:<WIDTH$} {n_restart_errors:>3}");
if status.disabled {
println!(
" instance reincarnation explicitly disabled \
by config!"
);
return;
}

if let Some(e) = query_error {
if !status.errors.is_empty() {
println!(
" an error occurred while searching for instances \
to reincarnate:\n {e}",
" errors occurred while finding instances to \
reincarnate:"
);
for error in &status.errors {
println!(" > {error}")
}
}

let n_restart_errors = status.restart_errors.len();
let n_restarted = status.instances_reincarnated.len();
let n_changed_state = status.changed_state.len();
println!(
" {FOUND:<WIDTH$} {:>3}",
status.total_instances_found()
);
for (reason, count) in &status.instances_found {
let reason = format!(" {reason} instances:");
println!(" {reason:<WIDTH$} {count:>3}",);
}
println!(" {REINCARNATED:<WIDTH$} {n_restarted:>3}");
println!(" {CHANGED_STATE:<WIDTH$} {n_changed_state:>3}",);
println!(" {ERRORS:<WIDTH$} {n_restart_errors:>3}");

if n_restart_errors > 0 {
println!(
" errors occurred while restarting the following \
instances:"
);
for (id, error) in restart_errors {
for (id, error) in status.restart_errors {
println!(" > {id}: {error}");
}
}

if n_restarted > 0 {
println!(" the following instances have reincarnated:");
for id in instances_reincarnated {
for id in status.instances_reincarnated {
println!(" > {id}")
}
}
Expand All @@ -1850,7 +1860,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
" the following instances states changed before \
they could be reincarnated:"
);
for id in changed_state {
for id in status.changed_state {
println!(" > {id}")
}
}
Expand Down
24 changes: 14 additions & 10 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -524,14 +524,16 @@ task: "external_endpoints"
TLS certificates: 0

task: "instance_reincarnation"
configured period: every 1m
configured period: every 10m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
instances eligible for reincarnation: 0
instances reincarnated: 0
instances which changed state before they could be reincarnated: 0
instances which failed to be reincarnated: 0
instances eligible for reincarnation: 0
instance failed instances: 0
start saga failed instances: 0
instances reincarnated successfully: 0
instances which changed state before they could reincarnate: 0
instances which failed to reincarnate: 0

task: "instance_updater"
configured period: every <REDACTED_DURATION>s
Expand Down Expand Up @@ -964,14 +966,16 @@ task: "external_endpoints"
TLS certificates: 0

task: "instance_reincarnation"
configured period: every 1m
configured period: every 10m
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
instances eligible for reincarnation: 0
instances reincarnated: 0
instances which changed state before they could be reincarnated: 0
instances which failed to be reincarnated: 0
instances eligible for reincarnation: 0
instance failed instances: 0
start saga failed instances: 0
instances reincarnated successfully: 0
instances which changed state before they could reincarnate: 0
instances which failed to reincarnate: 0

task: "instance_updater"
configured period: every <REDACTED_DURATION>s
Expand Down
121 changes: 89 additions & 32 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

use super::{
ByteCount, Disk, ExternalIp, Generation, InstanceAutoRestartPolicy,
InstanceCpuCount, InstanceState,
InstanceCpuCount, InstanceState, Vmm, VmmState,
};
use crate::collection::DatastoreAttachTargetConfig;
use crate::schema::{disk, external_ip, instance};
Expand Down Expand Up @@ -266,18 +266,36 @@ pub struct InstanceAutoRestart {
pub cooldown: Option<TimeDelta>,
}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct InstanceKarmicStatus {
/// Whether the instance is permitted to reincarnate if
/// `needs_reincarnation` is `true`.
pub can_reincarnate: Reincarnatability,
/// `true` if the instance is in a state in which it could reincarnate if
/// `can_reincarnate` would permit it to do so.
pub needs_reincarnation: bool,
}

impl InstanceKarmicStatus {
/// Returns `true` if this instance is in a state that requires
/// reincarnation, and is permitted to reincarnate immediately.
pub fn should_reincarnate(&self) -> bool {
self.needs_reincarnation
&& self.can_reincarnate == Reincarnatability::WillReincarnate
}
}

/// Describes whether or not an instance can reincarnate.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum InstanceKarmicStatus {
/// The instance is ready to reincarnate.
Ready,
/// The instance does not need reincarnation, as it is not currently in the
/// `Failed` state.
NotFailed,
pub enum Reincarnatability {
/// The instance remains bound to the cycle of saṃsāra and can return in the
/// next life.
WillReincarnate,
/// The instance cannot reincarnate again until the specified time.
CoolingDown(TimeDelta),
/// The instance's auto-restart policy forbids it from reincarnating.
Forbidden,
/// The instance's auto-restart policy indicates that it has attained
/// nirvāṇa and will not reincarnate.
Nirvana,
}

impl InstanceAutoRestart {
Expand All @@ -292,20 +310,58 @@ impl InstanceAutoRestart {
pub const DEFAULT_POLICY: InstanceAutoRestartPolicy =
InstanceAutoRestartPolicy::BestEffort;

/// Returns `true` if `self` permits an instance to reincarnate given the
/// provided `state`.
pub fn status(&self, state: &InstanceRuntimeState) -> InstanceKarmicStatus {
/// Returns an instance's karmic status.
pub fn status(
&self,
state: &InstanceRuntimeState,
active_vmm: Option<&Vmm>,
) -> InstanceKarmicStatus {
// Instances only need to be automatically restarted if they are in the
// `Failed` state.
if state.nexus_state != InstanceState::Failed {
return InstanceKarmicStatus::NotFailed;
// `Failed` state, or if their active VMM is in the `SagaUnwound` state.
let needs_reincarnation = match (state.nexus_state, active_vmm) {
(InstanceState::Failed, _vmm) => {
debug_assert!(
_vmm.is_none(),
"a Failed instance will never have an active VMM!"
);
true
}
(InstanceState::Vmm, Some(ref vmm)) => {
debug_assert_eq!(
state.propolis_id,
Some(vmm.id),
"don't call `InstanceAutoRestart::status with a VMM \
that isn't this instance's active VMM!?!?"
);
// Note that we *don't* reincarnate instances with `Failed` active
// VMMs; in that case, an instance-update saga must first run to
// move the *instance* record to the `Failed` state.
vmm.runtime.state == VmmState::SagaUnwound
}
_ => false,
};

InstanceKarmicStatus {
needs_reincarnation,
can_reincarnate: self.can_reincarnate(&state),
}
}

/// Returns whether or not this auto-restart configuration will permit an
/// instance with the provided `InstanceRuntimeState` to reincarnate.
///
/// This does *not* indicate that the instance currently needs
/// reincarnation, but instead, whether the instance will be permitted to
/// reincarnate should it be in such a state.
pub fn can_reincarnate(
&self,
state: &InstanceRuntimeState,
) -> Reincarnatability {
// Check if the instance's configured auto-restart policy permits the
// control plane to automatically restart it.
let policy = self.policy.unwrap_or(Self::DEFAULT_POLICY);
if policy == InstanceAutoRestartPolicy::Never {
return InstanceKarmicStatus::Forbidden;
return Reincarnatability::Nirvana;
}

// If the instance is permitted to reincarnate, ensure that its last
Expand All @@ -317,15 +373,15 @@ impl InstanceAutoRestart {
let cooldown = self.cooldown.unwrap_or(Self::DEFAULT_COOLDOWN);
let time_since_last = Utc::now().signed_duration_since(last);
if time_since_last >= cooldown {
return InstanceKarmicStatus::Ready;
return Reincarnatability::WillReincarnate;
} else {
return InstanceKarmicStatus::CoolingDown(
return Reincarnatability::CoolingDown(
cooldown - time_since_last,
);
}
}

InstanceKarmicStatus::Ready
Reincarnatability::WillReincarnate
}

/// Filters a database query to include only instances whose auto-restart
Expand All @@ -348,19 +404,14 @@ impl InstanceAutoRestart {

let now = diesel::dsl::now.into_sql::<pg::sql_types::Timestamptz>();

dsl::state
// Only attempt to restart Failed instances.
.eq(InstanceState::Failed)
// The instance's auto-restart policy must allow the control plane
// to restart it automatically.
//
// N.B. that this may become more complex in the future if we grow
// additional auto-restart policies that require additional logic
// (such as restart limits...)
.and(
dsl::auto_restart_policy
.eq(InstanceAutoRestartPolicy::BestEffort),
)
// The instance's auto-restart policy must allow the control plane
// to restart it automatically.
//
// N.B. that this may become more complex in the future if we grow
// additional auto-restart policies that require additional logic
// (such as restart limits...)
dsl::auto_restart_policy
.eq(InstanceAutoRestartPolicy::BestEffort)
// An instance whose last reincarnation was within the cooldown
// interval from now must remain in _bardo_ --- the liminal
// state between death and rebirth --- before its next
Expand All @@ -382,6 +433,12 @@ impl InstanceAutoRestart {
.le((now - Self::DEFAULT_COOLDOWN).nullable()),
)),
)
// Deleted instances may not be reincarnated.
.and(dsl::time_deleted.is_null())
// If the instance is currently in the process of being updated,
// let's not mess with it for now and try to restart it on another
// pass.
.and(dsl::updater_id.is_null())
}
}

Expand Down
Loading
Loading