Skip to content

Commit

Permalink
include reincarnation details in OMDB and logs
Browse files Browse the repository at this point in the history
  • Loading branch information
hawkw committed Sep 21, 2024
1 parent cd3236d commit 4e56df5
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 21 deletions.
42 changes: 38 additions & 4 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2881,7 +2881,9 @@ async fn cmd_db_instance_info(
instance::dsl as instance_dsl, migration::dsl as migration_dsl,
vmm::dsl as vmm_dsl,
};
use nexus_db_model::{Instance, InstanceRuntimeState, Migration, Vmm};
use nexus_db_model::{
Instance, InstanceKarmicStatus, InstanceRuntimeState, Migration, Vmm,
};
let InstanceInfoArgs { id } = args;

let instance = instance_dsl::instance
Expand Down Expand Up @@ -2926,7 +2928,7 @@ async fn cmd_db_instance_info(
// `nexus_db_model::Instance` type will want to make sure to update this
// code as well. Unfortunately, we can't just destructure the struct here to
// make sure this code breaks, since the `identity` field isn't public.
// So...just don't forget to do that, I guess.
// So...just don't forget to do that, I guess.
const ID: &'static str = "ID";
const PROJECT_ID: &'static str = "project ID";
const NAME: &'static str = "name";
Expand All @@ -2937,10 +2939,12 @@ async fn cmd_db_instance_info(
const VCPUS: &'static str = "vCPUs";
const MEMORY: &'static str = "memory";
const HOSTNAME: &'static str = "hostname";
const AUTO_RESTART: &'static str = "auto-restart policy";
const AUTO_RESTART: &'static str = "auto-restart";
const STATE: &'static str = "nexus state";
const LAST_MODIFIED: &'static str = "last modified at";
const LAST_UPDATED: &'static str = "last updated at";
const LAST_AUTO_RESTART: &'static str = "last auto-restarted at";
const KARMIC_STATUS: &'static str = "karmic status";
const ACTIVE_VMM: &'static str = "active VMM ID";
const TARGET_VMM: &'static str = "target VMM ID";
const MIGRATION_ID: &'static str = "migration ID";
Expand All @@ -2962,6 +2966,8 @@ async fn cmd_db_instance_info(
API_STATE,
LAST_UPDATED,
LAST_MODIFIED,
LAST_AUTO_RESTART,
KARMIC_STATUS,
ACTIVE_VMM,
TARGET_VMM,
MIGRATION_ID,
Expand All @@ -2970,6 +2976,17 @@ async fn cmd_db_instance_info(
MIGRATION_RECORD,
TARGET_VMM_RECORD,
]);

/// Prints `thing` beneath the label `slug`, using the pretty-printed
/// (`{:#?}`) form of its `Debug` implementation.
///
/// The label is right-aligned to `WIDTH` columns like the other fields in
/// this output, and every line of the debug representation is indented by
/// `WIDTH - slug.len() + 8` spaces.
fn print_multiline_debug(slug: &str, thing: &impl core::fmt::Debug) {
    // Render the value first so that the whole multi-line string can be
    // indented in one pass.
    let rendered = format!("{thing:#?}");
    let prefix = " ".repeat(WIDTH - slug.len() + 8);
    let body = textwrap::indent(&rendered, &prefix);
    println!(" {slug:>WIDTH$}:\n{}", body);
}

println!("\n{:=<80}", "== INSTANCE ");
println!(" {ID:>WIDTH$}: {}", instance.id());
println!(" {PROJECT_ID:>WIDTH$}: {}", instance.project_id);
Expand All @@ -2985,7 +3002,7 @@ async fn cmd_db_instance_info(
println!(" {VCPUS:>WIDTH$}: {}", instance.ncpus.0 .0);
println!(" {MEMORY:>WIDTH$}: {}", instance.memory.0);
println!(" {HOSTNAME:>WIDTH$}: {}", instance.hostname);
println!(" {AUTO_RESTART:>WIDTH$}: {:?}", instance.auto_restart_policy);
print_multiline_debug(AUTO_RESTART, &instance.auto_restart);
println!("\n{:=<80}", "== RUNTIME STATE ");
let InstanceRuntimeState {
time_updated,
Expand All @@ -2994,6 +3011,7 @@ async fn cmd_db_instance_info(
migration_id,
nexus_state,
r#gen,
time_last_auto_restarted,
} = instance.runtime_state;
println!(" {STATE:>WIDTH$}: {nexus_state:?}");
let effective_state = InstanceAndActiveVmm::determine_effective_state(
Expand All @@ -3008,6 +3026,22 @@ async fn cmd_db_instance_info(
" {LAST_UPDATED:>WIDTH$}: {time_updated:?} (generation {})",
r#gen.0
);
println!(" {LAST_AUTO_RESTART:>WIDTH$}: {time_last_auto_restarted:?}");
match instance.auto_restart.status(&instance.runtime_state) {
InstanceKarmicStatus::NotFailed => {}
InstanceKarmicStatus::Ready => {
println!("(i) {KARMIC_STATUS:>WIDTH$}: ready to reincarnate!");
}
InstanceKarmicStatus::Forbidden => {
println!("(i) {KARMIC_STATUS:>WIDTH$}: reincarnation forbidden");
}
InstanceKarmicStatus::CoolingDown(remaining) => {
println!(
"/!\\ {KARMIC_STATUS:>WIDTH$}: cooling down \
({remaining:?} remaining)"
);
}
}
println!(" {ACTIVE_VMM:>WIDTH$}: {propolis_id:?}");
println!(" {TARGET_VMM:>WIDTH$}: {dst_propolis_id:?}");
println!(
Expand Down
33 changes: 27 additions & 6 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,20 @@ pub struct InstanceAutoRestart {
pub cooldown: Option<TimeDelta>,
}

/// Describes whether or not an instance can reincarnate (i.e. be
/// automatically restarted by the control plane), and if not, why not.
///
/// This is the result of evaluating an instance's auto-restart configuration
/// against its current runtime state.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum InstanceKarmicStatus {
/// The instance is ready to reincarnate: it is in the `Failed` state, its
/// auto-restart policy permits restarting it, and it is not cooling down
/// from a previous automatic restart.
Ready,
/// The instance does not need reincarnation, as it is not currently in the
/// `Failed` state.
NotFailed,
/// The instance cannot reincarnate yet, as it is still within the cooldown
/// period following its last automatic restart.
///
/// The contained `TimeDelta` is the amount of cooldown time *remaining*
/// (the cooldown minus the time elapsed since the last restart), not an
/// absolute point in time.
CoolingDown(TimeDelta),
/// The instance's auto-restart policy forbids it from reincarnating.
Forbidden,
}

impl InstanceAutoRestart {
/// The default cooldown used when an instance has no overridden cooldown.
pub const DEFAULT_COOLDOWN: TimeDelta = match TimeDelta::try_hours(1) {
Expand All @@ -280,18 +294,18 @@ impl InstanceAutoRestart {

/// Returns an [`InstanceKarmicStatus`] describing whether `self` permits an
/// instance to reincarnate given the provided `state`.
pub fn can_reincarnate(&self, state: &InstanceRuntimeState) -> bool {
pub fn status(&self, state: &InstanceRuntimeState) -> InstanceKarmicStatus {
// Instances only need to be automatically restarted if they are in the
// `Failed` state.
if state.nexus_state != InstanceState::Failed {
return false;
return InstanceKarmicStatus::NotFailed;
}

// Check if the instance's configured auto-restart policy permits the
// control plane to automatically restart it.
let policy = self.policy.unwrap_or(Self::DEFAULT_POLICY);
if policy == InstanceAutoRestartPolicy::Never {
return false;
return InstanceKarmicStatus::Forbidden;
}

// If the instance is permitted to reincarnate, ensure that its last
Expand All @@ -301,10 +315,17 @@ impl InstanceAutoRestart {
// Eventually, we may also allow a project-level default, so we will
// need to consider that as well.
let cooldown = self.cooldown.unwrap_or(Self::DEFAULT_COOLDOWN);
Utc::now().signed_duration_since(last) >= cooldown
} else {
true
let time_since_last = Utc::now().signed_duration_since(last);
if time_since_last >= cooldown {
return InstanceKarmicStatus::Ready;
} else {
return InstanceKarmicStatus::CoolingDown(
cooldown - time_since_last,
);
}
}

InstanceKarmicStatus::Ready
}

/// Filters a database query to include only instances whose auto-restart
Expand Down
37 changes: 26 additions & 11 deletions nexus/src/app/sagas/instance_update/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ use crate::app::db::datastore::VmmStateUpdateResult;
use crate::app::db::lookup::LookupPath;
use crate::app::db::model::ByteCount;
use crate::app::db::model::Generation;
use crate::app::db::model::InstanceKarmicStatus;
use crate::app::db::model::InstanceRuntimeState;
use crate::app::db::model::InstanceState;
use crate::app::db::model::MigrationState;
Expand Down Expand Up @@ -1239,17 +1240,31 @@ async fn siu_commit_instance_updates(
// update saga is required, and the instance's auto-restart policy allows it
// to be automatically restarted, activate the instance-reincarnation
// background task to automatically restart it.
let auto_restart = &new_state.instance.auto_restart;
if auto_restart.can_reincarnate(&new_state.instance.runtime_state) {
info!(
log,
"instance update: instance transitioned to Failed, but can \
be automatically restarted; activating reincarnation.";
"instance_id" => %instance_id,
"auto_restart" => ?auto_restart,
"runtime_state" => ?new_state.instance.runtime_state,
);
nexus.background_tasks.task_instance_reincarnation.activate();
let auto_restart = new_state.instance.auto_restart;
// Evaluate the instance's auto-restart configuration against its new
// runtime state, and act on (or at least log) the outcome.
match auto_restart.status(&new_state.instance.runtime_state) {
    // The instance has failed and may be restarted right away: wake up the
    // instance-reincarnation background task so that it is restarted
    // promptly.
    InstanceKarmicStatus::Ready => {
        info!(
            log,
            "instance update: instance transitioned to Failed, but can \
             be automatically restarted; activating reincarnation.";
            "instance_id" => %instance_id,
            "auto_restart" => ?auto_restart,
            "runtime_state" => ?new_state.instance.runtime_state,
        );
        nexus.background_tasks.task_instance_reincarnation.activate();
    }
    // The instance has failed, but was automatically restarted too
    // recently. Don't activate the background task; just record how much
    // of the cooldown period remains.
    InstanceKarmicStatus::CoolingDown(remaining) => {
        info!(
            log,
            "instance update: instance transitioned to Failed, but is \
             still in cooldown from a previous reincarnation";
            "instance_id" => %instance_id,
            "auto_restart" => ?auto_restart,
            "cooldown_remaining" => ?remaining,
            "runtime_state" => ?new_state.instance.runtime_state,
        );
    }
    // Either the instance's policy forbids automatic restarts, or the
    // instance has not failed at all; in both cases there is nothing to do.
    InstanceKarmicStatus::Forbidden | InstanceKarmicStatus::NotFailed => {}
}

Ok(())
Expand Down

0 comments on commit 4e56df5

Please sign in to comment.