Skip to content

Commit

Permalink
make cooldown period configurable
Browse files Browse the repository at this point in the history
  • Loading branch information
hawkw committed Sep 9, 2024
1 parent f5a17b5 commit 38e3dcb
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 29 deletions.
19 changes: 12 additions & 7 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1655,9 +1655,10 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
error, details
),
Ok(InstanceReincarnationStatus {
default_cooldown,
instances_found,
instances_reincarnated,
instances_in_chill_out_time,
instances_cooling_down,
already_reincarnated,
query_error,
restart_errors,
Expand All @@ -1669,27 +1670,31 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
" instances already reincarnated:";
const ERRORS: &'static str =
" instances which failed to be reincarnated:";
const CHILLING_OUT: &'static str =
" instances still in chill-out time:";
const COOLING_DOWN: &'static str =
" instances still in cooldown:";
const COOLDOWN_PERIOD: &'static str =
"default cooldown period:";
const WIDTH: usize = const_max_len(&[
FOUND,
REINCARNATED,
ALREADY_REINCARNATED,
ERRORS,
CHILLING_OUT,
COOLING_DOWN,
COOLDOWN_PERIOD,
]);
let n_restart_errors = restart_errors.len();
let n_restarted = instances_reincarnated.len();
let n_already_restarted = already_reincarnated.len();
let n_chilling_out = instances_in_chill_out_time.len();
let n_chilling_out = instances_cooling_down.len();
println!(" {FOUND:<WIDTH$} {instances_found:>3}");
println!(" {REINCARNATED:<WIDTH$} {n_restarted:>3}");
println!(
" {ALREADY_REINCARNATED:<WIDTH$} {:>3}",
n_already_restarted
);
println!(" {CHILLING_OUT:<WIDTH$} {n_chilling_out:>3}",);
println!(" {COOLING_DOWN:<WIDTH$} {n_chilling_out:>3}");
println!(" {ERRORS:<WIDTH$} {n_restart_errors:>3}");
println!(" {COOLDOWN_PERIOD:<WIDTH$} {default_cooldown}");

if let Some(e) = query_error {
println!(
Expand Down Expand Up @@ -1731,7 +1736,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
time to chill out before\n they can be reincarnated:",
);
}
for (id, last_reincarnated) in instances_in_chill_out_time {
for (id, last_reincarnated) in instances_cooling_down {
println!(
" > {id} (reincarnated at {last_reincarnated:?})",
);
Expand Down
7 changes: 7 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,11 @@ pub struct InstancereincarnationConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,

/// default cooldown period (in seconds) before an auto-restarted instance
/// can be restarted again.
#[serde_as(as = "DurationSeconds<u64>")]
pub default_cooldown_secs: Duration,
}

#[serde_as]
Expand Down Expand Up @@ -1079,6 +1084,7 @@ mod test {
},
instance_reincarnation: InstancereincarnationConfig {
period_secs: Duration::from_secs(60),
default_cooldown_secs: Duration::from_secs(60),
},
service_firewall_propagation:
ServiceFirewallPropagationConfig {
Expand Down Expand Up @@ -1184,6 +1190,7 @@ mod test {
instance_watcher.period_secs = 30
instance_updater.period_secs = 30
instance_reincarnation.period_secs = 60
instance_reincarnation.default_cooldown_secs = 60
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
Expand Down
2 changes: 2 additions & 0 deletions nexus/examples/config-second.toml
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ instance_watcher.period_secs = 30
instance_updater.period_secs = 30
# How frequently to attempt to restart Failed instances?
instance_reincarnation.period_secs = 60
# Default cooldown period between automatic restarts for Failed instances.
instance_reincarnation.default_cooldown_secs = 60
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
Expand Down
2 changes: 2 additions & 0 deletions nexus/examples/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ instance_watcher.period_secs = 30
instance_updater.period_secs = 30
# How frequently to attempt to restart Failed instances?
instance_reincarnation.period_secs = 60
# Default cooldown period between automatic restarts for Failed instances.
instance_reincarnation.default_cooldown_secs = 60
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
Expand Down
56 changes: 35 additions & 21 deletions nexus/src/app/background/tasks/instance_reincarnation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,34 +18,36 @@ use nexus_db_queries::db::DataStore;
use nexus_types::identity::Resource;
use nexus_types::internal_api::background::InstanceReincarnationStatus;
use omicron_common::api::external::Error;
use std::num::NonZeroU32;
use std::sync::Arc;
use std::time::Duration;
use tokio::task::JoinSet;

pub struct InstanceReincarnation {
datastore: Arc<DataStore>,
sagas: Arc<dyn StartSaga>,
/// The default cooldown period between automatic restarts.
///
/// If an instance's last automatic restart occurred less than this duration
/// from now, it may not be automatically restarted until it's had some time
/// to calm down. This is intended to try and reduce the impact of tight
/// crash loops.
//
// TODO(eliza): this default should be overridden by a project-level default
// when https://github.com/oxidecomputer/omicron/issues/1015 is implemented.
default_cooldown: TimeDelta,
}

// Do not auto-restart instances that have been auto-restarted within the last
// 1 minute, to try and reduce the impact of tight crash loops.
//
// TODO(eliza): eventually, perhaps this should be a project-level configuration?
const CHILL_OUT_TIME: TimeDelta = match TimeDelta::new(60, 0) {
Some(d) => d,
// `Option::unwrap()` isn't const fn.
None => panic!(
"60 seconds definitely ought to be in range for a `TimeDelta`..."
),
};

impl BackgroundTask for InstanceReincarnation {
fn activate<'a>(
&'a mut self,
opctx: &'a OpContext,
) -> BoxFuture<'a, serde_json::Value> {
Box::pin(async move {
let mut status = InstanceReincarnationStatus::default();
status.default_cooldown = self.default_cooldown.to_std().expect(
"cooldown came from a `std::time::Duration` which was \
non-negative, so it should remain non-negative",
);
self.actually_activate(opctx, &mut status).await;
if !status.restart_errors.is_empty() || status.query_error.is_some()
{
Expand Down Expand Up @@ -84,8 +86,11 @@ impl InstanceReincarnation {
pub(crate) fn new(
datastore: Arc<DataStore>,
sagas: Arc<dyn StartSaga>,
default_cooldown: Duration,
) -> Self {
Self { datastore, sagas }
let default_cooldown = TimeDelta::from_std(default_cooldown)
.expect("duration should be in range");
Self { datastore, sagas, default_cooldown }
}

async fn actually_activate(
Expand Down Expand Up @@ -143,15 +148,18 @@ impl InstanceReincarnation {
if let Some(last_reincarnation) =
db_instance.runtime().time_last_auto_restarted
{
// TODO(eliza): allow overriding the cooldown at the project
// level, once we implement
// https://github.com/oxidecomputer/omicron/issues/1015
if Utc::now().signed_duration_since(last_reincarnation)
< CHILL_OUT_TIME
< self.default_cooldown
{
status
.instances_in_chill_out_time
.instances_cooling_down
.push((instance_id, last_reincarnation));
debug!(
opctx.log,
"instance still needs to take some time to settle \
"instance still needs to take some time to cool \
down before its next reincarnation";
"instance_id" => %instance_id,
"last_reincarnated_at" => %last_reincarnation,
Expand Down Expand Up @@ -369,8 +377,11 @@ mod test {
let authz_project = setup_test_project(&cptestctx, &opctx).await;

let starter = Arc::new(NoopStartSaga::new());
let mut task =
InstanceReincarnation::new(datastore.clone(), starter.clone());
let mut task = InstanceReincarnation::new(
datastore.clone(),
starter.clone(),
Duration::from_secs(60),
);

// Noop test
let result = task.activate(&opctx).await;
Expand Down Expand Up @@ -424,8 +435,11 @@ mod test {
let authz_project = setup_test_project(&cptestctx, &opctx).await;

let starter = Arc::new(NoopStartSaga::new());
let mut task =
InstanceReincarnation::new(datastore.clone(), starter.clone());
let mut task = InstanceReincarnation::new(
datastore.clone(),
starter.clone(),
Duration::from_secs(60),
);

// Create instances in the `Failed` state that are eligible to be
// restarted.
Expand Down
5 changes: 4 additions & 1 deletion nexus/types/src/internal_api/background.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use chrono::DateTime;
use chrono::Utc;
use serde::Deserialize;
use serde::Serialize;
use std::time::Duration;
use uuid::Uuid;

/// The status of a `region_replacement_drive` background task activation
Expand Down Expand Up @@ -60,13 +61,15 @@ pub struct RegionSnapshotReplacementFinishStatus {
/// The status of an `instance_reincarnation` background task activation.
#[derive(Default, Serialize, Deserialize, Debug)]
pub struct InstanceReincarnationStatus {
/// Default cooldown period between reincarnations.
pub default_cooldown: Duration,
/// Total number of instances in need of reincarnation on this activation.
pub instances_found: usize,
/// UUIDs of instances reincarnated successfully by this activation.
pub instances_reincarnated: Vec<Uuid>,
/// Instances which reincarnated too recently and still need to take some
/// time out to settle down a bit.
pub instances_in_chill_out_time: Vec<(Uuid, DateTime<Utc>)>,
pub instances_cooling_down: Vec<(Uuid, DateTime<Utc>)>,
/// UUIDs of instances which were reincarnated by a different Nexus'
/// instance-reincarnation task, or by a user-triggered restart saga.
pub already_reincarnated: Vec<Uuid>,
Expand Down
2 changes: 2 additions & 0 deletions smf/nexus/single-sled/config-partial.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ lookup_region_port.period_secs = 60
instance_updater.period_secs = 30
# How frequently to attempt to restart Failed instances?
instance_reincarnation.period_secs = 60
# Default cooldown period between automatic restarts for Failed instances.
instance_reincarnation.default_cooldown_secs = 60
region_snapshot_replacement_start.period_secs = 30
region_snapshot_replacement_garbage_collection.period_secs = 30
region_snapshot_replacement_step.period_secs = 30
Expand Down

0 comments on commit 38e3dcb

Please sign in to comment.