diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 7af878e2bc3..2a88e4002a8 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -1655,9 +1655,10 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { error, details ), Ok(InstanceReincarnationStatus { + default_cooldown, instances_found, instances_reincarnated, - instances_in_chill_out_time, + instances_cooling_down, already_reincarnated, query_error, restart_errors, @@ -1669,27 +1670,31 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { " instances already reincarnated:"; const ERRORS: &'static str = " instances which failed to be reincarnated:"; - const CHILLING_OUT: &'static str = - " instances still in chill-out time:"; + const COOLING_DOWN: &'static str = + " instances still in cooldown:"; + const COOLDOWN_PERIOD: &'static str = + "default cooldown period:"; const WIDTH: usize = const_max_len(&[ FOUND, REINCARNATED, ALREADY_REINCARNATED, ERRORS, - CHILLING_OUT, + COOLING_DOWN, + COOLDOWN_PERIOD, ]); let n_restart_errors = restart_errors.len(); let n_restarted = instances_reincarnated.len(); let n_already_restarted = already_reincarnated.len(); - let n_chilling_out = instances_in_chill_out_time.len(); + let n_chilling_out = instances_cooling_down.len(); println!(" {FOUND:3}"); println!(" {REINCARNATED:3}"); println!( " {ALREADY_REINCARNATED:3}", n_already_restarted ); - println!(" {CHILLING_OUT:3}",); + println!(" {COOLING_DOWN:3}"); println!(" {ERRORS:3}"); + println!(" {COOLDOWN_PERIOD: {id} (reincarnated at {last_reincarnated:?})", ); diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index 35f91cd34f3..b43f8a79eb3 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -597,6 +597,11 @@ pub struct InstancereincarnationConfig { /// period (in seconds) for periodic activations of this background task #[serde_as(as = "DurationSeconds")] pub period_secs: Duration, + + /// default cooldown period (in seconds) before an auto-restarted instance + /// can be restarted again. + #[serde_as(as = "DurationSeconds")] + pub default_cooldown_secs: Duration, } #[serde_as] @@ -1079,6 +1084,7 @@ mod test { }, instance_reincarnation: InstancereincarnationConfig { period_secs: Duration::from_secs(60), + default_cooldown_secs: Duration::from_secs(60), }, service_firewall_propagation: ServiceFirewallPropagationConfig { @@ -1184,6 +1190,7 @@ mod test { instance_watcher.period_secs = 30 instance_updater.period_secs = 30 instance_reincarnation.period_secs = 60 + instance_reincarnation.default_cooldown_secs = 60 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index f5d0b240da4..ddfec40f3b0 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -136,6 +136,8 @@ instance_watcher.period_secs = 30 instance_updater.period_secs = 30 # How frequently to attempt to restart Failed instances? instance_reincarnation.period_secs = 60 +# Default cooldown period between automatic restarts for Failed instances. +instance_reincarnation.default_cooldown_secs = 60 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index c653acb8239..f24322e6794 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -122,6 +122,8 @@ instance_watcher.period_secs = 30 instance_updater.period_secs = 30 # How frequently to attempt to restart Failed instances? instance_reincarnation.period_secs = 60 +# Default cooldown period between automatic restarts for Failed instances. +instance_reincarnation.default_cooldown_secs = 60 service_firewall_propagation.period_secs = 300 v2p_mapping_propagation.period_secs = 30 abandoned_vmm_reaper.period_secs = 60 diff --git a/nexus/src/app/background/tasks/instance_reincarnation.rs b/nexus/src/app/background/tasks/instance_reincarnation.rs index 0f4d681e284..b472a00373a 100644 --- a/nexus/src/app/background/tasks/instance_reincarnation.rs +++ b/nexus/src/app/background/tasks/instance_reincarnation.rs @@ -18,27 +18,25 @@ use nexus_db_queries::db::DataStore; use nexus_types::identity::Resource; use nexus_types::internal_api::background::InstanceReincarnationStatus; use omicron_common::api::external::Error; -use std::num::NonZeroU32; use std::sync::Arc; +use std::time::Duration; use tokio::task::JoinSet; pub struct InstanceReincarnation { datastore: Arc, sagas: Arc, + /// The default cooldown period between automatic restarts. + /// + /// If an instance's last automatic restart occurred less than this duration + /// from now, it may not be automatically restarted until it's had some time + /// to calm down. This is intended to try and reduce the impact of tight + /// crash loops. + // + // TODO(eliza): this default should be overridden by a project-level default + // when https://github.com/oxidecomputer/omicron/issues/1015 is implemented. + default_cooldown: TimeDelta, } -// Do not auto-restart instances that have been auto-restarted within the last -// 1 minute, to try and reduce the impact of tight crash loops. -// -// TODO(eliza): eventually, perhaps this should be a project-level configuration? -const CHILL_OUT_TIME: TimeDelta = match TimeDelta::new(60, 0) { - Some(d) => d, - // `Option::unwrap()` isn't const fn. - None => panic!( - "60 seconds definitely ought to be in range for a `TimeDelta`..." - ), -}; - impl BackgroundTask for InstanceReincarnation { fn activate<'a>( &'a mut self, @@ -46,6 +44,10 @@ impl BackgroundTask for InstanceReincarnation { ) -> BoxFuture<'a, serde_json::Value> { Box::pin(async move { let mut status = InstanceReincarnationStatus::default(); + status.default_cooldown = self.default_cooldown.to_std().expect( + "cooldown came from a `std::time::Duration` which was \ + non-negative, so it should remain non-negative", + ); self.actually_activate(opctx, &mut status).await; if !status.restart_errors.is_empty() || status.query_error.is_some() { @@ -84,8 +86,11 @@ impl InstanceReincarnation { pub(crate) fn new( datastore: Arc, sagas: Arc, + default_cooldown: Duration, ) -> Self { - Self { datastore, sagas } + let default_cooldown = TimeDelta::from_std(default_cooldown) + .expect("duration should be in range"); + Self { datastore, sagas, default_cooldown } } async fn actually_activate( @@ -143,15 +148,18 @@ impl InstanceReincarnation { if let Some(last_reincarnation) = db_instance.runtime().time_last_auto_restarted { + // TODO(eliza): allow overriding the cooldown at the project + // level, once we implement + // https://github.com/oxidecomputer/omicron/issues/1015 if Utc::now().signed_duration_since(last_reincarnation) - < CHILL_OUT_TIME + < self.default_cooldown { status - .instances_in_chill_out_time + .instances_cooling_down .push((instance_id, last_reincarnation)); debug!( opctx.log, - "instance still needs to take some time to settle \ + "instance still needs to take some time to cool \ down before its next reincarnation"; "instance_id" => %instance_id, "last_reincarnated_at" => %last_reincarnation, @@ -369,8 +377,11 @@ mod test { let authz_project = setup_test_project(&cptestctx, &opctx).await; let starter = Arc::new(NoopStartSaga::new()); - let mut task = - InstanceReincarnation::new(datastore.clone(), starter.clone()); + let mut task = InstanceReincarnation::new( + datastore.clone(), + starter.clone(), + Duration::from_secs(60), + ); // Noop test let result = task.activate(&opctx).await; @@ -424,8 +435,11 @@ mod test { let authz_project = setup_test_project(&cptestctx, &opctx).await; let starter = Arc::new(NoopStartSaga::new()); - let mut task = - InstanceReincarnation::new(datastore.clone(), starter.clone()); + let mut task = InstanceReincarnation::new( + datastore.clone(), + starter.clone(), + Duration::from_secs(60), + ); // Create instances in the `Failed` state that are eligible to be // restarted. diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 2cfe34ccc8a..b0b422ab8e8 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -6,6 +6,7 @@ use chrono::DateTime; use chrono::Utc; use serde::Deserialize; use serde::Serialize; +use std::time::Duration; use uuid::Uuid; /// The status of a `region_replacement_drive` background task activation @@ -60,13 +61,15 @@ pub struct RegionSnapshotReplacementFinishStatus { /// The status of an `instance_reincarnation` background task activation. #[derive(Default, Serialize, Deserialize, Debug)] pub struct InstanceReincarnationStatus { + /// Default cooldown period between reincarnations. + pub default_cooldown: Duration, /// Total number of instances in need of reincarnation on this activation. pub instances_found: usize, /// UUIDs of instances reincarnated successfully by this activation. pub instances_reincarnated: Vec, /// Instances which reincarnated too recently and still need to take some /// time out to settle down a bit. - pub instances_in_chill_out_time: Vec<(Uuid, DateTime)>, + pub instances_cooling_down: Vec<(Uuid, DateTime)>, /// UUIDs of instances which were reincarnated by a different Nexus' /// instance-reincarnation task, or by a user-triggered restart saga. pub already_reincarnated: Vec, diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index e966b3eabd4..5fb4f524ad4 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -67,6 +67,8 @@ lookup_region_port.period_secs = 60 instance_updater.period_secs = 30 # How frequently to attempt to restart Failed instances? instance_reincarnation.period_secs = 60 +# Default cooldown period between automatic restarts for Failed instances. +instance_reincarnation.default_cooldown_secs = 60 region_snapshot_replacement_start.period_secs = 30 region_snapshot_replacement_garbage_collection.period_secs = 30 region_snapshot_replacement_step.period_secs = 30