From 2db6eff96c0b2d7f94999b2a6332908fce09aa7e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 2 Sep 2024 11:49:28 -0700 Subject: [PATCH] make "sled failures only" the default policy --- nexus/db-model/src/instance.rs | 2 +- nexus/db-model/src/instance_auto_restart.rs | 12 ++++++++++-- schema/crdb/dbinit.sql | 7 +++++++ .../turn-boot-on-fault-into-auto-restart/README.adoc | 8 ++++---- .../turn-boot-on-fault-into-auto-restart/up01.sql | 7 +++++++ .../turn-boot-on-fault-into-auto-restart/up03.sql | 2 +- 6 files changed, 30 insertions(+), 8 deletions(-) diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index fc6678369c..512500d15d 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -109,7 +109,7 @@ impl Instance { ncpus: params.ncpus.into(), memory: params.memory.into(), hostname: params.hostname.to_string(), - auto_restart_policy: InstanceAutoRestart::Never, + auto_restart_policy: InstanceAutoRestart::default(), runtime_state, updater_gen: Generation::new(), diff --git a/nexus/db-model/src/instance_auto_restart.rs b/nexus/db-model/src/instance_auto_restart.rs index 1291a1e527..1501445595 100644 --- a/nexus/db-model/src/instance_auto_restart.rs +++ b/nexus/db-model/src/instance_auto_restart.rs @@ -18,18 +18,26 @@ impl_enum_type!( // Enum values Never => b"never" + SledFailuresOnly => b"sled_failures_only" AllFailures => b"all_failures" ); impl InstanceAutoRestart { pub fn label(&self) -> &'static str { match self { - InstanceAutoRestart::Never => "never", - InstanceAutoRestart::AllFailures => "all_failures", + Self::Never => "never", + Self::SledFailuresOnly => "sled_failures_only", + Self::AllFailures => "all_failures", } } } +impl Default for InstanceAutoRestart { + fn default() -> Self { + Self::SledFailuresOnly + } +} + impl fmt::Display for InstanceAutoRestart { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", self.label()) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index c962ffda60..212346c8eb 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1026,6 +1026,13 @@ CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM ( * rebooted by the control plane. */ 'never', + /* + * The instance should be automatically restarted if, and only if, the sled + * it was running on has restarted or become unavailable. If the individual + * Propolis VMM process for this instance crashes, it should *not* be + * restarted automatically. + */ + 'sled_failures_only', /* * The instance should be automatically restarted any time a fault is * detected diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc b/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc index ce6b6e5429..bb021c5544 100644 --- a/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc @@ -3,9 +3,9 @@ This migration replaces the `omicron.public.instance.boot_on_fault` column, which is a `bool`, with a new `auto_restart_policy` column, which is an enum (`omicron.public.instance_auto_restart`). The new enum type will allow -auto-restart policies other than "always" and "never" to be added in the future. +auto-restart policies other than "always" and "never". Existing instance records are backfilled with the `all_failures` variant of -`instance_auto_restart` if `boot_on_fault` is `true`, or `never` if +`instance_auto_restart` if `boot_on_fault` is `true`, or `sled_failures_only` if `boot_on_fault` is `false`. The migration performs the following operations: @@ -14,8 +14,8 @@ The migration performs the following operations: 2. `up02.sql` adds a (nullable) `auto_restart_policy` column to the `instance` table. 3. `up03.sql` updates instance records by setting `auto_restart_policy` to - `all_failures` if `boot_on_fault` is `true`, or `never` if `boot_on_fault` is - `false`. + `all_failures` if `boot_on_fault` is `true`, or `sled_failures_only` if + `boot_on_fault` is `false`. 4. Now that all instance records have a value for `auto_restart_policy`, `up04.sql` makes the `auto_restart_policy` column non-null. 5. Finally, `up05.sql` drops the now-defunct `boot_on_fault` column. diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql index ad1ca45a42..263fe81844 100644 --- a/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql @@ -4,6 +4,13 @@ CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM ( * rebooted by the control plane. */ 'never', + /* + * The instance should be automatically restarted if, and only if, the sled + * it was running on has restarted or become unavailable. If the individual + * Propolis VMM process for this instance crashes, it should *not* be + * restarted automatically. + */ + 'sled_failures_only', /* * The instance should be automatically restarted any time a fault is * detected diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql index 98a17781a4..c101e0d715 100644 --- a/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql @@ -1,5 +1,5 @@ SET LOCAL disallow_full_table_scans = off; UPDATE omicron.public.instance SET auto_restart_policy = CASE WHEN boot_on_fault = true THEN 'all_failures' - ELSE 'never' + ELSE 'sled_failures_only' END;