From aca08dfa7d3fd05696dde646416c62f963c189d3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 4 Sep 2024 13:30:46 -0700 Subject: [PATCH] [nexus] Turn `instance.boot_on_fault` into an enum (#6499) Currently, the `instance` table has a `boot_on_fault` column, which is a `bool`. This is intended to indicate whether an instance should be automatically restarted in response to failures, although currently, it isn't actually consumed by anything. However, as determined in RFD 486, Nexus will eventually grow functionality for automatically restarting instances that have transitioned to the `Failed` state, if `boot_on_fault` is set. @gjcolombo suggests that this field should probably be replaced with an enum, rather than a `bool`. This way, we could represent more boot-on-fault disciplines than "always reboot on faults" and "never do that". Since nothing is currently consuming this field, it's probably a good idea to go ahead and do the requisite SQL surgery to turn it into an enum _now_, before we write code that actually consumes it...especially since I'm planning on actually doing that next (see #6491). This commit replaces the `boot_on_fault` column with a new `auto_restart_policy` column. The type for this column is a newly-added `InstanceAutoRestart` enum, which currently has variants for `AllFailures` (the instance should be automatically restarted any time it fails), `SledFailuresOnly`, (the instance should be restarted if the sled it's on reboots or is expunged, but it should *not* be restarted if its individual VMM process crashes), and `Never` (the instance should never be automatically restarted). The database migration adding `auto_restart_policy` backfills any existing `boot_on_fault: true` instances with `InstanceAutoRestart::AllFailures` (although, because users can't currently set a value for `boot_on_fault`, there should usually be no such instances). For instances with `boot_on_fault: false`, the auto-restart policy is left `NULL`; in the future, we can determine what the default policy should be (perhaps at the project-level) and backfill these `NULL`s as well. For now, instances with an unset (`NULL`) policy will not be restarted. Closes #6490 --- nexus/db-model/src/instance.rs | 16 +++++-- nexus/db-model/src/instance_auto_restart.rs | 44 +++++++++++++++++++ nexus/db-model/src/lib.rs | 2 + nexus/db-model/src/schema.rs | 2 +- nexus/db-model/src/schema_versions.rs | 3 +- schema/crdb/dbinit.sql | 30 ++++++++++++- .../README.adoc | 18 ++++++++ .../up01.sql | 19 ++++++++ .../up02.sql | 2 + .../up03.sql | 5 +++ .../up04.sql | 1 + 11 files changed, 134 insertions(+), 8 deletions(-) create mode 100644 nexus/db-model/src/instance_auto_restart.rs create mode 100644 schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc create mode 100644 schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql create mode 100644 schema/crdb/turn-boot-on-fault-into-auto-restart/up02.sql create mode 100644 schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql create mode 100644 schema/crdb/turn-boot-on-fault-into-auto-restart/up04.sql diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index dadf4e89ae..8388611231 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -3,7 +3,8 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::{ - ByteCount, Disk, ExternalIp, Generation, InstanceCpuCount, InstanceState, + ByteCount, Disk, ExternalIp, Generation, InstanceAutoRestart, + InstanceCpuCount, InstanceState, }; use crate::collection::DatastoreAttachTargetConfig; use crate::schema::{disk, external_ip, instance}; @@ -54,8 +55,13 @@ pub struct Instance { #[diesel(column_name = hostname)] pub hostname: String, - #[diesel(column_name = boot_on_fault)] - pub boot_on_fault: bool, + /// The auto-restart policy for this instance. + /// + /// This indicates whether the instance should be automatically restarted by + /// the control plane on failure. If this is `NULL`, no auto-restart policy + /// has been configured for this instance by the user. + #[diesel(column_name = auto_restart_policy)] + pub auto_restart_policy: Option, #[diesel(embed)] pub runtime_state: InstanceRuntimeState, @@ -104,7 +110,9 @@ impl Instance { ncpus: params.ncpus.into(), memory: params.memory.into(), hostname: params.hostname.to_string(), - boot_on_fault: false, + // TODO(eliza): allow this to be configured via the instance-create + // params... + auto_restart_policy: None, runtime_state, updater_gen: Generation::new(), diff --git a/nexus/db-model/src/instance_auto_restart.rs b/nexus/db-model/src/instance_auto_restart.rs new file mode 100644 index 0000000000..ae546a4690 --- /dev/null +++ b/nexus/db-model/src/instance_auto_restart.rs @@ -0,0 +1,44 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use serde::Deserialize; +use serde::Serialize; +use std::fmt; + +impl_enum_type!( + #[derive(SqlType, Debug)] + #[diesel(postgres_type(name = "instance_auto_restart", schema = "public"))] + pub struct InstanceAutoRestartEnum; + + #[derive(Copy, Clone, Debug, PartialEq, AsExpression, FromSqlRow, Serialize, Deserialize)] + #[diesel(sql_type = InstanceAutoRestartEnum)] + pub enum InstanceAutoRestart; + + // Enum values + Never => b"never" + SledFailuresOnly => b"sled_failures_only" + AllFailures => b"all_failures" +); + +impl InstanceAutoRestart { + pub fn label(&self) -> &'static str { + match self { + Self::Never => "never", + Self::SledFailuresOnly => "sled_failures_only", + Self::AllFailures => "all_failures", + } + } +} + +impl fmt::Display for InstanceAutoRestart { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.label()) + } +} + +impl diesel::query_builder::QueryId for InstanceAutoRestart { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 82f4b78fa8..e127ef1afd 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -34,6 +34,7 @@ mod generation; mod identity_provider; mod image; mod instance; +mod instance_auto_restart; mod instance_cpu_count; mod instance_state; mod inventory; @@ -149,6 +150,7 @@ pub use generation::*; pub use identity_provider::*; pub use image::*; pub use instance::*; +pub use instance_auto_restart::*; pub use instance_cpu_count::*; pub use instance_state::*; pub use inventory::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 5d9b3da78f..733a0657bc 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -407,7 +407,7 @@ table! { ncpus -> Int8, memory -> Int8, hostname -> Text, - boot_on_fault -> Bool, + auto_restart_policy -> Nullable, time_state_updated -> Timestamptz, state_generation -> Int8, active_propolis_id -> Nullable, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 54718bbddb..81e18386f4 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(94, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(95, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(95, "turn-boot-on-fault-into-auto-restart"), KnownVersion::new(94, "put-back-creating-vmm-state"), KnownVersion::new(93, "dataset-kinds-zone-and-debug"), KnownVersion::new(92, "lldp-link-config-nullable"), diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9906a94ac6..388bc81993 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1020,6 +1020,27 @@ CREATE TYPE IF NOT EXISTS omicron.public.vmm_state AS ENUM ( 'saga_unwound' ); +CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM ( + /* + * The instance should not, under any circumstances, be automatically + * rebooted by the control plane. + */ + 'never', + /* + * The instance should be automatically restarted if, and only if, the sled + * it was running on has restarted or become unavailable. If the individual + * Propolis VMM process for this instance crashes, it should *not* be + * restarted automatically. + */ + 'sled_failures_only', + /* + * The instance should be automatically restarted any time a fault is + * detected + */ + 'all_failures' +); + + /* * TODO consider how we want to manage multiple sagas operating on the same * Instance -- e.g., reboot concurrent with destroy or concurrent reboots or the @@ -1059,7 +1080,6 @@ CREATE TABLE IF NOT EXISTS omicron.public.instance ( ncpus INT NOT NULL, memory INT NOT NULL, hostname STRING(63) NOT NULL, - boot_on_fault BOOL NOT NULL DEFAULT false, /* ID of the instance update saga that has locked this instance for * updating, if one exists. */ @@ -1077,6 +1097,12 @@ CREATE TABLE IF NOT EXISTS omicron.public.instance ( */ state omicron.public.instance_state_v2 NOT NULL, + /* + * What failures should result in an instance being automatically restarted + * by the control plane. + */ + auto_restart_policy omicron.public.instance_auto_restart, + CONSTRAINT vmm_iff_active_propolis CHECK ( ((state = 'vmm') AND (active_propolis_id IS NOT NULL)) OR ((state != 'vmm') AND (active_propolis_id IS NULL)) @@ -4233,7 +4259,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '94.0.0', NULL) + (TRUE, NOW(), NOW(), '95.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc b/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc new file mode 100644 index 0000000000..123807ae39 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc @@ -0,0 +1,18 @@ += Overview + +This migration replaces the `omicron.public.instance.boot_on_fault` column, +which is a `bool`, with a new `auto_restart_policy` column, which is an enum +(`omicron.public.instance_auto_restart`). The new enum type will allow +auto-restart policies other than "always" and "never". +Existing instance records are backfilled with the `all_failures` variant of +`instance_auto_restart` if `boot_on_fault` is `true`. Otherwise, the +auto-restart policy is `NULL`. + +The migration performs the following operations: + +1. `up01.sql` creates the `instance_auto_restart` enum. +2. `up02.sql` adds a (nullable) `auto_restart_policy` column to the `instance` + table. +3. `up03.sql` updates instance records by setting `auto_restart_policy` to + `all_failures` if `boot_on_fault` is `true`. +4. Finally, `up04.sql` drops the now-defunct `boot_on_fault` column. diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql new file mode 100644 index 0000000000..263fe81844 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql @@ -0,0 +1,19 @@ +CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM ( + /* + * The instance should not, under any circumstances, be automatically + * rebooted by the control plane. + */ + 'never', + /* + * The instance should be automatically restarted if, and only if, the sled + * it was running on has restarted or become unavailable. If the individual + * Propolis VMM process for this instance crashes, it should *not* be + * restarted automatically. + */ + 'sled_failures_only', + /* + * The instance should be automatically restarted any time a fault is + * detected + */ + 'all_failures' +); diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up02.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up02.sql new file mode 100644 index 0000000000..bd85672653 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up02.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.instance +ADD COLUMN IF NOT EXISTS auto_restart_policy omicron.public.instance_auto_restart; diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql new file mode 100644 index 0000000000..629c624663 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql @@ -0,0 +1,5 @@ +SET LOCAL disallow_full_table_scans = off; +UPDATE omicron.public.instance SET auto_restart_policy = CASE + WHEN boot_on_fault = true THEN 'all_failures' + ELSE NULL +END; diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up04.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up04.sql new file mode 100644 index 0000000000..214cab2873 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up04.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.instance DROP COLUMN IF EXISTS boot_on_fault;