diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs index dadf4e89ae..8388611231 100644 --- a/nexus/db-model/src/instance.rs +++ b/nexus/db-model/src/instance.rs @@ -3,7 +3,8 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::{ - ByteCount, Disk, ExternalIp, Generation, InstanceCpuCount, InstanceState, + ByteCount, Disk, ExternalIp, Generation, InstanceAutoRestart, + InstanceCpuCount, InstanceState, }; use crate::collection::DatastoreAttachTargetConfig; use crate::schema::{disk, external_ip, instance}; @@ -54,8 +55,13 @@ pub struct Instance { #[diesel(column_name = hostname)] pub hostname: String, - #[diesel(column_name = boot_on_fault)] - pub boot_on_fault: bool, + /// The auto-restart policy for this instance. + /// + /// This indicates whether the instance should be automatically restarted by + /// the control plane on failure. If this is `NULL`, no auto-restart policy + /// has been configured for this instance by the user. + #[diesel(column_name = auto_restart_policy)] + pub auto_restart_policy: Option, #[diesel(embed)] pub runtime_state: InstanceRuntimeState, @@ -104,7 +110,9 @@ impl Instance { ncpus: params.ncpus.into(), memory: params.memory.into(), hostname: params.hostname.to_string(), - boot_on_fault: false, + // TODO(eliza): allow this to be configured via the instance-create + // params... + auto_restart_policy: None, runtime_state, updater_gen: Generation::new(), diff --git a/nexus/db-model/src/instance_auto_restart.rs b/nexus/db-model/src/instance_auto_restart.rs new file mode 100644 index 0000000000..ae546a4690 --- /dev/null +++ b/nexus/db-model/src/instance_auto_restart.rs @@ -0,0 +1,44 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use super::impl_enum_type; +use serde::Deserialize; +use serde::Serialize; +use std::fmt; + +impl_enum_type!( + #[derive(SqlType, Debug)] + #[diesel(postgres_type(name = "instance_auto_restart", schema = "public"))] + pub struct InstanceAutoRestartEnum; + + #[derive(Copy, Clone, Debug, PartialEq, AsExpression, FromSqlRow, Serialize, Deserialize)] + #[diesel(sql_type = InstanceAutoRestartEnum)] + pub enum InstanceAutoRestart; + + // Enum values + Never => b"never" + SledFailuresOnly => b"sled_failures_only" + AllFailures => b"all_failures" +); + +impl InstanceAutoRestart { + pub fn label(&self) -> &'static str { + match self { + Self::Never => "never", + Self::SledFailuresOnly => "sled_failures_only", + Self::AllFailures => "all_failures", + } + } +} + +impl fmt::Display for InstanceAutoRestart { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.label()) + } +} + +impl diesel::query_builder::QueryId for InstanceAutoRestart { + type QueryId = (); + const HAS_STATIC_QUERY_ID: bool = false; +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 82f4b78fa8..e127ef1afd 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -34,6 +34,7 @@ mod generation; mod identity_provider; mod image; mod instance; +mod instance_auto_restart; mod instance_cpu_count; mod instance_state; mod inventory; @@ -149,6 +150,7 @@ pub use generation::*; pub use identity_provider::*; pub use image::*; pub use instance::*; +pub use instance_auto_restart::*; pub use instance_cpu_count::*; pub use instance_state::*; pub use inventory::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 5d9b3da78f..733a0657bc 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -407,7 +407,7 @@ table! { ncpus -> Int8, memory -> Int8, hostname -> Text, - boot_on_fault -> Bool, + auto_restart_policy -> Nullable, time_state_updated -> Timestamptz, state_generation -> Int8, active_propolis_id -> Nullable, diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 54718bbddb..81e18386f4 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(94, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(95, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(95, "turn-boot-on-fault-into-auto-restart"), KnownVersion::new(94, "put-back-creating-vmm-state"), KnownVersion::new(93, "dataset-kinds-zone-and-debug"), KnownVersion::new(92, "lldp-link-config-nullable"), diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9906a94ac6..388bc81993 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1020,6 +1020,27 @@ CREATE TYPE IF NOT EXISTS omicron.public.vmm_state AS ENUM ( 'saga_unwound' ); +CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM ( + /* + * The instance should not, under any circumstances, be automatically + * rebooted by the control plane. + */ + 'never', + /* + * The instance should be automatically restarted if, and only if, the sled + * it was running on has restarted or become unavailable. If the individual + * Propolis VMM process for this instance crashes, it should *not* be + * restarted automatically. + */ + 'sled_failures_only', + /* + * The instance should be automatically restarted any time a fault is + * detected + */ + 'all_failures' +); + + /* * TODO consider how we want to manage multiple sagas operating on the same * Instance -- e.g., reboot concurrent with destroy or concurrent reboots or the @@ -1059,7 +1080,6 @@ CREATE TABLE IF NOT EXISTS omicron.public.instance ( ncpus INT NOT NULL, memory INT NOT NULL, hostname STRING(63) NOT NULL, - boot_on_fault BOOL NOT NULL DEFAULT false, /* ID of the instance update saga that has locked this instance for * updating, if one exists. */ @@ -1077,6 +1097,12 @@ CREATE TABLE IF NOT EXISTS omicron.public.instance ( */ state omicron.public.instance_state_v2 NOT NULL, + /* + * What failures should result in an instance being automatically restarted + * by the control plane. + */ + auto_restart_policy omicron.public.instance_auto_restart, + CONSTRAINT vmm_iff_active_propolis CHECK ( ((state = 'vmm') AND (active_propolis_id IS NOT NULL)) OR ((state != 'vmm') AND (active_propolis_id IS NULL)) @@ -4233,7 +4259,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '94.0.0', NULL) + (TRUE, NOW(), NOW(), '95.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc b/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc new file mode 100644 index 0000000000..123807ae39 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc @@ -0,0 +1,18 @@ += Overview + +This migration replaces the `omicron.public.instance.boot_on_fault` column, +which is a `bool`, with a new `auto_restart_policy` column, which is an enum +(`omicron.public.instance_auto_restart`). The new enum type will allow +auto-restart policies other than "always" and "never". +Existing instance records are backfilled with the `all_failures` variant of +`instance_auto_restart` if `boot_on_fault` is `true`. Otherwise, the +auto-restart policy is `NULL`. + +The migration performs the following operations: + +1. `up01.sql` creates the `instance_auto_restart` enum. +2. `up02.sql` adds a (nullable) `auto_restart_policy` column to the `instance` + table. +3. `up03.sql` updates instance records by setting `auto_restart_policy` to + `all_failures` if `boot_on_fault` is `true`. +4. Finally, `up04.sql` drops the now-defunct `boot_on_fault` column. diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql new file mode 100644 index 0000000000..263fe81844 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql @@ -0,0 +1,19 @@ +CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM ( + /* + * The instance should not, under any circumstances, be automatically + * rebooted by the control plane. + */ + 'never', + /* + * The instance should be automatically restarted if, and only if, the sled + * it was running on has restarted or become unavailable. If the individual + * Propolis VMM process for this instance crashes, it should *not* be + * restarted automatically. + */ + 'sled_failures_only', + /* + * The instance should be automatically restarted any time a fault is + * detected + */ + 'all_failures' +); diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up02.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up02.sql new file mode 100644 index 0000000000..bd85672653 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up02.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.instance +ADD COLUMN IF NOT EXISTS auto_restart_policy omicron.public.instance_auto_restart; diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql new file mode 100644 index 0000000000..629c624663 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql @@ -0,0 +1,5 @@ +SET LOCAL disallow_full_table_scans = off; +UPDATE omicron.public.instance SET auto_restart_policy = CASE + WHEN boot_on_fault = true THEN 'all_failures' + ELSE NULL +END; diff --git a/schema/crdb/turn-boot-on-fault-into-auto-restart/up04.sql b/schema/crdb/turn-boot-on-fault-into-auto-restart/up04.sql new file mode 100644 index 0000000000..214cab2873 --- /dev/null +++ b/schema/crdb/turn-boot-on-fault-into-auto-restart/up04.sql @@ -0,0 +1 @@ +ALTER TABLE omicron.public.instance DROP COLUMN IF EXISTS boot_on_fault;