Skip to content

Commit

Permalink
[nexus] Turn instance.boot_on_fault into an enum (#6499)
Browse files Browse the repository at this point in the history
Currently, the `instance` table has a `boot_on_fault` column, which is a
`bool`. This is intended to indicate whether an instance should be
automatically restarted in response to failures, although currently, it
isn't actually consumed by anything. However, as determined in RFD 486,
Nexus will eventually grow functionality for automatically restarting
instances that have transitioned to the `Failed` state, if
`boot_on_fault` is set.

@gjcolombo suggests that this field should probably be replaced with an
enum, rather than a `bool`. This way, we could represent more
boot-on-fault disciplines than "always reboot on faults" and "never do
that". Since nothing is currently consuming this field, it's probably a
good idea to go ahead and do the requisite SQL surgery to turn it into
an enum _now_, before we write code that actually consumes
it...especially since I'm planning on actually doing that next (see
#6491).

This commit replaces the `boot_on_fault` column with a new
`auto_restart_policy` column. The type for this column is a newly-added
`InstanceAutoRestart` enum, which currently has variants for
`AllFailures` (the instance should be automatically restarted any time
it fails), `SledFailuresOnly`, (the instance should be restarted if the
sled it's on reboots or is expunged, but it should *not* be restarted if
its individual VMM process crashes), and `Never` (the instance should
never be automatically restarted). The database migration adding
`auto_restart_policy` backfills any existing `boot_on_fault: true`
instances with `InstanceAutoRestart::AllFailures` (although, because
users can't currently set a value for `boot_on_fault`, there should
usually be no such instances). For instances with `boot_on_fault:
false`, the auto-restart policy is left `NULL`; in the future, we can
determine what the default policy should be (perhaps at the
project-level) and backfill these `NULL`s as well. For now, instances
with an unset (`NULL`) policy will not be restarted.

Closes #6490
  • Loading branch information
hawkw authored Sep 4, 2024
1 parent ea85b02 commit aca08df
Show file tree
Hide file tree
Showing 11 changed files with 134 additions and 8 deletions.
16 changes: 12 additions & 4 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::{
ByteCount, Disk, ExternalIp, Generation, InstanceCpuCount, InstanceState,
ByteCount, Disk, ExternalIp, Generation, InstanceAutoRestart,
InstanceCpuCount, InstanceState,
};
use crate::collection::DatastoreAttachTargetConfig;
use crate::schema::{disk, external_ip, instance};
Expand Down Expand Up @@ -54,8 +55,13 @@ pub struct Instance {
#[diesel(column_name = hostname)]
pub hostname: String,

#[diesel(column_name = boot_on_fault)]
pub boot_on_fault: bool,
/// The auto-restart policy for this instance.
///
/// This indicates whether the instance should be automatically restarted by
/// the control plane on failure. If this is `NULL`, no auto-restart policy
/// has been configured for this instance by the user.
#[diesel(column_name = auto_restart_policy)]
pub auto_restart_policy: Option<InstanceAutoRestart>,

#[diesel(embed)]
pub runtime_state: InstanceRuntimeState,
Expand Down Expand Up @@ -104,7 +110,9 @@ impl Instance {
ncpus: params.ncpus.into(),
memory: params.memory.into(),
hostname: params.hostname.to_string(),
boot_on_fault: false,
// TODO(eliza): allow this to be configured via the instance-create
// params...
auto_restart_policy: None,
runtime_state,

updater_gen: Generation::new(),
Expand Down
44 changes: 44 additions & 0 deletions nexus/db-model/src/instance_auto_restart.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::impl_enum_type;
use serde::Deserialize;
use serde::Serialize;
use std::fmt;

impl_enum_type!(
#[derive(SqlType, Debug)]
#[diesel(postgres_type(name = "instance_auto_restart", schema = "public"))]
pub struct InstanceAutoRestartEnum;

#[derive(Copy, Clone, Debug, PartialEq, AsExpression, FromSqlRow, Serialize, Deserialize)]
#[diesel(sql_type = InstanceAutoRestartEnum)]
pub enum InstanceAutoRestart;

// Enum values
Never => b"never"
SledFailuresOnly => b"sled_failures_only"
AllFailures => b"all_failures"
);

impl InstanceAutoRestart {
pub fn label(&self) -> &'static str {
match self {
Self::Never => "never",
Self::SledFailuresOnly => "sled_failures_only",
Self::AllFailures => "all_failures",
}
}
}

impl fmt::Display for InstanceAutoRestart {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.label())
}
}

impl diesel::query_builder::QueryId for InstanceAutoRestart {
type QueryId = ();
const HAS_STATIC_QUERY_ID: bool = false;
}
2 changes: 2 additions & 0 deletions nexus/db-model/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ mod generation;
mod identity_provider;
mod image;
mod instance;
mod instance_auto_restart;
mod instance_cpu_count;
mod instance_state;
mod inventory;
Expand Down Expand Up @@ -149,6 +150,7 @@ pub use generation::*;
pub use identity_provider::*;
pub use image::*;
pub use instance::*;
pub use instance_auto_restart::*;
pub use instance_cpu_count::*;
pub use instance_state::*;
pub use inventory::*;
Expand Down
2 changes: 1 addition & 1 deletion nexus/db-model/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ table! {
ncpus -> Int8,
memory -> Int8,
hostname -> Text,
boot_on_fault -> Bool,
auto_restart_policy -> Nullable<crate::InstanceAutoRestartEnum>,
time_state_updated -> Timestamptz,
state_generation -> Int8,
active_propolis_id -> Nullable<Uuid>,
Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(94, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(95, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(95, "turn-boot-on-fault-into-auto-restart"),
KnownVersion::new(94, "put-back-creating-vmm-state"),
KnownVersion::new(93, "dataset-kinds-zone-and-debug"),
KnownVersion::new(92, "lldp-link-config-nullable"),
Expand Down
30 changes: 28 additions & 2 deletions schema/crdb/dbinit.sql
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,27 @@ CREATE TYPE IF NOT EXISTS omicron.public.vmm_state AS ENUM (
'saga_unwound'
);

CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM (
/*
* The instance should not, under any circumstances, be automatically
* rebooted by the control plane.
*/
'never',
/*
* The instance should be automatically restarted if, and only if, the sled
* it was running on has restarted or become unavailable. If the individual
* Propolis VMM process for this instance crashes, it should *not* be
* restarted automatically.
*/
'sled_failures_only',
/*
* The instance should be automatically restarted any time a fault is
* detected
*/
'all_failures'
);


/*
* TODO consider how we want to manage multiple sagas operating on the same
* Instance -- e.g., reboot concurrent with destroy or concurrent reboots or the
Expand Down Expand Up @@ -1059,7 +1080,6 @@ CREATE TABLE IF NOT EXISTS omicron.public.instance (
ncpus INT NOT NULL,
memory INT NOT NULL,
hostname STRING(63) NOT NULL,
boot_on_fault BOOL NOT NULL DEFAULT false,

/* ID of the instance update saga that has locked this instance for
* updating, if one exists. */
Expand All @@ -1077,6 +1097,12 @@ CREATE TABLE IF NOT EXISTS omicron.public.instance (
*/
state omicron.public.instance_state_v2 NOT NULL,

/*
* What failures should result in an instance being automatically restarted
* by the control plane.
*/
auto_restart_policy omicron.public.instance_auto_restart,

CONSTRAINT vmm_iff_active_propolis CHECK (
((state = 'vmm') AND (active_propolis_id IS NOT NULL)) OR
((state != 'vmm') AND (active_propolis_id IS NULL))
Expand Down Expand Up @@ -4233,7 +4259,7 @@ INSERT INTO omicron.public.db_metadata (
version,
target_version
) VALUES
(TRUE, NOW(), NOW(), '94.0.0', NULL)
(TRUE, NOW(), NOW(), '95.0.0', NULL)
ON CONFLICT DO NOTHING;

COMMIT;
18 changes: 18 additions & 0 deletions schema/crdb/turn-boot-on-fault-into-auto-restart/README.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
= Overview

This migration replaces the `omicron.public.instance.boot_on_fault` column,
which is a `bool`, with a new `auto_restart_policy` column, which is an enum
(`omicron.public.instance_auto_restart`). The new enum type will allow
auto-restart policies other than "always" and "never".
Existing instance records are backfilled with the `all_failures` variant of
`instance_auto_restart` if `boot_on_fault` is `true`. Otherwise, the
auto-restart policy is `NULL`.

The migration performs the following operations:

1. `up01.sql` creates the `instance_auto_restart` enum.
2. `up02.sql` adds a (nullable) `auto_restart_policy` column to the `instance`
table.
3. `up03.sql` updates instance records by setting `auto_restart_policy` to
`all_failures` if `boot_on_fault` is `true`.
4. Finally, `up04.sql` drops the now-defunct `boot_on_fault` column.
19 changes: 19 additions & 0 deletions schema/crdb/turn-boot-on-fault-into-auto-restart/up01.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
CREATE TYPE IF NOT EXISTS omicron.public.instance_auto_restart AS ENUM (
/*
* The instance should not, under any circumstances, be automatically
* rebooted by the control plane.
*/
'never',
/*
* The instance should be automatically restarted if, and only if, the sled
* it was running on has restarted or become unavailable. If the individual
* Propolis VMM process for this instance crashes, it should *not* be
* restarted automatically.
*/
'sled_failures_only',
/*
* The instance should be automatically restarted any time a fault is
* detected
*/
'all_failures'
);
2 changes: 2 additions & 0 deletions schema/crdb/turn-boot-on-fault-into-auto-restart/up02.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ALTER TABLE omicron.public.instance
ADD COLUMN IF NOT EXISTS auto_restart_policy omicron.public.instance_auto_restart;
5 changes: 5 additions & 0 deletions schema/crdb/turn-boot-on-fault-into-auto-restart/up03.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SET LOCAL disallow_full_table_scans = off;
UPDATE omicron.public.instance SET auto_restart_policy = CASE
WHEN boot_on_fault = true THEN 'all_failures'
ELSE NULL
END;
1 change: 1 addition & 0 deletions schema/crdb/turn-boot-on-fault-into-auto-restart/up04.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ALTER TABLE omicron.public.instance DROP COLUMN IF EXISTS boot_on_fault;

0 comments on commit aca08df

Please sign in to comment.