Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nexus] Reincarnate instances with SagaUnwound VMMs #6669

Merged
merged 13 commits into from
Sep 27, 2024
62 changes: 36 additions & 26 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1790,57 +1790,67 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(InstanceReincarnationStatus {
instances_found,
instances_reincarnated,
changed_state,
query_error,
restart_errors,
}) => {
Ok(status) => {
const FOUND: &'static str =
"instances eligible for reincarnation:";
const REINCARNATED: &'static str = " instances reincarnated:";
const REINCARNATED: &'static str =
"instances reincarnated successfully:";
const CHANGED_STATE: &'static str =
" instances which changed state before they could be reincarnated:";
"instances which changed state before they could reincarnate:";
const ERRORS: &'static str =
" instances which failed to be reincarnated:";
const COOLDOWN_PERIOD: &'static str =
"default cooldown period:";
"instances which failed to reincarnate:";
const WIDTH: usize = const_max_len(&[
FOUND,
REINCARNATED,
CHANGED_STATE,
ERRORS,
COOLDOWN_PERIOD,
]);
let n_restart_errors = restart_errors.len();
let n_restarted = instances_reincarnated.len();
let n_changed_state = changed_state.len();
println!(" {FOUND:<WIDTH$} {instances_found:>3}");
println!(" {REINCARNATED:<WIDTH$} {n_restarted:>3}");
println!(" {CHANGED_STATE:<WIDTH$} {n_changed_state:>3}",);
println!(" {ERRORS:<WIDTH$} {n_restart_errors:>3}");
if status.disabled {
println!(
" instance reincarnation explicitly disabled \
by config!"
);
return;
}

if let Some(e) = query_error {
if !status.errors.is_empty() {
println!(
" an error occurred while searching for instances \
to reincarnate:\n {e}",
" errors occurred while finding instances to \
reincarnate:"
);
for error in &status.errors {
println!(" > {error}")
}
}

let n_restart_errors = status.restart_errors.len();
let n_restarted = status.instances_reincarnated.len();
let n_changed_state = status.changed_state.len();
println!(
" {FOUND:<WIDTH$} {:>3}",
status.total_instances_found()
);
for (reason, count) in &status.instances_found {
let reason = format!(" {reason} instances:");
println!(" {reason:<WIDTH$} {count:>3}",);
}
println!(" {REINCARNATED:<WIDTH$} {n_restarted:>3}");
println!(" {CHANGED_STATE:<WIDTH$} {n_changed_state:>3}",);
println!(" {ERRORS:<WIDTH$} {n_restart_errors:>3}");

if n_restart_errors > 0 {
println!(
" errors occurred while restarting the following \
instances:"
);
for (id, error) in restart_errors {
for (id, error) in status.restart_errors {
println!(" > {id}: {error}");
}
}

if n_restarted > 0 {
println!(" the following instances have reincarnated:");
for id in instances_reincarnated {
for id in status.instances_reincarnated {
println!(" > {id}")
}
}
Expand All @@ -1850,7 +1860,7 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
" the following instances states changed before \
they could be reincarnated:"
);
for id in changed_state {
for id in status.changed_state {
println!(" > {id}")
}
}
Expand Down
20 changes: 12 additions & 8 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -528,10 +528,12 @@ task: "instance_reincarnation"
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
instances eligible for reincarnation: 0
instances reincarnated: 0
instances which changed state before they could be reincarnated: 0
instances which failed to be reincarnated: 0
instances eligible for reincarnation: 0
instance failed instances: 0
start saga failed instances: 0
instances reincarnated successfully: 0
instances which changed state before they could reincarnate: 0
instances which failed to reincarnate: 0

task: "instance_updater"
configured period: every <REDACTED_DURATION>s
Expand Down Expand Up @@ -968,10 +970,12 @@ task: "instance_reincarnation"
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
instances eligible for reincarnation: 0
instances reincarnated: 0
instances which changed state before they could be reincarnated: 0
instances which failed to be reincarnated: 0
instances eligible for reincarnation: 0
instance failed instances: 0
start saga failed instances: 0
instances reincarnated successfully: 0
instances which changed state before they could reincarnate: 0
instances which failed to reincarnate: 0

task: "instance_updater"
configured period: every <REDACTED_DURATION>s
Expand Down
27 changes: 14 additions & 13 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,19 +348,14 @@ impl InstanceAutoRestart {

let now = diesel::dsl::now.into_sql::<pg::sql_types::Timestamptz>();

dsl::state
// Only attempt to restart Failed instances.
.eq(InstanceState::Failed)
// The instance's auto-restart policy must allow the control plane
// to restart it automatically.
//
// N.B. that this may become more complex in the future if we grow
// additional auto-restart policies that require additional logic
// (such as restart limits...)
.and(
dsl::auto_restart_policy
.eq(InstanceAutoRestartPolicy::BestEffort),
)
// The instance's auto-restart policy must allow the control plane
// to restart it automatically.
//
// N.B. that this may become more complex in the future if we grow
// additional auto-restart policies that require additional logic
// (such as restart limits...)
dsl::auto_restart_policy
.eq(InstanceAutoRestartPolicy::BestEffort)
// An instance whose last reincarnation was within the cooldown
// interval from now must remain in _bardo_ --- the liminal
// state between death and rebirth --- before its next
Expand All @@ -382,6 +377,12 @@ impl InstanceAutoRestart {
.le((now - Self::DEFAULT_COOLDOWN).nullable()),
)),
)
// Deleted instances may not be reincarnated.
.and(dsl::time_deleted.is_null())
// If the instance is currently in the process of being updated,
// let's not mess with it for now and try to restart it on another
// pass.
.and(dsl::updater_id.is_null())
}
}

Expand Down
59 changes: 36 additions & 23 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use diesel::sql_types;
use nexus_db_model::Disk;
use nexus_types::internal_api::background::ReincarnationReason;
use omicron_common::api;
use omicron_common::api::external;
use omicron_common::api::external::http_pagination::PaginatedBy;
Expand Down Expand Up @@ -498,33 +498,46 @@ impl DataStore {
pub async fn find_reincarnatable_instances(
&self,
opctx: &OpContext,
reason: ReincarnationReason,
pagparams: &DataPageParams<'_, Uuid>,
) -> ListResultVec<Instance> {
use db::schema::instance::dsl;
use db::schema::vmm::dsl as vmm_dsl;

define_sql_function!(fn random() -> sql_types::Float);

paginated(dsl::instance, dsl::id, &pagparams)
let q = paginated(dsl::instance, dsl::id, &pagparams)
// Select only those instances which may be reincarnated.
.filter(InstanceAutoRestart::filter_reincarnatable())
// Deleted instances may not be reincarnated.
.filter(dsl::time_deleted.is_null())
// If the instance is currently in the process of being updated,
// let's not mess with it for now and try to restart it on another
// pass.
.filter(dsl::updater_id.is_null())
// N.B. that it's tempting to also filter out instances that have no
// active VMM, since they're only valid targets for instance-start
// sagas once the active VMM is unlinked, *or* if the active VMM is
// `SagaUnwound`. However, checking for the second case
// (SagaUnwound) would require joining with the VMM table, so let's
// not bother.
.select(Instance::as_select())
.load_async::<Instance>(
&*self.pool_connection_authorized(opctx).await?,
)
.await
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
.filter(InstanceAutoRestart::filter_reincarnatable());

match reason {
ReincarnationReason::Failed => {
// The instance must be in the Failed state.
q.filter(dsl::state.eq(InstanceState::Failed))
.filter(dsl::active_propolis_id.is_null())
.select(Instance::as_select())
.load_async::<Instance>(
&*self.pool_connection_authorized(opctx).await?,
)
.await
}
ReincarnationReason::SagaUnwound => {
// The instance must have an active VMM.
q.filter(dsl::state.eq(InstanceState::Vmm))
.inner_join(
vmm_dsl::vmm
.on(dsl::active_propolis_id
.eq(vmm_dsl::id.nullable())),
)
// The instance's active VMM must be in the `SagaUnwound`
// state.
.filter(vmm_dsl::state.eq(VmmState::SagaUnwound))
.select(Instance::as_select())
.load_async::<Instance>(
&*self.pool_connection_authorized(opctx).await?,
)
.await
}
}
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Fetches information about an Instance that the caller has previously
Expand Down
Loading
Loading