Skip to content

Commit

Permalink
[nexus] add instance_reincarnation RPW
Browse files Browse the repository at this point in the history
This commit introduces a new background task, called
`instance_reincarnation`, which automatically restarts `Failed`
instances if the instance's `auto_restart_policy` indicates that it
should be restarted.
  • Loading branch information
hawkw committed Sep 1, 2024
1 parent 24b49e5 commit 3c90832
Show file tree
Hide file tree
Showing 10 changed files with 356 additions and 7 deletions.
15 changes: 15 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,8 @@ pub struct BackgroundTaskConfig {
pub instance_watcher: InstanceWatcherConfig,
/// configuration for instance updater task
pub instance_updater: InstanceUpdaterConfig,
/// configuration for instance reincarnation task
pub instance_reincarnation: InstancereincarnationConfig,
/// configuration for service VPC firewall propagation task
pub service_firewall_propagation: ServiceFirewallPropagationConfig,
/// configuration for v2p mapping propagation task
Expand Down Expand Up @@ -589,6 +591,14 @@ pub struct InstanceUpdaterConfig {
pub disable: bool,
}

/// Configuration for the `instance_reincarnation` background task.
#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct InstanceReincarnationConfig {
/// period (in seconds) for periodic activations of this background task
#[serde_as(as = "DurationSeconds<u64>")]
pub period_secs: Duration,
}

/// Backward-compatible alias for the original spelling of
/// [`InstanceReincarnationConfig`], which did not capitalize the second word
/// of the type name. Existing field declarations and struct literals that use
/// this name continue to compile unchanged.
pub type InstancereincarnationConfig = InstanceReincarnationConfig;

#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ServiceFirewallPropagationConfig {
Expand Down Expand Up @@ -911,6 +921,7 @@ mod test {
instance_watcher.period_secs = 30
instance_updater.period_secs = 30
instance_updater.disable = false
instance_reincarnation.period_secs = 60
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
Expand Down Expand Up @@ -1066,6 +1077,9 @@ mod test {
period_secs: Duration::from_secs(30),
disable: false,
},
instance_reincarnation: InstancereincarnationConfig {
period_secs: Duration::from_secs(60),
},
service_firewall_propagation:
ServiceFirewallPropagationConfig {
period_secs: Duration::from_secs(300),
Expand Down Expand Up @@ -1169,6 +1183,7 @@ mod test {
region_replacement_driver.period_secs = 30
instance_watcher.period_secs = 30
instance_updater.period_secs = 30
instance_reincarnation.period_secs = 60
service_firewall_propagation.period_secs = 300
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
Expand Down
56 changes: 52 additions & 4 deletions nexus/db-queries/src/db/datastore/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ use crate::db::identity::Resource;
use crate::db::lookup::LookupPath;
use crate::db::model::Generation;
use crate::db::model::Instance;
use crate::db::model::InstanceAutoRestart;
use crate::db::model::InstanceRuntimeState;
use crate::db::model::InstanceState;
use crate::db::model::Migration;
use crate::db::model::MigrationState;
use crate::db::model::Name;
Expand Down Expand Up @@ -435,6 +437,53 @@ impl DataStore {
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Lists every instance in the [`Failed`](InstanceState::Failed) state whose
/// auto-restart policy permits the control plane to restart it
/// automatically.
///
/// The `instance_reincarnation` RPW uses this query to find the instances
/// that it should restart.
///
/// Results are paginated by instance UUID, as described by the provided
/// [`DataPageParams`].
pub async fn find_reincarnatable_instances(
    &self,
    opctx: &OpContext,
    pagparams: &DataPageParams<'_, Uuid>,
) -> ListResultVec<Instance> {
    use db::schema::instance::dsl;

    // Acquire an authorized connection up front; building the query below
    // has no side effects, so the ordering does not matter.
    let conn = self.pool_connection_authorized(opctx).await?;

    let reincarnatable = paginated(dsl::instance, dsl::id, pagparams)
        // Reincarnation only applies to instances that are `Failed`.
        .filter(dsl::state.eq(InstanceState::Failed))
        // The auto-restart policy has to let the control plane restart the
        // instance on its own.
        //
        // N.B. this condition may grow more involved if additional
        // auto-restart policies needing extra logic (restart limits, for
        // example) are introduced later.
        .filter(
            dsl::auto_restart_policy.eq(InstanceAutoRestart::AllFailures),
        )
        // An instance that has been deleted is never reincarnated.
        .filter(dsl::time_deleted.is_null())
        // Leave instances with an in-progress update alone for now; a later
        // pass can attempt to restart them.
        .filter(dsl::updater_id.is_null())
        // TODO(eliza): perhaps we ought to check for the presence of an
        // active VMM here? If there is one, that would indicate that the
        // instance hasn't been moved to `Failed` correctly. But, we would
        // also need to handle the case where the active VMM is
        // SagaUnwound...
        .select(Instance::as_select());

    reincarnatable
        .load_async::<Instance>(&*conn)
        .await
        .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

/// Fetches information about an Instance that the caller has previously
/// fetched
///
Expand Down Expand Up @@ -871,12 +920,11 @@ impl DataStore {
// instance must be "stopped" or "failed" in order to delete it. The
// delete operation sets "time_deleted" (just like with other objects)
// and also sets the state to "destroyed".
use db::model::InstanceState as DbInstanceState;
use db::schema::{disk, instance};

let stopped = DbInstanceState::NoVmm;
let failed = DbInstanceState::Failed;
let destroyed = DbInstanceState::Destroyed;
let stopped = InstanceState::NoVmm;
let failed = InstanceState::Failed;
let destroyed = InstanceState::Destroyed;
let ok_to_delete_instance_states = vec![stopped, failed];

let detached_label = api::external::DiskState::Detached.label();
Expand Down
3 changes: 2 additions & 1 deletion nexus/examples/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ region_replacement_driver.period_secs = 10
instance_watcher.period_secs = 30
# How frequently to schedule new instance update sagas.
instance_updater.period_secs = 30
service_firewall_propagation.period_secs = 300
# How frequently to attempt to restart Failed instances.
instance_reincarnation.period_secs = 60
v2p_mapping_propagation.period_secs = 30
abandoned_vmm_reaper.period_secs = 60
saga_recovery.period_secs = 600
Expand Down
24 changes: 24 additions & 0 deletions nexus/src/app/background/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ use super::tasks::dns_config;
use super::tasks::dns_propagation;
use super::tasks::dns_servers;
use super::tasks::external_endpoints;
use super::tasks::instance_reincarnation;
use super::tasks::instance_updater;
use super::tasks::instance_watcher;
use super::tasks::inventory_collection;
Expand Down Expand Up @@ -160,6 +161,7 @@ pub struct BackgroundTasks {
pub task_region_replacement_driver: Activator,
pub task_instance_watcher: Activator,
pub task_instance_updater: Activator,
pub task_instance_reincarnation: Activator,
pub task_service_firewall_propagation: Activator,
pub task_abandoned_vmm_reaper: Activator,
pub task_vpc_route_manager: Activator,
Expand Down Expand Up @@ -245,6 +247,7 @@ impl BackgroundTasksInitializer {
task_region_replacement_driver: Activator::new(),
task_instance_watcher: Activator::new(),
task_instance_updater: Activator::new(),
task_instance_reincarnation: Activator::new(),
task_service_firewall_propagation: Activator::new(),
task_abandoned_vmm_reaper: Activator::new(),
task_vpc_route_manager: Activator::new(),
Expand Down Expand Up @@ -311,6 +314,7 @@ impl BackgroundTasksInitializer {
task_region_replacement_driver,
task_instance_watcher,
task_instance_updater,
task_instance_reincarnation,
task_service_firewall_propagation,
task_abandoned_vmm_reaper,
task_vpc_route_manager,
Expand Down Expand Up @@ -669,6 +673,26 @@ impl BackgroundTasksInitializer {
});
}

// Background task: schedule restart sagas for failed instances that can
// be automatically restarted.
{
let reincarnator =
instance_reincarnation::InstanceReincarnation::new(
datastore.clone(),
sagas.clone(),
);
driver.register(TaskDefinition {
name: "instance_reincarnation",
description: "schedules start sagas for failed instances that \
can be automatically restarted",
period: config.instance_reincarnation.period_secs,
task_impl: Box::new(reincarnator),
opctx: opctx.child(BTreeMap::new()),
watchers: vec![],
activator: task_instance_reincarnation,
});
}

// Background task: service firewall rule propagation
driver.register(TaskDefinition {
name: "service_firewall_rule_propagation",
Expand Down
Loading

0 comments on commit 3c90832

Please sign in to comment.