[nexus] Mark VMMs on Expunged sleds as Failed (#6519)
hawkw authored Sep 9, 2024
1 parent 69af861 commit ec0d88d
Showing 4 changed files with 229 additions and 111 deletions.
23 changes: 10 additions & 13 deletions nexus/db-queries/src/db/datastore/instance.rs
@@ -36,9 +36,7 @@ use crate::db::update_and_check::UpdateStatus;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::Utc;
use diesel::prelude::*;
use nexus_db_model::ApplySledFilterExt;
use nexus_db_model::Disk;
use nexus_types::deployment::SledFilter;
use omicron_common::api;
use omicron_common::api::external;
use omicron_common::api::external::http_pagination::PaginatedBy;
@@ -801,12 +799,12 @@ impl DataStore {
Ok(updated)
}

/// Lists all instances on in-service sleds with active Propolis VMM
/// processes, returning the instance along with the VMM on which it's
/// running, the sled on which the VMM is running, and the project that owns
/// the instance.
/// Lists all instances with active Propolis VMM processes, returning the
/// instance along with the VMM on which it's running, the sled on which the
/// VMM is running, and the project that owns the instance.
///
/// The query performed by this function is paginated by the sled's UUID.
/// The query performed by this function is paginated by the sled and
/// instance UUIDs, in that order.
pub async fn instance_and_vmm_list_by_sled_agent(
&self,
opctx: &OpContext,
@@ -821,9 +819,9 @@

// We're going to build the query in stages.
//
// First, select all active sleds, and join the `sled` table with the
// `vmm` table on the VMM's sled_id`, filtering out VMMs which are not
// actually incarnated on a sled.
// First, select all non-deleted sled records, and join the `sled` table
// with the `vmm` table on the VMM's `sled_id`, filtering out VMMs which
// are not actually incarnated on a sled.
let query = sled_dsl::sled
.inner_join(vmm_dsl::vmm.on(vmm_dsl::sled_id.eq(sled_dsl::id)));
// Next, paginate the results, ordering by the sled ID first, so that we
@@ -844,11 +842,10 @@
(sled_dsl::id, vmm_dsl::id),
pagparams,
);
// Filter out sleds that aren't in service, and VMM states in which the
// VMM isn't incarnated on a sled.

let query = query
// Filter out sled and VMM records which have been deleted.
.filter(sled_dsl::time_deleted.is_null())
.sled_filter(SledFilter::InService)
.filter(vmm_dsl::time_deleted.is_null())
// Ignore VMMs which are in states that are not known to exist on a
// sled. Since this query drives instance-watcher health checking,
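Two things about this file's changes are easy to miss in the flattened view: dropping the `SledFilter::InService` filter means the query now returns VMMs on expunged sleds as well (which is what lets the instance-watcher see and fail them), and the pagination key is the pair (sled ID, VMM ID), since a sled ID alone is not unique across rows once a sled hosts several VMMs. Below is a minimal sketch of that two-column keyset pagination; the `RowKey` type and `next_page` helper are illustrative stand-ins, not the Diesel-based pagination machinery the real query uses.

```rust
/// Stand-in for one row of the joined query, keyed by (sled ID, VMM ID).
/// Deriving `Ord` gives lexicographic ordering over the fields in
/// declaration order: sled ID first, then VMM ID within each sled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct RowKey {
    sled_id: u128, // stand-in for a sled UUID
    vmm_id: u128,  // stand-in for a VMM UUID
}

/// Returns the next page of up to `limit` keys strictly after `cursor`,
/// assuming `rows` is sorted by (sled_id, vmm_id), the same ordering the
/// paginated query imposes.
fn next_page(rows: &[RowKey], cursor: Option<RowKey>, limit: usize) -> Vec<RowKey> {
    rows.iter()
        .copied()
        // Keep only keys strictly greater than the cursor, comparing the
        // whole (sled_id, vmm_id) pair at once.
        .filter(|k| cursor.map_or(true, |c| *k > c))
        .take(limit)
        .collect()
}

fn main() {
    let rows = vec![
        RowKey { sled_id: 1, vmm_id: 10 },
        RowKey { sled_id: 1, vmm_id: 11 },
        RowKey { sled_id: 2, vmm_id: 7 },
    ];
    // Resume after (1, 11): only sled 2's VMM remains.
    let page = next_page(&rows, Some(RowKey { sled_id: 1, vmm_id: 11 }), 2);
    assert_eq!(page, vec![RowKey { sled_id: 2, vmm_id: 7 }]);
}
```

Comparing the whole pair rather than the sled ID alone is what keeps resumption exact: the page picks up strictly after the last (sled, VMM) seen, so rows are never skipped or repeated even when many VMMs share a sled.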
224 changes: 127 additions & 97 deletions nexus/src/app/background/tasks/instance_watcher.rs
@@ -15,6 +15,7 @@ use nexus_db_model::Vmm;
use nexus_db_queries::context::OpContext;
use nexus_db_queries::db::pagination::Paginator;
use nexus_db_queries::db::DataStore;
use nexus_types::external_api::views::SledPolicy;
use nexus_types::identity::Asset;
use nexus_types::identity::Resource;
use omicron_common::api::external::Error;
@@ -87,6 +88,7 @@ impl InstanceWatcher {
client: SledAgentClient,
target: VirtualMachine,
vmm: Vmm,
sled: Sled,
) -> impl Future<Output = Check> + Send + 'static {
let datastore = self.datastore.clone();
let sagas = self.sagas.clone();
@@ -99,6 +101,7 @@
target.instance_id.to_string(),
);
meta.insert("vmm_id".to_string(), vmm_id.to_string());
meta.insert("sled_id".to_string(), sled.id().to_string());
opctx.child(meta)
};

@@ -107,96 +110,123 @@
opctx.log, "checking on VMM"; "propolis_id" => %vmm_id
);

let rsp = client
.vmm_get_state(&vmm_id)
.await
.map_err(SledAgentInstanceError);

let mut check = Check {
target,
outcome: Default::default(),
result: Ok(()),
update_saga_queued: false,
};
let state: SledVmmState = match rsp {
Ok(rsp) => rsp.into_inner().into(),
// Oh, this error indicates that the VMM should transition to
// `Failed`. Let's synthesize a `SledInstanceState` that does
// that.
Err(e) if e.vmm_gone() => {
slog::info!(
opctx.log,
"sled-agent error indicates that this instance's \
VMM has failed!";
"error" => %e,
);
check.outcome =
CheckOutcome::Failure(Failure::NoSuchInstance);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}

let state = if sled.policy() == SledPolicy::Expunged {
// If the sled has been expunged, any VMMs still on that sled
// should be marked as `Failed`.
slog::info!(
opctx.log,
"instance is assigned to a VMM on an Expunged sled; \
marking it as Failed";
);
check.outcome = CheckOutcome::Failure(Failure::SledExpunged);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}
Err(SledAgentInstanceError(ClientError::ErrorResponse(
rsp,
))) => {
let status = rsp.status();
if status.is_client_error() {
slog::warn!(opctx.log, "check failed due to client error";
} else {
// Otherwise, ask the sled-agent what it has to say for itself.
let rsp = client
.vmm_get_state(&vmm_id)
.await
.map_err(SledAgentInstanceError);
match rsp {
Ok(rsp) => {
let state: SledVmmState = rsp.into_inner().into();
check.outcome =
CheckOutcome::Success(state.vmm_state.state.into());
state
}
// Oh, this error indicates that the VMM should transition to
// `Failed`. Let's synthesize a `SledInstanceState` that does
// that.
Err(e) if e.vmm_gone() => {
slog::info!(
opctx.log,
"sled-agent error indicates that this instance's \
VMM has failed!";
"error" => %e,
);
check.outcome =
CheckOutcome::Failure(Failure::NoSuchInstance);
// TODO(eliza): it would be nicer if this used the same
// code path as `mark_instance_failed`...
SledVmmState {
vmm_state: nexus::VmmRuntimeState {
r#gen: vmm.runtime.r#gen.0.next(),
state: nexus::VmmState::Failed,
time_updated: chrono::Utc::now(),
},
// It's fine to synthesize `None`s here because a `None`
// just means "don't update the migration state", not
// "there is no migration".
migration_in: None,
migration_out: None,
}
}
Err(SledAgentInstanceError(
ClientError::ErrorResponse(rsp),
)) => {
let status = rsp.status();
if status.is_client_error() {
slog::warn!(opctx.log, "check incomplete due to client error";
"status" => ?status, "error" => ?rsp.into_inner());
check.result =
Err(Incomplete::ClientHttpError(status.as_u16()));
} else {
slog::info!(opctx.log, "check failed due to server error";
} else {
slog::info!(opctx.log, "check incomplete due to server error";
"status" => ?status, "error" => ?rsp.into_inner());
}
}

check.outcome = CheckOutcome::Failure(
Failure::SledAgentResponse(status.as_u16()),
);
return check;
}
Err(SledAgentInstanceError(
ClientError::CommunicationError(e),
)) => {
// TODO(eliza): eventually, we may want to transition the
// instance to the `Failed` state if the sled-agent has been
// unreachable for a while. We may also want to take other
// corrective actions or alert an operator in this case.
//
// TODO(eliza): because we have the preported IP address
// of the instance's VMM from our databse query, we could
// also ask the VMM directly when the sled-agent is
// unreachable. We should start doing that here at some
// point.
slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e);
check.outcome =
CheckOutcome::Failure(Failure::SledAgentUnreachable);
return check;
}
Err(SledAgentInstanceError(e)) => {
slog::warn!(
opctx.log,
"error checking up on instance";
"error" => ?e,
"status" => ?e.status(),
);
check.result = Err(Incomplete::ClientError);
return check;
check.result = Err(Incomplete::SledAgentHttpError(
status.as_u16(),
));
return check;
}
Err(SledAgentInstanceError(
ClientError::CommunicationError(e),
)) => {
// TODO(eliza): eventually, we may want to transition the
// instance to the `Failed` state if the sled-agent has been
// unreachable for a while. We may also want to take other
// corrective actions or alert an operator in this case.
//
// TODO(eliza): because we have the purported IP address
// of the instance's VMM from our database query, we could
// also ask the VMM directly when the sled-agent is
// unreachable. We should start doing that here at some
// point.
slog::info!(opctx.log, "sled agent is unreachable"; "error" => ?e);
check.result = Err(Incomplete::SledAgentUnreachable);
return check;
}
Err(SledAgentInstanceError(e)) => {
slog::warn!(
opctx.log,
"error checking up on instance";
"error" => ?e,
"status" => ?e.status(),
);
check.result = Err(Incomplete::ClientError);
return check;
}
}
};

check.outcome = CheckOutcome::Success(state.vmm_state.state.into());
debug!(
opctx.log,
"updating instance state";
@@ -328,26 +358,20 @@ impl Check {
Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize,
)]
enum Failure {
/// The sled-agent for the sled on which the instance is running was
/// unreachable.
///
/// This may indicate a network partition between us and that sled, that
/// the sled-agent process has crashed, or that the sled is down.
SledAgentUnreachable,
/// The sled-agent responded with an unexpected HTTP error.
SledAgentResponse(u16),
/// The sled-agent indicated that it doesn't know about an instance ID that
/// we believe it *should* know about. This probably means the sled-agent,
/// and potentially the whole sled, has been restarted.
NoSuchInstance,
/// The instance was assigned to a sled that was expunged. Its VMM has been
/// marked as `Failed`, since the sled is no longer present.
SledExpunged,
}

impl Failure {
fn as_str(&self) -> Cow<'static, str> {
match self {
Self::SledAgentUnreachable => "unreachable".into(),
Self::SledAgentResponse(status) => status.to_string().into(),
Self::NoSuchInstance => "no_such_instance".into(),
Self::SledExpunged => "sled_expunged".into(),
}
}
}
@@ -356,11 +380,16 @@ impl Failure {
Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, serde::Serialize,
)]
enum Incomplete {
/// The sled-agent responded with an HTTP client error, indicating that our
/// request was somehow malformed.
ClientHttpError(u16),
/// Something else went wrong while making an HTTP request.
ClientError,
/// The sled-agent for the sled on which the instance is running was
/// unreachable.
///
/// This may indicate a network partition between us and that sled, that
/// the sled-agent process has crashed, or that the sled is down.
SledAgentUnreachable,
/// The sled-agent responded with an unexpected HTTP error.
SledAgentHttpError(u16),
/// We attempted to update the instance state in the database, but no
/// instance with that UUID existed.
///
@@ -377,8 +406,9 @@ enum Incomplete {
impl Incomplete {
fn as_str(&self) -> Cow<'static, str> {
match self {
Self::ClientHttpError(status) => status.to_string().into(),
Self::ClientError => "client_error".into(),
Self::SledAgentHttpError(status) => status.to_string().into(),
Self::SledAgentUnreachable => "unreachable".into(),
Self::InstanceNotFound => "instance_not_found".into(),
Self::UpdateFailed => "update_failed".into(),
}
@@ -463,9 +493,8 @@ impl BackgroundTask for InstanceWatcher {
},
};


let target = VirtualMachine::new(self.id, &sled, &instance, &vmm, &project);
tasks.spawn(self.check_instance(opctx, client, target, vmm));
tasks.spawn(self.check_instance(opctx, client, target, vmm, sled));
}

// Now, wait for the check results to come back.
@@ -487,10 +516,11 @@
CheckOutcome::Failure(reason) => {
*check_failures.entry(reason.as_str().into_owned()).or_default() += 1;
}
CheckOutcome::Unknown => {}
}
if let Err(ref reason) = check.result {
*check_errors.entry(reason.as_str().into_owned()).or_default() += 1;
CheckOutcome::Unknown => {
if let Err(reason) = check.result {
*check_errors.entry(reason.as_str().into_owned()).or_default() += 1;
}
}
}
if check.update_saga_queued {
update_sagas_queued += 1;
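Note how the tallying changed: `check_errors` is now bumped only inside the `CheckOutcome::Unknown` arm, so a check is counted either as a definite failure or as an incomplete check, never both, and the two enums' `as_str` labels keep those tallies under separate, stable metric names. A minimal sketch of that label-based counting, pared down to the two `Failure` variants this file now has; the surrounding types are illustrative:

```rust
use std::collections::BTreeMap;

#[derive(Debug, Clone, Copy)]
enum Failure { NoSuchInstance, SledExpunged }

impl Failure {
    // Labels match the `as_str` impl in the diff above.
    fn as_str(&self) -> &'static str {
        match self {
            Self::NoSuchInstance => "no_such_instance",
            Self::SledExpunged => "sled_expunged",
        }
    }
}

fn main() {
    let outcomes =
        [Failure::SledExpunged, Failure::SledExpunged, Failure::NoSuchInstance];
    let mut check_failures: BTreeMap<String, usize> = BTreeMap::new();
    for f in outcomes {
        // Same pattern as the task's tallying loop: bump a per-label counter.
        *check_failures.entry(f.as_str().to_string()).or_default() += 1;
    }
    assert_eq!(check_failures["sled_expunged"], 2);
    assert_eq!(check_failures["no_such_instance"], 1);
}
```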
14 changes: 13 additions & 1 deletion nexus/src/app/sled.rs
@@ -102,7 +102,19 @@ impl super::Nexus {
let sled_lookup = self.sled_lookup(opctx, &sled_id)?;
let (authz_sled,) =
sled_lookup.lookup_for(authz::Action::Modify).await?;
self.db_datastore.sled_set_policy_to_expunged(opctx, &authz_sled).await
let prev_policy = self
.db_datastore
.sled_set_policy_to_expunged(opctx, &authz_sled)
.await?;

// The instance-watcher background task is responsible for marking any
// VMMs running on `Expunged` sleds as `Failed`, so that their instances
// can transition to `Failed` and be deleted or restarted. Let's go
// ahead and activate it now so that those instances don't need to wait
// for the next periodic activation before they can be cleaned up.
self.background_tasks.task_instance_watcher.activate();

Ok(prev_policy)
}

pub(crate) async fn sled_request_firewall_rules(
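The comment in the hunk above carries the design: expunging a sled only flips its policy, and the instance-watcher is what actually fails the stranded VMMs, so the handler activates that task immediately rather than letting affected instances wait out the rest of the watcher's period. Below is a sketch of that periodic-task-with-an-explicit-kick pattern using `tokio::sync::Notify`. This assumes the `tokio` crate with its full feature set; the real machinery behind `task_instance_watcher.activate()` differs in detail, and the 30-second period here is made up for illustration.

```rust
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Notify;

#[tokio::main]
async fn main() {
    let activator = Arc::new(Notify::new());
    let for_task = Arc::clone(&activator);

    tokio::spawn(async move {
        loop {
            // One pass of the background task would run here, e.g. scanning
            // for VMMs on expunged sleds and marking them Failed.
            println!("instance-watcher pass");
            // Wait for the periodic tick *or* an explicit activation,
            // whichever comes first.
            tokio::select! {
                _ = tokio::time::sleep(Duration::from_secs(30)) => {}
                _ = for_task.notified() => {}
            }
        }
    });

    // After expunging a sled, poke the task so affected instances don't
    // wait out the rest of the period. `Notify` stores the permit, so this
    // works even if the task isn't parked in `notified()` yet.
    activator.notify_one();

    // Give the spawned task a moment to run in this toy example.
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```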