From 0ed4b6f8b20035bc275444035b90865baf0547d0 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 7 Sep 2024 09:32:16 -0700 Subject: [PATCH] [nexus] improve `abandoned_vmm_reaper` OMDB status While I was working on #6503, I became quite taken with the way @jmpesp had done the OMDB status for his region-replacement background tasks --- the approach of defining a struct in `nexus-types` representing the status JSON that both the background task implementation and OMDB could depend on seems much nicer than constructing an untyped JSON message and then deserializing it to a type that only exists in OMDB, because this way, OMDB and the background task implementation are kept in sync automatically. So, I thought it would be nice to also rework the `abandoned_vmm_reaper` task to use this style. This commit does that. While I was making this change, I also changed the status JSON to include a list of all errors that occurred during the task's activation, instead of just the last one, which will probably make it easier to debug any issues that come up. I also added some code for aligning the numeric columns in the OMDB output. --- dev-tools/omdb/src/bin/omdb/nexus.rs | 91 ++++++----- dev-tools/omdb/tests/successes.out | 16 +- .../background/tasks/abandoned_vmm_reaper.rs | 147 ++++++++---------- nexus/types/src/internal_api/background.rs | 10 ++ 4 files changed, 136 insertions(+), 128 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 2a9e1c4db8f..0a55e77d61e 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -32,6 +32,7 @@ use nexus_client::types::UninitializedSledId; use nexus_db_queries::db::lookup::LookupPath; use nexus_saga_recovery::LastPass; use nexus_types::deployment::Blueprint; +use nexus_types::internal_api::background::AbandonedVmmReaperStatus; use nexus_types::internal_api::background::LookupRegionPortStatus; use nexus_types::internal_api::background::RegionReplacementDriverStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus; @@ -1165,54 +1166,61 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { } }; } else if name == "abandoned_vmm_reaper" { - #[derive(Deserialize)] - struct TaskSuccess { - /// total number of abandoned VMMs found - found: usize, - - /// number of abandoned VMM records that were deleted - vmms_deleted: usize, - - /// number of abandoned VMM records that were already deleted when - /// we tried to delete them. - vmms_already_deleted: usize, - - /// sled resource reservations that were released - sled_reservations_deleted: usize, - - /// number of errors that occurred during the activation - error_count: usize, - - /// the last error that occurred during execution. - error: Option, - } - match serde_json::from_value::(details.clone()) { + match serde_json::from_value::( + details.clone(), + ) { Err(error) => eprintln!( "warning: failed to interpret task details: {:?}: {:?}", error, details ), - Ok(TaskSuccess { - found, + Ok(AbandonedVmmReaperStatus { + vmms_found, vmms_deleted, vmms_already_deleted, sled_reservations_deleted, - error_count, - error, + errors, }) => { - if let Some(error) = error { - println!(" task did not complete successfully!"); - println!(" total errors: {error_count}"); - println!(" most recent error: {error}"); + if !errors.is_empty() { + println!( + " task did not complete successfully! ({} errors)", + errors.len() + ); + for error in errors { + println!(" > {error}"); + } } - println!(" total abandoned VMMs found: {found}"); - println!(" VMM records deleted: {vmms_deleted}"); + const VMMS_FOUND: &'static str = "total abandoned VMMs found:"; + const VMMS_DELETED: &'static str = " VMM records deleted:"; + const VMMS_ALREADY_DELETED: &'static str = + " VMMs already deleted by another Nexus:"; + const SLED_RESERVATIONS_DELETED: &'static str = + "sled resource reservations deleted:"; + // To align the number column, figure out the length of the + // longest line of text and add one (so that there's a space). + // + // Yes, I *could* just count the number of characters in each + // line myself, but why do something by hand when you could make + // the computer do it for you? And, this way, if we change the + // text, we won't need to figure it out again. + const WIDTH: usize = const_max_len(&[ + VMMS_FOUND, + VMMS_DELETED, + VMMS_ALREADY_DELETED, + SLED_RESERVATIONS_DELETED, + ]) + 1; + const NUM_WIDTH: usize = 3; + + println!(" {VMMS_FOUND:NUM_WIDTH$}"); println!( - " VMM records already deleted by another Nexus: {}", - vmms_already_deleted, + " {VMMS_DELETED:NUM_WIDTH$}" ); println!( - " sled resource reservations deleted: {}", + " {VMMS_ALREADY_DELETED:NUM_WIDTH$}", + vmms_already_deleted + ); + println!( + " {SLED_RESERVATIONS_DELETED:NUM_WIDTH$}", sled_reservations_deleted, ); } @@ -2578,3 +2586,16 @@ async fn cmd_nexus_sled_expunge_disk( eprintln!("expunged disk {}", args.physical_disk_id); Ok(()) } + +const fn const_max_len(strs: &[&str]) -> usize { + let mut max = 0; + let mut i = 0; + while i < strs.len() { + let len = strs[i].len(); + if len > max { + max = len; + } + i += 1; + } + max +} diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index ab68409e32e..2951b39c06a 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -475,10 +475,10 @@ task: "abandoned_vmm_reaper" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total abandoned VMMs found: 0 - VMM records deleted: 0 - VMM records already deleted by another Nexus: 0 - sled resource reservations deleted: 0 + total abandoned VMMs found: 0 + VMM records deleted: 0 + VMMs already deleted by another Nexus: 0 + sled resource reservations deleted: 0 task: "bfd_manager" configured period: every s @@ -902,10 +902,10 @@ task: "abandoned_vmm_reaper" currently executing: no last completed activation: , triggered by a periodic timer firing started at (s ago) and ran for ms - total abandoned VMMs found: 0 - VMM records deleted: 0 - VMM records already deleted by another Nexus: 0 - sled resource reservations deleted: 0 + total abandoned VMMs found: 0 + VMM records deleted: 0 + VMMs already deleted by another Nexus: 0 + sled resource reservations deleted: 0 task: "bfd_manager" configured period: every s diff --git a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs index c50f5281fcb..98a5140963a 100644 --- a/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs +++ b/nexus/src/app/background/tasks/abandoned_vmm_reaper.rs @@ -40,6 +40,7 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::datastore::SQL_BATCH_SIZE; use nexus_db_queries::db::pagination::Paginator; use nexus_db_queries::db::DataStore; +use nexus_types::internal_api::background::AbandonedVmmReaperStatus; use omicron_uuid_kinds::{GenericUuid, PropolisUuid}; use std::sync::Arc; @@ -48,15 +49,6 @@ pub struct AbandonedVmmReaper { datastore: Arc, } -#[derive(Debug, Default)] -struct ActivationResults { - found: usize, - sled_reservations_deleted: usize, - vmms_deleted: usize, - vmms_already_deleted: usize, - error_count: usize, -} - impl AbandonedVmmReaper { pub fn new(datastore: Arc) -> Self { Self { datastore } @@ -65,13 +57,10 @@ impl AbandonedVmmReaper { /// List abandoned VMMs and clean up all of their database records. async fn reap_all( &mut self, - results: &mut ActivationResults, + status: &mut AbandonedVmmReaperStatus, opctx: &OpContext, ) -> Result<(), anyhow::Error> { - slog::info!(opctx.log, "Abandoned VMM reaper running"); - let mut paginator = Paginator::new(SQL_BATCH_SIZE); - let mut last_err = Ok(()); while let Some(p) = paginator.next() { let vmms = self .datastore @@ -79,10 +68,10 @@ impl AbandonedVmmReaper { .await .context("failed to list abandoned VMMs")?; paginator = p.found_batch(&vmms, &|vmm| vmm.id); - self.reap_batch(results, &mut last_err, opctx, &vmms).await; + self.reap_batch(status, opctx, &vmms).await; } - last_err + Ok(()) } /// Clean up a batch of abandoned VMMs. @@ -95,13 +84,17 @@ impl AbandonedVmmReaper { /// than doing it all in one go. Thus, this is factored out. async fn reap_batch( &mut self, - results: &mut ActivationResults, - last_err: &mut Result<(), anyhow::Error>, + status: &mut AbandonedVmmReaperStatus, opctx: &OpContext, vmms: &[Vmm], ) { - results.found += vmms.len(); - slog::debug!(opctx.log, "Found abandoned VMMs"; "count" => vmms.len()); + status.vmms_found += vmms.len(); + slog::debug!( + opctx.log, + "Found abandoned VMMs"; + "count" => vmms.len(), + "total" => status.vmms_found, + ); for vmm in vmms { let vmm_id = PropolisUuid::from_untyped_uuid(vmm.id); @@ -118,22 +111,18 @@ impl AbandonedVmmReaper { "Deleted abandoned VMM's sled reservation"; "vmm" => %vmm_id, ); - results.sled_reservations_deleted += 1; + status.sled_reservations_deleted += 1; } Err(e) => { + const ERR_MSG: &'static str = + "Failed to delete sled reservation"; slog::warn!( opctx.log, - "Failed to delete sled reservation for abandoned VMM"; + "{ERR_MSG} for abandoned VMM"; "vmm" => %vmm_id, "error" => %e, ); - results.error_count += 1; - *last_err = Err(e).with_context(|| { - format!( - "failed to delete sled reservation for VMM \ - {vmm_id}" - ) - }); + status.errors.push(format!("{ERR_MSG} for {vmm_id}: {e}")); } } @@ -145,7 +134,7 @@ impl AbandonedVmmReaper { "Deleted abandoned VMM"; "vmm" => %vmm_id, ); - results.vmms_deleted += 1; + status.vmms_deleted += 1; } Ok(false) => { slog::trace!( @@ -153,19 +142,17 @@ impl AbandonedVmmReaper { "Abandoned VMM was already deleted"; "vmm" => %vmm_id, ); - results.vmms_already_deleted += 1; + status.vmms_already_deleted += 1; } Err(e) => { + const ERR_MSG: &'static str = "Failed to delete"; slog::warn!( opctx.log, - "Failed to mark abandoned VMM as deleted"; + "{ERR_MSG} abandoned VMM"; "vmm" => %vmm_id, "error" => %e, ); - results.error_count += 1; - *last_err = Err(e).with_context(|| { - format!("failed to mark VMM {vmm_id} as deleted") - }); + status.errors.push(format!("{ERR_MSG} {vmm_id}: {e}")) } } } @@ -178,36 +165,28 @@ impl BackgroundTask for AbandonedVmmReaper { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { async move { - let mut results = ActivationResults::default(); - let error = match self.reap_all(&mut results, opctx).await { + let mut status = AbandonedVmmReaperStatus::default(); + match self.reap_all(&mut status, opctx).await { Ok(_) => { slog::info!(opctx.log, "Abandoned VMMs reaped"; - "found" => results.found, - "sled_reservations_deleted" => results.sled_reservations_deleted, - "vmms_deleted" => results.vmms_deleted, - "vmms_already_deleted" => results.vmms_already_deleted, + "vmms_found" => status.vmms_found, + "sled_reservations_deleted" => status.sled_reservations_deleted, + "vmms_deleted" => status.vmms_deleted, + "vmms_already_deleted" => status.vmms_already_deleted, ); - None } Err(err) => { slog::error!(opctx.log, "Abandoned VMM reaper activation failed"; "error" => %err, - "found" => results.found, - "sled_reservations_deleted" => results.sled_reservations_deleted, - "vmms_deleted" => results.vmms_deleted, - "vmms_already_deleted" => results.vmms_already_deleted, + "vmms_found" => status.vmms_found, + "sled_reservations_deleted" => status.sled_reservations_deleted, + "vmms_deleted" => status.vmms_deleted, + "vmms_already_deleted" => status.vmms_already_deleted, ); - Some(err.to_string()) + status.errors.push(err.to_string()); } }; - serde_json::json!({ - "found": results.found, - "vmms_deleted": results.vmms_deleted, - "vmms_already_deleted": results.vmms_already_deleted, - "sled_reservations_deleted": results.sled_reservations_deleted, - "error_count": results.error_count, - "error": error, - }) + serde_json::json!(status) } .boxed() } @@ -356,15 +335,15 @@ mod tests { let mut task = AbandonedVmmReaper::new(datastore.clone()); - let mut results = ActivationResults::default(); - dbg!(task.reap_all(&mut results, &opctx,).await) + let mut status = AbandonedVmmReaperStatus::default(); + dbg!(task.reap_all(&mut status, &opctx,).await) .expect("activation completes successfully"); - dbg!(&results); + dbg!(&status); - assert_eq!(results.vmms_deleted, 1); - assert_eq!(results.sled_reservations_deleted, 1); - assert_eq!(results.vmms_already_deleted, 0); - assert_eq!(results.error_count, 0); + assert_eq!(status.vmms_deleted, 1); + assert_eq!(status.sled_reservations_deleted, 1); + assert_eq!(status.vmms_already_deleted, 0); + assert_eq!(status.errors, Vec::new()); fixture.assert_reaped(datastore).await; } @@ -399,19 +378,18 @@ mod tests { .await .expect("simulate another nexus marking the VMM deleted"); - let mut results = ActivationResults::default(); - let mut last_err = Ok(()); + let mut status = AbandonedVmmReaperStatus::default(); let mut task = AbandonedVmmReaper::new(datastore.clone()); - task.reap_batch(&mut results, &mut last_err, &opctx, &abandoned_vmms) - .await; - dbg!(last_err).expect("should not have errored"); + task.reap_batch(&mut status, &opctx, &abandoned_vmms) + .await + .expect("no database error should occur"); dbg!(&results); - assert_eq!(results.found, 1); - assert_eq!(results.vmms_deleted, 0); - assert_eq!(results.sled_reservations_deleted, 1); - assert_eq!(results.vmms_already_deleted, 1); - assert_eq!(results.error_count, 0); + assert_eq!(status.found, 1); + assert_eq!(status.vmms_deleted, 0); + assert_eq!(status.sled_reservations_deleted, 1); + assert_eq!(status.vmms_already_deleted, 1); + assert_eq!(status.errors, Vec::new()); fixture.assert_reaped(datastore).await } @@ -435,7 +413,7 @@ mod tests { // order to simulate a condition where the sled reservation record was // deleted between when the listing query was run and when the bg task // attempted to delete the sled reservation.. - let paginator = Paginator::new(MAX_BATCH); + let paginator = Paginator::new(SQL_BATCH_SIZE); let p = paginator.next().unwrap(); let abandoned_vmms = datastore .vmm_list_abandoned(&opctx, &p.current_pagparams()) @@ -454,19 +432,18 @@ mod tests { "simulate another nexus marking the sled reservation deleted", ); - let mut results = ActivationResults::default(); - let mut last_err = Ok(()); + let mut status = AbandonedVmmReaperStatus::default(); let mut task = AbandonedVmmReaper::new(datastore.clone()); - task.reap_batch(&mut results, &mut last_err, &opctx, &abandoned_vmms) - .await; - dbg!(last_err).expect("should not have errored"); - dbg!(&results); - - assert_eq!(results.found, 1); - assert_eq!(results.vmms_deleted, 1); - assert_eq!(results.sled_reservations_deleted, 1); - assert_eq!(results.vmms_already_deleted, 0); - assert_eq!(results.error_count, 0); + task.reap_batch(&mut status, &opctx, &abandoned_vmms) + .await + .expect("should not have errored"); + dbg!(&status); + + assert_eq!(status.found, 1); + assert_eq!(status.vmms_deleted, 1); + assert_eq!(status.sled_reservations_deleted, 1); + assert_eq!(status.vmms_already_deleted, 0); + assert_eq!(status.errors, Vec::new()); fixture.assert_reaped(datastore).await } diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 6f6e80cb60e..cc9f4507c0b 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -53,3 +53,13 @@ pub struct RegionSnapshotReplacementFinishStatus { pub records_set_to_done: Vec, pub errors: Vec, } + +/// The status of an `abandoned_vmm_reaper` background task activation. +#[derive(Serialize, Deserialize, Default, Debug, PartialEq, Eq)] +pub struct AbandonedVmmReaperStatus { + pub vmms_found: usize, + pub sled_reservations_deleted: usize, + pub vmms_deleted: usize, + pub vmms_already_deleted: usize, + pub errors: Vec, +}