Skip to content

Commit

Permalink
[nexus] improve abandoned_vmm_reaper OMDB status
Browse files Browse the repository at this point in the history
While I was working on #6503, I became quite taken with the way @jmpesp
had done the OMDB status for his region-replacement background tasks ---
the approach of defining a struct in `nexus-types` representing the
status JSON that both the background task implementation and OMDB could
depend on seems much nicer than constructing an untyped JSON message and
then deserializing it to a type that only exists in OMDB, because this
way, OMDB and the background task implementation are kept in sync
automatically. So, I thought it would be nice to also rework the
`abandoned_vmm_reaper` task to use this style.

This commit does that. While I was making this change, I also changed
the status JSON to include a list of all errors that occurred during the
task's activation, instead of just the last one, which will probably
make it easier to debug any issues that come up. I also added some code
for aligning the numeric columns in the OMDB output.
  • Loading branch information
hawkw committed Sep 7, 2024
1 parent 2bedc6b commit b7c1585
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 130 deletions.
91 changes: 56 additions & 35 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ use nexus_client::types::UninitializedSledId;
use nexus_db_queries::db::lookup::LookupPath;
use nexus_saga_recovery::LastPass;
use nexus_types::deployment::Blueprint;
use nexus_types::internal_api::background::AbandonedVmmReaperStatus;
use nexus_types::internal_api::background::LookupRegionPortStatus;
use nexus_types::internal_api::background::RegionReplacementDriverStatus;
use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus;
Expand Down Expand Up @@ -1165,54 +1166,61 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
}
};
} else if name == "abandoned_vmm_reaper" {
#[derive(Deserialize)]
struct TaskSuccess {
/// total number of abandoned VMMs found
found: usize,

/// number of abandoned VMM records that were deleted
vmms_deleted: usize,

/// number of abandoned VMM records that were already deleted when
/// we tried to delete them.
vmms_already_deleted: usize,

/// sled resource reservations that were released
sled_reservations_deleted: usize,

/// number of errors that occurred during the activation
error_count: usize,

/// the last error that occurred during execution.
error: Option<String>,
}
match serde_json::from_value::<TaskSuccess>(details.clone()) {
match serde_json::from_value::<AbandonedVmmReaperStatus>(
details.clone(),
) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(TaskSuccess {
found,
Ok(AbandonedVmmReaperStatus {
vmms_found,
vmms_deleted,
vmms_already_deleted,
sled_reservations_deleted,
error_count,
error,
errors,
}) => {
if let Some(error) = error {
println!(" task did not complete successfully!");
println!(" total errors: {error_count}");
println!(" most recent error: {error}");
if !errors.is_empty() {
println!(
" task did not complete successfully! ({} errors)",
errors.len()
);
for error in errors {
println!(" > {error}");
}
}

println!(" total abandoned VMMs found: {found}");
println!(" VMM records deleted: {vmms_deleted}");
const VMMS_FOUND: &'static str = "total abandoned VMMs found:";
const VMMS_DELETED: &'static str = " VMM records deleted:";
const VMMS_ALREADY_DELETED: &'static str =
" VMMs already deleted by another Nexus:";
const SLED_RESERVATIONS_DELETED: &'static str =
"sled resource reservations deleted:";
// To align the number column, figure out the length of the
// longest line of text and add one (so that there's a space).
//
// Yes, I *could* just count the number of characters in each
// line myself, but why do something by hand when you could make
// the computer do it for you? And, this way, if we change the
// text, we won't need to figure it out again.
const WIDTH: usize = const_max_len(&[
VMMS_FOUND,
VMMS_DELETED,
VMMS_ALREADY_DELETED,
SLED_RESERVATIONS_DELETED,
]) + 1;
const NUM_WIDTH: usize = 3;

println!(" {VMMS_FOUND:<WIDTH$}{vmms_found:>NUM_WIDTH$}");
println!(
" VMM records already deleted by another Nexus: {}",
vmms_already_deleted,
" {VMMS_DELETED:<WIDTH$}{vmms_deleted:>NUM_WIDTH$}"
);
println!(
" sled resource reservations deleted: {}",
" {VMMS_ALREADY_DELETED:<WIDTH$}{:>NUM_WIDTH$}",
vmms_already_deleted
);
println!(
" {SLED_RESERVATIONS_DELETED:<WIDTH$}{:>NUM_WIDTH$}",
sled_reservations_deleted,
);
}
Expand Down Expand Up @@ -2578,3 +2586,16 @@ async fn cmd_nexus_sled_expunge_disk(
eprintln!("expunged disk {}", args.physical_disk_id);
Ok(())
}

const fn const_max_len(strs: &[&str]) -> usize {
let mut max = 0;
let mut i = 0;
while i < strs.len() {
let len = strs[i].len();
if len > max {
max = len;
}
i += 1;
}
max
}
16 changes: 8 additions & 8 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -475,10 +475,10 @@ task: "abandoned_vmm_reaper"
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total abandoned VMMs found: 0
VMM records deleted: 0
VMM records already deleted by another Nexus: 0
sled resource reservations deleted: 0
total abandoned VMMs found: 0
VMM records deleted: 0
VMMs already deleted by another Nexus: 0
sled resource reservations deleted: 0

task: "bfd_manager"
configured period: every <REDACTED_DURATION>s
Expand Down Expand Up @@ -902,10 +902,10 @@ task: "abandoned_vmm_reaper"
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total abandoned VMMs found: 0
VMM records deleted: 0
VMM records already deleted by another Nexus: 0
sled resource reservations deleted: 0
total abandoned VMMs found: 0
VMM records deleted: 0
VMMs already deleted by another Nexus: 0
sled resource reservations deleted: 0

task: "bfd_manager"
configured period: every <REDACTED_DURATION>s
Expand Down
Loading

0 comments on commit b7c1585

Please sign in to comment.