Skip to content

Commit

Permalink
[nexus] nicer OMDB status for instance_updater
Browse files Browse the repository at this point in the history
Depends on #6541.

Similarly to #6541, this commit refactors the `instance_updater`
background task's OMDB status implementation to use a shared Rust struct
in `nexus-types` to represent the status JSON object. I also changed the
status message to include a full list of errors that occurred, instead
of just the most recent one and a count, and I made some tweaks to the
output alignment in OMDB.
  • Loading branch information
hawkw committed Sep 7, 2024
1 parent b7c1585 commit 7582607
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 215 deletions.
155 changes: 79 additions & 76 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ use nexus_db_queries::db::lookup::LookupPath;
use nexus_saga_recovery::LastPass;
use nexus_types::deployment::Blueprint;
use nexus_types::internal_api::background::AbandonedVmmReaperStatus;
use nexus_types::internal_api::background::InstanceUpdaterStatus;
use nexus_types::internal_api::background::LookupRegionPortStatus;
use nexus_types::internal_api::background::RegionReplacementDriverStatus;
use nexus_types::internal_api::background::RegionSnapshotReplacementFinishStatus;
Expand Down Expand Up @@ -1408,85 +1409,87 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
),
}
} else if name == "instance_updater" {
/// Status details reported by the `instance_updater` background task.
///
/// This is deserialized from the task's JSON `details` value; every field
/// except `error` is a plain counter.
#[derive(Deserialize)]
struct UpdaterStatus {
/// Number of instances found with destroyed active VMMs.
destroyed_active_vmms: usize,

/// Number of instances found with failed active VMMs.
failed_active_vmms: usize,

/// Number of instances found with terminated active migrations.
terminated_active_migrations: usize,

/// Number of update sagas started.
sagas_started: usize,

/// Number of sagas completed successfully.
sagas_completed: usize,

/// Number of sagas which failed.
sagas_failed: usize,

/// Number of sagas which could not be started.
saga_start_failures: usize,

/// The last error that occurred during execution, if any.
error: Option<String>,
}
match serde_json::from_value::<UpdaterStatus>(details.clone()) {
Err(error) => eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details
),
Ok(UpdaterStatus {
destroyed_active_vmms,
failed_active_vmms,
terminated_active_migrations,
sagas_started,
sagas_completed,
sagas_failed,
saga_start_failures,
error,
}) => {
if let Some(error) = error {
println!(" task did not complete successfully!");
println!(" most recent error: {error}");
}

println!(
" total instances in need of updates: {}",
destroyed_active_vmms + terminated_active_migrations
);
println!(
" instances with Destroyed active VMMs: {}",
destroyed_active_vmms,
);
println!(
" instances with Failed active VMMs: {}",
failed_active_vmms,
);
println!(
" instances with terminated active migrations: {}",
terminated_active_migrations,
);
println!(" update sagas started: {sagas_started}");
println!(
" update sagas completed successfully: {}",
sagas_completed,
let status = match serde_json::from_value::<InstanceUpdaterStatus>(
details.clone(),
) {
Err(error) => {
eprintln!(
"warning: failed to interpret task details: {:?}: {:?}",
error, details,
);

let total_failed = sagas_failed + saga_start_failures;
if total_failed > 0 {
println!(" unsuccessful update sagas: {total_failed}");
println!(
" sagas which could not be started: {}",
saga_start_failures
);
println!(" sagas failed: {sagas_failed}");
}
return;
}
Ok(status) => status,
};
let errors = status.errors();
let instances_found = status.total_instances_found();
let InstanceUpdaterStatus {
disabled,
destroyed_active_vmms,
failed_active_vmms,
terminated_active_migrations,
sagas_started,
sagas_completed,
saga_errors,
query_errors,
} = status;

if disabled {
println!(" task explicitly disabled by config!")
}

// Row labels for the `instance_updater` status report. Each label is printed
// left-aligned in a column `WIDTH` characters wide (`{LABEL:<WIDTH$}`),
// followed by its count right-aligned in `NUM_WIDTH` characters.
const FOUND: &'static str = "instances in need of updates:";
const DESTROYED: &'static str =
    " instances with Destroyed active VMMs:";
const FAILED: &'static str = " instances with Failed active VMMs:";
const MIGRATIONS: &'static str =
    " instances with terminated migrations:";
const SAGAS_STARTED: &'static str = "update sagas started:";
const SAGAS_COMPLETED: &'static str =
    " update sagas completed successfully:";
const SAGA_ERRORS: &'static str = " update sagas failed:";
const QUERY_ERRORS: &'static str =
    " errors finding instances to update:";
// Width of the label column. Every label that is printed with `WIDTH` must be
// listed here -- including `SAGAS_COMPLETED`, which was previously left out --
// so the column stays aligned if any one of them becomes the longest.
// NOTE(review): `const_max_len` is defined elsewhere; presumably it returns
// the length of the longest string in the slice -- confirm. The `+ 1` leaves
// at least one space between the label and the number.
const WIDTH: usize = const_max_len(&[
    FOUND,
    DESTROYED,
    FAILED,
    MIGRATIONS,
    SAGAS_STARTED,
    SAGAS_COMPLETED,
    SAGA_ERRORS,
    QUERY_ERRORS,
]) + 1;
// Minimum width of the right-aligned count column.
const NUM_WIDTH: usize = 3;
if errors > 0 {
println!(
" task did not complete successfully! ({errors} errors)"
);
println!(
" {QUERY_ERRORS:<WIDTH$}{:>NUM_WIDTH$}",
query_errors.len()
);
for error in query_errors {
println!(" > {error}");
}
println!(
" {SAGA_ERRORS:<WIDTH$}{:>NUM_WIDTH$}",
saga_errors.len()
);
for (instance_id, error) in &saga_errors {
println!(" > {instance_id}: {error}");
}
}

println!(" {FOUND:<WIDTH$}{instances_found:>NUM_WIDTH$}");
println!(" {DESTROYED:<WIDTH$}{destroyed_active_vmms:>NUM_WIDTH$}",);
println!(" {FAILED:<WIDTH$}{failed_active_vmms:>NUM_WIDTH$}");
println!(
" {MIGRATIONS:<WIDTH$}{:>NUM_WIDTH$}",
terminated_active_migrations,
);
println!(" {SAGAS_STARTED:<WIDTH$}{sagas_started:>NUM_WIDTH$}");
println!(" {SAGAS_COMPLETED:<WIDTH$}{sagas_completed:>NUM_WIDTH$}",);
println!(" {SAGA_ERRORS:<WIDTH$}{:>NUM_WIDTH$}", saga_errors.len());
} else if name == "region_snapshot_replacement_start" {
match serde_json::from_value::<RegionSnapshotReplacementStartStatus>(
details.clone(),
Expand Down
28 changes: 16 additions & 12 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -523,12 +523,14 @@ task: "instance_updater"
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total instances in need of updates: 0
instances with Destroyed active VMMs: 0
instances with Failed active VMMs: 0
instances with terminated active migrations: 0
update sagas started: 0
update sagas completed successfully: 0
task explicitly disabled by config!
instances in need of updates: 0
instances with Destroyed active VMMs: 0
instances with Failed active VMMs: 0
instances with terminated migrations: 0
update sagas started: 0
update sagas completed successfully: 0
update sagas failed: 0

task: "instance_watcher"
configured period: every <REDACTED_DURATION>s
Expand Down Expand Up @@ -950,12 +952,14 @@ task: "instance_updater"
currently executing: no
last completed activation: <REDACTED ITERATIONS>, triggered by a periodic timer firing
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
total instances in need of updates: 0
instances with Destroyed active VMMs: 0
instances with Failed active VMMs: 0
instances with terminated active migrations: 0
update sagas started: 0
update sagas completed successfully: 0
task explicitly disabled by config!
instances in need of updates: 0
instances with Destroyed active VMMs: 0
instances with Failed active VMMs: 0
instances with terminated migrations: 0
update sagas started: 0
update sagas completed successfully: 0
update sagas failed: 0

task: "instance_watcher"
configured period: every <REDACTED_DURATION>s
Expand Down
Loading

0 comments on commit 7582607

Please sign in to comment.