From ba41dda6031e5a2a896299cac881f24a041c4702 Mon Sep 17 00:00:00 2001
From: Eliza Weisman
Date: Thu, 5 Sep 2024 10:20:06 -0700
Subject: [PATCH] also include start reason in logs

this just seemed nice to do: now that instance-start sagas carry a
`Reason` recording why an instance is being started, include that value
as a `start_reason` field in the log messages emitted along the start
path, so the logs distinguish starts requested by a user
(`Reason::User`) from automatic starts (`Reason::AutoStart`).
---
 nexus/src/app/instance.rs             | 66 +++++++++++++++++++++------
 nexus/src/app/sagas/instance_start.rs | 37 +++++++++++----
 2 files changed, 79 insertions(+), 24 deletions(-)

diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs
index 0661f70299c..79e293bd9db 100644
--- a/nexus/src/app/instance.rs
+++ b/nexus/src/app/instance.rs
@@ -59,6 +59,7 @@ use propolis_client::support::InstanceSerialConsoleHelper;
 use propolis_client::support::WSClientOffset;
 use propolis_client::support::WebSocketStream;
 use sagas::instance_common::ExternalIpAttach;
+use sagas::instance_start;
 use sagas::instance_update;
 use sled_agent_client::types::InstanceMigrationTargetParams;
 use sled_agent_client::types::InstanceProperties;
@@ -467,7 +468,7 @@
             .instance_start(
                 opctx,
                 &lookup,
-                sagas::instance_start::Reason::AutoStart,
+                instance_start::Reason::AutoStart,
             )
             .await;
         if let Err(e) = start_result {
@@ -644,7 +645,7 @@
         self: &Arc<Self>,
         opctx: &OpContext,
         instance_lookup: &lookup::Instance<'_>,
-        reason: sagas::instance_start::Reason,
+        reason: instance_start::Reason,
     ) -> Result<InstanceAndActiveVmm, Error> {
         let (.., authz_instance) =
             instance_lookup.lookup_for(authz::Action::Modify).await?;
@@ -654,7 +655,7 @@
             .instance_fetch_with_vmm(opctx, &authz_instance)
             .await?;
 
-        match instance_start_allowed(&self.log, &state)? {
+        match instance_start_allowed(&self.log, &state, reason)? {
             InstanceStartDisposition::AlreadyStarted => Ok(state),
             InstanceStartDisposition::Start => {
                 let saga_params = sagas::instance_start::Params {
@@ -2047,6 +2048,7 @@ pub(crate) async fn process_vmm_update(
 fn instance_start_allowed(
     log: &slog::Logger,
     state: &InstanceAndActiveVmm,
+    reason: instance_start::Reason,
 ) -> Result<InstanceStartDisposition, Error> {
     let (instance, vmm) = (state.instance(), state.vmm());
 
@@ -2065,7 +2067,8 @@
         | s @ InstanceState::Migrating => {
             debug!(log, "asked to start an active instance";
                    "instance_id" => %instance.id(),
-                   "state" => ?s);
+                   "state" => ?s,
+                   "start_reason" => ?reason);
 
             Ok(InstanceStartDisposition::AlreadyStarted)
         }
@@ -2078,7 +2081,8 @@
             debug!(
                 log,
                 "instance's last VMM's start saga unwound, OK to start";
-                "instance_id" => %instance.id()
+                "instance_id" => %instance.id(),
+                "start_reason" => ?reason,
             );
 
             Ok(InstanceStartDisposition::Start)
@@ -2091,7 +2095,8 @@
                 "instance is {s:?} but still has an active VMM";
                 "instance_id" => %instance.id(),
                 "propolis_id" => %vmm.id,
-                "propolis_state" => ?vmm.runtime.state);
+                "propolis_state" => ?vmm.runtime.state,
+                "start_reason" => ?reason);
 
             Err(Error::InternalError {
                 internal_message: format!(
@@ -2111,7 +2116,8 @@
             debug!(log, "instance's VMM is still in the process of stopping";
                    "instance_id" => %instance.id(),
                    "propolis_id" => ?propolis_id,
-                   "propolis_state" => ?propolis_state);
+                   "propolis_state" => ?propolis_state,
+                   "start_reason" => ?reason);
             Err(Error::conflict(
                 "instance must finish stopping before it can be started",
             ))
@@ -2283,7 +2289,9 @@ mod tests {
         let (mut instance, _vmm) = make_instance_and_vmm();
         instance.runtime_state.nexus_state = DbInstanceState::NoVmm;
         let state = InstanceAndActiveVmm::from((instance, None));
-        assert!(instance_start_allowed(&logctx.log, &state).is_ok());
+        assert!(instance_start_allowed(
+            &logctx.log, &state, instance_start::Reason::User
+        ).is_ok());
         logctx.cleanup_successful();
     }
 
@@ -2297,7 +2305,12 @@
         instance.runtime_state.nexus_state = DbInstanceState::Vmm;
         instance.runtime_state.propolis_id = Some(vmm.id);
         vmm.runtime.state = DbVmmState::SagaUnwound;
         let state = InstanceAndActiveVmm::from((instance, Some(vmm)));
-        assert!(instance_start_allowed(&logctx.log, &state).is_ok());
+        assert!(instance_start_allowed(
+            &logctx.log,
+            &state,
+            instance_start::Reason::User
+        )
+        .is_ok());
         logctx.cleanup_successful();
     }
@@ -2308,7 +2321,12 @@
         let (mut instance, _vmm) = make_instance_and_vmm();
         instance.runtime_state.nexus_state = DbInstanceState::Creating;
         let state = InstanceAndActiveVmm::from((instance, None));
-        assert!(instance_start_allowed(&logctx.log, &state).is_err());
+        assert!(instance_start_allowed(
+            &logctx.log,
+            &state,
+            instance_start::Reason::User
+        )
+        .is_err());
         logctx.cleanup_successful();
     }
 
@@ -2321,21 +2339,41 @@
         vmm.runtime.state = DbVmmState::Starting;
         let state =
             InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone())));
-        assert!(instance_start_allowed(&logctx.log, &state).is_ok());
+        assert!(instance_start_allowed(
+            &logctx.log,
+            &state,
+            instance_start::Reason::User
+        )
+        .is_ok());
 
         vmm.runtime.state = DbVmmState::Running;
         let state =
             InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone())));
-        assert!(instance_start_allowed(&logctx.log, &state).is_ok());
+        assert!(instance_start_allowed(
+            &logctx.log,
+            &state,
+            instance_start::Reason::User
+        )
+        .is_ok());
 
         vmm.runtime.state = DbVmmState::Rebooting;
         let state =
             InstanceAndActiveVmm::from((instance.clone(), Some(vmm.clone())));
-        assert!(instance_start_allowed(&logctx.log, &state).is_ok());
+        assert!(instance_start_allowed(
+            &logctx.log,
+            &state,
+            instance_start::Reason::User
+        )
+        .is_ok());
 
         vmm.runtime.state = DbVmmState::Migrating;
         let state = InstanceAndActiveVmm::from((instance, Some(vmm)));
-        assert!(instance_start_allowed(&logctx.log, &state).is_ok());
+        assert!(instance_start_allowed(
+            &logctx.log,
+            &state,
+            instance_start::Reason::User
+        )
+        .is_ok());
         logctx.cleanup_successful();
     }
 }
diff --git a/nexus/src/app/sagas/instance_start.rs b/nexus/src/app/sagas/instance_start.rs
index dd35dbbec1d..5f6296dfd07 100644
--- a/nexus/src/app/sagas/instance_start.rs
+++ b/nexus/src/app/sagas/instance_start.rs
@@ -241,6 +241,7 @@ async fn sis_destroy_vmm_record(
         osagactx.log(),
         "destroying vmm record for start saga unwind";
         "propolis_id" => %propolis_id,
+        "start_reason" => ?params.reason,
     );
 
     osagactx.datastore().vmm_mark_saga_unwound(&opctx, &propolis_id).await?;
@@ -257,7 +258,8 @@ async fn sis_move_to_starting(
     let propolis_id = sagactx.lookup::<PropolisUuid>("propolis_id")?;
     info!(osagactx.log(), "moving instance to Starting state via saga";
           "instance_id" => %instance_id,
-          "propolis_id" => %propolis_id);
+          "propolis_id" => %propolis_id,
+          "start_reason" => ?params.reason);
 
     let opctx = crate::context::op_context_for_saga_action(
         &sagactx,
@@ -288,7 +290,8 @@
         // proceed.
         Some(vmm) if vmm.id == propolis_id.into_untyped_uuid() => {
             info!(osagactx.log(), "start saga: Propolis ID already set";
-                  "instance_id" => %instance_id);
+                  "instance_id" => %instance_id,
+                  "start_reason" => ?params.reason);
 
             return Ok(db_instance.clone());
         }
@@ -452,7 +455,8 @@ async fn sis_dpd_ensure(
 
     let instance_id = InstanceUuid::from_untyped_uuid(db_instance.id());
     info!(osagactx.log(), "start saga: ensuring instance dpd configuration";
-          "instance_id" => %instance_id);
+          "instance_id" => %instance_id,
+          "start_reason" => ?params.reason);
 
     let opctx = crate::context::op_context_for_saga_action(
         &sagactx,
@@ -491,7 +495,8 @@ async fn sis_dpd_ensure_undo(
     );
 
     info!(log, "start saga: undoing dpd configuration";
-          "instance_id" => %instance_id);
+          "instance_id" => %instance_id,
+          "start_reason" => ?params.reason);
 
     let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore())
         .instance_id(instance_id)
@@ -543,7 +548,8 @@ async fn sis_ensure_registered(
     info!(osagactx.log(),
           "start saga: ensuring instance is registered on sled";
           "instance_id" => %instance_id,
-          "sled_id" => %sled_id);
+          "sled_id" => %sled_id,
+          "start_reason" => ?params.reason);
 
     let (authz_silo, authz_project, authz_instance) =
         LookupPath::new(&opctx, &osagactx.datastore())
@@ -573,7 +579,8 @@
                 "start saga: sled agent failed to register instance";
                 "instance_id" => %instance_id,
                 "sled_id" => %sled_id,
-                "error" => ?inner);
+                "error" => ?inner,
+                "start_reason" => ?params.reason);
 
             // Don't set the instance to Failed in this case. Instead, allow
             // the saga to unwind and restore the instance to the Stopped
@@ -585,7 +592,8 @@
             info!(osagactx.log(),
                   "start saga: internal error registering instance";
                   "instance_id" => %instance_id,
-                  "error" => ?inner);
+                  "error" => ?inner,
+                  "start_reason" => ?params.reason);
             ActionError::action_failed(inner)
         }
     })
@@ -609,7 +617,8 @@ async fn sis_ensure_registered_undo(
     info!(osagactx.log(), "start saga: unregistering instance from sled";
           "instance_id" => %instance_id,
           "propolis_id" => %propolis_id,
-          "sled_id" => %sled_id);
+          "sled_id" => %sled_id,
+          "start_reason" => ?params.reason);
 
     // Fetch the latest record so that this callee can drive the instance into
     // a Failed state if the unregister call fails.
@@ -632,6 +641,7 @@ async fn sis_ensure_registered_undo(
             error!(osagactx.log(),
                    "start saga: failed to unregister instance from sled";
                    "instance_id" => %instance_id,
+                   "start_reason" => ?params.reason,
                    "error" => ?e);
 
             // If the failure came from talking to sled agent, and the error code
@@ -658,6 +668,7 @@
                 error!(osagactx.log(),
                        "start saga: failing instance after unregister failure";
                        "instance_id" => %instance_id,
+                       "start_reason" => ?params.reason,
                        "error" => ?inner);
 
                 if let Err(set_failed_error) = osagactx
@@ -668,6 +679,7 @@
                     error!(osagactx.log(),
                            "start saga: failed to mark instance as failed";
                            "instance_id" => %instance_id,
+                           "start_reason" => ?params.reason,
                            "error" => ?set_failed_error);
 
                     Err(set_failed_error.into())
@@ -678,7 +690,8 @@
             InstanceStateChangeError::SledAgent(_) => {
                 info!(osagactx.log(),
                       "start saga: instance already unregistered from sled";
-                      "instance_id" => %instance_id);
+                      "instance_id" => %instance_id,
+                      "start_reason" => ?params.reason);
 
                 Ok(())
             }
@@ -686,6 +699,7 @@
                 error!(osagactx.log(),
                        "start saga: internal error unregistering instance";
                        "instance_id" => %instance_id,
+                       "start_reason" => ?params.reason,
                        "error" => ?inner);
 
                 Err(inner.into())
@@ -713,7 +727,8 @@ async fn sis_ensure_running(
    let sled_id = sagactx.lookup::<SledUuid>("sled_id")?;
    info!(osagactx.log(), "start saga: ensuring instance is running";
          "instance_id" => %instance_id,
-          "sled_id" => %sled_id);
+          "sled_id" => %sled_id,
+          "start_reason" => ?params.reason);
 
    match osagactx
        .nexus()
@@ -731,6 +746,7 @@
                 "start saga: sled agent failed to set instance to running";
                 "instance_id" => %instance_id,
                 "sled_id" => %sled_id,
+                "start_reason" => ?params.reason,
                 "error" => ?inner);
 
             // Don't set the instance to Failed in this case. Instead, allow
@@ -743,6 +759,7 @@
             info!(osagactx.log(),
                   "start saga: internal error changing instance state";
                   "instance_id" => %instance_id,
+                  "start_reason" => ?params.reason,
                   "error" => ?inner);
 
             Err(ActionError::action_failed(inner))
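-- 
Not part of the patch itself: a minimal, self-contained sketch of the
structured-logging pattern this change applies, i.e. threading a start
`Reason` through and emitting it as a `start_reason` key-value field.
It assumes the `slog`, `slog-term`, and `slog-async` crates; `Reason`
and `log_start` below are hypothetical stand-ins for the real types in
`nexus/src/app/sagas/instance_start.rs`.

    use slog::{info, o, Drain, Logger};

    // Hypothetical stand-in for the saga's start reason.
    #[derive(Debug, Clone, Copy)]
    enum Reason {
        User,
        AutoStart,
    }

    fn log_start(log: &Logger, instance_id: &str, reason: Reason) {
        // Mirrors the fields added by the patch: `%` logs a value via
        // Display, `?` via Debug, so the reason renders as `User` or
        // `AutoStart` in the emitted log line.
        info!(log, "start saga: ensuring instance is running";
              "instance_id" => %instance_id,
              "start_reason" => ?reason);
    }

    fn main() {
        // Standard slog terminal setup: format to the TTY, wrapped in an
        // async drain so the root logger is Send + Sync.
        let decorator = slog_term::TermDecorator::new().build();
        let drain = slog_term::FullFormat::new(decorator).build().fuse();
        let drain = slog_async::Async::new(drain).build().fuse();
        let log = Logger::root(drain, o!());
        log_start(&log, "example-instance-id", Reason::User);
        log_start(&log, "example-instance-id", Reason::AutoStart);
    }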