Update virtual provisioning counters on instance stop/start #4277

Merged: 3 commits, merged Oct 20, 2023

Changes from 1 commit
54 changes: 47 additions & 7 deletions nexus/src/app/instance.rs
@@ -1284,6 +1284,14 @@ impl super::Nexus {
"propolis_id" => %propolis_id,
"vmm_state" => ?new_runtime_state.vmm_state);

// Grab the current state of the instance in the DB to reason about
// whether this update is stale or not.
let (.., authz_instance, db_instance) =
LookupPath::new(&opctx, &self.db_datastore)
.instance_id(*instance_id)
.fetch()
.await?;

// Update OPTE and Dendrite if the instance's active sled assignment
// changed or a migration was retired. If these actions fail, sled agent
// is expected to retry this update.
@@ -1297,12 +1305,6 @@ impl super::Nexus {
//
// In the future, this should be replaced by a call to trigger a
// networking state update RPW.
let (.., authz_instance, db_instance) =
LookupPath::new(&opctx, &self.db_datastore)
.instance_id(*instance_id)
.fetch()
.await?;

self.ensure_updated_instance_network_config(
opctx,
&authz_instance,
@@ -1311,6 +1313,31 @@
)
.await?;

// If the supplied instance state is at least as new as what's currently
// in the database, and it indicates the instance has no active VMM, the
// instance has been stopped and should have its virtual provisioning
// charges released.
//
// As with updating networking state, this must be done before
// committing the new runtime state to the database: once the DB is
// written, a new start saga can arrive and start the instance, which
// will try to create its own virtual provisioning charges, which will
// race with this operation.
if new_runtime_state.instance_state.propolis_id.is_none()
&& new_runtime_state.instance_state.gen
>= db_instance.runtime().gen.0
{
self.db_datastore

Collaborator:

This check is based on new_runtime_state (from the sled agent) as well as db_instance.runtime() (from the DB), but both are cached -- couldn't both be out of date?

For example:

  1. External API: Create an instance and start it. Begin the call to "stop", which sends a stop request down to the sled.
  2. Sled Agent -> Nexus: Calls this function, indicating the instance should be stopped. Suppose the sled agent thinks gen == N and the DB thinks gen == N, but calling ensure_updated_instance_network_config yields for a really long time.
  3. Sled Agent -> Nexus: Calls this function again from a retry loop, and this time it succeeds. The instance is now stopped.
  4. External API: Start the instance again. This re-provisions the resources used by the instance...
  5. ... and, while the instance is running, the request from (2) comes back. It'll see a state of the world from before (3) and (4), so it'll call the virtual_provisioning_collection_delete_instance function. Now the accounting is wrong! This is a running instance, but we would have deleted the resources it's currently using.

Collaborator:

I think this is related to what I was touching on here: #4194 (comment)

If we're associating these resources with VMMs (AKA, "what is actually running") rather than Instances (the opaque thing that exists even without a propolis), should we alter the calls to virtual_provisioning_collection_{create, delete}_instance to act on VMM UUIDs instead, rather than instance UUIDs?

That way, it wouldn't be possible to do a "double delete" -- once a VMM is gone, the UUID for it should never be re-used/re-added, so repeated/delayed requests to delete should have no further effect after the first successful call.
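
A minimal sketch of that idempotency argument, using an in-memory map rather than the real datastore (the `Charges` type, the `release_vmm_charges` helper, and the `u128` stand-in for a VMM UUID are all hypothetical, not part of this PR):

```rust
// Illustrative sketch only, not code from this PR: provisioning charges keyed
// by VMM UUID, modeled with an in-memory map, to show why deletes become
// naturally idempotent once a VMM ID can never be reused.
use std::collections::HashMap;

struct Charges {
    cpus: i64,
    memory_bytes: i64,
}

// Hypothetical helper: release the charges recorded for a single VMM.
fn release_vmm_charges(
    table: &mut HashMap<u128, Charges>,
    vmm_id: u128,
) -> Option<Charges> {
    // The first call removes the row; a delayed or retried request finds
    // nothing and is a no-op, so it can never release charges that belong to
    // a newer VMM backing the same instance.
    table.remove(&vmm_id)
}

fn main() {
    let mut table = HashMap::new();
    table.insert(1_u128, Charges { cpus: 16, memory_bytes: 16_i64 << 30 }); // 16 GiB

    let released = release_vmm_charges(&mut table, 1).expect("released exactly once");
    assert_eq!((released.cpus, released.memory_bytes), (16, 16_i64 << 30));
    assert!(release_vmm_charges(&mut table, 1).is_none()); // delayed retry: no-op
}
```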

Contributor Author:

Hmm. Let me think about this one a little more--the race you've identified is definitely a problem. I'll mull it over to see if there's a good way to resolve it without dipping into transaction-land (such that we could say "only proceed with the deletion if the instance generation number is less than the one I think I'm trying to apply" or suchlike).

It's true that VMMs don't share that problem because they can't come back from a terminal state, but I want to be careful about charging counters on a per-VMM basis if those counters are going to be used for quota management purposes (which AIUI is part of the discussion in RFD 427). For example, suppose I have a quota of 32 vCPUs on my project; I start an instance with 16 vCPUs; the rack operator starts an upgrade that migrates my instance; then I try to start a second instance with 16 vCPUs; if we track per-VMM, this will fail because my quota's exhausted--but what do you mean it's exhausted? I only have the one running instance! (cc @askfongjojo to help weigh in on this part; if we're OK with charging on a per-VMM basis even though this can cause transient double-counting, then switching to VMM IDs is a much simpler option here.)
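
The quota arithmetic in that example, written out as a small sketch (illustrative only; the 32-vCPU quota and 16-vCPU instance are just the numbers from the example above, not real limits):

```rust
// Illustrative sketch only: the quota arithmetic from the example above.
// During a live migration, per-VMM accounting charges both the source and the
// target VMM, so a single 16-vCPU instance transiently looks like 32 vCPUs.
fn main() {
    let quota_vcpus = 32;
    let instance_vcpus = 16;

    // Per-instance accounting: one charge, regardless of migration.
    let per_instance_usage = instance_vcpus;
    assert!(per_instance_usage + 16 <= quota_vcpus); // a second 16-vCPU instance fits

    // Per-VMM accounting mid-migration: source and target VMMs are both charged.
    let per_vmm_usage = instance_vcpus * 2;
    assert!(per_vmm_usage + 16 > quota_vcpus); // the second instance is rejected
}
```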

Contributor Author (@gjcolombo), Oct 13, 2023:

Fixed in 16b7fe6 (or so I think...) by putting a sub-select into the instance provisioning counter deletion query that makes it apply only if the caller has furnished a sufficiently new generation number.
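
A rough in-memory model of that guard (illustrative only, not the actual query from 16b7fe6, which applies the check via a sub-select against the instance record rather than a generation stored with the charges):

```rust
// Illustrative sketch only: a generation-guarded release of provisioning
// charges, modeled in memory. The tuple stands in for "the instance
// generation currently recorded in the DB, plus the charged vCPUs"; the
// delete applies only if the caller's generation is at least that new, so a
// stale, long-delayed update can no longer race a later restart.
fn guarded_release(record: &mut Option<(u64, i64)>, caller_gen: u64) -> bool {
    match *record {
        Some((db_gen, _vcpus)) if caller_gen >= db_gen => {
            *record = None; // release the charges
            true
        }
        // Stale caller, or nothing left to delete: no-op.
        _ => false,
    }
}

fn main() {
    // Charges exist and the DB has the instance at generation 4.
    let mut row = Some((4, 16));
    assert!(!guarded_release(&mut row, 3)); // stale update from gen 3 is ignored
    assert!(guarded_release(&mut row, 4));  // a sufficiently new update releases them
    assert!(!guarded_release(&mut row, 4)); // retrying after that is a no-op
}
```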

Comment:

Transient double-counting is probably fine if it's indeed transient (i.e., seconds, not minutes). But if we foresee failure modes that can cause an in-progress migration to double-count for an extended period of time, it'll be better for the system, rather than the user, to "absorb" the migration-in-transit usage.

.virtual_provisioning_collection_delete_instance(
opctx,
*instance_id,
db_instance.project_id,
i64::from(db_instance.ncpus.0 .0),
db_instance.memory,
)
.await?;
}

// Write the new instance and VMM states back to CRDB. This needs to be
// done before trying to clean up the VMM, since the datastore will only
// allow a VMM to be marked as deleted if it is already in a terminal
@@ -1331,7 +1358,20 @@ impl super::Nexus {

// If the VMM is now in a terminal state, make sure its resources get
// cleaned up.
if let Ok((_, true)) = result {
//
// For idempotency, only check to see if the update was successfully
// processed and ignore whether the VMM record was actually updated.
// This is required to handle the case where this routine is called
// once, writes the terminal VMM state, fails before all per-VMM
// resources are released, returns a retriable error, and is retried:
// the per-VMM resources still need to be cleaned up, but the DB update
// will return Ok(_, false) because the database was already updated.
//
// Unlike the pre-update cases, it is legal to do this cleanup *after*
// committing state to the database, because a terminated VMM cannot be
// reused (restarting or migrating its former instance will use new VMM
// IDs).
if result.is_ok() {
let propolis_terminated = matches!(
new_runtime_state.vmm_state.state,
InstanceState::Destroyed | InstanceState::Failed
56 changes: 0 additions & 56 deletions nexus/src/app/sagas/instance_create.rs
@@ -13,7 +13,6 @@ use crate::external_api::params;
use nexus_db_model::NetworkInterfaceKind;
use nexus_db_queries::db::identity::Resource;
use nexus_db_queries::db::lookup::LookupPath;
use nexus_db_queries::db::model::ByteCount as DbByteCount;
use nexus_db_queries::db::queries::network_interface::InsertError as InsertNicError;
use nexus_db_queries::{authn, authz, db};
use nexus_defaults::DEFAULT_PRIMARY_NIC_NAME;
@@ -75,10 +74,6 @@ struct DiskAttachParams {

declare_saga_actions! {
instance_create;
VIRTUAL_RESOURCES_ACCOUNT -> "no_result" {
+ sic_account_virtual_resources
- sic_account_virtual_resources_undo
}
CREATE_INSTANCE_RECORD -> "instance_record" {
+ sic_create_instance_record
- sic_delete_instance_record
@@ -131,7 +126,6 @@ impl NexusSaga for SagaInstanceCreate {
})?,
));

builder.append(virtual_resources_account_action());
builder.append(create_instance_record_action());

// Helper function for appending subsagas to our parent saga.
@@ -728,56 +722,6 @@ async fn ensure_instance_disk_attach_state(
Ok(())
}

async fn sic_account_virtual_resources(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let instance_id = sagactx.lookup::<Uuid>("instance_id")?;

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);
osagactx
.datastore()
.virtual_provisioning_collection_insert_instance(
&opctx,
instance_id,
params.project_id,
i64::from(params.create_params.ncpus.0),
DbByteCount(params.create_params.memory),
)
.await
.map_err(ActionError::action_failed)?;
Ok(())
}

async fn sic_account_virtual_resources_undo(
sagactx: NexusActionContext,
) -> Result<(), anyhow::Error> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let instance_id = sagactx.lookup::<Uuid>("instance_id")?;

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);
osagactx
.datastore()
.virtual_provisioning_collection_delete_instance(
&opctx,
instance_id,
params.project_id,
i64::from(params.create_params.ncpus.0),
DbByteCount(params.create_params.memory),
)
.await
.map_err(ActionError::action_failed)?;
Ok(())
}

async fn sic_create_instance_record(
sagactx: NexusActionContext,
) -> Result<db::model::Instance, ActionError> {
29 changes: 0 additions & 29 deletions nexus/src/app/sagas/instance_delete.rs
@@ -9,7 +9,6 @@ use super::NexusActionContext;
use super::NexusSaga;
use crate::app::sagas::declare_saga_actions;
use nexus_db_queries::{authn, authz, db};
use nexus_types::identity::Resource;
use omicron_common::api::external::{Error, ResourceType};
use omicron_common::api::internal::shared::SwitchLocation;
use serde::Deserialize;
@@ -40,9 +39,6 @@ declare_saga_actions! {
DEALLOCATE_EXTERNAL_IP -> "no_result3" {
+ sid_deallocate_external_ip
}
VIRTUAL_RESOURCES_ACCOUNT -> "no_result4" {
+ sid_account_virtual_resources
}
}

// instance delete saga: definition
@@ -64,7 +60,6 @@ impl NexusSaga for SagaInstanceDelete {
builder.append(instance_delete_record_action());
builder.append(delete_network_interfaces_action());
builder.append(deallocate_external_ip_action());
builder.append(virtual_resources_account_action());
Ok(builder.build()?)
}
}
@@ -135,30 +130,6 @@ async fn sid_deallocate_external_ip(
Ok(())
}

async fn sid_account_virtual_resources(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);

osagactx
.datastore()
.virtual_provisioning_collection_delete_instance(
&opctx,
params.instance.id(),
params.instance.project_id,
i64::from(params.instance.ncpus.0 .0),
params.instance.memory,
)
.await
.map_err(ActionError::action_failed)?;
Ok(())
}

#[cfg(test)]
mod test {
use crate::{
56 changes: 56 additions & 0 deletions nexus/src/app/sagas/instance_start.rs
@@ -42,6 +42,11 @@ declare_saga_actions! {
+ sis_alloc_propolis_ip
}

ADD_VIRTUAL_RESOURCES -> "virtual_resources" {
+ sis_account_virtual_resources
- sis_account_virtual_resources_undo
}

CREATE_VMM_RECORD -> "vmm_record" {
+ sis_create_vmm_record
- sis_destroy_vmm_record
@@ -96,6 +101,7 @@ impl NexusSaga for SagaInstanceStart {

builder.append(alloc_server_action());
builder.append(alloc_propolis_ip_action());
builder.append(add_virtual_resources_action());
builder.append(create_vmm_record_action());
builder.append(mark_as_starting_action());

Contributor Author:

This ordering is wrong. The saga should only add virtual resources once it's successfully marked the instance as starting, i.e., it should get through the "only one start can proceed at a time" interlock before charging for anything. Will fix in the next push.

Contributor Author:

Fixed in 16b7fe6.
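
A small sketch of the ordering constraint being described (illustrative only; the step names are shorthand for the saga actions, and the second ordering is just one arrangement consistent with the comment, not necessarily the exact one that landed in 16b7fe6):

```rust
// Illustrative sketch only: the ordering constraint from the comment above,
// checked over a hypothetical list of saga step names. The "mark as starting"
// interlock must run before virtual resources are charged, so that only the
// one start attempt that wins the interlock ever creates charges.
fn interlock_precedes_charging(steps: &[&str]) -> bool {
    let position = |name: &str| steps.iter().position(|s| *s == name);
    match (
        position("mark_as_starting"),
        position("add_virtual_resources"),
    ) {
        (Some(interlock), Some(charge)) => interlock < charge,
        _ => false,
    }
}

fn main() {
    // The ordering in this commit (charging before the interlock) fails the check.
    assert!(!interlock_precedes_charging(&[
        "alloc_server",
        "alloc_propolis_ip",
        "add_virtual_resources",
        "create_vmm_record",
        "mark_as_starting",
    ]));

    // One ordering consistent with the comment (interlock first) passes it.
    assert!(interlock_precedes_charging(&[
        "alloc_server",
        "alloc_propolis_ip",
        "create_vmm_record",
        "mark_as_starting",
        "add_virtual_resources",
    ]));
}
```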

builder.append(dpd_ensure_action());
@@ -149,6 +155,56 @@ async fn sis_alloc_propolis_ip(
allocate_sled_ipv6(&opctx, sagactx.user_data().datastore(), sled_uuid).await
}

async fn sis_account_virtual_resources(
sagactx: NexusActionContext,
) -> Result<(), ActionError> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let instance_id = params.db_instance.id();

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);
osagactx
.datastore()
.virtual_provisioning_collection_insert_instance(
&opctx,
instance_id,
params.db_instance.project_id,
i64::from(params.db_instance.ncpus.0 .0),
nexus_db_model::ByteCount(*params.db_instance.memory),
)
.await
.map_err(ActionError::action_failed)?;
Ok(())
}

async fn sis_account_virtual_resources_undo(
sagactx: NexusActionContext,
) -> Result<(), anyhow::Error> {
let osagactx = sagactx.user_data();
let params = sagactx.saga_params::<Params>()?;
let instance_id = params.db_instance.id();

let opctx = crate::context::op_context_for_saga_action(
&sagactx,
&params.serialized_authn,
);
osagactx
.datastore()
.virtual_provisioning_collection_delete_instance(
&opctx,
instance_id,
params.db_instance.project_id,
i64::from(params.db_instance.ncpus.0 .0),
nexus_db_model::ByteCount(*params.db_instance.memory),
)
.await
.map_err(ActionError::action_failed)?;
Ok(())
}

async fn sis_create_vmm_record(
sagactx: NexusActionContext,
) -> Result<db::model::Vmm, ActionError> {