From 66100a533194175c198e8d3f80c2b29b39d1acc2 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 10 Nov 2023 17:49:15 -0800 Subject: [PATCH 01/25] WIP: First attempt at txn retry --- .../db-queries/src/db/datastore/silo_group.rs | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/silo_group.rs b/nexus/db-queries/src/db/datastore/silo_group.rs index 46f4aae7c9..f4c945f93b 100644 --- a/nexus/db-queries/src/db/datastore/silo_group.rs +++ b/nexus/db-queries/src/db/datastore/silo_group.rs @@ -147,34 +147,40 @@ impl DataStore { self.pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - use db::schema::silo_group_membership::dsl; + .transaction_async_with_retry( + |conn| { + let silo_group_ids = silo_group_ids.clone(); + async move { + use db::schema::silo_group_membership::dsl; - // Delete existing memberships for user - let silo_user_id = authz_silo_user.id(); - diesel::delete(dsl::silo_group_membership) - .filter(dsl::silo_user_id.eq(silo_user_id)) - .execute_async(&conn) - .await?; + // Delete existing memberships for user + let silo_user_id = authz_silo_user.id(); + diesel::delete(dsl::silo_group_membership) + .filter(dsl::silo_user_id.eq(silo_user_id)) + .execute_async(&conn) + .await?; - // Create new memberships for user - let silo_group_memberships: Vec< - db::model::SiloGroupMembership, - > = silo_group_ids - .iter() - .map(|group_id| db::model::SiloGroupMembership { - silo_group_id: *group_id, - silo_user_id, - }) - .collect(); + // Create new memberships for user + let silo_group_memberships: Vec< + db::model::SiloGroupMembership, + > = silo_group_ids + .iter() + .map(|group_id| db::model::SiloGroupMembership { + silo_group_id: *group_id, + silo_user_id, + }) + .collect(); - diesel::insert_into(dsl::silo_group_membership) - .values(silo_group_memberships) - .execute_async(&conn) - .await?; + diesel::insert_into(dsl::silo_group_membership) + .values(silo_group_memberships) + .execute_async(&conn) + .await?; - Ok(()) - }) + Ok(()) + } + }, + || async { true }, + ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } From 10a021d7354f26064f0685d87ad51a607f9927ed Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 14 Nov 2023 17:40:29 -0800 Subject: [PATCH 02/25] Add transaction retry helper --- Cargo.lock | 1 - Cargo.toml | 3 +- nexus/db-queries/src/db/datastore/mod.rs | 6 + .../db-queries/src/db/datastore/silo_group.rs | 7 +- nexus/db-queries/src/lib.rs | 1 + nexus/db-queries/src/transaction_retry.rs | 141 ++++++++++++++++++ 6 files changed, 156 insertions(+), 3 deletions(-) create mode 100644 nexus/db-queries/src/transaction_retry.rs diff --git a/Cargo.lock b/Cargo.lock index 5bcce6e8ad..fc6f8ea373 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -287,7 +287,6 @@ checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" [[package]] name = "async-bb8-diesel" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/async-bb8-diesel?rev=1446f7e0c1f05f33a0581abd51fa873c7652ab61#1446f7e0c1f05f33a0581abd51fa873c7652ab61" dependencies = [ "async-trait", "bb8", diff --git a/Cargo.toml b/Cargo.toml index 0e13946533..5f3963d92e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -140,7 +140,8 @@ api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" assert_cmd = "2.0.12" -async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "1446f7e0c1f05f33a0581abd51fa873c7652ab61" } +# async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "1446f7e0c1f05f33a0581abd51fa873c7652ab61" } +async-bb8-diesel = { path = "../../async-bb8-diesel" } async-trait = "0.1.74" atomicwrites = "0.4.2" authz-macros = { path = "nexus/authz-macros" } diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 91373f6875..d86b310119 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -146,6 +146,7 @@ pub type DataStoreConnection<'a> = pub struct DataStore { pool: Arc, virtual_provisioning_collection_producer: crate::provisioning::Producer, + transaction_retry_producer: crate::transaction_retry::Producer, } // The majority of `DataStore`'s methods live in our submodules as a concession @@ -162,6 +163,8 @@ impl DataStore { pool, virtual_provisioning_collection_producer: crate::provisioning::Producer::new(), + transaction_retry_producer: crate::transaction_retry::Producer::new( + ), }; Ok(datastore) } @@ -208,6 +211,9 @@ impl DataStore { self.virtual_provisioning_collection_producer.clone(), ) .unwrap(); + registry + .register_producer(self.transaction_retry_producer.clone()) + .unwrap(); } /// Returns a connection to a connection from the database connection pool. diff --git a/nexus/db-queries/src/db/datastore/silo_group.rs b/nexus/db-queries/src/db/datastore/silo_group.rs index f4c945f93b..31f9c47027 100644 --- a/nexus/db-queries/src/db/datastore/silo_group.rs +++ b/nexus/db-queries/src/db/datastore/silo_group.rs @@ -145,6 +145,11 @@ impl DataStore { ) -> UpdateResult<()> { opctx.authorize(authz::Action::Modify, authz_silo_user).await?; + let retry_helper = crate::transaction_retry::RetryHelper::new( + &self.transaction_retry_producer, + "silo_group_membership_replace_for_user", + ); + self.pool_connection_authorized(opctx) .await? .transaction_async_with_retry( @@ -179,7 +184,7 @@ impl DataStore { Ok(()) } }, - || async { true }, + retry_helper.as_callback(), ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) diff --git a/nexus/db-queries/src/lib.rs b/nexus/db-queries/src/lib.rs index a693f7ff42..5d1927ebc7 100644 --- a/nexus/db-queries/src/lib.rs +++ b/nexus/db-queries/src/lib.rs @@ -9,6 +9,7 @@ pub mod authz; pub mod context; pub mod db; pub mod provisioning; +pub mod transaction_retry; #[macro_use] extern crate slog; diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs new file mode 100644 index 0000000000..707972e6a3 --- /dev/null +++ b/nexus/db-queries/src/transaction_retry.rs @@ -0,0 +1,141 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Helper types for performing automatic transaction retries + +use chrono::Utc; +use oximeter::{types::Sample, Metric, MetricsError, Target}; +use std::sync::{Arc, Mutex}; +use std::time::Duration; + +// Identifies "which" transaction is retrying +#[derive(Debug, Clone, Target)] +struct Transaction { + name: String, +} + +// Identifies that a retry has occurred, and track how long +// the transaction took (either since starting, or since the last +// retry failure was recorded). +#[derive(Debug, Clone, Metric)] +struct RetryData { + #[datum] + latency: f64, +} + +// Collects all transaction retry samples +#[derive(Debug, Default, Clone)] +pub(crate) struct Producer { + samples: Arc>>, +} + +impl Producer { + pub(crate) fn new() -> Self { + Self { samples: Arc::new(Mutex::new(vec![])) } + } + + fn append( + &self, + transaction: &Transaction, + latency: Duration, + ) -> Result<(), MetricsError> { + let sample = Sample::new_with_timestamp( + Utc::now(), + transaction, + &RetryData { latency: latency.as_secs_f64() }, + )?; + self.samples.lock().unwrap().push(sample); + Ok(()) + } +} + +struct RetryHelperInner { + start: chrono::DateTime, + attempts: usize, +} + +impl RetryHelperInner { + fn new() -> Self { + Self { start: Utc::now(), attempts: 1 } + } + + fn tick(&mut self) -> Self { + let start = self.start; + let attempts = self.attempts; + + self.start = Utc::now(); + self.attempts += 1; + + Self { start, attempts } + } +} + +/// Helper utility for tracking retry attempts and latency. +/// Intended to be used from within "transaction_async_with_retry". +pub(crate) struct RetryHelper { + producer: Producer, + name: &'static str, + inner: Mutex, +} + +const MAX_RETRY_ATTEMPTS: usize = 10; + +impl RetryHelper { + /// Creates a new RetryHelper, and starts a timer tracking the transaction + /// duration. + pub(crate) fn new(producer: &Producer, name: &'static str) -> Self { + Self { + producer: producer.clone(), + name, + inner: Mutex::new(RetryHelperInner::new()), + } + } + + // Called upon retryable transaction failure. + // + // This function: + // - Appends a metric identifying the duration of the transaction operation + // - Performs a randomly generated backoff (limited to less than 50 ms) + // - Returns "true" if the transaction should be restarted + async fn retry_callback(&self) -> bool { + let inner = self.inner.lock().unwrap().tick(); + + let _ = self.producer.append( + &Transaction { name: self.name.into() }, + (Utc::now() - inner.start).to_std().unwrap_or(Duration::ZERO), + ); + + // This backoff is not exponential, but I'm not sure we actually want + // that much backoff here. If we're repeatedly failing, it would + // probably be better to fail the operation, at which point Oximeter + // will keep track of the failing transaction and identify that it's a + // high-priority target for CTE conversion. + tokio::time::sleep(Duration::from_millis(rand::random::() % 50)) + .await; + + return inner.attempts < MAX_RETRY_ATTEMPTS; + } + + /// Converts this function to a retryable callback that can be used from + /// "transaction_async_with_retry". + pub(crate) fn as_callback( + self, + ) -> impl Fn() -> futures::future::BoxFuture<'static, bool> { + let r = Arc::new(self); + move || { + let r = r.clone(); + Box::pin(async move { r.retry_callback().await }) + } + } +} + +impl oximeter::Producer for Producer { + fn produce( + &mut self, + ) -> Result + 'static>, MetricsError> { + let samples = + std::mem::replace(&mut *self.samples.lock().unwrap(), vec![]); + Ok(Box::new(samples.into_iter())) + } +} From d77b58c7d331d91af169e44113f09e9f40fc9410 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 11:04:00 -0800 Subject: [PATCH 03/25] More conversions (tests passing) --- nexus/db-model/src/sled.rs | 2 +- nexus/db-queries/src/db/datastore/project.rs | 94 +++++--- nexus/db-queries/src/db/datastore/region.rs | 172 +++++++------- .../db-queries/src/db/datastore/silo_group.rs | 3 +- nexus/db-queries/src/db/datastore/sled.rs | 222 ++++++++++-------- nexus/db-queries/src/db/error.rs | 23 -- 6 files changed, 266 insertions(+), 250 deletions(-) diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs index 5e059946ff..d92da7f414 100644 --- a/nexus/db-model/src/sled.rs +++ b/nexus/db-model/src/sled.rs @@ -154,7 +154,7 @@ impl DatastoreCollectionConfig for Sled { } /// A set of constraints that can be placed on operations that select a sled. -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct SledReservationConstraints { must_select_from: Vec, } diff --git a/nexus/db-queries/src/db/datastore/project.rs b/nexus/db-queries/src/db/datastore/project.rs index c447b5bf98..22c9cad1e1 100644 --- a/nexus/db-queries/src/db/datastore/project.rs +++ b/nexus/db-queries/src/db/datastore/project.rs @@ -24,6 +24,7 @@ use crate::db::model::ProjectUpdate; use crate::db::model::Silo; use crate::db::model::VirtualProvisioningCollection; use crate::db::pagination::paginated; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; use chrono::Utc; use diesel::prelude::*; @@ -37,6 +38,7 @@ use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use ref_cast::RefCast; +use std::sync::{Arc, OnceLock}; // Generates internal functions used for validation during project deletion. // Used simply to reduce boilerplate. @@ -151,49 +153,69 @@ impl DataStore { use db::schema::project::dsl; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "project_create_in_silo", + ); let name = project.name().as_str().to_string(); let db_project = self .pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - let project: Project = Silo::insert_resource( - silo_id, - diesel::insert_into(dsl::project).values(project), - ) - .insert_and_get_result_async(&conn) - .await - .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => { - authz_silo_inner.not_found() - } - AsyncInsertError::DatabaseError(e) => { - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Project, - &name, - ), - ) - } - })?; + .transaction_async_with_retry(|conn| { + let err = err.clone(); + + let authz_silo_inner = authz_silo_inner.clone(); + let name = name.clone(); + let project = project.clone(); + async move { + let project: Project = Silo::insert_resource( + silo_id, + diesel::insert_into(dsl::project).values(project), + ) + .insert_and_get_result_async(&conn) + .await + .map_err(|e| match e { + AsyncInsertError::CollectionNotFound => { + err.set(authz_silo_inner.not_found()).unwrap(); + return diesel::result::Error::RollbackTransaction; + } + AsyncInsertError::DatabaseError(e) => { + err.set(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Project, + &name, + ), + )).unwrap(); + return diesel::result::Error::RollbackTransaction; + } + })?; - // Create resource provisioning for the project. - self.virtual_provisioning_collection_create_on_connection( - &conn, - VirtualProvisioningCollection::new( - project.id(), - CollectionTypeProvisioned::Project, - ), - ) - .await?; - Ok(project) - }) + // Create resource provisioning for the project. + self.virtual_provisioning_collection_create_on_connection( + &conn, + VirtualProvisioningCollection::new( + project.id(), + CollectionTypeProvisioned::Project, + ), + ) + .await + .map_err(|e| { + err.set(e).unwrap(); + return diesel::result::Error::RollbackTransaction; + })?; + Ok(project) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + .map_err(|e| { + if let Some(err) = err.get() { + return err.clone(); } + public_error_from_diesel(e, ErrorHandler::Server) })?; Ok(( diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index 9465fe2792..cfb6dbeb66 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -10,10 +10,10 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::lookup::LookupPath; use crate::db::model::Dataset; use crate::db::model::Region; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; @@ -21,9 +21,9 @@ use nexus_types::external_api::params; use omicron_common::api::external; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; -use omicron_common::backoff::{self, BackoffError}; use omicron_common::nexus_config::RegionAllocationStrategy; use slog::Logger; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -152,7 +152,7 @@ impl DataStore { /// Also updates the storage usage on their corresponding datasets. pub async fn regions_hard_delete( &self, - log: &Logger, + _log: &Logger, region_ids: Vec, ) -> DeleteResult { if region_ids.is_empty() { @@ -164,98 +164,88 @@ impl DataStore { #[error("Numeric error: {0}")] NumericError(String), } - type TxnError = TransactionError; - - // Retry this transaction until it succeeds. It's a little heavy in that - // there's a for loop inside that iterates over the datasets the - // argument regions belong to, and it often encounters the "retry - // transaction" error. - let transaction = { - |region_ids: Vec| async { - self.pool_connection_unauthorized() - .await? - .transaction_async(|conn| async move { - use db::schema::dataset::dsl as dataset_dsl; - use db::schema::region::dsl as region_dsl; - - // Remove the regions, collecting datasets they're from. - let datasets = diesel::delete(region_dsl::region) - .filter(region_dsl::id.eq_any(region_ids)) - .returning(region_dsl::dataset_id) - .get_results_async::(&conn).await?; - - // Update datasets to which the regions belonged. - for dataset in datasets { - let dataset_total_occupied_size: Option< - diesel::pg::data_types::PgNumeric, - > = region_dsl::region - .filter(region_dsl::dataset_id.eq(dataset)) - .select(diesel::dsl::sum( - region_dsl::block_size - * region_dsl::blocks_per_extent - * region_dsl::extent_count, - )) - .nullable() - .get_result_async(&conn).await?; - - let dataset_total_occupied_size: i64 = if let Some( - dataset_total_occupied_size, - ) = - dataset_total_occupied_size - { - let dataset_total_occupied_size: db::model::ByteCount = - dataset_total_occupied_size.try_into().map_err( - |e: anyhow::Error| { - TxnError::CustomError( - RegionDeleteError::NumericError( - e.to_string(), - ), - ) - }, - )?; - - dataset_total_occupied_size.into() - } else { - 0 - }; - - diesel::update(dataset_dsl::dataset) - .filter(dataset_dsl::id.eq(dataset)) - .set( - dataset_dsl::size_used - .eq(dataset_total_occupied_size), - ) - .execute_async(&conn).await?; - } + let err = Arc::new(OnceLock::new()); - Ok(()) - }) - .await - .map_err(|e: TxnError| { - if e.retry_transaction() { - BackoffError::transient(Error::internal_error( - &format!("Retryable transaction error {:?}", e) - )) - } else { - BackoffError::Permanent(Error::internal_error( - &format!("Transaction error: {}", e) - )) - } - }) - } - }; + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "regions_hard_delete", + ); - backoff::retry_notify( - backoff::retry_policy_internal_service_aggressive(), - || async { + self.pool_connection_unauthorized() + .await? + .transaction_async_with_retry(|conn| { + let err = err.clone(); let region_ids = region_ids.clone(); - transaction(region_ids).await - }, - |e: Error, delay| { - info!(log, "{:?}, trying again in {:?}", e, delay,); + async move { + use db::schema::dataset::dsl as dataset_dsl; + use db::schema::region::dsl as region_dsl; + + // Remove the regions, collecting datasets they're from. + let datasets = diesel::delete(region_dsl::region) + .filter(region_dsl::id.eq_any(region_ids)) + .returning(region_dsl::dataset_id) + .get_results_async::(&conn).await?; + + // Update datasets to which the regions belonged. + for dataset in datasets { + let dataset_total_occupied_size: Option< + diesel::pg::data_types::PgNumeric, + > = region_dsl::region + .filter(region_dsl::dataset_id.eq(dataset)) + .select(diesel::dsl::sum( + region_dsl::block_size + * region_dsl::blocks_per_extent + * region_dsl::extent_count, + )) + .nullable() + .get_result_async(&conn).await?; + + let dataset_total_occupied_size: i64 = if let Some( + dataset_total_occupied_size, + ) = + dataset_total_occupied_size + { + let dataset_total_occupied_size: db::model::ByteCount = + dataset_total_occupied_size.try_into().map_err( + |e: anyhow::Error| { + err.set(RegionDeleteError::NumericError( + e.to_string(), + )).unwrap(); + diesel::result::Error::RollbackTransaction + }, + )?; + + dataset_total_occupied_size.into() + } else { + 0 + }; + + diesel::update(dataset_dsl::dataset) + .filter(dataset_dsl::id.eq(dataset)) + .set( + dataset_dsl::size_used + .eq(dataset_total_occupied_size), + ) + .execute_async(&conn).await?; + } + Ok(()) + } }, - ) - .await + retry_helper.as_callback(), + ) + .await + .map_err(|e| { + if let Some(err) = err.get() { + match err { + RegionDeleteError::NumericError(err) => { + return Error::internal_error( + &format!("Transaction error: {}", err) + ); + } + } + } + public_error_from_diesel(e, ErrorHandler::Server) + }) } /// Return the total occupied size for a dataset diff --git a/nexus/db-queries/src/db/datastore/silo_group.rs b/nexus/db-queries/src/db/datastore/silo_group.rs index 31f9c47027..56a62d6b3c 100644 --- a/nexus/db-queries/src/db/datastore/silo_group.rs +++ b/nexus/db-queries/src/db/datastore/silo_group.rs @@ -15,6 +15,7 @@ use crate::db::error::TransactionError; use crate::db::model::SiloGroup; use crate::db::model::SiloGroupMembership; use crate::db::pagination::paginated; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -145,7 +146,7 @@ impl DataStore { ) -> UpdateResult<()> { opctx.authorize(authz::Action::Modify, authz_silo_user).await?; - let retry_helper = crate::transaction_retry::RetryHelper::new( + let retry_helper = RetryHelper::new( &self.transaction_retry_producer, "silo_group_membership_replace_for_user", ); diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index f4f5188057..d052b0ab29 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -10,11 +10,11 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::identity::Asset; use crate::db::model::Sled; use crate::db::model::SledResource; use crate::db::pagination::paginated; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -25,6 +25,7 @@ use omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::ResourceType; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -85,118 +86,143 @@ impl DataStore { enum SledReservationError { NotFound, } - type TxnError = TransactionError; + let err = Arc::new(OnceLock::new()); + + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "sled_reservation_create", + ); self.pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - use db::schema::sled_resource::dsl as resource_dsl; - // Check if resource ID already exists - if so, return it. - let old_resource = resource_dsl::sled_resource - .filter(resource_dsl::id.eq(resource_id)) - .select(SledResource::as_select()) - .limit(1) - .load_async(&conn) - .await?; + .transaction_async_with_retry( + |conn| { + // Clone variables into retryable function + let err = err.clone(); + let constraints = constraints.clone(); + let resources = resources.clone(); - if !old_resource.is_empty() { - return Ok(old_resource[0].clone()); - } + async move { + use db::schema::sled_resource::dsl as resource_dsl; + // Check if resource ID already exists - if so, return it. + let old_resource = resource_dsl::sled_resource + .filter(resource_dsl::id.eq(resource_id)) + .select(SledResource::as_select()) + .limit(1) + .load_async(&conn) + .await?; - // If it doesn't already exist, find a sled with enough space - // for the resources we're requesting. - use db::schema::sled::dsl as sled_dsl; - // This answers the boolean question: - // "Does the SUM of all hardware thread usage, plus the one we're trying - // to allocate, consume less threads than exists on the sled?" - let sled_has_space_for_threads = - (diesel::dsl::sql::(&format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::hardware_threads::NAME - )) + resources.hardware_threads) - .le(sled_dsl::usable_hardware_threads); + if !old_resource.is_empty() { + return Ok(old_resource[0].clone()); + } - // This answers the boolean question: - // "Does the SUM of all RAM usage, plus the one we're trying - // to allocate, consume less RAM than exists on the sled?" - let sled_has_space_for_rss = - (diesel::dsl::sql::(&format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::rss_ram::NAME - )) + resources.rss_ram) - .le(sled_dsl::usable_physical_ram); + // If it doesn't already exist, find a sled with enough space + // for the resources we're requesting. + use db::schema::sled::dsl as sled_dsl; + // This answers the boolean question: + // "Does the SUM of all hardware thread usage, plus the one we're trying + // to allocate, consume less threads than exists on the sled?" + let sled_has_space_for_threads = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::hardware_threads::NAME + ), + ) + resources.hardware_threads) + .le(sled_dsl::usable_hardware_threads); - // Determine whether adding this service's reservoir allocation - // to what's allocated on the sled would avoid going over quota. - let sled_has_space_in_reservoir = - (diesel::dsl::sql::(&format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::reservoir_ram::NAME - )) + resources.reservoir_ram) - .le(sled_dsl::reservoir_size); + // This answers the boolean question: + // "Does the SUM of all RAM usage, plus the one we're trying + // to allocate, consume less RAM than exists on the sled?" + let sled_has_space_for_rss = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::rss_ram::NAME + ), + ) + resources.rss_ram) + .le(sled_dsl::usable_physical_ram); - // Generate a query describing all of the sleds that have space - // for this reservation. - let mut sled_targets = sled_dsl::sled - .left_join( - resource_dsl::sled_resource - .on(resource_dsl::sled_id.eq(sled_dsl::id)), - ) - .group_by(sled_dsl::id) - .having( - sled_has_space_for_threads - .and(sled_has_space_for_rss) - .and(sled_has_space_in_reservoir), - ) - .filter(sled_dsl::time_deleted.is_null()) - .select(sled_dsl::id) - .into_boxed(); + // Determine whether adding this service's reservoir allocation + // to what's allocated on the sled would avoid going over quota. + let sled_has_space_in_reservoir = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::reservoir_ram::NAME + ), + ) + resources.reservoir_ram) + .le(sled_dsl::reservoir_size); - // Further constrain the sled IDs according to any caller- - // supplied constraints. - if let Some(must_select_from) = constraints.must_select_from() { - sled_targets = sled_targets - .filter(sled_dsl::id.eq_any(must_select_from.to_vec())); - } + // Generate a query describing all of the sleds that have space + // for this reservation. + let mut sled_targets = sled_dsl::sled + .left_join( + resource_dsl::sled_resource + .on(resource_dsl::sled_id.eq(sled_dsl::id)), + ) + .group_by(sled_dsl::id) + .having( + sled_has_space_for_threads + .and(sled_has_space_for_rss) + .and(sled_has_space_in_reservoir), + ) + .filter(sled_dsl::time_deleted.is_null()) + .select(sled_dsl::id) + .into_boxed(); - sql_function!(fn random() -> diesel::sql_types::Float); - let sled_targets = sled_targets - .order(random()) - .limit(1) - .get_results_async::(&conn) - .await?; + // Further constrain the sled IDs according to any caller- + // supplied constraints. + if let Some(must_select_from) = + constraints.must_select_from() + { + sled_targets = sled_targets.filter( + sled_dsl::id.eq_any(must_select_from.to_vec()), + ); + } - if sled_targets.is_empty() { - return Err(TxnError::CustomError( - SledReservationError::NotFound, - )); - } + sql_function!(fn random() -> diesel::sql_types::Float); + let sled_targets = sled_targets + .order(random()) + .limit(1) + .get_results_async::(&conn) + .await?; - // Create a SledResource record, associate it with the target - // sled. - let resource = SledResource::new( - resource_id, - sled_targets[0], - resource_kind, - resources, - ); + if sled_targets.is_empty() { + err.set(SledReservationError::NotFound).unwrap(); + return Err(diesel::result::Error::NotFound); + } - Ok(diesel::insert_into(resource_dsl::sled_resource) - .values(resource) - .returning(SledResource::as_returning()) - .get_result_async(&conn) - .await?) - }) + // Create a SledResource record, associate it with the target + // sled. + let resource = SledResource::new( + resource_id, + sled_targets[0], + resource_kind, + resources, + ); + + Ok(diesel::insert_into(resource_dsl::sled_resource) + .values(resource) + .returning(SledResource::as_returning()) + .get_result_async(&conn) + .await?) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError(SledReservationError::NotFound) => { - external::Error::unavail( - "No sleds can fit the requested instance", - ) - } - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + .map_err(|e| { + if let Some(err) = err.get() { + match err { + SledReservationError::NotFound => { + return external::Error::unavail( + "No sleds can fit the requested instance", + ); + } + } } + public_error_from_diesel(e, ErrorHandler::Server) }) } diff --git a/nexus/db-queries/src/db/error.rs b/nexus/db-queries/src/db/error.rs index cbe2b0a71f..d4770d4824 100644 --- a/nexus/db-queries/src/db/error.rs +++ b/nexus/db-queries/src/db/error.rs @@ -34,29 +34,6 @@ impl From for TransactionError { } } -impl TransactionError { - /// Based on [the CRDB][1] docs, return true if this transaction must be - /// retried. - /// - /// [1]: https://www.cockroachlabs.com/docs/v23.1/transaction-retry-error-reference#client-side-retry-handling - pub fn retry_transaction(&self) -> bool { - match &self { - Self::Database(DieselError::DatabaseError( - kind, - boxed_error_information, - )) => match kind { - DieselErrorKind::SerializationFailure => { - return boxed_error_information - .message() - .starts_with("restart transaction"); - } - _ => false, - }, - _ => false, - } - } -} - /// Summarizes details provided with a database error. fn format_database_error( kind: DieselErrorKind, From 64d46a441f69f3ea7287209ca6df12485cadaf29 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 13:42:40 -0800 Subject: [PATCH 04/25] More conversions (collections, network, switch_port) --- nexus/db-queries/src/db/collection_attach.rs | 27 +- .../src/db/collection_detach_many.rs | 26 +- .../src/db/datastore/network_interface.rs | 132 +- nexus/db-queries/src/db/datastore/sled.rs | 1 - .../src/db/datastore/switch_port.rs | 1162 +++++++++-------- 5 files changed, 691 insertions(+), 657 deletions(-) diff --git a/nexus/db-queries/src/db/collection_attach.rs b/nexus/db-queries/src/db/collection_attach.rs index ea4d9d5beb..fccc1aa324 100644 --- a/nexus/db-queries/src/db/collection_attach.rs +++ b/nexus/db-queries/src/db/collection_attach.rs @@ -563,12 +563,9 @@ where #[cfg(test)] mod test { use super::*; - use crate::db::{ - self, error::TransactionError, identity::Resource as IdentityResource, - }; + use crate::db::{self, identity::Resource as IdentityResource}; use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - ConnectionManager, + AsyncRunQueryDsl, AsyncSimpleConnection, ConnectionManager, }; use chrono::Utc; use db_macros::Resource; @@ -999,22 +996,12 @@ mod test { .set(resource::dsl::collection_id.eq(collection_id)), ); - type TxnError = - TransactionError>; - let result = conn - .transaction_async(|conn| async move { - attach_query.attach_and_get_result_async(&conn).await.map_err( - |e| match e { - AttachError::DatabaseError(e) => TxnError::from(e), - e => TxnError::CustomError(e), - }, - ) - }) - .await; - // "attach_and_get_result" should return the "attached" resource. - let (returned_collection, returned_resource) = - result.expect("Attach should have worked"); + let (returned_collection, returned_resource) = attach_query + .attach_and_get_result_async(&conn) + .await + .expect("Attach should have worked"); + assert_eq!( returned_resource.collection_id.expect("Expected a collection ID"), collection_id diff --git a/nexus/db-queries/src/db/collection_detach_many.rs b/nexus/db-queries/src/db/collection_detach_many.rs index 8df6d4aed4..986cfb70b7 100644 --- a/nexus/db-queries/src/db/collection_detach_many.rs +++ b/nexus/db-queries/src/db/collection_detach_many.rs @@ -479,12 +479,9 @@ where mod test { use super::*; use crate::db::collection_attach::DatastoreAttachTarget; - use crate::db::{ - self, error::TransactionError, identity::Resource as IdentityResource, - }; + use crate::db::{self, identity::Resource as IdentityResource}; use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - ConnectionManager, + AsyncRunQueryDsl, AsyncSimpleConnection, ConnectionManager, }; use chrono::Utc; use db_macros::Resource; @@ -919,21 +916,12 @@ mod test { .set(resource::dsl::collection_id.eq(Option::::None)), ); - type TxnError = - TransactionError>; - let result = conn - .transaction_async(|conn| async move { - detach_query.detach_and_get_result_async(&conn).await.map_err( - |e| match e { - DetachManyError::DatabaseError(e) => TxnError::from(e), - e => TxnError::CustomError(e), - }, - ) - }) - .await; - // "detach_and_get_result" should return the "detached" resource. - let returned_collection = result.expect("Detach should have worked"); + let returned_collection = detach_query + .detach_and_get_result_async(&conn) + .await + .expect("Detach should have worked"); + // The returned values should be the latest value in the DB. assert_eq!( returned_collection, diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index 06550e9439..0558149c0b 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -13,7 +13,6 @@ use crate::db::collection_insert::DatastoreCollection; use crate::db::cte_utils::BoxedQuery; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::IncompleteNetworkInterface; use crate::db::model::Instance; use crate::db::model::InstanceNetworkInterface; @@ -25,6 +24,7 @@ use crate::db::model::VpcSubnet; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; use crate::db::queries::network_interface; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -40,6 +40,7 @@ use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use ref_cast::RefCast; use sled_agent_client::types as sled_client_types; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; /// OPTE requires information that's currently split across the network @@ -466,77 +467,98 @@ impl DataStore { InstanceNotStopped, FailedToUnsetPrimary(DieselError), } - type TxnError = TransactionError; + + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "instance_update_network_interface", + ); let conn = self.pool_connection_authorized(opctx).await?; if primary { - conn.transaction_async(|conn| async move { - let instance_runtime = - instance_query.get_result_async(&conn).await?.runtime_state; - if instance_runtime.propolis_id.is_some() - || instance_runtime.nexus_state != stopped - { - return Err(TxnError::CustomError( - NetworkInterfaceUpdateError::InstanceNotStopped, - )); - } - - // First, get the primary interface - let primary_interface = - find_primary_query.get_result_async(&conn).await?; - // If the target and primary are different, we need to toggle - // the primary into a secondary. - if primary_interface.identity.id != interface_id { - use crate::db::schema::network_interface::dsl; - if let Err(e) = diesel::update(dsl::network_interface) - .filter(dsl::id.eq(primary_interface.identity.id)) - .filter(dsl::kind.eq(NetworkInterfaceKind::Instance)) - .filter(dsl::time_deleted.is_null()) - .set(dsl::is_primary.eq(false)) - .execute_async(&conn) - .await + conn.transaction_async_with_retry(|conn| { + let err = err.clone(); + let stopped = stopped.clone(); + let update_target_query = update_target_query.clone(); + async move { + let instance_runtime = + instance_query.get_result_async(&conn).await?.runtime_state; + if instance_runtime.propolis_id.is_some() + || instance_runtime.nexus_state != stopped { - return Err(TxnError::CustomError( - NetworkInterfaceUpdateError::FailedToUnsetPrimary( - e, - ), - )); + err.set(NetworkInterfaceUpdateError::InstanceNotStopped).unwrap(); + return Err(diesel::result::Error::RollbackTransaction); } - } - // In any case, update the actual target - Ok(update_target_query.get_result_async(&conn).await?) - }) + // First, get the primary interface + let primary_interface = + find_primary_query.get_result_async(&conn).await?; + // If the target and primary are different, we need to toggle + // the primary into a secondary. + if primary_interface.identity.id != interface_id { + use crate::db::schema::network_interface::dsl; + if let Err(e) = diesel::update(dsl::network_interface) + .filter(dsl::id.eq(primary_interface.identity.id)) + .filter(dsl::kind.eq(NetworkInterfaceKind::Instance)) + .filter(dsl::time_deleted.is_null()) + .set(dsl::is_primary.eq(false)) + .execute_async(&conn) + .await + { + err.set(NetworkInterfaceUpdateError::FailedToUnsetPrimary(e)).unwrap(); + return Err(diesel::result::Error::RollbackTransaction); + } + } + + // In any case, update the actual target + Ok(update_target_query.get_result_async(&conn).await?) + } + }, + retry_helper.as_callback() + ) } else { // In this case, we can just directly apply the updates. By // construction, `updates.primary` is `None`, so nothing will // be done there. The other columns always need to be updated, and // we're only hitting a single row. Note that we still need to // verify the instance is stopped. - conn.transaction_async(|conn| async move { - let instance_state = - instance_query.get_result_async(&conn).await?.runtime_state; - if instance_state.propolis_id.is_some() - || instance_state.nexus_state != stopped - { - return Err(TxnError::CustomError( - NetworkInterfaceUpdateError::InstanceNotStopped, - )); - } - Ok(update_target_query.get_result_async(&conn).await?) - }) + conn.transaction_async_with_retry(|conn| { + let err = err.clone(); + let stopped = stopped.clone(); + let update_target_query = update_target_query.clone(); + async move { + let instance_state = + instance_query.get_result_async(&conn).await?.runtime_state; + if instance_state.propolis_id.is_some() + || instance_state.nexus_state != stopped + { + err.set(NetworkInterfaceUpdateError::InstanceNotStopped).unwrap(); + return Err(diesel::result::Error::RollbackTransaction); + } + Ok(update_target_query.get_result_async(&conn).await?) + } + }, + retry_helper.as_callback() + ) } .await // Convert to `InstanceNetworkInterface` before returning, we know // this is valid as we've filtered appropriately above. .map(NetworkInterface::as_instance) - .map_err(|e| match e { - TxnError::CustomError( - NetworkInterfaceUpdateError::InstanceNotStopped, - ) => Error::invalid_request( - "Instance must be stopped to update its network interfaces", - ), - _ => Error::internal_error(&format!("Transaction error: {:?}", e)), + .map_err(|e| { + if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + match err { + NetworkInterfaceUpdateError::InstanceNotStopped => { + return Error::invalid_request( + "Instance must be stopped to update its network interfaces", + ); + }, + NetworkInterfaceUpdateError::FailedToUnsetPrimary(err) => { + return public_error_from_diesel(err, ErrorHandler::Server); + }, + } + } + public_error_from_diesel(e, ErrorHandler::Server) }) } } diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index d052b0ab29..85ed1cd4de 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -87,7 +87,6 @@ impl DataStore { NotFound, } let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( &self.transaction_retry_producer, "sled_reservation_create", diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index f301750ee9..0d21419acd 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -20,6 +20,7 @@ use crate::db::model::{ SwitchVlanInterfaceConfig, }; use crate::db::pagination::paginated; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; use diesel::result::Error as DieselError; use diesel::{ @@ -34,6 +35,7 @@ use omicron_common::api::external::{ }; use ref_cast::RefCast; use serde::{Deserialize, Serialize}; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; #[derive(Clone, Debug, Deserialize, Serialize)] @@ -163,280 +165,284 @@ impl DataStore { BgpConfigNotFound, ReserveBlock(ReserveBlockError), } - type TxnError = TransactionError; type SpsCreateError = SwitchPortSettingsCreateError; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "switch_port_settings_create", + ); + let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - // create the top level port settings object - let port_settings = match id { - Some(id) => SwitchPortSettings::with_id(id, ¶ms.identity), - None => SwitchPortSettings::new(¶ms.identity), - }; - //let port_settings = SwitchPortSettings::new(¶ms.identity); - let db_port_settings: SwitchPortSettings = - diesel::insert_into(port_settings_dsl::switch_port_settings) - .values(port_settings) - .returning(SwitchPortSettings::as_returning()) - .get_result_async(&conn) - .await?; - - let psid = db_port_settings.identity.id; + conn.transaction_async_with_retry(|conn| { + let err = err.clone(); + async move { + // create the top level port settings object + let port_settings = match id { + Some(id) => SwitchPortSettings::with_id(id, ¶ms.identity), + None => SwitchPortSettings::new(¶ms.identity), + }; + //let port_settings = SwitchPortSettings::new(¶ms.identity); + let db_port_settings: SwitchPortSettings = + diesel::insert_into(port_settings_dsl::switch_port_settings) + .values(port_settings) + .returning(SwitchPortSettings::as_returning()) + .get_result_async(&conn) + .await?; + + let psid = db_port_settings.identity.id; + + // add the port config + let port_config = SwitchPortConfig::new( + psid, + params.port_config.geometry.into(), + ); - // add the port config - let port_config = SwitchPortConfig::new( - psid, - params.port_config.geometry.into(), - ); + let db_port_config: SwitchPortConfig = + diesel::insert_into(port_config_dsl::switch_port_settings_port_config) + .values(port_config) + .returning(SwitchPortConfig::as_returning()) + .get_result_async(&conn) + .await?; + + let mut result = SwitchPortSettingsCombinedResult{ + settings: db_port_settings, + groups: Vec::new(), + port: db_port_config, + links: Vec::new(), + link_lldp: Vec::new(), + interfaces: Vec::new(), + vlan_interfaces: Vec::new(), + routes: Vec::new(), + bgp_peers: Vec::new(), + addresses: Vec::new(), + }; - let db_port_config: SwitchPortConfig = - diesel::insert_into(port_config_dsl::switch_port_settings_port_config) - .values(port_config) - .returning(SwitchPortConfig::as_returning()) - .get_result_async(&conn) - .await?; + //TODO validate link configs consistent with port geometry. + // - https://github.com/oxidecomputer/omicron/issues/2816 - let mut result = SwitchPortSettingsCombinedResult{ - settings: db_port_settings, - groups: Vec::new(), - port: db_port_config, - links: Vec::new(), - link_lldp: Vec::new(), - interfaces: Vec::new(), - vlan_interfaces: Vec::new(), - routes: Vec::new(), - bgp_peers: Vec::new(), - addresses: Vec::new(), - }; - - //TODO validate link configs consistent with port geometry. - // - https://github.com/oxidecomputer/omicron/issues/2816 - - let mut lldp_config = Vec::with_capacity(params.links.len()); - let mut link_config = Vec::with_capacity(params.links.len()); - - for (link_name, c) in ¶ms.links { - let lldp_config_id = match c.lldp.lldp_config { - Some(_) => todo!(), // TODO actual lldp support - None => None, - }; - let lldp_svc_config = - LldpServiceConfig::new(c.lldp.enabled, lldp_config_id); + let mut lldp_config = Vec::with_capacity(params.links.len()); + let mut link_config = Vec::with_capacity(params.links.len()); - lldp_config.push(lldp_svc_config.clone()); - link_config.push(SwitchPortLinkConfig::new( - psid, - lldp_svc_config.id, - link_name.clone(), - c.mtu, - c.fec.into(), - c.speed.into(), - )); - } - result.link_lldp = - diesel::insert_into(lldp_config_dsl::lldp_service_config) - .values(lldp_config.clone()) - .returning(LldpServiceConfig::as_returning()) - .get_results_async(&conn) - .await?; - result.links = - diesel::insert_into( - link_config_dsl::switch_port_settings_link_config) - .values(link_config) - .returning(SwitchPortLinkConfig::as_returning()) - .get_results_async(&conn) - .await?; + for (link_name, c) in ¶ms.links { + let lldp_config_id = match c.lldp.lldp_config { + Some(_) => todo!(), // TODO actual lldp support + None => None, + }; + let lldp_svc_config = + LldpServiceConfig::new(c.lldp.enabled, lldp_config_id); - let mut interface_config = Vec::with_capacity(params.interfaces.len()); - let mut vlan_interface_config = Vec::new(); - for (interface_name, i) in ¶ms.interfaces { - let ifx_config = SwitchInterfaceConfig::new( - psid, - interface_name.clone(), - i.v6_enabled, - i.kind.into(), - ); - interface_config.push(ifx_config.clone()); - if let params::SwitchInterfaceKind::Vlan(vlan_if) = i.kind { - vlan_interface_config.push(SwitchVlanInterfaceConfig::new( - ifx_config.id, - vlan_if.vid, + lldp_config.push(lldp_svc_config.clone()); + link_config.push(SwitchPortLinkConfig::new( + psid, + lldp_svc_config.id, + link_name.clone(), + c.mtu, + c.fec.into(), + c.speed.into(), )); } - } - result.interfaces = - diesel::insert_into( - interface_config_dsl::switch_port_settings_interface_config) - .values(interface_config) - .returning(SwitchInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; - result.vlan_interfaces = - diesel::insert_into(vlan_config_dsl::switch_vlan_interface_config) - .values(vlan_interface_config) - .returning(SwitchVlanInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; - - - let mut route_config = Vec::with_capacity(params.routes.len()); + result.link_lldp = + diesel::insert_into(lldp_config_dsl::lldp_service_config) + .values(lldp_config.clone()) + .returning(LldpServiceConfig::as_returning()) + .get_results_async(&conn) + .await?; + result.links = + diesel::insert_into( + link_config_dsl::switch_port_settings_link_config) + .values(link_config) + .returning(SwitchPortLinkConfig::as_returning()) + .get_results_async(&conn) + .await?; - for (interface_name, r) in ¶ms.routes { - for route in &r.routes { - route_config.push(SwitchPortRouteConfig::new( + let mut interface_config = Vec::with_capacity(params.interfaces.len()); + let mut vlan_interface_config = Vec::new(); + for (interface_name, i) in ¶ms.interfaces { + let ifx_config = SwitchInterfaceConfig::new( psid, interface_name.clone(), - route.dst.into(), - route.gw.into(), - route.vid.map(Into::into), - )); + i.v6_enabled, + i.kind.into(), + ); + interface_config.push(ifx_config.clone()); + if let params::SwitchInterfaceKind::Vlan(vlan_if) = i.kind { + vlan_interface_config.push(SwitchVlanInterfaceConfig::new( + ifx_config.id, + vlan_if.vid, + )); + } } - } - result.routes = - diesel::insert_into( - route_config_dsl::switch_port_settings_route_config) - .values(route_config) - .returning(SwitchPortRouteConfig::as_returning()) - .get_results_async(&conn) - .await?; + result.interfaces = + diesel::insert_into( + interface_config_dsl::switch_port_settings_interface_config) + .values(interface_config) + .returning(SwitchInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; + result.vlan_interfaces = + diesel::insert_into(vlan_config_dsl::switch_vlan_interface_config) + .values(vlan_interface_config) + .returning(SwitchVlanInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; - let mut bgp_peer_config = Vec::new(); - for (interface_name, p) in ¶ms.bgp_peers { - use db::schema::bgp_config; - let bgp_config_id = match &p.bgp_config { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - bgp_config_dsl::bgp_config - .filter(bgp_config::time_deleted.is_null()) - .filter(bgp_config::name.eq(name)) - .select(bgp_config::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| - TxnError::CustomError( - SwitchPortSettingsCreateError::BgpConfigNotFound, - ) - )? - } - }; - bgp_peer_config.push(SwitchPortBgpPeerConfig::new( - psid, - bgp_config_id, - interface_name.clone(), - p.addr.into(), - p.hold_time.into(), - p.idle_hold_time.into(), - p.delay_open.into(), - p.connect_retry.into(), - p.keepalive.into(), - )); + let mut route_config = Vec::with_capacity(params.routes.len()); - } - result.bgp_peers = - diesel::insert_into( - bgp_peer_dsl::switch_port_settings_bgp_peer_config) - .values(bgp_peer_config) - .returning(SwitchPortBgpPeerConfig::as_returning()) - .get_results_async(&conn) - .await?; + for (interface_name, r) in ¶ms.routes { + for route in &r.routes { + route_config.push(SwitchPortRouteConfig::new( + psid, + interface_name.clone(), + route.dst.into(), + route.gw.into(), + route.vid.map(Into::into), + )); + } + } + result.routes = + diesel::insert_into( + route_config_dsl::switch_port_settings_route_config) + .values(route_config) + .returning(SwitchPortRouteConfig::as_returning()) + .get_results_async(&conn) + .await?; - let mut address_config = Vec::new(); - use db::schema::address_lot; - for (interface_name, a) in ¶ms.addresses { - for address in &a.addresses { - let address_lot_id = match &address.address_lot { + let mut bgp_peer_config = Vec::new(); + for (interface_name, p) in ¶ms.bgp_peers { + use db::schema::bgp_config; + let bgp_config_id = match &p.bgp_config { NameOrId::Id(id) => *id, NameOrId::Name(name) => { let name = name.to_string(); - address_lot_dsl::address_lot - .filter(address_lot::time_deleted.is_null()) - .filter(address_lot::name.eq(name)) - .select(address_lot::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| - TxnError::CustomError( - SwitchPortSettingsCreateError::AddressLotNotFound, - ) - )? + bgp_config_dsl::bgp_config + .filter(bgp_config::time_deleted.is_null()) + .filter(bgp_config::name.eq(name)) + .select(bgp_config::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|_| { + err.set(SwitchPortSettingsCreateError::BgpConfigNotFound).unwrap(); + DieselError::RollbackTransaction + })? } }; - // TODO: Reduce DB round trips needed for reserving ip blocks - // https://github.com/oxidecomputer/omicron/issues/3060 - let (block, rsvd_block) = - crate::db::datastore::address_lot::try_reserve_block( - address_lot_id, - address.address.ip().into(), - // TODO: Should we allow anycast addresses for switch_ports? - // anycast - false, - &conn - ) - .await - .map_err(|e| match e { - ReserveBlockTxnError::CustomError(err) => { - TxnError::CustomError( - SwitchPortSettingsCreateError::ReserveBlock(err) - ) - } - ReserveBlockTxnError::Database(err) => TxnError::Database(err), - })?; - address_config.push(SwitchPortAddressConfig::new( + bgp_peer_config.push(SwitchPortBgpPeerConfig::new( psid, - block.id, - rsvd_block.id, - address.address.into(), + bgp_config_id, interface_name.clone(), + p.addr.into(), + p.hold_time.into(), + p.idle_hold_time.into(), + p.delay_open.into(), + p.connect_retry.into(), + p.keepalive.into(), )); } - } - result.addresses = - diesel::insert_into( - address_config_dsl::switch_port_settings_address_config) - .values(address_config) - .returning(SwitchPortAddressConfig::as_returning()) - .get_results_async(&conn) - .await?; + result.bgp_peers = + diesel::insert_into( + bgp_peer_dsl::switch_port_settings_bgp_peer_config) + .values(bgp_peer_config) + .returning(SwitchPortBgpPeerConfig::as_returning()) + .get_results_async(&conn) + .await?; - Ok(result) - }) + let mut address_config = Vec::new(); + use db::schema::address_lot; + for (interface_name, a) in ¶ms.addresses { + for address in &a.addresses { + let address_lot_id = match &address.address_lot { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + address_lot_dsl::address_lot + .filter(address_lot::time_deleted.is_null()) + .filter(address_lot::name.eq(name)) + .select(address_lot::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|_| { + err.set(SwitchPortSettingsCreateError::AddressLotNotFound).unwrap(); + DieselError::RollbackTransaction + })? + } + }; + // TODO: Reduce DB round trips needed for reserving ip blocks + // https://github.com/oxidecomputer/omicron/issues/3060 + let (block, rsvd_block) = + crate::db::datastore::address_lot::try_reserve_block( + address_lot_id, + address.address.ip().into(), + // TODO: Should we allow anycast addresses for switch_ports? + // anycast + false, + &conn + ) + .await + .map_err(|e| match e { + ReserveBlockTxnError::CustomError(e) => { + err.set(SwitchPortSettingsCreateError::ReserveBlock(e)).unwrap(); + DieselError::RollbackTransaction + } + ReserveBlockTxnError::Database(e) => e, + })?; + + address_config.push(SwitchPortAddressConfig::new( + psid, + block.id, + rsvd_block.id, + address.address.into(), + interface_name.clone(), + )); + + } + } + result.addresses = + diesel::insert_into( + address_config_dsl::switch_port_settings_address_config) + .values(address_config) + .returning(SwitchPortAddressConfig::as_returning()) + .get_results_async(&conn) + .await?; + + Ok(result) + }}, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError(SpsCreateError::AddressLotNotFound) => { - Error::invalid_request("AddressLot not found") - } - TxnError::CustomError(SpsCreateError::BgpConfigNotFound) => { - Error::invalid_request("BGP config not found") - } - TxnError::CustomError( - SwitchPortSettingsCreateError::ReserveBlock( - ReserveBlockError::AddressUnavailable - ) - ) => Error::invalid_request("address unavailable"), - TxnError::CustomError( - SwitchPortSettingsCreateError::ReserveBlock( - ReserveBlockError::AddressNotInLot - ) - ) => Error::invalid_request("address not in lot"), - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => public_error_from_diesel( + .map_err(|e| { + if let Some(err) = err.get() { + match err { + SpsCreateError::AddressLotNotFound => { + Error::invalid_request("AddressLot not found") + }, + SpsCreateError::BgpConfigNotFound => { + Error::invalid_request("BGP config not found") + }, + SpsCreateError::ReserveBlock(ReserveBlockError::AddressUnavailable) => { + Error::invalid_request("address unavailable") + }, + SpsCreateError::ReserveBlock(ReserveBlockError::AddressNotInLot) => { + Error::invalid_request("address not in lot") + }, + } + } else { + public_error_from_diesel( e, ErrorHandler::Conflict( ResourceType::SwitchPortSettings, params.identity.name.as_str(), ), - ), - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, + ) + } }) } @@ -451,7 +457,6 @@ impl DataStore { enum SwitchPortSettingsDeleteError { SwitchPortSettingsNotFound, } - type TxnError = TransactionError; let conn = self.pool_connection_authorized(opctx).await?; @@ -460,173 +465,181 @@ impl DataStore { Some(name_or_id) => name_or_id, }; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "switch_port_settings_delete", + ); + // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - - use db::schema::switch_port_settings; - let id = match selector { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - port_settings_dsl::switch_port_settings - .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::name.eq(name)) - .select(switch_port_settings::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| - TxnError::CustomError( - SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound, - ) - )? - } - }; + conn.transaction_async_with_retry(|conn| { + let err = err.clone(); + async move { + use db::schema::switch_port_settings; + let id = match selector { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + port_settings_dsl::switch_port_settings + .filter(switch_port_settings::time_deleted.is_null()) + .filter(switch_port_settings::name.eq(name)) + .select(switch_port_settings::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|_| { + err.set(SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound).unwrap(); + DieselError::RollbackTransaction + })? + } + }; - // delete the top level port settings object - diesel::delete(port_settings_dsl::switch_port_settings) - .filter(switch_port_settings::id.eq(id)) - .execute_async(&conn) - .await?; + // delete the top level port settings object + diesel::delete(port_settings_dsl::switch_port_settings) + .filter(switch_port_settings::id.eq(id)) + .execute_async(&conn) + .await?; - // delete the port config object - use db::schema::switch_port_settings_port_config::{ - self as sps_port_config, dsl as port_config_dsl, - }; - diesel::delete(port_config_dsl::switch_port_settings_port_config) - .filter(sps_port_config::port_settings_id.eq(id)) - .execute_async(&conn) - .await?; + // delete the port config object + use db::schema::switch_port_settings_port_config::{ + self as sps_port_config, dsl as port_config_dsl, + }; + diesel::delete(port_config_dsl::switch_port_settings_port_config) + .filter(sps_port_config::port_settings_id.eq(id)) + .execute_async(&conn) + .await?; - // delete the link configs - use db::schema::switch_port_settings_link_config::{ - self as sps_link_config, dsl as link_config_dsl, - }; - let links: Vec = - diesel::delete( - link_config_dsl::switch_port_settings_link_config - ) - .filter( - sps_link_config::port_settings_id.eq(id) - ) - .returning(SwitchPortLinkConfig::as_returning()) - .get_results_async(&conn) - .await?; + // delete the link configs + use db::schema::switch_port_settings_link_config::{ + self as sps_link_config, dsl as link_config_dsl, + }; + let links: Vec = + diesel::delete( + link_config_dsl::switch_port_settings_link_config + ) + .filter( + sps_link_config::port_settings_id.eq(id) + ) + .returning(SwitchPortLinkConfig::as_returning()) + .get_results_async(&conn) + .await?; - // delete lldp configs - use db::schema::lldp_service_config::{self, dsl as lldp_config_dsl}; - let lldp_svc_ids: Vec = links - .iter() - .map(|link| link.lldp_service_config_id) - .collect(); - diesel::delete(lldp_config_dsl::lldp_service_config) - .filter(lldp_service_config::id.eq_any(lldp_svc_ids)) - .execute_async(&conn) - .await?; + // delete lldp configs + use db::schema::lldp_service_config::{self, dsl as lldp_config_dsl}; + let lldp_svc_ids: Vec = links + .iter() + .map(|link| link.lldp_service_config_id) + .collect(); + diesel::delete(lldp_config_dsl::lldp_service_config) + .filter(lldp_service_config::id.eq_any(lldp_svc_ids)) + .execute_async(&conn) + .await?; - // delete interface configs - use db::schema::switch_port_settings_interface_config::{ - self as sps_interface_config, dsl as interface_config_dsl, - }; + // delete interface configs + use db::schema::switch_port_settings_interface_config::{ + self as sps_interface_config, dsl as interface_config_dsl, + }; - let interfaces: Vec = - diesel::delete( - interface_config_dsl::switch_port_settings_interface_config - ) - .filter( - sps_interface_config::port_settings_id.eq(id) - ) - .returning(SwitchInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; + let interfaces: Vec = + diesel::delete( + interface_config_dsl::switch_port_settings_interface_config + ) + .filter( + sps_interface_config::port_settings_id.eq(id) + ) + .returning(SwitchInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; - // delete any vlan interfaces - use db::schema::switch_vlan_interface_config::{ - self, dsl as vlan_config_dsl, - }; - let interface_ids: Vec = interfaces - .iter() - .map(|interface| interface.id) - .collect(); - - diesel::delete(vlan_config_dsl::switch_vlan_interface_config) - .filter( - switch_vlan_interface_config::interface_config_id.eq_any( - interface_ids + // delete any vlan interfaces + use db::schema::switch_vlan_interface_config::{ + self, dsl as vlan_config_dsl, + }; + let interface_ids: Vec = interfaces + .iter() + .map(|interface| interface.id) + .collect(); + + diesel::delete(vlan_config_dsl::switch_vlan_interface_config) + .filter( + switch_vlan_interface_config::interface_config_id.eq_any( + interface_ids + ) ) + .execute_async(&conn) + .await?; + + // delete route configs + use db::schema::switch_port_settings_route_config; + use db::schema::switch_port_settings_route_config::dsl + as route_config_dsl; + + diesel::delete( + route_config_dsl::switch_port_settings_route_config ) + .filter(switch_port_settings_route_config::port_settings_id.eq(id)) .execute_async(&conn) .await?; - // delete route configs - use db::schema::switch_port_settings_route_config; - use db::schema::switch_port_settings_route_config::dsl - as route_config_dsl; + // delete bgp configurations + use db::schema::switch_port_settings_bgp_peer_config as bgp_peer; + use db::schema::switch_port_settings_bgp_peer_config::dsl + as bgp_peer_dsl; - diesel::delete( - route_config_dsl::switch_port_settings_route_config - ) - .filter(switch_port_settings_route_config::port_settings_id.eq(id)) - .execute_async(&conn) - .await?; + diesel::delete(bgp_peer_dsl::switch_port_settings_bgp_peer_config) + .filter(bgp_peer::port_settings_id.eq(id)) + .execute_async(&conn) + .await?; - // delete bgp configurations - use db::schema::switch_port_settings_bgp_peer_config as bgp_peer; - use db::schema::switch_port_settings_bgp_peer_config::dsl - as bgp_peer_dsl; + // delete address configs + use db::schema::switch_port_settings_address_config::{ + self as address_config, dsl as address_config_dsl, + }; - diesel::delete(bgp_peer_dsl::switch_port_settings_bgp_peer_config) - .filter(bgp_peer::port_settings_id.eq(id)) - .execute_async(&conn) + let port_settings_addrs = diesel::delete( + address_config_dsl::switch_port_settings_address_config, + ) + .filter(address_config::port_settings_id.eq(id)) + .returning(SwitchPortAddressConfig::as_returning()) + .get_results_async(&conn) .await?; - // delete address configs - use db::schema::switch_port_settings_address_config::{ - self as address_config, dsl as address_config_dsl, - }; - - let port_settings_addrs = diesel::delete( - address_config_dsl::switch_port_settings_address_config, - ) - .filter(address_config::port_settings_id.eq(id)) - .returning(SwitchPortAddressConfig::as_returning()) - .get_results_async(&conn) - .await?; + use db::schema::address_lot_rsvd_block::dsl as rsvd_block_dsl; - use db::schema::address_lot_rsvd_block::dsl as rsvd_block_dsl; + for ps in &port_settings_addrs { + diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) + .filter(rsvd_block_dsl::id.eq(ps.rsvd_address_lot_block_id)) + .execute_async(&conn) + .await?; + } - for ps in &port_settings_addrs { - diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) - .filter(rsvd_block_dsl::id.eq(ps.rsvd_address_lot_block_id)) - .execute_async(&conn) - .await?; + Ok(()) } - - Ok(()) - }) + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError( - SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound) => { - Error::invalid_request("port settings not found") + .map_err(|e| { + if let Some(err) = err.get() { + match err { + SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound => { + Error::invalid_request("port settings not found") + } + } + } else { + let name = match ¶ms.port_settings { + Some(name_or_id) => name_or_id.to_string(), + None => String::new(), + }; + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::SwitchPortSettings, + &name, + ), + ) } - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => { - let name = match ¶ms.port_settings { - Some(name_or_id) => name_or_id.to_string(), - None => String::new(), - }; - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::SwitchPortSettings, - &name, - ), - ) - }, - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, }) } @@ -663,174 +676,184 @@ impl DataStore { enum SwitchPortSettingsGetError { NotFound(external::Name), } - type TxnError = TransactionError; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "switch_port_settings_get", + ); let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - // get the top level port settings object - use db::schema::switch_port_settings::{ - self, dsl as port_settings_dsl, - }; - - let id = match name_or_id { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name_str = name.to_string(); + conn.transaction_async_with_retry(|conn| { + let err = err.clone(); + async move { + // get the top level port settings object + use db::schema::switch_port_settings::{ + self, dsl as port_settings_dsl, + }; + + let id = match name_or_id { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name_str = name.to_string(); + port_settings_dsl::switch_port_settings + .filter(switch_port_settings::time_deleted.is_null()) + .filter(switch_port_settings::name.eq(name_str)) + .select(switch_port_settings::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|_| { + err.set( + SwitchPortSettingsGetError::NotFound( + name.clone(), + ), + ).unwrap(); + DieselError::RollbackTransaction + })? + } + }; + + let settings: SwitchPortSettings = port_settings_dsl::switch_port_settings .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::name.eq(name_str)) - .select(switch_port_settings::id) + .filter(switch_port_settings::id.eq(id)) + .select(SwitchPortSettings::as_select()) .limit(1) - .first_async::(&conn) - .await - .map_err(|_| { - TxnError::CustomError( - SwitchPortSettingsGetError::NotFound( - name.clone(), - ), - ) - })? - } - }; - - let settings: SwitchPortSettings = - port_settings_dsl::switch_port_settings - .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::id.eq(id)) - .select(SwitchPortSettings::as_select()) - .limit(1) - .first_async::(&conn) - .await?; + .first_async::(&conn) + .await?; - // get the port config - use db::schema::switch_port_settings_port_config::{ - self as port_config, dsl as port_config_dsl, - }; - let port: SwitchPortConfig = - port_config_dsl::switch_port_settings_port_config - .filter(port_config::port_settings_id.eq(id)) - .select(SwitchPortConfig::as_select()) - .limit(1) - .first_async::(&conn) - .await?; + // get the port config + use db::schema::switch_port_settings_port_config::{ + self as port_config, dsl as port_config_dsl, + }; + let port: SwitchPortConfig = + port_config_dsl::switch_port_settings_port_config + .filter(port_config::port_settings_id.eq(id)) + .select(SwitchPortConfig::as_select()) + .limit(1) + .first_async::(&conn) + .await?; - // initialize result - let mut result = - SwitchPortSettingsCombinedResult::new(settings, port); + // initialize result + let mut result = + SwitchPortSettingsCombinedResult::new(settings, port); - // get the link configs - use db::schema::switch_port_settings_link_config::{ - self as link_config, dsl as link_config_dsl, - }; + // get the link configs + use db::schema::switch_port_settings_link_config::{ + self as link_config, dsl as link_config_dsl, + }; - result.links = link_config_dsl::switch_port_settings_link_config - .filter(link_config::port_settings_id.eq(id)) - .select(SwitchPortLinkConfig::as_select()) - .load_async::(&conn) - .await?; + result.links = link_config_dsl::switch_port_settings_link_config + .filter(link_config::port_settings_id.eq(id)) + .select(SwitchPortLinkConfig::as_select()) + .load_async::(&conn) + .await?; - let lldp_svc_ids: Vec = result - .links - .iter() - .map(|link| link.lldp_service_config_id) - .collect(); - - use db::schema::lldp_service_config as lldp_config; - use db::schema::lldp_service_config::dsl as lldp_dsl; - result.link_lldp = lldp_dsl::lldp_service_config - .filter(lldp_config::id.eq_any(lldp_svc_ids)) - .select(LldpServiceConfig::as_select()) - .limit(1) - .load_async::(&conn) - .await?; + let lldp_svc_ids: Vec = result + .links + .iter() + .map(|link| link.lldp_service_config_id) + .collect(); + + use db::schema::lldp_service_config as lldp_config; + use db::schema::lldp_service_config::dsl as lldp_dsl; + result.link_lldp = lldp_dsl::lldp_service_config + .filter(lldp_config::id.eq_any(lldp_svc_ids)) + .select(LldpServiceConfig::as_select()) + .limit(1) + .load_async::(&conn) + .await?; - // get the interface configs - use db::schema::switch_port_settings_interface_config::{ - self as interface_config, dsl as interface_config_dsl, - }; + // get the interface configs + use db::schema::switch_port_settings_interface_config::{ + self as interface_config, dsl as interface_config_dsl, + }; - result.interfaces = - interface_config_dsl::switch_port_settings_interface_config - .filter(interface_config::port_settings_id.eq(id)) - .select(SwitchInterfaceConfig::as_select()) - .load_async::(&conn) + result.interfaces = + interface_config_dsl::switch_port_settings_interface_config + .filter(interface_config::port_settings_id.eq(id)) + .select(SwitchInterfaceConfig::as_select()) + .load_async::(&conn) + .await?; + + use db::schema::switch_vlan_interface_config as vlan_config; + use db::schema::switch_vlan_interface_config::dsl as vlan_dsl; + let interface_ids: Vec = result + .interfaces + .iter() + .map(|interface| interface.id) + .collect(); + + result.vlan_interfaces = vlan_dsl::switch_vlan_interface_config + .filter(vlan_config::interface_config_id.eq_any(interface_ids)) + .select(SwitchVlanInterfaceConfig::as_select()) + .load_async::(&conn) .await?; - use db::schema::switch_vlan_interface_config as vlan_config; - use db::schema::switch_vlan_interface_config::dsl as vlan_dsl; - let interface_ids: Vec = result - .interfaces - .iter() - .map(|interface| interface.id) - .collect(); - - result.vlan_interfaces = vlan_dsl::switch_vlan_interface_config - .filter(vlan_config::interface_config_id.eq_any(interface_ids)) - .select(SwitchVlanInterfaceConfig::as_select()) - .load_async::(&conn) - .await?; - - // get the route configs - use db::schema::switch_port_settings_route_config::{ - self as route_config, dsl as route_config_dsl, - }; + // get the route configs + use db::schema::switch_port_settings_route_config::{ + self as route_config, dsl as route_config_dsl, + }; - result.routes = route_config_dsl::switch_port_settings_route_config - .filter(route_config::port_settings_id.eq(id)) - .select(SwitchPortRouteConfig::as_select()) - .load_async::(&conn) - .await?; + result.routes = route_config_dsl::switch_port_settings_route_config + .filter(route_config::port_settings_id.eq(id)) + .select(SwitchPortRouteConfig::as_select()) + .load_async::(&conn) + .await?; - // get the bgp peer configs - use db::schema::switch_port_settings_bgp_peer_config::{ - self as bgp_peer, dsl as bgp_peer_dsl, - }; + // get the bgp peer configs + use db::schema::switch_port_settings_bgp_peer_config::{ + self as bgp_peer, dsl as bgp_peer_dsl, + }; - result.bgp_peers = - bgp_peer_dsl::switch_port_settings_bgp_peer_config - .filter(bgp_peer::port_settings_id.eq(id)) - .select(SwitchPortBgpPeerConfig::as_select()) - .load_async::(&conn) - .await?; + result.bgp_peers = + bgp_peer_dsl::switch_port_settings_bgp_peer_config + .filter(bgp_peer::port_settings_id.eq(id)) + .select(SwitchPortBgpPeerConfig::as_select()) + .load_async::(&conn) + .await?; - // get the address configs - use db::schema::switch_port_settings_address_config::{ - self as address_config, dsl as address_config_dsl, - }; + // get the address configs + use db::schema::switch_port_settings_address_config::{ + self as address_config, dsl as address_config_dsl, + }; - result.addresses = - address_config_dsl::switch_port_settings_address_config - .filter(address_config::port_settings_id.eq(id)) - .select(SwitchPortAddressConfig::as_select()) - .load_async::(&conn) - .await?; + result.addresses = + address_config_dsl::switch_port_settings_address_config + .filter(address_config::port_settings_id.eq(id)) + .select(SwitchPortAddressConfig::as_select()) + .load_async::(&conn) + .await?; - Ok(result) - }) + Ok(result) + } + }, + retry_helper.as_callback() + ) .await - .map_err(|e| match e { - TxnError::CustomError(SwitchPortSettingsGetError::NotFound( - name, - )) => Error::not_found_by_name( - ResourceType::SwitchPortSettings, - &name, - ), - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => { - let name = name_or_id.to_string(); - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::SwitchPortSettings, - &name, - ), - ) + .map_err(|e| { + if let Some(err) = err.get() { + match err { + SwitchPortSettingsGetError::NotFound( + name, + ) => Error::not_found_by_name( + ResourceType::SwitchPortSettings, + &name, + ), } - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, + } else { + let name = name_or_id.to_string(); + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::SwitchPortSettings, + &name, + ), + ) + } }) } @@ -911,55 +934,70 @@ impl DataStore { NotFound, ActiveSettings, } - type TxnError = TransactionError; + + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "switch_port_delete", + ); let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - use db::schema::switch_port; - use db::schema::switch_port::dsl as switch_port_dsl; - - let switch_location = params.switch_location.to_string(); - let port_name = portname.to_string(); - let port: SwitchPort = switch_port_dsl::switch_port - .filter(switch_port::rack_id.eq(params.rack_id)) - .filter( - switch_port::switch_location.eq(switch_location.clone()), - ) - .filter(switch_port::port_name.eq(port_name.clone())) - .select(SwitchPort::as_select()) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| { - TxnError::CustomError(SwitchPortDeleteError::NotFound) - })?; + conn.transaction_async_with_retry( + |conn| { + let err = err.clone(); + async move { + use db::schema::switch_port; + use db::schema::switch_port::dsl as switch_port_dsl; + + let switch_location = params.switch_location.to_string(); + let port_name = portname.to_string(); + let port: SwitchPort = switch_port_dsl::switch_port + .filter(switch_port::rack_id.eq(params.rack_id)) + .filter( + switch_port::switch_location + .eq(switch_location.clone()), + ) + .filter(switch_port::port_name.eq(port_name.clone())) + .select(SwitchPort::as_select()) + .limit(1) + .first_async::(&conn) + .await + .map_err(|_| { + err.set(SwitchPortDeleteError::NotFound).unwrap(); + DieselError::RollbackTransaction + })?; - if port.port_settings_id.is_some() { - return Err(TxnError::CustomError( - SwitchPortDeleteError::ActiveSettings, - )); - } + if port.port_settings_id.is_some() { + err.set(SwitchPortDeleteError::ActiveSettings).unwrap(); + return Err(DieselError::RollbackTransaction); + } - diesel::delete(switch_port_dsl::switch_port) - .filter(switch_port::id.eq(port.id)) - .execute_async(&conn) - .await?; + diesel::delete(switch_port_dsl::switch_port) + .filter(switch_port::id.eq(port.id)) + .execute_async(&conn) + .await?; - Ok(()) - }) + Ok(()) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError(SwitchPortDeleteError::NotFound) => { - let name = &portname.clone(); - Error::not_found_by_name(ResourceType::SwitchPort, name) - } - TxnError::CustomError(SwitchPortDeleteError::ActiveSettings) => { - Error::invalid_request("must clear port settings first") - } - TxnError::Database(e) => { + .map_err(|e| { + if let Some(err) = err.get() { + match err { + SwitchPortDeleteError::NotFound => { + let name = &portname.clone(); + Error::not_found_by_name(ResourceType::SwitchPort, name) + } + SwitchPortDeleteError::ActiveSettings => { + Error::invalid_request("must clear port settings first") + } + } + } else { public_error_from_diesel(e, ErrorHandler::Server) } }) From 10865b12cab194e577a22d408ac6fef8b4764a24 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 13:54:44 -0800 Subject: [PATCH 05/25] Clippy lints (tests still passing) --- .../src/db/datastore/network_interface.rs | 20 +++++++++---------- nexus/db-queries/src/db/datastore/sled.rs | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index 0558149c0b..8c529d68de 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -511,7 +511,7 @@ impl DataStore { } // In any case, update the actual target - Ok(update_target_query.get_result_async(&conn).await?) + update_target_query.get_result_async(&conn).await } }, retry_helper.as_callback() @@ -527,16 +527,16 @@ impl DataStore { let stopped = stopped.clone(); let update_target_query = update_target_query.clone(); async move { - let instance_state = - instance_query.get_result_async(&conn).await?.runtime_state; - if instance_state.propolis_id.is_some() - || instance_state.nexus_state != stopped - { - err.set(NetworkInterfaceUpdateError::InstanceNotStopped).unwrap(); - return Err(diesel::result::Error::RollbackTransaction); - } - Ok(update_target_query.get_result_async(&conn).await?) + let instance_state = + instance_query.get_result_async(&conn).await?.runtime_state; + if instance_state.propolis_id.is_some() + || instance_state.nexus_state != stopped + { + err.set(NetworkInterfaceUpdateError::InstanceNotStopped).unwrap(); + return Err(diesel::result::Error::RollbackTransaction); } + update_target_query.get_result_async(&conn).await + } }, retry_helper.as_callback() ) diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 85ed1cd4de..35eb334dd6 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -201,11 +201,11 @@ impl DataStore { resources, ); - Ok(diesel::insert_into(resource_dsl::sled_resource) + diesel::insert_into(resource_dsl::sled_resource) .values(resource) .returning(SledResource::as_returning()) .get_result_async(&conn) - .await?) + .await } }, retry_helper.as_callback(), From 49b4aa4c641a7e5de94759d1eb16956d2cd7e1d9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 15:10:36 -0800 Subject: [PATCH 06/25] address_lot, saml_identity_provider, rack_set_initialized, snapshot, switch_port, vpc --- .../src/db/datastore/address_lot.rs | 158 +++--- .../src/db/datastore/identity_provider.rs | 79 +-- nexus/db-queries/src/db/datastore/rack.rs | 458 ++++++++++-------- nexus/db-queries/src/db/datastore/snapshot.rs | 192 ++++---- .../src/db/datastore/switch_port.rs | 83 ++-- nexus/db-queries/src/db/datastore/vpc.rs | 93 ++-- 6 files changed, 604 insertions(+), 459 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/address_lot.rs b/nexus/db-queries/src/db/datastore/address_lot.rs index 97dfb59eba..0c47ad141a 100644 --- a/nexus/db-queries/src/db/datastore/address_lot.rs +++ b/nexus/db-queries/src/db/datastore/address_lot.rs @@ -13,6 +13,7 @@ use crate::db::error::TransactionError; use crate::db::model::Name; use crate::db::model::{AddressLot, AddressLotBlock, AddressLotReservedBlock}; use crate::db::pagination::paginated; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl, Connection}; use chrono::Utc; use diesel::result::Error as DieselError; @@ -28,6 +29,7 @@ use omicron_common::api::external::{ }; use ref_cast::RefCast; use serde::{Deserialize, Serialize}; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; #[derive(Clone, Debug, Deserialize, Serialize)] @@ -45,51 +47,61 @@ impl DataStore { use db::schema::address_lot::dsl as lot_dsl; use db::schema::address_lot_block::dsl as block_dsl; + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "address_lot_create", + ); self.pool_connection_authorized(opctx) .await? // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - .transaction_async(|conn| async move { - let lot = AddressLot::new(¶ms.identity, params.kind.into()); - - let db_lot: AddressLot = - diesel::insert_into(lot_dsl::address_lot) - .values(lot) - .returning(AddressLot::as_returning()) - .get_result_async(&conn) - .await?; - - let blocks: Vec = params - .blocks - .iter() - .map(|b| { - AddressLotBlock::new( - db_lot.id(), - b.first_address.into(), - b.last_address.into(), - ) + .transaction_async_with_retry( + |conn| async move { + let lot = + AddressLot::new(¶ms.identity, params.kind.into()); + + let db_lot: AddressLot = + diesel::insert_into(lot_dsl::address_lot) + .values(lot) + .returning(AddressLot::as_returning()) + .get_result_async(&conn) + .await?; + + let blocks: Vec = params + .blocks + .iter() + .map(|b| { + AddressLotBlock::new( + db_lot.id(), + b.first_address.into(), + b.last_address.into(), + ) + }) + .collect(); + + let db_blocks = + diesel::insert_into(block_dsl::address_lot_block) + .values(blocks) + .returning(AddressLotBlock::as_returning()) + .get_results_async(&conn) + .await?; + + Ok(AddressLotCreateResult { + lot: db_lot, + blocks: db_blocks, }) - .collect(); - - let db_blocks = - diesel::insert_into(block_dsl::address_lot_block) - .values(blocks) - .returning(AddressLotBlock::as_returning()) - .get_results_async(&conn) - .await?; - - Ok(AddressLotCreateResult { lot: db_lot, blocks: db_blocks }) - }) + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - DieselError::DatabaseError(_, _) => public_error_from_diesel( + .map_err(|e| { + public_error_from_diesel( e, ErrorHandler::Conflict( ResourceType::AddressLot, ¶ms.identity.name.as_str(), ), - ), - _ => public_error_from_diesel(e, ErrorHandler::Server), + ) }) } @@ -113,46 +125,60 @@ impl DataStore { LotInUse, } - type TxnError = TransactionError; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "address_lot_delete", + ); // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - let rsvd: Vec = - rsvd_block_dsl::address_lot_rsvd_block - .filter(rsvd_block_dsl::address_lot_id.eq(id)) - .select(AddressLotReservedBlock::as_select()) - .limit(1) - .load_async(&conn) - .await?; - - if !rsvd.is_empty() { - Err(TxnError::CustomError(AddressLotDeleteError::LotInUse))?; - } - - let now = Utc::now(); - diesel::update(lot_dsl::address_lot) - .filter(lot_dsl::time_deleted.is_null()) - .filter(lot_dsl::id.eq(id)) - .set(lot_dsl::time_deleted.eq(now)) - .execute_async(&conn) - .await?; + conn.transaction_async_with_retry( + |conn| { + let err = err.clone(); + async move { + let rsvd: Vec = + rsvd_block_dsl::address_lot_rsvd_block + .filter(rsvd_block_dsl::address_lot_id.eq(id)) + .select(AddressLotReservedBlock::as_select()) + .limit(1) + .load_async(&conn) + .await?; + + if !rsvd.is_empty() { + err.set(AddressLotDeleteError::LotInUse).unwrap(); + return Err(DieselError::RollbackTransaction); + } + + let now = Utc::now(); + diesel::update(lot_dsl::address_lot) + .filter(lot_dsl::time_deleted.is_null()) + .filter(lot_dsl::id.eq(id)) + .set(lot_dsl::time_deleted.eq(now)) + .execute_async(&conn) + .await?; - diesel::delete(block_dsl::address_lot_block) - .filter(block_dsl::address_lot_id.eq(id)) - .execute_async(&conn) - .await?; + diesel::delete(block_dsl::address_lot_block) + .filter(block_dsl::address_lot_id.eq(id)) + .execute_async(&conn) + .await?; - Ok(()) - }) + Ok(()) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::Database(e) => { + .map_err(|e| { + if let Some(err) = err.get() { + match err { + AddressLotDeleteError::LotInUse => { + Error::invalid_request("lot is in use") + } + } + } else { public_error_from_diesel(e, ErrorHandler::Server) } - TxnError::CustomError(AddressLotDeleteError::LotInUse) => { - Error::invalid_request("lot is in use") - } }) } diff --git a/nexus/db-queries/src/db/datastore/identity_provider.rs b/nexus/db-queries/src/db/datastore/identity_provider.rs index fdc9a020e7..cb15e20549 100644 --- a/nexus/db-queries/src/db/datastore/identity_provider.rs +++ b/nexus/db-queries/src/db/datastore/identity_provider.rs @@ -14,6 +14,7 @@ use crate::db::identity::Resource; use crate::db::model::IdentityProvider; use crate::db::model::Name; use crate::db::pagination::paginated; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; @@ -62,38 +63,62 @@ impl DataStore { opctx.authorize(authz::Action::CreateChild, authz_idp_list).await?; assert_eq!(provider.silo_id, authz_idp_list.silo().id()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "saml_identity_provider_create", + ); let name = provider.identity().name.to_string(); self.pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - // insert silo identity provider record with type Saml - use db::schema::identity_provider::dsl as idp_dsl; - diesel::insert_into(idp_dsl::identity_provider) - .values(db::model::IdentityProvider { - identity: db::model::IdentityProviderIdentity { - id: provider.identity.id, - name: provider.identity.name.clone(), - description: provider.identity.description.clone(), - time_created: provider.identity.time_created, - time_modified: provider.identity.time_modified, - time_deleted: provider.identity.time_deleted, - }, - silo_id: provider.silo_id, - provider_type: db::model::IdentityProviderType::Saml, - }) - .execute_async(&conn) - .await?; + .transaction_async_with_retry( + |conn| { + let provider = provider.clone(); + async move { + // insert silo identity provider record with type Saml + use db::schema::identity_provider::dsl as idp_dsl; + diesel::insert_into(idp_dsl::identity_provider) + .values(db::model::IdentityProvider { + identity: db::model::IdentityProviderIdentity { + id: provider.identity.id, + name: provider.identity.name.clone(), + description: provider + .identity + .description + .clone(), + time_created: provider + .identity + .time_created, + time_modified: provider + .identity + .time_modified, + time_deleted: provider + .identity + .time_deleted, + }, + silo_id: provider.silo_id, + provider_type: + db::model::IdentityProviderType::Saml, + }) + .execute_async(&conn) + .await?; - // insert silo saml identity provider record - use db::schema::saml_identity_provider::dsl; - let result = diesel::insert_into(dsl::saml_identity_provider) - .values(provider) - .returning(db::model::SamlIdentityProvider::as_returning()) - .get_result_async(&conn) - .await?; + // insert silo saml identity provider record + use db::schema::saml_identity_provider::dsl; + let result = diesel::insert_into( + dsl::saml_identity_provider, + ) + .values(provider) + .returning( + db::model::SamlIdentityProvider::as_returning(), + ) + .get_result_async(&conn) + .await?; - Ok(result) - }) + Ok(result) + } + }, + retry_helper.as_callback(), + ) .await .map_err(|e| { public_error_from_diesel( diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index ae982d86f8..cd8383e9e3 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -26,6 +26,7 @@ use crate::db::model::Rack; use crate::db::model::Zpool; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -56,6 +57,7 @@ use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use std::net::IpAddr; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; /// Groups arguments related to rack initialization @@ -89,15 +91,12 @@ enum RackInitError { } type TxnError = TransactionError; -impl From for Error { - fn from(e: TxnError) -> Self { +impl From for Error { + fn from(e: RackInitError) -> Self { match e { - TxnError::CustomError(RackInitError::AddingIp(err)) => err, - TxnError::CustomError(RackInitError::AddingNic(err)) => err, - TxnError::CustomError(RackInitError::DatasetInsert { - err, - zpool_id, - }) => match err { + RackInitError::AddingIp(err) => err, + RackInitError::AddingNic(err) => err, + RackInitError::DatasetInsert { err, zpool_id } => match err { AsyncInsertError::CollectionNotFound => Error::ObjectNotFound { type_name: ResourceType::Zpool, lookup_type: LookupType::ById(zpool_id), @@ -106,43 +105,28 @@ impl From for Error { public_error_from_diesel(e, ErrorHandler::Server) } }, - TxnError::CustomError(RackInitError::ServiceInsert(err)) => { - Error::internal_error(&format!( - "failed to insert Service record: {:#}", - err - )) - } - TxnError::CustomError(RackInitError::RackUpdate { - err, - rack_id, - }) => public_error_from_diesel( - err, - ErrorHandler::NotFoundByLookup( - ResourceType::Rack, - LookupType::ById(rack_id), - ), + RackInitError::ServiceInsert(err) => Error::internal_error( + &format!("failed to insert Service record: {:#}", err), ), - TxnError::CustomError(RackInitError::DnsSerialization(err)) => { - Error::internal_error(&format!( - "failed to serialize initial DNS records: {:#}", - err - )) - } - TxnError::CustomError(RackInitError::Silo(err)) => { - Error::internal_error(&format!( - "failed to create recovery Silo: {:#}", - err - )) - } - TxnError::CustomError(RackInitError::RoleAssignment(err)) => { - Error::internal_error(&format!( - "failed to assign role to initial user: {:#}", - err - )) - } - TxnError::Database(e) => { - Error::internal_error(&format!("Transaction error: {}", e)) + RackInitError::RackUpdate { err, rack_id } => { + public_error_from_diesel( + err, + ErrorHandler::NotFoundByLookup( + ResourceType::Rack, + LookupType::ById(rack_id), + ), + ) } + RackInitError::DnsSerialization(err) => Error::internal_error( + &format!("failed to serialize initial DNS records: {:#}", err), + ), + RackInitError::Silo(err) => Error::internal_error(&format!( + "failed to create recovery Silo: {:#}", + err + )), + RackInitError::RoleAssignment(err) => Error::internal_error( + &format!("failed to assign role to initial user: {:#}", err), + ), } } } @@ -311,7 +295,7 @@ impl DataStore { log: &slog::Logger, service_pool: &db::model::IpPool, service: internal_params::ServicePutRequest, - ) -> Result<(), TxnError> { + ) -> Result<(), RackInitError> { use internal_params::ServiceKind; let service_db = db::model::Service::new( @@ -321,9 +305,9 @@ impl DataStore { service.address, service.kind.clone().into(), ); - self.service_upsert_conn(conn, service_db).await.map_err(|e| { - TxnError::CustomError(RackInitError::ServiceInsert(e)) - })?; + self.service_upsert_conn(conn, service_db) + .await + .map_err(|e| RackInitError::ServiceInsert(e))?; // For services with external connectivity, we record their // explicit IP allocation and create a service NIC as well. @@ -354,9 +338,7 @@ impl DataStore { Some(nic.ip), Some(nic.mac), ) - .map_err(|e| { - TxnError::CustomError(RackInitError::AddingNic(e)) - })?; + .map_err(|e| RackInitError::AddingNic(e))?; Some((db_ip, db_nic)) } ServiceKind::BoundaryNtp { snat, ref nic } => { @@ -378,9 +360,7 @@ impl DataStore { Some(nic.ip), Some(nic.mac), ) - .map_err(|e| { - TxnError::CustomError(RackInitError::AddingNic(e)) - })?; + .map_err(|e| RackInitError::AddingNic(e))?; Some((db_ip, db_nic)) } _ => None, @@ -395,7 +375,7 @@ impl DataStore { IP address for {}", service.kind, ); - TxnError::CustomError(RackInitError::AddingIp(err)) + RackInitError::AddingIp(err) })?; self.create_network_interface_raw_conn(conn, db_nic) @@ -408,9 +388,7 @@ impl DataStore { _, db::model::NetworkInterfaceKind::Service, ) => Ok(()), - _ => Err(TxnError::CustomError( - RackInitError::AddingNic(e.into_external()), - )), + _ => Err(RackInitError::AddingNic(e.into_external())), } })?; } @@ -429,146 +407,186 @@ impl DataStore { opctx.authorize(authz::Action::CreateChild, &authz::FLEET).await?; - let rack_id = rack_init.rack_id; - let services = rack_init.services; - let datasets = rack_init.datasets; - let service_ip_pool_ranges = rack_init.service_ip_pool_ranges; - let internal_dns = rack_init.internal_dns; - let external_dns = rack_init.external_dns; - let (authz_service_pool, service_pool) = self.ip_pools_service_lookup(&opctx).await?; // NOTE: This operation could likely be optimized with a CTE, but given // the low-frequency of calls, this optimization has been deferred. let log = opctx.log.clone(); + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "rack_set_initialized", + ); let rack = self .pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - // Early exit if the rack has already been initialized. - let rack = rack_dsl::rack - .filter(rack_dsl::id.eq(rack_id)) - .select(Rack::as_select()) - .get_result_async(&conn) - .await - .map_err(|e| { - warn!(log, "Initializing Rack: Rack UUID not found"); - TxnError::CustomError(RackInitError::RackUpdate { - err: e, - rack_id, - }) - })?; - if rack.initialized { - info!(log, "Early exit: Rack already initialized"); - return Ok(rack); - } + .transaction_async_with_retry(|conn| { + let err = err.clone(); + let log = log.clone(); + let authz_service_pool = authz_service_pool.clone(); + let rack_init = rack_init.clone(); + let service_pool = service_pool.clone(); + async move { + let rack_id = rack_init.rack_id; + let services = rack_init.services; + let datasets = rack_init.datasets; + let service_ip_pool_ranges = rack_init.service_ip_pool_ranges; + let internal_dns = rack_init.internal_dns; + let external_dns = rack_init.external_dns; + + // Early exit if the rack has already been initialized. + let rack = rack_dsl::rack + .filter(rack_dsl::id.eq(rack_id)) + .select(Rack::as_select()) + .get_result_async(&conn) + .await + .map_err(|e| { + warn!(log, "Initializing Rack: Rack UUID not found"); + err.set(RackInitError::RackUpdate { + err: e, + rack_id, + }).unwrap(); + DieselError::RollbackTransaction + })?; + if rack.initialized { + info!(log, "Early exit: Rack already initialized"); + return Ok(rack); + } - // Otherwise, insert services and datasets. + // Otherwise, insert services and datasets. - // Set up the IP pool for internal services. - for range in service_ip_pool_ranges { - Self::ip_pool_add_range_on_connection( - &conn, - opctx, - &authz_service_pool, - &range, - ) - .await - .map_err(|err| { - warn!( - log, - "Initializing Rack: Failed to add IP pool range" - ); - TxnError::CustomError(RackInitError::AddingIp(err)) - })?; - } + // Set up the IP pool for internal services. + for range in service_ip_pool_ranges { + Self::ip_pool_add_range_on_connection( + &conn, + opctx, + &authz_service_pool, + &range, + ) + .await + .map_err(|e| { + warn!( + log, + "Initializing Rack: Failed to add IP pool range" + ); + err.set(RackInitError::AddingIp(e)).unwrap(); + DieselError::RollbackTransaction + })?; + } + + // Allocate records for all services. + for service in services { + self.rack_populate_service_records( + &conn, + &log, + &service_pool, + service, + ) + .await + .map_err(|e| { + err.set(e).unwrap(); + DieselError::RollbackTransaction + })?; + } + info!(log, "Inserted services"); + + for dataset in datasets { + use db::schema::dataset::dsl; + let zpool_id = dataset.pool_id; + >::insert_resource( + zpool_id, + diesel::insert_into(dsl::dataset) + .values(dataset.clone()) + .on_conflict(dsl::id) + .do_update() + .set(( + dsl::time_modified.eq(Utc::now()), + dsl::pool_id.eq(excluded(dsl::pool_id)), + dsl::ip.eq(excluded(dsl::ip)), + dsl::port.eq(excluded(dsl::port)), + dsl::kind.eq(excluded(dsl::kind)), + )), + ) + .insert_and_get_result_async(&conn) + .await + .map_err(|e| { + err.set(RackInitError::DatasetInsert { + err: e, + zpool_id, + }).unwrap(); + DieselError::RollbackTransaction + })?; + } + info!(log, "Inserted datasets"); - // Allocate records for all services. - for service in services { - self.rack_populate_service_records( + // Insert the initial contents of the internal and external DNS + // zones. + Self::load_dns_data(&conn, internal_dns) + .await + .map_err(|e| { + err.set(RackInitError::DnsSerialization(e)).unwrap(); + DieselError::RollbackTransaction + })?; + info!(log, "Populated DNS tables for internal DNS"); + + Self::load_dns_data(&conn, external_dns) + .await + .map_err(|e| { + err.set(RackInitError::DnsSerialization(e)).unwrap(); + DieselError::RollbackTransaction + })?; + info!(log, "Populated DNS tables for external DNS"); + + // Create the initial Recovery Silo + self.rack_create_recovery_silo( + &opctx, &conn, &log, - &service_pool, - service, - ) - .await?; - } - info!(log, "Inserted services"); - - for dataset in datasets { - use db::schema::dataset::dsl; - let zpool_id = dataset.pool_id; - >::insert_resource( - zpool_id, - diesel::insert_into(dsl::dataset) - .values(dataset.clone()) - .on_conflict(dsl::id) - .do_update() - .set(( - dsl::time_modified.eq(Utc::now()), - dsl::pool_id.eq(excluded(dsl::pool_id)), - dsl::ip.eq(excluded(dsl::ip)), - dsl::port.eq(excluded(dsl::port)), - dsl::kind.eq(excluded(dsl::kind)), - )), + rack_init.recovery_silo, + rack_init.recovery_silo_fq_dns_name, + rack_init.recovery_user_id, + rack_init.recovery_user_password_hash, + rack_init.dns_update, ) - .insert_and_get_result_async(&conn) .await - .map_err(|err| { - TxnError::CustomError(RackInitError::DatasetInsert { - err, - zpool_id, - }) + .map_err(|e| match e { + TransactionError::CustomError(rack_init_err) => { + err.set(rack_init_err).unwrap(); + DieselError::RollbackTransaction + }, + TransactionError::Database(e) => e, })?; - } - info!(log, "Inserted datasets"); - - // Insert the initial contents of the internal and external DNS - // zones. - Self::load_dns_data(&conn, internal_dns) - .await - .map_err(RackInitError::DnsSerialization) - .map_err(TxnError::CustomError)?; - info!(log, "Populated DNS tables for internal DNS"); - Self::load_dns_data(&conn, external_dns) - .await - .map_err(RackInitError::DnsSerialization) - .map_err(TxnError::CustomError)?; - info!(log, "Populated DNS tables for external DNS"); - - // Create the initial Recovery Silo - self.rack_create_recovery_silo( - &opctx, - &conn, - &log, - rack_init.recovery_silo, - rack_init.recovery_silo_fq_dns_name, - rack_init.recovery_user_id, - rack_init.recovery_user_password_hash, - rack_init.dns_update, - ) - .await?; - - let rack = diesel::update(rack_dsl::rack) - .filter(rack_dsl::id.eq(rack_id)) - .set(( - rack_dsl::initialized.eq(true), - rack_dsl::time_modified.eq(Utc::now()), - )) - .returning(Rack::as_returning()) - .get_result_async::(&conn) - .await - .map_err(|err| { - TxnError::CustomError(RackInitError::RackUpdate { - err, - rack_id, - }) - })?; - Ok::<_, TxnError>(rack) - }) - .await?; + let rack = diesel::update(rack_dsl::rack) + .filter(rack_dsl::id.eq(rack_id)) + .set(( + rack_dsl::initialized.eq(true), + rack_dsl::time_modified.eq(Utc::now()), + )) + .returning(Rack::as_returning()) + .get_result_async::(&conn) + .await + .map_err(|e| { + err.set(RackInitError::RackUpdate { + err: e, + rack_id, + }).unwrap(); + DieselError::RollbackTransaction + })?; + Ok(rack) + } + }, + retry_helper.as_callback(), + ) + .await + .map_err(|e| { + if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + err.into() + } else { + Error::internal_error(&format!("Transaction error: {}", e)) + } + })?; Ok(rack) } @@ -623,42 +641,62 @@ impl DataStore { use crate::db::schema::external_ip::dsl as extip_dsl; use crate::db::schema::service::dsl as service_dsl; - type TxnError = TransactionError; + + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "nexus_external_addresses", + ); self.pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - let ips = extip_dsl::external_ip - .inner_join( - service_dsl::service.on(service_dsl::id - .eq(extip_dsl::parent_id.assume_not_null())), - ) - .filter(extip_dsl::parent_id.is_not_null()) - .filter(extip_dsl::time_deleted.is_null()) - .filter(extip_dsl::is_service) - .filter(service_dsl::kind.eq(db::model::ServiceKind::Nexus)) - .select(ExternalIp::as_select()) - .get_results_async(&conn) - .await? - .into_iter() - .map(|external_ip| external_ip.ip.ip()) - .collect(); - - let dns_zones = self - .dns_zones_list_all_on_connection( - opctx, - &conn, - DnsGroup::External, - ) - .await?; + .transaction_async_with_retry( + |conn| { + let err = err.clone(); + async move { + let ips = + extip_dsl::external_ip + .inner_join(service_dsl::service.on( + service_dsl::id.eq( + extip_dsl::parent_id.assume_not_null(), + ), + )) + .filter(extip_dsl::parent_id.is_not_null()) + .filter(extip_dsl::time_deleted.is_null()) + .filter(extip_dsl::is_service) + .filter( + service_dsl::kind + .eq(db::model::ServiceKind::Nexus), + ) + .select(ExternalIp::as_select()) + .get_results_async(&conn) + .await? + .into_iter() + .map(|external_ip| external_ip.ip.ip()) + .collect(); + + let dns_zones = self + .dns_zones_list_all_on_connection( + opctx, + &conn, + DnsGroup::External, + ) + .await + .map_err(|e| { + err.set(e).unwrap(); + DieselError::RollbackTransaction + })?; - Ok((ips, dns_zones)) - }) + Ok((ips, dns_zones)) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|error: TxnError| match error { - TransactionError::CustomError(err) => err, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + .map_err(|e| { + if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + return err; } + public_error_from_diesel(e, ErrorHandler::Server) }) } } diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index 7c03e4bd40..fabbfaad0e 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -20,11 +20,12 @@ use crate::db::model::SnapshotState; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; -use crate::db::TransactionError; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; +use diesel::result::Error as DieselError; use diesel::OptionalExtension; use nexus_types::identity::Resource; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -36,6 +37,7 @@ use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use omicron_common::bail_unless; use ref_cast::RefCast; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -57,105 +59,121 @@ impl DataStore { InsertError(AsyncInsertError), } - type TxnError = TransactionError; - let snapshot_name = snapshot.name().to_string(); let project_id = snapshot.project_id; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "project_ensure_snapshot", + ); let snapshot: Snapshot = self .pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - use db::schema::snapshot::dsl; - - // If an undeleted snapshot exists in the database with the - // same name and project but a different id to the snapshot - // this function was passed as an argument, then return an - // error here. - // - // As written below, - // - // .on_conflict((dsl::project_id, dsl::name)) - // .filter_target(dsl::time_deleted.is_null()) - // .do_update() - // .set(dsl::time_modified.eq(dsl::time_modified)) - // - // will set any existing record's `time_modified` if the - // project id and name match, even if the snapshot ID does - // not match. diesel supports adding a filter below like so - // (marked with >>): - // - // .on_conflict((dsl::project_id, dsl::name)) - // .filter_target(dsl::time_deleted.is_null()) - // .do_update() - // .set(dsl::time_modified.eq(dsl::time_modified)) - // >> .filter(dsl::id.eq(snapshot.id())) - // - // which will restrict the `insert_into`'s set so that it - // only applies if the snapshot ID matches. But, - // AsyncInsertError does not have a ObjectAlreadyExists - // variant, so this will be returned as CollectionNotFound - // due to the `insert_into` failing. - // - // If this function is passed a snapshot with an ID that - // does not match, but a project and name that does, return - // ObjectAlreadyExists here. + .transaction_async_with_retry( + |conn| { + let err = err.clone(); + let snapshot = snapshot.clone(); + async move { + use db::schema::snapshot::dsl; - let existing_snapshot_id: Option = dsl::snapshot - .filter(dsl::time_deleted.is_null()) - .filter(dsl::name.eq(snapshot.name().to_string())) - .filter(dsl::project_id.eq(snapshot.project_id)) - .select(dsl::id) - .limit(1) - .first_async(&conn) - .await - .optional()?; + // If an undeleted snapshot exists in the database with the + // same name and project but a different id to the snapshot + // this function was passed as an argument, then return an + // error here. + // + // As written below, + // + // .on_conflict((dsl::project_id, dsl::name)) + // .filter_target(dsl::time_deleted.is_null()) + // .do_update() + // .set(dsl::time_modified.eq(dsl::time_modified)) + // + // will set any existing record's `time_modified` if the + // project id and name match, even if the snapshot ID does + // not match. diesel supports adding a filter below like so + // (marked with >>): + // + // .on_conflict((dsl::project_id, dsl::name)) + // .filter_target(dsl::time_deleted.is_null()) + // .do_update() + // .set(dsl::time_modified.eq(dsl::time_modified)) + // >> .filter(dsl::id.eq(snapshot.id())) + // + // which will restrict the `insert_into`'s set so that it + // only applies if the snapshot ID matches. But, + // AsyncInsertError does not have a ObjectAlreadyExists + // variant, so this will be returned as CollectionNotFound + // due to the `insert_into` failing. + // + // If this function is passed a snapshot with an ID that + // does not match, but a project and name that does, return + // ObjectAlreadyExists here. - if let Some(existing_snapshot_id) = existing_snapshot_id { - if existing_snapshot_id != snapshot.id() { - return Err(TransactionError::CustomError( - CustomError::ResourceAlreadyExists, - )); - } - } + let existing_snapshot_id: Option = dsl::snapshot + .filter(dsl::time_deleted.is_null()) + .filter(dsl::name.eq(snapshot.name().to_string())) + .filter(dsl::project_id.eq(snapshot.project_id)) + .select(dsl::id) + .limit(1) + .first_async(&conn) + .await + .optional()?; - Project::insert_resource( - project_id, - diesel::insert_into(dsl::snapshot) - .values(snapshot) - .on_conflict((dsl::project_id, dsl::name)) - .filter_target(dsl::time_deleted.is_null()) - .do_update() - .set(dsl::time_modified.eq(dsl::time_modified)), - ) - .insert_and_get_result_async(&conn) - .await - .map_err(|e| { - TransactionError::CustomError(CustomError::InsertError(e)) - }) - }) - .await - .map_err(|e: TxnError| match e { - TxnError::CustomError(e) => match e { - CustomError::ResourceAlreadyExists => { - Error::ObjectAlreadyExists { - type_name: ResourceType::Snapshot, - object_name: snapshot_name, + if let Some(existing_snapshot_id) = existing_snapshot_id + { + if existing_snapshot_id != snapshot.id() { + err.set(CustomError::ResourceAlreadyExists) + .unwrap(); + return Err(DieselError::RollbackTransaction); + } } + + Project::insert_resource( + project_id, + diesel::insert_into(dsl::snapshot) + .values(snapshot) + .on_conflict((dsl::project_id, dsl::name)) + .filter_target(dsl::time_deleted.is_null()) + .do_update() + .set(dsl::time_modified.eq(dsl::time_modified)), + ) + .insert_and_get_result_async(&conn) + .await + .map_err(|e| { + err.set(CustomError::InsertError(e)).unwrap(); + DieselError::RollbackTransaction + }) } - CustomError::InsertError(e) => match e { - AsyncInsertError::CollectionNotFound => { - Error::ObjectNotFound { - type_name: ResourceType::Project, - lookup_type: LookupType::ById(project_id), + }, + retry_helper.as_callback(), + ) + .await + .map_err(|e| { + if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + match err { + CustomError::ResourceAlreadyExists => { + Error::ObjectAlreadyExists { + type_name: ResourceType::Snapshot, + object_name: snapshot_name, } } - AsyncInsertError::DatabaseError(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - }, - }, - TxnError::Database(e) => { + CustomError::InsertError(e) => match e { + AsyncInsertError::CollectionNotFound => { + Error::ObjectNotFound { + type_name: ResourceType::Project, + lookup_type: LookupType::ById(project_id), + } + } + AsyncInsertError::DatabaseError(e) => { + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + } + }, + } + } else { public_error_from_diesel(e, ErrorHandler::Server) } })?; diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index 0d21419acd..af584265a3 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -11,7 +11,6 @@ use crate::db::datastore::address_lot::{ use crate::db::datastore::UpdatePrecondition; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::{ LldpServiceConfig, Name, SwitchInterfaceConfig, SwitchPort, SwitchPortAddressConfig, SwitchPortBgpPeerConfig, SwitchPortConfig, @@ -870,7 +869,6 @@ impl DataStore { enum SwitchPortCreateError { RackNotFound, } - type TxnError = TransactionError; let conn = self.pool_connection_authorized(opctx).await?; let switch_port = SwitchPort::new( @@ -879,47 +877,64 @@ impl DataStore { port.to_string(), ); + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "switch_port_create", + ); + // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - use db::schema::rack; - use db::schema::rack::dsl as rack_dsl; - rack_dsl::rack - .filter(rack::id.eq(rack_id)) - .select(rack::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| { - TxnError::CustomError(SwitchPortCreateError::RackNotFound) - })?; - - // insert switch port - use db::schema::switch_port::dsl as switch_port_dsl; - let db_switch_port: SwitchPort = - diesel::insert_into(switch_port_dsl::switch_port) - .values(switch_port) - .returning(SwitchPort::as_returning()) - .get_result_async(&conn) - .await?; + conn.transaction_async_with_retry( + |conn| { + let err = err.clone(); + let switch_port = switch_port.clone(); + async move { + use db::schema::rack; + use db::schema::rack::dsl as rack_dsl; + rack_dsl::rack + .filter(rack::id.eq(rack_id)) + .select(rack::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|_| { + err.set(SwitchPortCreateError::RackNotFound) + .unwrap(); + DieselError::RollbackTransaction + })?; - Ok(db_switch_port) - }) + // insert switch port + use db::schema::switch_port::dsl as switch_port_dsl; + let db_switch_port: SwitchPort = + diesel::insert_into(switch_port_dsl::switch_port) + .values(switch_port) + .returning(SwitchPort::as_returning()) + .get_result_async(&conn) + .await?; + + Ok(db_switch_port) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError(SwitchPortCreateError::RackNotFound) => { - Error::invalid_request("rack not found") - } - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => public_error_from_diesel( + .map_err(|e| { + if let Some(err) = err.get() { + match err { + SwitchPortCreateError::RackNotFound => { + Error::invalid_request("rack not found") + } + } + } else { + public_error_from_diesel( e, ErrorHandler::Conflict( ResourceType::SwitchPort, &format!("{}/{}/{}", rack_id, &switch_location, &port,), ), - ), - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, + ) + } }) } diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 6db99465a3..6f57465846 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -12,7 +12,6 @@ use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::fixed_data::vpc::SERVICES_VPC_ID; use crate::db::identity::Resource; use crate::db::model::IncompleteVpc; @@ -37,6 +36,7 @@ use crate::db::queries::vpc::InsertVpcQuery; use crate::db::queries::vpc::VniSearchIter; use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; use crate::db::queries::vpc_subnet::SubnetError; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -61,6 +61,7 @@ use omicron_common::api::external::UpdateResult; use omicron_common::api::external::Vni as ExternalVni; use ref_cast::RefCast; use std::collections::BTreeMap; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -580,16 +581,16 @@ impl DataStore { .set(dsl::time_deleted.eq(now)); let rules_is_empty = rules.is_empty(); - let insert_new_query = Vpc::insert_resource( - authz_vpc.id(), - diesel::insert_into(dsl::vpc_firewall_rule).values(rules), - ); - #[derive(Debug)] enum FirewallUpdateError { CollectionNotFound, } - type TxnError = TransactionError; + + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "vpc_update_firewall_rules", + ); // TODO-scalability: Ideally this would be a CTE so we don't need to // hold a transaction open across multiple roundtrips from the database, @@ -597,36 +598,58 @@ impl DataStore { // legibility of CTEs via diesel right now. self.pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - delete_old_query.execute_async(&conn).await?; - - // The generation count update on the vpc table row will take a - // write lock on the row, ensuring that the vpc was not deleted - // concurently. - if rules_is_empty { - return Ok(vec![]); - } - insert_new_query - .insert_and_get_results_async(&conn) - .await - .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => { - TxnError::CustomError( - FirewallUpdateError::CollectionNotFound, - ) + .transaction_async_with_retry( + |conn| { + let err = err.clone(); + let delete_old_query = delete_old_query.clone(); + let rules = rules.clone(); + async move { + delete_old_query.execute_async(&conn).await?; + + // The generation count update on the vpc table row will take a + // write lock on the row, ensuring that the vpc was not deleted + // concurently. + if rules_is_empty { + return Ok(vec![]); } - AsyncInsertError::DatabaseError(e) => e.into(), - }) - }) + Vpc::insert_resource( + authz_vpc.id(), + diesel::insert_into(dsl::vpc_firewall_rule) + .values(rules), + ) + .insert_and_get_results_async(&conn) + .await + .map_err(|e| match e { + AsyncInsertError::CollectionNotFound => { + err.set( + FirewallUpdateError::CollectionNotFound, + ) + .unwrap(); + return DieselError::RollbackTransaction; + } + AsyncInsertError::DatabaseError(e) => e.into(), + }) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError( - FirewallUpdateError::CollectionNotFound, - ) => Error::not_found_by_id(ResourceType::Vpc, &authz_vpc.id()), - TxnError::Database(e) => public_error_from_diesel( - e, - ErrorHandler::NotFoundByResource(authz_vpc), - ), + .map_err(|e| { + if let Some(err) = err.get() { + match err { + FirewallUpdateError::CollectionNotFound => { + Error::not_found_by_id( + ResourceType::Vpc, + &authz_vpc.id(), + ) + } + } + } else { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByResource(authz_vpc), + ) + } }) } From 04aa33756c2f9bb53fe3b2a144149951420f951c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 16:25:10 -0800 Subject: [PATCH 07/25] Rack setup not ready for retry (nested transactions) --- nexus/db-queries/src/db/datastore/rack.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index cd8383e9e3..a798e0ca38 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -414,14 +414,17 @@ impl DataStore { // the low-frequency of calls, this optimization has been deferred. let log = opctx.log.clone(); let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "rack_set_initialized", - ); + + // NOTE: This transaction cannot yet be made retryable, as it uses + // nested transactions. +// let retry_helper = RetryHelper::new( +// &self.transaction_retry_producer, +// "rack_set_initialized", +// ); let rack = self .pool_connection_authorized(opctx) .await? - .transaction_async_with_retry(|conn| { + .transaction_async(|conn| { let err = err.clone(); let log = log.clone(); let authz_service_pool = authz_service_pool.clone(); @@ -451,7 +454,7 @@ impl DataStore { })?; if rack.initialized { info!(log, "Early exit: Rack already initialized"); - return Ok(rack); + return Ok::<_, DieselError>(rack); } // Otherwise, insert services and datasets. @@ -577,7 +580,6 @@ impl DataStore { Ok(rack) } }, - retry_helper.as_callback(), ) .await .map_err(|e| { From 5b19bc52c4a4c2524ee7d8252bd63751d5be4552 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 19:21:01 -0800 Subject: [PATCH 08/25] Fix error propagation --- nexus/db-queries/src/db/datastore/dns.rs | 9 +- .../src/db/datastore/network_interface.rs | 4 + nexus/db-queries/src/db/datastore/project.rs | 99 ++++++++++--------- nexus/db-queries/src/db/datastore/rack.rs | 19 ++-- nexus/db-queries/src/db/datastore/snapshot.rs | 34 +++---- .../src/db/datastore/switch_port.rs | 26 ++++- .../virtual_provisioning_collection.rs | 20 ++-- nexus/db-queries/src/db/datastore/vpc.rs | 2 +- nexus/db-queries/src/db/error.rs | 45 +++++++++ 9 files changed, 168 insertions(+), 90 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/dns.rs b/nexus/db-queries/src/db/datastore/dns.rs index f7ad97593e..5ecc2ae63e 100644 --- a/nexus/db-queries/src/db/datastore/dns.rs +++ b/nexus/db-queries/src/db/datastore/dns.rs @@ -67,7 +67,9 @@ impl DataStore { dns_group: DnsGroup, ) -> ListResultVec { let conn = self.pool_connection_authorized(opctx).await?; - self.dns_zones_list_all_on_connection(opctx, &conn, dns_group).await + Ok(self + .dns_zones_list_all_on_connection(opctx, &conn, dns_group) + .await?) } /// Variant of [`Self::dns_zones_list_all`] which may be called from a @@ -77,7 +79,7 @@ impl DataStore { opctx: &OpContext, conn: &async_bb8_diesel::Connection, dns_group: DnsGroup, - ) -> ListResultVec { + ) -> Result, TransactionError> { use db::schema::dns_zone::dsl; const LIMIT: usize = 5; @@ -88,8 +90,7 @@ impl DataStore { .limit(i64::try_from(LIMIT).unwrap()) .select(DnsZone::as_select()) .load_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .await?; bail_unless!( list.len() < LIMIT, diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index 8c529d68de..b01dc16ced 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -12,6 +12,7 @@ use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::cte_utils::BoxedQuery; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::model::IncompleteNetworkInterface; use crate::db::model::Instance; @@ -505,6 +506,9 @@ impl DataStore { .execute_async(&conn) .await { + if retryable(&e) { + return Err(e); + } err.set(NetworkInterfaceUpdateError::FailedToUnsetPrimary(e)).unwrap(); return Err(diesel::result::Error::RollbackTransaction); } diff --git a/nexus/db-queries/src/db/datastore/project.rs b/nexus/db-queries/src/db/datastore/project.rs index 22c9cad1e1..972cd79466 100644 --- a/nexus/db-queries/src/db/datastore/project.rs +++ b/nexus/db-queries/src/db/datastore/project.rs @@ -12,8 +12,8 @@ use crate::db; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::fixed_data::project::SERVICES_PROJECT; use crate::db::fixed_data::silo::INTERNAL_SILO_ID; use crate::db::identity::Resource; @@ -28,6 +28,7 @@ use crate::transaction_retry::RetryHelper; use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; use chrono::Utc; use diesel::prelude::*; +use diesel::result::Error as DieselError; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; @@ -178,9 +179,12 @@ impl DataStore { .map_err(|e| match e { AsyncInsertError::CollectionNotFound => { err.set(authz_silo_inner.not_found()).unwrap(); - return diesel::result::Error::RollbackTransaction; + return DieselError::RollbackTransaction; } AsyncInsertError::DatabaseError(e) => { + if retryable(&e) { + return e; + } err.set(public_error_from_diesel( e, ErrorHandler::Conflict( @@ -188,7 +192,7 @@ impl DataStore { &name, ), )).unwrap(); - return diesel::result::Error::RollbackTransaction; + return DieselError::RollbackTransaction; } })?; @@ -200,11 +204,7 @@ impl DataStore { CollectionTypeProvisioned::Project, ), ) - .await - .map_err(|e| { - err.set(e).unwrap(); - return diesel::result::Error::RollbackTransaction; - })?; + .await?; Ok(project) } }, @@ -252,47 +252,58 @@ impl DataStore { use db::schema::project::dsl; - type TxnError = TransactionError; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "project_delete", + ); self.pool_connection_authorized(opctx) .await? - .transaction_async(|conn| async move { - let now = Utc::now(); - let updated_rows = diesel::update(dsl::project) - .filter(dsl::time_deleted.is_null()) - .filter(dsl::id.eq(authz_project.id())) - .filter(dsl::rcgen.eq(db_project.rcgen)) - .set(dsl::time_deleted.eq(now)) - .returning(Project::as_returning()) - .execute_async(&conn) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::NotFoundByResource(authz_project), - ) - })?; - - if updated_rows == 0 { - return Err(TxnError::CustomError(Error::InvalidRequest { - message: - "deletion failed due to concurrent modification" - .to_string(), - })); + .transaction_async_with_retry(|conn| { + let err = err.clone(); + async move { + let now = Utc::now(); let updated_rows = diesel::update(dsl::project) + .filter(dsl::time_deleted.is_null()) + .filter(dsl::id.eq(authz_project.id())) + .filter(dsl::rcgen.eq(db_project.rcgen)) + .set(dsl::time_deleted.eq(now)) + .returning(Project::as_returning()) + .execute_async(&conn) + .await + .map_err(|e| { + if retryable(&e) { + return e; + } + err.set(public_error_from_diesel( + e, + ErrorHandler::NotFoundByResource(authz_project), + )).unwrap(); + DieselError::RollbackTransaction + })?; + + if updated_rows == 0 { + err.set(Error::InvalidRequest { + message: + "deletion failed due to concurrent modification" + .to_string(), + }).unwrap(); + return Err(DieselError::RollbackTransaction); + } + + self.virtual_provisioning_collection_delete_on_connection( + &conn, + db_project.id(), + ) + .await?; + Ok(()) } - - self.virtual_provisioning_collection_delete_on_connection( - &conn, - db_project.id(), - ) - .await?; - Ok(()) - }) + }, retry_helper.as_callback()) .await - .map_err(|e| match e { - TxnError::CustomError(e) => e, - TxnError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) + .map_err(|e| { + if let Some(err) = err.get() { + return err.clone(); } + public_error_from_diesel(e, ErrorHandler::Server) })?; Ok(()) } diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index a798e0ca38..c69c66e0b0 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -417,10 +417,10 @@ impl DataStore { // NOTE: This transaction cannot yet be made retryable, as it uses // nested transactions. -// let retry_helper = RetryHelper::new( -// &self.transaction_retry_producer, -// "rack_set_initialized", -// ); + // let retry_helper = RetryHelper::new( + // &self.transaction_retry_producer, + // "rack_set_initialized", + // ); let rack = self .pool_connection_authorized(opctx) .await? @@ -683,9 +683,12 @@ impl DataStore { DnsGroup::External, ) .await - .map_err(|e| { - err.set(e).unwrap(); - DieselError::RollbackTransaction + .map_err(|e| match e.take_retryable() { + Ok(e) => return e, + Err(e) => { + err.set(e).unwrap(); + DieselError::RollbackTransaction + } })?; Ok((ips, dns_zones)) @@ -696,7 +699,7 @@ impl DataStore { .await .map_err(|e| { if let Some(err) = Arc::try_unwrap(err).unwrap().take() { - return err; + return err.into(); } public_error_from_diesel(e, ErrorHandler::Server) }) diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index fabbfaad0e..5fd6326f56 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -56,7 +56,7 @@ impl DataStore { ResourceAlreadyExists, #[error("saw AsyncInsertError")] - InsertError(AsyncInsertError), + InsertError(Error), } let snapshot_name = snapshot.name().to_string(); @@ -140,9 +140,20 @@ impl DataStore { ) .insert_and_get_result_async(&conn) .await - .map_err(|e| { - err.set(CustomError::InsertError(e)).unwrap(); - DieselError::RollbackTransaction + .map_err(|e| match e { + AsyncInsertError::CollectionNotFound => { + err.set(CustomError::InsertError( + Error::ObjectNotFound { + type_name: ResourceType::Project, + lookup_type: LookupType::ById( + project_id, + ), + }, + )) + .unwrap(); + DieselError::RollbackTransaction + } + AsyncInsertError::DatabaseError(e) => e, }) } }, @@ -158,20 +169,7 @@ impl DataStore { object_name: snapshot_name, } } - CustomError::InsertError(e) => match e { - AsyncInsertError::CollectionNotFound => { - Error::ObjectNotFound { - type_name: ResourceType::Project, - lookup_type: LookupType::ById(project_id), - } - } - AsyncInsertError::DatabaseError(e) => { - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - } - }, + CustomError::InsertError(e) => e, } } else { public_error_from_diesel(e, ErrorHandler::Server) diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index af584265a3..59238dfb1c 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -10,6 +10,7 @@ use crate::db::datastore::address_lot::{ }; use crate::db::datastore::UpdatePrecondition; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::model::{ LldpServiceConfig, Name, SwitchInterfaceConfig, SwitchPort, @@ -368,7 +369,10 @@ impl DataStore { .limit(1) .first_async::(&conn) .await - .map_err(|_| { + .map_err(|e| { + if retryable(&e) { + return e; + } err.set(SwitchPortSettingsCreateError::AddressLotNotFound).unwrap(); DieselError::RollbackTransaction })? @@ -487,7 +491,10 @@ impl DataStore { .limit(1) .first_async::(&conn) .await - .map_err(|_| { + .map_err(|e| { + if retryable(&e) { + return e; + } err.set(SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound).unwrap(); DieselError::RollbackTransaction })? @@ -704,7 +711,10 @@ impl DataStore { .limit(1) .first_async::(&conn) .await - .map_err(|_| { + .map_err(|e| { + if retryable(&e) { + return e; + } err.set( SwitchPortSettingsGetError::NotFound( name.clone(), @@ -898,7 +908,10 @@ impl DataStore { .limit(1) .first_async::(&conn) .await - .map_err(|_| { + .map_err(|e| { + if retryable(&e) { + return e; + } err.set(SwitchPortCreateError::RackNotFound) .unwrap(); DieselError::RollbackTransaction @@ -980,7 +993,10 @@ impl DataStore { .limit(1) .first_async::(&conn) .await - .map_err(|_| { + .map_err(|e| { + if retryable(&e) { + return e; + } err.set(SwitchPortDeleteError::NotFound).unwrap(); DieselError::RollbackTransaction })?; diff --git a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs index 83856e10c7..05c2d012ac 100644 --- a/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs +++ b/nexus/db-queries/src/db/datastore/virtual_provisioning_collection.rs @@ -15,6 +15,7 @@ use crate::db::pool::DbConnection; use crate::db::queries::virtual_provisioning_collection_update::VirtualProvisioningCollectionUpdate; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use diesel::result::Error as DieselError; use omicron_common::api::external::{DeleteResult, Error}; use uuid::Uuid; @@ -52,13 +53,14 @@ impl DataStore { virtual_provisioning_collection, ) .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub(crate) async fn virtual_provisioning_collection_create_on_connection( &self, conn: &async_bb8_diesel::Connection, virtual_provisioning_collection: VirtualProvisioningCollection, - ) -> Result, Error> { + ) -> Result, DieselError> { use db::schema::virtual_provisioning_collection::dsl; let provisions: Vec = @@ -66,12 +68,10 @@ impl DataStore { .values(virtual_provisioning_collection) .on_conflict_do_nothing() .get_results_async(conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; - self.virtual_provisioning_collection_producer - .append_all_metrics(&provisions)?; + .await?; + let _ = self + .virtual_provisioning_collection_producer + .append_all_metrics(&provisions); Ok(provisions) } @@ -105,6 +105,7 @@ impl DataStore { let conn = self.pool_connection_authorized(opctx).await?; self.virtual_provisioning_collection_delete_on_connection(&conn, id) .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } /// Delete a [`VirtualProvisioningCollection`] object. @@ -112,7 +113,7 @@ impl DataStore { &self, conn: &async_bb8_diesel::Connection, id: Uuid, - ) -> DeleteResult { + ) -> Result<(), DieselError> { use db::schema::virtual_provisioning_collection::dsl; // NOTE: We don't really need to extract the value we're deleting from @@ -122,8 +123,7 @@ impl DataStore { .filter(dsl::id.eq(id)) .returning(VirtualProvisioningCollection::as_select()) .get_result_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .await?; assert!( collection.is_empty(), "Collection deleted while non-empty: {collection:?}" diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 6f57465846..67ee644942 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -627,7 +627,7 @@ impl DataStore { .unwrap(); return DieselError::RollbackTransaction; } - AsyncInsertError::DatabaseError(e) => e.into(), + AsyncInsertError::DatabaseError(e) => e, }) } }, diff --git a/nexus/db-queries/src/db/error.rs b/nexus/db-queries/src/db/error.rs index d4770d4824..ec0afcc8c0 100644 --- a/nexus/db-queries/src/db/error.rs +++ b/nexus/db-queries/src/db/error.rs @@ -28,12 +28,57 @@ pub enum TransactionError { Database(#[from] DieselError), } +pub fn retryable(error: &DieselError) -> bool { + match error { + DieselError::DatabaseError(kind, boxed_error_information) => match kind + { + DieselErrorKind::SerializationFailure => { + return boxed_error_information + .message() + .starts_with("restart transaction"); + } + _ => false, + }, + _ => false, + } +} + +impl TransactionError { + /// If this error is a retryable DieselError, unpack it as Ok(err). + /// + /// Otherwise, return Self as an error, so the error still can be used + /// in other cases. + pub fn take_retryable(self) -> Result { + match self { + TransactionError::CustomError(_) => Err(self), + TransactionError::Database(err) => { + if retryable(&err) { + Ok(err) + } else { + Err(TransactionError::Database(err)) + } + } + } + } +} + impl From for TransactionError { fn from(err: PublicError) -> Self { TransactionError::CustomError(err) } } +impl From> for PublicError { + fn from(err: TransactionError) -> Self { + match err { + TransactionError::CustomError(err) => err, + TransactionError::Database(err) => { + public_error_from_diesel(err, ErrorHandler::Server) + } + } + } +} + /// Summarizes details provided with a database error. fn format_database_error( kind: DieselErrorKind, From ad3bc08246ca2b5d4d44c2cf16e786023b540793 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 19:55:05 -0800 Subject: [PATCH 09/25] bgp, db_metadata, volume converted --- nexus/db-queries/src/db/datastore/bgp.rs | 375 +++++---- .../src/db/datastore/db_metadata.rs | 12 +- nexus/db-queries/src/db/datastore/volume.rs | 711 +++++++++--------- 3 files changed, 584 insertions(+), 514 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/bgp.rs b/nexus/db-queries/src/db/datastore/bgp.rs index ff314a2564..3d8ff1a68f 100644 --- a/nexus/db-queries/src/db/datastore/bgp.rs +++ b/nexus/db-queries/src/db/datastore/bgp.rs @@ -2,13 +2,15 @@ use super::DataStore; use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::Name; use crate::db::model::{BgpAnnounceSet, BgpAnnouncement, BgpConfig}; use crate::db::pagination::paginated; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; use chrono::Utc; +use diesel::result::Error as DieselError; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use nexus_types::external_api::params; use nexus_types::identity::Resource; @@ -18,6 +20,7 @@ use omicron_common::api::external::{ ResourceType, }; use ref_cast::RefCast; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -32,29 +35,36 @@ impl DataStore { }; let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async(|conn| async move { - let id: Uuid = match &config.bgp_announce_set_id { - NameOrId::Name(name) => { - announce_set_dsl::bgp_announce_set - .filter(bgp_announce_set::time_deleted.is_null()) - .filter(bgp_announce_set::name.eq(name.to_string())) - .select(bgp_announce_set::id) - .limit(1) - .first_async::(&conn) - .await? - } - NameOrId::Id(id) => *id, - }; + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "bgp_config_set", + ); + pool.transaction_async_with_retry( + |conn| async move { + let id: Uuid = match &config.bgp_announce_set_id { + NameOrId::Name(name) => { + announce_set_dsl::bgp_announce_set + .filter(bgp_announce_set::time_deleted.is_null()) + .filter(bgp_announce_set::name.eq(name.to_string())) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await? + } + NameOrId::Id(id) => *id, + }; - let config = BgpConfig::from_config_create(config, id); + let config = BgpConfig::from_config_create(config, id); - let result = diesel::insert_into(dsl::bgp_config) - .values(config.clone()) - .returning(BgpConfig::as_returning()) - .get_result_async(&conn) - .await?; - Ok(result) - }) + let result = diesel::insert_into(dsl::bgp_config) + .values(config.clone()) + .returning(BgpConfig::as_returning()) + .get_result_async(&conn) + .await?; + Ok(result) + }, + retry_helper.as_callback(), + ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -74,51 +84,60 @@ impl DataStore { enum BgpConfigDeleteError { ConfigInUse, } - type TxnError = TransactionError; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "bgp_config_delete", + ); let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async(|conn| async move { - let name_or_id = sel.name_or_id.clone(); - - let id: Uuid = match name_or_id { - NameOrId::Id(id) => id, - NameOrId::Name(name) => { - bgp_config_dsl::bgp_config - .filter(bgp_config::name.eq(name.to_string())) - .select(bgp_config::id) - .limit(1) - .first_async::(&conn) - .await? + pool.transaction_async_with_retry(|conn| { + let err = err.clone(); + async move { + let name_or_id = sel.name_or_id.clone(); + + let id: Uuid = match name_or_id { + NameOrId::Id(id) => id, + NameOrId::Name(name) => { + bgp_config_dsl::bgp_config + .filter(bgp_config::name.eq(name.to_string())) + .select(bgp_config::id) + .limit(1) + .first_async::(&conn) + .await? + } + }; + + let count = + sps_bgp_peer_config_dsl::switch_port_settings_bgp_peer_config + .filter(sps_bgp_peer_config::bgp_config_id.eq(id)) + .count() + .execute_async(&conn) + .await?; + + if count > 0 { + err.set(BgpConfigDeleteError::ConfigInUse).unwrap(); + return Err(DieselError::RollbackTransaction); } - }; - let count = - sps_bgp_peer_config_dsl::switch_port_settings_bgp_peer_config - .filter(sps_bgp_peer_config::bgp_config_id.eq(id)) - .count() + diesel::update(bgp_config_dsl::bgp_config) + .filter(bgp_config_dsl::id.eq(id)) + .set(bgp_config_dsl::time_deleted.eq(Utc::now())) .execute_async(&conn) .await?; - if count > 0 { - return Err(TxnError::CustomError( - BgpConfigDeleteError::ConfigInUse, - )); + Ok(()) } - - diesel::update(bgp_config_dsl::bgp_config) - .filter(bgp_config_dsl::id.eq(id)) - .set(bgp_config_dsl::time_deleted.eq(Utc::now())) - .execute_async(&conn) - .await?; - - Ok(()) - }) + }, retry_helper.as_callback()) .await - .map_err(|e| match e { - TxnError::CustomError(BgpConfigDeleteError::ConfigInUse) => { - Error::invalid_request("BGP config in use") - } - TxnError::Database(e) => { + .map_err(|e| { + if let Some(err) = err.get() { + match err { + BgpConfigDeleteError::ConfigInUse => { + Error::invalid_request("BGP config in use") + } + } + } else { public_error_from_diesel(e, ErrorHandler::Server) } }) @@ -195,44 +214,62 @@ impl DataStore { enum BgpAnnounceListError { AnnounceSetNotFound(Name), } - type TxnError = TransactionError; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "bgp_announce_list", + ); let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async(|conn| async move { - let name_or_id = sel.name_or_id.clone(); - - let announce_id: Uuid = match name_or_id { - NameOrId::Id(id) => id, - NameOrId::Name(name) => announce_set_dsl::bgp_announce_set - .filter(bgp_announce_set::time_deleted.is_null()) - .filter(bgp_announce_set::name.eq(name.to_string())) - .select(bgp_announce_set::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|_| { - TxnError::CustomError( - BgpAnnounceListError::AnnounceSetNotFound( + pool.transaction_async_with_retry( + |conn| { + let err = err.clone(); + async move { + let name_or_id = sel.name_or_id.clone(); + + let announce_id: Uuid = match name_or_id { + NameOrId::Id(id) => id, + NameOrId::Name(name) => announce_set_dsl::bgp_announce_set + .filter(bgp_announce_set::time_deleted.is_null()) + .filter(bgp_announce_set::name.eq(name.to_string())) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + if retryable(&e) { + return e; + } + err.set(BgpAnnounceListError::AnnounceSetNotFound( Name::from(name.clone()), - ), - ) - })?, - }; + )).unwrap(); + DieselError::RollbackTransaction + })?, + }; - let result = announce_dsl::bgp_announcement - .filter(announce_dsl::announce_set_id.eq(announce_id)) - .select(BgpAnnouncement::as_select()) - .load_async(&conn) - .await?; + let result = announce_dsl::bgp_announcement + .filter(announce_dsl::announce_set_id.eq(announce_id)) + .select(BgpAnnouncement::as_select()) + .load_async(&conn) + .await?; - Ok(result) - }) + Ok(result) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError( - BgpAnnounceListError::AnnounceSetNotFound(name), - ) => Error::not_found_by_name(ResourceType::BgpAnnounceSet, &name), - TxnError::Database(e) => { + .map_err(|e| { + if let Some(err) = err.get() { + match err { + BgpAnnounceListError::AnnounceSetNotFound(name) => { + Error::not_found_by_name( + ResourceType::BgpAnnounceSet, + &name, + ) + } + } + } else { public_error_from_diesel(e, ErrorHandler::Server) } }) @@ -246,35 +283,43 @@ impl DataStore { use db::schema::bgp_announce_set::dsl as announce_set_dsl; use db::schema::bgp_announcement::dsl as bgp_announcement_dsl; + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "bgp_create_announce_set", + ); let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async(|conn| async move { - let bas: BgpAnnounceSet = announce.clone().into(); - - let db_as: BgpAnnounceSet = - diesel::insert_into(announce_set_dsl::bgp_announce_set) - .values(bas.clone()) - .returning(BgpAnnounceSet::as_returning()) - .get_result_async::(&conn) - .await?; - - let mut db_annoucements = Vec::new(); - for a in &announce.announcement { - let an = BgpAnnouncement { - announce_set_id: db_as.id(), - address_lot_block_id: bas.identity.id, - network: a.network.into(), - }; - let an = - diesel::insert_into(bgp_announcement_dsl::bgp_announcement) - .values(an.clone()) - .returning(BgpAnnouncement::as_returning()) - .get_result_async::(&conn) + pool.transaction_async_with_retry( + |conn| async move { + let bas: BgpAnnounceSet = announce.clone().into(); + + let db_as: BgpAnnounceSet = + diesel::insert_into(announce_set_dsl::bgp_announce_set) + .values(bas.clone()) + .returning(BgpAnnounceSet::as_returning()) + .get_result_async::(&conn) .await?; - db_annoucements.push(an); - } - Ok((db_as, db_annoucements)) - }) + let mut db_annoucements = Vec::new(); + for a in &announce.announcement { + let an = BgpAnnouncement { + announce_set_id: db_as.id(), + address_lot_block_id: bas.identity.id, + network: a.network.into(), + }; + let an = diesel::insert_into( + bgp_announcement_dsl::bgp_announcement, + ) + .values(an.clone()) + .returning(BgpAnnouncement::as_returning()) + .get_result_async::(&conn) + .await?; + db_annoucements.push(an); + } + + Ok((db_as, db_annoucements)) + }, + retry_helper.as_callback(), + ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -295,55 +340,71 @@ impl DataStore { enum BgpAnnounceSetDeleteError { AnnounceSetInUse, } - type TxnError = TransactionError; let pool = self.pool_connection_authorized(opctx).await?; let name_or_id = sel.name_or_id.clone(); - pool.transaction_async(|conn| async move { - let id: Uuid = match name_or_id { - NameOrId::Name(name) => { - announce_set_dsl::bgp_announce_set - .filter(bgp_announce_set::name.eq(name.to_string())) - .select(bgp_announce_set::id) - .limit(1) - .first_async::(&conn) - .await? - } - NameOrId::Id(id) => id, - }; - - let count = bgp_config_dsl::bgp_config - .filter(bgp_config::bgp_announce_set_id.eq(id)) - .count() - .execute_async(&conn) - .await?; - - if count > 0 { - return Err(TxnError::CustomError( - BgpAnnounceSetDeleteError::AnnounceSetInUse, - )); - } + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "bgp_delete_announce_set", + ); + pool.transaction_async_with_retry( + |conn| { + let err = err.clone(); + let name_or_id = name_or_id.clone(); + async move { + let id: Uuid = match name_or_id { + NameOrId::Name(name) => { + announce_set_dsl::bgp_announce_set + .filter( + bgp_announce_set::name.eq(name.to_string()), + ) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await? + } + NameOrId::Id(id) => id, + }; + + let count = bgp_config_dsl::bgp_config + .filter(bgp_config::bgp_announce_set_id.eq(id)) + .count() + .execute_async(&conn) + .await?; - diesel::update(announce_set_dsl::bgp_announce_set) - .filter(announce_set_dsl::id.eq(id)) - .set(announce_set_dsl::time_deleted.eq(Utc::now())) - .execute_async(&conn) - .await?; + if count > 0 { + err.set(BgpAnnounceSetDeleteError::AnnounceSetInUse) + .unwrap(); + return Err(DieselError::RollbackTransaction); + } - diesel::delete(bgp_announcement_dsl::bgp_announcement) - .filter(bgp_announcement_dsl::announce_set_id.eq(id)) - .execute_async(&conn) - .await?; + diesel::update(announce_set_dsl::bgp_announce_set) + .filter(announce_set_dsl::id.eq(id)) + .set(announce_set_dsl::time_deleted.eq(Utc::now())) + .execute_async(&conn) + .await?; - Ok(()) - }) + diesel::delete(bgp_announcement_dsl::bgp_announcement) + .filter(bgp_announcement_dsl::announce_set_id.eq(id)) + .execute_async(&conn) + .await?; + + Ok(()) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError( - BgpAnnounceSetDeleteError::AnnounceSetInUse, - ) => Error::invalid_request("BGP announce set in use"), - TxnError::Database(e) => { + .map_err(|e| { + if let Some(err) = err.get() { + match err { + BgpAnnounceSetDeleteError::AnnounceSetInUse => { + Error::invalid_request("BGP announce set in use") + } + } + } else { public_error_from_diesel(e, ErrorHandler::Server) } }) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index 0ae61a7c38..5d50b69557 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -8,7 +8,6 @@ use super::DataStore; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::TransactionError; use async_bb8_diesel::{ AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, }; @@ -341,7 +340,7 @@ impl DataStore { target: &SemverVersion, sql: &String, ) -> Result<(), Error> { - let result = self.pool_connection_unauthorized().await?.transaction_async(|conn| async move { + let result = self.pool_connection_unauthorized().await?.transaction_async_with_retry(|conn| async move { if target.to_string() != EARLIEST_SUPPORTED_VERSION { let validate_version_query = format!("SELECT CAST(\ IF(\ @@ -356,15 +355,12 @@ impl DataStore { conn.batch_execute_async(&validate_version_query).await?; } conn.batch_execute_async(&sql).await?; - Ok::<_, TransactionError<()>>(()) - }).await; + Ok(()) + }, || async move { true }).await; match result { Ok(()) => Ok(()), - Err(TransactionError::CustomError(())) => panic!("No custom error"), - Err(TransactionError::Database(e)) => { - Err(public_error_from_diesel(e, ErrorHandler::Server)) - } + Err(e) => Err(public_error_from_diesel(e, ErrorHandler::Server)), } } diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index 5d753f0742..44fd43787f 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -7,18 +7,20 @@ use super::DataStore; use crate::db; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::identity::Asset; use crate::db::model::Dataset; use crate::db::model::Region; use crate::db::model::RegionSnapshot; use crate::db::model::Volume; use crate::db::queries::volume::DecreaseCrucibleResourceCountAndSoftDeleteVolume; +use crate::transaction_retry::RetryHelper; use anyhow::bail; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; +use diesel::result::Error as DieselError; use diesel::OptionalExtension; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; @@ -30,6 +32,7 @@ use serde::Deserialize; use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -44,7 +47,6 @@ impl DataStore { #[error("Serde error during Volume creation: {0}")] SerdeError(#[from] serde_json::Error), } - type TxnError = TransactionError; // Grab all the targets that the volume construction request references. // Do this outside the transaction, as the data inside volume doesn't @@ -66,86 +68,100 @@ impl DataStore { crucible_targets }; + let err = Arc::new(OnceLock::new()); + let retry_helper = + RetryHelper::new(&self.transaction_retry_producer, "volume_create"); self.pool_connection_unauthorized() .await? - .transaction_async(|conn| async move { - let maybe_volume: Option = dsl::volume - .filter(dsl::id.eq(volume.id())) - .select(Volume::as_select()) - .first_async(&conn) - .await - .optional() - .map_err(|e| { - TxnError::CustomError(VolumeCreationError::Public( - public_error_from_diesel(e, ErrorHandler::Server), - )) - })?; - - // If the volume existed already, return it and do not increase - // usage counts. - if let Some(volume) = maybe_volume { - return Ok(volume); - } - - // TODO do we need on_conflict do_nothing here? if the transaction - // model is read-committed, the SELECT above could return nothing, - // and the INSERT here could still result in a conflict. - // - // See also https://github.com/oxidecomputer/omicron/issues/1168 - let volume: Volume = diesel::insert_into(dsl::volume) - .values(volume.clone()) - .on_conflict(dsl::id) - .do_nothing() - .returning(Volume::as_returning()) - .get_result_async(&conn) - .await - .map_err(|e| { - TxnError::CustomError(VolumeCreationError::Public( - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Volume, - volume.id().to_string().as_str(), - ), - ), - )) - })?; + .transaction_async_with_retry( + |conn| { + let err = err.clone(); + let crucible_targets = crucible_targets.clone(); + let volume = volume.clone(); + async move { + let maybe_volume: Option = dsl::volume + .filter(dsl::id.eq(volume.id())) + .select(Volume::as_select()) + .first_async(&conn) + .await + .optional()?; + + // If the volume existed already, return it and do not increase + // usage counts. + if let Some(volume) = maybe_volume { + return Ok(volume); + } - // Increase the usage count for Crucible resources according to the - // contents of the volume. + // TODO do we need on_conflict do_nothing here? if the transaction + // model is read-committed, the SELECT above could return nothing, + // and the INSERT here could still result in a conflict. + // + // See also https://github.com/oxidecomputer/omicron/issues/1168 + let volume: Volume = diesel::insert_into(dsl::volume) + .values(volume.clone()) + .on_conflict(dsl::id) + .do_nothing() + .returning(Volume::as_returning()) + .get_result_async(&conn) + .await + .map_err(|e| { + if retryable(&e) { + return e; + } + err.set(VolumeCreationError::Public( + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Volume, + volume.id().to_string().as_str(), + ), + ), + )) + .unwrap(); + DieselError::RollbackTransaction + })?; - // Increase the number of uses for each referenced region snapshot. - use db::schema::region_snapshot::dsl as rs_dsl; - for read_only_target in &crucible_targets.read_only_targets { - diesel::update(rs_dsl::region_snapshot) - .filter( - rs_dsl::snapshot_addr.eq(read_only_target.clone()), - ) - .filter(rs_dsl::deleting.eq(false)) - .set( - rs_dsl::volume_references - .eq(rs_dsl::volume_references + 1), - ) - .execute_async(&conn) - .await - .map_err(|e| { - TxnError::CustomError(VolumeCreationError::Public( - public_error_from_diesel( - e, - ErrorHandler::Server, - ), - )) - })?; - } + // Increase the usage count for Crucible resources according to the + // contents of the volume. + + // Increase the number of uses for each referenced region snapshot. + use db::schema::region_snapshot::dsl as rs_dsl; + for read_only_target in + &crucible_targets.read_only_targets + { + diesel::update(rs_dsl::region_snapshot) + .filter( + rs_dsl::snapshot_addr + .eq(read_only_target.clone()), + ) + .filter(rs_dsl::deleting.eq(false)) + .set( + rs_dsl::volume_references + .eq(rs_dsl::volume_references + 1), + ) + .execute_async(&conn) + .await?; + } - Ok(volume) - }) + Ok(volume) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError(VolumeCreationError::Public(e)) => e, - - _ => { - Error::internal_error(&format!("Transaction error: {}", e)) + .map_err(|e| { + if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + match err { + VolumeCreationError::Public(err) => err, + VolumeCreationError::SerdeError(err) => { + Error::internal_error(&format!( + "Transaction error: {}", + err + )) + } + } + } else { + public_error_from_diesel(e, ErrorHandler::Server) } }) } @@ -192,16 +208,12 @@ impl DataStore { #[derive(Debug, thiserror::Error)] enum VolumeGetError { - #[error("Error during volume_checkout: {0}")] - DieselError(#[from] diesel::result::Error), - #[error("Serde error during volume_checkout: {0}")] SerdeError(#[from] serde_json::Error), #[error("Updated {0} database rows, expected {1}")] UnexpectedDatabaseUpdate(usize, usize), } - type TxnError = TransactionError; // We perform a transaction here, to be sure that on completion // of this, the database contains an updated version of the @@ -209,141 +221,147 @@ impl DataStore { // types that require it). The generation number (along with the // rest of the volume data) that was in the database is what is // returned to the caller. + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "volume_checkout", + ); self.pool_connection_unauthorized() .await? - .transaction_async(|conn| async move { - // Grab the volume in question. - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await?; - - // Turn the volume.data into the VolumeConstructionRequest - let vcr: VolumeConstructionRequest = - serde_json::from_str(volume.data()).map_err(|e| { - TxnError::CustomError(VolumeGetError::SerdeError(e)) - })?; - - // Look to see if the VCR is a Volume type, and if so, look at - // its sub_volumes. If they are of type Region, then we need - // to update their generation numbers and record that update - // back to the database. We return to the caller whatever the - // original volume data was we pulled from the database. - match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - let mut update_needed = false; - let mut new_sv = Vec::new(); - for sv in sub_volumes { - match sv { - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen, - } => { - update_needed = true; - new_sv.push( - VolumeConstructionRequest::Region { - block_size, - blocks_per_extent, - extent_count, - opts, - gen: gen + 1, - }, - ); - } - _ => { - new_sv.push(sv); + .transaction_async_with_retry(|conn| { + let err = err.clone(); + async move { + // Grab the volume in question. + let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(&conn) + .await?; + + // Turn the volume.data into the VolumeConstructionRequest + let vcr: VolumeConstructionRequest = + serde_json::from_str(volume.data()).map_err(|e| { + err.set(VolumeGetError::SerdeError(e)).unwrap(); + DieselError::RollbackTransaction + })?; + + // Look to see if the VCR is a Volume type, and if so, look at + // its sub_volumes. If they are of type Region, then we need + // to update their generation numbers and record that update + // back to the database. We return to the caller whatever the + // original volume data was we pulled from the database. + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + let mut update_needed = false; + let mut new_sv = Vec::new(); + for sv in sub_volumes { + match sv { + VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen, + } => { + update_needed = true; + new_sv.push( + VolumeConstructionRequest::Region { + block_size, + blocks_per_extent, + extent_count, + opts, + gen: gen + 1, + }, + ); + } + _ => { + new_sv.push(sv); + } } } - } - // Only update the volume data if we found the type - // of volume that needed it. - if update_needed { - // Create a new VCR and fill in the contents - // from what the original volume had, but with our - // updated sub_volume records. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes: new_sv, - read_only_parent, - }; - - let new_volume_data = serde_json::to_string( - &new_vcr, - ) - .map_err(|e| { - TxnError::CustomError( - VolumeGetError::SerdeError(e), - ) - })?; + // Only update the volume data if we found the type + // of volume that needed it. + if update_needed { + // Create a new VCR and fill in the contents + // from what the original volume had, but with our + // updated sub_volume records. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes: new_sv, + read_only_parent, + }; - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; + let new_volume_data = serde_json::to_string( + &new_vcr, + ) + .map_err(|e| { + err.set(VolumeGetError::SerdeError(e)).unwrap(); + DieselError::RollbackTransaction + })?; - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(TxnError::CustomError( - VolumeGetError::UnexpectedDatabaseUpdate( - num_updated, - 1, - ), - )); + // Update the original volume_id with the new + // volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) + .execute_async(&conn) + .await?; + + // This should update just one row. If it does + // not, then something is terribly wrong in the + // database. + if num_updated != 1 { + err.set( + VolumeGetError::UnexpectedDatabaseUpdate( + num_updated, + 1, + ), + ).unwrap(); + return Err(DieselError::RollbackTransaction); + } } } + VolumeConstructionRequest::Region { + block_size: _, + blocks_per_extent: _, + extent_count: _, + opts: _, + gen: _, + } => { + // We don't support a pure Region VCR at the volume + // level in the database, so this choice should + // never be encountered, but I want to know if it is. + panic!("Region not supported as a top level volume"); + } + VolumeConstructionRequest::File { + id: _, + block_size: _, + path: _, + } + | VolumeConstructionRequest::Url { + id: _, + block_size: _, + url: _, + } => {} } - VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _, - } => { - // We don't support a pure Region VCR at the volume - // level in the database, so this choice should - // never be encountered, but I want to know if it is. - panic!("Region not supported as a top level volume"); - } - VolumeConstructionRequest::File { - id: _, - block_size: _, - path: _, - } - | VolumeConstructionRequest::Url { - id: _, - block_size: _, - url: _, - } => {} + Ok(volume) } - Ok(volume) - }) + }, retry_helper.as_callback()) .await - .map_err(|e| match e { - TxnError::CustomError(VolumeGetError::DieselError(e)) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - - _ => { - Error::internal_error(&format!("Transaction error: {}", e)) + .map_err(|e| { + if let Some(err) = err.get() { + return Error::internal_error(&format!("Transaction error: {}", err)); } + public_error_from_diesel(e, ErrorHandler::Server) }) } @@ -637,16 +655,12 @@ impl DataStore { ) -> Result { #[derive(Debug, thiserror::Error)] enum RemoveReadOnlyParentError { - #[error("Error removing read only parent: {0}")] - DieselError(#[from] diesel::result::Error), - #[error("Serde error removing read only parent: {0}")] SerdeError(#[from] serde_json::Error), #[error("Updated {0} database rows, expected {1}")] UnexpectedDatabaseUpdate(usize, usize), } - type TxnError = TransactionError; // In this single transaction: // - Get the given volume from the volume_id from the database @@ -662,170 +676,169 @@ impl DataStore { // data from original volume_id. // - Put the new temp VCR into the temp volume.data, update the // temp_volume in the database. + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "volume_remove_rop", + ); self.pool_connection_unauthorized() .await? - .transaction_async(|conn| async move { - // Grab the volume in question. If the volume record was already - // deleted then we can just return. - let volume = { - use db::schema::volume::dsl; - - let volume = dsl::volume - .filter(dsl::id.eq(volume_id)) - .select(Volume::as_select()) - .get_result_async(&conn) - .await - .optional()?; - - let volume = if let Some(v) = volume { - v - } else { - // the volume does not exist, nothing to do. - return Ok(false); + .transaction_async_with_retry(|conn| { + let err = err.clone(); + async move { + // Grab the volume in question. If the volume record was already + // deleted then we can just return. + let volume = { + use db::schema::volume::dsl; + + let volume = dsl::volume + .filter(dsl::id.eq(volume_id)) + .select(Volume::as_select()) + .get_result_async(&conn) + .await + .optional()?; + + let volume = if let Some(v) = volume { + v + } else { + // the volume does not exist, nothing to do. + return Ok(false); + }; + + if volume.time_deleted.is_some() { + // this volume is deleted, so let whatever is deleting + // it clean it up. + return Ok(false); + } else { + // A volume record exists, and was not deleted, we + // can attempt to remove its read_only_parent. + volume + } }; - if volume.time_deleted.is_some() { - // this volume is deleted, so let whatever is deleting - // it clean it up. - return Ok(false); - } else { - // A volume record exists, and was not deleted, we - // can attempt to remove its read_only_parent. - volume - } - }; - - // If a read_only_parent exists, remove it from volume_id, and - // attach it to temp_volume_id. - let vcr: VolumeConstructionRequest = - serde_json::from_str( - volume.data() - ) - .map_err(|e| { - TxnError::CustomError( - RemoveReadOnlyParentError::SerdeError( - e, - ), + // If a read_only_parent exists, remove it from volume_id, and + // attach it to temp_volume_id. + let vcr: VolumeConstructionRequest = + serde_json::from_str( + volume.data() ) - })?; - - match vcr { - VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent, - } => { - if read_only_parent.is_none() { - // This volume has no read_only_parent - Ok(false) - } else { - // Create a new VCR and fill in the contents - // from what the original volume had. - let new_vcr = VolumeConstructionRequest::Volume { - id, - block_size, - sub_volumes, - read_only_parent: None, - }; - - let new_volume_data = - serde_json::to_string( - &new_vcr + .map_err(|e| { + err.set( + RemoveReadOnlyParentError::SerdeError( + e, ) - .map_err(|e| { - TxnError::CustomError( - RemoveReadOnlyParentError::SerdeError( - e, - ), - ) - })?; - - // Update the original volume_id with the new - // volume.data. - use db::schema::volume::dsl as volume_dsl; - let num_updated = diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(volume_id)) - .set(volume_dsl::data.eq(new_volume_data)) - .execute_async(&conn) - .await?; + ).unwrap(); + DieselError::RollbackTransaction + })?; - // This should update just one row. If it does - // not, then something is terribly wrong in the - // database. - if num_updated != 1 { - return Err(TxnError::CustomError( - RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1), - )); - } + match vcr { + VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent, + } => { + if read_only_parent.is_none() { + // This volume has no read_only_parent + Ok(false) + } else { + // Create a new VCR and fill in the contents + // from what the original volume had. + let new_vcr = VolumeConstructionRequest::Volume { + id, + block_size, + sub_volumes, + read_only_parent: None, + }; - // Make a new VCR, with the information from - // our temp_volume_id, but the read_only_parent - // from the original volume. - let rop_vcr = VolumeConstructionRequest::Volume { - id: temp_volume_id, - block_size, - sub_volumes: vec![], - read_only_parent, - }; - let rop_volume_data = - serde_json::to_string( - &rop_vcr - ) - .map_err(|e| { - TxnError::CustomError( - RemoveReadOnlyParentError::SerdeError( - e, - ), + let new_volume_data = + serde_json::to_string( + &new_vcr ) - })?; - // Update the temp_volume_id with the volume - // data that contains the read_only_parent. - let num_updated = - diesel::update(volume_dsl::volume) - .filter(volume_dsl::id.eq(temp_volume_id)) - .filter(volume_dsl::time_deleted.is_null()) - .set(volume_dsl::data.eq(rop_volume_data)) + .map_err(|e| { + err.set(RemoveReadOnlyParentError::SerdeError( + e, + )).unwrap(); + DieselError::RollbackTransaction + })?; + + // Update the original volume_id with the new + // volume.data. + use db::schema::volume::dsl as volume_dsl; + let num_updated = diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(volume_id)) + .set(volume_dsl::data.eq(new_volume_data)) .execute_async(&conn) .await?; - if num_updated != 1 { - return Err(TxnError::CustomError( - RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1), - )); + + // This should update just one row. If it does + // not, then something is terribly wrong in the + // database. + if num_updated != 1 { + err.set(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1)).unwrap(); + return Err(DieselError::RollbackTransaction); + } + + // Make a new VCR, with the information from + // our temp_volume_id, but the read_only_parent + // from the original volume. + let rop_vcr = VolumeConstructionRequest::Volume { + id: temp_volume_id, + block_size, + sub_volumes: vec![], + read_only_parent, + }; + let rop_volume_data = + serde_json::to_string( + &rop_vcr + ) + .map_err(|e| { + err.set(RemoveReadOnlyParentError::SerdeError( + e, + )).unwrap(); + DieselError::RollbackTransaction + })?; + // Update the temp_volume_id with the volume + // data that contains the read_only_parent. + let num_updated = + diesel::update(volume_dsl::volume) + .filter(volume_dsl::id.eq(temp_volume_id)) + .filter(volume_dsl::time_deleted.is_null()) + .set(volume_dsl::data.eq(rop_volume_data)) + .execute_async(&conn) + .await?; + if num_updated != 1 { + err.set(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1)).unwrap(); + return Err(DieselError::RollbackTransaction); + } + Ok(true) } - Ok(true) } - } - VolumeConstructionRequest::File { id: _, block_size: _, path: _ } - | VolumeConstructionRequest::Region { - block_size: _, - blocks_per_extent: _, - extent_count: _, - opts: _, - gen: _ } - | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => { - // Volume has a format that does not contain ROPs - Ok(false) + VolumeConstructionRequest::File { id: _, block_size: _, path: _ } + | VolumeConstructionRequest::Region { + block_size: _, + blocks_per_extent: _, + extent_count: _, + opts: _, + gen: _ } + | VolumeConstructionRequest::Url { id: _, block_size: _, url: _ } => { + // Volume has a format that does not contain ROPs + Ok(false) + } } } - }) + }, retry_helper.as_callback()) .await - .map_err(|e| match e { - TxnError::CustomError( - RemoveReadOnlyParentError::DieselError(e), - ) => public_error_from_diesel( - e, - ErrorHandler::Server, - ), - - _ => { - Error::internal_error(&format!("Transaction error: {}", e)) + .map_err(|e| { + if let Some(err) = err.get() { + return Error::internal_error(&format!("Transaction error: {}", err)); } + public_error_from_diesel(e, ErrorHandler::Server) }) } } -#[derive(Default, Debug, Serialize, Deserialize)] +#[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct CrucibleTargets { pub read_only_targets: Vec, } From a209651f43da56ddd346af3e2c071a31a3fd16fc Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 20:06:19 -0800 Subject: [PATCH 10/25] switch_interface --- .../src/db/datastore/switch_interface.rs | 164 ++++++++++-------- 1 file changed, 90 insertions(+), 74 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/switch_interface.rs b/nexus/db-queries/src/db/datastore/switch_interface.rs index 88cff50471..a671de2358 100644 --- a/nexus/db-queries/src/db/datastore/switch_interface.rs +++ b/nexus/db-queries/src/db/datastore/switch_interface.rs @@ -11,9 +11,9 @@ use crate::db::datastore::address_lot::{ }; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::LoopbackAddress; use crate::db::pagination::paginated; +use crate::transaction_retry::RetryHelper; use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; use diesel::result::Error as DieselError; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; @@ -23,6 +23,7 @@ use omicron_common::api::external::{ CreateResult, DataPageParams, DeleteResult, Error, ListResultVec, LookupResult, ResourceType, }; +use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -40,79 +41,87 @@ impl DataStore { ReserveBlock(ReserveBlockError), } - type TxnError = TransactionError; - let conn = self.pool_connection_authorized(opctx).await?; let inet = IpNetwork::new(params.address, params.mask) .map_err(|_| Error::invalid_request("invalid address"))?; + let err = Arc::new(OnceLock::new()); + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "loopback_address_create", + ); + // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - let lot_id = authz_address_lot.id(); - let (block, rsvd_block) = - crate::db::datastore::address_lot::try_reserve_block( - lot_id, - inet.ip().into(), - params.anycast, - &conn, - ) - .await - .map_err(|e| match e { - ReserveBlockTxnError::CustomError(err) => { - TxnError::CustomError( - LoopbackAddressCreateError::ReserveBlock(err), + conn.transaction_async_with_retry( + |conn| { + let err = err.clone(); + async move { + let lot_id = authz_address_lot.id(); + let (block, rsvd_block) = + crate::db::datastore::address_lot::try_reserve_block( + lot_id, + inet.ip().into(), + params.anycast, + &conn, ) - } - ReserveBlockTxnError::Database(err) => { - TxnError::Database(err) - } - })?; - - // Address block reserved, now create the loopback address. - - let addr = LoopbackAddress::new( - id, - block.id, - rsvd_block.id, - params.rack_id, - params.switch_location.to_string(), - inet, - params.anycast, - ); - - let db_addr: LoopbackAddress = - diesel::insert_into(dsl::loopback_address) - .values(addr) - .returning(LoopbackAddress::as_returning()) - .get_result_async(&conn) - .await?; - - Ok(db_addr) - }) + .await + .map_err(|e| match e { + ReserveBlockTxnError::CustomError(e) => { + err.set( + LoopbackAddressCreateError::ReserveBlock(e), + ) + .unwrap(); + DieselError::RollbackTransaction + } + ReserveBlockTxnError::Database(e) => e, + })?; + + // Address block reserved, now create the loopback address. + + let addr = LoopbackAddress::new( + id, + block.id, + rsvd_block.id, + params.rack_id, + params.switch_location.to_string(), + inet, + params.anycast, + ); + + let db_addr: LoopbackAddress = + diesel::insert_into(dsl::loopback_address) + .values(addr) + .returning(LoopbackAddress::as_returning()) + .get_result_async(&conn) + .await?; + + Ok(db_addr) + } + }, + retry_helper.as_callback(), + ) .await - .map_err(|e| match e { - TxnError::CustomError( - LoopbackAddressCreateError::ReserveBlock( - ReserveBlockError::AddressUnavailable, - ), - ) => Error::invalid_request("address unavailable"), - TxnError::CustomError( - LoopbackAddressCreateError::ReserveBlock( - ReserveBlockError::AddressNotInLot, - ), - ) => Error::invalid_request("address not in lot"), - TxnError::Database(e) => match e { - DieselError::DatabaseError(_, _) => public_error_from_diesel( + .map_err(|e| { + if let Some(err) = err.get() { + match err { + LoopbackAddressCreateError::ReserveBlock( + ReserveBlockError::AddressUnavailable, + ) => Error::invalid_request("address unavailable"), + LoopbackAddressCreateError::ReserveBlock( + ReserveBlockError::AddressNotInLot, + ) => Error::invalid_request("address not in lot"), + } + } else { + public_error_from_diesel( e, ErrorHandler::Conflict( ResourceType::LoopbackAddress, &format!("lo {}", inet), ), - ), - _ => public_error_from_diesel(e, ErrorHandler::Server), - }, + ) + } }) } @@ -130,20 +139,27 @@ impl DataStore { // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async(|conn| async move { - let la = diesel::delete(dsl::loopback_address) - .filter(dsl::id.eq(id)) - .returning(LoopbackAddress::as_returning()) - .get_result_async(&conn) - .await?; - - diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) - .filter(rsvd_block_dsl::id.eq(la.rsvd_address_lot_block_id)) - .execute_async(&conn) - .await?; - - Ok(()) - }) + let retry_helper = RetryHelper::new( + &self.transaction_retry_producer, + "loopback_address_delete", + ); + conn.transaction_async_with_retry( + |conn| async move { + let la = diesel::delete(dsl::loopback_address) + .filter(dsl::id.eq(id)) + .returning(LoopbackAddress::as_returning()) + .get_result_async(&conn) + .await?; + + diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) + .filter(rsvd_block_dsl::id.eq(la.rsvd_address_lot_block_id)) + .execute_async(&conn) + .await?; + + Ok(()) + }, + retry_helper.as_callback(), + ) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } From dbec9aa7e2c342866a89f6abd974421977054d06 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 20:08:09 -0800 Subject: [PATCH 11/25] Point to branch --- Cargo.lock | 1 + Cargo.toml | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index fc6f8ea373..13844c61fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -287,6 +287,7 @@ checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" [[package]] name = "async-bb8-diesel" version = "0.1.0" +source = "git+https://github.com/oxidecomputer/async-bb8-diesel?branch=txn-retry#14bd98be8c76b36cc04464f06ffa5894b7240ab5" dependencies = [ "async-trait", "bb8", diff --git a/Cargo.toml b/Cargo.toml index 5f3963d92e..b4dbbd7757 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -140,8 +140,9 @@ api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" assert_cmd = "2.0.12" +# TODO: Patch before merging # async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "1446f7e0c1f05f33a0581abd51fa873c7652ab61" } -async-bb8-diesel = { path = "../../async-bb8-diesel" } +async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", branch = "txn-retry" } async-trait = "0.1.74" atomicwrites = "0.4.2" authz-macros = { path = "nexus/authz-macros" } From b319f1c6495655abdc3a0a14de9d28cd97c6fea5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 20:53:17 -0800 Subject: [PATCH 12/25] Catch one more possibly retryable error --- nexus/db-queries/src/db/datastore/switch_port.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index 59238dfb1c..46b710c028 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -326,7 +326,10 @@ impl DataStore { .limit(1) .first_async::(&conn) .await - .map_err(|_| { + .map_err(|e| { + if retryable(&e) { + return e; + } err.set(SwitchPortSettingsCreateError::BgpConfigNotFound).unwrap(); DieselError::RollbackTransaction })? From aaeb8ab9a7afb54ccc46dc2165f24f26c13a81c6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Wed, 15 Nov 2023 21:03:21 -0800 Subject: [PATCH 13/25] Not sure how it worked without that, but added my rand dep --- Cargo.lock | 1 + nexus/db-queries/Cargo.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 6c1716b005..1b8eb44c3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4021,6 +4021,7 @@ dependencies = [ "pem 1.1.1", "petgraph", "pq-sys", + "rand 0.8.5", "rcgen", "ref-cast", "regex", diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml index b1b8f3b28f..0ad33493e4 100644 --- a/nexus/db-queries/Cargo.toml +++ b/nexus/db-queries/Cargo.toml @@ -32,6 +32,7 @@ oso.workspace = true paste.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. pq-sys = "*" +rand.workspace = true ref-cast.workspace = true samael.workspace = true serde.workspace = true From 4625d8a925351c27da2cc632b5968936b86aab0c Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 16 Nov 2023 17:16:54 -0800 Subject: [PATCH 14/25] review feedback --- nexus/db-queries/src/transaction_retry.rs | 53 +++++++++++++++-------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs index 707972e6a3..5203e32f26 100644 --- a/nexus/db-queries/src/transaction_retry.rs +++ b/nexus/db-queries/src/transaction_retry.rs @@ -6,12 +6,13 @@ use chrono::Utc; use oximeter::{types::Sample, Metric, MetricsError, Target}; +use rand::{thread_rng, Rng}; use std::sync::{Arc, Mutex}; use std::time::Duration; // Identifies "which" transaction is retrying #[derive(Debug, Clone, Target)] -struct Transaction { +struct DatabaseTransaction { name: String, } @@ -22,6 +23,7 @@ struct Transaction { struct RetryData { #[datum] latency: f64, + attempt: u32, } // Collects all transaction retry samples @@ -37,14 +39,10 @@ impl Producer { fn append( &self, - transaction: &Transaction, - latency: Duration, + transaction: &DatabaseTransaction, + data: &RetryData, ) -> Result<(), MetricsError> { - let sample = Sample::new_with_timestamp( - Utc::now(), - transaction, - &RetryData { latency: latency.as_secs_f64() }, - )?; + let sample = Sample::new_with_timestamp(Utc::now(), transaction, data)?; self.samples.lock().unwrap().push(sample); Ok(()) } @@ -52,7 +50,7 @@ impl Producer { struct RetryHelperInner { start: chrono::DateTime, - attempts: usize, + attempts: u32, } impl RetryHelperInner { @@ -79,7 +77,9 @@ pub(crate) struct RetryHelper { inner: Mutex, } -const MAX_RETRY_ATTEMPTS: usize = 10; +const MIN_RETRY_BACKOFF: Duration = Duration::from_millis(0); +const MAX_RETRY_BACKOFF: Duration = Duration::from_millis(50); +const MAX_RETRY_ATTEMPTS: u32 = 10; impl RetryHelper { /// Creates a new RetryHelper, and starts a timer tracking the transaction @@ -96,14 +96,24 @@ impl RetryHelper { // // This function: // - Appends a metric identifying the duration of the transaction operation - // - Performs a randomly generated backoff (limited to less than 50 ms) + // - Performs a random (uniform) backoff (limited to less than 50 ms) // - Returns "true" if the transaction should be restarted async fn retry_callback(&self) -> bool { - let inner = self.inner.lock().unwrap().tick(); + // Look at the current attempt and start time so we can log this + // information before we start sleeping. + let (start, attempt) = { + let inner = self.inner.lock().unwrap(); + (inner.start, inner.attempts) + }; + + let latency = (Utc::now() - start) + .to_std() + .unwrap_or(Duration::ZERO) + .as_secs_f64(); let _ = self.producer.append( - &Transaction { name: self.name.into() }, - (Utc::now() - inner.start).to_std().unwrap_or(Duration::ZERO), + &DatabaseTransaction { name: self.name.into() }, + &RetryData { latency, attempt }, ); // This backoff is not exponential, but I'm not sure we actually want @@ -111,9 +121,15 @@ impl RetryHelper { // probably be better to fail the operation, at which point Oximeter // will keep track of the failing transaction and identify that it's a // high-priority target for CTE conversion. - tokio::time::sleep(Duration::from_millis(rand::random::() % 50)) - .await; - + let duration = { + let mut rng = thread_rng(); + rng.gen_range(MIN_RETRY_BACKOFF..MAX_RETRY_BACKOFF) + }; + tokio::time::sleep(duration).await; + + // Now that we've finished sleeping, reset the timer and bump the number + // of attempts we've tried. + let inner = self.inner.lock().unwrap().tick(); return inner.attempts < MAX_RETRY_ATTEMPTS; } @@ -134,8 +150,7 @@ impl oximeter::Producer for Producer { fn produce( &mut self, ) -> Result + 'static>, MetricsError> { - let samples = - std::mem::replace(&mut *self.samples.lock().unwrap(), vec![]); + let samples = std::mem::take(&mut *self.samples.lock().unwrap()); Ok(Box::new(samples.into_iter())) } } From 761e046ce5a788813d052c835dd6b9fffa1a49fe Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 20 Nov 2023 14:36:32 -0800 Subject: [PATCH 15/25] Add tests for transaction_retry producing samples --- nexus/db-queries/src/db/datastore/mod.rs | 7 ++ nexus/db-queries/src/transaction_retry.rs | 112 ++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index d86b310119..228435b6c6 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -216,6 +216,13 @@ impl DataStore { .unwrap(); } + #[cfg(test)] + pub(crate) fn transaction_retry_producer( + &self, + ) -> &crate::transaction_retry::Producer { + &self.transaction_retry_producer + } + /// Returns a connection to a connection from the database connection pool. pub(super) async fn pool_connection_authorized( &self, diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs index 5203e32f26..bbdeb7fc91 100644 --- a/nexus/db-queries/src/transaction_retry.rs +++ b/nexus/db-queries/src/transaction_retry.rs @@ -154,3 +154,115 @@ impl oximeter::Producer for Producer { Ok(Box::new(samples.into_iter())) } } + +#[cfg(test)] +mod test { + use super::*; + + use crate::db::datastore::datastore_test; + use async_bb8_diesel::AsyncConnection; + use nexus_test_utils::db::test_setup_database; + use omicron_test_utils::dev; + use oximeter::types::FieldValue; + + // If a transaction is explicitly rolled back, we should not expect any + // samples to be taken. With no retries, this is just a normal operation + // failure. + #[tokio::test] + async fn test_transaction_rollback_produces_no_samples() { + let logctx = dev::test_setup_log( + "test_transaction_rollback_produces_no_samples", + ); + let mut db = test_setup_database(&logctx.log).await; + let (_opctx, datastore) = datastore_test(&logctx, &db).await; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + let retry_helper = RetryHelper::new( + datastore.transaction_retry_producer(), + "test_transaction_rollback_produces_no_samples", + ); + conn.transaction_async_with_retry( + |_conn| async move { + Err::<(), _>(diesel::result::Error::RollbackTransaction) + }, + retry_helper.as_callback(), + ) + .await + .expect_err("Should have failed"); + + let samples = datastore + .transaction_retry_producer() + .samples + .lock() + .unwrap() + .clone(); + assert_eq!(samples, vec![]); + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } + + // If a transaction fails with a retryable error, we record samples, + // providing oximeter-level information about the attempts. + #[tokio::test] + async fn test_transaction_retry_produces_samples() { + let logctx = + dev::test_setup_log("test_transaction_retry_produces_samples"); + let mut db = test_setup_database(&logctx.log).await; + let (_opctx, datastore) = datastore_test(&logctx, &db).await; + + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + let retry_helper = RetryHelper::new( + datastore.transaction_retry_producer(), + "test_transaction_retry_produces_samples", + ); + conn.transaction_async_with_retry( + |_conn| async move { + Err::<(), _>(diesel::result::Error::DatabaseError( + diesel::result::DatabaseErrorKind::SerializationFailure, + Box::new("restart transaction: Retry forever!".to_string()), + )) + }, + retry_helper.as_callback(), + ) + .await + .expect_err("Should have failed"); + + let samples = datastore + .transaction_retry_producer() + .samples + .lock() + .unwrap() + .clone(); + assert_eq!(samples.len(), MAX_RETRY_ATTEMPTS as usize); + + for i in 0..samples.len() { + let sample = &samples[i]; + + assert_eq!( + sample.timeseries_name, + "database_transaction:retry_data" + ); + + let target_fields = sample.sorted_target_fields(); + assert_eq!( + target_fields["name"].value, + FieldValue::String( + "test_transaction_retry_produces_samples".to_string() + ) + ); + + // Attempts are one-indexed + let metric_fields = sample.sorted_metric_fields(); + assert_eq!( + metric_fields["attempt"].value, + FieldValue::U32(u32::try_from(i).unwrap() + 1), + ); + } + + db.cleanup().await.unwrap(); + logctx.cleanup_successful(); + } +} From 180c872baf2c8855ff59c9b929eea4ea14262585 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 Dec 2023 11:09:34 -0800 Subject: [PATCH 16/25] Propagate more retry errors --- nexus/db-queries/src/db/datastore/dns.rs | 55 +++---- .../src/db/datastore/external_ip.rs | 24 ++- nexus/db-queries/src/db/datastore/rack.rs | 76 ++++++--- nexus/db-queries/src/db/datastore/service.rs | 40 +++-- nexus/db-queries/src/db/datastore/silo.rs | 149 +++++++++--------- nexus/db-queries/src/db/error.rs | 34 ++-- .../src/db/queries/network_interface.rs | 15 +- 7 files changed, 227 insertions(+), 166 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/dns.rs b/nexus/db-queries/src/db/datastore/dns.rs index 5ecc2ae63e..19b8b36102 100644 --- a/nexus/db-queries/src/db/datastore/dns.rs +++ b/nexus/db-queries/src/db/datastore/dns.rs @@ -107,12 +107,14 @@ impl DataStore { opctx: &OpContext, dns_group: DnsGroup, ) -> LookupResult { - self.dns_group_latest_version_conn( - opctx, - &*self.pool_connection_authorized(opctx).await?, - dns_group, - ) - .await + let version = self + .dns_group_latest_version_conn( + opctx, + &*self.pool_connection_authorized(opctx).await?, + dns_group, + ) + .await?; + Ok(version) } pub async fn dns_group_latest_version_conn( @@ -120,7 +122,7 @@ impl DataStore { opctx: &OpContext, conn: &async_bb8_diesel::Connection, dns_group: DnsGroup, - ) -> LookupResult { + ) -> Result> { opctx.authorize(authz::Action::Read, &authz::DNS_CONFIG).await?; use db::schema::dns_version::dsl; let versions = dsl::dns_version @@ -129,8 +131,7 @@ impl DataStore { .limit(1) .select(DnsVersion::as_select()) .load_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .await?; bail_unless!( versions.len() == 1, @@ -378,28 +379,17 @@ impl DataStore { opctx: &OpContext, conn: &async_bb8_diesel::Connection, update: DnsVersionUpdateBuilder, - ) -> Result<(), Error> { + ) -> Result<(), TransactionError> { opctx.authorize(authz::Action::Modify, &authz::DNS_CONFIG).await?; let zones = self .dns_zones_list_all_on_connection(opctx, conn, update.dns_group) .await?; - let result = conn - .transaction_async(|c| async move { - self.dns_update_internal(opctx, &c, update, zones) - .await - .map_err(TransactionError::CustomError) - }) - .await; - - match result { - Ok(()) => Ok(()), - Err(TransactionError::CustomError(e)) => Err(e), - Err(TransactionError::Database(e)) => { - Err(public_error_from_diesel(e, ErrorHandler::Server)) - } - } + conn.transaction_async(|c| async move { + self.dns_update_internal(opctx, &c, update, zones).await + }) + .await } // This must only be used inside a transaction. Otherwise, it may make @@ -410,7 +400,7 @@ impl DataStore { conn: &async_bb8_diesel::Connection, update: DnsVersionUpdateBuilder, zones: Vec, - ) -> Result<(), Error> { + ) -> Result<(), TransactionError> { // TODO-scalability TODO-performance This would be much better as a CTE // for all the usual reasons described in RFD 192. Using an interactive // transaction here means that either we wind up holding database locks @@ -456,10 +446,7 @@ impl DataStore { diesel::insert_into(dsl::dns_version) .values(new_version) .execute_async(conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + .await?; } { @@ -481,8 +468,7 @@ impl DataStore { ) .set(dsl::version_removed.eq(new_version_num)) .execute_async(conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .await?; bail_unless!( nremoved == ntoremove, @@ -496,10 +482,7 @@ impl DataStore { let nadded = diesel::insert_into(dsl::dns_name) .values(new_names) .execute_async(conn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + .await?; bail_unless!( nadded == ntoadd, diff --git a/nexus/db-queries/src/db/datastore/external_ip.rs b/nexus/db-queries/src/db/datastore/external_ip.rs index e663130a84..4e34bfc15c 100644 --- a/nexus/db-queries/src/db/datastore/external_ip.rs +++ b/nexus/db-queries/src/db/datastore/external_ip.rs @@ -10,7 +10,9 @@ use crate::authz::ApiResource; use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; +use crate::db::error::TransactionError; use crate::db::lookup::LookupPath; use crate::db::model::ExternalIp; use crate::db::model::IncompleteExternalIp; @@ -132,7 +134,8 @@ impl DataStore { data: IncompleteExternalIp, ) -> CreateResult { let conn = self.pool_connection_authorized(opctx).await?; - Self::allocate_external_ip_on_connection(&conn, data).await + let ip = Self::allocate_external_ip_on_connection(&conn, data).await?; + Ok(ip) } /// Variant of [Self::allocate_external_ip] which may be called from a @@ -140,23 +143,30 @@ impl DataStore { pub(crate) async fn allocate_external_ip_on_connection( conn: &async_bb8_diesel::Connection, data: IncompleteExternalIp, - ) -> CreateResult { + ) -> Result> { let explicit_ip = data.explicit_ip().is_some(); NextExternalIp::new(data).get_result_async(conn).await.map_err(|e| { use diesel::result::Error::NotFound; match e { NotFound => { if explicit_ip { - Error::invalid_request( + TransactionError::CustomError(Error::invalid_request( "Requested external IP address not available", - ) + )) } else { - Error::invalid_request( + TransactionError::CustomError(Error::invalid_request( "No external IP addresses available", - ) + )) + } + } + _ => { + if retryable(&e) { + return TransactionError::Database(e); } + TransactionError::CustomError( + crate::db::queries::external_ip::from_diesel(e), + ) } - _ => crate::db::queries::external_ip::from_diesel(e), } }) } diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index 944d469b7d..a7406db568 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -13,8 +13,9 @@ use crate::db; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; +use crate::db::error::MaybeRetryable::*; use crate::db::fixed_data::silo::INTERNAL_SILO_ID; use crate::db::fixed_data::vpc_subnet::DNS_VPC_SUBNET; use crate::db::fixed_data::vpc_subnet::NEXUS_VPC_SUBNET; @@ -90,8 +91,23 @@ enum RackInitError { DnsSerialization(Error), Silo(Error), RoleAssignment(Error), + // Retryable database error + Retryable(DieselError), + // Other non-retryable database error + Database(DieselError), +} + +// Catch-all for Diesel error conversion into RackInitError, which +// can also label errors as retryable. +impl From for RackInitError { + fn from(e: DieselError) -> Self { + if retryable(&e) { + Self::Retryable(e) + } else { + Self::Database(e) + } + } } -type TxnError = TransactionError; impl From for Error { fn from(e: RackInitError) -> Self { @@ -129,6 +145,14 @@ impl From for Error { RackInitError::RoleAssignment(err) => Error::internal_error( &format!("failed to assign role to initial user: {:#}", err), ), + RackInitError::Retryable(err) => Error::internal_error(&format!( + "failed operation due to database contention: {:#}", + err + )), + RackInitError::Database(err) => Error::internal_error(&format!( + "failed operation due to database error: {:#}", + err + )), } } } @@ -320,9 +344,6 @@ impl DataStore { Ok(()) } - // The following methods which return a `TxnError` take a `conn` parameter - // which comes from the transaction created in `rack_set_initialized`. - #[allow(clippy::too_many_arguments)] async fn rack_create_recovery_silo( &self, @@ -334,7 +355,7 @@ impl DataStore { recovery_user_id: external_params::UserId, recovery_user_password_hash: omicron_passwords::PasswordHashString, dns_update: DnsVersionUpdateBuilder, - ) -> Result<(), TxnError> { + ) -> Result<(), RackInitError> { let db_silo = self .silo_create_conn( conn, @@ -345,8 +366,10 @@ impl DataStore { dns_update, ) .await - .map_err(RackInitError::Silo) - .map_err(TxnError::CustomError)?; + .map_err(|err| match err.retryable() { + NotRetryable(err) => RackInitError::Silo(err.into()), + Retryable(err) => RackInitError::Retryable(err), + })?; info!(log, "Created recovery silo"); // Create the first user in the initial Recovery Silo @@ -400,8 +423,7 @@ impl DataStore { }], ) .await - .map_err(RackInitError::RoleAssignment) - .map_err(TxnError::CustomError)?; + .map_err(RackInitError::RoleAssignment)?; debug!(log, "Generated role assignment queries"); q1.execute_async(conn).await?; @@ -427,9 +449,12 @@ impl DataStore { service.address, service.kind.clone().into(), ); - self.service_upsert_conn(conn, service_db) - .await - .map_err(|e| RackInitError::ServiceInsert(e))?; + self.service_upsert_conn(conn, service_db).await.map_err( + |e| match e.retryable() { + Retryable(e) => RackInitError::Retryable(e), + NotRetryable(e) => RackInitError::ServiceInsert(e.into()), + }, + )?; // For services with external connectivity, we record their // explicit IP allocation and create a service NIC as well. @@ -497,7 +522,10 @@ impl DataStore { IP address for {}", service.kind, ); - RackInitError::AddingIp(err) + match err.retryable() { + Retryable(e) => RackInitError::Retryable(e), + NotRetryable(e) => RackInitError::AddingIp(e.into()), + } })?; self.create_network_interface_raw_conn(conn, db_nic) @@ -510,6 +538,9 @@ impl DataStore { _, db::model::NetworkInterfaceKind::Service, ) => Ok(()), + InsertError::Retryable(err) => { + Err(RackInitError::Retryable(err)) + } _ => Err(RackInitError::AddingNic(e.into_external())), } })?; @@ -676,11 +707,11 @@ impl DataStore { ) .await .map_err(|e| match e { - TransactionError::CustomError(rack_init_err) => { - err.set(rack_init_err).unwrap(); + RackInitError::Retryable(e) => e, + _ => { + err.set(e).unwrap(); DieselError::RollbackTransaction }, - TransactionError::Database(e) => e, })?; let rack = diesel::update(rack_dsl::rack) @@ -693,6 +724,9 @@ impl DataStore { .get_result_async::(&conn) .await .map_err(|e| { + if retryable(&e) { + return e; + } err.set(RackInitError::RackUpdate { err: e, rack_id, @@ -805,12 +839,12 @@ impl DataStore { DnsGroup::External, ) .await - .map_err(|e| match e.take_retryable() { - Ok(e) => return e, - Err(e) => { - err.set(e).unwrap(); + .map_err(|e| match e.retryable() { + NotRetryable(not_retryable_err) => { + err.set(not_retryable_err).unwrap(); DieselError::RollbackTransaction } + Retryable(retryable_err) => retryable_err, })?; Ok((ips, dns_zones)) diff --git a/nexus/db-queries/src/db/datastore/service.rs b/nexus/db-queries/src/db/datastore/service.rs index 40bf250abe..df7ed27a6d 100644 --- a/nexus/db-queries/src/db/datastore/service.rs +++ b/nexus/db-queries/src/db/datastore/service.rs @@ -11,7 +11,9 @@ use crate::db; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; +use crate::db::error::TransactionError; use crate::db::identity::Asset; use crate::db::model::Service; use crate::db::model::Sled; @@ -38,7 +40,12 @@ impl DataStore { service: Service, ) -> CreateResult { let conn = self.pool_connection_authorized(opctx).await?; - self.service_upsert_conn(&conn, service).await + self.service_upsert_conn(&conn, service).await.map_err(|e| match e { + TransactionError::CustomError(err) => err, + TransactionError::Database(err) => { + public_error_from_diesel(err, ErrorHandler::Server) + } + }) } /// Stores a new service in the database (using an existing db connection). @@ -46,7 +53,7 @@ impl DataStore { &self, conn: &async_bb8_diesel::Connection, service: Service, - ) -> CreateResult { + ) -> Result> { use db::schema::service::dsl; let service_id = service.id(); @@ -68,17 +75,24 @@ impl DataStore { .insert_and_get_result_async(conn) .await .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => Error::ObjectNotFound { - type_name: ResourceType::Sled, - lookup_type: LookupType::ById(sled_id), - }, - AsyncInsertError::DatabaseError(e) => public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Service, - &service_id.to_string(), - ), - ), + AsyncInsertError::CollectionNotFound => { + TransactionError::CustomError(Error::ObjectNotFound { + type_name: ResourceType::Sled, + lookup_type: LookupType::ById(sled_id), + }) + } + AsyncInsertError::DatabaseError(e) => { + if retryable(&e) { + return TransactionError::Database(e); + } + TransactionError::CustomError(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Service, + &service_id.to_string(), + ), + )) + } }) } diff --git a/nexus/db-queries/src/db/datastore/silo.rs b/nexus/db-queries/src/db/datastore/silo.rs index 4e75a9fbc7..ab48ec458f 100644 --- a/nexus/db-queries/src/db/datastore/silo.rs +++ b/nexus/db-queries/src/db/datastore/silo.rs @@ -11,6 +11,7 @@ use crate::context::OpContext; use crate::db; use crate::db::datastore::RunnableQuery; use crate::db::error::public_error_from_diesel; +use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::error::TransactionError; use crate::db::fixed_data::silo::{DEFAULT_SILO, INTERNAL_SILO}; @@ -123,15 +124,17 @@ impl DataStore { dns_update: DnsVersionUpdateBuilder, ) -> CreateResult { let conn = self.pool_connection_authorized(opctx).await?; - self.silo_create_conn( - &conn, - opctx, - nexus_opctx, - new_silo_params, - new_silo_dns_names, - dns_update, - ) - .await + let silo = self + .silo_create_conn( + &conn, + opctx, + nexus_opctx, + new_silo_params, + new_silo_dns_names, + dns_update, + ) + .await?; + Ok(silo) } pub async fn silo_create_conn( @@ -142,7 +145,7 @@ impl DataStore { new_silo_params: params::SiloCreate, new_silo_dns_names: &[String], dns_update: DnsVersionUpdateBuilder, - ) -> CreateResult { + ) -> Result> { let silo_id = Uuid::new_v4(); let silo_group_id = Uuid::new_v4(); @@ -199,71 +202,71 @@ impl DataStore { None }; - conn.transaction_async(|conn| async move { - let silo = silo_create_query - .get_result_async(&conn) - .await - .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Silo, - new_silo_params.identity.name.as_str(), - ), - ) - })?; - self.virtual_provisioning_collection_create_on_connection( - &conn, - VirtualProvisioningCollection::new( - silo.id(), - CollectionTypeProvisioned::Silo, - ), - ) - .await?; - - if let Some(query) = silo_admin_group_ensure_query { - query.get_result_async(&conn).await?; - } - - if let Some(queries) = silo_admin_group_role_assignment_queries { - let (delete_old_query, insert_new_query) = queries; - delete_old_query.execute_async(&conn).await?; - insert_new_query.execute_async(&conn).await?; - } - - let certificates = new_silo_params - .tls_certificates - .into_iter() - .map(|c| { - Certificate::new( + let silo = conn + .transaction_async(|conn| async move { + let silo = silo_create_query + .get_result_async(&conn) + .await + .map_err(|e| { + if retryable(&e) { + return TransactionError::Database(e); + } + TransactionError::CustomError(public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::Silo, + new_silo_params.identity.name.as_str(), + ), + )) + })?; + self.virtual_provisioning_collection_create_on_connection( + &conn, + VirtualProvisioningCollection::new( silo.id(), - Uuid::new_v4(), - ServiceKind::Nexus, - c, - new_silo_dns_names, - ) - }) - .collect::, _>>() - .map_err(Error::from)?; - { - use db::schema::certificate::dsl; - diesel::insert_into(dsl::certificate) - .values(certificates) - .execute_async(&conn) - .await?; - } - - self.dns_update(nexus_opctx, &conn, dns_update).await?; + CollectionTypeProvisioned::Silo, + ), + ) + .await?; - Ok(silo) - }) - .await - .map_err(|e| match e { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + if let Some(query) = silo_admin_group_ensure_query { + query.get_result_async(&conn).await?; + } + + if let Some(queries) = silo_admin_group_role_assignment_queries + { + let (delete_old_query, insert_new_query) = queries; + delete_old_query.execute_async(&conn).await?; + insert_new_query.execute_async(&conn).await?; + } + + let certificates = new_silo_params + .tls_certificates + .into_iter() + .map(|c| { + Certificate::new( + silo.id(), + Uuid::new_v4(), + ServiceKind::Nexus, + c, + new_silo_dns_names, + ) + }) + .collect::, _>>() + .map_err(Error::from)?; + { + use db::schema::certificate::dsl; + diesel::insert_into(dsl::certificate) + .values(certificates) + .execute_async(&conn) + .await?; + } + + self.dns_update(nexus_opctx, &conn, dns_update).await?; + + Ok::>(silo) + }) + .await?; + Ok(silo) } pub async fn silos_list_by_id( diff --git a/nexus/db-queries/src/db/error.rs b/nexus/db-queries/src/db/error.rs index ec0afcc8c0..fc7f30da93 100644 --- a/nexus/db-queries/src/db/error.rs +++ b/nexus/db-queries/src/db/error.rs @@ -17,7 +17,7 @@ pub enum TransactionError { /// The customizable error type. /// /// This error should be used for all non-Diesel transaction failures. - #[error("Custom transaction error; {0}")] + #[error("Custom transaction error: {0}")] CustomError(T), /// The Diesel error type. @@ -43,21 +43,29 @@ pub fn retryable(error: &DieselError) -> bool { } } -impl TransactionError { - /// If this error is a retryable DieselError, unpack it as Ok(err). +/// Identifies if the error is retryable or not. +pub enum MaybeRetryable { + /// The error isn't retryable. + NotRetryable(T), + /// The error is retryable. + Retryable(DieselError), +} + +impl TransactionError { + /// Identifies that the error could be returned from a Diesel transaction. /// - /// Otherwise, return Self as an error, so the error still can be used - /// in other cases. - pub fn take_retryable(self) -> Result { + /// Allows callers to propagate arbitrary errors out of transaction contexts + /// without losing information that might be valuable to the calling context, + /// such as "does this particular error indicate that the entire transaction + /// should retry?". + pub fn retryable(self) -> MaybeRetryable { + use MaybeRetryable::*; + match self { - TransactionError::CustomError(_) => Err(self), - TransactionError::Database(err) => { - if retryable(&err) { - Ok(err) - } else { - Err(TransactionError::Database(err)) - } + TransactionError::Database(err) if retryable(&err) => { + Retryable(err) } + _ => NotRetryable(self), } } } diff --git a/nexus/db-queries/src/db/queries/network_interface.rs b/nexus/db-queries/src/db/queries/network_interface.rs index 84a81a7b7a..1dbe57da6f 100644 --- a/nexus/db-queries/src/db/queries/network_interface.rs +++ b/nexus/db-queries/src/db/queries/network_interface.rs @@ -5,6 +5,7 @@ //! Queries for inserting and deleting network interfaces. use crate::db; +use crate::db::error::{public_error_from_diesel, retryable, ErrorHandler}; use crate::db::model::IncompleteNetworkInterface; use crate::db::pool::DbConnection; use crate::db::queries::next_item::DefaultShiftGenerator; @@ -120,6 +121,8 @@ pub enum InsertError { InstanceMustBeStopped(Uuid), /// The instance does not exist at all, or is in the destroyed state. InstanceNotFound(Uuid), + /// The operation occurred within a transaction, and is retryable + Retryable(DieselError), /// Any other error External(external::Error), } @@ -135,7 +138,6 @@ impl InsertError { e: DieselError, interface: &IncompleteNetworkInterface, ) -> Self { - use crate::db::error; match e { // Catch the specific errors designed to communicate the failures we // want to distinguish @@ -143,9 +145,9 @@ impl InsertError { decode_database_error(e, interface) } // Any other error at all is a bug - _ => InsertError::External(error::public_error_from_diesel( + _ => InsertError::External(public_error_from_diesel( e, - error::ErrorHandler::Server, + ErrorHandler::Server, )), } } @@ -209,6 +211,9 @@ impl InsertError { InsertError::InstanceNotFound(id) => { external::Error::not_found_by_id(external::ResourceType::Instance, &id) } + InsertError::Retryable(err) => { + public_error_from_diesel(err, ErrorHandler::Server) + } InsertError::External(e) => e, } } @@ -290,6 +295,10 @@ fn decode_database_error( r#"uuid: incorrect UUID length: non-unique-subnets"#, ); + if retryable(&err) { + return InsertError::Retryable(err); + } + match err { // If the address allocation subquery fails, we'll attempt to insert // NULL for the `ip` column. This checks that the non-NULL constraint on From da330175cf9a28642edb12fe7c4761f9227631ab Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 Dec 2023 14:35:55 -0800 Subject: [PATCH 17/25] Start refactoring RetryHelper to help callsites become simpler --- .../src/db/datastore/address_lot.rs | 76 +- nexus/db-queries/src/db/datastore/mod.rs | 14 + .../src/db/datastore/switch_port.rs | 1096 ++++++++--------- nexus/db-queries/src/db/mod.rs | 4 +- nexus/db-queries/src/transaction_retry.rs | 81 ++ 5 files changed, 673 insertions(+), 598 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/address_lot.rs b/nexus/db-queries/src/db/datastore/address_lot.rs index 0c47ad141a..acef27e363 100644 --- a/nexus/db-queries/src/db/datastore/address_lot.rs +++ b/nexus/db-queries/src/db/datastore/address_lot.rs @@ -47,52 +47,42 @@ impl DataStore { use db::schema::address_lot::dsl as lot_dsl; use db::schema::address_lot_block::dsl as block_dsl; - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "address_lot_create", - ); - self.pool_connection_authorized(opctx) - .await? - // TODO https://github.com/oxidecomputer/omicron/issues/2811 - // Audit external networking database transaction usage - .transaction_async_with_retry( - |conn| async move { - let lot = - AddressLot::new(¶ms.identity, params.kind.into()); - - let db_lot: AddressLot = - diesel::insert_into(lot_dsl::address_lot) - .values(lot) - .returning(AddressLot::as_returning()) - .get_result_async(&conn) - .await?; + let conn = self.pool_connection_authorized(opctx).await?; - let blocks: Vec = params - .blocks - .iter() - .map(|b| { - AddressLotBlock::new( - db_lot.id(), - b.first_address.into(), - b.last_address.into(), - ) - }) - .collect(); - - let db_blocks = - diesel::insert_into(block_dsl::address_lot_block) - .values(blocks) - .returning(AddressLotBlock::as_returning()) - .get_results_async(&conn) - .await?; + // TODO https://github.com/oxidecomputer/omicron/issues/2811 + // Audit external networking database transaction usage + self.transaction_retry_wrapper("address_lot_create") + .transaction(&conn, |conn| async move { + let lot = AddressLot::new(¶ms.identity, params.kind.into()); + + let db_lot: AddressLot = + diesel::insert_into(lot_dsl::address_lot) + .values(lot) + .returning(AddressLot::as_returning()) + .get_result_async(&conn) + .await?; - Ok(AddressLotCreateResult { - lot: db_lot, - blocks: db_blocks, + let blocks: Vec = params + .blocks + .iter() + .map(|b| { + AddressLotBlock::new( + db_lot.id(), + b.first_address.into(), + b.last_address.into(), + ) }) - }, - retry_helper.as_callback(), - ) + .collect(); + + let db_blocks = + diesel::insert_into(block_dsl::address_lot_block) + .values(blocks) + .returning(AddressLotBlock::as_returning()) + .get_results_async(&conn) + .await?; + + Ok(AddressLotCreateResult { lot: db_lot, blocks: db_blocks }) + }) .await .map_err(|e| { public_error_from_diesel( diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 0646166c72..c63cfeec34 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -218,6 +218,20 @@ impl DataStore { .unwrap(); } + /// Constructs a transaction retry helper + /// + /// Automatically wraps the underlying producer + pub(crate) fn transaction_retry_wrapper( + &self, + name: &'static str, + ) -> crate::transaction_retry::RetryHelper { + crate::transaction_retry::RetryHelper::new( + &self.transaction_retry_producer, + name, + ) + } + + // TODO: Do we need this anymore... #[cfg(test)] pub(crate) fn transaction_retry_producer( &self, diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index 7c899fc2d0..e4747327ea 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -20,8 +20,8 @@ use crate::db::model::{ SwitchVlanInterfaceConfig, }; use crate::db::pagination::paginated; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; use diesel::result::Error as DieselError; use diesel::{ CombineDsl, ExpressionMethods, JoinOnDsl, NullableExpressionMethods, @@ -167,267 +167,260 @@ impl DataStore { } type SpsCreateError = SwitchPortSettingsCreateError; - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "switch_port_settings_create", - ); - + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async_with_retry(|conn| { - let err = err.clone(); - async move { - // create the top level port settings object - let port_settings = match id { - Some(id) => SwitchPortSettings::with_id(id, ¶ms.identity), - None => SwitchPortSettings::new(¶ms.identity), - }; - //let port_settings = SwitchPortSettings::new(¶ms.identity); - let db_port_settings: SwitchPortSettings = - diesel::insert_into(port_settings_dsl::switch_port_settings) - .values(port_settings) - .returning(SwitchPortSettings::as_returning()) - .get_result_async(&conn) - .await?; - - let psid = db_port_settings.identity.id; + self.transaction_retry_wrapper("switch_port_settings_create") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + // create the top level port settings object + let port_settings = match id { + Some(id) => SwitchPortSettings::with_id(id, ¶ms.identity), + None => SwitchPortSettings::new(¶ms.identity), + }; + //let port_settings = SwitchPortSettings::new(¶ms.identity); + let db_port_settings: SwitchPortSettings = + diesel::insert_into(port_settings_dsl::switch_port_settings) + .values(port_settings) + .returning(SwitchPortSettings::as_returning()) + .get_result_async(&conn) + .await?; - // add the port config - let port_config = SwitchPortConfig::new( - psid, - params.port_config.geometry.into(), - ); + let psid = db_port_settings.identity.id; - let db_port_config: SwitchPortConfig = - diesel::insert_into(port_config_dsl::switch_port_settings_port_config) - .values(port_config) - .returning(SwitchPortConfig::as_returning()) - .get_result_async(&conn) - .await?; + // add the port config + let port_config = SwitchPortConfig::new( + psid, + params.port_config.geometry.into(), + ); - let mut result = SwitchPortSettingsCombinedResult{ - settings: db_port_settings, - groups: Vec::new(), - port: db_port_config, - links: Vec::new(), - link_lldp: Vec::new(), - interfaces: Vec::new(), - vlan_interfaces: Vec::new(), - routes: Vec::new(), - bgp_peers: Vec::new(), - addresses: Vec::new(), - }; + let db_port_config: SwitchPortConfig = + diesel::insert_into(port_config_dsl::switch_port_settings_port_config) + .values(port_config) + .returning(SwitchPortConfig::as_returning()) + .get_result_async(&conn) + .await?; - //TODO validate link configs consistent with port geometry. - // - https://github.com/oxidecomputer/omicron/issues/2816 + let mut result = SwitchPortSettingsCombinedResult{ + settings: db_port_settings, + groups: Vec::new(), + port: db_port_config, + links: Vec::new(), + link_lldp: Vec::new(), + interfaces: Vec::new(), + vlan_interfaces: Vec::new(), + routes: Vec::new(), + bgp_peers: Vec::new(), + addresses: Vec::new(), + }; - let mut lldp_config = Vec::with_capacity(params.links.len()); - let mut link_config = Vec::with_capacity(params.links.len()); + //TODO validate link configs consistent with port geometry. + // - https://github.com/oxidecomputer/omicron/issues/2816 - for (link_name, c) in ¶ms.links { - let lldp_config_id = match c.lldp.lldp_config { - Some(_) => todo!(), // TODO actual lldp support - None => None, - }; - let lldp_svc_config = - LldpServiceConfig::new(c.lldp.enabled, lldp_config_id); + let mut lldp_config = Vec::with_capacity(params.links.len()); + let mut link_config = Vec::with_capacity(params.links.len()); - lldp_config.push(lldp_svc_config.clone()); - link_config.push(SwitchPortLinkConfig::new( - psid, - lldp_svc_config.id, - link_name.clone(), - c.mtu, - c.fec.into(), - c.speed.into(), - c.autoneg, - )); - } - result.link_lldp = - diesel::insert_into(lldp_config_dsl::lldp_service_config) - .values(lldp_config.clone()) - .returning(LldpServiceConfig::as_returning()) - .get_results_async(&conn) - .await?; - result.links = - diesel::insert_into( - link_config_dsl::switch_port_settings_link_config) - .values(link_config) - .returning(SwitchPortLinkConfig::as_returning()) - .get_results_async(&conn) - .await?; + for (link_name, c) in ¶ms.links { + let lldp_config_id = match c.lldp.lldp_config { + Some(_) => todo!(), // TODO actual lldp support + None => None, + }; + let lldp_svc_config = + LldpServiceConfig::new(c.lldp.enabled, lldp_config_id); - let mut interface_config = Vec::with_capacity(params.interfaces.len()); - let mut vlan_interface_config = Vec::new(); - for (interface_name, i) in ¶ms.interfaces { - let ifx_config = SwitchInterfaceConfig::new( - psid, - interface_name.clone(), - i.v6_enabled, - i.kind.into(), - ); - interface_config.push(ifx_config.clone()); - if let params::SwitchInterfaceKind::Vlan(vlan_if) = i.kind { - vlan_interface_config.push(SwitchVlanInterfaceConfig::new( - ifx_config.id, - vlan_if.vid, + lldp_config.push(lldp_svc_config.clone()); + link_config.push(SwitchPortLinkConfig::new( + psid, + lldp_svc_config.id, + link_name.clone(), + c.mtu, + c.fec.into(), + c.speed.into(), + c.autoneg, )); } - } - result.interfaces = - diesel::insert_into( - interface_config_dsl::switch_port_settings_interface_config) - .values(interface_config) - .returning(SwitchInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; - result.vlan_interfaces = - diesel::insert_into(vlan_config_dsl::switch_vlan_interface_config) - .values(vlan_interface_config) - .returning(SwitchVlanInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; - - let mut route_config = Vec::with_capacity(params.routes.len()); + result.link_lldp = + diesel::insert_into(lldp_config_dsl::lldp_service_config) + .values(lldp_config.clone()) + .returning(LldpServiceConfig::as_returning()) + .get_results_async(&conn) + .await?; + result.links = + diesel::insert_into( + link_config_dsl::switch_port_settings_link_config) + .values(link_config) + .returning(SwitchPortLinkConfig::as_returning()) + .get_results_async(&conn) + .await?; - for (interface_name, r) in ¶ms.routes { - for route in &r.routes { - route_config.push(SwitchPortRouteConfig::new( + let mut interface_config = Vec::with_capacity(params.interfaces.len()); + let mut vlan_interface_config = Vec::new(); + for (interface_name, i) in ¶ms.interfaces { + let ifx_config = SwitchInterfaceConfig::new( psid, interface_name.clone(), - route.dst.into(), - route.gw.into(), - route.vid.map(Into::into), - )); + i.v6_enabled, + i.kind.into(), + ); + interface_config.push(ifx_config.clone()); + if let params::SwitchInterfaceKind::Vlan(vlan_if) = i.kind { + vlan_interface_config.push(SwitchVlanInterfaceConfig::new( + ifx_config.id, + vlan_if.vid, + )); + } } - } - result.routes = - diesel::insert_into( - route_config_dsl::switch_port_settings_route_config) - .values(route_config) - .returning(SwitchPortRouteConfig::as_returning()) - .get_results_async(&conn) - .await?; + result.interfaces = + diesel::insert_into( + interface_config_dsl::switch_port_settings_interface_config) + .values(interface_config) + .returning(SwitchInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; + result.vlan_interfaces = + diesel::insert_into(vlan_config_dsl::switch_vlan_interface_config) + .values(vlan_interface_config) + .returning(SwitchVlanInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; + + let mut route_config = Vec::with_capacity(params.routes.len()); + + for (interface_name, r) in ¶ms.routes { + for route in &r.routes { + route_config.push(SwitchPortRouteConfig::new( + psid, + interface_name.clone(), + route.dst.into(), + route.gw.into(), + route.vid.map(Into::into), + )); + } + } + result.routes = + diesel::insert_into( + route_config_dsl::switch_port_settings_route_config) + .values(route_config) + .returning(SwitchPortRouteConfig::as_returning()) + .get_results_async(&conn) + .await?; + + let mut bgp_peer_config = Vec::new(); + for (interface_name, peer_config) in ¶ms.bgp_peers { + for p in &peer_config.peers { + use db::schema::bgp_config; + let bgp_config_id = match &p.bgp_config { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + bgp_config_dsl::bgp_config + .filter(bgp_config::time_deleted.is_null()) + .filter(bgp_config::name.eq(name)) + .select(bgp_config::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsCreateError::BgpConfigNotFound + ) + })? + } + }; + + bgp_peer_config.push(SwitchPortBgpPeerConfig::new( + psid, + bgp_config_id, + interface_name.clone(), + p.addr.into(), + p.hold_time.into(), + p.idle_hold_time.into(), + p.delay_open.into(), + p.connect_retry.into(), + p.keepalive.into(), + )); + + } + } + result.bgp_peers = + diesel::insert_into( + bgp_peer_dsl::switch_port_settings_bgp_peer_config) + .values(bgp_peer_config) + .returning(SwitchPortBgpPeerConfig::as_returning()) + .get_results_async(&conn) + .await?; - let mut bgp_peer_config = Vec::new(); - for (interface_name, peer_config) in ¶ms.bgp_peers { - for p in &peer_config.peers { - use db::schema::bgp_config; - let bgp_config_id = match &p.bgp_config { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - bgp_config_dsl::bgp_config - .filter(bgp_config::time_deleted.is_null()) - .filter(bgp_config::name.eq(name)) - .select(bgp_config::id) + let mut address_config = Vec::new(); + use db::schema::address_lot; + for (interface_name, a) in ¶ms.addresses { + for address in &a.addresses { + let address_lot_id = match &address.address_lot { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + address_lot_dsl::address_lot + .filter(address_lot::time_deleted.is_null()) + .filter(address_lot::name.eq(name)) + .select(address_lot::id) .limit(1) .first_async::(&conn) .await - .map_err(|e| { - if retryable(&e) { - return e; - } - err.set(SwitchPortSettingsCreateError::BgpConfigNotFound).unwrap(); - DieselError::RollbackTransaction + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsCreateError::AddressLotNotFound + ) })? - } - }; - - bgp_peer_config.push(SwitchPortBgpPeerConfig::new( - psid, - bgp_config_id, - interface_name.clone(), - p.addr.into(), - p.hold_time.into(), - p.idle_hold_time.into(), - p.delay_open.into(), - p.connect_retry.into(), - p.keepalive.into(), - )); - - } - } - result.bgp_peers = - diesel::insert_into( - bgp_peer_dsl::switch_port_settings_bgp_peer_config) - .values(bgp_peer_config) - .returning(SwitchPortBgpPeerConfig::as_returning()) - .get_results_async(&conn) - .await?; - - let mut address_config = Vec::new(); - use db::schema::address_lot; - for (interface_name, a) in ¶ms.addresses { - for address in &a.addresses { - let address_lot_id = match &address.address_lot { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - address_lot_dsl::address_lot - .filter(address_lot::time_deleted.is_null()) - .filter(address_lot::name.eq(name)) - .select(address_lot::id) - .limit(1) - .first_async::(&conn) + } + }; + // TODO: Reduce DB round trips needed for reserving ip blocks + // https://github.com/oxidecomputer/omicron/issues/3060 + let (block, rsvd_block) = + crate::db::datastore::address_lot::try_reserve_block( + address_lot_id, + address.address.ip().into(), + // TODO: Should we allow anycast addresses for switch_ports? + // anycast + false, + &conn + ) .await - .map_err(|e| { - if retryable(&e) { - return e; + .map_err(|e| match e { + ReserveBlockTxnError::CustomError(e) => { + err.bail(SwitchPortSettingsCreateError::ReserveBlock(e)) } - err.set(SwitchPortSettingsCreateError::AddressLotNotFound).unwrap(); - DieselError::RollbackTransaction - })? - } - }; - // TODO: Reduce DB round trips needed for reserving ip blocks - // https://github.com/oxidecomputer/omicron/issues/3060 - let (block, rsvd_block) = - crate::db::datastore::address_lot::try_reserve_block( - address_lot_id, - address.address.ip().into(), - // TODO: Should we allow anycast addresses for switch_ports? - // anycast - false, - &conn - ) - .await - .map_err(|e| match e { - ReserveBlockTxnError::CustomError(e) => { - err.set(SwitchPortSettingsCreateError::ReserveBlock(e)).unwrap(); - DieselError::RollbackTransaction - } - ReserveBlockTxnError::Database(e) => e, - })?; - - address_config.push(SwitchPortAddressConfig::new( - psid, - block.id, - rsvd_block.id, - address.address.into(), - interface_name.clone(), - )); - + ReserveBlockTxnError::Database(e) => e, + })?; + + address_config.push(SwitchPortAddressConfig::new( + psid, + block.id, + rsvd_block.id, + address.address.into(), + interface_name.clone(), + )); + + } } - } - result.addresses = - diesel::insert_into( - address_config_dsl::switch_port_settings_address_config) - .values(address_config) - .returning(SwitchPortAddressConfig::as_returning()) - .get_results_async(&conn) - .await?; + result.addresses = + diesel::insert_into( + address_config_dsl::switch_port_settings_address_config) + .values(address_config) + .returning(SwitchPortAddressConfig::as_returning()) + .get_results_async(&conn) + .await?; - Ok(result) - }}, - retry_helper.as_callback(), + Ok(result) + } + } ) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { match err { SpsCreateError::AddressLotNotFound => { Error::invalid_request("AddressLot not found") @@ -473,163 +466,160 @@ impl DataStore { Some(name_or_id) => name_or_id, }; - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "switch_port_settings_delete", - ); + let err = OptionalError::new(); // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async_with_retry(|conn| { - let err = err.clone(); - async move { - use db::schema::switch_port_settings; - let id = match selector { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name = name.to_string(); - port_settings_dsl::switch_port_settings - .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::name.eq(name)) - .select(switch_port_settings::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|e| { - if retryable(&e) { - return e; - } - err.set(SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound).unwrap(); - DieselError::RollbackTransaction - })? - } - }; + self.transaction_retry_wrapper("switch_port_settings_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + use db::schema::switch_port_settings; + let id = match selector { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name = name.to_string(); + port_settings_dsl::switch_port_settings + .filter(switch_port_settings::time_deleted.is_null()) + .filter(switch_port_settings::name.eq(name)) + .select(switch_port_settings::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound + ) + })? + } + }; - // delete the top level port settings object - diesel::delete(port_settings_dsl::switch_port_settings) - .filter(switch_port_settings::id.eq(id)) - .execute_async(&conn) - .await?; + // delete the top level port settings object + diesel::delete(port_settings_dsl::switch_port_settings) + .filter(switch_port_settings::id.eq(id)) + .execute_async(&conn) + .await?; - // delete the port config object - use db::schema::switch_port_settings_port_config::{ - self as sps_port_config, dsl as port_config_dsl, - }; - diesel::delete(port_config_dsl::switch_port_settings_port_config) - .filter(sps_port_config::port_settings_id.eq(id)) - .execute_async(&conn) - .await?; + // delete the port config object + use db::schema::switch_port_settings_port_config::{ + self as sps_port_config, dsl as port_config_dsl, + }; + diesel::delete(port_config_dsl::switch_port_settings_port_config) + .filter(sps_port_config::port_settings_id.eq(id)) + .execute_async(&conn) + .await?; - // delete the link configs - use db::schema::switch_port_settings_link_config::{ - self as sps_link_config, dsl as link_config_dsl, - }; - let links: Vec = - diesel::delete( - link_config_dsl::switch_port_settings_link_config - ) - .filter( - sps_link_config::port_settings_id.eq(id) - ) - .returning(SwitchPortLinkConfig::as_returning()) - .get_results_async(&conn) - .await?; + // delete the link configs + use db::schema::switch_port_settings_link_config::{ + self as sps_link_config, dsl as link_config_dsl, + }; + let links: Vec = + diesel::delete( + link_config_dsl::switch_port_settings_link_config + ) + .filter( + sps_link_config::port_settings_id.eq(id) + ) + .returning(SwitchPortLinkConfig::as_returning()) + .get_results_async(&conn) + .await?; - // delete lldp configs - use db::schema::lldp_service_config::{self, dsl as lldp_config_dsl}; - let lldp_svc_ids: Vec = links - .iter() - .map(|link| link.lldp_service_config_id) - .collect(); - diesel::delete(lldp_config_dsl::lldp_service_config) - .filter(lldp_service_config::id.eq_any(lldp_svc_ids)) - .execute_async(&conn) - .await?; + // delete lldp configs + use db::schema::lldp_service_config::{self, dsl as lldp_config_dsl}; + let lldp_svc_ids: Vec = links + .iter() + .map(|link| link.lldp_service_config_id) + .collect(); + diesel::delete(lldp_config_dsl::lldp_service_config) + .filter(lldp_service_config::id.eq_any(lldp_svc_ids)) + .execute_async(&conn) + .await?; - // delete interface configs - use db::schema::switch_port_settings_interface_config::{ - self as sps_interface_config, dsl as interface_config_dsl, - }; + // delete interface configs + use db::schema::switch_port_settings_interface_config::{ + self as sps_interface_config, dsl as interface_config_dsl, + }; - let interfaces: Vec = - diesel::delete( - interface_config_dsl::switch_port_settings_interface_config - ) - .filter( - sps_interface_config::port_settings_id.eq(id) - ) - .returning(SwitchInterfaceConfig::as_returning()) - .get_results_async(&conn) - .await?; + let interfaces: Vec = + diesel::delete( + interface_config_dsl::switch_port_settings_interface_config + ) + .filter( + sps_interface_config::port_settings_id.eq(id) + ) + .returning(SwitchInterfaceConfig::as_returning()) + .get_results_async(&conn) + .await?; - // delete any vlan interfaces - use db::schema::switch_vlan_interface_config::{ - self, dsl as vlan_config_dsl, - }; - let interface_ids: Vec = interfaces - .iter() - .map(|interface| interface.id) - .collect(); - - diesel::delete(vlan_config_dsl::switch_vlan_interface_config) - .filter( - switch_vlan_interface_config::interface_config_id.eq_any( - interface_ids + // delete any vlan interfaces + use db::schema::switch_vlan_interface_config::{ + self, dsl as vlan_config_dsl, + }; + let interface_ids: Vec = interfaces + .iter() + .map(|interface| interface.id) + .collect(); + + diesel::delete(vlan_config_dsl::switch_vlan_interface_config) + .filter( + switch_vlan_interface_config::interface_config_id.eq_any( + interface_ids + ) ) + .execute_async(&conn) + .await?; + + // delete route configs + use db::schema::switch_port_settings_route_config; + use db::schema::switch_port_settings_route_config::dsl + as route_config_dsl; + + diesel::delete( + route_config_dsl::switch_port_settings_route_config ) + .filter(switch_port_settings_route_config::port_settings_id.eq(id)) .execute_async(&conn) .await?; - // delete route configs - use db::schema::switch_port_settings_route_config; - use db::schema::switch_port_settings_route_config::dsl - as route_config_dsl; + // delete bgp configurations + use db::schema::switch_port_settings_bgp_peer_config as bgp_peer; + use db::schema::switch_port_settings_bgp_peer_config::dsl + as bgp_peer_dsl; - diesel::delete( - route_config_dsl::switch_port_settings_route_config - ) - .filter(switch_port_settings_route_config::port_settings_id.eq(id)) - .execute_async(&conn) - .await?; + diesel::delete(bgp_peer_dsl::switch_port_settings_bgp_peer_config) + .filter(bgp_peer::port_settings_id.eq(id)) + .execute_async(&conn) + .await?; - // delete bgp configurations - use db::schema::switch_port_settings_bgp_peer_config as bgp_peer; - use db::schema::switch_port_settings_bgp_peer_config::dsl - as bgp_peer_dsl; + // delete address configs + use db::schema::switch_port_settings_address_config::{ + self as address_config, dsl as address_config_dsl, + }; - diesel::delete(bgp_peer_dsl::switch_port_settings_bgp_peer_config) - .filter(bgp_peer::port_settings_id.eq(id)) - .execute_async(&conn) + let port_settings_addrs = diesel::delete( + address_config_dsl::switch_port_settings_address_config, + ) + .filter(address_config::port_settings_id.eq(id)) + .returning(SwitchPortAddressConfig::as_returning()) + .get_results_async(&conn) .await?; - // delete address configs - use db::schema::switch_port_settings_address_config::{ - self as address_config, dsl as address_config_dsl, - }; + use db::schema::address_lot_rsvd_block::dsl as rsvd_block_dsl; - let port_settings_addrs = diesel::delete( - address_config_dsl::switch_port_settings_address_config, - ) - .filter(address_config::port_settings_id.eq(id)) - .returning(SwitchPortAddressConfig::as_returning()) - .get_results_async(&conn) - .await?; - - use db::schema::address_lot_rsvd_block::dsl as rsvd_block_dsl; + for ps in &port_settings_addrs { + diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) + .filter(rsvd_block_dsl::id.eq(ps.rsvd_address_lot_block_id)) + .execute_async(&conn) + .await?; + } - for ps in &port_settings_addrs { - diesel::delete(rsvd_block_dsl::address_lot_rsvd_block) - .filter(rsvd_block_dsl::id.eq(ps.rsvd_address_lot_block_id)) - .execute_async(&conn) - .await?; + Ok(()) } - - Ok(()) - }}, retry_helper.as_callback()) + }) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { match err { SwitchPortSettingsDeleteError::SwitchPortSettingsNotFound => { Error::invalid_request("port settings not found") @@ -685,162 +675,159 @@ impl DataStore { NotFound(external::Name), } - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "switch_port_settings_get", - ); - + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async_with_retry(|conn| { - let err = err.clone(); - async move { - // get the top level port settings object - use db::schema::switch_port_settings::{ - self, dsl as port_settings_dsl, - }; - - let id = match name_or_id { - NameOrId::Id(id) => *id, - NameOrId::Name(name) => { - let name_str = name.to_string(); + self.transaction_retry_wrapper("switch_port_settings_get") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + // get the top level port settings object + use db::schema::switch_port_settings::{ + self, dsl as port_settings_dsl, + }; + + let id = match name_or_id { + NameOrId::Id(id) => *id, + NameOrId::Name(name) => { + let name_str = name.to_string(); + port_settings_dsl::switch_port_settings + .filter(switch_port_settings::time_deleted.is_null()) + .filter(switch_port_settings::name.eq(name_str)) + .select(switch_port_settings::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|diesel_error| { + err.bail_retryable_or_else(diesel_error, |_| { + SwitchPortSettingsGetError::NotFound( + name.clone(), + ) + }) + })? + } + }; + + let settings: SwitchPortSettings = port_settings_dsl::switch_port_settings .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::name.eq(name_str)) - .select(switch_port_settings::id) + .filter(switch_port_settings::id.eq(id)) + .select(SwitchPortSettings::as_select()) .limit(1) - .first_async::(&conn) - .await - .map_err(|e| { - if retryable(&e) { return e; } - err.set(SwitchPortSettingsGetError::NotFound( - name.clone(), - )).unwrap(); - DieselError::RollbackTransaction - })? - } - }; + .first_async::(&conn) + .await?; - let settings: SwitchPortSettings = - port_settings_dsl::switch_port_settings - .filter(switch_port_settings::time_deleted.is_null()) - .filter(switch_port_settings::id.eq(id)) - .select(SwitchPortSettings::as_select()) - .limit(1) - .first_async::(&conn) - .await?; + // get the port config + use db::schema::switch_port_settings_port_config::{ + self as port_config, dsl as port_config_dsl, + }; + let port: SwitchPortConfig = + port_config_dsl::switch_port_settings_port_config + .filter(port_config::port_settings_id.eq(id)) + .select(SwitchPortConfig::as_select()) + .limit(1) + .first_async::(&conn) + .await?; - // get the port config - use db::schema::switch_port_settings_port_config::{ - self as port_config, dsl as port_config_dsl, - }; - let port: SwitchPortConfig = - port_config_dsl::switch_port_settings_port_config - .filter(port_config::port_settings_id.eq(id)) - .select(SwitchPortConfig::as_select()) - .limit(1) - .first_async::(&conn) - .await?; + // initialize result + let mut result = + SwitchPortSettingsCombinedResult::new(settings, port); - // initialize result - let mut result = - SwitchPortSettingsCombinedResult::new(settings, port); + // get the link configs + use db::schema::switch_port_settings_link_config::{ + self as link_config, dsl as link_config_dsl, + }; - // get the link configs - use db::schema::switch_port_settings_link_config::{ - self as link_config, dsl as link_config_dsl, - }; + result.links = link_config_dsl::switch_port_settings_link_config + .filter(link_config::port_settings_id.eq(id)) + .select(SwitchPortLinkConfig::as_select()) + .load_async::(&conn) + .await?; - result.links = link_config_dsl::switch_port_settings_link_config - .filter(link_config::port_settings_id.eq(id)) - .select(SwitchPortLinkConfig::as_select()) - .load_async::(&conn) - .await?; + let lldp_svc_ids: Vec = result + .links + .iter() + .map(|link| link.lldp_service_config_id) + .collect(); + + use db::schema::lldp_service_config as lldp_config; + use db::schema::lldp_service_config::dsl as lldp_dsl; + result.link_lldp = lldp_dsl::lldp_service_config + .filter(lldp_config::id.eq_any(lldp_svc_ids)) + .select(LldpServiceConfig::as_select()) + .limit(1) + .load_async::(&conn) + .await?; - let lldp_svc_ids: Vec = result - .links - .iter() - .map(|link| link.lldp_service_config_id) - .collect(); - - use db::schema::lldp_service_config as lldp_config; - use db::schema::lldp_service_config::dsl as lldp_dsl; - result.link_lldp = lldp_dsl::lldp_service_config - .filter(lldp_config::id.eq_any(lldp_svc_ids)) - .select(LldpServiceConfig::as_select()) - .limit(1) - .load_async::(&conn) - .await?; + // get the interface configs + use db::schema::switch_port_settings_interface_config::{ + self as interface_config, dsl as interface_config_dsl, + }; - // get the interface configs - use db::schema::switch_port_settings_interface_config::{ - self as interface_config, dsl as interface_config_dsl, - }; + result.interfaces = + interface_config_dsl::switch_port_settings_interface_config + .filter(interface_config::port_settings_id.eq(id)) + .select(SwitchInterfaceConfig::as_select()) + .load_async::(&conn) + .await?; - result.interfaces = - interface_config_dsl::switch_port_settings_interface_config - .filter(interface_config::port_settings_id.eq(id)) - .select(SwitchInterfaceConfig::as_select()) - .load_async::(&conn) + use db::schema::switch_vlan_interface_config as vlan_config; + use db::schema::switch_vlan_interface_config::dsl as vlan_dsl; + let interface_ids: Vec = result + .interfaces + .iter() + .map(|interface| interface.id) + .collect(); + + result.vlan_interfaces = vlan_dsl::switch_vlan_interface_config + .filter(vlan_config::interface_config_id.eq_any(interface_ids)) + .select(SwitchVlanInterfaceConfig::as_select()) + .load_async::(&conn) .await?; - use db::schema::switch_vlan_interface_config as vlan_config; - use db::schema::switch_vlan_interface_config::dsl as vlan_dsl; - let interface_ids: Vec = result - .interfaces - .iter() - .map(|interface| interface.id) - .collect(); - - result.vlan_interfaces = vlan_dsl::switch_vlan_interface_config - .filter(vlan_config::interface_config_id.eq_any(interface_ids)) - .select(SwitchVlanInterfaceConfig::as_select()) - .load_async::(&conn) - .await?; - - // get the route configs - use db::schema::switch_port_settings_route_config::{ - self as route_config, dsl as route_config_dsl, - }; + // get the route configs + use db::schema::switch_port_settings_route_config::{ + self as route_config, dsl as route_config_dsl, + }; - result.routes = route_config_dsl::switch_port_settings_route_config - .filter(route_config::port_settings_id.eq(id)) - .select(SwitchPortRouteConfig::as_select()) - .load_async::(&conn) - .await?; + result.routes = route_config_dsl::switch_port_settings_route_config + .filter(route_config::port_settings_id.eq(id)) + .select(SwitchPortRouteConfig::as_select()) + .load_async::(&conn) + .await?; - // get the bgp peer configs - use db::schema::switch_port_settings_bgp_peer_config::{ - self as bgp_peer, dsl as bgp_peer_dsl, - }; + // get the bgp peer configs + use db::schema::switch_port_settings_bgp_peer_config::{ + self as bgp_peer, dsl as bgp_peer_dsl, + }; - result.bgp_peers = - bgp_peer_dsl::switch_port_settings_bgp_peer_config - .filter(bgp_peer::port_settings_id.eq(id)) - .select(SwitchPortBgpPeerConfig::as_select()) - .load_async::(&conn) - .await?; + result.bgp_peers = + bgp_peer_dsl::switch_port_settings_bgp_peer_config + .filter(bgp_peer::port_settings_id.eq(id)) + .select(SwitchPortBgpPeerConfig::as_select()) + .load_async::(&conn) + .await?; - // get the address configs - use db::schema::switch_port_settings_address_config::{ - self as address_config, dsl as address_config_dsl, - }; + // get the address configs + use db::schema::switch_port_settings_address_config::{ + self as address_config, dsl as address_config_dsl, + }; - result.addresses = - address_config_dsl::switch_port_settings_address_config - .filter(address_config::port_settings_id.eq(id)) - .select(SwitchPortAddressConfig::as_select()) - .load_async::(&conn) - .await?; + result.addresses = + address_config_dsl::switch_port_settings_address_config + .filter(address_config::port_settings_id.eq(id)) + .select(SwitchPortAddressConfig::as_select()) + .load_async::(&conn) + .await?; - Ok(result) - }}, retry_helper.as_callback()) + Ok(result) + } + }) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { match err { SwitchPortSettingsGetError::NotFound(name) => { Error::not_found_by_name( @@ -877,10 +864,6 @@ impl DataStore { } let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "switch_port_create", - ); let conn = self.pool_connection_authorized(opctx).await?; let switch_port = SwitchPort::new( @@ -891,8 +874,8 @@ impl DataStore { // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async_with_retry( - |conn| { + self.transaction_retry_wrapper("switch_port_create") + .transaction(&conn, |conn| { let err = err.clone(); let switch_port = switch_port.clone(); async move { @@ -905,6 +888,13 @@ impl DataStore { .first_async::(&conn) .await .map_err(|e| { + // TODO: + // IF retryable + // e + // OTHERWISE + // set a custom error + // e: RollbackTransaction + if retryable(&e) { return e; } @@ -924,27 +914,28 @@ impl DataStore { Ok(db_switch_port) } - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| { - if let Some(err) = err.get() { - match err { - SwitchPortCreateError::RackNotFound => { - Error::invalid_request("rack not found") + }) + .await + .map_err(|e| { + if let Some(err) = err.get() { + match err { + SwitchPortCreateError::RackNotFound => { + Error::invalid_request("rack not found") + } } + } else { + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::SwitchPort, + &format!( + "{}/{}/{}", + rack_id, &switch_location, &port, + ), + ), + ) } - } else { - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::SwitchPort, - &format!("{}/{}/{}", rack_id, &switch_location, &port,), - ), - ) - } - }) + }) } pub async fn switch_port_delete( @@ -959,18 +950,14 @@ impl DataStore { ActiveSettings, } - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "switch_port_delete", - ); + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async_with_retry( - |conn| { + self.transaction_retry_wrapper("switch_port_delete") + .transaction(&conn, |conn| { let err = err.clone(); async move { use db::schema::switch_port; @@ -989,17 +976,17 @@ impl DataStore { .limit(1) .first_async::(&conn) .await - .map_err(|e| { - if retryable(&e) { - return e; - } - err.set(SwitchPortDeleteError::NotFound).unwrap(); - DieselError::RollbackTransaction + .map_err(|diesel_error| { + err.bail_retryable_or( + diesel_error, + SwitchPortDeleteError::NotFound, + ) })?; if port.port_settings_id.is_some() { - err.set(SwitchPortDeleteError::ActiveSettings).unwrap(); - return Err(DieselError::RollbackTransaction); + return Err( + err.bail(SwitchPortDeleteError::ActiveSettings) + ); } diesel::delete(switch_port_dsl::switch_port) @@ -1009,25 +996,28 @@ impl DataStore { Ok(()) } - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| { - if let Some(err) = err.get() { - match err { - SwitchPortDeleteError::NotFound => { - let name = &portname.clone(); - Error::not_found_by_name(ResourceType::SwitchPort, name) - } - SwitchPortDeleteError::ActiveSettings => { - Error::invalid_request("must clear port settings first") + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + SwitchPortDeleteError::NotFound => { + let name = &portname.clone(); + Error::not_found_by_name( + ResourceType::SwitchPort, + name, + ) + } + SwitchPortDeleteError::ActiveSettings => { + Error::invalid_request( + "must clear port settings first", + ) + } } + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + }) } pub async fn switch_port_list( diff --git a/nexus/db-queries/src/db/mod.rs b/nexus/db-queries/src/db/mod.rs index b7c7079b54..e6b8743e94 100644 --- a/nexus/db-queries/src/db/mod.rs +++ b/nexus/db-queries/src/db/mod.rs @@ -17,7 +17,7 @@ mod config; mod cte_utils; // This is marked public for use by the integration tests pub mod datastore; -mod error; +pub(crate) mod error; mod explain; pub mod fixed_data; pub mod lookup; @@ -42,7 +42,7 @@ pub use nexus_db_model::schema; pub use crate::db::error::TransactionError; pub use config::Config; pub use datastore::DataStore; -pub use pool::Pool; +pub use pool::{DbConnection, Pool}; pub use saga_recovery::{recover, CompletionTask, RecoveryTask}; pub use saga_types::SecId; pub use sec_store::CockroachDbSecStore; diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs index bbdeb7fc91..1cbf72c121 100644 --- a/nexus/db-queries/src/transaction_retry.rs +++ b/nexus/db-queries/src/transaction_retry.rs @@ -4,7 +4,9 @@ //! Helper types for performing automatic transaction retries +use async_bb8_diesel::AsyncConnection; use chrono::Utc; +use diesel::result::Error as DieselError; use oximeter::{types::Sample, Metric, MetricsError, Target}; use rand::{thread_rng, Rng}; use std::sync::{Arc, Mutex}; @@ -92,6 +94,22 @@ impl RetryHelper { } } + /// Calls the function "f" in an asynchronous, retryable transaction. + pub(crate) async fn transaction( + self, + conn: &async_bb8_diesel::Connection, + f: Func, + ) -> Result + where + R: Send + 'static, + Fut: std::future::Future> + Send, + Func: Fn(async_bb8_diesel::Connection) -> Fut + + Send + + Sync, + { + conn.transaction_async_with_retry(f, self.as_callback()).await + } + // Called upon retryable transaction failure. // // This function: @@ -155,6 +173,69 @@ impl oximeter::Producer for Producer { } } +/// Helper utility for passing non-retryable errors out-of-band from +/// transactions. +/// +/// Transactions prefer to act on the `diesel::result::Error` type, +/// but transaction users may want more meaningful error types. +/// This utility helps callers safely propagate back Diesel errors while +/// retaining auxiliary error info. +pub struct OptionalError(Arc>>); + +impl Clone for OptionalError { + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +impl OptionalError { + pub fn new() -> Self { + Self(Arc::new(Mutex::new(None))) + } + + /// Sets "Self" to the value of `error` and returns `DieselError::RollbackTransaction`. + pub fn bail(&self, err: E) -> DieselError { + (*self.0.lock().unwrap()).replace(err); + DieselError::RollbackTransaction + } + + /// If `diesel_error` is retryable, returns it without setting Self. + /// + /// Otherwise, sets "Self" to the value of `err`, and returns + /// `DieselError::RollbackTransaction`. + pub fn bail_retryable_or( + &self, + diesel_error: DieselError, + err: E, + ) -> DieselError { + self.bail_retryable_or_else(diesel_error, |_diesel_error| err) + } + + /// If `diesel_error` is retryable, returns it without setting Self. + /// + /// Otherwise, sets "Self" to the value of `f` applied to `diesel_err`, and + /// returns `DieselError::RollbackTransaction`. + pub fn bail_retryable_or_else( + &self, + diesel_error: DieselError, + f: F, + ) -> DieselError + where + F: FnOnce(DieselError) -> E, + { + if crate::db::error::retryable(&diesel_error) { + return diesel_error; + } else { + self.bail(f(diesel_error)) + } + } + + /// If "Self" was previously set to a non-retryable error, return it. + pub fn take(self) -> Option { + (*self.0.lock().unwrap()).take() + } +} + #[cfg(test)] mod test { use super::*; From eb3097791351abcd5907d0888e168e25271f6105 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 Dec 2023 15:26:55 -0800 Subject: [PATCH 18/25] RetryHelper refactor --- .../src/db/datastore/address_lot.rs | 43 ++- nexus/db-queries/src/db/datastore/bgp.rs | 271 ++++++++---------- .../src/db/datastore/identity_provider.rs | 93 +++--- nexus/db-queries/src/db/datastore/mod.rs | 1 - .../src/db/datastore/network_interface.rs | 121 ++++---- nexus/db-queries/src/db/datastore/project.rs | 100 +++---- nexus/db-queries/src/db/datastore/rack.rs | 101 +++---- nexus/db-queries/src/db/datastore/region.rs | 29 +- .../db-queries/src/db/datastore/silo_group.rs | 66 ++--- nexus/db-queries/src/db/datastore/sled.rs | 211 +++++++------- nexus/db-queries/src/db/datastore/snapshot.rs | 176 ++++++------ .../src/db/datastore/switch_interface.rs | 86 +++--- nexus/db-queries/src/db/datastore/volume.rs | 200 ++++++------- nexus/db-queries/src/db/datastore/vpc.rs | 78 +++-- nexus/db-queries/src/transaction_retry.rs | 40 ++- 15 files changed, 708 insertions(+), 908 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/address_lot.rs b/nexus/db-queries/src/db/datastore/address_lot.rs index acef27e363..5c2ffbf1d0 100644 --- a/nexus/db-queries/src/db/datastore/address_lot.rs +++ b/nexus/db-queries/src/db/datastore/address_lot.rs @@ -13,10 +13,9 @@ use crate::db::error::TransactionError; use crate::db::model::Name; use crate::db::model::{AddressLot, AddressLotBlock, AddressLotReservedBlock}; use crate::db::pagination::paginated; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl, Connection}; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::{AsyncRunQueryDsl, Connection}; use chrono::Utc; -use diesel::result::Error as DieselError; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use diesel_dtrace::DTraceConnection; use ipnetwork::IpNetwork; @@ -29,7 +28,6 @@ use omicron_common::api::external::{ }; use ref_cast::RefCast; use serde::{Deserialize, Serialize}; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; #[derive(Clone, Debug, Deserialize, Serialize)] @@ -115,16 +113,12 @@ impl DataStore { LotInUse, } - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "address_lot_delete", - ); + let err = OptionalError::new(); // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async_with_retry( - |conn| { + self.transaction_retry_wrapper("address_lot_delete") + .transaction(&conn, |conn| { let err = err.clone(); async move { let rsvd: Vec = @@ -136,8 +130,7 @@ impl DataStore { .await?; if !rsvd.is_empty() { - err.set(AddressLotDeleteError::LotInUse).unwrap(); - return Err(DieselError::RollbackTransaction); + return Err(err.bail(AddressLotDeleteError::LotInUse)); } let now = Utc::now(); @@ -155,21 +148,19 @@ impl DataStore { Ok(()) } - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| { - if let Some(err) = err.get() { - match err { - AddressLotDeleteError::LotInUse => { - Error::invalid_request("lot is in use") + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + AddressLotDeleteError::LotInUse => { + Error::invalid_request("lot is in use") + } } + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + }) } pub async fn address_lot_list( diff --git a/nexus/db-queries/src/db/datastore/bgp.rs b/nexus/db-queries/src/db/datastore/bgp.rs index 3d8ff1a68f..28075b0ded 100644 --- a/nexus/db-queries/src/db/datastore/bgp.rs +++ b/nexus/db-queries/src/db/datastore/bgp.rs @@ -2,15 +2,13 @@ use super::DataStore; use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; -use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::model::Name; use crate::db::model::{BgpAnnounceSet, BgpAnnouncement, BgpConfig}; use crate::db::pagination::paginated; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; -use diesel::result::Error as DieselError; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use nexus_types::external_api::params; use nexus_types::identity::Resource; @@ -20,7 +18,6 @@ use omicron_common::api::external::{ ResourceType, }; use ref_cast::RefCast; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -33,14 +30,9 @@ impl DataStore { use db::schema::{ bgp_announce_set, bgp_announce_set::dsl as announce_set_dsl, }; - let pool = self.pool_connection_authorized(opctx).await?; - - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "bgp_config_set", - ); - pool.transaction_async_with_retry( - |conn| async move { + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("bgp_config_set") + .transaction(&conn, |conn| async move { let id: Uuid = match &config.bgp_announce_set_id { NameOrId::Name(name) => { announce_set_dsl::bgp_announce_set @@ -62,11 +54,9 @@ impl DataStore { .get_result_async(&conn) .await?; Ok(result) - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub async fn bgp_config_delete( @@ -85,62 +75,58 @@ impl DataStore { ConfigInUse, } - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "bgp_config_delete", - ); - let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async_with_retry(|conn| { - let err = err.clone(); - async move { - let name_or_id = sel.name_or_id.clone(); - - let id: Uuid = match name_or_id { - NameOrId::Id(id) => id, - NameOrId::Name(name) => { - bgp_config_dsl::bgp_config - .filter(bgp_config::name.eq(name.to_string())) - .select(bgp_config::id) - .limit(1) - .first_async::(&conn) - .await? + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("bgp_config_delete") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let name_or_id = sel.name_or_id.clone(); + + let id: Uuid = match name_or_id { + NameOrId::Id(id) => id, + NameOrId::Name(name) => { + bgp_config_dsl::bgp_config + .filter(bgp_config::name.eq(name.to_string())) + .select(bgp_config::id) + .limit(1) + .first_async::(&conn) + .await? + } + }; + + let count = + sps_bgp_peer_config_dsl::switch_port_settings_bgp_peer_config + .filter(sps_bgp_peer_config::bgp_config_id.eq(id)) + .count() + .execute_async(&conn) + .await?; + + if count > 0 { + return Err(err.bail(BgpConfigDeleteError::ConfigInUse)); } - }; - let count = - sps_bgp_peer_config_dsl::switch_port_settings_bgp_peer_config - .filter(sps_bgp_peer_config::bgp_config_id.eq(id)) - .count() + diesel::update(bgp_config_dsl::bgp_config) + .filter(bgp_config_dsl::id.eq(id)) + .set(bgp_config_dsl::time_deleted.eq(Utc::now())) .execute_async(&conn) .await?; - if count > 0 { - err.set(BgpConfigDeleteError::ConfigInUse).unwrap(); - return Err(DieselError::RollbackTransaction); + Ok(()) } - - diesel::update(bgp_config_dsl::bgp_config) - .filter(bgp_config_dsl::id.eq(id)) - .set(bgp_config_dsl::time_deleted.eq(Utc::now())) - .execute_async(&conn) - .await?; - - Ok(()) - } - }, retry_helper.as_callback()) - .await - .map_err(|e| { - if let Some(err) = err.get() { - match err { - BgpConfigDeleteError::ConfigInUse => { - Error::invalid_request("BGP config in use") + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + BgpConfigDeleteError::ConfigInUse => { + Error::invalid_request("BGP config in use") + } } + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + }) } pub async fn bgp_config_get( @@ -150,7 +136,7 @@ impl DataStore { ) -> LookupResult { use db::schema::bgp_config; use db::schema::bgp_config::dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let name_or_id = name_or_id.clone(); @@ -159,14 +145,14 @@ impl DataStore { .filter(bgp_config::name.eq(name.to_string())) .select(BgpConfig::as_select()) .limit(1) - .first_async::(&*pool) + .first_async::(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)), NameOrId::Id(id) => dsl::bgp_config .filter(bgp_config::id.eq(id)) .select(BgpConfig::as_select()) .limit(1) - .first_async::(&*pool) + .first_async::(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)), }?; @@ -181,7 +167,7 @@ impl DataStore { ) -> ListResultVec { use db::schema::bgp_config::dsl; - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; match pagparams { PaginatedBy::Id(pagparams) => { @@ -195,7 +181,7 @@ impl DataStore { } .filter(dsl::time_deleted.is_null()) .select(BgpConfig::as_select()) - .load_async(&*pool) + .load_async(&*conn) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } @@ -215,37 +201,38 @@ impl DataStore { AnnounceSetNotFound(Name), } - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "bgp_announce_list", - ); - let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async_with_retry( - |conn| { + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("bgp_announce_list") + .transaction(&conn, |conn| { let err = err.clone(); async move { let name_or_id = sel.name_or_id.clone(); let announce_id: Uuid = match name_or_id { - NameOrId::Id(id) => id, - NameOrId::Name(name) => announce_set_dsl::bgp_announce_set - .filter(bgp_announce_set::time_deleted.is_null()) - .filter(bgp_announce_set::name.eq(name.to_string())) - .select(bgp_announce_set::id) - .limit(1) - .first_async::(&conn) - .await - .map_err(|e| { - if retryable(&e) { - return e; - } - err.set(BgpAnnounceListError::AnnounceSetNotFound( - Name::from(name.clone()), - )).unwrap(); - DieselError::RollbackTransaction - })?, - }; + NameOrId::Id(id) => id, + NameOrId::Name(name) => { + announce_set_dsl::bgp_announce_set + .filter( + bgp_announce_set::time_deleted.is_null(), + ) + .filter( + bgp_announce_set::name.eq(name.to_string()), + ) + .select(bgp_announce_set::id) + .limit(1) + .first_async::(&conn) + .await + .map_err(|e| { + err.bail_retryable_or( + e, + BgpAnnounceListError::AnnounceSetNotFound( + Name::from(name.clone()), + ) + ) + })? + } + }; let result = announce_dsl::bgp_announcement .filter(announce_dsl::announce_set_id.eq(announce_id)) @@ -255,24 +242,22 @@ impl DataStore { Ok(result) } - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| { - if let Some(err) = err.get() { - match err { - BgpAnnounceListError::AnnounceSetNotFound(name) => { - Error::not_found_by_name( - ResourceType::BgpAnnounceSet, - &name, - ) + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + BgpAnnounceListError::AnnounceSetNotFound(name) => { + Error::not_found_by_name( + ResourceType::BgpAnnounceSet, + &name, + ) + } } + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + }) } pub async fn bgp_create_announce_set( @@ -283,13 +268,9 @@ impl DataStore { use db::schema::bgp_announce_set::dsl as announce_set_dsl; use db::schema::bgp_announcement::dsl as bgp_announcement_dsl; - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "bgp_create_announce_set", - ); - let pool = self.pool_connection_authorized(opctx).await?; - pool.transaction_async_with_retry( - |conn| async move { + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("bgp_create_announce_set") + .transaction(&conn, |conn| async move { let bas: BgpAnnounceSet = announce.clone().into(); let db_as: BgpAnnounceSet = @@ -317,11 +298,9 @@ impl DataStore { } Ok((db_as, db_annoucements)) - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub async fn bgp_delete_announce_set( @@ -341,16 +320,12 @@ impl DataStore { AnnounceSetInUse, } - let pool = self.pool_connection_authorized(opctx).await?; + let conn = self.pool_connection_authorized(opctx).await?; let name_or_id = sel.name_or_id.clone(); - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "bgp_delete_announce_set", - ); - pool.transaction_async_with_retry( - |conn| { + let err = OptionalError::new(); + self.transaction_retry_wrapper("bgp_delete_announce_set") + .transaction(&conn, |conn| { let err = err.clone(); let name_or_id = name_or_id.clone(); async move { @@ -375,9 +350,9 @@ impl DataStore { .await?; if count > 0 { - err.set(BgpAnnounceSetDeleteError::AnnounceSetInUse) - .unwrap(); - return Err(DieselError::RollbackTransaction); + return Err(err.bail( + BgpAnnounceSetDeleteError::AnnounceSetInUse, + )); } diesel::update(announce_set_dsl::bgp_announce_set) @@ -393,20 +368,18 @@ impl DataStore { Ok(()) } - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| { - if let Some(err) = err.get() { - match err { - BgpAnnounceSetDeleteError::AnnounceSetInUse => { - Error::invalid_request("BGP announce set in use") + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + BgpAnnounceSetDeleteError::AnnounceSetInUse => { + Error::invalid_request("BGP announce set in use") + } } + } else { + public_error_from_diesel(e, ErrorHandler::Server) } - } else { - public_error_from_diesel(e, ErrorHandler::Server) - } - }) + }) } } diff --git a/nexus/db-queries/src/db/datastore/identity_provider.rs b/nexus/db-queries/src/db/datastore/identity_provider.rs index cb15e20549..cee577acd6 100644 --- a/nexus/db-queries/src/db/datastore/identity_provider.rs +++ b/nexus/db-queries/src/db/datastore/identity_provider.rs @@ -14,8 +14,6 @@ use crate::db::identity::Resource; use crate::db::model::IdentityProvider; use crate::db::model::Name; use crate::db::pagination::paginated; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -63,62 +61,49 @@ impl DataStore { opctx.authorize(authz::Action::CreateChild, authz_idp_list).await?; assert_eq!(provider.silo_id, authz_idp_list.silo().id()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "saml_identity_provider_create", - ); let name = provider.identity().name.to_string(); - self.pool_connection_authorized(opctx) - .await? - .transaction_async_with_retry( - |conn| { - let provider = provider.clone(); - async move { - // insert silo identity provider record with type Saml - use db::schema::identity_provider::dsl as idp_dsl; - diesel::insert_into(idp_dsl::identity_provider) - .values(db::model::IdentityProvider { - identity: db::model::IdentityProviderIdentity { - id: provider.identity.id, - name: provider.identity.name.clone(), - description: provider - .identity - .description - .clone(), - time_created: provider - .identity - .time_created, - time_modified: provider - .identity - .time_modified, - time_deleted: provider - .identity - .time_deleted, - }, - silo_id: provider.silo_id, - provider_type: - db::model::IdentityProviderType::Saml, - }) - .execute_async(&conn) - .await?; + let conn = self.pool_connection_authorized(opctx).await?; - // insert silo saml identity provider record - use db::schema::saml_identity_provider::dsl; - let result = diesel::insert_into( - dsl::saml_identity_provider, - ) - .values(provider) - .returning( - db::model::SamlIdentityProvider::as_returning(), - ) - .get_result_async(&conn) + self.transaction_retry_wrapper("saml_identity_provider_create") + .transaction(&conn, |conn| { + let provider = provider.clone(); + async move { + // insert silo identity provider record with type Saml + use db::schema::identity_provider::dsl as idp_dsl; + diesel::insert_into(idp_dsl::identity_provider) + .values(db::model::IdentityProvider { + identity: db::model::IdentityProviderIdentity { + id: provider.identity.id, + name: provider.identity.name.clone(), + description: provider + .identity + .description + .clone(), + time_created: provider.identity.time_created, + time_modified: provider.identity.time_modified, + time_deleted: provider.identity.time_deleted, + }, + silo_id: provider.silo_id, + provider_type: + db::model::IdentityProviderType::Saml, + }) + .execute_async(&conn) .await?; - Ok(result) - } - }, - retry_helper.as_callback(), - ) + // insert silo saml identity provider record + use db::schema::saml_identity_provider::dsl; + let result = + diesel::insert_into(dsl::saml_identity_provider) + .values(provider) + .returning( + db::model::SamlIdentityProvider::as_returning(), + ) + .get_result_async(&conn) + .await?; + + Ok(result) + } + }) .await .map_err(|e| { public_error_from_diesel( diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index c63cfeec34..e5baed0a17 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -231,7 +231,6 @@ impl DataStore { ) } - // TODO: Do we need this anymore... #[cfg(test)] pub(crate) fn transaction_retry_producer( &self, diff --git a/nexus/db-queries/src/db/datastore/network_interface.rs b/nexus/db-queries/src/db/datastore/network_interface.rs index b01dc16ced..4d4e43c9a7 100644 --- a/nexus/db-queries/src/db/datastore/network_interface.rs +++ b/nexus/db-queries/src/db/datastore/network_interface.rs @@ -12,7 +12,6 @@ use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::cte_utils::BoxedQuery; use crate::db::error::public_error_from_diesel; -use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::model::IncompleteNetworkInterface; use crate::db::model::Instance; @@ -25,8 +24,7 @@ use crate::db::model::VpcSubnet; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; use crate::db::queries::network_interface; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -41,7 +39,6 @@ use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use ref_cast::RefCast; use sled_agent_client::types as sled_client_types; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; /// OPTE requires information that's currently split across the network @@ -469,88 +466,78 @@ impl DataStore { FailedToUnsetPrimary(DieselError), } - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "instance_update_network_interface", - ); + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; if primary { - conn.transaction_async_with_retry(|conn| { - let err = err.clone(); - let stopped = stopped.clone(); - let update_target_query = update_target_query.clone(); - async move { - let instance_runtime = - instance_query.get_result_async(&conn).await?.runtime_state; - if instance_runtime.propolis_id.is_some() - || instance_runtime.nexus_state != stopped - { - err.set(NetworkInterfaceUpdateError::InstanceNotStopped).unwrap(); - return Err(diesel::result::Error::RollbackTransaction); - } - - // First, get the primary interface - let primary_interface = - find_primary_query.get_result_async(&conn).await?; - // If the target and primary are different, we need to toggle - // the primary into a secondary. - if primary_interface.identity.id != interface_id { - use crate::db::schema::network_interface::dsl; - if let Err(e) = diesel::update(dsl::network_interface) - .filter(dsl::id.eq(primary_interface.identity.id)) - .filter(dsl::kind.eq(NetworkInterfaceKind::Instance)) - .filter(dsl::time_deleted.is_null()) - .set(dsl::is_primary.eq(false)) - .execute_async(&conn) - .await + self.transaction_retry_wrapper("instance_update_network_interface") + .transaction(&conn, |conn| { + let err = err.clone(); + let stopped = stopped.clone(); + let update_target_query = update_target_query.clone(); + async move { + let instance_runtime = + instance_query.get_result_async(&conn).await?.runtime_state; + if instance_runtime.propolis_id.is_some() + || instance_runtime.nexus_state != stopped { - if retryable(&e) { - return Err(e); + return Err(err.bail(NetworkInterfaceUpdateError::InstanceNotStopped)); + } + + // First, get the primary interface + let primary_interface = + find_primary_query.get_result_async(&conn).await?; + // If the target and primary are different, we need to toggle + // the primary into a secondary. + if primary_interface.identity.id != interface_id { + use crate::db::schema::network_interface::dsl; + if let Err(e) = diesel::update(dsl::network_interface) + .filter(dsl::id.eq(primary_interface.identity.id)) + .filter(dsl::kind.eq(NetworkInterfaceKind::Instance)) + .filter(dsl::time_deleted.is_null()) + .set(dsl::is_primary.eq(false)) + .execute_async(&conn) + .await + { + return Err(err.bail_retryable_or_else( + e, + |e| NetworkInterfaceUpdateError::FailedToUnsetPrimary(e) + )); } - err.set(NetworkInterfaceUpdateError::FailedToUnsetPrimary(e)).unwrap(); - return Err(diesel::result::Error::RollbackTransaction); } - } - // In any case, update the actual target - update_target_query.get_result_async(&conn).await - } - }, - retry_helper.as_callback() - ) + // In any case, update the actual target + update_target_query.get_result_async(&conn).await + } + }).await } else { // In this case, we can just directly apply the updates. By // construction, `updates.primary` is `None`, so nothing will // be done there. The other columns always need to be updated, and // we're only hitting a single row. Note that we still need to // verify the instance is stopped. - conn.transaction_async_with_retry(|conn| { - let err = err.clone(); - let stopped = stopped.clone(); - let update_target_query = update_target_query.clone(); - async move { - let instance_state = - instance_query.get_result_async(&conn).await?.runtime_state; - if instance_state.propolis_id.is_some() - || instance_state.nexus_state != stopped - { - err.set(NetworkInterfaceUpdateError::InstanceNotStopped).unwrap(); - return Err(diesel::result::Error::RollbackTransaction); + self.transaction_retry_wrapper("instance_update_network_interface") + .transaction(&conn, |conn| { + let err = err.clone(); + let stopped = stopped.clone(); + let update_target_query = update_target_query.clone(); + async move { + let instance_state = + instance_query.get_result_async(&conn).await?.runtime_state; + if instance_state.propolis_id.is_some() + || instance_state.nexus_state != stopped + { + return Err(err.bail(NetworkInterfaceUpdateError::InstanceNotStopped)); + } + update_target_query.get_result_async(&conn).await } - update_target_query.get_result_async(&conn).await - } - }, - retry_helper.as_callback() - ) + }).await } - .await // Convert to `InstanceNetworkInterface` before returning, we know // this is valid as we've filtered appropriately above. .map(NetworkInterface::as_instance) .map_err(|e| { - if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + if let Some(err) = err.take() { match err { NetworkInterfaceUpdateError::InstanceNotStopped => { return Error::invalid_request( diff --git a/nexus/db-queries/src/db/datastore/project.rs b/nexus/db-queries/src/db/datastore/project.rs index 8c63f6e71b..ba0c64abfd 100644 --- a/nexus/db-queries/src/db/datastore/project.rs +++ b/nexus/db-queries/src/db/datastore/project.rs @@ -12,7 +12,6 @@ use crate::db; use crate::db::collection_insert::AsyncInsertError; use crate::db::collection_insert::DatastoreCollection; use crate::db::error::public_error_from_diesel; -use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::fixed_data::project::SERVICES_PROJECT; use crate::db::fixed_data::silo::INTERNAL_SILO_ID; @@ -24,11 +23,10 @@ use crate::db::model::ProjectUpdate; use crate::db::model::Silo; use crate::db::model::VirtualProvisioningCollection; use crate::db::pagination::paginated; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; -use diesel::result::Error as DieselError; use omicron_common::api::external::http_pagination::PaginatedBy; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; @@ -39,7 +37,6 @@ use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use ref_cast::RefCast; -use std::sync::{Arc, OnceLock}; // Generates internal functions used for validation during project deletion. // Used simply to reduce boilerplate. @@ -154,16 +151,13 @@ impl DataStore { use db::schema::project::dsl; - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "project_create_in_silo", - ); + let err = OptionalError::new(); let name = project.name().as_str().to_string(); + let conn = self.pool_connection_authorized(opctx).await?; + let db_project = self - .pool_connection_authorized(opctx) - .await? - .transaction_async_with_retry(|conn| { + .transaction_retry_wrapper("project_create_in_silo") + .transaction(&conn, |conn| { let err = err.clone(); let authz_silo_inner = authz_silo_inner.clone(); @@ -178,22 +172,21 @@ impl DataStore { .await .map_err(|e| match e { AsyncInsertError::CollectionNotFound => { - err.set(authz_silo_inner.not_found()).unwrap(); - return DieselError::RollbackTransaction; - } - AsyncInsertError::DatabaseError(e) => { - if retryable(&e) { - return e; - } - err.set(public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::Project, - &name, - ), - )).unwrap(); - return DieselError::RollbackTransaction; + err.bail(authz_silo_inner.not_found()) } + AsyncInsertError::DatabaseError(diesel_error) => err + .bail_retryable_or_else( + diesel_error, + |diesel_error| { + public_error_from_diesel( + diesel_error, + ErrorHandler::Conflict( + ResourceType::Project, + &name, + ), + ) + }, + ), })?; // Create resource provisioning for the project. @@ -207,13 +200,11 @@ impl DataStore { .await?; Ok(project) } - }, - retry_helper.as_callback(), - ) + }) .await .map_err(|e| { - if let Some(err) = err.get() { - return err.clone(); + if let Some(err) = err.take() { + return err; } public_error_from_diesel(e, ErrorHandler::Server) })?; @@ -252,17 +243,15 @@ impl DataStore { use db::schema::project::dsl; - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "project_delete", - ); - self.pool_connection_authorized(opctx) - .await? - .transaction_async_with_retry(|conn| { + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("project_delete") + .transaction(&conn, |conn| { let err = err.clone(); async move { - let now = Utc::now(); let updated_rows = diesel::update(dsl::project) + let now = Utc::now(); + let updated_rows = diesel::update(dsl::project) .filter(dsl::time_deleted.is_null()) .filter(dsl::id.eq(authz_project.id())) .filter(dsl::rcgen.eq(db_project.rcgen)) @@ -271,23 +260,22 @@ impl DataStore { .execute_async(&conn) .await .map_err(|e| { - if retryable(&e) { - return e; - } - err.set(public_error_from_diesel( - e, - ErrorHandler::NotFoundByResource(authz_project), - )).unwrap(); - DieselError::RollbackTransaction + err.bail_retryable_or_else(e, |e| { + public_error_from_diesel( + e, + ErrorHandler::NotFoundByResource( + authz_project, + ), + ) + }) })?; if updated_rows == 0 { - err.set(Error::InvalidRequest { + return Err(err.bail(Error::InvalidRequest { message: "deletion failed due to concurrent modification" .to_string(), - }).unwrap(); - return Err(DieselError::RollbackTransaction); + })); } self.virtual_provisioning_collection_delete_on_connection( @@ -298,11 +286,11 @@ impl DataStore { .await?; Ok(()) } - }, retry_helper.as_callback()) + }) .await .map_err(|e| { - if let Some(err) = err.get() { - return err.clone(); + if let Some(err) = err.take() { + return err; } public_error_from_diesel(e, ErrorHandler::Server) })?; diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index a7406db568..b678a32e5a 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -27,7 +27,7 @@ use crate::db::model::Rack; use crate::db::model::Zpool; use crate::db::pagination::paginated; use crate::db::pool::DbConnection; -use crate::transaction_retry::RetryHelper; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -570,10 +570,6 @@ impl DataStore { // NOTE: This transaction cannot yet be made retryable, as it uses // nested transactions. - // let retry_helper = RetryHelper::new( - // &self.transaction_retry_producer, - // "rack_set_initialized", - // ); let rack = self .pool_connection_authorized(opctx) .await? @@ -800,61 +796,50 @@ impl DataStore { use crate::db::schema::external_ip::dsl as extip_dsl; use crate::db::schema::service::dsl as service_dsl; - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "nexus_external_addresses", - ); - self.pool_connection_authorized(opctx) - .await? - .transaction_async_with_retry( - |conn| { - let err = err.clone(); - async move { - let ips = - extip_dsl::external_ip - .inner_join(service_dsl::service.on( - service_dsl::id.eq( - extip_dsl::parent_id.assume_not_null(), - ), - )) - .filter(extip_dsl::parent_id.is_not_null()) - .filter(extip_dsl::time_deleted.is_null()) - .filter(extip_dsl::is_service) - .filter( - service_dsl::kind - .eq(db::model::ServiceKind::Nexus), - ) - .select(ExternalIp::as_select()) - .get_results_async(&conn) - .await? - .into_iter() - .map(|external_ip| external_ip.ip.ip()) - .collect(); - - let dns_zones = self - .dns_zones_list_all_on_connection( - opctx, - &conn, - DnsGroup::External, - ) - .await - .map_err(|e| match e.retryable() { - NotRetryable(not_retryable_err) => { - err.set(not_retryable_err).unwrap(); - DieselError::RollbackTransaction - } - Retryable(retryable_err) => retryable_err, - })?; - - Ok((ips, dns_zones)) - } - }, - retry_helper.as_callback(), - ) + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + self.transaction_retry_wrapper("nexus_external_addresses") + .transaction(&conn, |conn| { + let err = err.clone(); + async move { + let ips = extip_dsl::external_ip + .inner_join( + service_dsl::service.on(service_dsl::id + .eq(extip_dsl::parent_id.assume_not_null())), + ) + .filter(extip_dsl::parent_id.is_not_null()) + .filter(extip_dsl::time_deleted.is_null()) + .filter(extip_dsl::is_service) + .filter( + service_dsl::kind.eq(db::model::ServiceKind::Nexus), + ) + .select(ExternalIp::as_select()) + .get_results_async(&conn) + .await? + .into_iter() + .map(|external_ip| external_ip.ip.ip()) + .collect(); + + let dns_zones = self + .dns_zones_list_all_on_connection( + opctx, + &conn, + DnsGroup::External, + ) + .await + .map_err(|e| match e.retryable() { + NotRetryable(not_retryable_err) => { + err.bail(not_retryable_err) + } + Retryable(retryable_err) => retryable_err, + })?; + + Ok((ips, dns_zones)) + } + }) .await .map_err(|e| { - if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + if let Some(err) = err.take() { return err.into(); } public_error_from_diesel(e, ErrorHandler::Server) diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index cfb6dbeb66..b055a3e85c 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -13,8 +13,7 @@ use crate::db::error::ErrorHandler; use crate::db::lookup::LookupPath; use crate::db::model::Dataset; use crate::db::model::Region; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use nexus_types::external_api::params; @@ -23,7 +22,6 @@ use omicron_common::api::external::DeleteResult; use omicron_common::api::external::Error; use omicron_common::nexus_config::RegionAllocationStrategy; use slog::Logger; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -164,16 +162,10 @@ impl DataStore { #[error("Numeric error: {0}")] NumericError(String), } - let err = Arc::new(OnceLock::new()); - - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "regions_hard_delete", - ); - - self.pool_connection_unauthorized() - .await? - .transaction_async_with_retry(|conn| { + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("regions_hard_delete") + .transaction(&conn, |conn| { let err = err.clone(); let region_ids = region_ids.clone(); async move { @@ -208,10 +200,9 @@ impl DataStore { let dataset_total_occupied_size: db::model::ByteCount = dataset_total_occupied_size.try_into().map_err( |e: anyhow::Error| { - err.set(RegionDeleteError::NumericError( + err.bail(RegionDeleteError::NumericError( e.to_string(), - )).unwrap(); - diesel::result::Error::RollbackTransaction + )) }, )?; @@ -230,12 +221,10 @@ impl DataStore { } Ok(()) } - }, - retry_helper.as_callback(), - ) + }) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { match err { RegionDeleteError::NumericError(err) => { return Error::internal_error( diff --git a/nexus/db-queries/src/db/datastore/silo_group.rs b/nexus/db-queries/src/db/datastore/silo_group.rs index 56a62d6b3c..29fcb7490b 100644 --- a/nexus/db-queries/src/db/datastore/silo_group.rs +++ b/nexus/db-queries/src/db/datastore/silo_group.rs @@ -15,7 +15,6 @@ use crate::db::error::TransactionError; use crate::db::model::SiloGroup; use crate::db::model::SiloGroupMembership; use crate::db::pagination::paginated; -use crate::transaction_retry::RetryHelper; use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; @@ -146,47 +145,40 @@ impl DataStore { ) -> UpdateResult<()> { opctx.authorize(authz::Action::Modify, authz_silo_user).await?; - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "silo_group_membership_replace_for_user", - ); + let conn = self.pool_connection_authorized(opctx).await?; - self.pool_connection_authorized(opctx) - .await? - .transaction_async_with_retry( - |conn| { - let silo_group_ids = silo_group_ids.clone(); - async move { - use db::schema::silo_group_membership::dsl; + self.transaction_retry_wrapper("silo_group_membership_replace_for_user") + .transaction(&conn, |conn| { + let silo_group_ids = silo_group_ids.clone(); + async move { + use db::schema::silo_group_membership::dsl; - // Delete existing memberships for user - let silo_user_id = authz_silo_user.id(); - diesel::delete(dsl::silo_group_membership) - .filter(dsl::silo_user_id.eq(silo_user_id)) - .execute_async(&conn) - .await?; + // Delete existing memberships for user + let silo_user_id = authz_silo_user.id(); + diesel::delete(dsl::silo_group_membership) + .filter(dsl::silo_user_id.eq(silo_user_id)) + .execute_async(&conn) + .await?; - // Create new memberships for user - let silo_group_memberships: Vec< - db::model::SiloGroupMembership, - > = silo_group_ids - .iter() - .map(|group_id| db::model::SiloGroupMembership { - silo_group_id: *group_id, - silo_user_id, - }) - .collect(); + // Create new memberships for user + let silo_group_memberships: Vec< + db::model::SiloGroupMembership, + > = silo_group_ids + .iter() + .map(|group_id| db::model::SiloGroupMembership { + silo_group_id: *group_id, + silo_user_id, + }) + .collect(); - diesel::insert_into(dsl::silo_group_membership) - .values(silo_group_memberships) - .execute_async(&conn) - .await?; + diesel::insert_into(dsl::silo_group_membership) + .values(silo_group_memberships) + .execute_async(&conn) + .await?; - Ok(()) - } - }, - retry_helper.as_callback(), - ) + Ok(()) + } + }) .await .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 57c56ffcda..023384a9bf 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -15,8 +15,7 @@ use crate::db::model::SledResource; use crate::db::model::SledUpdate; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -26,7 +25,6 @@ use omicron_common::api::external::DataPageParams; use omicron_common::api::external::DeleteResult; use omicron_common::api::external::ListResultVec; use omicron_common::api::external::ResourceType; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -91,76 +89,74 @@ impl DataStore { enum SledReservationError { NotFound, } - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "sled_reservation_create", - ); - self.pool_connection_authorized(opctx) - .await? - .transaction_async_with_retry( - |conn| { - // Clone variables into retryable function - let err = err.clone(); - let constraints = constraints.clone(); - let resources = resources.clone(); - - async move { - use db::schema::sled_resource::dsl as resource_dsl; - // Check if resource ID already exists - if so, return it. - let old_resource = resource_dsl::sled_resource - .filter(resource_dsl::id.eq(resource_id)) - .select(SledResource::as_select()) - .limit(1) - .load_async(&conn) - .await?; - - if !old_resource.is_empty() { - return Ok(old_resource[0].clone()); - } + let err = OptionalError::new(); + + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("sled_reservation_create") + .transaction(&conn, |conn| { + // Clone variables into retryable function + let err = err.clone(); + let constraints = constraints.clone(); + let resources = resources.clone(); + + async move { + use db::schema::sled_resource::dsl as resource_dsl; + // Check if resource ID already exists - if so, return it. + let old_resource = resource_dsl::sled_resource + .filter(resource_dsl::id.eq(resource_id)) + .select(SledResource::as_select()) + .limit(1) + .load_async(&conn) + .await?; + + if !old_resource.is_empty() { + return Ok(old_resource[0].clone()); + } - // If it doesn't already exist, find a sled with enough space - // for the resources we're requesting. - use db::schema::sled::dsl as sled_dsl; - // This answers the boolean question: - // "Does the SUM of all hardware thread usage, plus the one we're trying - // to allocate, consume less threads than exists on the sled?" - let sled_has_space_for_threads = - (diesel::dsl::sql::( - &format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::hardware_threads::NAME - ), - ) + resources.hardware_threads) - .le(sled_dsl::usable_hardware_threads); - - // This answers the boolean question: - // "Does the SUM of all RAM usage, plus the one we're trying - // to allocate, consume less RAM than exists on the sled?" - let sled_has_space_for_rss = - (diesel::dsl::sql::( - &format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::rss_ram::NAME - ), - ) + resources.rss_ram) - .le(sled_dsl::usable_physical_ram); - - // Determine whether adding this service's reservoir allocation - // to what's allocated on the sled would avoid going over quota. - let sled_has_space_in_reservoir = - (diesel::dsl::sql::( - &format!( - "COALESCE(SUM(CAST({} as INT8)), 0)", - resource_dsl::reservoir_ram::NAME - ), - ) + resources.reservoir_ram) - .le(sled_dsl::reservoir_size); - - // Generate a query describing all of the sleds that have space - // for this reservation. - let mut sled_targets = sled_dsl::sled + // If it doesn't already exist, find a sled with enough space + // for the resources we're requesting. + use db::schema::sled::dsl as sled_dsl; + // This answers the boolean question: + // "Does the SUM of all hardware thread usage, plus the one we're trying + // to allocate, consume less threads than exists on the sled?" + let sled_has_space_for_threads = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::hardware_threads::NAME + ), + ) + resources.hardware_threads) + .le(sled_dsl::usable_hardware_threads); + + // This answers the boolean question: + // "Does the SUM of all RAM usage, plus the one we're trying + // to allocate, consume less RAM than exists on the sled?" + let sled_has_space_for_rss = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::rss_ram::NAME + ), + ) + resources.rss_ram) + .le(sled_dsl::usable_physical_ram); + + // Determine whether adding this service's reservoir allocation + // to what's allocated on the sled would avoid going over quota. + let sled_has_space_in_reservoir = + (diesel::dsl::sql::( + &format!( + "COALESCE(SUM(CAST({} as INT8)), 0)", + resource_dsl::reservoir_ram::NAME + ), + ) + resources.reservoir_ram) + .le(sled_dsl::reservoir_size); + + // Generate a query describing all of the sleds that have space + // for this reservation. + let mut sled_targets = + sled_dsl::sled .left_join( resource_dsl::sled_resource .on(resource_dsl::sled_id.eq(sled_dsl::id)), @@ -179,49 +175,46 @@ impl DataStore { .select(sled_dsl::id) .into_boxed(); - // Further constrain the sled IDs according to any caller- - // supplied constraints. - if let Some(must_select_from) = - constraints.must_select_from() - { - sled_targets = sled_targets.filter( - sled_dsl::id.eq_any(must_select_from.to_vec()), - ); - } - - sql_function!(fn random() -> diesel::sql_types::Float); - let sled_targets = sled_targets - .order(random()) - .limit(1) - .get_results_async::(&conn) - .await?; - - if sled_targets.is_empty() { - err.set(SledReservationError::NotFound).unwrap(); - return Err(diesel::result::Error::NotFound); - } - - // Create a SledResource record, associate it with the target - // sled. - let resource = SledResource::new( - resource_id, - sled_targets[0], - resource_kind, - resources, + // Further constrain the sled IDs according to any caller- + // supplied constraints. + if let Some(must_select_from) = + constraints.must_select_from() + { + sled_targets = sled_targets.filter( + sled_dsl::id.eq_any(must_select_from.to_vec()), ); + } - diesel::insert_into(resource_dsl::sled_resource) - .values(resource) - .returning(SledResource::as_returning()) - .get_result_async(&conn) - .await + sql_function!(fn random() -> diesel::sql_types::Float); + let sled_targets = sled_targets + .order(random()) + .limit(1) + .get_results_async::(&conn) + .await?; + + if sled_targets.is_empty() { + return Err(err.bail(SledReservationError::NotFound)); } - }, - retry_helper.as_callback(), - ) + + // Create a SledResource record, associate it with the target + // sled. + let resource = SledResource::new( + resource_id, + sled_targets[0], + resource_kind, + resources, + ); + + diesel::insert_into(resource_dsl::sled_resource) + .values(resource) + .returning(SledResource::as_returning()) + .get_result_async(&conn) + .await + } + }) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { match err { SledReservationError::NotFound => { return external::Error::unavail( diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index 5fd6326f56..03058eda61 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -20,12 +20,10 @@ use crate::db::model::SnapshotState; use crate::db::pagination::paginated; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; -use diesel::result::Error as DieselError; use diesel::OptionalExtension; use nexus_types::identity::Resource; use omicron_common::api::external::http_pagination::PaginatedBy; @@ -37,7 +35,6 @@ use omicron_common::api::external::ResourceType; use omicron_common::api::external::UpdateResult; use omicron_common::bail_unless; use ref_cast::RefCast; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -62,106 +59,93 @@ impl DataStore { let snapshot_name = snapshot.name().to_string(); let project_id = snapshot.project_id; - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "project_ensure_snapshot", - ); + let err = OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + let snapshot: Snapshot = self - .pool_connection_authorized(opctx) - .await? - .transaction_async_with_retry( - |conn| { - let err = err.clone(); - let snapshot = snapshot.clone(); - async move { - use db::schema::snapshot::dsl; + .transaction_retry_wrapper("project_ensure_snapshot") + .transaction(&conn, |conn| { + let err = err.clone(); + let snapshot = snapshot.clone(); + async move { + use db::schema::snapshot::dsl; - // If an undeleted snapshot exists in the database with the - // same name and project but a different id to the snapshot - // this function was passed as an argument, then return an - // error here. - // - // As written below, - // - // .on_conflict((dsl::project_id, dsl::name)) - // .filter_target(dsl::time_deleted.is_null()) - // .do_update() - // .set(dsl::time_modified.eq(dsl::time_modified)) - // - // will set any existing record's `time_modified` if the - // project id and name match, even if the snapshot ID does - // not match. diesel supports adding a filter below like so - // (marked with >>): - // - // .on_conflict((dsl::project_id, dsl::name)) - // .filter_target(dsl::time_deleted.is_null()) - // .do_update() - // .set(dsl::time_modified.eq(dsl::time_modified)) - // >> .filter(dsl::id.eq(snapshot.id())) - // - // which will restrict the `insert_into`'s set so that it - // only applies if the snapshot ID matches. But, - // AsyncInsertError does not have a ObjectAlreadyExists - // variant, so this will be returned as CollectionNotFound - // due to the `insert_into` failing. - // - // If this function is passed a snapshot with an ID that - // does not match, but a project and name that does, return - // ObjectAlreadyExists here. + // If an undeleted snapshot exists in the database with the + // same name and project but a different id to the snapshot + // this function was passed as an argument, then return an + // error here. + // + // As written below, + // + // .on_conflict((dsl::project_id, dsl::name)) + // .filter_target(dsl::time_deleted.is_null()) + // .do_update() + // .set(dsl::time_modified.eq(dsl::time_modified)) + // + // will set any existing record's `time_modified` if the + // project id and name match, even if the snapshot ID does + // not match. diesel supports adding a filter below like so + // (marked with >>): + // + // .on_conflict((dsl::project_id, dsl::name)) + // .filter_target(dsl::time_deleted.is_null()) + // .do_update() + // .set(dsl::time_modified.eq(dsl::time_modified)) + // >> .filter(dsl::id.eq(snapshot.id())) + // + // which will restrict the `insert_into`'s set so that it + // only applies if the snapshot ID matches. But, + // AsyncInsertError does not have a ObjectAlreadyExists + // variant, so this will be returned as CollectionNotFound + // due to the `insert_into` failing. + // + // If this function is passed a snapshot with an ID that + // does not match, but a project and name that does, return + // ObjectAlreadyExists here. - let existing_snapshot_id: Option = dsl::snapshot - .filter(dsl::time_deleted.is_null()) - .filter(dsl::name.eq(snapshot.name().to_string())) - .filter(dsl::project_id.eq(snapshot.project_id)) - .select(dsl::id) - .limit(1) - .first_async(&conn) - .await - .optional()?; + let existing_snapshot_id: Option = dsl::snapshot + .filter(dsl::time_deleted.is_null()) + .filter(dsl::name.eq(snapshot.name().to_string())) + .filter(dsl::project_id.eq(snapshot.project_id)) + .select(dsl::id) + .limit(1) + .first_async(&conn) + .await + .optional()?; - if let Some(existing_snapshot_id) = existing_snapshot_id - { - if existing_snapshot_id != snapshot.id() { - err.set(CustomError::ResourceAlreadyExists) - .unwrap(); - return Err(DieselError::RollbackTransaction); - } + if let Some(existing_snapshot_id) = existing_snapshot_id { + if existing_snapshot_id != snapshot.id() { + return Err( + err.bail(CustomError::ResourceAlreadyExists) + ); } - - Project::insert_resource( - project_id, - diesel::insert_into(dsl::snapshot) - .values(snapshot) - .on_conflict((dsl::project_id, dsl::name)) - .filter_target(dsl::time_deleted.is_null()) - .do_update() - .set(dsl::time_modified.eq(dsl::time_modified)), - ) - .insert_and_get_result_async(&conn) - .await - .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => { - err.set(CustomError::InsertError( - Error::ObjectNotFound { - type_name: ResourceType::Project, - lookup_type: LookupType::ById( - project_id, - ), - }, - )) - .unwrap(); - DieselError::RollbackTransaction - } - AsyncInsertError::DatabaseError(e) => e, - }) } - }, - retry_helper.as_callback(), - ) + + Project::insert_resource( + project_id, + diesel::insert_into(dsl::snapshot) + .values(snapshot) + .on_conflict((dsl::project_id, dsl::name)) + .filter_target(dsl::time_deleted.is_null()) + .do_update() + .set(dsl::time_modified.eq(dsl::time_modified)), + ) + .insert_and_get_result_async(&conn) + .await + .map_err(|e| match e { + AsyncInsertError::CollectionNotFound => err.bail( + CustomError::InsertError(Error::ObjectNotFound { + type_name: ResourceType::Project, + lookup_type: LookupType::ById(project_id), + }), + ), + AsyncInsertError::DatabaseError(e) => e, + }) + } + }) .await .map_err(|e| { - if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + if let Some(err) = err.take() { match err { CustomError::ResourceAlreadyExists => { Error::ObjectAlreadyExists { diff --git a/nexus/db-queries/src/db/datastore/switch_interface.rs b/nexus/db-queries/src/db/datastore/switch_interface.rs index a671de2358..67f16fa08f 100644 --- a/nexus/db-queries/src/db/datastore/switch_interface.rs +++ b/nexus/db-queries/src/db/datastore/switch_interface.rs @@ -13,9 +13,8 @@ use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; use crate::db::model::LoopbackAddress; use crate::db::pagination::paginated; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; -use diesel::result::Error as DieselError; +use crate::transaction_retry::OptionalError; +use async_bb8_diesel::AsyncRunQueryDsl; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use ipnetwork::IpNetwork; use nexus_types::external_api::params::LoopbackAddressCreate; @@ -23,7 +22,6 @@ use omicron_common::api::external::{ CreateResult, DataPageParams, DeleteResult, Error, ListResultVec, LookupResult, ResourceType, }; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -46,16 +44,12 @@ impl DataStore { let inet = IpNetwork::new(params.address, params.mask) .map_err(|_| Error::invalid_request("invalid address"))?; - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "loopback_address_create", - ); + let err = OptionalError::new(); // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - conn.transaction_async_with_retry( - |conn| { + self.transaction_retry_wrapper("loopback_address_create") + .transaction(&conn, |conn| { let err = err.clone(); async move { let lot_id = authz_address_lot.id(); @@ -68,13 +62,9 @@ impl DataStore { ) .await .map_err(|e| match e { - ReserveBlockTxnError::CustomError(e) => { - err.set( - LoopbackAddressCreateError::ReserveBlock(e), - ) - .unwrap(); - DieselError::RollbackTransaction - } + ReserveBlockTxnError::CustomError(e) => err.bail( + LoopbackAddressCreateError::ReserveBlock(e), + ), ReserveBlockTxnError::Database(e) => e, })?; @@ -99,30 +89,28 @@ impl DataStore { Ok(db_addr) } - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| { - if let Some(err) = err.get() { - match err { - LoopbackAddressCreateError::ReserveBlock( - ReserveBlockError::AddressUnavailable, - ) => Error::invalid_request("address unavailable"), - LoopbackAddressCreateError::ReserveBlock( - ReserveBlockError::AddressNotInLot, - ) => Error::invalid_request("address not in lot"), + }) + .await + .map_err(|e| { + if let Some(err) = err.take() { + match err { + LoopbackAddressCreateError::ReserveBlock( + ReserveBlockError::AddressUnavailable, + ) => Error::invalid_request("address unavailable"), + LoopbackAddressCreateError::ReserveBlock( + ReserveBlockError::AddressNotInLot, + ) => Error::invalid_request("address not in lot"), + } + } else { + public_error_from_diesel( + e, + ErrorHandler::Conflict( + ResourceType::LoopbackAddress, + &format!("lo {}", inet), + ), + ) } - } else { - public_error_from_diesel( - e, - ErrorHandler::Conflict( - ResourceType::LoopbackAddress, - &format!("lo {}", inet), - ), - ) - } - }) + }) } pub async fn loopback_address_delete( @@ -139,12 +127,8 @@ impl DataStore { // TODO https://github.com/oxidecomputer/omicron/issues/2811 // Audit external networking database transaction usage - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "loopback_address_delete", - ); - conn.transaction_async_with_retry( - |conn| async move { + self.transaction_retry_wrapper("loopback_address_delete") + .transaction(&conn, |conn| async move { let la = diesel::delete(dsl::loopback_address) .filter(dsl::id.eq(id)) .returning(LoopbackAddress::as_returning()) @@ -157,11 +141,9 @@ impl DataStore { .await?; Ok(()) - }, - retry_helper.as_callback(), - ) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + }) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } pub async fn loopback_address_get( diff --git a/nexus/db-queries/src/db/datastore/volume.rs b/nexus/db-queries/src/db/datastore/volume.rs index b754df7a9d..4f31efd610 100644 --- a/nexus/db-queries/src/db/datastore/volume.rs +++ b/nexus/db-queries/src/db/datastore/volume.rs @@ -7,7 +7,6 @@ use super::DataStore; use crate::db; use crate::db::error::public_error_from_diesel; -use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::identity::Asset; use crate::db::model::Dataset; @@ -15,12 +14,10 @@ use crate::db::model::Region; use crate::db::model::RegionSnapshot; use crate::db::model::Volume; use crate::db::queries::volume::DecreaseCrucibleResourceCountAndSoftDeleteVolume; -use crate::transaction_retry::RetryHelper; +use crate::transaction_retry::OptionalError; use anyhow::bail; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; -use diesel::result::Error as DieselError; use diesel::OptionalExtension; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DeleteResult; @@ -32,7 +29,6 @@ use serde::Deserialize; use serde::Deserializer; use serde::Serialize; use sled_agent_client::types::VolumeConstructionRequest; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -68,47 +64,42 @@ impl DataStore { crucible_targets }; - let err = Arc::new(OnceLock::new()); - let retry_helper = - RetryHelper::new(&self.transaction_retry_producer, "volume_create"); - self.pool_connection_unauthorized() - .await? - .transaction_async_with_retry( - |conn| { - let err = err.clone(); - let crucible_targets = crucible_targets.clone(); - let volume = volume.clone(); - async move { - let maybe_volume: Option = dsl::volume - .filter(dsl::id.eq(volume.id())) - .select(Volume::as_select()) - .first_async(&conn) - .await - .optional()?; + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("volume_create") + .transaction(&conn, |conn| { + let err = err.clone(); + let crucible_targets = crucible_targets.clone(); + let volume = volume.clone(); + async move { + let maybe_volume: Option = dsl::volume + .filter(dsl::id.eq(volume.id())) + .select(Volume::as_select()) + .first_async(&conn) + .await + .optional()?; - // If the volume existed already, return it and do not increase - // usage counts. - if let Some(volume) = maybe_volume { - return Ok(volume); - } + // If the volume existed already, return it and do not increase + // usage counts. + if let Some(volume) = maybe_volume { + return Ok(volume); + } - // TODO do we need on_conflict do_nothing here? if the transaction - // model is read-committed, the SELECT above could return nothing, - // and the INSERT here could still result in a conflict. - // - // See also https://github.com/oxidecomputer/omicron/issues/1168 - let volume: Volume = diesel::insert_into(dsl::volume) - .values(volume.clone()) - .on_conflict(dsl::id) - .do_nothing() - .returning(Volume::as_returning()) - .get_result_async(&conn) - .await - .map_err(|e| { - if retryable(&e) { - return e; - } - err.set(VolumeCreationError::Public( + // TODO do we need on_conflict do_nothing here? if the transaction + // model is read-committed, the SELECT above could return nothing, + // and the INSERT here could still result in a conflict. + // + // See also https://github.com/oxidecomputer/omicron/issues/1168 + let volume: Volume = diesel::insert_into(dsl::volume) + .values(volume.clone()) + .on_conflict(dsl::id) + .do_nothing() + .returning(Volume::as_returning()) + .get_result_async(&conn) + .await + .map_err(|e| { + err.bail_retryable_or_else(e, |e| { + VolumeCreationError::Public( public_error_from_diesel( e, ErrorHandler::Conflict( @@ -116,41 +107,37 @@ impl DataStore { volume.id().to_string().as_str(), ), ), - )) - .unwrap(); - DieselError::RollbackTransaction - })?; - - // Increase the usage count for Crucible resources according to the - // contents of the volume. - - // Increase the number of uses for each referenced region snapshot. - use db::schema::region_snapshot::dsl as rs_dsl; - for read_only_target in - &crucible_targets.read_only_targets - { - diesel::update(rs_dsl::region_snapshot) - .filter( - rs_dsl::snapshot_addr - .eq(read_only_target.clone()), ) - .filter(rs_dsl::deleting.eq(false)) - .set( - rs_dsl::volume_references - .eq(rs_dsl::volume_references + 1), - ) - .execute_async(&conn) - .await?; - } + }) + })?; - Ok(volume) + // Increase the usage count for Crucible resources according to the + // contents of the volume. + + // Increase the number of uses for each referenced region snapshot. + use db::schema::region_snapshot::dsl as rs_dsl; + for read_only_target in &crucible_targets.read_only_targets + { + diesel::update(rs_dsl::region_snapshot) + .filter( + rs_dsl::snapshot_addr + .eq(read_only_target.clone()), + ) + .filter(rs_dsl::deleting.eq(false)) + .set( + rs_dsl::volume_references + .eq(rs_dsl::volume_references + 1), + ) + .execute_async(&conn) + .await?; } - }, - retry_helper.as_callback(), - ) + + Ok(volume) + } + }) .await .map_err(|e| { - if let Some(err) = Arc::try_unwrap(err).unwrap().take() { + if let Some(err) = err.take() { match err { VolumeCreationError::Public(err) => err, VolumeCreationError::SerdeError(err) => { @@ -221,14 +208,11 @@ impl DataStore { // types that require it). The generation number (along with the // rest of the volume data) that was in the database is what is // returned to the caller. - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "volume_checkout", - ); - self.pool_connection_unauthorized() - .await? - .transaction_async_with_retry(|conn| { + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + + self.transaction_retry_wrapper("volume_checkout") + .transaction(&conn, |conn| { let err = err.clone(); async move { // Grab the volume in question. @@ -241,8 +225,7 @@ impl DataStore { // Turn the volume.data into the VolumeConstructionRequest let vcr: VolumeConstructionRequest = serde_json::from_str(volume.data()).map_err(|e| { - err.set(VolumeGetError::SerdeError(e)).unwrap(); - DieselError::RollbackTransaction + err.bail(VolumeGetError::SerdeError(e)) })?; // Look to see if the VCR is a Volume type, and if so, look at @@ -302,8 +285,7 @@ impl DataStore { &new_vcr, ) .map_err(|e| { - err.set(VolumeGetError::SerdeError(e)).unwrap(); - DieselError::RollbackTransaction + err.bail(VolumeGetError::SerdeError(e)) })?; // Update the original volume_id with the new @@ -320,13 +302,12 @@ impl DataStore { // not, then something is terribly wrong in the // database. if num_updated != 1 { - err.set( + return Err(err.bail( VolumeGetError::UnexpectedDatabaseUpdate( num_updated, 1, ), - ).unwrap(); - return Err(DieselError::RollbackTransaction); + )); } } } @@ -355,10 +336,10 @@ impl DataStore { } Ok(volume) } - }, retry_helper.as_callback()) + }) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { return Error::internal_error(&format!("Transaction error: {}", err)); } public_error_from_diesel(e, ErrorHandler::Server) @@ -677,14 +658,10 @@ impl DataStore { // data from original volume_id. // - Put the new temp VCR into the temp volume.data, update the // temp_volume in the database. - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "volume_remove_rop", - ); - self.pool_connection_unauthorized() - .await? - .transaction_async_with_retry(|conn| { + let err = OptionalError::new(); + let conn = self.pool_connection_unauthorized().await?; + self.transaction_retry_wrapper("volume_remove_rop") + .transaction(&conn, |conn| { let err = err.clone(); async move { // Grab the volume in question. If the volume record was already @@ -724,12 +701,11 @@ impl DataStore { volume.data() ) .map_err(|e| { - err.set( + err.bail( RemoveReadOnlyParentError::SerdeError( e, ) - ).unwrap(); - DieselError::RollbackTransaction + ) })?; match vcr { @@ -757,10 +733,9 @@ impl DataStore { &new_vcr ) .map_err(|e| { - err.set(RemoveReadOnlyParentError::SerdeError( + err.bail(RemoveReadOnlyParentError::SerdeError( e, - )).unwrap(); - DieselError::RollbackTransaction + )) })?; // Update the original volume_id with the new @@ -776,8 +751,7 @@ impl DataStore { // not, then something is terribly wrong in the // database. if num_updated != 1 { - err.set(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1)).unwrap(); - return Err(DieselError::RollbackTransaction); + return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); } // Make a new VCR, with the information from @@ -794,10 +768,9 @@ impl DataStore { &rop_vcr ) .map_err(|e| { - err.set(RemoveReadOnlyParentError::SerdeError( + err.bail(RemoveReadOnlyParentError::SerdeError( e, - )).unwrap(); - DieselError::RollbackTransaction + )) })?; // Update the temp_volume_id with the volume // data that contains the read_only_parent. @@ -809,8 +782,7 @@ impl DataStore { .execute_async(&conn) .await?; if num_updated != 1 { - err.set(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1)).unwrap(); - return Err(DieselError::RollbackTransaction); + return Err(err.bail(RemoveReadOnlyParentError::UnexpectedDatabaseUpdate(num_updated, 1))); } Ok(true) } @@ -828,10 +800,10 @@ impl DataStore { } } } - }, retry_helper.as_callback()) + }) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { return Error::internal_error(&format!("Transaction error: {}", err)); } public_error_from_diesel(e, ErrorHandler::Server) diff --git a/nexus/db-queries/src/db/datastore/vpc.rs b/nexus/db-queries/src/db/datastore/vpc.rs index 67ee644942..069ce63028 100644 --- a/nexus/db-queries/src/db/datastore/vpc.rs +++ b/nexus/db-queries/src/db/datastore/vpc.rs @@ -36,8 +36,7 @@ use crate::db::queries::vpc::InsertVpcQuery; use crate::db::queries::vpc::VniSearchIter; use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery; use crate::db::queries::vpc_subnet::SubnetError; -use crate::transaction_retry::RetryHelper; -use async_bb8_diesel::AsyncConnection; +use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; @@ -61,7 +60,6 @@ use omicron_common::api::external::UpdateResult; use omicron_common::api::external::Vni as ExternalVni; use ref_cast::RefCast; use std::collections::BTreeMap; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; impl DataStore { @@ -586,56 +584,46 @@ impl DataStore { CollectionNotFound, } - let err = Arc::new(OnceLock::new()); - let retry_helper = RetryHelper::new( - &self.transaction_retry_producer, - "vpc_update_firewall_rules", - ); + let err = OptionalError::new(); // TODO-scalability: Ideally this would be a CTE so we don't need to // hold a transaction open across multiple roundtrips from the database, // but for now we're using a transaction due to the severely decreased // legibility of CTEs via diesel right now. - self.pool_connection_authorized(opctx) - .await? - .transaction_async_with_retry( - |conn| { - let err = err.clone(); - let delete_old_query = delete_old_query.clone(); - let rules = rules.clone(); - async move { - delete_old_query.execute_async(&conn).await?; - - // The generation count update on the vpc table row will take a - // write lock on the row, ensuring that the vpc was not deleted - // concurently. - if rules_is_empty { - return Ok(vec![]); - } - Vpc::insert_resource( - authz_vpc.id(), - diesel::insert_into(dsl::vpc_firewall_rule) - .values(rules), - ) - .insert_and_get_results_async(&conn) - .await - .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => { - err.set( - FirewallUpdateError::CollectionNotFound, - ) - .unwrap(); - return DieselError::RollbackTransaction; - } - AsyncInsertError::DatabaseError(e) => e, - }) + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("vpc_update_firewall_rules") + .transaction(&conn, |conn| { + let err = err.clone(); + let delete_old_query = delete_old_query.clone(); + let rules = rules.clone(); + async move { + delete_old_query.execute_async(&conn).await?; + + // The generation count update on the vpc table row will take a + // write lock on the row, ensuring that the vpc was not deleted + // concurently. + if rules_is_empty { + return Ok(vec![]); } - }, - retry_helper.as_callback(), - ) + Vpc::insert_resource( + authz_vpc.id(), + diesel::insert_into(dsl::vpc_firewall_rule) + .values(rules), + ) + .insert_and_get_results_async(&conn) + .await + .map_err(|e| match e { + AsyncInsertError::CollectionNotFound => { + err.bail(FirewallUpdateError::CollectionNotFound) + } + AsyncInsertError::DatabaseError(e) => e, + }) + } + }) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { match err { FirewallUpdateError::CollectionNotFound => { Error::not_found_by_id( diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs index 1cbf72c121..6675eb7c71 100644 --- a/nexus/db-queries/src/transaction_retry.rs +++ b/nexus/db-queries/src/transaction_retry.rs @@ -241,7 +241,6 @@ mod test { use super::*; use crate::db::datastore::datastore_test; - use async_bb8_diesel::AsyncConnection; use nexus_test_utils::db::test_setup_database; use omicron_test_utils::dev; use oximeter::types::FieldValue; @@ -259,18 +258,15 @@ mod test { let conn = datastore.pool_connection_for_tests().await.unwrap(); - let retry_helper = RetryHelper::new( - datastore.transaction_retry_producer(), - "test_transaction_rollback_produces_no_samples", - ); - conn.transaction_async_with_retry( - |_conn| async move { + datastore + .transaction_retry_wrapper( + "test_transaction_rollback_produces_no_samples", + ) + .transaction(&conn, |_conn| async move { Err::<(), _>(diesel::result::Error::RollbackTransaction) - }, - retry_helper.as_callback(), - ) - .await - .expect_err("Should have failed"); + }) + .await + .expect_err("Should have failed"); let samples = datastore .transaction_retry_producer() @@ -294,22 +290,18 @@ mod test { let (_opctx, datastore) = datastore_test(&logctx, &db).await; let conn = datastore.pool_connection_for_tests().await.unwrap(); - - let retry_helper = RetryHelper::new( - datastore.transaction_retry_producer(), - "test_transaction_retry_produces_samples", - ); - conn.transaction_async_with_retry( - |_conn| async move { + datastore + .transaction_retry_wrapper( + "test_transaction_retry_produces_samples", + ) + .transaction(&conn, |_conn| async move { Err::<(), _>(diesel::result::Error::DatabaseError( diesel::result::DatabaseErrorKind::SerializationFailure, Box::new("restart transaction: Retry forever!".to_string()), )) - }, - retry_helper.as_callback(), - ) - .await - .expect_err("Should have failed"); + }) + .await + .expect_err("Should have failed"); let samples = datastore .transaction_retry_producer() From 246ac8a924c66ba451647ef3535e50fcdc6d07be Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 Dec 2023 15:35:21 -0800 Subject: [PATCH 19/25] use retry wrapper everywhere, even for schema changes --- .../src/db/datastore/db_metadata.rs | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs index b7c99d7c92..e579bb8476 100644 --- a/nexus/db-queries/src/db/datastore/db_metadata.rs +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -8,9 +8,7 @@ use super::DataStore; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, -}; +use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use camino::{Utf8Path, Utf8PathBuf}; use chrono::Utc; use diesel::prelude::*; @@ -414,23 +412,26 @@ impl DataStore { target: &SemverVersion, sql: &String, ) -> Result<(), Error> { - let result = self.pool_connection_unauthorized().await?.transaction_async_with_retry(|conn| async move { - if target.to_string() != EARLIEST_SUPPORTED_VERSION { - let validate_version_query = format!("SELECT CAST(\ - IF(\ - (\ - SELECT version = '{current}' and target_version = '{target}'\ - FROM omicron.public.db_metadata WHERE singleton = true\ - ),\ - 'true',\ - 'Invalid starting version for schema change'\ - ) AS BOOL\ - );"); - conn.batch_execute_async(&validate_version_query).await?; - } - conn.batch_execute_async(&sql).await?; - Ok(()) - }, || async move { true }).await; + let conn = self.pool_connection_unauthorized().await?; + + let result = self.transaction_retry_wrapper("apply_schema_update") + .transaction(&conn, |conn| async move { + if target.to_string() != EARLIEST_SUPPORTED_VERSION { + let validate_version_query = format!("SELECT CAST(\ + IF(\ + (\ + SELECT version = '{current}' and target_version = '{target}'\ + FROM omicron.public.db_metadata WHERE singleton = true\ + ),\ + 'true',\ + 'Invalid starting version for schema change'\ + ) AS BOOL\ + );"); + conn.batch_execute_async(&validate_version_query).await?; + } + conn.batch_execute_async(&sql).await?; + Ok(()) + }).await; match result { Ok(()) => Ok(()), From d6f76922bc77d2cc560250813265bb278cd61a97 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 Dec 2023 15:39:55 -0800 Subject: [PATCH 20/25] one more error cleanup --- .../src/db/datastore/switch_port.rs | 24 +++-------- nexus/preprocessed_configs/config.xml | 41 +++++++++++++++++++ 2 files changed, 47 insertions(+), 18 deletions(-) create mode 100644 nexus/preprocessed_configs/config.xml diff --git a/nexus/db-queries/src/db/datastore/switch_port.rs b/nexus/db-queries/src/db/datastore/switch_port.rs index e4747327ea..221feee23c 100644 --- a/nexus/db-queries/src/db/datastore/switch_port.rs +++ b/nexus/db-queries/src/db/datastore/switch_port.rs @@ -10,7 +10,6 @@ use crate::db::datastore::address_lot::{ }; use crate::db::datastore::UpdatePrecondition; use crate::db::error::public_error_from_diesel; -use crate::db::error::retryable; use crate::db::error::ErrorHandler; use crate::db::model::{ LldpServiceConfig, Name, SwitchInterfaceConfig, SwitchPort, @@ -22,7 +21,6 @@ use crate::db::model::{ use crate::db::pagination::paginated; use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; -use diesel::result::Error as DieselError; use diesel::{ CombineDsl, ExpressionMethods, JoinOnDsl, NullableExpressionMethods, QueryDsl, SelectableHelper, @@ -35,7 +33,6 @@ use omicron_common::api::external::{ }; use ref_cast::RefCast; use serde::{Deserialize, Serialize}; -use std::sync::{Arc, OnceLock}; use uuid::Uuid; #[derive(Clone, Debug, Deserialize, Serialize)] @@ -863,7 +860,7 @@ impl DataStore { RackNotFound, } - let err = Arc::new(OnceLock::new()); + let err = OptionalError::new(); let conn = self.pool_connection_authorized(opctx).await?; let switch_port = SwitchPort::new( @@ -888,19 +885,10 @@ impl DataStore { .first_async::(&conn) .await .map_err(|e| { - // TODO: - // IF retryable - // e - // OTHERWISE - // set a custom error - // e: RollbackTransaction - - if retryable(&e) { - return e; - } - err.set(SwitchPortCreateError::RackNotFound) - .unwrap(); - DieselError::RollbackTransaction + err.bail_retryable_or( + e, + SwitchPortCreateError::RackNotFound, + ) })?; // insert switch port @@ -917,7 +905,7 @@ impl DataStore { }) .await .map_err(|e| { - if let Some(err) = err.get() { + if let Some(err) = err.take() { match err { SwitchPortCreateError::RackNotFound => { Error::invalid_request("rack not found") diff --git a/nexus/preprocessed_configs/config.xml b/nexus/preprocessed_configs/config.xml new file mode 100644 index 0000000000..9b13f12aea --- /dev/null +++ b/nexus/preprocessed_configs/config.xml @@ -0,0 +1,41 @@ + + + + + trace + true + + + 8123 + 9000 + 9004 + + ./ + + true + + + + + + + ::/0 + + + default + default + 1 + + + + + + + + + + + \ No newline at end of file From 7fff381255e2a70e6a7878423fe2b0547638da16 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 Dec 2023 15:42:42 -0800 Subject: [PATCH 21/25] Remove config.xml from tests --- nexus/preprocessed_configs/config.xml | 41 --------------------------- 1 file changed, 41 deletions(-) delete mode 100644 nexus/preprocessed_configs/config.xml diff --git a/nexus/preprocessed_configs/config.xml b/nexus/preprocessed_configs/config.xml deleted file mode 100644 index 9b13f12aea..0000000000 --- a/nexus/preprocessed_configs/config.xml +++ /dev/null @@ -1,41 +0,0 @@ - - - - - trace - true - - - 8123 - 9000 - 9004 - - ./ - - true - - - - - - - ::/0 - - - default - default - 1 - - - - - - - - - - - \ No newline at end of file From a54925eb7c66a2ffdb8778dbc1d8607b6c8e44f5 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 1 Dec 2023 17:49:10 -0800 Subject: [PATCH 22/25] Make tests happier --- nexus/db-queries/src/db/datastore/dns.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/dns.rs b/nexus/db-queries/src/db/datastore/dns.rs index 19b8b36102..cfd25d6a4f 100644 --- a/nexus/db-queries/src/db/datastore/dns.rs +++ b/nexus/db-queries/src/db/datastore/dns.rs @@ -1668,6 +1668,10 @@ mod test { let conn = datastore.pool_connection_for_tests().await.unwrap(); let error = datastore.dns_update(&opctx, &conn, update).await.unwrap_err(); + let error = match error { + TransactionError::CustomError(err) => err, + _ => panic!("Unexpected error: {:?}", error), + }; assert_eq!( error.to_string(), "Internal Error: updated wrong number of dns_name \ @@ -1691,11 +1695,15 @@ mod test { update.add_name(String::from("n2"), records1.clone()).unwrap(); let conn = datastore.pool_connection_for_tests().await.unwrap(); - let error = - datastore.dns_update(&opctx, &conn, update).await.unwrap_err(); + let error = Error::from( + datastore.dns_update(&opctx, &conn, update).await.unwrap_err(), + ); let msg = error.to_string(); - assert!(msg.starts_with("Internal Error: ")); - assert!(msg.contains("violates unique constraint")); + assert!(msg.starts_with("Internal Error: "), "Message: {msg:}"); + assert!( + msg.contains("violates unique constraint"), + "Message: {msg:}" + ); } let dns_config = datastore From ec9f4500b1b85e9db0564308db7c9a7b0982ca6e Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 4 Dec 2023 12:47:15 -0800 Subject: [PATCH 23/25] Add more automated retries --- .../src/db/datastore/device_auth.rs | 53 ++++++------ nexus/db-queries/src/db/datastore/mod.rs | 2 +- nexus/db-queries/src/db/datastore/rack.rs | 10 ++- nexus/db-queries/src/db/datastore/update.rs | 56 +++++++------ nexus/db-queries/src/transaction_retry.rs | 4 +- nexus/src/app/background/dns_config.rs | 11 ++- nexus/src/app/background/init.rs | 8 +- nexus/src/app/sagas/disk_create.rs | 32 ++++--- nexus/src/app/sagas/instance_create.rs | 83 +++++++++---------- nexus/src/app/sagas/project_create.rs | 15 ++-- nexus/src/app/sagas/test_helpers.rs | 25 +++--- 11 files changed, 144 insertions(+), 155 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/device_auth.rs b/nexus/db-queries/src/db/datastore/device_auth.rs index e1facb43f6..8d8e09744c 100644 --- a/nexus/db-queries/src/db/datastore/device_auth.rs +++ b/nexus/db-queries/src/db/datastore/device_auth.rs @@ -10,10 +10,8 @@ use crate::context::OpContext; use crate::db; use crate::db::error::public_error_from_diesel; use crate::db::error::ErrorHandler; -use crate::db::error::TransactionError; use crate::db::model::DeviceAccessToken; use crate::db::model::DeviceAuthRequest; -use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use omicron_common::api::external::CreateResult; @@ -75,35 +73,40 @@ impl DataStore { RequestNotFound, TooManyRequests, } - type TxnError = TransactionError; - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - match delete_request.execute_async(&conn).await? { - 0 => { - Err(TxnError::CustomError(TokenGrantError::RequestNotFound)) + let err = crate::transaction_retry::OptionalError::new(); + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("device_access_token_create") + .transaction(&conn, |conn| { + let err = err.clone(); + let insert_token = insert_token.clone(); + let delete_request = delete_request.clone(); + async move { + match delete_request.execute_async(&conn).await? { + 0 => Err(err.bail(TokenGrantError::RequestNotFound)), + 1 => Ok(insert_token.get_result_async(&conn).await?), + _ => Err(err.bail(TokenGrantError::TooManyRequests)), } - 1 => Ok(insert_token.get_result_async(&conn).await?), - _ => Err(TxnError::CustomError( - TokenGrantError::TooManyRequests, - )), } }) .await - .map_err(|e| match e { - TxnError::CustomError(TokenGrantError::RequestNotFound) => { - Error::ObjectNotFound { - type_name: ResourceType::DeviceAuthRequest, - lookup_type: LookupType::ByCompositeId( - authz_request.id(), - ), + .map_err(|e| { + if let Some(err) = err.take() { + match err { + TokenGrantError::RequestNotFound => { + Error::ObjectNotFound { + type_name: ResourceType::DeviceAuthRequest, + lookup_type: LookupType::ByCompositeId( + authz_request.id(), + ), + } + } + TokenGrantError::TooManyRequests => { + Error::internal_error("unexpectedly found multiple device auth requests for the same user code") + } } - } - TxnError::CustomError(TokenGrantError::TooManyRequests) => { - Error::internal_error("unexpectedly found multiple device auth requests for the same user code") - } - TxnError::Database(e) => { + } else { public_error_from_diesel(e, ErrorHandler::Server) } }) diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index e5baed0a17..2e7f9da5b7 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -221,7 +221,7 @@ impl DataStore { /// Constructs a transaction retry helper /// /// Automatically wraps the underlying producer - pub(crate) fn transaction_retry_wrapper( + pub fn transaction_retry_wrapper( &self, name: &'static str, ) -> crate::transaction_retry::RetryHelper { diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index b678a32e5a..a69386cfd0 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -1076,14 +1076,16 @@ mod test { async fn [](db: &DataStore) -> Vec<$model> { use crate::db::schema::$table::dsl; use nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL; - db.pool_connection_for_tests() + let conn = db.pool_connection_for_tests() .await - .unwrap() - .transaction_async(|conn| async move { + .unwrap(); + + db.transaction_retry_wrapper(concat!("fn_to_get_all_", stringify!($table))) + .transaction(&conn, |conn| async move { conn.batch_execute_async(ALLOW_FULL_TABLE_SCAN_SQL) .await .unwrap(); - Ok::<_, crate::db::TransactionError<()>>( + Ok( dsl::$table .select($model::as_select()) .get_results_async(&conn) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index 8b1eecb781..0790bd458e 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -8,15 +8,13 @@ use super::DataStore; use crate::authz; use crate::context::OpContext; use crate::db; -use crate::db::error::{ - public_error_from_diesel, ErrorHandler, TransactionError, -}; +use crate::db::error::{public_error_from_diesel, ErrorHandler}; use crate::db::model::{ ComponentUpdate, SemverVersion, SystemUpdate, UpdateArtifact, UpdateDeployment, UpdateStatus, UpdateableComponent, }; use crate::db::pagination::paginated; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; +use async_bb8_diesel::AsyncRunQueryDsl; use chrono::Utc; use diesel::prelude::*; use nexus_db_model::SystemUpdateComponentUpdate; @@ -141,36 +139,40 @@ impl DataStore { let version_string = update.version.to_string(); - self.pool_connection_authorized(opctx) - .await? - .transaction_async(|conn| async move { - let db_update = diesel::insert_into(component_update::table) - .values(update.clone()) - .returning(ComponentUpdate::as_returning()) - .get_result_async(&conn) - .await?; - - diesel::insert_into(join_table::table) - .values(SystemUpdateComponentUpdate { - system_update_id, - component_update_id: update.id(), - }) - .returning(SystemUpdateComponentUpdate::as_returning()) - .get_result_async(&conn) - .await?; - - Ok(db_update) + let conn = self.pool_connection_authorized(opctx).await?; + + self.transaction_retry_wrapper("create_component_update") + .transaction(&conn, |conn| { + let update = update.clone(); + async move { + let db_update = + diesel::insert_into(component_update::table) + .values(update.clone()) + .returning(ComponentUpdate::as_returning()) + .get_result_async(&conn) + .await?; + + diesel::insert_into(join_table::table) + .values(SystemUpdateComponentUpdate { + system_update_id, + component_update_id: update.id(), + }) + .returning(SystemUpdateComponentUpdate::as_returning()) + .get_result_async(&conn) + .await?; + + Ok(db_update) + } }) .await - .map_err(|e| match e { - TransactionError::CustomError(e) => e, - TransactionError::Database(e) => public_error_from_diesel( + .map_err(|e| { + public_error_from_diesel( e, ErrorHandler::Conflict( ResourceType::ComponentUpdate, &version_string, ), - ), + ) }) } diff --git a/nexus/db-queries/src/transaction_retry.rs b/nexus/db-queries/src/transaction_retry.rs index 6675eb7c71..c474b729f8 100644 --- a/nexus/db-queries/src/transaction_retry.rs +++ b/nexus/db-queries/src/transaction_retry.rs @@ -73,7 +73,7 @@ impl RetryHelperInner { /// Helper utility for tracking retry attempts and latency. /// Intended to be used from within "transaction_async_with_retry". -pub(crate) struct RetryHelper { +pub struct RetryHelper { producer: Producer, name: &'static str, inner: Mutex, @@ -95,7 +95,7 @@ impl RetryHelper { } /// Calls the function "f" in an asynchronous, retryable transaction. - pub(crate) async fn transaction( + pub async fn transaction( self, conn: &async_bb8_diesel::Connection, f: Func, diff --git a/nexus/src/app/background/dns_config.rs b/nexus/src/app/background/dns_config.rs index 654e9c0bf1..805ae813fe 100644 --- a/nexus/src/app/background/dns_config.rs +++ b/nexus/src/app/background/dns_config.rs @@ -166,7 +166,6 @@ mod test { use crate::app::background::init::test::read_internal_dns_zone_id; use crate::app::background::init::test::write_test_dns_generation; use assert_matches::assert_matches; - use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use async_bb8_diesel::AsyncSimpleConnection; use diesel::ExpressionMethods; @@ -237,11 +236,11 @@ mod test { ); // Similarly, wipe all of the state and verify that we handle that okay. + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper("dns_config_test_basic") + .transaction(&conn, |conn| async move { conn.batch_execute_async( nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, ) @@ -265,7 +264,7 @@ mod test { .execute_async(&conn) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>(()) + Ok(()) }) .await .unwrap(); diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index d27248ffdc..efbbd65428 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -226,14 +226,12 @@ fn init_dns( #[cfg(test)] pub mod test { - use async_bb8_diesel::AsyncConnection; use async_bb8_diesel::AsyncRunQueryDsl; use dropshot::HandlerTaskMode; use nexus_db_model::DnsGroup; use nexus_db_model::Generation; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; - use nexus_db_queries::db::TransactionError; use nexus_test_utils_macros::nexus_test; use nexus_types::internal_api::params as nexus_params; use nexus_types::internal_api::params::ServiceKind; @@ -425,11 +423,11 @@ pub mod test { datastore: &DataStore, internal_dns_zone_id: Uuid, ) { - type TxnError = TransactionError<()>; { let conn = datastore.pool_connection_for_tests().await.unwrap(); - let _: Result<(), TxnError> = conn - .transaction_async(|conn| async move { + let _: Result<(), _> = datastore + .transaction_retry_wrapper("write_test_dns_generation") + .transaction(&conn, |conn| async move { { use nexus_db_queries::db::model::DnsVersion; use nexus_db_queries::db::schema::dns_version::dsl; diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index fe403a7d41..4883afaddc 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -830,9 +830,7 @@ pub(crate) mod test { app::saga::create_saga_dag, app::sagas::disk_create::Params, app::sagas::disk_create::SagaDiskCreate, external_api::params, }; - use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -972,27 +970,25 @@ pub(crate) mod test { use nexus_db_queries::db::model::VirtualProvisioningCollection; use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl; + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper( + "no_virtual_provisioning_collection_records_using_storage", + ) + .transaction(&conn, |conn| async move { conn.batch_execute_async( nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, ) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( - dsl::virtual_provisioning_collection - .filter(dsl::virtual_disk_bytes_provisioned.ne(0)) - .select(VirtualProvisioningCollection::as_select()) - .get_results_async::( - &conn, - ) - .await - .unwrap() - .is_empty(), - ) + Ok(dsl::virtual_provisioning_collection + .filter(dsl::virtual_disk_bytes_provisioned.ne(0)) + .select(VirtualProvisioningCollection::as_select()) + .get_results_async::(&conn) + .await + .unwrap() + .is_empty()) }) .await .unwrap() diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 153e0323e7..8c2f96c36c 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -866,9 +866,7 @@ pub mod test { app::sagas::instance_create::SagaInstanceCreate, app::sagas::test_helpers, external_api::params, }; - use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use diesel::{ BoolExpressionMethods, ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, @@ -1013,30 +1011,28 @@ pub mod test { use nexus_db_queries::db::model::SledResource; use nexus_db_queries::db::schema::sled_resource::dsl; + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper( + "no_sled_resource_instance_records_exist", + ) + .transaction(&conn, |conn| async move { conn.batch_execute_async( nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, ) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( - dsl::sled_resource - .filter( - dsl::kind.eq( - nexus_db_queries::db::model::SledResourceKind::Instance, - ), - ) - .select(SledResource::as_select()) - .get_results_async::(&conn) - .await - .unwrap() - .is_empty(), - ) + Ok(dsl::sled_resource + .filter(dsl::kind.eq( + nexus_db_queries::db::model::SledResourceKind::Instance, + )) + .select(SledResource::as_select()) + .get_results_async::(&conn) + .await + .unwrap() + .is_empty()) }) .await .unwrap() @@ -1048,16 +1044,17 @@ pub mod test { use nexus_db_queries::db::model::VirtualProvisioningResource; use nexus_db_queries::db::schema::virtual_provisioning_resource::dsl; - datastore.pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + datastore + .transaction_retry_wrapper("no_virtual_provisioning_resource_records_exist") + .transaction(&conn, |conn| async move { conn .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( + Ok( dsl::virtual_provisioning_resource .filter(dsl::resource_type.eq(nexus_db_queries::db::model::ResourceTypeProvisioned::Instance.to_string())) .select(VirtualProvisioningResource::as_select()) @@ -1075,31 +1072,29 @@ pub mod test { use nexus_db_queries::db::model::VirtualProvisioningCollection; use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl; + let conn = datastore.pool_connection_for_tests().await.unwrap(); + datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper( + "no_virtual_provisioning_collection_records_using_instances", + ) + .transaction(&conn, |conn| async move { conn.batch_execute_async( nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL, ) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( - dsl::virtual_provisioning_collection - .filter( - dsl::cpus_provisioned - .ne(0) - .or(dsl::ram_provisioned.ne(0)), - ) - .select(VirtualProvisioningCollection::as_select()) - .get_results_async::( - &conn, - ) - .await - .unwrap() - .is_empty(), - ) + Ok(dsl::virtual_provisioning_collection + .filter( + dsl::cpus_provisioned + .ne(0) + .or(dsl::ram_provisioned.ne(0)), + ) + .select(VirtualProvisioningCollection::as_select()) + .get_results_async::(&conn) + .await + .unwrap() + .is_empty()) }) .await .unwrap() diff --git a/nexus/src/app/sagas/project_create.rs b/nexus/src/app/sagas/project_create.rs index 135e20ff06..40acc822c0 100644 --- a/nexus/src/app/sagas/project_create.rs +++ b/nexus/src/app/sagas/project_create.rs @@ -157,9 +157,7 @@ mod test { app::saga::create_saga_dag, app::sagas::project_create::Params, app::sagas::project_create::SagaProjectCreate, external_api::params, }; - use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, - }; + use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use diesel::{ ExpressionMethods, OptionalExtension, QueryDsl, SelectableHelper, }; @@ -233,15 +231,16 @@ mod test { use nexus_db_queries::db::model::VirtualProvisioningCollection; use nexus_db_queries::db::schema::virtual_provisioning_collection::dsl; - datastore.pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + datastore + .transaction_retry_wrapper("no_virtual_provisioning_collection_records_for_projects") + .transaction(&conn, |conn| async move { conn .batch_execute_async(nexus_test_utils::db::ALLOW_FULL_TABLE_SCAN_SQL) .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( + Ok( dsl::virtual_provisioning_collection .filter(dsl::collection_type.eq(nexus_db_queries::db::model::CollectionTypeProvisioned::Project.to_string())) // ignore built-in services project diff --git a/nexus/src/app/sagas/test_helpers.rs b/nexus/src/app/sagas/test_helpers.rs index eccb013b66..3110bd318a 100644 --- a/nexus/src/app/sagas/test_helpers.rs +++ b/nexus/src/app/sagas/test_helpers.rs @@ -10,9 +10,7 @@ use crate::{ app::{saga::create_saga_dag, test_interfaces::TestInterfaces as _}, Nexus, }; -use async_bb8_diesel::{ - AsyncConnection, AsyncRunQueryDsl, AsyncSimpleConnection, -}; +use async_bb8_diesel::{AsyncRunQueryDsl, AsyncSimpleConnection}; use diesel::{ExpressionMethods, QueryDsl, SelectableHelper}; use futures::future::BoxFuture; use nexus_db_queries::{ @@ -434,11 +432,10 @@ pub(crate) async fn assert_no_failed_undo_steps( ) { use nexus_db_queries::db::model::saga_types::SagaNodeEvent; + let conn = datastore.pool_connection_for_tests().await.unwrap(); let saga_node_events: Vec = datastore - .pool_connection_for_tests() - .await - .unwrap() - .transaction_async(|conn| async move { + .transaction_retry_wrapper("assert_no_failed_undo_steps") + .transaction(&conn, |conn| async move { use nexus_db_queries::db::schema::saga_node_event::dsl; conn.batch_execute_async( @@ -447,14 +444,12 @@ pub(crate) async fn assert_no_failed_undo_steps( .await .unwrap(); - Ok::<_, nexus_db_queries::db::TransactionError<()>>( - dsl::saga_node_event - .filter(dsl::event_type.eq(String::from("undo_failed"))) - .select(SagaNodeEvent::as_select()) - .load_async::(&conn) - .await - .unwrap(), - ) + Ok(dsl::saga_node_event + .filter(dsl::event_type.eq(String::from("undo_failed"))) + .select(SagaNodeEvent::as_select()) + .load_async::(&conn) + .await + .unwrap()) }) .await .unwrap(); From 538d48c0f133c198e0d28aaed26ece133e6da723 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Mon, 4 Dec 2023 13:59:37 -0800 Subject: [PATCH 24/25] review feedback --- nexus/db-queries/src/db/datastore/snapshot.rs | 36 ++++++------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/snapshot.rs b/nexus/db-queries/src/db/datastore/snapshot.rs index 03058eda61..7a9eb8d2bc 100644 --- a/nexus/db-queries/src/db/datastore/snapshot.rs +++ b/nexus/db-queries/src/db/datastore/snapshot.rs @@ -47,16 +47,6 @@ impl DataStore { let gen = snapshot.gen; opctx.authorize(authz::Action::CreateChild, authz_project).await?; - #[derive(Debug, thiserror::Error)] - pub enum CustomError { - #[error("Resource already exists")] - ResourceAlreadyExists, - - #[error("saw AsyncInsertError")] - InsertError(Error), - } - - let snapshot_name = snapshot.name().to_string(); let project_id = snapshot.project_id; let err = OptionalError::new(); @@ -67,6 +57,7 @@ impl DataStore { .transaction(&conn, |conn| { let err = err.clone(); let snapshot = snapshot.clone(); + let snapshot_name = snapshot.name().to_string(); async move { use db::schema::snapshot::dsl; @@ -115,9 +106,10 @@ impl DataStore { if let Some(existing_snapshot_id) = existing_snapshot_id { if existing_snapshot_id != snapshot.id() { - return Err( - err.bail(CustomError::ResourceAlreadyExists) - ); + return Err(err.bail(Error::ObjectAlreadyExists { + type_name: ResourceType::Snapshot, + object_name: snapshot_name, + })); } } @@ -133,12 +125,12 @@ impl DataStore { .insert_and_get_result_async(&conn) .await .map_err(|e| match e { - AsyncInsertError::CollectionNotFound => err.bail( - CustomError::InsertError(Error::ObjectNotFound { + AsyncInsertError::CollectionNotFound => { + err.bail(Error::ObjectNotFound { type_name: ResourceType::Project, lookup_type: LookupType::ById(project_id), - }), - ), + }) + } AsyncInsertError::DatabaseError(e) => e, }) } @@ -146,15 +138,7 @@ impl DataStore { .await .map_err(|e| { if let Some(err) = err.take() { - match err { - CustomError::ResourceAlreadyExists => { - Error::ObjectAlreadyExists { - type_name: ResourceType::Snapshot, - object_name: snapshot_name, - } - } - CustomError::InsertError(e) => e, - } + err } else { public_error_from_diesel(e, ErrorHandler::Server) } From 2058492886ba90dd0fcb5215786ccc0e6318093a Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 5 Dec 2023 16:49:43 -0800 Subject: [PATCH 25/25] update async-bb8-diesel version --- Cargo.lock | 2 +- Cargo.toml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1dc209aba6..2a1c1523d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -250,7 +250,7 @@ checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" [[package]] name = "async-bb8-diesel" version = "0.1.0" -source = "git+https://github.com/oxidecomputer/async-bb8-diesel?branch=txn-retry#14bd98be8c76b36cc04464f06ffa5894b7240ab5" +source = "git+https://github.com/oxidecomputer/async-bb8-diesel?rev=ed7ab5ef0513ba303d33efd41d3e9e381169d59b#ed7ab5ef0513ba303d33efd41d3e9e381169d59b" dependencies = [ "async-trait", "bb8", diff --git a/Cargo.toml b/Cargo.toml index 063863b5e1..5e51e85f1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -144,9 +144,7 @@ api_identity = { path = "api_identity" } approx = "0.5.1" assert_matches = "1.5.0" assert_cmd = "2.0.12" -# TODO: Patch before merging -# async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "1446f7e0c1f05f33a0581abd51fa873c7652ab61" } -async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", branch = "txn-retry" } +async-bb8-diesel = { git = "https://github.com/oxidecomputer/async-bb8-diesel", rev = "ed7ab5ef0513ba303d33efd41d3e9e381169d59b" } async-trait = "0.1.74" atomicwrites = "0.4.2" authz-macros = { path = "nexus/authz-macros" }