diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 8f529c80a7..eed8c81421 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -17,7 +17,7 @@ use std::collections::BTreeMap; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(75, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(76, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy> = Lazy::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(76, "lookup-region-snapshot-by-snapshot-id"), KnownVersion::new(75, "add-cockroach-zone-id-to-node-id"), KnownVersion::new(74, "add-migration-table"), KnownVersion::new(73, "add-vlan-to-uplink"), diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 11427cd0fd..0a48f33ebd 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -115,6 +115,8 @@ use nexus_db_model::AllSchemaVersions; pub use probe::ProbeInfo; pub use rack::RackInit; pub use rack::SledUnderlayAllocationResult; +pub use region::RegionAllocationFor; +pub use region::RegionAllocationParameters; pub use silo::Discoverability; pub use sled::SledTransition; pub use sled::TransitionError; diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index d7da24cce3..ac86c6a7d3 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -27,6 +27,29 @@ use omicron_common::api::external::LookupResult; use slog::Logger; use uuid::Uuid; +pub enum RegionAllocationFor { + /// Allocate region(s) for a disk volume + DiskVolume { volume_id: Uuid }, + + /// Allocate region(s) for a snapshot volume, which may have read-only + /// targets. + SnapshotVolume { volume_id: Uuid, snapshot_id: Uuid }, +} + +/// Describe the region(s) to be allocated +pub enum RegionAllocationParameters<'a> { + FromDiskSource { + disk_source: &'a params::DiskSource, + size: external::ByteCount, + }, + + FromRaw { + block_size: u64, + blocks_per_extent: u64, + extent_count: u64, + }, +} + impl DataStore { pub(super) fn get_allocated_regions_query( volume_id: Uuid, @@ -156,9 +179,8 @@ impl DataStore { ) -> Result, Error> { self.arbitrary_region_allocate( opctx, - volume_id, - disk_source, - size, + RegionAllocationFor::DiskVolume { volume_id }, + RegionAllocationParameters::FromDiskSource { disk_source, size }, allocation_strategy, REGION_REDUNDANCY_THRESHOLD, ) @@ -175,47 +197,59 @@ impl DataStore { /// level for a volume. If a single region is allocated in isolation this /// could land on the same dataset as one of the existing volume's regions. /// + /// For allocating for snapshot volumes, it's important to take into account + /// `region_snapshot`s that may be used as some of the targets in the region + /// set, representing read-only downstairs served out of a ZFS snapshot + /// instead of a dataset. + /// /// Returns the allocated regions, as well as the datasets to which they /// belong. pub async fn arbitrary_region_allocate( &self, opctx: &OpContext, - volume_id: Uuid, - disk_source: ¶ms::DiskSource, - size: external::ByteCount, + region_for: RegionAllocationFor, + region_parameters: RegionAllocationParameters<'_>, allocation_strategy: &RegionAllocationStrategy, num_regions_required: usize, ) -> Result, Error> { - let block_size = - self.get_block_size_from_disk_source(opctx, &disk_source).await?; - let (blocks_per_extent, extent_count) = - Self::get_crucible_allocation(&block_size, size); + let (volume_id, maybe_snapshot_id) = match region_for { + RegionAllocationFor::DiskVolume { volume_id } => (volume_id, None), - self.arbitrary_region_allocate_direct( - opctx, - volume_id, - u64::from(block_size.to_bytes()), - blocks_per_extent, - extent_count, - allocation_strategy, - num_regions_required, - ) - .await - } + RegionAllocationFor::SnapshotVolume { volume_id, snapshot_id } => { + (volume_id, Some(snapshot_id)) + } + }; + + let (block_size, blocks_per_extent, extent_count) = + match region_parameters { + RegionAllocationParameters::FromDiskSource { + disk_source, + size, + } => { + let block_size = self + .get_block_size_from_disk_source(opctx, &disk_source) + .await?; + + let (blocks_per_extent, extent_count) = + Self::get_crucible_allocation(&block_size, size); + + ( + u64::from(block_size.to_bytes()), + blocks_per_extent, + extent_count, + ) + } + + RegionAllocationParameters::FromRaw { + block_size, + blocks_per_extent, + extent_count, + } => (block_size, blocks_per_extent, extent_count), + }; - #[allow(clippy::too_many_arguments)] - pub async fn arbitrary_region_allocate_direct( - &self, - opctx: &OpContext, - volume_id: Uuid, - block_size: u64, - blocks_per_extent: u64, - extent_count: u64, - allocation_strategy: &RegionAllocationStrategy, - num_regions_required: usize, - ) -> Result, Error> { let query = crate::db::queries::region_allocation::allocation_query( volume_id, + maybe_snapshot_id, block_size, blocks_per_extent, extent_count, @@ -234,6 +268,7 @@ impl DataStore { self.log, "Allocated regions for volume"; "volume_id" => %volume_id, + "maybe_snapshot_id" => ?maybe_snapshot_id, "datasets_and_regions" => ?dataset_and_regions, ); diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index 83cc7483c9..44a89376d4 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -67,8 +67,14 @@ type SelectableSql = < >::SelectExpression as diesel::Expression >::SqlType; +/// For a given volume, idempotently allocate enough regions (according to some +/// allocation strategy) to meet some redundancy level. This should only be used +/// for the region set that is in the top level of the Volume (not the deeper +/// layers of the hierarchy). If that volume has region snapshots in the region +/// set, a `snapshot_id` should be supplied matching those entries. pub fn allocation_query( volume_id: uuid::Uuid, + snapshot_id: Option, block_size: u64, blocks_per_extent: u64, extent_count: u64, @@ -116,24 +122,42 @@ pub fn allocation_query( SELECT dataset.pool_id, sum(dataset.size_used) AS size_used - FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),") - - // Any zpool already have this volume's existing regions? - .sql(" - existing_zpools AS ( - SELECT - dataset.pool_id - FROM - dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id) - ),") + FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),"); + + let builder = if let Some(snapshot_id) = snapshot_id { + // Any zpool already have this volume's existing regions, or host the + // snapshot volume's regions? + builder.sql(" + existing_zpools AS (( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id) + ) UNION ( + select dataset.pool_id from + dataset inner join region_snapshot on (region_snapshot.dataset_id = dataset.id) + where region_snapshot.snapshot_id = ").param().sql(")),") + .bind::(snapshot_id) + } else { + // Any zpool already have this volume's existing regions? + builder.sql(" + existing_zpools AS ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id) + ),") + }; // Identifies zpools with enough space for region allocation, that are not // currently used by this Volume's existing regions. // // NOTE: 'distinct_sleds' changes the format of the underlying SQL query, as it uses // distinct bind parameters depending on the conditional branch. - .sql(" - candidate_zpools AS ("); + let builder = builder.sql( + " + candidate_zpools AS (", + ); let builder = if distinct_sleds { builder.sql("SELECT DISTINCT ON (zpool.sled_id) ") } else { @@ -384,10 +408,15 @@ mod test { let blocks_per_extent = 4; let extent_count = 8; + // Start with snapshot_id = None + + let snapshot_id = None; + // First structure: "RandomWithDistinctSleds" let region_allocate = allocation_query( volume_id, + snapshot_id, block_size, blocks_per_extent, extent_count, @@ -406,6 +435,7 @@ mod test { let region_allocate = allocation_query( volume_id, + snapshot_id, block_size, blocks_per_extent, extent_count, @@ -417,6 +447,46 @@ mod test { "tests/output/region_allocate_random_sleds.sql", ) .await; + + // Next, put a value in for snapshot_id + + let snapshot_id = Some(Uuid::new_v4()); + + // First structure: "RandomWithDistinctSleds" + + let region_allocate = allocation_query( + volume_id, + snapshot_id, + block_size, + blocks_per_extent, + extent_count, + &RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(1), + }, + REGION_REDUNDANCY_THRESHOLD, + ); + expectorate_query_contents( + ®ion_allocate, + "tests/output/region_allocate_with_snapshot_distinct_sleds.sql", + ) + .await; + + // Second structure: "Random" + + let region_allocate = allocation_query( + volume_id, + snapshot_id, + block_size, + blocks_per_extent, + extent_count, + &RegionAllocationStrategy::Random { seed: Some(1) }, + REGION_REDUNDANCY_THRESHOLD, + ); + expectorate_query_contents( + ®ion_allocate, + "tests/output/region_allocate_with_snapshot_random_sleds.sql", + ) + .await; } // Explain the possible forms of the SQL query to ensure that it @@ -439,6 +509,7 @@ mod test { let region_allocate = allocation_query( volume_id, + None, block_size, blocks_per_extent, extent_count, @@ -454,6 +525,7 @@ mod test { let region_allocate = allocation_query( volume_id, + None, block_size, blocks_per_extent, extent_count, diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql new file mode 100644 index 0000000000..b90b2f2adf --- /dev/null +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql @@ -0,0 +1,323 @@ +WITH + old_regions + AS ( + SELECT + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + FROM + region + WHERE + region.volume_id = $1 + ), + old_zpool_usage + AS ( + SELECT + dataset.pool_id, sum(dataset.size_used) AS size_used + FROM + dataset + WHERE + (dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL) + GROUP BY + dataset.pool_id + ), + existing_zpools + AS ( + ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN old_regions ON old_regions.dataset_id = dataset.id + ) + UNION + ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN region_snapshot ON region_snapshot.dataset_id = dataset.id + WHERE + region_snapshot.snapshot_id = $2 + ) + ), + candidate_zpools + AS ( + SELECT + DISTINCT ON (zpool.sled_id) old_zpool_usage.pool_id + FROM + old_zpool_usage + INNER JOIN (zpool INNER JOIN sled ON zpool.sled_id = sled.id) ON + zpool.id = old_zpool_usage.pool_id + INNER JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id + WHERE + (old_zpool_usage.size_used + $3) + <= ( + SELECT + total_size + FROM + omicron.public.inv_zpool + WHERE + inv_zpool.id = old_zpool_usage.pool_id + ORDER BY + inv_zpool.time_collected DESC + LIMIT + 1 + ) + AND sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + AND NOT (zpool.id = ANY (SELECT existing_zpools.pool_id FROM existing_zpools)) + ORDER BY + zpool.sled_id, md5(CAST(zpool.id AS BYTES) || $4) + ), + candidate_datasets + AS ( + SELECT + DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id + FROM + dataset INNER JOIN candidate_zpools ON dataset.pool_id = candidate_zpools.pool_id + WHERE + ((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) + AND dataset.kind = 'crucible' + ORDER BY + dataset.pool_id, md5(CAST(dataset.id AS BYTES) || $5) + ), + shuffled_candidate_datasets + AS ( + SELECT + candidate_datasets.id, candidate_datasets.pool_id + FROM + candidate_datasets + ORDER BY + md5(CAST(candidate_datasets.id AS BYTES) || $6) + LIMIT + $7 + ), + candidate_regions + AS ( + SELECT + gen_random_uuid() AS id, + now() AS time_created, + now() AS time_modified, + shuffled_candidate_datasets.id AS dataset_id, + $8 AS volume_id, + $9 AS block_size, + $10 AS blocks_per_extent, + $11 AS extent_count + FROM + shuffled_candidate_datasets + LIMIT + $12 - (SELECT count(*) FROM old_regions) + ), + proposed_dataset_changes + AS ( + SELECT + candidate_regions.dataset_id AS id, + dataset.pool_id AS pool_id, + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS size_used_delta + FROM + candidate_regions INNER JOIN dataset ON dataset.id = candidate_regions.dataset_id + ), + do_insert + AS ( + SELECT + ( + ( + (SELECT count(*) FROM old_regions LIMIT 1) < $13 + AND CAST( + IF( + ( + ( + ( + (SELECT count(*) FROM candidate_zpools LIMIT 1) + + (SELECT count(*) FROM existing_zpools LIMIT 1) + ) + ) + >= $14 + ), + 'TRUE', + 'Not enough space' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + ( + (SELECT count(*) FROM candidate_regions LIMIT 1) + + (SELECT count(*) FROM old_regions LIMIT 1) + ) + ) + >= $15 + ), + 'TRUE', + 'Not enough datasets' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + ( + SELECT + count(DISTINCT pool_id) + FROM + ( + ( + SELECT + dataset.pool_id + FROM + candidate_regions + INNER JOIN dataset ON candidate_regions.dataset_id = dataset.id + ) + UNION + ( + SELECT + dataset.pool_id + FROM + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id + ) + ) + LIMIT + 1 + ) + ) + >= $16 + ), + 'TRUE', + 'Not enough unique zpools selected' + ) + AS BOOL + ) + AS insert + ), + inserted_regions + AS ( + INSERT + INTO + region + ( + id, + time_created, + time_modified, + dataset_id, + volume_id, + block_size, + blocks_per_extent, + extent_count + ) + SELECT + candidate_regions.id, + candidate_regions.time_created, + candidate_regions.time_modified, + candidate_regions.dataset_id, + candidate_regions.volume_id, + candidate_regions.block_size, + candidate_regions.blocks_per_extent, + candidate_regions.extent_count + FROM + candidate_regions + WHERE + (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + ), + updated_datasets + AS ( + UPDATE + dataset + SET + size_used + = dataset.size_used + + ( + SELECT + proposed_dataset_changes.size_used_delta + FROM + proposed_dataset_changes + WHERE + proposed_dataset_changes.id = dataset.id + LIMIT + 1 + ) + WHERE + dataset.id = ANY (SELECT proposed_dataset_changes.id FROM proposed_dataset_changes) + AND (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used + ) +( + SELECT + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used, + old_regions.id, + old_regions.time_created, + old_regions.time_modified, + old_regions.dataset_id, + old_regions.volume_id, + old_regions.block_size, + old_regions.blocks_per_extent, + old_regions.extent_count + FROM + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id +) +UNION + ( + SELECT + updated_datasets.id, + updated_datasets.time_created, + updated_datasets.time_modified, + updated_datasets.time_deleted, + updated_datasets.rcgen, + updated_datasets.pool_id, + updated_datasets.ip, + updated_datasets.port, + updated_datasets.kind, + updated_datasets.size_used, + inserted_regions.id, + inserted_regions.time_created, + inserted_regions.time_modified, + inserted_regions.dataset_id, + inserted_regions.volume_id, + inserted_regions.block_size, + inserted_regions.blocks_per_extent, + inserted_regions.extent_count + FROM + inserted_regions + INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id + ) diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql new file mode 100644 index 0000000000..daf13f18ce --- /dev/null +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql @@ -0,0 +1,321 @@ +WITH + old_regions + AS ( + SELECT + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + FROM + region + WHERE + region.volume_id = $1 + ), + old_zpool_usage + AS ( + SELECT + dataset.pool_id, sum(dataset.size_used) AS size_used + FROM + dataset + WHERE + (dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL) + GROUP BY + dataset.pool_id + ), + existing_zpools + AS ( + ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN old_regions ON old_regions.dataset_id = dataset.id + ) + UNION + ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN region_snapshot ON region_snapshot.dataset_id = dataset.id + WHERE + region_snapshot.snapshot_id = $2 + ) + ), + candidate_zpools + AS ( + SELECT + old_zpool_usage.pool_id + FROM + old_zpool_usage + INNER JOIN (zpool INNER JOIN sled ON zpool.sled_id = sled.id) ON + zpool.id = old_zpool_usage.pool_id + INNER JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id + WHERE + (old_zpool_usage.size_used + $3) + <= ( + SELECT + total_size + FROM + omicron.public.inv_zpool + WHERE + inv_zpool.id = old_zpool_usage.pool_id + ORDER BY + inv_zpool.time_collected DESC + LIMIT + 1 + ) + AND sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + AND NOT (zpool.id = ANY (SELECT existing_zpools.pool_id FROM existing_zpools)) + ), + candidate_datasets + AS ( + SELECT + DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id + FROM + dataset INNER JOIN candidate_zpools ON dataset.pool_id = candidate_zpools.pool_id + WHERE + ((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) + AND dataset.kind = 'crucible' + ORDER BY + dataset.pool_id, md5(CAST(dataset.id AS BYTES) || $4) + ), + shuffled_candidate_datasets + AS ( + SELECT + candidate_datasets.id, candidate_datasets.pool_id + FROM + candidate_datasets + ORDER BY + md5(CAST(candidate_datasets.id AS BYTES) || $5) + LIMIT + $6 + ), + candidate_regions + AS ( + SELECT + gen_random_uuid() AS id, + now() AS time_created, + now() AS time_modified, + shuffled_candidate_datasets.id AS dataset_id, + $7 AS volume_id, + $8 AS block_size, + $9 AS blocks_per_extent, + $10 AS extent_count + FROM + shuffled_candidate_datasets + LIMIT + $11 - (SELECT count(*) FROM old_regions) + ), + proposed_dataset_changes + AS ( + SELECT + candidate_regions.dataset_id AS id, + dataset.pool_id AS pool_id, + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS size_used_delta + FROM + candidate_regions INNER JOIN dataset ON dataset.id = candidate_regions.dataset_id + ), + do_insert + AS ( + SELECT + ( + ( + (SELECT count(*) FROM old_regions LIMIT 1) < $12 + AND CAST( + IF( + ( + ( + ( + (SELECT count(*) FROM candidate_zpools LIMIT 1) + + (SELECT count(*) FROM existing_zpools LIMIT 1) + ) + ) + >= $13 + ), + 'TRUE', + 'Not enough space' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + ( + (SELECT count(*) FROM candidate_regions LIMIT 1) + + (SELECT count(*) FROM old_regions LIMIT 1) + ) + ) + >= $14 + ), + 'TRUE', + 'Not enough datasets' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + ( + SELECT + count(DISTINCT pool_id) + FROM + ( + ( + SELECT + dataset.pool_id + FROM + candidate_regions + INNER JOIN dataset ON candidate_regions.dataset_id = dataset.id + ) + UNION + ( + SELECT + dataset.pool_id + FROM + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id + ) + ) + LIMIT + 1 + ) + ) + >= $15 + ), + 'TRUE', + 'Not enough unique zpools selected' + ) + AS BOOL + ) + AS insert + ), + inserted_regions + AS ( + INSERT + INTO + region + ( + id, + time_created, + time_modified, + dataset_id, + volume_id, + block_size, + blocks_per_extent, + extent_count + ) + SELECT + candidate_regions.id, + candidate_regions.time_created, + candidate_regions.time_modified, + candidate_regions.dataset_id, + candidate_regions.volume_id, + candidate_regions.block_size, + candidate_regions.blocks_per_extent, + candidate_regions.extent_count + FROM + candidate_regions + WHERE + (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + ), + updated_datasets + AS ( + UPDATE + dataset + SET + size_used + = dataset.size_used + + ( + SELECT + proposed_dataset_changes.size_used_delta + FROM + proposed_dataset_changes + WHERE + proposed_dataset_changes.id = dataset.id + LIMIT + 1 + ) + WHERE + dataset.id = ANY (SELECT proposed_dataset_changes.id FROM proposed_dataset_changes) + AND (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used + ) +( + SELECT + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used, + old_regions.id, + old_regions.time_created, + old_regions.time_modified, + old_regions.dataset_id, + old_regions.volume_id, + old_regions.block_size, + old_regions.blocks_per_extent, + old_regions.extent_count + FROM + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id +) +UNION + ( + SELECT + updated_datasets.id, + updated_datasets.time_created, + updated_datasets.time_modified, + updated_datasets.time_deleted, + updated_datasets.rcgen, + updated_datasets.pool_id, + updated_datasets.ip, + updated_datasets.port, + updated_datasets.kind, + updated_datasets.size_used, + inserted_regions.id, + inserted_regions.time_created, + inserted_regions.time_modified, + inserted_regions.dataset_id, + inserted_regions.volume_id, + inserted_regions.block_size, + inserted_regions.blocks_per_extent, + inserted_regions.extent_count + FROM + inserted_regions + INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id + ) diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index f546ae645b..c983716b4f 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -257,12 +257,16 @@ async fn srrs_alloc_new_region( // agent could reuse the allocated port and cause trouble. let datasets_and_regions = osagactx .datastore() - .arbitrary_region_allocate_direct( + .arbitrary_region_allocate( &opctx, - db_region.volume_id(), - db_region.block_size().to_bytes(), - db_region.blocks_per_extent(), - db_region.extent_count(), + db::datastore::RegionAllocationFor::DiskVolume { + volume_id: db_region.volume_id(), + }, + db::datastore::RegionAllocationParameters::FromRaw { + block_size: db_region.block_size().to_bytes(), + blocks_per_extent: db_region.blocks_per_extent(), + extent_count: db_region.extent_count(), + }, ¶ms.allocation_strategy, // Note: this assumes that previous redundancy is // REGION_REDUNDANCY_THRESHOLD, and that region replacement will diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index beba6ba7c0..a707f8240d 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -13,6 +13,8 @@ use http::StatusCode; use nexus_config::RegionAllocationStrategy; use nexus_db_model::PhysicalDiskPolicy; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::datastore::RegionAllocationFor; +use nexus_db_queries::db::datastore::RegionAllocationParameters; use nexus_db_queries::db::datastore::REGION_REDUNDANCY_THRESHOLD; use nexus_db_queries::db::fixed_data::{silo::DEFAULT_SILO_ID, FLEET_ID}; use nexus_db_queries::db::lookup::LookupPath; @@ -2033,11 +2035,13 @@ async fn test_single_region_allocate(cptestctx: &ControlPlaneTestContext) { let datasets_and_regions = datastore .arbitrary_region_allocate( &opctx, - volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from(512).unwrap(), + RegionAllocationFor::DiskVolume { volume_id }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), }, - ByteCount::from_gibibytes_u32(1), &RegionAllocationStrategy::Random { seed: None }, 1, ) @@ -2173,11 +2177,13 @@ async fn test_region_allocation_strategy_random_is_idempotent_arbitrary( let datasets_and_regions = datastore .arbitrary_region_allocate( &opctx, - volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from(512).unwrap(), + RegionAllocationFor::DiskVolume { volume_id }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), }, - ByteCount::from_gibibytes_u32(1), &RegionAllocationStrategy::Random { seed: None }, REGION_REDUNDANCY_THRESHOLD, ) @@ -2191,11 +2197,13 @@ async fn test_region_allocation_strategy_random_is_idempotent_arbitrary( let datasets_and_regions = datastore .arbitrary_region_allocate( &opctx, - volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from(512).unwrap(), + RegionAllocationFor::DiskVolume { volume_id }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from(512).unwrap(), + }, + size: ByteCount::from_gibibytes_u32(1), }, - ByteCount::from_gibibytes_u32(1), &RegionAllocationStrategy::Random { seed: None }, REGION_REDUNDANCY_THRESHOLD + 1, ) @@ -2262,14 +2270,16 @@ async fn test_single_region_allocate_for_replace( let datasets_and_regions = datastore .arbitrary_region_allocate( &opctx, - db_disk.volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from( - region_to_replace.block_size().to_bytes() as u32, - ) - .unwrap(), + RegionAllocationFor::DiskVolume { volume_id: db_disk.volume_id }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from( + region_to_replace.block_size().to_bytes() as u32, + ) + .unwrap(), + }, + size: region_total_size, }, - region_total_size, &RegionAllocationStrategy::Random { seed: None }, one_more, ) @@ -2348,14 +2358,16 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools( let result = datastore .arbitrary_region_allocate( &opctx, - db_disk.volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from( - region_to_replace.block_size().to_bytes() as u32, - ) - .unwrap(), + RegionAllocationFor::DiskVolume { volume_id: db_disk.volume_id }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from( + region_to_replace.block_size().to_bytes() as u32, + ) + .unwrap(), + }, + size: region_total_size, }, - region_total_size, &RegionAllocationStrategy::Random { seed: None }, one_more, ) @@ -2367,14 +2379,16 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools( let datasets_and_regions = datastore .arbitrary_region_allocate( &opctx, - db_disk.volume_id, - ¶ms::DiskSource::Blank { - block_size: params::BlockSize::try_from( - region_to_replace.block_size().to_bytes() as u32, - ) - .unwrap(), + RegionAllocationFor::DiskVolume { volume_id: db_disk.volume_id }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from( + region_to_replace.block_size().to_bytes() as u32, + ) + .unwrap(), + }, + size: region_total_size, }, - region_total_size, &RegionAllocationStrategy::Random { seed: None }, allocated_regions.len(), ) diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index 3fb6f8f6ec..f0ce2cca72 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ b/nexus/tests/integration_tests/snapshots.rs @@ -9,15 +9,20 @@ use chrono::Utc; use dropshot::test_util::ClientTestContext; use http::method::Method; use http::StatusCode; +use nexus_config::RegionAllocationStrategy; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db; +use nexus_db_queries::db::datastore::RegionAllocationFor; +use nexus_db_queries::db::datastore::RegionAllocationParameters; +use nexus_db_queries::db::datastore::REGION_REDUNDANCY_THRESHOLD; use nexus_db_queries::db::identity::Resource; use nexus_db_queries::db::lookup::LookupPath; use nexus_test_utils::http_testing::AuthnMode; use nexus_test_utils::http_testing::NexusRequest; use nexus_test_utils::http_testing::RequestBuilder; use nexus_test_utils::resource_helpers::create_default_ip_pool; +use nexus_test_utils::resource_helpers::create_disk; use nexus_test_utils::resource_helpers::create_project; use nexus_test_utils::resource_helpers::object_create; use nexus_test_utils::resource_helpers::DiskTest; @@ -1441,3 +1446,194 @@ async fn test_multiple_deletes_not_sent(cptestctx: &ControlPlaneTestContext) { assert!(!resources_2_datasets_and_snapshots.contains(tuple)); } } + +/// Ensure that allocating one replacement for a snapshot works +#[nexus_test] +async fn test_region_allocation_for_snapshot( + cptestctx: &ControlPlaneTestContext, +) { + let nexus = &cptestctx.server.server_context().nexus; + let datastore = nexus.datastore(); + let opctx = + OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); + + // Create three 10 GiB zpools, each with one dataset. + let mut disk_test = DiskTest::new(&cptestctx).await; + + // An additional disk is required, otherwise the allocation will fail with + // "not enough storage" + disk_test.add_zpool_with_dataset(&cptestctx).await; + + // Assert default is still 10 GiB + assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); + + // Create a disk + let client = &cptestctx.external_client; + let _project_id = create_project_and_pool(client).await; + + let disk = create_disk(&client, PROJECT_NAME, "disk").await; + + // Assert disk has three allocated regions + let disk_id = disk.identity.id; + let (.., db_disk) = LookupPath::new(&opctx, &datastore) + .disk_id(disk_id) + .fetch() + .await + .unwrap_or_else(|_| panic!("test disk {:?} should exist", disk_id)); + + let allocated_regions = + datastore.get_allocated_regions(db_disk.volume_id).await.unwrap(); + assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD); + + // Create a snapshot of the disk + + let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME); + + let snapshot: views::Snapshot = object_create( + client, + &snapshots_url, + ¶ms::SnapshotCreate { + identity: IdentityMetadataCreateParams { + name: "snapshot".parse().unwrap(), + description: String::from("a snapshot"), + }, + disk: disk_id.into(), + }, + ) + .await; + + assert_eq!(snapshot.disk_id, disk.identity.id); + assert_eq!(snapshot.size, disk.size); + + // There shouldn't be any regions for the snapshot volume + + let snapshot_id = snapshot.identity.id; + let (.., db_snapshot) = LookupPath::new(&opctx, &datastore) + .snapshot_id(snapshot_id) + .fetch() + .await + .unwrap_or_else(|_| { + panic!("test snapshot {:?} should exist", snapshot_id) + }); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 0); + + // Run region allocation for the snapshot volume, setting the redundancy to + // 1 (aka one more than existing number of regions), and expect only _one_ + // region to be allocated. + + datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: db_snapshot.volume_id, + snapshot_id: snapshot.identity.id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from( + disk.block_size.to_bytes() as u32, + ) + .unwrap(), + }, + size: disk.size, + }, + &RegionAllocationStrategy::Random { seed: None }, + 1, + ) + .await + .unwrap(); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + + // Assert that all regions are on separate datasets from the region + // snapshots + + for (_, region) in allocated_regions { + { + let conn = datastore.pool_connection_for_tests().await.unwrap(); + + use async_bb8_diesel::AsyncRunQueryDsl; + use db::schema::region_snapshot::dsl; + use diesel::ExpressionMethods; + use diesel::QueryDsl; + use diesel::SelectableHelper; + + let region_snapshots: Vec = + dsl::region_snapshot + .filter(dsl::dataset_id.eq(region.dataset_id())) + .filter(dsl::snapshot_id.eq(snapshot.identity.id)) + .select(db::model::RegionSnapshot::as_select()) + .load_async::(&*conn) + .await + .unwrap(); + + assert!(region_snapshots.is_empty()); + } + } + + // Ensure the function is idempotent + + datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: db_snapshot.volume_id, + snapshot_id: snapshot.identity.id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from( + disk.block_size.to_bytes() as u32, + ) + .unwrap(), + }, + size: disk.size, + }, + &RegionAllocationStrategy::Random { seed: None }, + 1, + ) + .await + .unwrap(); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 1); + + // If an additional region is required, make sure that works too. + disk_test.add_zpool_with_dataset(&cptestctx).await; + + datastore + .arbitrary_region_allocate( + &opctx, + RegionAllocationFor::SnapshotVolume { + volume_id: db_snapshot.volume_id, + snapshot_id: snapshot.identity.id, + }, + RegionAllocationParameters::FromDiskSource { + disk_source: ¶ms::DiskSource::Blank { + block_size: params::BlockSize::try_from( + disk.block_size.to_bytes() as u32, + ) + .unwrap(), + }, + size: disk.size, + }, + &RegionAllocationStrategy::Random { seed: None }, + 2, + ) + .await + .unwrap(); + + let allocated_regions = + datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap(); + + assert_eq!(allocated_regions.len(), 2); +} diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 4fd1930d33..338f52f854 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -4129,6 +4129,11 @@ CREATE TABLE IF NOT EXISTS omicron.public.migration ( time_target_updated TIMESTAMPTZ ); +/* Lookup region snapshot by snapshot id */ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_snapshot_id on omicron.public.region_snapshot ( + snapshot_id +); + /* * Keep this at the end of file so that the database does not contain a version * until it is fully populated. @@ -4140,7 +4145,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '75.0.0', NULL) + (TRUE, NOW(), NOW(), '76.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/lookup-region-snapshot-by-snapshot-id/up.sql b/schema/crdb/lookup-region-snapshot-by-snapshot-id/up.sql new file mode 100644 index 0000000000..40789781c2 --- /dev/null +++ b/schema/crdb/lookup-region-snapshot-by-snapshot-id/up.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_snapshot_id on omicron.public.region_snapshot ( + snapshot_id +);