From a8b3ce22b16cea009977331ea31cbaac7e79aa73 Mon Sep 17 00:00:00 2001
From: James MacMahon
Date: Fri, 14 Jun 2024 17:49:49 -0400
Subject: [PATCH] Account for region snapshots during allocation (#5901)

When replacing a snapshot (aka a read-only downstairs somewhere), region
allocation must occur for a snapshot volume. A snapshot volume's region set
is currently composed only of read-only downstairs that all contain the same
data, and any newly allocated region should take those into account when
meeting the redundancy criteria for a region set: in production, allocate
the new region to a distinct sled.

This is done by slightly changing the region allocation query to accept an
optional snapshot ID; if one is supplied, the query adds the region
snapshots' pools to the `existing_zpools` temporary table in order to
prevent region allocation there.
---
 nexus/db-model/src/schema_versions.rs         |   3 +-
 nexus/db-queries/src/db/datastore/mod.rs      |   2 +
 nexus/db-queries/src/db/datastore/region.rs   |  99 ++++--
 .../src/db/queries/region_allocation.rs       |  96 +++++-
 ..._allocate_with_snapshot_distinct_sleds.sql | 323 ++++++++++++++++++
 ...on_allocate_with_snapshot_random_sleds.sql | 321 +++++++++++++++++
 .../src/app/sagas/region_replacement_start.rs |  14 +-
 nexus/tests/integration_tests/disks.rs        |  80 +++--
 nexus/tests/integration_tests/snapshots.rs    | 196 +++++++++++
 schema/crdb/dbinit.sql                        |   7 +-
 .../up.sql                                    |   3 +
 11 files changed, 1060 insertions(+), 84 deletions(-)
 create mode 100644 nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql
 create mode 100644 nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql
 create mode 100644 schema/crdb/lookup-region-snapshot-by-snapshot-id/up.sql

diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index 8f529c80a7..eed8c81421 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -17,7 +17,7 @@ use std::collections::BTreeMap;
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(75, 0, 0);
+pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(76, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
     //   | leaving the first copy as an example for the next person.
    //   v
    //   KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(76, "lookup-region-snapshot-by-snapshot-id"),
     KnownVersion::new(75, "add-cockroach-zone-id-to-node-id"),
     KnownVersion::new(74, "add-migration-table"),
     KnownVersion::new(73, "add-vlan-to-uplink"),
diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs
index 11427cd0fd..0a48f33ebd 100644
--- a/nexus/db-queries/src/db/datastore/mod.rs
+++ b/nexus/db-queries/src/db/datastore/mod.rs
@@ -115,6 +115,8 @@ use nexus_db_model::AllSchemaVersions;
 pub use probe::ProbeInfo;
 pub use rack::RackInit;
 pub use rack::SledUnderlayAllocationResult;
+pub use region::RegionAllocationFor;
+pub use region::RegionAllocationParameters;
 pub use silo::Discoverability;
 pub use sled::SledTransition;
 pub use sled::TransitionError;
diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs
index d7da24cce3..ac86c6a7d3 100644
--- a/nexus/db-queries/src/db/datastore/region.rs
+++ b/nexus/db-queries/src/db/datastore/region.rs
@@ -27,6 +27,29 @@ use omicron_common::api::external::LookupResult;
 use slog::Logger;
 use uuid::Uuid;
 
+pub enum RegionAllocationFor {
+    /// Allocate region(s) for a disk volume
+    DiskVolume { volume_id: Uuid },
+
+    /// Allocate region(s) for a snapshot volume, which may have read-only
+    /// targets.
+    SnapshotVolume { volume_id: Uuid, snapshot_id: Uuid },
+}
+
+/// Describe the region(s) to be allocated
+pub enum RegionAllocationParameters<'a> {
+    FromDiskSource {
+        disk_source: &'a params::DiskSource,
+        size: external::ByteCount,
+    },
+
+    FromRaw {
+        block_size: u64,
+        blocks_per_extent: u64,
+        extent_count: u64,
+    },
+}
+
 impl DataStore {
     pub(super) fn get_allocated_regions_query(
         volume_id: Uuid,
@@ -156,9 +179,8 @@ impl DataStore {
     ) -> Result<Vec<(Dataset, Region)>, Error> {
         self.arbitrary_region_allocate(
             opctx,
-            volume_id,
-            disk_source,
-            size,
+            RegionAllocationFor::DiskVolume { volume_id },
+            RegionAllocationParameters::FromDiskSource { disk_source, size },
             allocation_strategy,
             REGION_REDUNDANCY_THRESHOLD,
         )
@@ -175,47 +197,59 @@ impl DataStore {
     /// level for a volume. If a single region is allocated in isolation this
     /// could land on the same dataset as one of the existing volume's regions.
     ///
+    /// When allocating for snapshot volumes, it's important to take into
+    /// account `region_snapshot`s that may be used as some of the targets in
+    /// the region set, representing read-only downstairs served out of a ZFS
+    /// snapshot instead of a dataset.
+    ///
     /// Returns the allocated regions, as well as the datasets to which they
     /// belong.
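+    ///
+    /// A sketch of the snapshot-volume case (the `datastore`, `opctx`, and
+    /// ID bindings are assumed context here, not taken from a specific
+    /// caller):
+    ///
+    /// ```ignore
+    /// let datasets_and_regions = datastore
+    ///     .arbitrary_region_allocate(
+    ///         &opctx,
+    ///         RegionAllocationFor::SnapshotVolume { volume_id, snapshot_id },
+    ///         RegionAllocationParameters::FromRaw {
+    ///             block_size: 512,
+    ///             blocks_per_extent: 131072,
+    ///             extent_count: 16,
+    ///         },
+    ///         &RegionAllocationStrategy::Random { seed: None },
+    ///         REGION_REDUNDANCY_THRESHOLD,
+    ///     )
+    ///     .await?;
+    /// ```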
    pub async fn arbitrary_region_allocate(
         &self,
         opctx: &OpContext,
-        volume_id: Uuid,
-        disk_source: &params::DiskSource,
-        size: external::ByteCount,
+        region_for: RegionAllocationFor,
+        region_parameters: RegionAllocationParameters<'_>,
         allocation_strategy: &RegionAllocationStrategy,
         num_regions_required: usize,
     ) -> Result<Vec<(Dataset, Region)>, Error> {
-        let block_size =
-            self.get_block_size_from_disk_source(opctx, &disk_source).await?;
-        let (blocks_per_extent, extent_count) =
-            Self::get_crucible_allocation(&block_size, size);
+        let (volume_id, maybe_snapshot_id) = match region_for {
+            RegionAllocationFor::DiskVolume { volume_id } => (volume_id, None),
 
-        self.arbitrary_region_allocate_direct(
-            opctx,
-            volume_id,
-            u64::from(block_size.to_bytes()),
-            blocks_per_extent,
-            extent_count,
-            allocation_strategy,
-            num_regions_required,
-        )
-        .await
-    }
+            RegionAllocationFor::SnapshotVolume { volume_id, snapshot_id } => {
+                (volume_id, Some(snapshot_id))
+            }
+        };
+
+        let (block_size, blocks_per_extent, extent_count) =
+            match region_parameters {
+                RegionAllocationParameters::FromDiskSource {
+                    disk_source,
+                    size,
+                } => {
+                    let block_size = self
+                        .get_block_size_from_disk_source(opctx, &disk_source)
+                        .await?;
+
+                    let (blocks_per_extent, extent_count) =
+                        Self::get_crucible_allocation(&block_size, size);
+
+                    (
+                        u64::from(block_size.to_bytes()),
+                        blocks_per_extent,
+                        extent_count,
+                    )
+                }
+
+                RegionAllocationParameters::FromRaw {
+                    block_size,
+                    blocks_per_extent,
+                    extent_count,
+                } => (block_size, blocks_per_extent, extent_count),
+            };
 
-    #[allow(clippy::too_many_arguments)]
-    pub async fn arbitrary_region_allocate_direct(
-        &self,
-        opctx: &OpContext,
-        volume_id: Uuid,
-        block_size: u64,
-        blocks_per_extent: u64,
-        extent_count: u64,
-        allocation_strategy: &RegionAllocationStrategy,
-        num_regions_required: usize,
-    ) -> Result<Vec<(Dataset, Region)>, Error> {
         let query = crate::db::queries::region_allocation::allocation_query(
             volume_id,
+            maybe_snapshot_id,
             block_size,
             blocks_per_extent,
             extent_count,
@@ -234,6 +268,7 @@
             self.log,
             "Allocated regions for volume";
             "volume_id" => %volume_id,
+            "maybe_snapshot_id" => ?maybe_snapshot_id,
             "datasets_and_regions" => ?dataset_and_regions,
         );
 
diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs
index 83cc7483c9..44a89376d4 100644
--- a/nexus/db-queries/src/db/queries/region_allocation.rs
+++ b/nexus/db-queries/src/db/queries/region_allocation.rs
@@ -67,8 +67,14 @@ type SelectableSql<T> = <
     <T as diesel::Selectable<Pg>>::SelectExpression as diesel::Expression
 >::SqlType;
 
+/// For a given volume, idempotently allocate enough regions (according to
+/// some allocation strategy) to meet some redundancy level. This should only
+/// be used for the region set that is in the top level of the Volume (not the
+/// deeper layers of the hierarchy). If that volume has region snapshots in
+/// the region set, a `snapshot_id` should be supplied matching those entries.
 pub fn allocation_query(
     volume_id: uuid::Uuid,
+    snapshot_id: Option<uuid::Uuid>,
     block_size: u64,
     blocks_per_extent: u64,
     extent_count: u64,
@@ -116,24 +122,42 @@ pub fn allocation_query(
   SELECT
     dataset.pool_id,
     sum(dataset.size_used) AS size_used
-  FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),")
-
-    // Any zpool already have this volume's existing regions?
-    .sql("
-  existing_zpools AS (
-    SELECT
-      dataset.pool_id
-    FROM
-      dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id)
-  ),")
+  FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),");
+
+    let builder = if let Some(snapshot_id) = snapshot_id {
+        // Any zpool already have this volume's existing regions, or host the
+        // snapshot volume's regions?
+        builder.sql("
+  existing_zpools AS ((
+    SELECT
+      dataset.pool_id
+    FROM
+      dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id)
+  ) UNION (
+    SELECT dataset.pool_id FROM
+      dataset INNER JOIN region_snapshot ON (region_snapshot.dataset_id = dataset.id)
+    WHERE region_snapshot.snapshot_id = ").param().sql(")),")
+        .bind::<sql_types::Uuid, _>(snapshot_id)
+    } else {
+        // Any zpool already have this volume's existing regions?
+        builder.sql("
+  existing_zpools AS (
+    SELECT
+      dataset.pool_id
+    FROM
+      dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id)
+  ),")
+    };
 
     // Identifies zpools with enough space for region allocation, that are not
     // currently used by this Volume's existing regions.
     //
     // NOTE: 'distinct_sleds' changes the format of the underlying SQL query, as it uses
     // distinct bind parameters depending on the conditional branch.
-    .sql("
-  candidate_zpools AS (");
+    let builder = builder.sql(
+        "
+  candidate_zpools AS (",
+    );
     let builder = if distinct_sleds {
         builder.sql("SELECT DISTINCT ON (zpool.sled_id) ")
     } else {
@@ -384,10 +408,15 @@ mod test {
         let blocks_per_extent = 4;
         let extent_count = 8;
 
+        // Start with snapshot_id = None
+
+        let snapshot_id = None;
+
         // First structure: "RandomWithDistinctSleds"
 
         let region_allocate = allocation_query(
             volume_id,
+            snapshot_id,
             block_size,
             blocks_per_extent,
             extent_count,
@@ -406,6 +435,7 @@
 
         let region_allocate = allocation_query(
             volume_id,
+            snapshot_id,
             block_size,
             blocks_per_extent,
             extent_count,
@@ -417,6 +447,46 @@
             "tests/output/region_allocate_random_sleds.sql",
         )
         .await;
+
+        // Next, put a value in for snapshot_id
+
+        let snapshot_id = Some(Uuid::new_v4());
+
+        // First structure: "RandomWithDistinctSleds"
+
+        let region_allocate = allocation_query(
+            volume_id,
+            snapshot_id,
+            block_size,
+            blocks_per_extent,
+            extent_count,
+            &RegionAllocationStrategy::RandomWithDistinctSleds {
+                seed: Some(1),
+            },
+            REGION_REDUNDANCY_THRESHOLD,
+        );
+        expectorate_query_contents(
+            &region_allocate,
+            "tests/output/region_allocate_with_snapshot_distinct_sleds.sql",
+        )
+        .await;
+
+        // Second structure: "Random"
+
+        let region_allocate = allocation_query(
+            volume_id,
+            snapshot_id,
+            block_size,
+            blocks_per_extent,
+            extent_count,
+            &RegionAllocationStrategy::Random { seed: Some(1) },
+            REGION_REDUNDANCY_THRESHOLD,
+        );
+        expectorate_query_contents(
+            &region_allocate,
+            "tests/output/region_allocate_with_snapshot_random_sleds.sql",
+        )
+        .await;
     }
 
     // Explain the possible forms of the SQL query to ensure that it
@@ -439,6 +509,7 @@
 
         let region_allocate = allocation_query(
             volume_id,
+            None,
             block_size,
             blocks_per_extent,
             extent_count,
@@ -454,6 +525,7 @@
 
         let region_allocate = allocation_query(
             volume_id,
+            None,
             block_size,
             blocks_per_extent,
             extent_count,
diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql
new file mode 100644
index 0000000000..b90b2f2adf
--- /dev/null
+++
b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql @@ -0,0 +1,323 @@ +WITH + old_regions + AS ( + SELECT + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + FROM + region + WHERE + region.volume_id = $1 + ), + old_zpool_usage + AS ( + SELECT + dataset.pool_id, sum(dataset.size_used) AS size_used + FROM + dataset + WHERE + (dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL) + GROUP BY + dataset.pool_id + ), + existing_zpools + AS ( + ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN old_regions ON old_regions.dataset_id = dataset.id + ) + UNION + ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN region_snapshot ON region_snapshot.dataset_id = dataset.id + WHERE + region_snapshot.snapshot_id = $2 + ) + ), + candidate_zpools + AS ( + SELECT + DISTINCT ON (zpool.sled_id) old_zpool_usage.pool_id + FROM + old_zpool_usage + INNER JOIN (zpool INNER JOIN sled ON zpool.sled_id = sled.id) ON + zpool.id = old_zpool_usage.pool_id + INNER JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id + WHERE + (old_zpool_usage.size_used + $3) + <= ( + SELECT + total_size + FROM + omicron.public.inv_zpool + WHERE + inv_zpool.id = old_zpool_usage.pool_id + ORDER BY + inv_zpool.time_collected DESC + LIMIT + 1 + ) + AND sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + AND NOT (zpool.id = ANY (SELECT existing_zpools.pool_id FROM existing_zpools)) + ORDER BY + zpool.sled_id, md5(CAST(zpool.id AS BYTES) || $4) + ), + candidate_datasets + AS ( + SELECT + DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id + FROM + dataset INNER JOIN candidate_zpools ON dataset.pool_id = candidate_zpools.pool_id + WHERE + ((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) + AND dataset.kind = 'crucible' + ORDER BY + dataset.pool_id, md5(CAST(dataset.id AS BYTES) || $5) + ), + shuffled_candidate_datasets + AS ( + SELECT + candidate_datasets.id, candidate_datasets.pool_id + FROM + candidate_datasets + ORDER BY + md5(CAST(candidate_datasets.id AS BYTES) || $6) + LIMIT + $7 + ), + candidate_regions + AS ( + SELECT + gen_random_uuid() AS id, + now() AS time_created, + now() AS time_modified, + shuffled_candidate_datasets.id AS dataset_id, + $8 AS volume_id, + $9 AS block_size, + $10 AS blocks_per_extent, + $11 AS extent_count + FROM + shuffled_candidate_datasets + LIMIT + $12 - (SELECT count(*) FROM old_regions) + ), + proposed_dataset_changes + AS ( + SELECT + candidate_regions.dataset_id AS id, + dataset.pool_id AS pool_id, + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS size_used_delta + FROM + candidate_regions INNER JOIN dataset ON dataset.id = candidate_regions.dataset_id + ), + do_insert + AS ( + SELECT + ( + ( + (SELECT count(*) FROM old_regions LIMIT 1) < $13 + AND CAST( + IF( + ( + ( + ( + (SELECT count(*) FROM candidate_zpools LIMIT 1) + + (SELECT count(*) FROM existing_zpools LIMIT 1) + ) + ) + >= $14 + ), + 'TRUE', + 'Not enough space' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + ( + (SELECT count(*) FROM candidate_regions LIMIT 1) + + (SELECT count(*) FROM old_regions LIMIT 1) + ) + ) + >= $15 + ), + 'TRUE', + 'Not enough datasets' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + ( + SELECT + count(DISTINCT pool_id) + FROM + ( + ( + SELECT + 
dataset.pool_id + FROM + candidate_regions + INNER JOIN dataset ON candidate_regions.dataset_id = dataset.id + ) + UNION + ( + SELECT + dataset.pool_id + FROM + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id + ) + ) + LIMIT + 1 + ) + ) + >= $16 + ), + 'TRUE', + 'Not enough unique zpools selected' + ) + AS BOOL + ) + AS insert + ), + inserted_regions + AS ( + INSERT + INTO + region + ( + id, + time_created, + time_modified, + dataset_id, + volume_id, + block_size, + blocks_per_extent, + extent_count + ) + SELECT + candidate_regions.id, + candidate_regions.time_created, + candidate_regions.time_modified, + candidate_regions.dataset_id, + candidate_regions.volume_id, + candidate_regions.block_size, + candidate_regions.blocks_per_extent, + candidate_regions.extent_count + FROM + candidate_regions + WHERE + (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + ), + updated_datasets + AS ( + UPDATE + dataset + SET + size_used + = dataset.size_used + + ( + SELECT + proposed_dataset_changes.size_used_delta + FROM + proposed_dataset_changes + WHERE + proposed_dataset_changes.id = dataset.id + LIMIT + 1 + ) + WHERE + dataset.id = ANY (SELECT proposed_dataset_changes.id FROM proposed_dataset_changes) + AND (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used + ) +( + SELECT + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used, + old_regions.id, + old_regions.time_created, + old_regions.time_modified, + old_regions.dataset_id, + old_regions.volume_id, + old_regions.block_size, + old_regions.blocks_per_extent, + old_regions.extent_count + FROM + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id +) +UNION + ( + SELECT + updated_datasets.id, + updated_datasets.time_created, + updated_datasets.time_modified, + updated_datasets.time_deleted, + updated_datasets.rcgen, + updated_datasets.pool_id, + updated_datasets.ip, + updated_datasets.port, + updated_datasets.kind, + updated_datasets.size_used, + inserted_regions.id, + inserted_regions.time_created, + inserted_regions.time_modified, + inserted_regions.dataset_id, + inserted_regions.volume_id, + inserted_regions.block_size, + inserted_regions.blocks_per_extent, + inserted_regions.extent_count + FROM + inserted_regions + INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id + ) diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql new file mode 100644 index 0000000000..daf13f18ce --- /dev/null +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql @@ -0,0 +1,321 @@ +WITH + old_regions + AS ( + SELECT + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + FROM + region + WHERE + region.volume_id = $1 + ), + old_zpool_usage + AS ( + SELECT + dataset.pool_id, sum(dataset.size_used) AS size_used + FROM + dataset + WHERE + 
(dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL) + GROUP BY + dataset.pool_id + ), + existing_zpools + AS ( + ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN old_regions ON old_regions.dataset_id = dataset.id + ) + UNION + ( + SELECT + dataset.pool_id + FROM + dataset INNER JOIN region_snapshot ON region_snapshot.dataset_id = dataset.id + WHERE + region_snapshot.snapshot_id = $2 + ) + ), + candidate_zpools + AS ( + SELECT + old_zpool_usage.pool_id + FROM + old_zpool_usage + INNER JOIN (zpool INNER JOIN sled ON zpool.sled_id = sled.id) ON + zpool.id = old_zpool_usage.pool_id + INNER JOIN physical_disk ON zpool.physical_disk_id = physical_disk.id + WHERE + (old_zpool_usage.size_used + $3) + <= ( + SELECT + total_size + FROM + omicron.public.inv_zpool + WHERE + inv_zpool.id = old_zpool_usage.pool_id + ORDER BY + inv_zpool.time_collected DESC + LIMIT + 1 + ) + AND sled.sled_policy = 'in_service' + AND sled.sled_state = 'active' + AND physical_disk.disk_policy = 'in_service' + AND physical_disk.disk_state = 'active' + AND NOT (zpool.id = ANY (SELECT existing_zpools.pool_id FROM existing_zpools)) + ), + candidate_datasets + AS ( + SELECT + DISTINCT ON (dataset.pool_id) dataset.id, dataset.pool_id + FROM + dataset INNER JOIN candidate_zpools ON dataset.pool_id = candidate_zpools.pool_id + WHERE + ((dataset.time_deleted IS NULL) AND (dataset.size_used IS NOT NULL)) + AND dataset.kind = 'crucible' + ORDER BY + dataset.pool_id, md5(CAST(dataset.id AS BYTES) || $4) + ), + shuffled_candidate_datasets + AS ( + SELECT + candidate_datasets.id, candidate_datasets.pool_id + FROM + candidate_datasets + ORDER BY + md5(CAST(candidate_datasets.id AS BYTES) || $5) + LIMIT + $6 + ), + candidate_regions + AS ( + SELECT + gen_random_uuid() AS id, + now() AS time_created, + now() AS time_modified, + shuffled_candidate_datasets.id AS dataset_id, + $7 AS volume_id, + $8 AS block_size, + $9 AS blocks_per_extent, + $10 AS extent_count + FROM + shuffled_candidate_datasets + LIMIT + $11 - (SELECT count(*) FROM old_regions) + ), + proposed_dataset_changes + AS ( + SELECT + candidate_regions.dataset_id AS id, + dataset.pool_id AS pool_id, + candidate_regions.block_size + * candidate_regions.blocks_per_extent + * candidate_regions.extent_count + AS size_used_delta + FROM + candidate_regions INNER JOIN dataset ON dataset.id = candidate_regions.dataset_id + ), + do_insert + AS ( + SELECT + ( + ( + (SELECT count(*) FROM old_regions LIMIT 1) < $12 + AND CAST( + IF( + ( + ( + ( + (SELECT count(*) FROM candidate_zpools LIMIT 1) + + (SELECT count(*) FROM existing_zpools LIMIT 1) + ) + ) + >= $13 + ), + 'TRUE', + 'Not enough space' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + ( + (SELECT count(*) FROM candidate_regions LIMIT 1) + + (SELECT count(*) FROM old_regions LIMIT 1) + ) + ) + >= $14 + ), + 'TRUE', + 'Not enough datasets' + ) + AS BOOL + ) + ) + AND CAST( + IF( + ( + ( + ( + SELECT + count(DISTINCT pool_id) + FROM + ( + ( + SELECT + dataset.pool_id + FROM + candidate_regions + INNER JOIN dataset ON candidate_regions.dataset_id = dataset.id + ) + UNION + ( + SELECT + dataset.pool_id + FROM + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id + ) + ) + LIMIT + 1 + ) + ) + >= $15 + ), + 'TRUE', + 'Not enough unique zpools selected' + ) + AS BOOL + ) + AS insert + ), + inserted_regions + AS ( + INSERT + INTO + region + ( + id, + time_created, + time_modified, + dataset_id, + volume_id, + block_size, + blocks_per_extent, + extent_count + ) + SELECT + candidate_regions.id, + 
candidate_regions.time_created, + candidate_regions.time_modified, + candidate_regions.dataset_id, + candidate_regions.volume_id, + candidate_regions.block_size, + candidate_regions.blocks_per_extent, + candidate_regions.extent_count + FROM + candidate_regions + WHERE + (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + region.id, + region.time_created, + region.time_modified, + region.dataset_id, + region.volume_id, + region.block_size, + region.blocks_per_extent, + region.extent_count + ), + updated_datasets + AS ( + UPDATE + dataset + SET + size_used + = dataset.size_used + + ( + SELECT + proposed_dataset_changes.size_used_delta + FROM + proposed_dataset_changes + WHERE + proposed_dataset_changes.id = dataset.id + LIMIT + 1 + ) + WHERE + dataset.id = ANY (SELECT proposed_dataset_changes.id FROM proposed_dataset_changes) + AND (SELECT do_insert.insert FROM do_insert LIMIT 1) + RETURNING + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used + ) +( + SELECT + dataset.id, + dataset.time_created, + dataset.time_modified, + dataset.time_deleted, + dataset.rcgen, + dataset.pool_id, + dataset.ip, + dataset.port, + dataset.kind, + dataset.size_used, + old_regions.id, + old_regions.time_created, + old_regions.time_modified, + old_regions.dataset_id, + old_regions.volume_id, + old_regions.block_size, + old_regions.blocks_per_extent, + old_regions.extent_count + FROM + old_regions INNER JOIN dataset ON old_regions.dataset_id = dataset.id +) +UNION + ( + SELECT + updated_datasets.id, + updated_datasets.time_created, + updated_datasets.time_modified, + updated_datasets.time_deleted, + updated_datasets.rcgen, + updated_datasets.pool_id, + updated_datasets.ip, + updated_datasets.port, + updated_datasets.kind, + updated_datasets.size_used, + inserted_regions.id, + inserted_regions.time_created, + inserted_regions.time_modified, + inserted_regions.dataset_id, + inserted_regions.volume_id, + inserted_regions.block_size, + inserted_regions.blocks_per_extent, + inserted_regions.extent_count + FROM + inserted_regions + INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id + ) diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index f546ae645b..c983716b4f 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -257,12 +257,16 @@ async fn srrs_alloc_new_region( // agent could reuse the allocated port and cause trouble. 
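+    // Region replacement here targets a volume's read/write region set, so
+    // the allocation is keyed as `DiskVolume`: no snapshot ID is supplied,
+    // and only zpools backing this volume's existing regions are excluded.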
    let datasets_and_regions = osagactx
         .datastore()
-        .arbitrary_region_allocate_direct(
+        .arbitrary_region_allocate(
             &opctx,
-            db_region.volume_id(),
-            db_region.block_size().to_bytes(),
-            db_region.blocks_per_extent(),
-            db_region.extent_count(),
+            db::datastore::RegionAllocationFor::DiskVolume {
+                volume_id: db_region.volume_id(),
+            },
+            db::datastore::RegionAllocationParameters::FromRaw {
+                block_size: db_region.block_size().to_bytes(),
+                blocks_per_extent: db_region.blocks_per_extent(),
+                extent_count: db_region.extent_count(),
+            },
             &params.allocation_strategy,
             // Note: this assumes that previous redundancy is
             // REGION_REDUNDANCY_THRESHOLD, and that region replacement will
diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs
index beba6ba7c0..a707f8240d 100644
--- a/nexus/tests/integration_tests/disks.rs
+++ b/nexus/tests/integration_tests/disks.rs
@@ -13,6 +13,8 @@ use http::StatusCode;
 use nexus_config::RegionAllocationStrategy;
 use nexus_db_model::PhysicalDiskPolicy;
 use nexus_db_queries::context::OpContext;
+use nexus_db_queries::db::datastore::RegionAllocationFor;
+use nexus_db_queries::db::datastore::RegionAllocationParameters;
 use nexus_db_queries::db::datastore::REGION_REDUNDANCY_THRESHOLD;
 use nexus_db_queries::db::fixed_data::{silo::DEFAULT_SILO_ID, FLEET_ID};
 use nexus_db_queries::db::lookup::LookupPath;
@@ -2033,11 +2035,13 @@ async fn test_single_region_allocate(cptestctx: &ControlPlaneTestContext) {
     let datasets_and_regions = datastore
         .arbitrary_region_allocate(
             &opctx,
-            volume_id,
-            &params::DiskSource::Blank {
-                block_size: params::BlockSize::try_from(512).unwrap(),
+            RegionAllocationFor::DiskVolume { volume_id },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(512).unwrap(),
+                },
+                size: ByteCount::from_gibibytes_u32(1),
             },
-            ByteCount::from_gibibytes_u32(1),
             &RegionAllocationStrategy::Random { seed: None },
             1,
         )
@@ -2173,11 +2177,13 @@ async fn test_region_allocation_strategy_random_is_idempotent_arbitrary(
     let datasets_and_regions = datastore
         .arbitrary_region_allocate(
             &opctx,
-            volume_id,
-            &params::DiskSource::Blank {
-                block_size: params::BlockSize::try_from(512).unwrap(),
+            RegionAllocationFor::DiskVolume { volume_id },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(512).unwrap(),
+                },
+                size: ByteCount::from_gibibytes_u32(1),
             },
-            ByteCount::from_gibibytes_u32(1),
             &RegionAllocationStrategy::Random { seed: None },
             REGION_REDUNDANCY_THRESHOLD,
         )
@@ -2191,11 +2197,13 @@ async fn test_region_allocation_strategy_random_is_idempotent_arbitrary(
     let datasets_and_regions = datastore
         .arbitrary_region_allocate(
             &opctx,
-            volume_id,
-            &params::DiskSource::Blank {
-                block_size: params::BlockSize::try_from(512).unwrap(),
+            RegionAllocationFor::DiskVolume { volume_id },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(512).unwrap(),
+                },
+                size: ByteCount::from_gibibytes_u32(1),
             },
-            ByteCount::from_gibibytes_u32(1),
             &RegionAllocationStrategy::Random { seed: None },
             REGION_REDUNDANCY_THRESHOLD + 1,
         )
@@ -2262,14 +2270,16 @@ async fn test_single_region_allocate_for_replace(
     let datasets_and_regions = datastore
         .arbitrary_region_allocate(
             &opctx,
-            db_disk.volume_id,
-            &params::DiskSource::Blank {
-                block_size: params::BlockSize::try_from(
-                    region_to_replace.block_size().to_bytes() as u32,
-                )
-                .unwrap(),
+            RegionAllocationFor::DiskVolume { volume_id: db_disk.volume_id },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(
+                        region_to_replace.block_size().to_bytes() as u32,
+                    )
+                    .unwrap(),
+                },
+                size: region_total_size,
             },
-            region_total_size,
             &RegionAllocationStrategy::Random { seed: None },
             one_more,
         )
@@ -2348,14 +2358,16 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools(
     let result = datastore
         .arbitrary_region_allocate(
             &opctx,
-            db_disk.volume_id,
-            &params::DiskSource::Blank {
-                block_size: params::BlockSize::try_from(
-                    region_to_replace.block_size().to_bytes() as u32,
-                )
-                .unwrap(),
+            RegionAllocationFor::DiskVolume { volume_id: db_disk.volume_id },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(
+                        region_to_replace.block_size().to_bytes() as u32,
+                    )
+                    .unwrap(),
+                },
+                size: region_total_size,
             },
-            region_total_size,
             &RegionAllocationStrategy::Random { seed: None },
             one_more,
         )
@@ -2367,14 +2379,16 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools(
     let datasets_and_regions = datastore
         .arbitrary_region_allocate(
             &opctx,
-            db_disk.volume_id,
-            &params::DiskSource::Blank {
-                block_size: params::BlockSize::try_from(
-                    region_to_replace.block_size().to_bytes() as u32,
-                )
-                .unwrap(),
+            RegionAllocationFor::DiskVolume { volume_id: db_disk.volume_id },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(
+                        region_to_replace.block_size().to_bytes() as u32,
+                    )
+                    .unwrap(),
+                },
+                size: region_total_size,
             },
-            region_total_size,
             &RegionAllocationStrategy::Random { seed: None },
             allocated_regions.len(),
         )
diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs
index 3fb6f8f6ec..f0ce2cca72 100644
--- a/nexus/tests/integration_tests/snapshots.rs
+++ b/nexus/tests/integration_tests/snapshots.rs
@@ -9,15 +9,20 @@ use chrono::Utc;
 use dropshot::test_util::ClientTestContext;
 use http::method::Method;
 use http::StatusCode;
+use nexus_config::RegionAllocationStrategy;
 use nexus_db_queries::authz;
 use nexus_db_queries::context::OpContext;
 use nexus_db_queries::db;
+use nexus_db_queries::db::datastore::RegionAllocationFor;
+use nexus_db_queries::db::datastore::RegionAllocationParameters;
+use nexus_db_queries::db::datastore::REGION_REDUNDANCY_THRESHOLD;
 use nexus_db_queries::db::identity::Resource;
 use nexus_db_queries::db::lookup::LookupPath;
 use nexus_test_utils::http_testing::AuthnMode;
 use nexus_test_utils::http_testing::NexusRequest;
 use nexus_test_utils::http_testing::RequestBuilder;
 use nexus_test_utils::resource_helpers::create_default_ip_pool;
+use nexus_test_utils::resource_helpers::create_disk;
 use nexus_test_utils::resource_helpers::create_project;
 use nexus_test_utils::resource_helpers::object_create;
 use nexus_test_utils::resource_helpers::DiskTest;
@@ -1441,3 +1446,194 @@ async fn test_multiple_deletes_not_sent(cptestctx: &ControlPlaneTestContext) {
         assert!(!resources_2_datasets_and_snapshots.contains(tuple));
     }
 }
+
+/// Ensure that allocating one replacement for a snapshot works
+#[nexus_test]
+async fn test_region_allocation_for_snapshot(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let nexus = &cptestctx.server.server_context().nexus;
+    let datastore = nexus.datastore();
+    let opctx =
+        OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone());
+
+    // Create three 10 GiB zpools, each with one dataset.
+    let mut disk_test = DiskTest::new(&cptestctx).await;
+
+    // An additional zpool is required, otherwise the allocation will fail
+    // with "not enough storage"
+    disk_test.add_zpool_with_dataset(&cptestctx).await;
+
+    // Assert default is still 10 GiB
+    assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB);
+
+    // Create a disk
+    let client = &cptestctx.external_client;
+    let _project_id = create_project_and_pool(client).await;
+
+    let disk = create_disk(&client, PROJECT_NAME, "disk").await;
+
+    // Assert disk has three allocated regions
+    let disk_id = disk.identity.id;
+    let (.., db_disk) = LookupPath::new(&opctx, &datastore)
+        .disk_id(disk_id)
+        .fetch()
+        .await
+        .unwrap_or_else(|_| panic!("test disk {:?} should exist", disk_id));
+
+    let allocated_regions =
+        datastore.get_allocated_regions(db_disk.volume_id).await.unwrap();
+    assert_eq!(allocated_regions.len(), REGION_REDUNDANCY_THRESHOLD);
+
+    // Create a snapshot of the disk
+
+    let snapshots_url = format!("/v1/snapshots?project={}", PROJECT_NAME);
+
+    let snapshot: views::Snapshot = object_create(
+        client,
+        &snapshots_url,
+        &params::SnapshotCreate {
+            identity: IdentityMetadataCreateParams {
+                name: "snapshot".parse().unwrap(),
+                description: String::from("a snapshot"),
+            },
+            disk: disk_id.into(),
+        },
+    )
+    .await;
+
+    assert_eq!(snapshot.disk_id, disk.identity.id);
+    assert_eq!(snapshot.size, disk.size);
+
+    // There shouldn't be any regions for the snapshot volume
+
+    let snapshot_id = snapshot.identity.id;
+    let (.., db_snapshot) = LookupPath::new(&opctx, &datastore)
+        .snapshot_id(snapshot_id)
+        .fetch()
+        .await
+        .unwrap_or_else(|_| {
+            panic!("test snapshot {:?} should exist", snapshot_id)
+        });
+
+    let allocated_regions =
+        datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap();
+
+    assert_eq!(allocated_regions.len(), 0);
+
+    // Run region allocation for the snapshot volume, requesting a redundancy
+    // of 1 (one more than the snapshot volume's existing region count of
+    // zero), and expect exactly one region to be allocated.
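+    //
+    // Passing `SnapshotVolume` here is what routes the snapshot ID into the
+    // query's `existing_zpools` CTE, so the zpools holding this snapshot's
+    // `region_snapshot` entries are treated as already occupied.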
+
+    datastore
+        .arbitrary_region_allocate(
+            &opctx,
+            RegionAllocationFor::SnapshotVolume {
+                volume_id: db_snapshot.volume_id,
+                snapshot_id: snapshot.identity.id,
+            },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(
+                        disk.block_size.to_bytes() as u32,
+                    )
+                    .unwrap(),
+                },
+                size: disk.size,
+            },
+            &RegionAllocationStrategy::Random { seed: None },
+            1,
+        )
+        .await
+        .unwrap();
+
+    let allocated_regions =
+        datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap();
+
+    assert_eq!(allocated_regions.len(), 1);
+
+    // Assert that all regions are on separate datasets from the region
+    // snapshots
+
+    for (_, region) in allocated_regions {
+        {
+            let conn = datastore.pool_connection_for_tests().await.unwrap();
+
+            use async_bb8_diesel::AsyncRunQueryDsl;
+            use db::schema::region_snapshot::dsl;
+            use diesel::ExpressionMethods;
+            use diesel::QueryDsl;
+            use diesel::SelectableHelper;
+
+            let region_snapshots: Vec<db::model::RegionSnapshot> =
+                dsl::region_snapshot
+                    .filter(dsl::dataset_id.eq(region.dataset_id()))
+                    .filter(dsl::snapshot_id.eq(snapshot.identity.id))
+                    .select(db::model::RegionSnapshot::as_select())
+                    .load_async::<db::model::RegionSnapshot>(&*conn)
+                    .await
+                    .unwrap();
+
+            assert!(region_snapshots.is_empty());
+        }
+    }
+
+    // Ensure the function is idempotent
+
+    datastore
+        .arbitrary_region_allocate(
+            &opctx,
+            RegionAllocationFor::SnapshotVolume {
+                volume_id: db_snapshot.volume_id,
+                snapshot_id: snapshot.identity.id,
+            },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(
+                        disk.block_size.to_bytes() as u32,
+                    )
+                    .unwrap(),
+                },
+                size: disk.size,
+            },
+            &RegionAllocationStrategy::Random { seed: None },
+            1,
+        )
+        .await
+        .unwrap();
+
+    let allocated_regions =
+        datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap();
+
+    assert_eq!(allocated_regions.len(), 1);
+
+    // If an additional region is required, make sure that works too.
+    disk_test.add_zpool_with_dataset(&cptestctx).await;
+
+    datastore
+        .arbitrary_region_allocate(
+            &opctx,
+            RegionAllocationFor::SnapshotVolume {
+                volume_id: db_snapshot.volume_id,
+                snapshot_id: snapshot.identity.id,
+            },
+            RegionAllocationParameters::FromDiskSource {
+                disk_source: &params::DiskSource::Blank {
+                    block_size: params::BlockSize::try_from(
+                        disk.block_size.to_bytes() as u32,
+                    )
+                    .unwrap(),
+                },
+                size: disk.size,
+            },
+            &RegionAllocationStrategy::Random { seed: None },
+            2,
+        )
+        .await
+        .unwrap();
+
+    let allocated_regions =
+        datastore.get_allocated_regions(db_snapshot.volume_id).await.unwrap();
+
+    assert_eq!(allocated_regions.len(), 2);
+}
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 4fd1930d33..338f52f854 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -4129,6 +4129,11 @@ CREATE TABLE IF NOT EXISTS omicron.public.migration (
     time_target_updated TIMESTAMPTZ
 );
 
+/* Lookup region snapshot by snapshot id */
+CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_snapshot_id on omicron.public.region_snapshot (
+    snapshot_id
+);
+
 /*
  * Keep this at the end of file so that the database does not contain a version
  * until it is fully populated.
@@ -4140,7 +4145,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '75.0.0', NULL) + (TRUE, NOW(), NOW(), '76.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/lookup-region-snapshot-by-snapshot-id/up.sql b/schema/crdb/lookup-region-snapshot-by-snapshot-id/up.sql new file mode 100644 index 0000000000..40789781c2 --- /dev/null +++ b/schema/crdb/lookup-region-snapshot-by-snapshot-id/up.sql @@ -0,0 +1,3 @@ +CREATE INDEX IF NOT EXISTS lookup_region_snapshot_by_snapshot_id on omicron.public.region_snapshot ( + snapshot_id +);
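+
+-- A sketch of the lookup this index is meant to serve: the snapshot branch
+-- of the allocation query's `existing_zpools` CTE filters region_snapshot
+-- rows by snapshot_id before joining to dataset, e.g.:
+--
+--   SELECT dataset.pool_id
+--   FROM dataset
+--   INNER JOIN region_snapshot ON region_snapshot.dataset_id = dataset.id
+--   WHERE region_snapshot.snapshot_id = $1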