Skip to content

Commit

Permalink
Account for region snapshots during allocation (#5901)
Browse files Browse the repository at this point in the history
When replacing a snapshot (aka a read-only downstairs somewhere), region
allocation must occur for a snapshot volume. A snapshot volume's region
set is currently only composed of read-only downstairs which all contain
the same data, and any newly allocated region should take those into
account when looking to meet the redundancy criteria for a region set:
in production, allocate the new region to a distinct sled.

This is done by slightly changing the region allocation query to accept
an optional snapshot id. If that is supplied, the query adds the region
snapshot's pools to the `existing_zpools` temporary table in order to
prevent region allocation there.
  • Loading branch information
jmpesp authored Jun 14, 2024
1 parent fabbc5d commit a8b3ce2
Show file tree
Hide file tree
Showing 11 changed files with 1,060 additions and 84 deletions.
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::collections::BTreeMap;
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(75, 0, 0);
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(76, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -29,6 +29,7 @@ static KNOWN_VERSIONS: Lazy<Vec<KnownVersion>> = Lazy::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(76, "lookup-region-snapshot-by-snapshot-id"),
KnownVersion::new(75, "add-cockroach-zone-id-to-node-id"),
KnownVersion::new(74, "add-migration-table"),
KnownVersion::new(73, "add-vlan-to-uplink"),
Expand Down
2 changes: 2 additions & 0 deletions nexus/db-queries/src/db/datastore/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ use nexus_db_model::AllSchemaVersions;
pub use probe::ProbeInfo;
pub use rack::RackInit;
pub use rack::SledUnderlayAllocationResult;
pub use region::RegionAllocationFor;
pub use region::RegionAllocationParameters;
pub use silo::Discoverability;
pub use sled::SledTransition;
pub use sled::TransitionError;
Expand Down
99 changes: 67 additions & 32 deletions nexus/db-queries/src/db/datastore/region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,29 @@ use omicron_common::api::external::LookupResult;
use slog::Logger;
use uuid::Uuid;

/// Identifies the volume that region(s) are being allocated for.
///
/// The variant determines whether a snapshot id is forwarded to the region
/// allocation query alongside the volume id.
pub enum RegionAllocationFor {
/// Allocate region(s) for a disk volume
DiskVolume { volume_id: Uuid },

/// Allocate region(s) for a snapshot volume, which may have read-only
/// targets. The `snapshot_id` is passed to the allocation query as the
/// optional snapshot id.
SnapshotVolume { volume_id: Uuid, snapshot_id: Uuid },
}

/// Describe the region(s) to be allocated
pub enum RegionAllocationParameters<'a> {
/// Look up the block size from `disk_source`, then compute the blocks per
/// extent and extent count from that block size and `size`.
FromDiskSource {
disk_source: &'a params::DiskSource,
size: external::ByteCount,
},

/// Use the supplied block size, blocks per extent, and extent count as-is.
FromRaw {
block_size: u64,
blocks_per_extent: u64,
extent_count: u64,
},
}

impl DataStore {
pub(super) fn get_allocated_regions_query(
volume_id: Uuid,
Expand Down Expand Up @@ -156,9 +179,8 @@ impl DataStore {
) -> Result<Vec<(Dataset, Region)>, Error> {
self.arbitrary_region_allocate(
opctx,
volume_id,
disk_source,
size,
RegionAllocationFor::DiskVolume { volume_id },
RegionAllocationParameters::FromDiskSource { disk_source, size },
allocation_strategy,
REGION_REDUNDANCY_THRESHOLD,
)
Expand All @@ -175,47 +197,59 @@ impl DataStore {
/// level for a volume. If a single region is allocated in isolation this
/// could land on the same dataset as one of the existing volume's regions.
///
/// When allocating for snapshot volumes, it's important to take into account
/// `region_snapshot`s that may be used as some of the targets in the region
/// set, representing read-only downstairs served out of a ZFS snapshot
/// instead of a dataset.
///
/// Returns the allocated regions, as well as the datasets to which they
/// belong.
pub async fn arbitrary_region_allocate(
&self,
opctx: &OpContext,
volume_id: Uuid,
disk_source: &params::DiskSource,
size: external::ByteCount,
region_for: RegionAllocationFor,
region_parameters: RegionAllocationParameters<'_>,
allocation_strategy: &RegionAllocationStrategy,
num_regions_required: usize,
) -> Result<Vec<(Dataset, Region)>, Error> {
let block_size =
self.get_block_size_from_disk_source(opctx, &disk_source).await?;
let (blocks_per_extent, extent_count) =
Self::get_crucible_allocation(&block_size, size);
let (volume_id, maybe_snapshot_id) = match region_for {
RegionAllocationFor::DiskVolume { volume_id } => (volume_id, None),

self.arbitrary_region_allocate_direct(
opctx,
volume_id,
u64::from(block_size.to_bytes()),
blocks_per_extent,
extent_count,
allocation_strategy,
num_regions_required,
)
.await
}
RegionAllocationFor::SnapshotVolume { volume_id, snapshot_id } => {
(volume_id, Some(snapshot_id))
}
};

let (block_size, blocks_per_extent, extent_count) =
match region_parameters {
RegionAllocationParameters::FromDiskSource {
disk_source,
size,
} => {
let block_size = self
.get_block_size_from_disk_source(opctx, &disk_source)
.await?;

let (blocks_per_extent, extent_count) =
Self::get_crucible_allocation(&block_size, size);

(
u64::from(block_size.to_bytes()),
blocks_per_extent,
extent_count,
)
}

RegionAllocationParameters::FromRaw {
block_size,
blocks_per_extent,
extent_count,
} => (block_size, blocks_per_extent, extent_count),
};

#[allow(clippy::too_many_arguments)]
pub async fn arbitrary_region_allocate_direct(
&self,
opctx: &OpContext,
volume_id: Uuid,
block_size: u64,
blocks_per_extent: u64,
extent_count: u64,
allocation_strategy: &RegionAllocationStrategy,
num_regions_required: usize,
) -> Result<Vec<(Dataset, Region)>, Error> {
let query = crate::db::queries::region_allocation::allocation_query(
volume_id,
maybe_snapshot_id,
block_size,
blocks_per_extent,
extent_count,
Expand All @@ -234,6 +268,7 @@ impl DataStore {
self.log,
"Allocated regions for volume";
"volume_id" => %volume_id,
"maybe_snapshot_id" => ?maybe_snapshot_id,
"datasets_and_regions" => ?dataset_and_regions,
);

Expand Down
96 changes: 84 additions & 12 deletions nexus/db-queries/src/db/queries/region_allocation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,14 @@ type SelectableSql<T> = <
<T as diesel::Selectable<Pg>>::SelectExpression as diesel::Expression
>::SqlType;

/// For a given volume, idempotently allocate enough regions (according to some
/// allocation strategy) to meet some redundancy level. This should only be used
/// for the region set that is in the top level of the Volume (not the deeper
/// layers of the hierarchy). If that volume has region snapshots in the region
/// set, a `snapshot_id` should be supplied matching those entries.
pub fn allocation_query(
volume_id: uuid::Uuid,
snapshot_id: Option<uuid::Uuid>,
block_size: u64,
blocks_per_extent: u64,
extent_count: u64,
Expand Down Expand Up @@ -116,24 +122,42 @@ pub fn allocation_query(
SELECT
dataset.pool_id,
sum(dataset.size_used) AS size_used
FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),")

// Any zpool already have this volume's existing regions?
.sql("
existing_zpools AS (
SELECT
dataset.pool_id
FROM
dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id)
),")
FROM dataset WHERE ((dataset.size_used IS NOT NULL) AND (dataset.time_deleted IS NULL)) GROUP BY dataset.pool_id),");

let builder = if let Some(snapshot_id) = snapshot_id {
// Does any zpool already have this volume's existing regions, or host
// the snapshot volume's regions?
builder.sql("
existing_zpools AS ((
SELECT
dataset.pool_id
FROM
dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id)
) UNION (
select dataset.pool_id from
dataset inner join region_snapshot on (region_snapshot.dataset_id = dataset.id)
where region_snapshot.snapshot_id = ").param().sql(")),")
.bind::<sql_types::Uuid, _>(snapshot_id)
} else {
// Does any zpool already have this volume's existing regions?
builder.sql("
existing_zpools AS (
SELECT
dataset.pool_id
FROM
dataset INNER JOIN old_regions ON (old_regions.dataset_id = dataset.id)
),")
};

// Identifies zpools with enough space for region allocation, that are not
// currently used by this Volume's existing regions.
//
// NOTE: 'distinct_sleds' changes the format of the underlying SQL query, as it uses
// distinct bind parameters depending on the conditional branch.
.sql("
candidate_zpools AS (");
let builder = builder.sql(
"
candidate_zpools AS (",
);
let builder = if distinct_sleds {
builder.sql("SELECT DISTINCT ON (zpool.sled_id) ")
} else {
Expand Down Expand Up @@ -384,10 +408,15 @@ mod test {
let blocks_per_extent = 4;
let extent_count = 8;

// Start with snapshot_id = None

let snapshot_id = None;

// First structure: "RandomWithDistinctSleds"

let region_allocate = allocation_query(
volume_id,
snapshot_id,
block_size,
blocks_per_extent,
extent_count,
Expand All @@ -406,6 +435,7 @@ mod test {

let region_allocate = allocation_query(
volume_id,
snapshot_id,
block_size,
blocks_per_extent,
extent_count,
Expand All @@ -417,6 +447,46 @@ mod test {
"tests/output/region_allocate_random_sleds.sql",
)
.await;

// Next, put a value in for snapshot_id

let snapshot_id = Some(Uuid::new_v4());

// First structure: "RandomWithDistinctSleds"

let region_allocate = allocation_query(
volume_id,
snapshot_id,
block_size,
blocks_per_extent,
extent_count,
&RegionAllocationStrategy::RandomWithDistinctSleds {
seed: Some(1),
},
REGION_REDUNDANCY_THRESHOLD,
);
expectorate_query_contents(
&region_allocate,
"tests/output/region_allocate_with_snapshot_distinct_sleds.sql",
)
.await;

// Second structure: "Random"

let region_allocate = allocation_query(
volume_id,
snapshot_id,
block_size,
blocks_per_extent,
extent_count,
&RegionAllocationStrategy::Random { seed: Some(1) },
REGION_REDUNDANCY_THRESHOLD,
);
expectorate_query_contents(
&region_allocate,
"tests/output/region_allocate_with_snapshot_random_sleds.sql",
)
.await;
}

// Explain the possible forms of the SQL query to ensure that it
Expand All @@ -439,6 +509,7 @@ mod test {

let region_allocate = allocation_query(
volume_id,
None,
block_size,
blocks_per_extent,
extent_count,
Expand All @@ -454,6 +525,7 @@ mod test {

let region_allocate = allocation_query(
volume_id,
None,
block_size,
blocks_per_extent,
extent_count,
Expand Down
Loading

0 comments on commit a8b3ce2

Please sign in to comment.