From bf824f23d84abdb7771b85ad681d23bd81ddaec4 Mon Sep 17 00:00:00 2001 From: Greg Colombo Date: Tue, 17 Dec 2024 10:30:16 -0800 Subject: [PATCH] server: move migration failure injection into instance specs (#809) Make the server get migration failure injection device configuration from an instance spec component instead of the config TOML. This is a prelude to a larger change that will make the server stop depending on config TOMLs entirely (in favor of giving clients a library they can use to convert config TOMLs to instance spec components). PHD still needs to be able to set different import/export failure settings for migration sources and targets, so add the migration failure device to the set of components that can be replaced during preamble processing. --- bin/propolis-server/src/lib/initializer.rs | 38 ++------ .../src/lib/migrate/preamble.rs | 21 +++++ .../src/lib/spec/api_spec_v0.rs | 39 +++++++- bin/propolis-server/src/lib/spec/builder.rs | 25 ++++++ bin/propolis-server/src/lib/spec/mod.rs | 13 +++ bin/propolis-server/src/lib/vm/ensure.rs | 2 +- .../src/instance_spec/components/devices.rs | 15 ++++ .../src/instance_spec/v0.rs | 1 + lib/propolis/src/hw/testdev.rs | 2 +- openapi/propolis-server.json | 42 +++++++++ phd-tests/framework/src/test_vm/config.rs | 90 ++++++++++--------- 11 files changed, 213 insertions(+), 75 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index 9f610b0f8..a3ea3fb45 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -783,49 +783,29 @@ impl<'a> MachineInitializer<'a> { } #[cfg(not(feature = "omicron-build"))] - pub fn initialize_test_devices( - &mut self, - toml_cfg: &std::collections::BTreeMap< - String, - propolis_server_config::Device, - >, - ) { + pub fn initialize_test_devices(&mut self) { use propolis::hw::testdev::{ MigrationFailureDevice, MigrationFailures, }; - if let Some(dev) = toml_cfg.get(MigrationFailureDevice::NAME) { - const FAIL_EXPORTS: &str = "fail_exports"; - const FAIL_IMPORTS: &str = "fail_imports"; - let fail_exports = dev - .options - .get(FAIL_EXPORTS) - .and_then(|val| val.as_integer()) - .unwrap_or(0); - let fail_imports = dev - .options - .get(FAIL_IMPORTS) - .and_then(|val| val.as_integer()) - .unwrap_or(0); - - if fail_exports <= 0 && fail_imports <= 0 { + if let Some(mig) = &self.spec.migration_failure { + if mig.spec.fail_exports == 0 && mig.spec.fail_imports == 0 { info!( self.log, - "migration failure device will not fail, as both - `{FAIL_EXPORTS}` and `{FAIL_IMPORTS}` are 0"; - FAIL_EXPORTS => ?fail_exports, - FAIL_IMPORTS => ?fail_imports, + "migration failure device's failure counts are both 0"; + "device_spec" => ?mig.spec ); } let dev = MigrationFailureDevice::create( &self.log, MigrationFailures { - exports: fail_exports as usize, - imports: fail_imports as usize, + exports: mig.spec.fail_exports as usize, + imports: mig.spec.fail_imports as usize, }, ); - self.devices.insert(MigrationFailureDevice::NAME.into(), dev); + + self.devices.insert(mig.name.clone(), dev); } } diff --git a/bin/propolis-server/src/lib/migrate/preamble.rs b/bin/propolis-server/src/lib/migrate/preamble.rs index 6f0280401..a2946c556 100644 --- a/bin/propolis-server/src/lib/migrate/preamble.rs +++ b/bin/propolis-server/src/lib/migrate/preamble.rs @@ -78,6 +78,27 @@ impl Preamble { ComponentV0::VirtioNetworkBackend(nic.backend_spec.clone()); } + #[cfg(not(feature = "omicron-build"))] + if let Some(mig) = &target_spec.migration_failure { + let Some(to_amend) = source_spec.components.get_mut(&mig.name) + else { + return Err(MigrateError::InstanceSpecsIncompatible(format!( + "replacement component {} not in source spec", + mig.name + ))); + }; + + if !matches!(to_amend, ComponentV0::MigrationFailureInjector(_)) { + return Err(MigrateError::InstanceSpecsIncompatible(format!( + "component {} is not a migration failure injector \ + in the source spec", + mig.name + ))); + } + + *to_amend = ComponentV0::MigrationFailureInjector(mig.spec.clone()); + } + let amended_spec = source_spec.try_into().map_err(|e: ApiSpecError| { MigrateError::PreambleParse(e.to_string()) diff --git a/bin/propolis-server/src/lib/spec/api_spec_v0.rs b/bin/propolis-server/src/lib/spec/api_spec_v0.rs index 1bceaa049..1f88e7e11 100644 --- a/bin/propolis-server/src/lib/spec/api_spec_v0.rs +++ b/bin/propolis-server/src/lib/spec/api_spec_v0.rs @@ -26,6 +26,9 @@ use super::{ StorageDevice, }; +#[cfg(not(feature = "omicron-build"))] +use super::MigrationFailure; + #[cfg(feature = "falcon")] use super::SoftNpuPort; @@ -40,9 +43,9 @@ pub(crate) enum ApiSpecError { #[error("network backend {backend} not found for device {device}")] NetworkBackendNotFound { backend: String, device: String }, - #[cfg(not(feature = "falcon"))] - #[error("softnpu component {0} compiled out")] - SoftNpuCompiledOut(String), + #[allow(dead_code)] + #[error("support for component {component} compiled out via {feature}")] + FeatureCompiledOut { component: String, feature: &'static str }, #[error("backend {0} not used by any device")] BackendNotUsed(String), @@ -61,6 +64,8 @@ impl From for InstanceSpecV0 { serial, pci_pci_bridges, pvpanic, + #[cfg(not(feature = "omicron-build"))] + migration_failure, #[cfg(feature = "falcon")] softnpu, } = val; @@ -151,6 +156,15 @@ impl From for InstanceSpecV0 { ); } + #[cfg(not(feature = "omicron-build"))] + if let Some(mig) = migration_failure { + insert_component( + &mut spec, + mig.name, + ComponentV0::MigrationFailureInjector(mig.spec), + ); + } + #[cfg(feature = "falcon")] { if let Some(softnpu_pci) = softnpu.pci_port { @@ -292,12 +306,29 @@ impl TryFrom for Spec { // apply it to the builder later. boot_settings = Some((device_name, settings)); } + #[cfg(feature = "omicron-build")] + ComponentV0::MigrationFailureInjector(_) => { + return Err(ApiSpecError::FeatureCompiledOut { + component: device_name, + feature: "omicron-build", + }); + } + #[cfg(not(feature = "omicron-build"))] + ComponentV0::MigrationFailureInjector(mig) => { + builder.add_migration_failure_device(MigrationFailure { + name: device_name, + spec: mig, + })?; + } #[cfg(not(feature = "falcon"))] ComponentV0::SoftNpuPciPort(_) | ComponentV0::SoftNpuPort(_) | ComponentV0::SoftNpuP9(_) | ComponentV0::P9fs(_) => { - return Err(ApiSpecError::SoftNpuCompiledOut(device_name)); + return Err(ApiSpecError::FeatureCompiledOut { + component: device_name, + feature: "falcon", + }); } #[cfg(feature = "falcon")] ComponentV0::SoftNpuPciPort(port) => { diff --git a/bin/propolis-server/src/lib/spec/builder.rs b/bin/propolis-server/src/lib/spec/builder.rs index 1a5dee420..b9012d212 100644 --- a/bin/propolis-server/src/lib/spec/builder.rs +++ b/bin/propolis-server/src/lib/spec/builder.rs @@ -32,6 +32,9 @@ use super::{ Board, BootOrderEntry, BootSettings, Disk, Nic, QemuPvpanic, SerialPort, }; +#[cfg(not(feature = "omicron-build"))] +use super::MigrationFailure; + #[cfg(feature = "falcon")] use super::{ParsedSoftNpu, SoftNpuPort}; @@ -59,6 +62,10 @@ pub(crate) enum SpecBuilderError { #[error("pvpanic device already specified")] PvpanicInUse, + #[cfg(not(feature = "omicron-build"))] + #[error("migration failure injection already enabled")] + MigrationFailureInjectionInUse, + #[error("boot settings were already specified")] BootSettingsInUse, @@ -367,6 +374,24 @@ impl SpecBuilder { Ok(self) } + #[cfg(not(feature = "omicron-build"))] + pub fn add_migration_failure_device( + &mut self, + mig: MigrationFailure, + ) -> Result<&Self, SpecBuilderError> { + if self.component_names.contains(&mig.name) { + return Err(SpecBuilderError::ComponentNameInUse(mig.name)); + } + + if self.spec.migration_failure.is_some() { + return Err(SpecBuilderError::MigrationFailureInjectionInUse); + } + + self.component_names.insert(mig.name.clone()); + self.spec.migration_failure = Some(mig); + Ok(self) + } + #[cfg(feature = "falcon")] pub fn set_softnpu_com4( &mut self, diff --git a/bin/propolis-server/src/lib/spec/mod.rs b/bin/propolis-server/src/lib/spec/mod.rs index c87c5a431..dc133fa02 100644 --- a/bin/propolis-server/src/lib/spec/mod.rs +++ b/bin/propolis-server/src/lib/spec/mod.rs @@ -34,6 +34,9 @@ use propolis_api_types::instance_spec::{ }; use thiserror::Error; +#[cfg(not(feature = "omicron-build"))] +use propolis_api_types::instance_spec::components::devices::MigrationFailureInjector; + #[cfg(feature = "falcon")] use propolis_api_types::instance_spec::components::{ backends::DlpiNetworkBackend, @@ -71,6 +74,9 @@ pub(crate) struct Spec { pub pci_pci_bridges: HashMap, pub pvpanic: Option, + #[cfg(not(feature = "omicron-build"))] + pub migration_failure: Option, + #[cfg(feature = "falcon")] pub softnpu: SoftNpu, } @@ -278,6 +284,13 @@ pub struct QemuPvpanic { pub spec: QemuPvpanicDesc, } +#[cfg(not(feature = "omicron-build"))] +#[derive(Clone, Debug)] +pub struct MigrationFailure { + pub name: String, + pub spec: MigrationFailureInjector, +} + #[cfg(feature = "falcon")] #[derive(Clone, Debug)] pub struct SoftNpuPort { diff --git a/bin/propolis-server/src/lib/vm/ensure.rs b/bin/propolis-server/src/lib/vm/ensure.rs index f3e1cd7dd..41632194e 100644 --- a/bin/propolis-server/src/lib/vm/ensure.rs +++ b/bin/propolis-server/src/lib/vm/ensure.rs @@ -403,7 +403,7 @@ async fn initialize_vm_objects( init.initialize_network_devices(&chipset).await?; #[cfg(not(feature = "omicron-build"))] - init.initialize_test_devices(&options.toml_config.devices); + init.initialize_test_devices(); #[cfg(feature = "omicron-build")] info!(log, "`omicron-build` feature enabled, ignoring any test devices"); diff --git a/crates/propolis-api-types/src/instance_spec/components/devices.rs b/crates/propolis-api-types/src/instance_spec/components/devices.rs index 06ced9844..25505cbe6 100644 --- a/crates/propolis-api-types/src/instance_spec/components/devices.rs +++ b/crates/propolis-api-types/src/instance_spec/components/devices.rs @@ -185,3 +185,18 @@ pub struct P9fs { /// The PCI path at which to attach the guest to this P9 filesystem. pub pci_path: PciPath, } + +/// Describes a synthetic device that registers for VM lifecycle notifications +/// and returns errors during attempts to migrate. +/// +/// This is only supported by Propolis servers compiled without the +/// `omicron-build` feature. +#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)] +#[serde(deny_unknown_fields)] +pub struct MigrationFailureInjector { + /// The number of times this device should fail requests to export state. + pub fail_exports: u32, + + /// The number of times this device should fail requests to import state. + pub fail_imports: u32, +} diff --git a/crates/propolis-api-types/src/instance_spec/v0.rs b/crates/propolis-api-types/src/instance_spec/v0.rs index 634e5fe1d..9c49176d5 100644 --- a/crates/propolis-api-types/src/instance_spec/v0.rs +++ b/crates/propolis-api-types/src/instance_spec/v0.rs @@ -22,6 +22,7 @@ pub enum ComponentV0 { SoftNpuPort(components::devices::SoftNpuPort), SoftNpuP9(components::devices::SoftNpuP9), P9fs(components::devices::P9fs), + MigrationFailureInjector(components::devices::MigrationFailureInjector), CrucibleStorageBackend(components::backends::CrucibleStorageBackend), FileStorageBackend(components::backends::FileStorageBackend), BlobStorageBackend(components::backends::BlobStorageBackend), diff --git a/lib/propolis/src/hw/testdev.rs b/lib/propolis/src/hw/testdev.rs index 53307d890..a845d3271 100644 --- a/lib/propolis/src/hw/testdev.rs +++ b/lib/propolis/src/hw/testdev.rs @@ -35,7 +35,7 @@ pub struct MigrationFailures { struct MigrationFailurePayloadV1 {} impl MigrationFailureDevice { - pub const NAME: &'static str = "test-migration-failure"; + const NAME: &'static str = "test-migration-failure"; pub fn create(log: &slog::Logger, fail: MigrationFailures) -> Arc { let log = diff --git a/openapi/propolis-server.json b/openapi/propolis-server.json index aba9a1c03..1d69ebadf 100644 --- a/openapi/propolis-server.json +++ b/openapi/propolis-server.json @@ -783,6 +783,25 @@ ], "additionalProperties": false }, + { + "type": "object", + "properties": { + "component": { + "$ref": "#/components/schemas/MigrationFailureInjector" + }, + "type": { + "type": "string", + "enum": [ + "MigrationFailureInjector" + ] + } + }, + "required": [ + "component", + "type" + ], + "additionalProperties": false + }, { "type": "object", "properties": { @@ -1571,6 +1590,29 @@ "vcr_json" ] }, + "MigrationFailureInjector": { + "description": "Describes a synthetic device that registers for VM lifecycle notifications and returns errors during attempts to migrate.\n\nThis is only supported by Propolis servers compiled without the `omicron-build` feature.", + "type": "object", + "properties": { + "fail_exports": { + "description": "The number of times this device should fail requests to export state.", + "type": "integer", + "format": "uint32", + "minimum": 0 + }, + "fail_imports": { + "description": "The number of times this device should fail requests to import state.", + "type": "integer", + "format": "uint32", + "minimum": 0 + } + }, + "required": [ + "fail_exports", + "fail_imports" + ], + "additionalProperties": false + }, "MigrationState": { "type": "string", "enum": [ diff --git a/phd-tests/framework/src/test_vm/config.rs b/phd-tests/framework/src/test_vm/config.rs index b5a288dd7..68ddaaa9b 100644 --- a/phd-tests/framework/src/test_vm/config.rs +++ b/phd-tests/framework/src/test_vm/config.rs @@ -2,15 +2,15 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -use std::collections::BTreeMap; use std::sync::Arc; use anyhow::Context; use cpuid_utils::CpuidIdent; use propolis_client::types::{ Board, BootOrderEntry, BootSettings, Chipset, ComponentV0, Cpuid, - CpuidEntry, CpuidVendor, InstanceMetadata, InstanceSpecV0, NvmeDisk, - PciPath, SerialPort, SerialPortNumber, VirtioDisk, + CpuidEntry, CpuidVendor, InstanceMetadata, InstanceSpecV0, + MigrationFailureInjector, NvmeDisk, PciPath, SerialPort, SerialPortNumber, + VirtioDisk, }; use uuid::Uuid; @@ -51,11 +51,9 @@ pub struct VmConfig<'dr> { bootrom_artifact: String, boot_order: Option>, disks: Vec>, - devices: BTreeMap, + migration_failure: Option, } -const MIGRATION_FAILURE_DEVICE: &str = "test-migration-failure"; - impl<'dr> VmConfig<'dr> { pub(crate) fn new( vm_name: &str, @@ -72,7 +70,7 @@ impl<'dr> VmConfig<'dr> { bootrom_artifact: bootrom.to_owned(), boot_order: None, disks: Vec::new(), - devices: BTreeMap::new(), + migration_failure: None, }; config.boot_disk( @@ -111,20 +109,22 @@ impl<'dr> VmConfig<'dr> { } pub fn fail_migration_exports(&mut self, exports: u32) -> &mut Self { - self.devices - .entry(MIGRATION_FAILURE_DEVICE.to_owned()) - .or_insert_with(default_migration_failure_device) - .options - .insert("fail_exports".to_string(), exports.into()); + let injector = + self.migration_failure.get_or_insert(MigrationFailureInjector { + fail_exports: 0, + fail_imports: 0, + }); + injector.fail_exports = exports; self } pub fn fail_migration_imports(&mut self, imports: u32) -> &mut Self { - self.devices - .entry(MIGRATION_FAILURE_DEVICE.to_owned()) - .or_insert_with(default_migration_failure_device) - .options - .insert("fail_imports".to_string(), imports.into()); + let injector = + self.migration_failure.get_or_insert(MigrationFailureInjector { + fail_exports: 0, + fail_imports: 0, + }); + injector.fail_imports = imports; self } @@ -198,18 +198,28 @@ impl<'dr> VmConfig<'dr> { &self, framework: &Framework, ) -> anyhow::Result { + let VmConfig { + vm_name, + cpus, + memory_mib, + cpuid, + bootrom_artifact, + boot_order, + disks, + migration_failure, + } = self; + // Figure out where the bootrom is and generate the serialized contents // of a Propolis server config TOML that points to it. let bootrom = framework .artifact_store - .get_bootrom(&self.bootrom_artifact) + .get_bootrom(bootrom_artifact) .await .context("looking up bootrom artifact")?; let config_toml_contents = toml::ser::to_string(&propolis_server_config::Config { bootrom: bootrom.clone().into(), - devices: self.devices.clone(), ..Default::default() }) .context("serializing Propolis server config")?; @@ -227,20 +237,19 @@ impl<'dr> VmConfig<'dr> { // specific guest OS adapter and avoid the guessing games. So far the // above supports existing tests and makes them "Just Work", but a more // complicated test may want more control here. - let boot_disk = self - .disks + let boot_disk = disks .iter() .find(|d| d.name == "boot-disk") .or_else(|| { - if let Some(boot_order) = self.boot_order.as_ref() { - boot_order.first().and_then(|name| { - self.disks.iter().find(|d| &d.name == name) - }) + if let Some(boot_order) = boot_order.as_ref() { + boot_order + .first() + .and_then(|name| disks.iter().find(|d| &d.name == name)) } else { None } }) - .or_else(|| self.disks.first()) + .or_else(|| disks.first()) .expect("VM config includes at least one disk"); // XXX: assuming all bootable images are equivalent to the first, or at @@ -256,7 +265,7 @@ impl<'dr> VmConfig<'dr> { .context("getting guest OS kind for boot disk")?; let mut disk_handles = Vec::new(); - for disk in self.disks.iter() { + for disk in disks.iter() { disk_handles.push( make_disk(disk.name.to_owned(), framework, disk) .await @@ -274,10 +283,10 @@ impl<'dr> VmConfig<'dr> { let mut spec = InstanceSpecV0 { board: Board { - cpus: self.cpus, - memory_mb: self.memory_mib, + cpus: *cpus, + memory_mb: *memory_mib, chipset: Chipset::default(), - cpuid: self.cpuid.as_ref().map(|entries| Cpuid { + cpuid: cpuid.as_ref().map(|entries| Cpuid { entries: entries.clone(), vendor: match host_vendor { cpuid_utils::CpuidVendor::Amd => CpuidVendor::Amd, @@ -292,7 +301,7 @@ impl<'dr> VmConfig<'dr> { // elements for all of them. This assumes the disk handles were created // in the correct order: boot disk first, then in the data disks' // iteration order. - let all_disks = self.disks.iter().zip(disk_handles.iter()); + let all_disks = disks.iter().zip(disk_handles.iter()); for (req, hdl) in all_disks { let pci_path = PciPath::new(0, req.pci_device_num, 0).unwrap(); let backend_spec = hdl.backend_spec(); @@ -324,7 +333,7 @@ impl<'dr> VmConfig<'dr> { ); assert!(_old.is_none()); - if let Some(boot_order) = self.boot_order.as_ref() { + if let Some(boot_order) = boot_order.as_ref() { let _old = spec.components.insert( "boot-settings".to_string(), ComponentV0::BootSettings(BootSettings { @@ -337,6 +346,14 @@ impl<'dr> VmConfig<'dr> { assert!(_old.is_none()); } + if let Some(mig) = migration_failure.as_ref() { + let _old = spec.components.insert( + "migration-failure".to_string(), + ComponentV0::MigrationFailureInjector(mig.clone()), + ); + assert!(_old.is_none()); + } + // Generate random identifiers for this instance's timeseries metadata. let sled_id = Uuid::new_v4(); let metadata = InstanceMetadata { @@ -349,7 +366,7 @@ impl<'dr> VmConfig<'dr> { }; Ok(VmSpec { - vm_name: self.vm_name.clone(), + vm_name: vm_name.clone(), instance_spec: spec, disk_handles, guest_os_kind, @@ -400,10 +417,3 @@ async fn make_disk<'req>( as Arc, }) } - -fn default_migration_failure_device() -> propolis_server_config::Device { - propolis_server_config::Device { - driver: MIGRATION_FAILURE_DEVICE.to_owned(), - options: Default::default(), - } -}