From 81e43ae1d2c237131985eebd1fa5f541daf0f316 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Tue, 12 Sep 2023 16:49:01 -0400 Subject: [PATCH 1/2] platform/qemu: strengthen data block device checks On ppc64le and s390x, the Ignition config is passed as a block device. In the `RemovePrimaryBlockDevice()` code, we need to make sure we don't inadvertently select the `ignition` block device as the new boot device. More generally, the reliance on the `virtio-backend` string is weak. Instead, use a more specific device name for the real block devices when preparing the QEMU arguments, and filter on that. Partially fixes: https://github.com/coreos/coreos-assembler/issues/2725 Partially fixes: https://github.com/coreos/coreos-assembler/issues/3360 --- mantle/platform/qemu.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mantle/platform/qemu.go b/mantle/platform/qemu.go index 8ac84f13d1..712fcd26d9 100644 --- a/mantle/platform/qemu.go +++ b/mantle/platform/qemu.go @@ -383,8 +383,8 @@ func (inst *QemuInstance) SwitchBootOrder() (err2 error) { } // RemovePrimaryBlockDevice deletes the primary device from a qemu instance -// and sets the seconday device as primary. It expects that all block devices -// are mirrors. +// and sets the secondary device as primary. It expects that all block devices +// with device name disk- are mirrors. func (inst *QemuInstance) RemovePrimaryBlockDevice() (err2 error) { var primaryDevice string var secondaryDevicePath string @@ -397,7 +397,7 @@ func (inst *QemuInstance) RemovePrimaryBlockDevice() (err2 error) { // a `BackingFileDepth` parameter of a device and check if // it is a removable and part of `virtio-blk-pci` devices. for _, dev := range blkdevs.Return { - if !dev.Removable && strings.Contains(dev.DevicePath, "virtio-backend") { + if !dev.Removable && strings.HasPrefix(dev.Device, "disk-") { if dev.Inserted.BackingFileDepth == 1 { primaryDevice = dev.DevicePath } else { @@ -1131,7 +1131,7 @@ func (builder *QemuBuilder) addDiskImpl(disk *Disk, primary bool) error { opts = "," + strings.Join(diskOpts, ",") } - id := fmt.Sprintf("d%d", builder.diskID) + id := fmt.Sprintf("disk-%d", builder.diskID) // Avoid file locking detection, and the disks we create // here are always currently ephemeral. From b32fbf20c735b8a118da8a65857f4f5ba405b916 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Tue, 12 Sep 2023 16:51:30 -0400 Subject: [PATCH 2/2] tests/boot-mirror: bump memory request to 8G on ppc64le and aarch64 All our other root reprovisioning tests double the memory request on ppc64le and aarch64 due to the larger page size. Do this for the boot mirroring tests too and increase the memory request to 8G. With the current 4G, the tests would sometimes panic during the reboot right after the primary block device detach. Even with 8G, the panic still happens, albeit much more rarely. Rather than bumping the memory even more, I've found that sleeping a bit before rebooting does the trick. Partially fixes: https://github.com/coreos/coreos-assembler/issues/2725 Partially fixes: https://github.com/coreos/coreos-assembler/issues/3360 --- mantle/kola/harness.go | 1 + mantle/kola/tests/ignition/luks.go | 2 +- mantle/kola/tests/misc/boot-mirror.go | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/mantle/kola/harness.go b/mantle/kola/harness.go index 53acf82051..cf136099e4 100644 --- a/mantle/kola/harness.go +++ b/mantle/kola/harness.go @@ -1205,6 +1205,7 @@ ExecStart=%s // Architectures using 64k pages use slightly more memory, ask for more than requested // to make sure that we don't run out of it. Currently ppc64le and aarch64 use 64k pages. + // See similar logic in boot-mirror.go and luks.go. switch coreosarch.CurrentRpmArch() { case "ppc64le", "aarch64": if targetMeta.MinMemory <= 4096 { diff --git a/mantle/kola/tests/ignition/luks.go b/mantle/kola/tests/ignition/luks.go index 5a5b3b75d0..f256ada921 100644 --- a/mantle/kola/tests/ignition/luks.go +++ b/mantle/kola/tests/ignition/luks.go @@ -164,7 +164,7 @@ func runTest(c cluster.TestCluster, tpm2 bool, threshold int, killTangAfterFirst opts := platform.MachineOptions{ MinMemory: 4096, } - // ppc64le and aarch64 use 64K pages + // ppc64le and aarch64 use 64K pages; see similar logic in harness.go and boot-mirror.go switch coreosarch.CurrentRpmArch() { case "ppc64le", "aarch64": opts.MinMemory = 8192 diff --git a/mantle/kola/tests/misc/boot-mirror.go b/mantle/kola/tests/misc/boot-mirror.go index cb48c07d57..5e9e695396 100644 --- a/mantle/kola/tests/misc/boot-mirror.go +++ b/mantle/kola/tests/misc/boot-mirror.go @@ -100,6 +100,11 @@ func runBootMirrorTest(c cluster.TestCluster) { MinMemory: 4096, }, } + // ppc64le and aarch64 use 64K pages; see similar logic in harness.go and luks.go + switch coreosarch.CurrentRpmArch() { + case "ppc64le", "aarch64": + options.MinMemory = 8192 + } // FIXME: for QEMU tests kola currently assumes the host CPU architecture // matches the one under test userdata := bootmirror.Subst("LAYOUT", coreosarch.CurrentRpmArch()) @@ -147,6 +152,11 @@ func runBootMirrorLUKSTest(c cluster.TestCluster) { MinMemory: 4096, }, } + // ppc64le and aarch64 use 64K pages; see similar logic in harness.go and luks.go + switch coreosarch.CurrentRpmArch() { + case "ppc64le", "aarch64": + options.MinMemory = 8192 + } // FIXME: for QEMU tests kola currently assumes the host CPU architecture // matches the one under test userdata := bootmirrorluks.Subst("LAYOUT", coreosarch.CurrentRpmArch()) @@ -230,6 +240,13 @@ func detachPrimaryBlockDevice(c cluster.TestCluster, m platform.Machine) { }); err != nil { c.Fatalf("Failed to retrieve boot ID: %v", err) } + + // Give some time to the host before doing the reboot. Without it, we've noticed + // that rebooting too quickly after ripping out the primary device can trigger + // a kernel panic on ppc64le. This may be memory-related since the same panic + // happens more easily if memory is lowered to 4G. + time.Sleep(30 * time.Second) + err := m.Reboot() if err != nil { c.Fatalf("Failed to reboot the machine: %v", err)