From d091404bf9880995a83aea200a6c8ac3ea1e6546 Mon Sep 17 00:00:00 2001 From: Alessandro Di Stefano Date: Thu, 23 Feb 2023 16:43:26 +0400 Subject: [PATCH] Fixes to onboard x86_64 servers in the baremetal qe infra (#36487) * Support Dell IPMI power commands On Dell servers, `ipmi power (off|on|reset)` returns errors when the host is in a state that doesn't allow the requested transition. Issue two commands (off + on) instead of reset, and ignore any power-off errors so that those state-validation errors are tolerated. * Set the efi boot order after installing RHCOS in UPI/UEFI/PXE scenarios Some servers' firmware pushes any newly detected boot option to the tail of the boot order. When other boot options are present and bootable, such a server will boot from them instead of the new one. As a (temporary?) workaround, we manually add the boot option. NOTE: it's assumed that old OSes' boot options are removed from the boot options list during the wipe operations. xrefs: https://bugzilla.redhat.com/show_bug.cgi?id=1997805 https://github.com/coreos/fedora-coreos-tracker/issues/946 https://github.com/coreos/fedora-coreos-tracker/issues/947 --- .../baremetal-lab-ipi-install-commands.sh | 2 +- .../wipe/baremetal-lab-post-wipe-commands.sh | 16 +++++++++++++++- .../baremetal-lab-pre-console-kargs-commands.sh | 11 +++++++++++ .../baremetal-lab-upi-install-commands.sh | 15 ++++++++++++++- 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/ci-operator/step-registry/baremetal/lab/ipi/install/baremetal-lab-ipi-install-commands.sh b/ci-operator/step-registry/baremetal/lab/ipi/install/baremetal-lab-ipi-install-commands.sh index f1c56c80a80b..26454bc016dc 100644 --- a/ci-operator/step-registry/baremetal/lab/ipi/install/baremetal-lab-ipi-install-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/ipi/install/baremetal-lab-ipi-install-commands.sh @@ -31,7 +31,7 @@ function prepare_bmc() { chassis bootparam set bootflag force_pxe options=PEF,watchdog,reset,power ipmitool -I lanplus -H 
"$bmc_address" \ -U "$bmc_user" -P "$bmc_pass" \ - power off + power off || echo "Already off" } function update_image_registry() { diff --git a/ci-operator/step-registry/baremetal/lab/post/wipe/baremetal-lab-post-wipe-commands.sh b/ci-operator/step-registry/baremetal/lab/post/wipe/baremetal-lab-post-wipe-commands.sh index 11bc6a46a01e..33dfe16be931 100644 --- a/ci-operator/step-registry/baremetal/lab/post/wipe/baremetal-lab-post-wipe-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/post/wipe/baremetal-lab-post-wipe-commands.sh @@ -61,7 +61,21 @@ function reset_host() { chassis bootparam set bootflag force_pxe options=PEF,watchdog,reset,power ipmitool -I lanplus -H "$bmc_address" \ -U "$bmc_user" -P "$bmc_pass" \ - power reset + power off || echo "Already off" + # If the host is not already powered off, the power on command can fail while the host is still powering off. + # Let's retry the power on command multiple times to make sure the command is received in the correct state. + for i in {1..10} max; do + if [ "$i" == "max" ]; then + echo "Failed to reset $bmc_address" + return 1 + fi + ipmitool -I lanplus -H "$bmc_address" \ + -U "$bmc_user" -P "$bmc_pass" \ + power on && break + echo "Failed to power on $bmc_address, retrying..." + sleep 5 + done + if ! 
wait_for_power_down "$bmc_address" "$bmc_user" "$bmc_pass" "${name}"; then echo "$bmc_address" >> /tmp/failed fi diff --git a/ci-operator/step-registry/baremetal/lab/pre/console-kargs/baremetal-lab-pre-console-kargs-commands.sh b/ci-operator/step-registry/baremetal/lab/pre/console-kargs/baremetal-lab-pre-console-kargs-commands.sh index 2458e4d44660..fbbf64d6050d 100644 --- a/ci-operator/step-registry/baremetal/lab/pre/console-kargs/baremetal-lab-pre-console-kargs-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/pre/console-kargs/baremetal-lab-pre-console-kargs-commands.sh @@ -60,6 +60,17 @@ systemd: --delete-karg console=ttyS0,115200n8 $(join_by_semicolon "${console_kargs}" "--append-karg console=" "") \ --ignition-url ${base_url%%*(/)}/${role}.ign \ --insecure-ignition --copy-network + # Some servers' firmware pushes any newly detected boot option to the tail of the boot order. + # When other boot options are present and bootable, such a server will boot from them instead of the new one. + # As a (temporary?) workaround, we manually add the boot option. + # NOTE: it's assumed that old OSes' boot options are removed from the boot options list during the wipe operations. 
+ # xrefs: https://bugzilla.redhat.com/show_bug.cgi?id=1997805 + # https://github.com/coreos/fedora-coreos-tracker/issues/946 + # https://github.com/coreos/fedora-coreos-tracker/issues/947 + ExecStart=/usr/bin/bash -c ' \ + ARCH=\$(uname -m | sed "s/x86_64/x64/;s/aarch64/aa64/"); \ + /usr/sbin/efibootmgr -c -d "$root_device" -p 2 -c -L "Red Hat CoreOS" -l "\\\\EFI\\\\redhat\\\\shim\$ARCH.efi" \ + ' ExecStart=/usr/bin/systemctl --no-block reboot StandardOutput=kmsg+console StandardError=kmsg+console diff --git a/ci-operator/step-registry/baremetal/lab/upi/install/baremetal-lab-upi-install-commands.sh b/ci-operator/step-registry/baremetal/lab/upi/install/baremetal-lab-upi-install-commands.sh index d03a306c8611..4e1a594a90fb 100644 --- a/ci-operator/step-registry/baremetal/lab/upi/install/baremetal-lab-upi-install-commands.sh +++ b/ci-operator/step-registry/baremetal/lab/upi/install/baremetal-lab-upi-install-commands.sh @@ -129,7 +129,20 @@ function reset_host() { chassis bootparam set bootflag force_pxe options=PEF,watchdog,reset,power ipmitool -I lanplus -H "$bmc_address" \ -U "$bmc_user" -P "$bmc_pass" \ - power reset + power off || echo "Already off" + # If the host is not already powered off, the power on command can fail while the host is still powering off. + # Let's retry the power on command multiple times to make sure the command is received in the correct state. + for i in {1..10} max; do + if [ "$i" == "max" ]; then + echo "Failed to reset $bmc_address" + return 1 + fi + ipmitool -I lanplus -H "$bmc_address" \ + -U "$bmc_user" -P "$bmc_pass" \ + power on && break + echo "Failed to power on $bmc_address, retrying..." + sleep 5 + done } function approve_csrs() {