From 9473d7e79dfe21b8d92dec1c0fa03de23379c4c4 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 24 Aug 2023 13:09:05 -0400 Subject: [PATCH] transposefs: Only autosave-xfs for much larger filesystems The change in https://github.com/coreos/fedora-coreos-config/pull/2320 has been very problematic for OpenShift because our default node configuration is *always* over the threshold, and that causes significant latency on instance provisioning. Experimentally bumping to 400 allocation groups, which is about 700GiB. This is comfortably above the default OpenShift node root disk sizes, and returns us to the prior status quo. While we're here, rework the logging a bit so that we *always* log the `agcount` for debugging purposes. Also: - Only log to stdout for normal conditions - Include the name of the systemd unit in the test description so we can cross-reference - tests: Hoist the expected agcount of 4 to a common variable --- .../ignition-ostree-transposefs.sh | 21 ++++++++++++------- .../root-reprovision/autosave-xfs/test.sh | 12 +++++++---- .../luks/autosave-xfs/test.sh | 8 ++++--- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/overlay.d/05core/usr/lib/dracut/modules.d/40ignition-ostree/ignition-ostree-transposefs.sh b/overlay.d/05core/usr/lib/dracut/modules.d/40ignition-ostree/ignition-ostree-transposefs.sh index 6f89c4007e..62fa977043 100755 --- a/overlay.d/05core/usr/lib/dracut/modules.d/40ignition-ostree/ignition-ostree-transposefs.sh +++ b/overlay.d/05core/usr/lib/dracut/modules.d/40ignition-ostree/ignition-ostree-transposefs.sh @@ -139,17 +139,24 @@ should_autosave_rootfs() { return fi local agcount + # This runs xfs_info on the unmounted filesystem, because mounting an + # XFS filesystem that has grown an excessive number of allocation groups + # can be very slow. 
eval $(xfs_info "${root_part}" | grep -o 'agcount=[0-9]*') - # Semi-arbitrarily chosen: this is roughly ~64G currently (based on initial - # ag sizing at build time) which seems like a good rootfs size at which to - # discriminate between "throwaway/short-lived systems" and "long-running - # workload systems". It's not like XFS performance is way worse at 128. - if [ "$agcount" -lt 128 ]; then - echo "Filesystem agcount is $agcount; skipping" >&2 + # This is roughly ~700GiB currently (based on initial ag sizing at build time) + # which ensures we grow only on "large" root filesystems. + # Specifically for e.g. OpenShift, this ensures we don't reprovision on default + # worker node root filesystems. + local threshold + threshold=400 + if [ "$agcount" -lt "${threshold}" ]; then + echo "autosave-xfs: ${root_part} agcount=$agcount is lower than threshold=${threshold}" >&2 echo 0 return + else + echo "autosave-xfs: ${root_part} agcount=$agcount meets threshold=${threshold}" >&2 + echo 1 fi - echo 1 } ensure_zram_dev() { diff --git a/tests/kola/root-reprovision/autosave-xfs/test.sh b/tests/kola/root-reprovision/autosave-xfs/test.sh index 5f88d60482..4ac5c12afc 100755 --- a/tests/kola/root-reprovision/autosave-xfs/test.sh +++ b/tests/kola/root-reprovision/autosave-xfs/test.sh @@ -2,8 +2,8 @@ ## kola: ## # This test reprovisions the rootfs automatically. ## tags: "platform-independent reprovision" -## # Trigger automatic XFS reprovisioning -## minDisk: 100 +## # Trigger automatic XFS reprovisioning (heuristic) +## minDisk: 1000 ## # Root reprovisioning requires at least 4GiB of memory. ## minMemory: 4096 ## # This test includes a lot of disk I/O and needs a higher @@ -11,6 +11,7 @@ ## timeoutMin: 15 ## description: Verify the root reprovision with XFS ## on large disk triggers autosaved. +## This test is meant to cover ignition-ostree-transposefs-autosave-xfs.service set -xeuo pipefail @@ -20,10 +21,13 @@ set -xeuo pipefail if [ ! 
-f /run/ignition-ostree-autosaved-xfs.stamp ]; then fatal "expected autosaved XFS" fi +# Verify we printed something about the agcount +journalctl -u ignition-ostree-transposefs-autosave-xfs.service --grep=agcount ok "autosaved XFS on large disk" eval $(xfs_info / | grep -o 'agcount=[0-9]*') -if [ "$agcount" -gt 4 ]; then - fatal "expected agcount of at most 4, got ${agcount}" +expected=4 +if [ "$agcount" -gt "$expected" ]; then + fatal "expected agcount of at most ${expected}, got ${agcount}" fi ok "low agcount on large disk" diff --git a/tests/kola/root-reprovision/luks/autosave-xfs/test.sh b/tests/kola/root-reprovision/luks/autosave-xfs/test.sh index a3cd80e497..e238a4acd9 100755 --- a/tests/kola/root-reprovision/luks/autosave-xfs/test.sh +++ b/tests/kola/root-reprovision/luks/autosave-xfs/test.sh @@ -10,9 +10,10 @@ ## # timeout value than the default. ## timeoutMin: 15 ## # Trigger automatic XFS reprovisioning -## minDisk: 100 +## minDisk: 1000 ## description: Verify the root reprovision with XFS and TPM ## on large disk triggers autosaved. +## This test is meant to cover ignition-ostree-transposefs-autosave-xfs.service set -xeuo pipefail @@ -27,8 +28,9 @@ if [ -z "${AUTOPKGTEST_REBOOT_MARK:-}" ]; then ok "autosaved XFS on large disk" eval $(xfs_info / | grep -o 'agcount=[0-9]*') - if [ "$agcount" -gt 4 ]; then - fatal "expected agcount of at most 4, got ${agcount}" + expected=4 + if [ "$agcount" -gt "${expected}" ]; then + fatal "expected agcount of at most ${expected}, got ${agcount}" fi ok "low agcount on large disk" fi