Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

40ignition-ostree: add autosave-xfs transposefs unit #2320

Merged
merged 7 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ DefaultDependencies=false
ConditionKernelCommandLine=ostree
ConditionPathExists=!/run/ostree-live
Before=initrd-root-fs.target
After=sysroot.mount ignition-ostree-mount-firstboot-sysroot.service
# This shouldn't be strictly necessary, but it's cleaner to not have OSTree muck
# around with moving mounts while we're still resizing the filesystem.
Before=ostree-prepare-root.service
Before=sysroot.mount ignition-ostree-mount-firstboot-sysroot.service
After=ignition-ostree-uuid-root.service

[Service]
Type=oneshot
ExecStart=/usr/sbin/ignition-ostree-growfs
RemainAfterExit=yes
# So we can transiently mount sysroot
MountFlags=slave
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,42 @@ set -euo pipefail
# partition, unless it determines that either the rootfs was moved or the
# partition was already resized (e.g. via Ignition).

# This is copied from ignition-ostree-transposefs.sh.
# Sometimes, for some reason the by-label symlinks aren't updated. Detect these
# cases, and explicitly `udevadm trigger`.
# See: https://bugzilla.redhat.com/show_bug.cgi?id=1908780
udev_trigger_on_label_mismatch() {
local label=$1; shift
local expected_dev=$1; shift
local actual_dev
expected_dev=$(realpath "${expected_dev}")
# We `|| :` here because sometimes /dev/disk/by-label/$label is missing.
# We've seen this on Fedora kernels with debug enabled (common in `rawhide`).
# See https://github.com/coreos/fedora-coreos-tracker/issues/1092
actual_dev=$(realpath "/dev/disk/by-label/$label" || :)
if [ "$actual_dev" != "$expected_dev" ]; then
echo "Expected /dev/disk/by-label/$label to point to $expected_dev, but points to $actual_dev; triggering udev"
udevadm trigger --settle "$expected_dev"
fi
}

# This is also similar to bits from transposefs.sh.
ignition_cfg=/run/ignition.json
expected_dev=$(jq -r '.storage?.filesystems? // [] | map(select(.label == "root")) | .[0].device // ""' "${ignition_cfg}")
jlebon marked this conversation as resolved.
Show resolved Hide resolved
if [ -n "${expected_dev}" ]; then
udev_trigger_on_label_mismatch root "${expected_dev}"
fi

# If root reprovisioning was triggered, this file contains state of the root
# partition *before* ignition-disks.
saved_partstate=/run/ignition-ostree-rootfs-partstate.sh

# We run after the rootfs is mounted at /sysroot, but before ostree-prepare-root
# moves it to /sysroot/sysroot.
# We run before the rootfs is mounted at /sysroot, but we still need to mount it
# (in a private namespace) since XFS and Btrfs can only do resizing online (EXT4
# can do either).
path=/sysroot

# The use of tail is to avoid errors from duplicate mounts;
# this shouldn't happen for us but we're being conservative.
src=$(findmnt -nvr -o SOURCE "$path" | tail -n1)
src=/dev/disk/by-label/root
mount "${src}" "${path}"

# In the IBM Secure Execution case we use Ignition to grow and reencrypt rootfs
# see overlay.d/05core/usr/lib/dracut/modules.d/35coreos-ignition/coreos-diskful-generator
Expand Down Expand Up @@ -122,5 +147,6 @@ case "${ROOTFS_TYPE}" in
btrfs) btrfs filesystem resize max ${path} ;;
esac

# this is useful for tests
# The ignition-ostree-transposefs-xfsauto.service unit needs to know if we
# actually run. This is also useful for tests.
touch /run/ignition-ostree-growfs.stamp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[Unit]
Description=Ignition OSTree: Autosave XFS Rootfs Partition
DefaultDependencies=false
After=ignition-disks.service
# Avoid racing with UUID regeneration
After=ignition-ostree-uuid-root.service
After=ignition-ostree-growfs.service
Before=ignition-ostree-transposefs-restore.service
OnFailure=emergency.target
OnFailureJobMode=isolate

ConditionKernelCommandLine=ostree
# only run if ignition-ostree-growfs ran since that's when pathological cases occur
ConditionPathExists=/run/ignition-ostree-growfs.stamp

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/libexec/ignition-ostree-transposefs autosave-xfs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ DefaultDependencies=false
After=ignition-disks.service
# Avoid racing with UUID regeneration
After=ignition-ostree-uuid-root.service
Before=ignition-ostree-growfs.service
After=ignition-ostree-growfs.service
Before=ignition-ostree-mount-firstboot-sysroot.service
OnFailure=emergency.target
OnFailureJobMode=isolate
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ mount_verbose() {
mount -o "${mode}" "${srcdev}" "${destdir}"
}

# A copy of this exists in ignition-ostree-growfs.sh.
# Sometimes, for some reason the by-label symlinks aren't updated. Detect these
# cases, and explicitly `udevadm trigger`.
# See: https://bugzilla.redhat.com/show_bug.cgi?id=1908780
Expand Down Expand Up @@ -95,8 +96,12 @@ mount_and_restore_filesystem_by_label() {
local mountpoint=$1; shift
local saved_fs=$1; shift
local new_dev
new_dev=$(jq -r "$(query_fslabel "${label}") | .[0].device" "${ignition_cfg}")
udev_trigger_on_label_mismatch "${label}" "${new_dev}"
new_dev=$(jq -r "$(query_fslabel "${label}") | .[0].device // \"\"" "${ignition_cfg}")
# in the autosave-xfs path, it's not driven by the Ignition config so we
# don't expect a new device there
if [ -n "${new_dev}" ]; then
udev_trigger_on_label_mismatch "${label}" "${new_dev}"
fi
mount_verbose "/dev/disk/by-label/${label}" "${mountpoint}" rw
find "${saved_fs}" -mindepth 1 -maxdepth 1 -exec mv -t "${mountpoint}" {} +
}
Expand Down Expand Up @@ -124,6 +129,64 @@ mount_and_save_filesystem_by_label() {
fi
}

# This implements https://github.com/coreos/fedora-coreos-tracker/issues/1183.
should_autosave_rootfs() {
local fstype
fstype=$(lsblk -no FSTYPE "${root_part}")
if [ "$fstype" != xfs ]; then
echo "Filesystem is not XFS (found $fstype); skipping" >&2
echo 0
return
fi
local agcount
eval $(xfs_info "${root_part}" | grep -o 'agcount=[0-9]*')
# Semi-arbitrarily chosen: this is roughly ~64G currently (based on initial
# ag sizing at build time) which seems like a good rootfs size at which to
# discriminate between "throwaway/short-lived systems" and "long-running
# workload systems". It's not like XFS performance is way worse at 128.
if [ "$agcount" -lt 128 ]; then
Comment on lines +143 to +147
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sandeen, ended up going with 128 as a threshold for this.

echo "Filesystem agcount is $agcount; skipping" >&2
echo 0
return
fi
echo 1
}

ensure_zram_dev() {
if test -d "${saved_data}"; then
return 0
fi
mem_available=$(grep MemAvailable /proc/meminfo | awk '{print $2}')
# Just error out early if we don't even have 1G to work with. This
# commonly happens if you `cosa run` but forget to add `--memory`. That
# way you get a nicer error instead of the spew of EIO errors from `cp`.
# The amount we need is really dependent on a bunch of factors, but just
# ballpark it at 3G.
if [ "${mem_available}" -lt $((1*1024*1024)) ] && [ "${wipes_root}" != 0 ]; then
echo "Root reprovisioning requires at least 3G of RAM" >&2
exit 1
fi
modprobe zram num_devices=0
read dev < /sys/class/zram-control/hot_add
# disksize is set arbitrarily large, as zram is capped by mem_limit
echo 10G > /sys/block/zram"${dev}"/disksize
# Limit zram to 90% of available RAM: we want to be greedy since the
# boot breaks anyway, but we still want to leave room for everything
# else so it hits ENOSPC and doesn't invoke the OOM killer
echo $(( mem_available * 90 / 100 ))K > /sys/block/zram"${dev}"/mem_limit
mkfs.xfs -q /dev/zram"${dev}"
mkdir "${saved_data}"
mount /dev/zram"${dev}" "${saved_data}"
# save the zram device number created for when called to cleanup
echo "${dev}" > "${zram_dev}"
}

print_zram_mm_stat() {
echo "zram usage:"
read dev < "${zram_dev}"
cat /sys/block/zram"${dev}"/mm_stat
}

# In Secure Execution case user is not allowed to modify partition table
check_and_set_secex_config() {
if [[ -f /run/coreos/secure-execution ]]; then
Expand Down Expand Up @@ -162,29 +225,8 @@ case "${1:-}" in
echo "Found duplicate or missing ESP, BIOS-BOOT, or PReP labels in config" >&2
exit 1
fi
mem_available=$(grep MemAvailable /proc/meminfo | awk '{print $2}')
# Just error out early if we don't even have 1G to work with. This
# commonly happens if you `cosa run` but forget to add `--memory`. That
# way you get a nicer error instead of the spew of EIO errors from `cp`.
# The amount we need is really dependent on a bunch of factors, but just
# ballpark it at 3G.
if [ "${mem_available}" -lt $((1*1024*1024)) ] && [ "${wipes_root}" != 0 ]; then
echo "Root reprovisioning requires at least 3G of RAM" >&2
exit 1
fi
modprobe zram num_devices=0
read dev < /sys/class/zram-control/hot_add
# disksize is set arbitrarily large, as zram is capped by mem_limit
echo 10G > /sys/block/zram"${dev}"/disksize
# Limit zram to 90% of available RAM: we want to be greedy since the
# boot breaks anyway, but we still want to leave room for everything
# else so it hits ENOSPC and doesn't invoke the OOM killer
echo $(( mem_available * 90 / 100 ))K > /sys/block/zram"${dev}"/mem_limit
mkfs.xfs -q /dev/zram"${dev}"
mkdir "${saved_data}"
mount /dev/zram"${dev}" "${saved_data}"
# save the zram device number created for when called to cleanup
echo "${dev}" > "${zram_dev}"

ensure_zram_dev

if [ "${wipes_root}" != "0" ]; then
mkdir "${saved_root}"
Expand All @@ -202,6 +244,23 @@ case "${1:-}" in
mkdir "${saved_prep}"
fi
;;
autosave-xfs)
should_autosave=$(should_autosave_rootfs)
Copy link
Member

@dustymabe dustymabe Apr 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so (by design) now the script will exit here if an unexpected error occurs (because of set -euo pipefail) and that is how we'll handle the cases we weren't handling before?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, exactly. It's no longer part of an if-statement, so the errexit should be respected.

if [ "${should_autosave}" = "1" ]; then
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if [ "${should_autosave}" = "1" ]; then
if [ "${should_autosave}" == "1" ]; then

It always throws me off that a single = works.

You don't need to change this, I'm just calling it out.

wipes_root=1
ensure_zram_dev
# in the in-place reprovisioning case, the rootfs was already saved
if [ ! -d "${saved_root}" ]; then
mkdir "${saved_root}"
echo "Moving rootfs to RAM..."
mount_and_save_filesystem_by_label root "${saved_root}"
print_zram_mm_stat
fi
mkfs.xfs "${root_part}" -L root -f
# for tests
touch /run/ignition-ostree-autosaved-xfs.stamp
fi
;;
save)
# Mounts happen in a private mount namespace since we're not "offically" mounting
if [ -d "${saved_root}" ]; then
Expand Down Expand Up @@ -233,9 +292,7 @@ case "${1:-}" in
echo "Moving PReP partition to RAM..."
cat "${prep_part}" > "${saved_prep}/partition"
fi
echo "zram usage:"
read dev < "${zram_dev}"
cat /sys/block/zram"${dev}"/mm_stat
print_zram_mm_stat
;;
restore)
# Mounts happen in a private mount namespace since we're not "offically" mounting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ install() {
systemd-sysusers \
systemd-tmpfiles \
sort \
xfs_info \
xfs_spaceman \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧑‍🚀

uniq

if [[ $(uname -m) = s390x ]]; then
Expand Down Expand Up @@ -81,7 +83,7 @@ install() {

inst_multiple jq chattr
inst_script "$moddir/ignition-ostree-transposefs.sh" "/usr/libexec/ignition-ostree-transposefs"
for x in detect save restore; do
for x in detect save autosave-xfs restore; do
install_ignition_unit ignition-ostree-transposefs-${x}.service
done

Expand Down
20 changes: 20 additions & 0 deletions tests/kola/disks/growfs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
## kola:
## exclusive: false

jlebon marked this conversation as resolved.
Show resolved Hide resolved
# This test verifies that the rootfs is automatically grown on first boot in the
# default case and that the autosave-xfs logic didn't kick in.

set -xeuo pipefail

. $KOLA_EXT_DATA/commonlib.sh

if [ ! -f /run/ignition-ostree-growfs.stamp ]; then
fatal "rootfs was not grown on first boot"
fi
ok "rootfs grown on first boot"

if [ -f /run/ignition-ostree-autosaved-xfs.stamp ]; then
fatal "unexpected autosaved XFS"
fi
ok "rootfs wasn't automatically reprovisioned"
1 change: 1 addition & 0 deletions tests/kola/root-reprovision/autosave-xfs/data/commonlib.sh
26 changes: 26 additions & 0 deletions tests/kola/root-reprovision/autosave-xfs/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
## kola:
## # This test reprovisions the rootfs automatically.
## tags: "platform-independent reprovision"
## # Trigger automatic XFS reprovisioning
## minDisk: 64
jlebon marked this conversation as resolved.
Show resolved Hide resolved
## # Root reprovisioning requires at least 4GiB of memory.
## minMemory: 4096
## # This test includes a lot of disk I/O and needs a higher
## # timeout value than the default.
## timeoutMin: 15

set -xeuo pipefail

. $KOLA_EXT_DATA/commonlib.sh

if [ ! -f /run/ignition-ostree-autosaved-xfs.stamp ]; then
fatal "expected autosaved XFS"
fi
ok "autosaved XFS on large disk"

eval $(xfs_info / | grep -o 'agcount=[0-9]*')
if [ "$agcount" -gt 4 ]; then
fatal "expected agcount of at most 4, got ${agcount}"
fi
ok "low agcount on large disk"
5 changes: 5 additions & 0 deletions tests/kola/root-reprovision/linear/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ case "${AUTOPKGTEST_REBOOT_MARK:-}" in
fatal "ignition-ostree-growfs ran"
fi

# check that autosave-xfs didn't run
if [ -e /run/ignition-ostree-autosaved-xfs.stamp ]; then
fatal "unexpected autosaved XFS"
fi

# reboot once to sanity-check we can find root on second boot
/tmp/autopkgtest-reboot rebooted
;;
Expand Down
1 change: 1 addition & 0 deletions tests/kola/root-reprovision/luks/autosave-xfs/config.ign
1 change: 1 addition & 0 deletions tests/kola/root-reprovision/luks/autosave-xfs/data
34 changes: 34 additions & 0 deletions tests/kola/root-reprovision/luks/autosave-xfs/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
## kola:
## # This test reprovisions the rootfs.
## tags: "platform-independent reprovision"
## # Root reprovisioning requires at least 4GiB of memory.
## minMemory: 4096
## # A TPM backend device is not available on s390x to suport TPM.
## architectures: "!s390x"
## # This test includes a lot of disk I/O and needs a higher
## # timeout value than the default.
## timeoutMin: 15
## # Trigger automatic XFS reprovisioning
## minDisk: 64

set -xeuo pipefail

. $KOLA_EXT_DATA/commonlib.sh

# check that we ran automatic XFS reprovisioning
if [ -z "${AUTOPKGTEST_REBOOT_MARK:-}" ]; then
if [ ! -f /run/ignition-ostree-autosaved-xfs.stamp ]; then
fatal "expected autosaved XFS"
fi
ok "autosaved XFS on large disk"

eval $(xfs_info / | grep -o 'agcount=[0-9]*')
if [ "$agcount" -gt 4 ]; then
fatal "expected agcount of at most 4, got ${agcount}"
fi
ok "low agcount on large disk"
fi

# run the rest of the tests
. $KOLA_EXT_DATA/luks-test.sh
Loading