From 15eaef98089d0ff7418d0f556b338409735c3e87 Mon Sep 17 00:00:00 2001 From: Dan Nicholson Date: Thu, 29 Aug 2024 18:19:30 -0600 Subject: [PATCH] switchroot: Stop making /sysroot mount private Back in 2b8d586c5, /sysroot was changed to be a private mount so that submounts of /var do not propagate back to the stateroot /var. That's laudible, but it makes /sysroot different than every other shared mount in the root namespace. In particular, it means that submounts of /sysroot do not propagate into separate mount namespaces. Rather than make /sysroot private, make /var a slave+shared mount so that it receives mount events from /sysroot but not vice versa. That achieves the same effect of preventing /var submount events from propagating back to /sysroot while allowing /sysroot mount events to propagate forward like every other system mount. See mount_namespaces(7)[1] and the linux shared subtrees[2] documentation for details on slave+shared mount propagation. When /var is mounted in the initramfs, this is accomplished with mount(2) syscalls. When /var is mounted after switching to the real root, the mount propagation flags are applied as options in the generated var.mount unit. This depends on a mount(8) feature that has been present since util-linux 2.23. That's available in RHEL 7 and every non-EOL Debian and Ubuntu release. Applying the propagation from var.mount fixes a small race, too. Previously, if a /var submount was added before /sysroot was made private, it would have propagated back into /sysroot. That was possible since ostree-remount.service orders itself after var.mount but not before any /var submounts. 1. https://man7.org/linux/man-pages/man7/mount_namespaces.7.html 2. https://docs.kernel.org/filesystems/sharedsubtree.html Fixes: #2086 (cherry picked from commit 2973ec591008be94d74d08807079d242b914dcd2 without the test since the kola tests aren't run for Endless) https://phabricator.endlessm.com/T35640 --- src/libostree/ostree-impl-system-generator.c | 11 +++++++++- src/switchroot/ostree-prepare-root.c | 21 ++++++++++---------- src/switchroot/ostree-remount.c | 9 --------- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/libostree/ostree-impl-system-generator.c b/src/libostree/ostree-impl-system-generator.c index 8404d189..4b15200c 100644 --- a/src/libostree/ostree-impl-system-generator.c +++ b/src/libostree/ostree-impl-system-generator.c @@ -216,6 +216,15 @@ fstab_generator (const char *ostree_cmdline, const char *normal_dir, const char * * Note that our unit doesn't run if systemd.volatile is enabled; * see https://github.com/ostreedev/ostree/pull/856 + * + * To avoid having submounts of /var propagate into $stateroot/var, the mount + * is made with slave+shared propagation. This means that /var will receive + * mount events from the parent /sysroot mount, but not vice versa. Adding a + * shared peer group below the slave group means that submounts of /var will + * inherit normal shared propagation. See mount_namespaces(7), Linux + * Documentation/filesystems/sharedsubtree.txt and + * https://github.com/ostreedev/ostree/issues/2086. This also happens in + * ostree-prepare-root.c for the INITRAMFS_MOUNT_VAR case. */ if (!g_output_stream_printf (outstream, &bytes_written, cancellable, error, "##\n# Automatically generated by ostree-system-generator\n##\n\n" @@ -227,7 +236,7 @@ fstab_generator (const char *ostree_cmdline, const char *normal_dir, const char "[Mount]\n" "Where=%s\n" "What=%s\n" - "Options=bind\n", + "Options=bind,slave,shared\n", var_path, stateroot_var_path)) return FALSE; if (!g_output_stream_flush (outstream, cancellable, error)) diff --git a/src/switchroot/ostree-prepare-root.c b/src/switchroot/ostree-prepare-root.c index d41bdfb2..0ab46abf 100644 --- a/src/switchroot/ostree-prepare-root.c +++ b/src/switchroot/ostree-prepare-root.c @@ -492,6 +492,16 @@ main (int argc, char *argv[]) { if (mount ("../../var", TMP_SYSROOT "/var", NULL, MS_BIND | MS_SILENT, NULL) < 0) err (EXIT_FAILURE, "failed to bind mount ../../var to var"); + + /* To avoid having submounts of /var propagate into $stateroot/var, the + * mount is made with slave+shared propagation. See the comment in + * ostree-impl-system-generator.c when /var isn't mounted in the + * initramfs for further explanation. + */ + if (mount (NULL, TMP_SYSROOT "/var", NULL, MS_SLAVE | MS_SILENT, NULL) < 0) + err (EXIT_FAILURE, "failed to change /var to slave mount"); + if (mount (NULL, TMP_SYSROOT "/var", NULL, MS_SHARED | MS_SILENT, NULL) < 0) + err (EXIT_FAILURE, "failed to change /var to slave+shared mount"); } /* We only stamp /run now if we're running in an initramfs, i.e. we're @@ -551,17 +561,6 @@ main (int argc, char *argv[]) } } - /* The /sysroot mount needs to be private to avoid having a mount for e.g. /var/cache - * also propagate to /sysroot/ostree/deploy/$stateroot/var/cache - * - * Now in reality, today this is overridden by systemd: the *actual* way we fix this up - * is in ostree-remount.c. But let's do it here to express the semantics we want - * at the very start (perhaps down the line systemd will have compile/runtime option - * to say that the initramfs environment did everything right from the start). - */ - if (mount ("none", "sysroot", NULL, MS_PRIVATE | MS_SILENT, NULL) < 0) - err (EXIT_FAILURE, "remounting 'sysroot' private"); - if (running_as_pid1) { execl ("/sbin/init", "/sbin/init", NULL); diff --git a/src/switchroot/ostree-remount.c b/src/switchroot/ostree-remount.c index 80f8ad34..e824cd00 100644 --- a/src/switchroot/ostree-remount.c +++ b/src/switchroot/ostree-remount.c @@ -86,15 +86,6 @@ main (int argc, char *argv[]) */ touch_run_ostree (); - /* The /sysroot mount needs to be private to avoid having a mount for e.g. /var/cache - * also propagate to /sysroot/ostree/deploy/$stateroot/var/cache - * - * Today systemd remounts / (recursively) as shared, so we're undoing that as early - * as possible. See also a copy of this in ostree-prepare-root.c. - */ - if (mount ("none", "/sysroot", NULL, MS_REC | MS_PRIVATE, NULL) < 0) - perror ("warning: While remounting /sysroot MS_PRIVATE"); - bool root_is_composefs = false; struct stat stbuf; if (fstatat (AT_FDCWD, _OSTREE_COMPOSEFS_ROOT_STAMP, &stbuf, 0) == 0)