diff --git a/crates/runc-shim/src/runc.rs b/crates/runc-shim/src/runc.rs index 92bb0f23..abc9dd91 100644 --- a/crates/runc-shim/src/runc.rs +++ b/crates/runc-shim/src/runc.rs @@ -34,6 +34,7 @@ use containerd_shim::{ asynchronous::monitor::{monitor_subscribe, monitor_unsubscribe, Subscription}, io_error, monitor::{ExitEvent, Subject, Topic}, + mount::umount_recursive, other, other_error, protos::{ api::ProcessInfo, @@ -299,6 +300,7 @@ impl ProcessLifecycle for RuncInitLifecycle { ); } } + umount_recursive(Path::new(&self.bundle).join("rootfs").to_str(), 0)?; self.exit_signal.signal(); Ok(()) } diff --git a/crates/runc-shim/src/service.rs b/crates/runc-shim/src/service.rs index 0dda16a4..6e8703a6 100644 --- a/crates/runc-shim/src/service.rs +++ b/crates/runc-shim/src/service.rs @@ -27,6 +27,7 @@ use containerd_shim::{ event::Event, io_error, monitor::{Subject, Topic}, + mount::umount_recursive, protos::{events::task::TaskExit, protobuf::MessageDyn}, util::{ convert_to_timestamp, read_options, read_runtime, read_spec, timestamp, write_str_to_file, @@ -120,6 +121,7 @@ impl Shim for Service { runc.delete(&self.id, Some(&DeleteOpts { force: true })) .await .unwrap_or_else(|e| warn!("failed to remove runc container: {}", e)); + umount_recursive(bundle.join("rootfs").to_str(), 0)?; let mut resp = DeleteResponse::new(); // sigkill resp.set_exit_status(137); diff --git a/crates/shim/src/mount.rs b/crates/shim/src/mount.rs index 6e3882be..969f5dfc 100644 --- a/crates/shim/src/mount.rs +++ b/crates/shim/src/mount.rs @@ -19,6 +19,8 @@ use std::{ collections::HashMap, env, + fs::File, + io::{BufRead, BufReader}, ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, Not}, path::Path, }; @@ -26,7 +28,7 @@ use std::{ use lazy_static::lazy_static; use log::error; #[cfg(target_os = "linux")] -use nix::mount::{mount, MsFlags}; +use nix::mount::{mount, MntFlags, MsFlags}; #[cfg(target_os = "linux")] use nix::sched::{unshare, CloneFlags}; #[cfg(target_os = "linux")] @@ -615,6 +617,179 @@ pub fn mount_rootfs( Err(Error::Unimplemented("start".to_string())) } +#[derive(Debug, Default, Clone)] +struct MountInfo { + /// id is a unique identifier of the mount (may be reused after umount). + pub id: u32, + /// parent is the ID of the parent mount (or of self for the root + /// of this mount namespace's mount tree). + pub parent: u32, + /// major and minor are the major and the minor components of the Dev + /// field of unix.Stat_t structure returned by unix.*Stat calls for + /// files on this filesystem. + pub major: u32, + pub minor: u32, + /// root is the pathname of the directory in the filesystem which forms + /// the root of this mount. + pub root: String, + /// mountpoint is the pathname of the mount point relative to the + /// process's root directory. + pub mountpoint: String, + /// options is a comma-separated list of mount options. + pub options: String, + /// optional are zero or more fields of the form "tag[:value]", + /// separated by a space. Currently, the possible optional fields are + /// "shared", "master", "propagate_from", and "unbindable". For more + /// information, see mount_namespaces(7) Linux man page. + pub optional: String, + /// fs_type is the filesystem type in the form "type[.subtype]". + pub fs_type: String, + /// source is filesystem-specific information, or "none". + pub source: String, + /// vfs_options is a comma-separated list of superblock options. + pub vfs_options: String, +} + +#[cfg(target_os = "linux")] +pub fn umount_recursive(target: Option<&str>, flags: i32) -> Result<()> { + if let Some(target) = target { + let mut mounts = get_mounts(Some(prefix_filter(target.to_string()))); + mounts.sort_by(|a, b| b.mountpoint.len().cmp(&a.mountpoint.len())); + for (index, target) in mounts.iter().enumerate() { + umount_all(Some(target.clone().mountpoint), flags)?; + } + }; + Ok(()) +} + +#[cfg(not(target_os = "linux"))] +pub fn umount_recursive(target: Option<&str>, flags: i32) -> Result<()> { + Ok(()) +} + +#[cfg(target_os = "linux")] +pub fn umount_all(target: Option, flags: i32) -> Result<()> { + if let Some(target) = target { + if let Err(e) = std::fs::metadata(target.clone()) { + if e.kind() == std::io::ErrorKind::NotFound { + return Ok(()); + } + } + loop { + if let Err(e) = nix::mount::umount2( + &std::path::PathBuf::from(&target), + MntFlags::from_bits(flags).unwrap(), + ) { + if e == nix::errno::Errno::EINVAL { + return Ok(()); + } + return Err(Error::from(e)); + } + } + }; + Ok(()) +} + +#[cfg(target_os = "linux")] +fn prefix_filter(prefix: String) -> impl Fn(MountInfo) -> bool { + move |m: MountInfo| { + if let Some(s) = (m.mountpoint.clone() + "/").strip_prefix(&(prefix.clone() + "/")) { + return false; + } + return true; + } +} + +#[cfg(target_os = "linux")] +fn get_mounts(f: Option) -> Vec +where + F: Fn(MountInfo) -> bool, +{ + let file = File::open("/proc/self/mountinfo").expect("Failed to open /proc/self/mountinfo"); + let reader = BufReader::new(file); + + let lines: Vec = reader.lines().filter_map(|line| line.ok()).collect(); + let mount_points = lines + .into_iter() + .filter_map(|line| { + /* + See http://man7.org/linux/man-pages/man5/proc.5.html + + 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + (1) mount ID: unique identifier of the mount (may be reused after umount) + (2) parent ID: ID of parent (or of self for the top of the mount tree) + (3) major:minor: value of st_dev for files on filesystem + (4) root: root of the mount within the filesystem + (5) mount point: mount point relative to the process's root + (6) mount options: per mount options + (7) optional fields: zero or more fields of the form "tag[:value]" + (8) separator: marks the end of the optional fields + (9) filesystem type: name of filesystem of the form "type[.subtype]" + (10) mount source: filesystem specific information or "none" + (11) super options: per super block options + + In other words, we have: + * 6 mandatory fields (1)..(6) + * 0 or more optional fields (7) + * a separator field (8) + * 3 mandatory fields (9)..(11) + */ + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() < 10 { + // mountpoint parse error. + return None; + } + // separator field + let mut sep_idx = parts.len() - 4; + // In Linux <= 3.9 mounting a cifs with spaces in a share + // name (like "//srv/My Docs") _may_ end up having a space + // in the last field of mountinfo (like "unc=//serv/My Docs"). + // Since kernel 3.10-rc1, cifs option "unc=" is ignored, + // so spaces should not appear. + // + // Check for a separator, and work around the spaces bug + for i in (0..sep_idx).rev() { + if parts[i] == "-" { + sep_idx = i; + break; + } + if sep_idx == 5 { + // mountpoint parse error + return None; + } + } + + let mut mount_info = MountInfo::default(); + mount_info.mountpoint = parts[4].to_string(); + mount_info.fs_type = parts[sep_idx + 1].to_string(); + mount_info.source = parts[sep_idx + 2].to_string(); + mount_info.vfs_options = parts[sep_idx + 3].to_string(); + mount_info.id = str::parse::(parts[0]).unwrap(); + mount_info.parent = str::parse::(parts[1]).unwrap(); + let major_minor = parts[2].splitn(3, ":").collect::>(); + if major_minor.len() != 2 { + // mountpoint parse error. + return None; + } + mount_info.major = str::parse::(major_minor[0]).unwrap(); + mount_info.minor = str::parse::(major_minor[1]).unwrap(); + mount_info.root = parts[3].to_string(); + mount_info.options = parts[5].to_string(); + mount_info.optional = parts[6..sep_idx].join(" "); + if let Some(f) = &f { + if f(mount_info.clone()) { + // skip this mountpoint. This mountpoint is not the container's mountpoint + return None; + } + } + Some(mount_info) + }) + .collect(); + mount_points +} + #[cfg(test)] #[cfg(target_os = "linux")] mod tests {