From 99a9efd217ab69ca2ca33df4521450a8ce24f0ca Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 28 Aug 2023 15:49:01 +0200 Subject: [PATCH 01/18] rsync `mrcnski/pvf-sandbox-whole-process` branch from Polkadot --- polkadot/node/core/pvf/Cargo.toml | 2 +- polkadot/node/core/pvf/common/Cargo.toml | 5 +- polkadot/node/core/pvf/common/src/execute.rs | 4 +- polkadot/node/core/pvf/common/src/lib.rs | 18 +- polkadot/node/core/pvf/common/src/prepare.rs | 9 + .../node/core/pvf/common/src/worker/mod.rs | 96 +++-- .../core/pvf/common/src/worker/security.rs | 349 +++++++++++++++--- .../node/core/pvf/execute-worker/src/lib.rs | 110 +++--- .../node/core/pvf/prepare-worker/src/lib.rs | 111 ++++-- polkadot/node/core/pvf/src/execute/queue.rs | 17 + .../node/core/pvf/src/execute/worker_intf.rs | 22 +- polkadot/node/core/pvf/src/host.rs | 79 +++- polkadot/node/core/pvf/src/prepare/pool.rs | 33 +- .../node/core/pvf/src/prepare/worker_intf.rs | 50 ++- polkadot/node/core/pvf/src/testing.rs | 12 +- polkadot/node/core/pvf/src/worker_intf.rs | 52 ++- polkadot/node/core/pvf/tests/it/main.rs | 2 +- .../node/core/pvf/tests/it/worker_common.rs | 42 ++- 18 files changed, 798 insertions(+), 215 deletions(-) diff --git a/polkadot/node/core/pvf/Cargo.toml b/polkadot/node/core/pvf/Cargo.toml index 872ab0107cb8..5f6e3b26d904 100644 --- a/polkadot/node/core/pvf/Cargo.toml +++ b/polkadot/node/core/pvf/Cargo.toml @@ -47,7 +47,7 @@ assert_matches = "1.4.0" hex-literal = "0.3.4" polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] } # For the puppet worker, depend on ourselves with the test-utils feature. -polkadot-node-core-pvf = { path = "", features = ["test-utils"] } +polkadot-node-core-pvf = { path = ".", features = ["test-utils"] } adder = { package = "test-parachain-adder", path = "../../../parachain/test-parachains/adder" } halt = { package = "test-parachain-halt", path = "../../../parachain/test-parachains/halt" } diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index f9f900a0fec0..04b2a817dfe3 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -25,10 +25,11 @@ sc-executor-wasmtime = { path = "../../../../../substrate/client/executor/wasmti sp-core = { path = "../../../../../substrate/primitives/core" } sp-externalities = { path = "../../../../../substrate/primitives/externalities" } sp-io = { path = "../../../../../substrate/primitives/io" } -sp-tracing = { path = "../../../../../substrate/primitives/tracing" } +sp-tracing = { path = "../../../../../substrate/primitives/tracing", optional = true } [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.2.0" +rand = "0.8.5" [dev-dependencies] assert_matches = "1.4.0" @@ -37,4 +38,4 @@ tempfile = "3.3.0" [features] # This feature is used to export test code to other crates without putting it in the production build. # Also used for building the puppet worker. -test-utils = [] +test-utils = ["sp-tracing"] diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index de5ce39f7838..4ae4911624ca 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use crate::error::InternalValidationError; +use crate::{error::InternalValidationError, SecurityStatus}; use parity_scale_codec::{Decode, Encode}; use polkadot_parachain::primitives::ValidationResult; use polkadot_primitives::ExecutorParams; @@ -26,6 +26,8 @@ use std::time::Duration; pub struct Handshake { /// The executor parameters. pub executor_params: ExecutorParams, + /// Status of security features on the current system. + pub security_status: SecurityStatus, } /// The response from an execution job on the worker. diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs index 2cc9c72e182c..79d7e6529be2 100644 --- a/polkadot/node/core/pvf/common/src/lib.rs +++ b/polkadot/node/core/pvf/common/src/lib.rs @@ -25,16 +25,15 @@ pub mod worker; pub use cpu_time::ProcessTime; -/// DO NOT USE - internal for macros only. -#[doc(hidden)] -pub mod __private { - pub use sp_tracing::try_init_simple; -} +// Used by `decl_worker_main!`. +#[cfg(feature = "test-utils")] +pub use sp_tracing; const LOG_TARGET: &str = "parachain::pvf-common"; use std::mem; use tokio::io::{self, AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _}; +use parity_scale_codec::{Decode, Encode}; #[cfg(feature = "test-utils")] pub mod tests { @@ -44,6 +43,15 @@ pub mod tests { pub const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30); } +/// Status of security features on the current system. +#[derive(Debug, Clone, Encode, Decode)] +pub struct SecurityStatus { + /// Whether the landlock features we use are fully available on this system. + pub can_enable_landlock: bool, + // Whether we are able to unshare the user namespace and change the filesystem root. + pub can_unshare_user_namespace_and_change_root: bool, +} + /// Write some data prefixed by its length into `w`. pub async fn framed_send(w: &mut (impl AsyncWrite + Unpin), buf: &[u8]) -> io::Result<()> { let len_buf = buf.len().to_le_bytes(); diff --git a/polkadot/node/core/pvf/common/src/prepare.rs b/polkadot/node/core/pvf/common/src/prepare.rs index c205eddfb8b1..3f29efa864a4 100644 --- a/polkadot/node/core/pvf/common/src/prepare.rs +++ b/polkadot/node/core/pvf/common/src/prepare.rs @@ -14,6 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . +use crate::SecurityStatus; use parity_scale_codec::{Decode, Encode}; /// Preparation statistics, including the CPU time and memory taken. @@ -55,3 +56,11 @@ pub enum PrepareJobKind { /// A prechecking job. Prechecking, } + +/// The payload of the one-time handshake that is done when a worker process is created. Carries +/// data from the host to the worker. +#[derive(Encode, Decode)] +pub struct Handshake { + /// Status of security features on the current system. + pub security_status: SecurityStatus, +} diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 4ea0e5aa1a9a..de2914baa86e 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -23,7 +23,7 @@ use cpu_time::ProcessTime; use futures::never::Never; use std::{ any::Any, - path::PathBuf, + path::{Path, PathBuf}, sync::mpsc::{Receiver, RecvTimeoutError}, time::Duration, }; @@ -41,7 +41,7 @@ macro_rules! decl_worker_main { } fn main() { - $crate::__private::try_init_simple(); + $crate::sp_tracing::try_init_simple(); let args = std::env::args().collect::>(); if args.len() == 1 { @@ -58,6 +58,17 @@ macro_rules! decl_worker_main { println!("{}", $worker_version); return }, + "--check-can-unshare-user-namespace-and-change-root" => { + #[cfg(target_os = "linux")] + let status = if security::unshare_user_namespace_and_change_root().is_ok() { + 0 + } else { + -1 + }; + #[cfg(not(target_os = "linux"))] + let status = -1; + std::process::exit(status) + }, subcommand => { // Must be passed for compatibility with the single-binary test workers. if subcommand != $expected_command { @@ -70,17 +81,23 @@ macro_rules! decl_worker_main { } let mut node_version = None; - let mut socket_path: &str = ""; + let mut socket_path = None; + let mut cache_path = None; for i in (2..args.len()).step_by(2) { match args[i].as_ref() { - "--socket-path" => socket_path = args[i + 1].as_str(), + "--socket-path" => socket_path = Some(args[i + 1].as_str()), "--node-impl-version" => node_version = Some(args[i + 1].as_str()), + "--cache-path" => cache_path = Some(args[i + 1].as_str()), arg => panic!("Unexpected argument found: {}", arg), } } + let socket_path = socket_path.expect("the --socket-path argument is required"); + let cache_path = cache_path.expect("the --cache-path argument is required"); + + let cache_path = &std::path::Path::new(cache_path); - $entrypoint(&socket_path, node_version, Some($worker_version)); + $entrypoint(&socket_path, node_version, Some($worker_version), cache_path); } }; } @@ -102,6 +119,7 @@ pub fn worker_event_loop( socket_path: &str, node_version: Option<&str>, worker_version: Option<&str>, + cache_path: &Path, mut event_loop: F, ) where F: FnMut(UnixStream) -> Fut, @@ -115,6 +133,7 @@ pub fn worker_event_loop( if node_version != worker_version { gum::error!( target: LOG_TARGET, + %debug_id, %worker_pid, %node_version, %worker_version, @@ -127,7 +146,28 @@ pub fn worker_event_loop( } } - remove_env_vars(debug_id); + // TODO: Call based on security_config, error out if should work but fails. + #[cfg(target_os = "linux")] + { + if let Err(err_ctx) = security::change_root(cache_path) { + let err = io::Error::last_os_error(); + gum::error!( + target: LOG_TARGET, + %debug_id, + %worker_pid, + %err_ctx, + ?cache_path, + "Could not change root to be the cache path: {}", + err + ); + worker_shutdown_message(debug_id, worker_pid, err); + return + } + } + + security::remove_env_vars(debug_id); + + gum::info!(target: LOG_TARGET, "5. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); // Run the main worker loop. let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); @@ -151,48 +191,6 @@ pub fn worker_event_loop( rt.shutdown_background(); } -/// Delete all env vars to prevent malicious code from accessing them. -fn remove_env_vars(debug_id: &'static str) { - for (key, value) in std::env::vars_os() { - // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of - // randomness for malicious code. In the future we can remove it also and log in the host; - // see . - if key == "RUST_LOG" { - continue - } - - // In case of a key or value that would cause [`env::remove_var` to - // panic](https://doc.rust-lang.org/std/env/fn.remove_var.html#panics), we first log a - // warning and then proceed to attempt to remove the env var. - let mut err_reasons = vec![]; - let (key_str, value_str) = (key.to_str(), value.to_str()); - if key.is_empty() { - err_reasons.push("key is empty"); - } - if key_str.is_some_and(|s| s.contains('=')) { - err_reasons.push("key contains '='"); - } - if key_str.is_some_and(|s| s.contains('\0')) { - err_reasons.push("key contains null character"); - } - if value_str.is_some_and(|s| s.contains('\0')) { - err_reasons.push("value contains null character"); - } - if !err_reasons.is_empty() { - gum::warn!( - target: LOG_TARGET, - %debug_id, - ?key, - ?value, - "Attempting to remove badly-formatted env var, this may cause the PVF worker to crash. Please remove it yourself. Reasons: {:?}", - err_reasons - ); - } - - std::env::remove_var(key); - } -} - /// Provide a consistent message on worker shutdown. fn worker_shutdown_message(debug_id: &'static str, worker_pid: u32, err: io::Error) { gum::debug!(target: LOG_TARGET, %worker_pid, "quitting pvf worker ({}): {:?}", debug_id, err); @@ -299,7 +297,7 @@ pub mod thread { Arc::new((Mutex::new(WaitOutcome::Pending), Condvar::new())) } - /// Runs a worker thread. Will first enable security features, and afterwards notify the threads + /// Runs a worker thread. Will run the requested function, and afterwards notify the threads /// waiting on the condvar. Catches panics during execution and resumes the panics after /// triggering the condvar, so that the waiting thread is notified on panics. /// diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index 6c5f96e0b5db..a693c94a4c26 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -18,8 +18,140 @@ //! //! This is needed because workers are used to compile and execute untrusted code (PVFs). +use crate::LOG_TARGET; +#[cfg(target_os = "linux")] +use std::path::{Path, PathBuf}; + +/// Unshare the user namespace and change root to be the artifact directory. +#[cfg(target_os = "linux")] +pub fn unshare_user_namespace_and_change_root(cache_path: &Path) -> Result<(), &'static str> { + use rand::{distributions::Alphanumeric, Rng}; + use std::{ffi::CString, os::unix::ffi::OsStrExt, ptr}; + + const RANDOM_LEN: usize = 10; + let mut buf = Vec::with_capacity(RANDOM_LEN); + buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(RANDOM_LEN)); + let s = std::str::from_utf8(&buf) + .expect("the string is collected from a valid utf-8 sequence; qed"); + + let cache_path_str = match cache_path.to_str() { + Some(s) => s, + None => return Err("cache path is not valid UTF-8"), + }; + let cache_path_c = CString::new(cache_path.as_os_str().as_bytes()).unwrap(); + let root_absolute_c = CString::new("/").unwrap(); + // Append a random string to prevent races and to avoid dealing with the dir already existing. + let oldroot_relative_c = CString::new(format!("{}/oldroot-{}", cache_path_str, s)).unwrap(); + let oldroot_absolute_c = CString::new(format!("/oldroot-{}", s)).unwrap(); + + // SAFETY: TODO + unsafe { + // 1. `unshare` the user and the mount namespaces. + if libc::unshare(libc::CLONE_NEWUSER) < 0 { + return Err("unshare user namespace") + } + if libc::unshare(libc::CLONE_NEWNS) < 0 { + return Err("unshare mount namespace") + } + + // 2. `pivot_root` to the artifact directory. + gum::info!(target: LOG_TARGET, "1. {:?}", std::env::current_dir()); + gum::info!(target: LOG_TARGET, "1.5. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); + // Ensure that 'new_root' and its parent mount don't have shared propagation. + if libc::mount( + ptr::null(), + root_absolute_c.as_ptr(), + ptr::null(), + libc::MS_REC | libc::MS_PRIVATE, + ptr::null(), + ) < 0 + { + return Err("mount MS_PRIVATE") + } + if libc::mount( + cache_path_c.as_ptr(), + cache_path_c.as_ptr(), + ptr::null(), // ignored when MS_BIND is used + libc::MS_BIND | libc::MS_REC | libc::MS_NOEXEC | libc::MS_NODEV | libc::MS_NOSUID, + ptr::null(), // ignored when MS_BIND is used + ) < 0 + { + return Err("mount MS_BIND") + } + if libc::mkdir(oldroot_relative_c.as_ptr(), 0755) < 0 { + return Err("mkdir oldroot") + } + if libc::syscall(libc::SYS_pivot_root, cache_path_c.as_ptr(), oldroot_relative_c.as_ptr()) < + 0 + { + return Err("pivot_root") + } + + // 3. Change to the new root, `unmount2` and remove the old root. + if libc::chdir(root_absolute_c.as_ptr()) < 0 { + return Err("chdir to new root") + } + gum::info!(target: LOG_TARGET, "2. {:?}", std::env::current_dir()); + gum::info!(target: LOG_TARGET, "3. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); + if libc::umount2(oldroot_absolute_c.as_ptr(), libc::MNT_DETACH) < 0 { + return Err("umount2 the oldroot") + } + if libc::rmdir(oldroot_absolute_c.as_ptr()) < 0 { + return Err("rmdir the oldroot") + } + gum::info!(target: LOG_TARGET, "4. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); + + // TODO: do some assertions + } + + Ok(()) +} + +/// Delete all env vars to prevent malicious code from accessing them. +pub fn remove_env_vars(debug_id: &'static str) { + for (key, value) in std::env::vars_os() { + // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of + // randomness for malicious code. In the future we can remove it also and log in the host; + // see . + if key == "RUST_LOG" { + continue + } + + // In case of a key or value that would cause [`env::remove_var` to + // panic](https://doc.rust-lang.org/std/env/fn.remove_var.html#panics), we first log a + // warning and then proceed to attempt to remove the env var. + let mut err_reasons = vec![]; + let (key_str, value_str) = (key.to_str(), value.to_str()); + if key.is_empty() { + err_reasons.push("key is empty"); + } + if key_str.is_some_and(|s| s.contains('=')) { + err_reasons.push("key contains '='"); + } + if key_str.is_some_and(|s| s.contains('\0')) { + err_reasons.push("key contains null character"); + } + if value_str.is_some_and(|s| s.contains('\0')) { + err_reasons.push("value contains null character"); + } + if !err_reasons.is_empty() { + gum::warn!( + target: LOG_TARGET, + %debug_id, + ?key, + ?value, + "Attempting to remove badly-formatted env var, this may cause the PVF worker to crash. Please remove it yourself. Reasons: {:?}", + err_reasons + ); + } + + std::env::remove_var(key); + } +} + /// To what degree landlock is enabled. It's a separate struct from `RulesetStatus` because that is /// only available on Linux, plus this has a nicer name. +#[derive(Debug)] pub enum LandlockStatus { FullyEnforced, PartiallyEnforced, @@ -52,14 +184,19 @@ impl LandlockStatus { /// [landlock]: https://docs.rs/landlock/latest/landlock/index.html #[cfg(target_os = "linux")] pub mod landlock { - use landlock::{Access, AccessFs, Ruleset, RulesetAttr, RulesetError, RulesetStatus, ABI}; + pub use landlock::{path_beneath_rules, Access, AccessFs}; + + use landlock::{ + PathBeneath, PathFd, Ruleset, RulesetAttr, RulesetCreatedAttr, RulesetError, RulesetStatus, + ABI, + }; /// Landlock ABI version. We use ABI V1 because: /// /// 1. It is supported by our reference kernel version. /// 2. Later versions do not (yet) provide additional security. /// - /// # Versions (June 2023) + /// # Versions (as of June 2023) /// /// - Polkadot reference kernel version: 5.16+ /// - ABI V1: 5.13 - introduces landlock, including full restrictions on file reads @@ -87,10 +224,10 @@ pub mod landlock { /// Returns to what degree landlock is enabled with the given ABI on the current Linux /// environment. pub fn get_status() -> Result> { - match std::thread::spawn(|| try_restrict_thread()).join() { + match std::thread::spawn(|| try_restrict(std::iter::empty())).join() { Ok(Ok(status)) => Ok(status), Ok(Err(ruleset_err)) => Err(ruleset_err.into()), - Err(_err) => Err("a panic occurred in try_restrict_thread".into()), + Err(_err) => Err("a panic occurred in try_restrict".into()), } } @@ -108,20 +245,24 @@ pub mod landlock { status_is_fully_enabled(&get_status()) } - /// Tries to restrict the current thread with the following landlock access controls: + /// Tries to restrict the current thread (should only be called in a process' main thread) with + /// the following landlock access controls: /// - /// 1. all global filesystem access - /// 2. ... more may be supported in the future. + /// 1. all global filesystem access restricted, with optional exceptions + /// 2. ... more sandbox types (e.g. networking) may be supported in the future. /// /// If landlock is not supported in the current environment this is simply a noop. /// /// # Returns /// /// The status of the restriction (whether it was fully, partially, or not-at-all enforced). - pub fn try_restrict_thread() -> Result { + pub fn try_restrict( + fs_exceptions: impl Iterator, RulesetError>>, + ) -> Result { let status = Ruleset::new() .handle_access(AccessFs::from_all(LANDLOCK_ABI))? .create()? + .add_rules(fs_exceptions)? .restrict_self()?; Ok(status.ruleset) } @@ -132,55 +273,169 @@ pub mod landlock { use std::{fs, io::ErrorKind, thread}; #[test] - fn restricted_thread_cannot_access_fs() { + fn restricted_thread_cannot_read_file() { // TODO: This would be nice: . if !check_is_fully_enabled() { return } // Restricted thread cannot read from FS. - let handle = thread::spawn(|| { - // Write to a tmp file, this should succeed before landlock is applied. - let text = "foo"; - let tmpfile = tempfile::NamedTempFile::new().unwrap(); - let path = tmpfile.path(); - fs::write(path, text).unwrap(); - let s = fs::read_to_string(path).unwrap(); - assert_eq!(s, text); - - let status = try_restrict_thread().unwrap(); - if !matches!(status, RulesetStatus::FullyEnforced) { - panic!("Ruleset should be enforced since we checked if landlock is enabled"); - } - - // Try to read from the tmp file after landlock. - let result = fs::read_to_string(path); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); + let handle = + thread::spawn(|| { + // Create, write, and read two tmp files. This should succeed before any + // landlock restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); + let path1 = tmpfile1.path(); + let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); + let path2 = tmpfile2.path(); + + fs::write(path1, TEXT).unwrap(); + let s = fs::read_to_string(path1).unwrap(); + assert_eq!(s, TEXT); + fs::write(path2, TEXT).unwrap(); + let s = fs::read_to_string(path2).unwrap(); + assert_eq!(s, TEXT); + + // Apply Landlock with a read exception for only one of the files. + let status = try_restrict(path_beneath_rules( + &[path1], + AccessFs::from_read(LANDLOCK_ABI), + )); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); + } + + // Try to read from both files, only tmpfile1 should succeed. + let result = fs::read_to_string(path1); + assert!(matches!( + result, + Ok(s) if s == TEXT + )); + let result = fs::read_to_string(path2); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Apply Landlock for all files. + let status = try_restrict(std::iter::empty()); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); + } + + // Try to read from tmpfile1 after landlock, it should fail. + let result = fs::read_to_string(path1); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); assert!(handle.join().is_ok()); + } + + #[test] + fn restricted_thread_cannot_write_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } // Restricted thread cannot write to FS. - let handle = thread::spawn(|| { - let text = "foo"; - let tmpfile = tempfile::NamedTempFile::new().unwrap(); - let path = tmpfile.path(); - - let status = try_restrict_thread().unwrap(); - if !matches!(status, RulesetStatus::FullyEnforced) { - panic!("Ruleset should be enforced since we checked if landlock is enabled"); - } - - // Try to write to the tmp file after landlock. - let result = fs::write(path, text); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); + let handle = + thread::spawn(|| { + // Create and write two tmp files. This should succeed before any landlock + // restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); + let path1 = tmpfile1.path(); + let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); + let path2 = tmpfile2.path(); + + fs::write(path1, TEXT).unwrap(); + fs::write(path2, TEXT).unwrap(); + + // Apply Landlock with a write exception for only one of the files. + let status = try_restrict(path_beneath_rules( + &[path1], + AccessFs::from_write(LANDLOCK_ABI), + )); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); + } + + // Try to write to both files, only tmpfile1 should succeed. + let result = fs::write(path1, TEXT); + assert!(matches!(result, Ok(_))); + let result = fs::write(path2, TEXT); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Apply Landlock for all files. + let status = try_restrict(std::iter::empty()); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); + } + + // Try to write to tmpfile1 after landlock, it should fail. + let result = fs::write(path1, TEXT); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } + + #[test] + fn restricted_thread_can_read_files_but_not_list_dir() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread can read files but not list directory contents. + let handle = + thread::spawn(|| { + // Create, write to and read a tmp file. This should succeed before any landlock + // restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile = tempfile::NamedTempFile::new().unwrap(); + let filepath = tmpfile.path(); + let dirpath = filepath.parent().unwrap(); + + fs::write(filepath, TEXT).unwrap(); + let s = fs::read_to_string(filepath).unwrap(); + assert_eq!(s, TEXT); + + // Apply Landlock with a general read exception for the directory, *without* the + // `ReadDir` exception. + let status = try_restrict(path_beneath_rules( + &[dirpath], + AccessFs::from_read(LANDLOCK_ABI) ^ AccessFs::ReadDir, + )); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); + } + + // Try to read file, should still be able to. + let result = fs::read_to_string(filepath); + assert!(matches!( + result, + Ok(s) if s == TEXT + )); + + // Try to list dir contents, should fail. + let result = fs::read_dir(dirpath); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); assert!(handle.join().is_ok()); } diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 7a14de18a82f..d00abac30cc9 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -39,7 +39,7 @@ use polkadot_node_core_pvf_common::{ }; use polkadot_parachain::primitives::ValidationResult; use std::{ - path::PathBuf, + path::{Path, PathBuf}, sync::{mpsc::channel, Arc}, time::Duration, }; @@ -119,28 +119,76 @@ async fn send_response(stream: &mut UnixStream, response: Response) -> io::Resul /// /// # Parameters /// -/// The `socket_path` specifies the path to the socket used to communicate with the host. The -/// `node_version`, if `Some`, is checked against the worker version. A mismatch results in -/// immediate worker termination. `None` is used for tests and in other situations when version -/// check is not necessary. +/// - `socket_path` specifies the path to the socket used to communicate with the host. +/// +/// - `node_version`, if `Some`, is checked against the `worker_version`. A mismatch results in +/// immediate worker termination. `None` is used for tests and in other situations when version +/// check is not necessary. +/// +/// - `worker_version`: see above +/// +/// - `cache_path` contains the expected cache path for artifacts and is used to provide a sandbox +/// exception for landlock. pub fn worker_entrypoint( socket_path: &str, node_version: Option<&str>, worker_version: Option<&str>, + cache_path: &Path, ) { worker_event_loop( "execute", socket_path, node_version, worker_version, + cache_path, |mut stream| async move { let worker_pid = std::process::id(); - let handshake = recv_handshake(&mut stream).await?; - let executor = Executor::new(handshake.executor_params).map_err(|e| { + let Handshake { executor_params, security_status } = + recv_handshake(&mut stream).await?; + let executor = Executor::new(executor_params).map_err(|e| { io::Error::new(io::ErrorKind::Other, format!("cannot create executor: {}", e)) })?; + // Try to enable landlock. + { + #[cfg(target_os = "linux")] + let landlock_status = { + use polkadot_node_core_pvf_common::worker::security::landlock::{ + path_beneath_rules, try_restrict, Access, AccessFs, LANDLOCK_ABI, + }; + + // Allow an exception for reading from the artifact cache, but disallow listing + // the directory contents. Since we prepend artifact names with a random hash, + // this means attackers can't discover artifacts apart from the current job. + try_restrict(path_beneath_rules( + &[cache_path], + AccessFs::from_read(LANDLOCK_ABI) ^ AccessFs::ReadDir, + )) + .map(LandlockStatus::from_ruleset_status) + .map_err(|e| e.to_string()) + }; + #[cfg(not(target_os = "linux"))] + let landlock_status: Result = Ok(LandlockStatus::NotEnforced); + + // Error if the host determined that landlock is fully enabled and we couldn't fully + // enforce it here. + if security_status.can_enable_landlock && + !matches!(landlock_status, Ok(LandlockStatus::FullyEnforced)) + { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "could not fully enable landlock: {:?}", + landlock_status + ); + return Err(io::Error::new( + io::ErrorKind::Other, + format!("could not fully enable landlock: {:?}", landlock_status), + )) + } + } + loop { let (artifact_path, params, execution_timeout) = recv_request(&mut stream).await?; gum::debug!( @@ -150,9 +198,11 @@ pub fn worker_entrypoint( artifact_path.display(), ); + if !artifact_path.starts_with(cache_path) { + return Err(io::Error::new(io::ErrorKind::Other, format!("received an artifact path {artifact_path:?} that does not belong to expected artifact dir {cache_path:?}"))) + } + // Get the artifact bytes. - // - // We do this outside the thread so that we can lock down filesystem access there. let compiled_artifact_blob = match std::fs::read(artifact_path) { Ok(bytes) => bytes, Err(err) => { @@ -187,22 +237,11 @@ pub fn worker_entrypoint( let execute_thread = thread::spawn_worker_thread_with_stack_size( "execute thread", move || { - // Try to enable landlock. - #[cfg(target_os = "linux")] - let landlock_status = polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread() - .map(LandlockStatus::from_ruleset_status) - .map_err(|e| e.to_string()); - #[cfg(not(target_os = "linux"))] - let landlock_status: Result = Ok(LandlockStatus::NotEnforced); - - ( - validate_using_artifact( - &compiled_artifact_blob, - ¶ms, - executor_2, - cpu_time_start, - ), - landlock_status, + validate_using_artifact( + &compiled_artifact_blob, + ¶ms, + executor_2, + cpu_time_start, ) }, Arc::clone(&condvar), @@ -215,24 +254,9 @@ pub fn worker_entrypoint( let response = match outcome { WaitOutcome::Finished => { let _ = cpu_time_monitor_tx.send(()); - let (result, landlock_status) = execute_thread.join().unwrap_or_else(|e| { - ( - Response::Panic(stringify_panic_payload(e)), - Ok(LandlockStatus::Unavailable), - ) - }); - - // Log if landlock threw an error. - if let Err(err) = landlock_status { - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "error enabling landlock: {}", - err - ); - } - - result + execute_thread + .join() + .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))) }, // If the CPU thread is not selected, we signal it to end, the join handle is // dropped and the thread will finish in the background. diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index caa7d33df12a..c6cba418d314 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -34,7 +34,7 @@ use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, executor_intf::Executor, framed_recv, framed_send, - prepare::{MemoryStats, PrepareJobKind, PrepareStats}, + prepare::{Handshake, MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ bytes_to_path, cpu_time_monitor_loop, @@ -47,7 +47,7 @@ use polkadot_node_core_pvf_common::{ }; use polkadot_primitives::ExecutorParams; use std::{ - path::PathBuf, + path::{Path, PathBuf}, sync::{mpsc::channel, Arc}, time::Duration, }; @@ -69,6 +69,17 @@ impl AsRef<[u8]> for CompiledArtifact { } } +async fn recv_handshake(stream: &mut UnixStream) -> io::Result { + let handshake_enc = framed_recv(stream).await?; + let handshake = Handshake::decode(&mut &handshake_enc[..]).map_err(|_| { + io::Error::new( + io::ErrorKind::Other, + "prepare pvf recv_handshake: failed to decode Handshake".to_owned(), + ) + })?; + Ok(handshake) +} + async fn recv_request(stream: &mut UnixStream) -> io::Result<(PvfPrepData, PathBuf)> { let pvf = framed_recv(stream).await?; let pvf = PvfPrepData::decode(&mut &pvf[..]).map_err(|e| { @@ -95,10 +106,16 @@ async fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Re /// /// # Parameters /// -/// The `socket_path` specifies the path to the socket used to communicate with the host. The -/// `node_version`, if `Some`, is checked against the worker version. A mismatch results in -/// immediate worker termination. `None` is used for tests and in other situations when version -/// check is not necessary. +/// - `socket_path` specifies the path to the socket used to communicate with the host. +/// +/// - `node_version`, if `Some`, is checked against the `worker_version`. A mismatch results in +/// immediate worker termination. `None` is used for tests and in other situations when version +/// check is not necessary. +/// +/// - `worker_version`: see above +/// +/// - `cache_path` contains the expected cache path for artifacts and is used to provide a sandbox +/// exception for landlock. /// /// # Flow /// @@ -122,15 +139,63 @@ pub fn worker_entrypoint( socket_path: &str, node_version: Option<&str>, worker_version: Option<&str>, + cache_path: &Path, ) { worker_event_loop( "prepare", socket_path, node_version, worker_version, + cache_path, |mut stream| async move { let worker_pid = std::process::id(); + gum::info!(target: LOG_TARGET, "10. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); + + let Handshake { security_status } = recv_handshake(&mut stream).await?; + + gum::info!(target: LOG_TARGET, "11. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); + + // Try to enable landlock. + // { + // #[cfg(target_os = "linux")] + // let landlock_status = { + // use polkadot_node_core_pvf_common::worker::security::landlock::{ + // path_beneath_rules, try_restrict, Access, AccessFs, LANDLOCK_ABI, + // }; + + // // Allow an exception for writing to the artifact cache, with no allowance for + // // listing the directory contents. Since we prepend artifact names with a random + // // hash, this means attackers can't discover artifacts apart from the current + // // job. + // try_restrict(path_beneath_rules( + // &[cache_path], + // AccessFs::from_write(LANDLOCK_ABI), + // )) + // .map(LandlockStatus::from_ruleset_status) + // .map_err(|e| e.to_string()) + // }; + // #[cfg(not(target_os = "linux"))] + // let landlock_status: Result = + // Ok(LandlockStatus::NotEnforced); + + // // Error if the host determined that landlock is fully enabled and we couldn't fully + // // enforce it here. + // if landlock_enabled && !matches!(landlock_status, Ok(LandlockStatus::FullyEnforced)) + // { + // gum::warn!( + // target: LOG_TARGET, + // %worker_pid, + // "could not fully enable landlock: {:?}", + // landlock_status + // ); + // return Err(io::Error::new( + // io::ErrorKind::Other, + // format!("could not fully enable landlock: {:?}", landlock_status), + // )) + // } + // } + loop { let (pvf, temp_artifact_dest) = recv_request(&mut stream).await?; gum::debug!( @@ -139,6 +204,11 @@ pub fn worker_entrypoint( "worker: preparing artifact", ); + // if !temp_artifact_dest.starts_with(cache_path) { + // return Err(io::Error::new(io::ErrorKind::Other, format!("received an artifact + // path {temp_artifact_dest:?} that does not belong to expected cache path + // {cache_path:?}"))) } + let preparation_timeout = pvf.prep_timeout(); let prepare_job_kind = pvf.prep_kind(); let executor_params = (*pvf.executor_params()).clone(); @@ -172,14 +242,6 @@ pub fn worker_entrypoint( let prepare_thread = thread::spawn_worker_thread( "prepare thread", move || { - // Try to enable landlock. - #[cfg(target_os = "linux")] - let landlock_status = polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread() - .map(LandlockStatus::from_ruleset_status) - .map_err(|e| e.to_string()); - #[cfg(not(target_os = "linux"))] - let landlock_status: Result = Ok(LandlockStatus::NotEnforced); - #[allow(unused_mut)] let mut result = prepare_artifact(pvf, cpu_time_start); @@ -200,7 +262,7 @@ pub fn worker_entrypoint( }); } - (result, landlock_status) + result }, Arc::clone(&condvar), WaitOutcome::Finished, @@ -213,16 +275,13 @@ pub fn worker_entrypoint( let _ = cpu_time_monitor_tx.send(()); match prepare_thread.join().unwrap_or_else(|err| { - ( - Err(PrepareError::Panic(stringify_panic_payload(err))), - Ok(LandlockStatus::Unavailable), - ) + Err(PrepareError::Panic(stringify_panic_payload(err))) }) { - (Err(err), _) => { + Err(err) => { // Serialized error will be written into the socket. Err(err) }, - (Ok(ok), landlock_status) => { + Ok(ok) => { #[cfg(not(target_os = "linux"))] let (artifact, cpu_time_elapsed) = ok; #[cfg(target_os = "linux")] @@ -242,16 +301,6 @@ pub fn worker_entrypoint( max_rss: extract_max_rss_stat(max_rss, worker_pid), }; - // Log if landlock threw an error. - if let Err(err) = landlock_status { - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "error enabling landlock: {}", - err - ); - } - // Write the serialized artifact into a temp file. // // PVF host only keeps artifacts statuses in its memory, diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index acb260e25693..b9394a18e1df 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -24,6 +24,7 @@ use crate::{ worker_intf::{IdleWorker, WorkerHandle}, InvalidCandidate, ValidationError, LOG_TARGET, }; +use polkadot_node_core_pvf_common::SecurityStatus; use futures::{ channel::mpsc, future::BoxFuture, @@ -141,6 +142,8 @@ struct Queue { program_path: PathBuf, spawn_timeout: Duration, node_version: Option, + cache_path: PathBuf, + security_status: SecurityStatus, /// The queue of jobs that are waiting for a worker to pick up. queue: VecDeque, @@ -155,6 +158,8 @@ impl Queue { worker_capacity: usize, spawn_timeout: Duration, node_version: Option, + cache_path: PathBuf, + security_status: SecurityStatus, to_queue_rx: mpsc::Receiver, ) -> Self { Self { @@ -162,6 +167,8 @@ impl Queue { program_path, spawn_timeout, node_version, + cache_path, + security_status, to_queue_rx, queue: VecDeque::new(), mux: Mux::new(), @@ -408,6 +415,8 @@ fn spawn_extra_worker(queue: &mut Queue, job: ExecuteJob) { job, queue.spawn_timeout, queue.node_version.clone(), + queue.cache_path.clone(), + queue.security_status.clone(), ) .boxed(), ); @@ -426,6 +435,8 @@ async fn spawn_worker_task( job: ExecuteJob, spawn_timeout: Duration, node_version: Option, + cache_path: PathBuf, + security_status: SecurityStatus, ) -> QueueEvent { use futures_timer::Delay; @@ -435,6 +446,8 @@ async fn spawn_worker_task( job.executor_params.clone(), spawn_timeout, node_version.as_deref(), + &cache_path, + security_status.clone(), ) .await { @@ -499,6 +512,8 @@ pub fn start( worker_capacity: usize, spawn_timeout: Duration, node_version: Option, + cache_path: PathBuf, + security_status: SecurityStatus, ) -> (mpsc::Sender, impl Future) { let (to_queue_tx, to_queue_rx) = mpsc::channel(20); let run = Queue::new( @@ -507,6 +522,8 @@ pub fn start( worker_capacity, spawn_timeout, node_version, + cache_path, + security_status, to_queue_rx, ) .run(); diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 948abd2261d7..91a9c219b34c 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -31,6 +31,7 @@ use polkadot_node_core_pvf_common::{ error::InternalValidationError, execute::{Handshake, Response}, framed_recv, framed_send, + SecurityStatus }; use polkadot_parachain::primitives::ValidationResult; use polkadot_primitives::ExecutorParams; @@ -46,14 +47,27 @@ pub async fn spawn( executor_params: ExecutorParams, spawn_timeout: Duration, node_version: Option<&str>, + cache_path: &Path, + security_status: SecurityStatus, ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { - let mut extra_args = vec!["execute-worker"]; + let cache_path_str = match cache_path.to_str() { + Some(a) => a, + None => return Err(SpawnErr::InvalidCachePath(cache_path.to_owned())), + }; + let mut extra_args = vec!["execute-worker", "--cache-path", cache_path_str]; if let Some(node_version) = node_version { extra_args.extend_from_slice(&["--node-impl-version", node_version]); } - let (mut idle_worker, worker_handle) = - spawn_with_program_path("execute", program_path, &extra_args, spawn_timeout).await?; - send_handshake(&mut idle_worker.stream, Handshake { executor_params }) + + let (mut idle_worker, worker_handle) = spawn_with_program_path( + "execute", + program_path, + Some(cache_path), + &extra_args, + spawn_timeout, + ) + .await?; + send_handshake(&mut idle_worker.stream, Handshake { executor_params, security_status }) .await .map_err(|error| { gum::warn!( diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 9f3b7e23fd89..8de7f3f6a373 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -34,6 +34,7 @@ use futures::{ use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, pvf::PvfPrepData, + SecurityStatus, }; use polkadot_parachain::primitives::ValidationResult; use std::{ @@ -202,8 +203,13 @@ impl Config { pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future) { gum::debug!(target: LOG_TARGET, ?config, "starting PVF validation host"); - // Run checks for supported security features once per host startup. - warn_if_no_landlock(); + // Run checks for supported security features once per host startup. Warn here if not enabled. + let security_status = { + let can_enable_landlock = check_landlock(); + let (can_unshare_user_namespace_and_change_root) = + check_can_unshare_user_namespace_and_change_root(&config.prepare_worker_program_path); + SecurityStatus { can_enable_landlock, can_unshare_user_namespace_and_change_root } + }; let (to_host_tx, to_host_rx) = mpsc::channel(10); @@ -215,6 +221,7 @@ pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future (ValidationHost, impl Future impl futures::Stream .map(|_| ()) } +/// Check if we can sandbox the root and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. The process must be +/// single-threaded, so we can't just fork here. To get as close as possible to running unshare and +/// pivot_root in a worker, we try them... in a worker. The expected return status is 0 on success +/// and -1 on failure. +fn check_can_unshare_user_namespace_and_change_root(prepare_worker_program_path: &Path) -> bool { + #[cfg(target_os = "linux")] + { + match Command::new(prepare_worker_program_path) + .arg("--check-can-unshare-user-namespace-and-change-root") + .status() + { + Ok(0) => true, + Ok(status) => { + gum::warn!( + target:LOG_TARGET, + %prepare_worker_program_path, + ?status, + "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target:LOG_TARGET, + %prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } + + #[cfg(not(target_os = "linux"))] + { + gum::warn!( + target: LOG_TARGET, + "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security." + ); + false + } +} + /// Check if landlock is supported and emit a warning if not. -fn warn_if_no_landlock() { +/// +/// TODO: Run in child process. +fn check_landlock() -> bool { #[cfg(target_os = "linux")] { use polkadot_node_core_pvf_common::worker::security::landlock; + let status = landlock::get_status(); if !landlock::status_is_fully_enabled(&status) { let abi = landlock::LANDLOCK_ABI as u8; @@ -885,16 +942,22 @@ fn warn_if_no_landlock() { target: LOG_TARGET, ?status, %abi, - "Cannot fully enable landlock, a Linux kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." ); + false + } else { + true } } #[cfg(not(target_os = "linux"))] - gum::warn!( - target: LOG_TARGET, - "Cannot enable landlock, a Linux kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." - ); + { + gum::warn!( + target: LOG_TARGET, + "Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." + ); + false + } } #[cfg(test)] diff --git a/polkadot/node/core/pvf/src/prepare/pool.rs b/polkadot/node/core/pvf/src/prepare/pool.rs index 92aa4896c263..95a66a1d5f5a 100644 --- a/polkadot/node/core/pvf/src/prepare/pool.rs +++ b/polkadot/node/core/pvf/src/prepare/pool.rs @@ -27,6 +27,7 @@ use futures::{ use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, pvf::PvfPrepData, + SecurityStatus, }; use slotmap::HopSlotMap; use std::{ @@ -110,10 +111,12 @@ enum PoolEvent { type Mux = FuturesUnordered>; struct Pool { + // Some variables related to the current session. program_path: PathBuf, cache_path: PathBuf, spawn_timeout: Duration, node_version: Option, + security_status: SecurityStatus, to_pool: mpsc::Receiver, from_pool: mpsc::UnboundedSender, @@ -132,6 +135,7 @@ async fn run( cache_path, spawn_timeout, node_version, + security_status, to_pool, mut from_pool, mut spawned, @@ -160,6 +164,7 @@ async fn run( &cache_path, spawn_timeout, node_version.clone(), + security_status.clone(), &mut spawned, &mut mux, to_pool, @@ -207,6 +212,7 @@ fn handle_to_pool( cache_path: &Path, spawn_timeout: Duration, node_version: Option, + security_status: SecurityStatus, spawned: &mut HopSlotMap, mux: &mut Mux, to_pool: ToPool, @@ -216,7 +222,14 @@ fn handle_to_pool( gum::debug!(target: LOG_TARGET, "spawning a new prepare worker"); metrics.prepare_worker().on_begin_spawn(); mux.push( - spawn_worker_task(program_path.to_owned(), spawn_timeout, node_version).boxed(), + spawn_worker_task( + program_path.to_owned(), + spawn_timeout, + node_version, + cache_path.to_owned(), + security_status, + ) + .boxed(), ); }, ToPool::StartWork { worker, pvf, artifact_path } => { @@ -231,6 +244,7 @@ fn handle_to_pool( pvf, cache_path.to_owned(), artifact_path, + security_status, preparation_timer, ) .boxed(), @@ -260,11 +274,21 @@ async fn spawn_worker_task( program_path: PathBuf, spawn_timeout: Duration, node_version: Option, + cache_path: PathBuf, + security_status: SecurityStatus, ) -> PoolEvent { use futures_timer::Delay; loop { - match worker_intf::spawn(&program_path, spawn_timeout, node_version.as_deref()).await { + match worker_intf::spawn( + &program_path, + spawn_timeout, + node_version.as_deref(), + &cache_path, + security_status.clone(), + ) + .await + { Ok((idle, handle)) => break PoolEvent::Spawn(idle, handle), Err(err) => { gum::warn!(target: LOG_TARGET, "failed to spawn a prepare worker: {:?}", err); @@ -283,9 +307,10 @@ async fn start_work_task( pvf: PvfPrepData, cache_path: PathBuf, artifact_path: PathBuf, + security_status: SecurityStatus, _preparation_timer: Option, ) -> PoolEvent { - let outcome = worker_intf::start_work(&metrics, idle, pvf, &cache_path, artifact_path).await; + let outcome = worker_intf::start_work(&metrics, idle, pvf, &cache_path, artifact_path, security_status).await; PoolEvent::StartWork(worker, outcome) } @@ -434,6 +459,7 @@ pub fn start( cache_path: PathBuf, spawn_timeout: Duration, node_version: Option, + security_status: SecurityStatus, ) -> (mpsc::Sender, mpsc::UnboundedReceiver, impl Future) { let (to_pool_tx, to_pool_rx) = mpsc::channel(10); let (from_pool_tx, from_pool_rx) = mpsc::unbounded(); @@ -444,6 +470,7 @@ pub fn start( cache_path, spawn_timeout, node_version, + security_status, to_pool: to_pool_rx, from_pool: from_pool_tx, spawned: HopSlotMap::with_capacity_and_key(20), diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 5280ab6b42a2..9329c3df7dbd 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -28,8 +28,9 @@ use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, framed_recv, framed_send, - prepare::PrepareStats, + prepare::{Handshake, PrepareStats}, pvf::PvfPrepData, + SecurityStatus, }; use sp_core::hexdisplay::HexDisplay; @@ -46,12 +47,38 @@ pub async fn spawn( program_path: &Path, spawn_timeout: Duration, node_version: Option<&str>, + cache_path: &Path, + security_status: SecurityStatus, ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { - let mut extra_args = vec!["prepare-worker"]; + let cache_path_str = match cache_path.to_str() { + Some(a) => a, + None => return Err(SpawnErr::InvalidCachePath(cache_path.to_owned())), + }; + let mut extra_args = vec!["prepare-worker", "--cache-path", cache_path_str]; if let Some(node_version) = node_version { extra_args.extend_from_slice(&["--node-impl-version", node_version]); } - spawn_with_program_path("prepare", program_path, &extra_args, spawn_timeout).await + + let (mut idle_worker, worker_handle) = spawn_with_program_path( + "prepare", + program_path, + Some(cache_path), + &extra_args, + spawn_timeout, + ) + .await?; + send_handshake(&mut idle_worker.stream, Handshake { security_status }) + .await + .map_err(|error| { + gum::warn!( + target: LOG_TARGET, + worker_pid = %idle_worker.pid, + ?error, + "failed to send a handshake to the spawned worker", + ); + SpawnErr::Handshake + })?; + Ok((idle_worker, worker_handle)) } pub enum Outcome { @@ -86,19 +113,30 @@ pub async fn start_work( pvf: PvfPrepData, cache_path: &Path, artifact_path: PathBuf, + security_status: SecurityStatus, ) -> Outcome { let IdleWorker { stream, pid } = worker; gum::debug!( target: LOG_TARGET, worker_pid = %pid, + ?security_status, "starting prepare for {}", artifact_path.display(), ); with_tmp_file(stream, pid, cache_path, |tmp_file, mut stream| async move { + // Pass the socket path relative to the cache_path (what the child thinks is root). + let tmp_file_worker_view = if security_status.can_unshare_user_namespace_and_change_root { + Path::new(".").with_file_name( + tmp_file.file_name().expect("tmp files are created with a filename; qed"), + ) + } else { + tmp_file.clone() + }; + let preparation_timeout = pvf.prep_timeout(); - if let Err(err) = send_request(&mut stream, pvf, &tmp_file).await { + if let Err(err) = send_request(&mut stream, pvf, &tmp_file_worker_view).await { gum::warn!( target: LOG_TARGET, worker_pid = %pid, @@ -278,6 +316,10 @@ async fn send_request( Ok(()) } +async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Result<()> { + framed_send(stream, &handshake.encode()).await +} + async fn recv_response(stream: &mut UnixStream, pid: u32) -> io::Result { let result = framed_recv(stream).await?; let result = PrepareResult::decode(&mut &result[..]).map_err(|e| { diff --git a/polkadot/node/core/pvf/src/testing.rs b/polkadot/node/core/pvf/src/testing.rs index 980a28c01566..129d55337ca3 100644 --- a/polkadot/node/core/pvf/src/testing.rs +++ b/polkadot/node/core/pvf/src/testing.rs @@ -75,17 +75,23 @@ macro_rules! decl_puppet_worker_main { }; let mut node_version = None; - let mut socket_path: &str = ""; + let mut socket_path = None; + let mut cache_path = None; for i in (2..args.len()).step_by(2) { match args[i].as_ref() { - "--socket-path" => socket_path = args[i + 1].as_str(), + "--socket-path" => socket_path = Some(args[i + 1].as_str()), "--node-impl-version" => node_version = Some(args[i + 1].as_str()), + "--cache-path" => cache_path = Some(args[i + 1].as_str()), arg => panic!("Unexpected argument found: {}", arg), } } + let socket_path = socket_path.expect("the --socket-path argument is required"); + let cache_path = cache_path.expect("the --cache-path argument is required"); - entrypoint(&socket_path, node_version, None); + let cache_path = &std::path::Path::new(cache_path); + + entrypoint(&socket_path, node_version, None, cache_path); } }; } diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index 795ad4524443..e020a0af2aec 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -39,15 +39,31 @@ use tokio::{ pub const JOB_TIMEOUT_WALL_CLOCK_FACTOR: u32 = 4; /// This is publicly exposed only for integration tests. +/// +/// # Parameters +/// +/// - `debug_id`: An identifier for the process (e.g. "execute" or "prepare"). +/// +/// - `program_path`: The path to the program. +/// +/// - `socket_dir_path`: An optional path to the dir where the socket should be created, if `None` +/// use a temp dir. +/// +/// - `extra_args`: Optional extra CLI arguments to the program. NOTE: Should only contain data +/// required before the handshake, like node/worker versions for the version check. Other data +/// should go through the handshake. +/// +/// - `spawn_timeout`: The amount of time to wait for the child process to spawn. #[doc(hidden)] pub async fn spawn_with_program_path( debug_id: &'static str, program_path: impl Into, + socket_dir_path: Option<&Path>, extra_args: &[&str], spawn_timeout: Duration, ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { let program_path = program_path.into(); - with_transient_socket_path(debug_id, |socket_path| { + with_transient_socket_path(debug_id, socket_dir_path, |socket_path| { let socket_path = socket_path.to_owned(); let extra_args: Vec = extra_args.iter().map(|arg| arg.to_string()).collect(); @@ -109,14 +125,23 @@ pub async fn spawn_with_program_path( .await } -async fn with_transient_socket_path(debug_id: &'static str, f: F) -> Result +async fn with_transient_socket_path( + debug_id: &'static str, + socket_dir_path: Option<&Path>, + f: F, +) -> Result where F: FnOnce(&Path) -> Fut, Fut: futures::Future> + 'static, { - let socket_path = tmpfile(&format!("pvf-host-{}", debug_id)) - .await - .map_err(|_| SpawnErr::TmpFile)?; + let socket_prefix = format!("pvf-host-{}-", debug_id); + let socket_path = if let Some(socket_dir_path) = socket_dir_path { + tmpfile_in(&socket_prefix, socket_dir_path).await + } else { + tmpfile(&socket_prefix).await + } + .map_err(|_| SpawnErr::TmpFile)?; + let result = f(&socket_path).await; // Best effort to remove the socket file. Under normal circumstances the socket will be removed @@ -194,6 +219,8 @@ pub enum SpawnErr { AcceptTimeout, /// Failed to send handshake after successful spawning was signaled Handshake, + /// Cache path is not a valid UTF-8 str. + InvalidCachePath(PathBuf), } /// This is a representation of a potentially running worker. Drop it and the process will be @@ -221,10 +248,23 @@ impl WorkerHandle { extra_args: &[String], socket_path: impl AsRef, ) -> io::Result { + // Pass the socket path relative to the cache_path (what the child thinks is root). + let socket_path = if security_config.can_unshare_user_namespace_and_change_root { + Path::new(".").with_file_name( + socket_path + .as_ref() + .file_name() + .expect("socket paths are created with a filename; qed"), + ) + } else { + // We are unable to pivot-root, so pass the socket path as-is. + socket_path.as_ref().as_os_str() + }; + let mut child = process::Command::new(program.as_ref()) .args(extra_args) .arg("--socket-path") - .arg(socket_path.as_ref().as_os_str()) + .arg(socket_path) .stdout(std::process::Stdio::piped()) .kill_on_drop(true) .spawn()?; diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 72c459c2f632..0f30efefc4cd 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -258,7 +258,7 @@ async fn execute_queue_doesnt_stall_with_varying_executor_params() { #[tokio::test] async fn deleting_prepared_artifact_does_not_dispute() { let host = TestHost::new(); - let cache_dir = host.cache_dir.path().clone(); + let cache_dir = host.cache_dir.path(); let result = host .validate_candidate( diff --git a/polkadot/node/core/pvf/tests/it/worker_common.rs b/polkadot/node/core/pvf/tests/it/worker_common.rs index a3bf552e894a..4184c68fe8be 100644 --- a/polkadot/node/core/pvf/tests/it/worker_common.rs +++ b/polkadot/node/core/pvf/tests/it/worker_common.rs @@ -23,26 +23,54 @@ use crate::PUPPET_EXE; // Test spawning a program that immediately exits with a failure code. #[tokio::test] async fn spawn_immediate_exit() { - let result = - spawn_with_program_path("integration-test", PUPPET_EXE, &["exit"], Duration::from_secs(2)) - .await; + let result = spawn_with_program_path( + "integration-test", + PUPPET_EXE, + None, + &["exit"], + Duration::from_secs(2), + ) + .await; assert!(matches!(result, Err(SpawnErr::AcceptTimeout))); } #[tokio::test] async fn spawn_timeout() { - let result = - spawn_with_program_path("integration-test", PUPPET_EXE, &["sleep"], Duration::from_secs(2)) - .await; + let result = spawn_with_program_path( + "integration-test", + PUPPET_EXE, + None, + &["sleep"], + Duration::from_secs(2), + ) + .await; + assert!(matches!(result, Err(SpawnErr::AcceptTimeout))); +} + +#[tokio::test] +async fn should_fail_without_cache_path() { + // --socket-path is handled by `spawn_with_program_path` so we don't pass it here. + let result = spawn_with_program_path( + "integration-test", + PUPPET_EXE, + None, + &["prepare-worker"], + Duration::from_secs(2), + ) + .await; assert!(matches!(result, Err(SpawnErr::AcceptTimeout))); } #[tokio::test] async fn should_connect() { + let cache_path = tempfile::tempdir().unwrap(); + let cache_path_str = cache_path.path().to_str().unwrap(); + let _ = spawn_with_program_path( "integration-test", PUPPET_EXE, - &["prepare-worker"], + Some(cache_path.path()), + &["prepare-worker", "--cache-path", cache_path_str], Duration::from_secs(2), ) .await From c8f296271d641c8ce2db515988267558eecde040 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 3 Sep 2023 14:27:12 +0200 Subject: [PATCH 02/18] Do the rest --- Cargo.lock | 1 + polkadot/node/core/pvf/common/src/error.rs | 41 ++- polkadot/node/core/pvf/common/src/execute.rs | 6 +- polkadot/node/core/pvf/common/src/lib.rs | 4 +- polkadot/node/core/pvf/common/src/prepare.rs | 9 - .../node/core/pvf/common/src/worker/mod.rs | 123 ++++++--- .../core/pvf/common/src/worker/security.rs | 46 ++-- .../node/core/pvf/common/src/worker_dir.rs | 35 +++ .../node/core/pvf/execute-worker/src/lib.rs | 54 ++-- .../node/core/pvf/prepare-worker/src/lib.rs | 124 ++++----- polkadot/node/core/pvf/src/artifacts.rs | 2 +- polkadot/node/core/pvf/src/execute/queue.rs | 10 +- .../node/core/pvf/src/execute/worker_intf.rs | 214 +++++++++------ polkadot/node/core/pvf/src/host.rs | 68 +++-- polkadot/node/core/pvf/src/lib.rs | 3 + polkadot/node/core/pvf/src/prepare/pool.rs | 34 +-- .../node/core/pvf/src/prepare/worker_intf.rs | 249 +++++++++--------- polkadot/node/core/pvf/src/testing.rs | 64 ++++- polkadot/node/core/pvf/src/worker_intf.rs | 210 ++++++++++----- .../node/core/pvf/tests/it/worker_common.rs | 27 +- 20 files changed, 767 insertions(+), 557 deletions(-) create mode 100644 polkadot/node/core/pvf/common/src/worker_dir.rs diff --git a/Cargo.lock b/Cargo.lock index c755be63042b..dd65620f8041 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12274,6 +12274,7 @@ dependencies = [ "parity-scale-codec", "polkadot-parachain", "polkadot-primitives", + "rand 0.8.5", "sc-executor", "sc-executor-common", "sc-executor-wasmtime", diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 6eb0d9b7df42..6fdd06057c8b 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -44,7 +44,17 @@ pub enum PrepareError { /// The response from the worker is received, but the file cannot be renamed (moved) to the /// final destination location. This state is reported by the validation host (not by the /// worker). - RenameTmpFileErr(String), + RenameTmpFileErr { + err: String, + // Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible + // conversion to `Option`. + src: Option, + dest: Option, + }, + /// The response from the worker is received, but the worker cache could not be cleared. The + /// worker has to be killed to avoid jobs having access to data from other jobs. This state is + /// reported by the validation host (not by the worker). + ClearWorkerDir(String), } impl PrepareError { @@ -58,7 +68,11 @@ impl PrepareError { use PrepareError::*; match self { Prevalidation(_) | Preparation(_) | Panic(_) => true, - TimedOut | IoErr(_) | CreateTmpFileErr(_) | RenameTmpFileErr(_) => false, + TimedOut | + IoErr(_) | + CreateTmpFileErr(_) | + RenameTmpFileErr { .. } | + ClearWorkerDir(_) => false, // Can occur due to issues with the PVF, but also due to local errors. RuntimeConstruction(_) => false, } @@ -76,7 +90,9 @@ impl fmt::Display for PrepareError { TimedOut => write!(f, "prepare: timeout"), IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err), CreateTmpFileErr(err) => write!(f, "prepare: error creating tmp file: {}", err), - RenameTmpFileErr(err) => write!(f, "prepare: error renaming tmp file: {}", err), + RenameTmpFileErr { err, src, dest } => + write!(f, "prepare: error renaming tmp file ({:?} -> {:?}): {}", src, dest, err), + ClearWorkerDir(err) => write!(f, "prepare: error clearing worker cache: {}", err), } } } @@ -89,8 +105,17 @@ impl fmt::Display for PrepareError { pub enum InternalValidationError { /// Some communication error occurred with the host. HostCommunication(String), + /// Host could not create a hard link to the artifact path. + CouldNotCreateLink(String), /// Could not find or open compiled artifact file. CouldNotOpenFile(String), + /// Host could not clear the worker cache after a job. + CouldNotClearWorkerDir { + err: String, + // Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible + // conversion to `Option`. + path: Option, + }, /// An error occurred in the CPU time monitor thread. Should be totally unrelated to /// validation. CpuTimeMonitorThread(String), @@ -104,8 +129,18 @@ impl fmt::Display for InternalValidationError { match self { HostCommunication(err) => write!(f, "validation: some communication error occurred with the host: {}", err), + CouldNotCreateLink(err) => write!( + f, + "validation: host could not create a hard link to the artifact path: {}", + err + ), CouldNotOpenFile(err) => write!(f, "validation: could not find or open compiled artifact file: {}", err), + CouldNotClearWorkerDir { err, path } => write!( + f, + "validation: host could not clear the worker cache ({:?}) after a job: {}", + path, err + ), CpuTimeMonitorThread(err) => write!(f, "validation: an error occurred in the CPU time monitor thread: {}", err), NonDeterministicPrepareError(err) => write!(f, "validation: prepare: {}", err), diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index 4ae4911624ca..3c42adda6266 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -14,20 +14,18 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use crate::{error::InternalValidationError, SecurityStatus}; +use crate::error::InternalValidationError; use parity_scale_codec::{Decode, Encode}; use polkadot_parachain::primitives::ValidationResult; use polkadot_primitives::ExecutorParams; use std::time::Duration; /// The payload of the one-time handshake that is done when a worker process is created. Carries -/// data from the host to the worker. +/// data from the host to the worker that would be too large for CLI parameters.. #[derive(Encode, Decode)] pub struct Handshake { /// The executor parameters. pub executor_params: ExecutorParams, - /// Status of security features on the current system. - pub security_status: SecurityStatus, } /// The response from an execution job on the worker. diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs index 79d7e6529be2..e56da7efed46 100644 --- a/polkadot/node/core/pvf/common/src/lib.rs +++ b/polkadot/node/core/pvf/common/src/lib.rs @@ -22,6 +22,7 @@ pub mod executor_intf; pub mod prepare; pub mod pvf; pub mod worker; +pub mod worker_dir; pub use cpu_time::ProcessTime; @@ -33,7 +34,6 @@ const LOG_TARGET: &str = "parachain::pvf-common"; use std::mem; use tokio::io::{self, AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _}; -use parity_scale_codec::{Decode, Encode}; #[cfg(feature = "test-utils")] pub mod tests { @@ -44,7 +44,7 @@ pub mod tests { } /// Status of security features on the current system. -#[derive(Debug, Clone, Encode, Decode)] +#[derive(Debug, Clone, Default)] pub struct SecurityStatus { /// Whether the landlock features we use are fully available on this system. pub can_enable_landlock: bool, diff --git a/polkadot/node/core/pvf/common/src/prepare.rs b/polkadot/node/core/pvf/common/src/prepare.rs index 3f29efa864a4..c205eddfb8b1 100644 --- a/polkadot/node/core/pvf/common/src/prepare.rs +++ b/polkadot/node/core/pvf/common/src/prepare.rs @@ -14,7 +14,6 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use crate::SecurityStatus; use parity_scale_codec::{Decode, Encode}; /// Preparation statistics, including the CPU time and memory taken. @@ -56,11 +55,3 @@ pub enum PrepareJobKind { /// A prechecking job. Prechecking, } - -/// The payload of the one-time handshake that is done when a worker process is created. Carries -/// data from the host to the worker. -#[derive(Encode, Decode)] -pub struct Handshake { - /// Status of security features on the current system. - pub security_status: SecurityStatus, -} diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index de2914baa86e..4e27b1455c2d 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -18,12 +18,12 @@ pub mod security; -use crate::LOG_TARGET; +use crate::{worker_dir, SecurityStatus, LOG_TARGET}; use cpu_time::ProcessTime; use futures::never::Never; use std::{ any::Any, - path::{Path, PathBuf}, + path::PathBuf, sync::mpsc::{Receiver, RecvTimeoutError}, time::Duration, }; @@ -41,6 +41,9 @@ macro_rules! decl_worker_main { } fn main() { + #[cfg(target_os = "linux")] + use $crate::worker::security; + $crate::sp_tracing::try_init_simple(); let args = std::env::args().collect::>(); @@ -58,9 +61,23 @@ macro_rules! decl_worker_main { println!("{}", $worker_version); return }, + + "--check-can-enable-landlock" => { + #[cfg(target_os = "linux")] + let status = if security::landlock::status_is_fully_enabled( + &security::landlock::get_status(), + ) { + 0 + } else { + -1 + }; + #[cfg(not(target_os = "linux"))] + let status = -1; + std::process::exit(status) + }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] - let status = if security::unshare_user_namespace_and_change_root().is_ok() { + let status = if security::unshare_user_namespace_and_change_root(&std::env::temp_dir()).is_ok() { 0 } else { -1 @@ -69,6 +86,7 @@ macro_rules! decl_worker_main { let status = -1; std::process::exit(status) }, + subcommand => { // Must be passed for compatibility with the single-binary test workers. if subcommand != $expected_command { @@ -80,24 +98,39 @@ macro_rules! decl_worker_main { }, } + let mut worker_dir_path = None; let mut node_version = None; - let mut socket_path = None; - let mut cache_path = None; + let mut can_enable_landlock = false; + let mut can_unshare_user_namespace_and_change_root = false; - for i in (2..args.len()).step_by(2) { + let mut i = 2; + while i < args.len() { match args[i].as_ref() { - "--socket-path" => socket_path = Some(args[i + 1].as_str()), - "--node-impl-version" => node_version = Some(args[i + 1].as_str()), - "--cache-path" => cache_path = Some(args[i + 1].as_str()), + "--worker-dir-path" => { + worker_dir_path = Some(args[i + 1].as_str()); + i += 1 + }, + "--node-impl-version" => { + node_version = Some(args[i + 1].as_str()); + i += 1 + }, + "--can-enable-landlock" => can_enable_landlock = true, + "--can-unshare-user-namespace-and-change-root" => + can_unshare_user_namespace_and_change_root = true, arg => panic!("Unexpected argument found: {}", arg), } + i += 1; } - let socket_path = socket_path.expect("the --socket-path argument is required"); - let cache_path = cache_path.expect("the --cache-path argument is required"); + let worker_dir_path = + worker_dir_path.expect("the --worker-dir-path argument is required"); - let cache_path = &std::path::Path::new(cache_path); + let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned(); + let security_status = $crate::SecurityStatus { + can_enable_landlock, + can_unshare_user_namespace_and_change_root + }; - $entrypoint(&socket_path, node_version, Some($worker_version), cache_path); + $entrypoint(worker_dir_path, node_version, Some($worker_version), security_status); } }; } @@ -106,27 +139,21 @@ macro_rules! decl_worker_main { /// child process. pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50); -/// Interprets the given bytes as a path. Returns `None` if the given bytes do not constitute a -/// a proper utf-8 string. -pub fn bytes_to_path(bytes: &[u8]) -> Option { - std::str::from_utf8(bytes).ok().map(PathBuf::from) -} - // The worker version must be passed in so that we accurately get the version of the worker, and not // the version that this crate was compiled with. pub fn worker_event_loop( debug_id: &'static str, - socket_path: &str, + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf, node_version: Option<&str>, worker_version: Option<&str>, - cache_path: &Path, + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] security_status: &SecurityStatus, mut event_loop: F, ) where - F: FnMut(UnixStream) -> Fut, + F: FnMut(UnixStream, PathBuf) -> Fut, Fut: futures::Future>, { let worker_pid = std::process::id(); - gum::debug!(target: LOG_TARGET, %worker_pid, "starting pvf worker ({})", debug_id); + gum::debug!(target: LOG_TARGET, %worker_pid, ?worker_dir_path, "starting pvf worker ({})", debug_id); // Check for a mismatch between the node and worker versions. if let (Some(node_version), Some(worker_version)) = (node_version, worker_version) { @@ -146,26 +173,35 @@ pub fn worker_event_loop( } } - // TODO: Call based on security_config, error out if should work but fails. - #[cfg(target_os = "linux")] + // Enable some security features. + // + // Landlock is enabled in the prepare- or execute-worker-specific code since we restrict the + // access rights based on whether we are preparing or executing. We also need to remove the + // socket before applying Landlock restrictions. { - if let Err(err_ctx) = security::change_root(cache_path) { - let err = io::Error::last_os_error(); - gum::error!( - target: LOG_TARGET, - %debug_id, - %worker_pid, - %err_ctx, - ?cache_path, - "Could not change root to be the cache path: {}", - err - ); - worker_shutdown_message(debug_id, worker_pid, err); - return + // Call based on whether we can change root. Error out if it should work but fails. + #[cfg(target_os = "linux")] + if security_status.can_unshare_user_namespace_and_change_root { + if let Err(err_ctx) = security::unshare_user_namespace_and_change_root(&worker_dir_path) + { + let err = io::Error::last_os_error(); + gum::error!( + target: LOG_TARGET, + %debug_id, + %worker_pid, + %err_ctx, + ?worker_dir_path, + "Could not change root to be the worker cache path: {}", + err + ); + worker_shutdown_message(debug_id, worker_pid, err); + return + } + worker_dir_path = std::path::Path::new("/").to_owned(); } - } - security::remove_env_vars(debug_id); + security::remove_env_vars(debug_id); + } gum::info!(target: LOG_TARGET, "5. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); @@ -173,10 +209,11 @@ pub fn worker_event_loop( let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); let err = rt .block_on(async move { - let stream = UnixStream::connect(socket_path).await?; - let _ = tokio::fs::remove_file(socket_path).await; + let socket_path = worker_dir::socket(&worker_dir_path); + let stream = UnixStream::connect(&socket_path).await?; + let _ = tokio::fs::remove_file(&socket_path).await; - let result = event_loop(stream).await; + let result = event_loop(stream, worker_dir_path).await; result }) diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index a693c94a4c26..d106114e94e4 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -24,7 +24,7 @@ use std::path::{Path, PathBuf}; /// Unshare the user namespace and change root to be the artifact directory. #[cfg(target_os = "linux")] -pub fn unshare_user_namespace_and_change_root(cache_path: &Path) -> Result<(), &'static str> { +pub fn unshare_user_namespace_and_change_root(worker_dir_path: &Path) -> Result<(), &'static str> { use rand::{distributions::Alphanumeric, Rng}; use std::{ffi::CString, os::unix::ffi::OsStrExt, ptr}; @@ -34,29 +34,27 @@ pub fn unshare_user_namespace_and_change_root(cache_path: &Path) -> Result<(), & let s = std::str::from_utf8(&buf) .expect("the string is collected from a valid utf-8 sequence; qed"); - let cache_path_str = match cache_path.to_str() { - Some(s) => s, - None => return Err("cache path is not valid UTF-8"), - }; - let cache_path_c = CString::new(cache_path.as_os_str().as_bytes()).unwrap(); + let worker_dir_path_str = + worker_dir_path.to_str().ok_or("worker dir path is not valid UTF-8")?; + let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()).unwrap(); let root_absolute_c = CString::new("/").unwrap(); // Append a random string to prevent races and to avoid dealing with the dir already existing. - let oldroot_relative_c = CString::new(format!("{}/oldroot-{}", cache_path_str, s)).unwrap(); + let oldroot_relative_c = + CString::new(format!("{}/oldroot-{}", worker_dir_path_str, s)).unwrap(); let oldroot_absolute_c = CString::new(format!("/oldroot-{}", s)).unwrap(); // SAFETY: TODO unsafe { - // 1. `unshare` the user and the mount namespaces. - if libc::unshare(libc::CLONE_NEWUSER) < 0 { - return Err("unshare user namespace") - } - if libc::unshare(libc::CLONE_NEWNS) < 0 { - return Err("unshare mount namespace") - } + // 1. `unshare` the user and the mount namespaces. + if libc::unshare(libc::CLONE_NEWUSER) < 0 { + return Err("unshare user namespace") + } + if libc::unshare(libc::CLONE_NEWNS) < 0 { + return Err("unshare mount namespace") + } // 2. `pivot_root` to the artifact directory. - gum::info!(target: LOG_TARGET, "1. {:?}", std::env::current_dir()); - gum::info!(target: LOG_TARGET, "1.5. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); + // // Ensure that 'new_root' and its parent mount don't have shared propagation. if libc::mount( ptr::null(), @@ -69,8 +67,8 @@ pub fn unshare_user_namespace_and_change_root(cache_path: &Path) -> Result<(), & return Err("mount MS_PRIVATE") } if libc::mount( - cache_path_c.as_ptr(), - cache_path_c.as_ptr(), + worker_dir_path_c.as_ptr(), + worker_dir_path_c.as_ptr(), ptr::null(), // ignored when MS_BIND is used libc::MS_BIND | libc::MS_REC | libc::MS_NOEXEC | libc::MS_NODEV | libc::MS_NOSUID, ptr::null(), // ignored when MS_BIND is used @@ -81,8 +79,11 @@ pub fn unshare_user_namespace_and_change_root(cache_path: &Path) -> Result<(), & if libc::mkdir(oldroot_relative_c.as_ptr(), 0755) < 0 { return Err("mkdir oldroot") } - if libc::syscall(libc::SYS_pivot_root, cache_path_c.as_ptr(), oldroot_relative_c.as_ptr()) < - 0 + if libc::syscall( + libc::SYS_pivot_root, + worker_dir_path_c.as_ptr(), + oldroot_relative_c.as_ptr(), + ) < 0 { return Err("pivot_root") } @@ -91,17 +92,12 @@ pub fn unshare_user_namespace_and_change_root(cache_path: &Path) -> Result<(), & if libc::chdir(root_absolute_c.as_ptr()) < 0 { return Err("chdir to new root") } - gum::info!(target: LOG_TARGET, "2. {:?}", std::env::current_dir()); - gum::info!(target: LOG_TARGET, "3. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); if libc::umount2(oldroot_absolute_c.as_ptr(), libc::MNT_DETACH) < 0 { return Err("umount2 the oldroot") } if libc::rmdir(oldroot_absolute_c.as_ptr()) < 0 { return Err("rmdir the oldroot") } - gum::info!(target: LOG_TARGET, "4. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); - - // TODO: do some assertions } Ok(()) diff --git a/polkadot/node/core/pvf/common/src/worker_dir.rs b/polkadot/node/core/pvf/common/src/worker_dir.rs new file mode 100644 index 000000000000..b1c36f0afc07 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker_dir.rs @@ -0,0 +1,35 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Functions for getting the worker cache files. + +use std::path::{Path, PathBuf}; + +const WORKER_EXECUTE_ARTIFACT_NAME: &str = "artifact"; +const WORKER_PREPARE_TMP_ARTIFACT_NAME: &str = "tmp-artifact"; +const WORKER_SOCKET_NAME: &str = "socket"; + +pub fn execute_artifact(worker_dir_path: &Path) -> PathBuf { + worker_dir_path.join(WORKER_EXECUTE_ARTIFACT_NAME) +} + +pub fn prepare_tmp_artifact(worker_dir_path: &Path) -> PathBuf { + worker_dir_path.join(WORKER_PREPARE_TMP_ARTIFACT_NAME) +} + +pub fn socket(worker_dir_path: &Path) -> PathBuf { + worker_dir_path.join(WORKER_SOCKET_NAME) +} diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index d00abac30cc9..2ec95efe1dcc 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -16,7 +16,7 @@ //! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary. -pub use polkadot_node_core_pvf_common::executor_intf::Executor; +pub use polkadot_node_core_pvf_common::{executor_intf::Executor, worker_dir, SecurityStatus}; // NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-execute-worker=trace`. @@ -30,7 +30,7 @@ use polkadot_node_core_pvf_common::{ executor_intf::NATIVE_STACK_MAX, framed_recv, framed_send, worker::{ - bytes_to_path, cpu_time_monitor_loop, + cpu_time_monitor_loop, security::LandlockStatus, stringify_panic_payload, thread::{self, WaitOutcome}, @@ -39,7 +39,7 @@ use polkadot_node_core_pvf_common::{ }; use polkadot_parachain::primitives::ValidationResult; use std::{ - path::{Path, PathBuf}, + path::PathBuf, sync::{mpsc::channel, Arc}, time::Duration, }; @@ -92,14 +92,7 @@ async fn recv_handshake(stream: &mut UnixStream) -> io::Result { Ok(handshake) } -async fn recv_request(stream: &mut UnixStream) -> io::Result<(PathBuf, Vec, Duration)> { - let artifact_path = framed_recv(stream).await?; - let artifact_path = bytes_to_path(&artifact_path).ok_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - "execute pvf recv_request: non utf-8 artifact path".to_string(), - ) - })?; +async fn recv_request(stream: &mut UnixStream) -> io::Result<(Vec, Duration)> { let params = framed_recv(stream).await?; let execution_timeout = framed_recv(stream).await?; let execution_timeout = Duration::decode(&mut &execution_timeout[..]).map_err(|_| { @@ -108,7 +101,7 @@ async fn recv_request(stream: &mut UnixStream) -> io::Result<(PathBuf, Vec, "execute pvf recv_request: failed to decode duration".to_string(), ) })?; - Ok((artifact_path, params, execution_timeout)) + Ok((params, execution_timeout)) } async fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> { @@ -130,22 +123,22 @@ async fn send_response(stream: &mut UnixStream, response: Response) -> io::Resul /// - `cache_path` contains the expected cache path for artifacts and is used to provide a sandbox /// exception for landlock. pub fn worker_entrypoint( - socket_path: &str, + worker_dir_path: PathBuf, node_version: Option<&str>, worker_version: Option<&str>, - cache_path: &Path, + security_status: SecurityStatus, ) { worker_event_loop( "execute", - socket_path, + worker_dir_path, node_version, worker_version, - cache_path, - |mut stream| async move { + &security_status, + |mut stream, worker_dir_path| async move { let worker_pid = std::process::id(); + let artifact_path = worker_dir::execute_artifact(&worker_dir_path); - let Handshake { executor_params, security_status } = - recv_handshake(&mut stream).await?; + let Handshake { executor_params } = recv_handshake(&mut stream).await?; let executor = Executor::new(executor_params).map_err(|e| { io::Error::new(io::ErrorKind::Other, format!("cannot create executor: {}", e)) })?; @@ -155,18 +148,13 @@ pub fn worker_entrypoint( #[cfg(target_os = "linux")] let landlock_status = { use polkadot_node_core_pvf_common::worker::security::landlock::{ - path_beneath_rules, try_restrict, Access, AccessFs, LANDLOCK_ABI, + path_beneath_rules, try_restrict, AccessFs, }; - // Allow an exception for reading from the artifact cache, but disallow listing - // the directory contents. Since we prepend artifact names with a random hash, - // this means attackers can't discover artifacts apart from the current job. - try_restrict(path_beneath_rules( - &[cache_path], - AccessFs::from_read(LANDLOCK_ABI) ^ AccessFs::ReadDir, - )) - .map(LandlockStatus::from_ruleset_status) - .map_err(|e| e.to_string()) + // Allow an exception for reading from the known artifact path. + try_restrict(path_beneath_rules(&[&artifact_path], AccessFs::ReadFile)) + .map(LandlockStatus::from_ruleset_status) + .map_err(|e| e.to_string()) }; #[cfg(not(target_os = "linux"))] let landlock_status: Result = Ok(LandlockStatus::NotEnforced); @@ -190,7 +178,7 @@ pub fn worker_entrypoint( } loop { - let (artifact_path, params, execution_timeout) = recv_request(&mut stream).await?; + let (params, execution_timeout) = recv_request(&mut stream).await?; gum::debug!( target: LOG_TARGET, %worker_pid, @@ -198,12 +186,8 @@ pub fn worker_entrypoint( artifact_path.display(), ); - if !artifact_path.starts_with(cache_path) { - return Err(io::Error::new(io::ErrorKind::Other, format!("received an artifact path {artifact_path:?} that does not belong to expected artifact dir {cache_path:?}"))) - } - // Get the artifact bytes. - let compiled_artifact_blob = match std::fs::read(artifact_path) { + let compiled_artifact_blob = match std::fs::read(&artifact_path) { Ok(bytes) => bytes, Err(err) => { let response = Response::InternalError( diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index c6cba418d314..4e427bd49425 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -34,20 +34,20 @@ use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, executor_intf::Executor, framed_recv, framed_send, - prepare::{Handshake, MemoryStats, PrepareJobKind, PrepareStats}, + prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ - bytes_to_path, cpu_time_monitor_loop, + cpu_time_monitor_loop, security::LandlockStatus, stringify_panic_payload, thread::{self, WaitOutcome}, worker_event_loop, }, - ProcessTime, + worker_dir, ProcessTime, SecurityStatus, }; use polkadot_primitives::ExecutorParams; use std::{ - path::{Path, PathBuf}, + path::PathBuf, sync::{mpsc::channel, Arc}, time::Duration, }; @@ -69,18 +69,7 @@ impl AsRef<[u8]> for CompiledArtifact { } } -async fn recv_handshake(stream: &mut UnixStream) -> io::Result { - let handshake_enc = framed_recv(stream).await?; - let handshake = Handshake::decode(&mut &handshake_enc[..]).map_err(|_| { - io::Error::new( - io::ErrorKind::Other, - "prepare pvf recv_handshake: failed to decode Handshake".to_owned(), - ) - })?; - Ok(handshake) -} - -async fn recv_request(stream: &mut UnixStream) -> io::Result<(PvfPrepData, PathBuf)> { +async fn recv_request(stream: &mut UnixStream) -> io::Result { let pvf = framed_recv(stream).await?; let pvf = PvfPrepData::decode(&mut &pvf[..]).map_err(|e| { io::Error::new( @@ -88,14 +77,7 @@ async fn recv_request(stream: &mut UnixStream) -> io::Result<(PvfPrepData, PathB format!("prepare pvf recv_request: failed to decode PvfPrepData: {}", e), ) })?; - let tmp_file = framed_recv(stream).await?; - let tmp_file = bytes_to_path(&tmp_file).ok_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - "prepare pvf recv_request: non utf-8 artifact path".to_string(), - ) - })?; - Ok((pvf, tmp_file)) + Ok(pvf) } async fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Result<()> { @@ -136,79 +118,63 @@ async fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Re /// 7. Send the result of preparation back to the host. If any error occurred in the above steps, we /// send that in the `PrepareResult`. pub fn worker_entrypoint( - socket_path: &str, + worker_dir_path: PathBuf, node_version: Option<&str>, worker_version: Option<&str>, - cache_path: &Path, + security_status: SecurityStatus, ) { worker_event_loop( "prepare", - socket_path, + worker_dir_path, node_version, worker_version, - cache_path, - |mut stream| async move { + &security_status, + |mut stream, worker_dir_path| async move { let worker_pid = std::process::id(); - - gum::info!(target: LOG_TARGET, "10. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); - - let Handshake { security_status } = recv_handshake(&mut stream).await?; - - gum::info!(target: LOG_TARGET, "11. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); + let temp_artifact_dest = worker_dir::prepare_tmp_artifact(&worker_dir_path); // Try to enable landlock. - // { - // #[cfg(target_os = "linux")] - // let landlock_status = { - // use polkadot_node_core_pvf_common::worker::security::landlock::{ - // path_beneath_rules, try_restrict, Access, AccessFs, LANDLOCK_ABI, - // }; - - // // Allow an exception for writing to the artifact cache, with no allowance for - // // listing the directory contents. Since we prepend artifact names with a random - // // hash, this means attackers can't discover artifacts apart from the current - // // job. - // try_restrict(path_beneath_rules( - // &[cache_path], - // AccessFs::from_write(LANDLOCK_ABI), - // )) - // .map(LandlockStatus::from_ruleset_status) - // .map_err(|e| e.to_string()) - // }; - // #[cfg(not(target_os = "linux"))] - // let landlock_status: Result = - // Ok(LandlockStatus::NotEnforced); - - // // Error if the host determined that landlock is fully enabled and we couldn't fully - // // enforce it here. - // if landlock_enabled && !matches!(landlock_status, Ok(LandlockStatus::FullyEnforced)) - // { - // gum::warn!( - // target: LOG_TARGET, - // %worker_pid, - // "could not fully enable landlock: {:?}", - // landlock_status - // ); - // return Err(io::Error::new( - // io::ErrorKind::Other, - // format!("could not fully enable landlock: {:?}", landlock_status), - // )) - // } - // } + { + #[cfg(target_os = "linux")] + let landlock_status = { + use polkadot_node_core_pvf_common::worker::security::landlock::{ + path_beneath_rules, try_restrict, AccessFs, + }; + + // Allow an exception for writing to the known file in the worker cache. + try_restrict(path_beneath_rules(&[temp_artifact_dest], AccessFs::WriteFile)) + .map(LandlockStatus::from_ruleset_status) + .map_err(|e| e.to_string()) + }; + #[cfg(not(target_os = "linux"))] + let landlock_status: Result = Ok(LandlockStatus::NotEnforced); + + // Error if the host determined that landlock is fully enabled and we couldn't fully + // enforce it here. + if security_status.can_enable_landlock && + !matches!(landlock_status, Ok(LandlockStatus::FullyEnforced)) + { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "could not fully enable landlock: {:?}", + landlock_status + ); + return Err(io::Error::new( + io::ErrorKind::Other, + format!("could not fully enable landlock: {:?}", landlock_status), + )) + } + } loop { - let (pvf, temp_artifact_dest) = recv_request(&mut stream).await?; + let pvf = recv_request(&mut stream).await?; gum::debug!( target: LOG_TARGET, %worker_pid, "worker: preparing artifact", ); - // if !temp_artifact_dest.starts_with(cache_path) { - // return Err(io::Error::new(io::ErrorKind::Other, format!("received an artifact - // path {temp_artifact_dest:?} that does not belong to expected cache path - // {cache_path:?}"))) } - let preparation_timeout = pvf.prep_timeout(); let prepare_job_kind = pvf.prep_kind(); let executor_params = (*pvf.executor_params()).clone(); diff --git a/polkadot/node/core/pvf/src/artifacts.rs b/polkadot/node/core/pvf/src/artifacts.rs index a180af15db27..fc5a53fd0435 100644 --- a/polkadot/node/core/pvf/src/artifacts.rs +++ b/polkadot/node/core/pvf/src/artifacts.rs @@ -295,7 +295,7 @@ mod tests { #[tokio::test] async fn artifacts_removes_cache_on_startup() { - let fake_cache_path = crate::worker_intf::tmpfile("test-cache").await.unwrap(); + let fake_cache_path = crate::worker_intf::tmppath("test-cache").await.unwrap(); let fake_artifact_path = { let mut p = fake_cache_path.clone(); p.push("wasmtime_0x1234567890123456789012345678901234567890123456789012345678901234"); diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index b9394a18e1df..3729700caf00 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -24,13 +24,13 @@ use crate::{ worker_intf::{IdleWorker, WorkerHandle}, InvalidCandidate, ValidationError, LOG_TARGET, }; -use polkadot_node_core_pvf_common::SecurityStatus; use futures::{ channel::mpsc, future::BoxFuture, stream::{FuturesUnordered, StreamExt as _}, Future, FutureExt, }; +use polkadot_node_core_pvf_common::SecurityStatus; use polkadot_primitives::{ExecutorParams, ExecutorParamsHash}; use slotmap::HopSlotMap; use std::{ @@ -142,7 +142,6 @@ struct Queue { program_path: PathBuf, spawn_timeout: Duration, node_version: Option, - cache_path: PathBuf, security_status: SecurityStatus, /// The queue of jobs that are waiting for a worker to pick up. @@ -158,7 +157,6 @@ impl Queue { worker_capacity: usize, spawn_timeout: Duration, node_version: Option, - cache_path: PathBuf, security_status: SecurityStatus, to_queue_rx: mpsc::Receiver, ) -> Self { @@ -167,7 +165,6 @@ impl Queue { program_path, spawn_timeout, node_version, - cache_path, security_status, to_queue_rx, queue: VecDeque::new(), @@ -415,7 +412,6 @@ fn spawn_extra_worker(queue: &mut Queue, job: ExecuteJob) { job, queue.spawn_timeout, queue.node_version.clone(), - queue.cache_path.clone(), queue.security_status.clone(), ) .boxed(), @@ -435,7 +431,6 @@ async fn spawn_worker_task( job: ExecuteJob, spawn_timeout: Duration, node_version: Option, - cache_path: PathBuf, security_status: SecurityStatus, ) -> QueueEvent { use futures_timer::Delay; @@ -446,7 +441,6 @@ async fn spawn_worker_task( job.executor_params.clone(), spawn_timeout, node_version.as_deref(), - &cache_path, security_status.clone(), ) .await @@ -512,7 +506,6 @@ pub fn start( worker_capacity: usize, spawn_timeout: Duration, node_version: Option, - cache_path: PathBuf, security_status: SecurityStatus, ) -> (mpsc::Sender, impl Future) { let (to_queue_tx, to_queue_rx) = mpsc::channel(20); @@ -522,7 +515,6 @@ pub fn start( worker_capacity, spawn_timeout, node_version, - cache_path, security_status, to_queue_rx, ) diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 91a9c219b34c..01e0f9185ef7 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -19,8 +19,8 @@ use crate::{ artifacts::ArtifactPathId, worker_intf::{ - path_to_bytes, spawn_with_program_path, IdleWorker, SpawnErr, WorkerHandle, - JOB_TIMEOUT_WALL_CLOCK_FACTOR, + clear_worker_dir_path, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, + WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, }, LOG_TARGET, }; @@ -30,8 +30,7 @@ use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, execute::{Handshake, Response}, - framed_recv, framed_send, - SecurityStatus + framed_recv, framed_send, worker_dir, SecurityStatus, }; use polkadot_parachain::primitives::ValidationResult; use polkadot_primitives::ExecutorParams; @@ -39,22 +38,16 @@ use std::{path::Path, time::Duration}; use tokio::{io, net::UnixStream}; /// Spawns a new worker with the given program path that acts as the worker and the spawn timeout. -/// Sends a handshake message to the worker as soon as it is spawned. /// -/// The program should be able to handle ` execute-worker ` invocation. +/// Sends a handshake message to the worker as soon as it is spawned. pub async fn spawn( program_path: &Path, executor_params: ExecutorParams, spawn_timeout: Duration, node_version: Option<&str>, - cache_path: &Path, security_status: SecurityStatus, ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { - let cache_path_str = match cache_path.to_str() { - Some(a) => a, - None => return Err(SpawnErr::InvalidCachePath(cache_path.to_owned())), - }; - let mut extra_args = vec!["execute-worker", "--cache-path", cache_path_str]; + let mut extra_args = vec!["execute-worker"]; if let Some(node_version) = node_version { extra_args.extend_from_slice(&["--node-impl-version", node_version]); } @@ -62,12 +55,12 @@ pub async fn spawn( let (mut idle_worker, worker_handle) = spawn_with_program_path( "execute", program_path, - Some(cache_path), &extra_args, spawn_timeout, + security_status, ) .await?; - send_handshake(&mut idle_worker.stream, Handshake { executor_params, security_status }) + send_handshake(&mut idle_worker.stream, Handshake { executor_params }) .await .map_err(|error| { gum::warn!( @@ -118,89 +111,156 @@ pub async fn start_work( execution_timeout: Duration, validation_params: Vec, ) -> Outcome { - let IdleWorker { mut stream, pid } = worker; + let IdleWorker { mut stream, pid, worker_dir } = worker; gum::debug!( target: LOG_TARGET, worker_pid = %pid, + ?worker_dir, validation_code_hash = ?artifact.id.code_hash, "starting execute for {}", artifact.path.display(), ); - if let Err(error) = - send_request(&mut stream, &artifact.path, &validation_params, execution_timeout).await - { + with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move { + if let Err(error) = send_request(&mut stream, &validation_params, execution_timeout).await { + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + validation_code_hash = ?artifact.id.code_hash, + ?error, + "failed to send an execute request", + ); + return Outcome::IoErr + } + + // We use a generous timeout here. This is in addition to the one in the child process, in + // case the child stalls. We have a wall clock timeout here in the host, but a CPU timeout + // in the child. We want to use CPU time because it varies less than wall clock time under + // load, but the CPU resources of the child can only be measured from the parent after the + // child process terminates. + let timeout = execution_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR; + let response = futures::select! { + response = recv_response(&mut stream).fuse() => { + match response { + Err(error) => { + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + validation_code_hash = ?artifact.id.code_hash, + ?error, + "failed to recv an execute response", + ); + return Outcome::IoErr + }, + Ok(response) => { + if let Response::Ok{duration, ..} = response { + if duration > execution_timeout { + // The job didn't complete within the timeout. + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + "execute job took {}ms cpu time, exceeded execution timeout {}ms.", + duration.as_millis(), + execution_timeout.as_millis(), + ); + + // Return a timeout error. + return Outcome::HardTimeout; + } + } + + response + }, + } + }, + _ = Delay::new(timeout).fuse() => { + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + validation_code_hash = ?artifact.id.code_hash, + "execution worker exceeded lenient timeout for execution, child worker likely stalled", + ); + Response::TimedOut + }, + }; + + match response { + Response::Ok { result_descriptor, duration } => Outcome::Ok { + result_descriptor, + duration, + idle_worker: IdleWorker { stream, pid, worker_dir }, + }, + Response::InvalidCandidate(err) => Outcome::InvalidCandidate { + err, + idle_worker: IdleWorker { stream, pid, worker_dir }, + }, + Response::TimedOut => Outcome::HardTimeout, + Response::Panic(err) => Outcome::Panic { err }, + Response::InternalError(err) => Outcome::InternalError { err }, + } + }) + .await +} + +/// Create a temporary file for an artifact in the worker cache, execute the given future/closure +/// passing the file path in, and clean up the worker cache. +/// +/// Failure to clean up the worker cache results in an error - leaving any files here could be a +/// security issue, and we should shut down the worker. This should be very rare. +async fn with_worker_dir_setup( + worker_dir: WorkerDir, + pid: u32, + artifact_path: &Path, + f: F, +) -> Outcome +where + Fut: futures::Future, + F: FnOnce(WorkerDir) -> Fut, +{ + let worker_dir_path = worker_dir.path.clone(); + + // Cheaply create a hard link to the artifact. The artifact is always at a known location in the + // worker cache, and the child can't access any other artifacts or gain any information from the + // original filename. + let link_path = worker_dir::execute_artifact(&worker_dir_path); + if let Err(err) = tokio::fs::hard_link(artifact_path, link_path).await { gum::warn!( target: LOG_TARGET, worker_pid = %pid, - validation_code_hash = ?artifact.id.code_hash, - ?error, - "failed to send an execute request", + ?worker_dir, + "failed to clear worker cache after the job: {:?}", + err, ); - return Outcome::IoErr + return Outcome::InternalError { + err: InternalValidationError::CouldNotCreateLink(format!("{:?}", err)), + } } - // We use a generous timeout here. This is in addition to the one in the child process, in - // case the child stalls. We have a wall clock timeout here in the host, but a CPU timeout - // in the child. We want to use CPU time because it varies less than wall clock time under - // load, but the CPU resources of the child can only be measured from the parent after the - // child process terminates. - let timeout = execution_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR; - let response = futures::select! { - response = recv_response(&mut stream).fuse() => { - match response { - Err(error) => { - gum::warn!( - target: LOG_TARGET, - worker_pid = %pid, - validation_code_hash = ?artifact.id.code_hash, - ?error, - "failed to recv an execute response", - ); - return Outcome::IoErr - }, - Ok(response) => { - if let Response::Ok{duration, ..} = response { - if duration > execution_timeout { - // The job didn't complete within the timeout. - gum::warn!( - target: LOG_TARGET, - worker_pid = %pid, - "execute job took {}ms cpu time, exceeded execution timeout {}ms.", - duration.as_millis(), - execution_timeout.as_millis(), - ); - - // Return a timeout error. - return Outcome::HardTimeout; - } - } + let outcome = f(worker_dir).await; - response - }, - } - }, - _ = Delay::new(timeout).fuse() => { + // Try to clear the worker dir. + // + // Note that it may not exist anymore because of the worker dying and being cleaned up. + if let Err(err) = clear_worker_dir_path(&worker_dir_path) { + if !matches!(err.kind(), io::ErrorKind::NotFound) { gum::warn!( target: LOG_TARGET, worker_pid = %pid, - validation_code_hash = ?artifact.id.code_hash, - "execution worker exceeded lenient timeout for execution, child worker likely stalled", + ?worker_dir_path, + "failed to clear worker cache after the job: {:?}", + err, ); - Response::TimedOut - }, - }; - - match response { - Response::Ok { result_descriptor, duration } => - Outcome::Ok { result_descriptor, duration, idle_worker: IdleWorker { stream, pid } }, - Response::InvalidCandidate(err) => - Outcome::InvalidCandidate { err, idle_worker: IdleWorker { stream, pid } }, - Response::TimedOut => Outcome::HardTimeout, - Response::Panic(err) => Outcome::Panic { err }, - Response::InternalError(err) => Outcome::InternalError { err }, + return Outcome::InternalError { + err: InternalValidationError::CouldNotClearWorkerDir { + err: format!("{:?}", err), + path: worker_dir_path.to_str().map(String::from), + }, + } + } } + + outcome } async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Result<()> { @@ -209,11 +269,9 @@ async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Re async fn send_request( stream: &mut UnixStream, - artifact_path: &Path, validation_params: &[u8], execution_timeout: Duration, ) -> io::Result<()> { - framed_send(stream, path_to_bytes(artifact_path)).await?; framed_send(stream, validation_params).await?; framed_send(stream, &execution_timeout.encode()).await } diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 8de7f3f6a373..b1c25ad12394 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -205,8 +205,8 @@ pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future (ValidationHost, impl Future (ValidationHost, impl Future impl futures::Stream /// single-threaded, so we can't just fork here. To get as close as possible to running unshare and /// pivot_root in a worker, we try them... in a worker. The expected return status is 0 on success /// and -1 on failure. -fn check_can_unshare_user_namespace_and_change_root(prepare_worker_program_path: &Path) -> bool { +fn check_can_unshare_user_namespace_and_change_root( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { #[cfg(target_os = "linux")] { - match Command::new(prepare_worker_program_path) + match std::process::Command::new(prepare_worker_program_path) .arg("--check-can-unshare-user-namespace-and-change-root") .status() { - Ok(0) => true, + Ok(status) if status.success() => true, Ok(status) => { gum::warn!( - target:LOG_TARGET, - %prepare_worker_program_path, + target: LOG_TARGET, + ?prepare_worker_program_path, ?status, "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." ); @@ -907,8 +908,8 @@ fn check_can_unshare_user_namespace_and_change_root(prepare_worker_program_path: }, Err(err) => { gum::warn!( - target:LOG_TARGET, - %prepare_worker_program_path, + target: LOG_TARGET, + ?prepare_worker_program_path, "Could not start child process: {}", err ); @@ -930,23 +931,38 @@ fn check_can_unshare_user_namespace_and_change_root(prepare_worker_program_path: /// Check if landlock is supported and emit a warning if not. /// /// TODO: Run in child process. -fn check_landlock() -> bool { +fn check_landlock( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { #[cfg(target_os = "linux")] { - use polkadot_node_core_pvf_common::worker::security::landlock; - - let status = landlock::get_status(); - if !landlock::status_is_fully_enabled(&status) { - let abi = landlock::LANDLOCK_ABI as u8; - gum::warn!( - target: LOG_TARGET, - ?status, - %abi, - "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." - ); - false - } else { - true + match std::process::Command::new(prepare_worker_program_path) + .arg("--check-can-enable-landlock") + .status() + { + Ok(status) if status.success() => true, + Ok(status) => { + let abi = + polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8; + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + ?status, + %abi, + "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, } } diff --git a/polkadot/node/core/pvf/src/lib.rs b/polkadot/node/core/pvf/src/lib.rs index 1da0593835fb..737ee42ac77a 100644 --- a/polkadot/node/core/pvf/src/lib.rs +++ b/polkadot/node/core/pvf/src/lib.rs @@ -102,6 +102,8 @@ pub mod testing; // Used by `decl_puppet_worker_main!`. #[cfg(feature = "test-utils")] +pub use polkadot_node_core_pvf_common::worker; +#[cfg(feature = "test-utils")] pub use sp_tracing; pub use error::{InvalidCandidate, ValidationError}; @@ -115,6 +117,7 @@ pub use polkadot_node_core_pvf_common::{ error::{InternalValidationError, PrepareError}, prepare::{PrepareJobKind, PrepareStats}, pvf::PvfPrepData, + SecurityStatus, }; // Re-export worker entrypoints. diff --git a/polkadot/node/core/pvf/src/prepare/pool.rs b/polkadot/node/core/pvf/src/prepare/pool.rs index 95a66a1d5f5a..5e6d00ae1f04 100644 --- a/polkadot/node/core/pvf/src/prepare/pool.rs +++ b/polkadot/node/core/pvf/src/prepare/pool.rs @@ -113,7 +113,6 @@ type Mux = FuturesUnordered>; struct Pool { // Some variables related to the current session. program_path: PathBuf, - cache_path: PathBuf, spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, @@ -132,7 +131,6 @@ struct Fatal; async fn run( Pool { program_path, - cache_path, spawn_timeout, node_version, security_status, @@ -161,7 +159,6 @@ async fn run( handle_to_pool( &metrics, &program_path, - &cache_path, spawn_timeout, node_version.clone(), security_status.clone(), @@ -209,7 +206,6 @@ async fn purge_dead( fn handle_to_pool( metrics: &Metrics, program_path: &Path, - cache_path: &Path, spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, @@ -226,7 +222,6 @@ fn handle_to_pool( program_path.to_owned(), spawn_timeout, node_version, - cache_path.to_owned(), security_status, ) .boxed(), @@ -242,9 +237,7 @@ fn handle_to_pool( worker, idle, pvf, - cache_path.to_owned(), artifact_path, - security_status, preparation_timer, ) .boxed(), @@ -274,7 +267,6 @@ async fn spawn_worker_task( program_path: PathBuf, spawn_timeout: Duration, node_version: Option, - cache_path: PathBuf, security_status: SecurityStatus, ) -> PoolEvent { use futures_timer::Delay; @@ -284,7 +276,6 @@ async fn spawn_worker_task( &program_path, spawn_timeout, node_version.as_deref(), - &cache_path, security_status.clone(), ) .await @@ -305,12 +296,10 @@ async fn start_work_task( worker: Worker, idle: IdleWorker, pvf: PvfPrepData, - cache_path: PathBuf, artifact_path: PathBuf, - security_status: SecurityStatus, _preparation_timer: Option, ) -> PoolEvent { - let outcome = worker_intf::start_work(&metrics, idle, pvf, &cache_path, artifact_path, security_status).await; + let outcome = worker_intf::start_work(&metrics, idle, pvf, artifact_path).await; PoolEvent::StartWork(worker, outcome) } @@ -347,14 +336,29 @@ fn handle_mux( ), // Return `Concluded`, but do not kill the worker since the error was on the host // side. - Outcome::RenameTmpFileErr { worker: idle, result: _, err } => + Outcome::RenameTmpFileErr { worker: idle, result: _, err, src, dest } => handle_concluded_no_rip( from_pool, spawned, worker, idle, - Err(PrepareError::RenameTmpFileErr(err)), + Err(PrepareError::RenameTmpFileErr { err, src, dest }), ), + // Could not clear worker cache. Kill the worker so other jobs can't see the data. + Outcome::ClearWorkerDir { err } => { + if attempt_retire(metrics, spawned, worker) { + reply( + from_pool, + FromPool::Concluded { + worker, + rip: true, + result: Err(PrepareError::ClearWorkerDir(err)), + }, + )?; + } + + Ok(()) + }, Outcome::Unreachable => { if attempt_retire(metrics, spawned, worker) { reply(from_pool, FromPool::Rip(worker))?; @@ -456,7 +460,6 @@ fn handle_concluded_no_rip( pub fn start( metrics: Metrics, program_path: PathBuf, - cache_path: PathBuf, spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, @@ -467,7 +470,6 @@ pub fn start( let run = run(Pool { metrics, program_path, - cache_path, spawn_timeout, node_version, security_status, diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 9329c3df7dbd..55f40306922f 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -19,8 +19,8 @@ use crate::{ metrics::Metrics, worker_intf::{ - path_to_bytes, spawn_with_program_path, tmpfile_in, IdleWorker, SpawnErr, WorkerHandle, - JOB_TIMEOUT_WALL_CLOCK_FACTOR, + clear_worker_dir_path, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, + WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, }, LOG_TARGET, }; @@ -28,9 +28,9 @@ use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, framed_recv, framed_send, - prepare::{Handshake, PrepareStats}, + prepare::PrepareStats, pvf::PvfPrepData, - SecurityStatus, + worker_dir, SecurityStatus, }; use sp_core::hexdisplay::HexDisplay; @@ -42,45 +42,31 @@ use tokio::{io, net::UnixStream}; /// Spawns a new worker with the given program path that acts as the worker and the spawn timeout. /// -/// The program should be able to handle ` prepare-worker ` invocation. +/// Sends a handshake message to the worker as soon as it is spawned. pub async fn spawn( program_path: &Path, spawn_timeout: Duration, node_version: Option<&str>, - cache_path: &Path, security_status: SecurityStatus, ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { - let cache_path_str = match cache_path.to_str() { - Some(a) => a, - None => return Err(SpawnErr::InvalidCachePath(cache_path.to_owned())), - }; - let mut extra_args = vec!["prepare-worker", "--cache-path", cache_path_str]; + let mut extra_args = vec!["prepare-worker"]; if let Some(node_version) = node_version { extra_args.extend_from_slice(&["--node-impl-version", node_version]); } - let (mut idle_worker, worker_handle) = spawn_with_program_path( + Ok(spawn_with_program_path( "prepare", program_path, - Some(cache_path), &extra_args, spawn_timeout, + security_status, ) - .await?; - send_handshake(&mut idle_worker.stream, Handshake { security_status }) - .await - .map_err(|error| { - gum::warn!( - target: LOG_TARGET, - worker_pid = %idle_worker.pid, - ?error, - "failed to send a handshake to the spawned worker", - ); - SpawnErr::Handshake - })?; - Ok((idle_worker, worker_handle)) + .await?) } +/// Outcome of PVF preparation. +/// +/// If the idle worker token is not returned, it means the worker must be terminated. pub enum Outcome { /// The worker has finished the work assigned to it. Concluded { worker: IdleWorker, result: PrepareResult }, @@ -89,9 +75,19 @@ pub enum Outcome { Unreachable, /// The temporary file for the artifact could not be created at the given cache path. CreateTmpFileErr { worker: IdleWorker, err: String }, - /// The response from the worker is received, but the file cannot be renamed (moved) to the + /// The response from the worker is received, but the tmp file cannot be renamed (moved) to the /// final destination location. - RenameTmpFileErr { worker: IdleWorker, result: PrepareResult, err: String }, + RenameTmpFileErr { + worker: IdleWorker, + result: PrepareResult, + err: String, + // Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible + // conversion to `Option`. + src: Option, + dest: Option, + }, + /// The worker cache could not be cleared for the given reason. + ClearWorkerDir { err: String }, /// The worker failed to finish the job until the given deadline. /// /// The worker is no longer usable and should be killed. @@ -111,94 +107,88 @@ pub async fn start_work( metrics: &Metrics, worker: IdleWorker, pvf: PvfPrepData, - cache_path: &Path, artifact_path: PathBuf, - security_status: SecurityStatus, ) -> Outcome { - let IdleWorker { stream, pid } = worker; + let IdleWorker { stream, pid, worker_dir } = worker; gum::debug!( target: LOG_TARGET, worker_pid = %pid, - ?security_status, + ?worker_dir, "starting prepare for {}", artifact_path.display(), ); - with_tmp_file(stream, pid, cache_path, |tmp_file, mut stream| async move { - // Pass the socket path relative to the cache_path (what the child thinks is root). - let tmp_file_worker_view = if security_status.can_unshare_user_namespace_and_change_root { - Path::new(".").with_file_name( - tmp_file.file_name().expect("tmp files are created with a filename; qed"), - ) - } else { - tmp_file.clone() - }; - - let preparation_timeout = pvf.prep_timeout(); - if let Err(err) = send_request(&mut stream, pvf, &tmp_file_worker_view).await { - gum::warn!( - target: LOG_TARGET, - worker_pid = %pid, - "failed to send a prepare request: {:?}", - err, - ); - return Outcome::Unreachable - } - - // Wait for the result from the worker, keeping in mind that there may be a timeout, the - // worker may get killed, or something along these lines. In that case we should propagate - // the error to the pool. - // - // We use a generous timeout here. This is in addition to the one in the child process, in - // case the child stalls. We have a wall clock timeout here in the host, but a CPU timeout - // in the child. We want to use CPU time because it varies less than wall clock time under - // load, but the CPU resources of the child can only be measured from the parent after the - // child process terminates. - let timeout = preparation_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR; - let result = tokio::time::timeout(timeout, recv_response(&mut stream, pid)).await; - - match result { - // Received bytes from worker within the time limit. - Ok(Ok(prepare_result)) => - handle_response( - metrics, - IdleWorker { stream, pid }, - prepare_result, - pid, - tmp_file, - artifact_path, - preparation_timeout, - ) - .await, - Ok(Err(err)) => { - // Communication error within the time limit. + with_worker_dir_setup( + worker_dir, + stream, + pid, + |tmp_artifact_file, mut stream, worker_dir| async move { + let preparation_timeout = pvf.prep_timeout(); + if let Err(err) = send_request(&mut stream, pvf).await { gum::warn!( target: LOG_TARGET, worker_pid = %pid, - "failed to recv a prepare response: {:?}", + "failed to send a prepare request: {:?}", err, ); - Outcome::IoErr(err.to_string()) - }, - Err(_) => { - // Timed out here on the host. - gum::warn!( - target: LOG_TARGET, - worker_pid = %pid, - "did not recv a prepare response within the time limit", - ); - Outcome::TimedOut - }, - } - }) + return Outcome::Unreachable + } + + // Wait for the result from the worker, keeping in mind that there may be a timeout, the + // worker may get killed, or something along these lines. In that case we should + // propagate the error to the pool. + // + // We use a generous timeout here. This is in addition to the one in the child process, + // in case the child stalls. We have a wall clock timeout here in the host, but a CPU + // timeout in the child. We want to use CPU time because it varies less than wall clock + // time under load, but the CPU resources of the child can only be measured from the + // parent after the child process terminates. + let timeout = preparation_timeout * JOB_TIMEOUT_WALL_CLOCK_FACTOR; + let result = tokio::time::timeout(timeout, recv_response(&mut stream, pid)).await; + + match result { + // Received bytes from worker within the time limit. + Ok(Ok(prepare_result)) => + handle_response( + metrics, + IdleWorker { stream, pid, worker_dir }, + prepare_result, + pid, + tmp_artifact_file, + artifact_path, + preparation_timeout, + ) + .await, + Ok(Err(err)) => { + // Communication error within the time limit. + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + "failed to recv a prepare response: {:?}", + err, + ); + Outcome::IoErr(err.to_string()) + }, + Err(_) => { + // Timed out here on the host. + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + "did not recv a prepare response within the time limit", + ); + Outcome::TimedOut + }, + } + }, + ) .await } /// Handles the case where we successfully received response bytes on the host from the child. /// -/// NOTE: Here we know the artifact exists, but is still located in a temporary file which will be -/// cleared by `with_tmp_file`. +/// Here we know the artifact exists, but is still located in a temporary file which will be cleared +/// by [`with_worker_dir_setup`]. async fn handle_response( metrics: &Metrics, worker: IdleWorker, @@ -247,7 +237,13 @@ async fn handle_response( artifact_path.display(), err, ); - Outcome::RenameTmpFileErr { worker, result, err: format!("{:?}", err) } + Outcome::RenameTmpFileErr { + worker, + result, + err: format!("{:?}", err), + src: tmp_file.to_str().map(String::from), + dest: artifact_path.to_str().map(String::from), + } }, }; @@ -258,68 +254,69 @@ async fn handle_response( outcome } -/// Create a temporary file for an artifact at the given cache path and execute the given -/// future/closure passing the file path in. +/// Create a temporary file for an artifact in the worker cache, execute the given future/closure +/// passing the file path in, and clean up the worker cache. /// -/// The function will try best effort to not leave behind the temporary file. -async fn with_tmp_file(stream: UnixStream, pid: u32, cache_path: &Path, f: F) -> Outcome +/// Failure to clean up the worker cache results in an error - leaving any files here could be a +/// security issue, and we should shut down the worker. This should be very rare. +async fn with_worker_dir_setup( + worker_dir: WorkerDir, + stream: UnixStream, + pid: u32, + f: F, +) -> Outcome where Fut: futures::Future, - F: FnOnce(PathBuf, UnixStream) -> Fut, + F: FnOnce(PathBuf, UnixStream, WorkerDir) -> Fut, { - let tmp_file = match tmpfile_in("prepare-artifact-", cache_path).await { - Ok(f) => f, + let worker_dir_path = worker_dir.path.clone(); + + // Create the tmp file here so that the child doesn't need any file creation rights. This will + // be cleared at the end of this function. + let tmp_file = worker_dir::prepare_tmp_artifact(&worker_dir_path); + match tokio::fs::write(&tmp_file, &[]).await { + Ok(()) => (), Err(err) => { gum::warn!( target: LOG_TARGET, worker_pid = %pid, + ?worker_dir, "failed to create a temp file for the artifact: {:?}", err, ); return Outcome::CreateTmpFileErr { - worker: IdleWorker { stream, pid }, + worker: IdleWorker { stream, pid, worker_dir }, err: format!("{:?}", err), } }, }; - let outcome = f(tmp_file.clone(), stream).await; + let outcome = f(tmp_file, stream, worker_dir).await; - // The function called above is expected to move `tmp_file` to a new location upon success. - // However, the function may as well fail and in that case we should remove the tmp file here. + // Try to clear the worker dir. // - // In any case, we try to remove the file here so that there are no leftovers. We only report - // errors that are different from the `NotFound`. - match tokio::fs::remove_file(tmp_file).await { - Ok(()) => (), - Err(err) if err.kind() == std::io::ErrorKind::NotFound => (), - Err(err) => { + // Note that it may not exist anymore because of the worker dying and being cleaned up. + if let Err(err) = clear_worker_dir_path(&worker_dir_path) { + if !matches!(err.kind(), io::ErrorKind::NotFound) { gum::warn!( target: LOG_TARGET, worker_pid = %pid, - "failed to remove the tmp file: {:?}", + ?worker_dir_path, + "failed to clear worker cache after the job: {:?}", err, ); - }, + return Outcome::ClearWorkerDir { err: format!("{:?}", err) } + } } outcome } -async fn send_request( - stream: &mut UnixStream, - pvf: PvfPrepData, - tmp_file: &Path, -) -> io::Result<()> { +async fn send_request(stream: &mut UnixStream, pvf: PvfPrepData) -> io::Result<()> { framed_send(stream, &pvf.encode()).await?; - framed_send(stream, path_to_bytes(tmp_file)).await?; Ok(()) } -async fn send_handshake(stream: &mut UnixStream, handshake: Handshake) -> io::Result<()> { - framed_send(stream, &handshake.encode()).await -} - async fn recv_response(stream: &mut UnixStream, pid: u32) -> io::Result { let result = framed_recv(stream).await?; let result = PrepareResult::decode(&mut &result[..]).map_err(|e| { diff --git a/polkadot/node/core/pvf/src/testing.rs b/polkadot/node/core/pvf/src/testing.rs index 129d55337ca3..304ef2bc2b6a 100644 --- a/polkadot/node/core/pvf/src/testing.rs +++ b/polkadot/node/core/pvf/src/testing.rs @@ -54,6 +54,9 @@ pub fn validate_candidate( macro_rules! decl_puppet_worker_main { () => { fn main() { + #[cfg(target_os = "linux")] + use $crate::worker::security; + $crate::sp_tracing::try_init_simple(); let args = std::env::args().collect::>(); @@ -71,27 +74,68 @@ macro_rules! decl_puppet_worker_main { }, "prepare-worker" => $crate::prepare_worker_entrypoint, "execute-worker" => $crate::execute_worker_entrypoint, + + "--check-can-enable-landlock" => { + #[cfg(target_os = "linux")] + let status = if security::landlock::status_is_fully_enabled( + &security::landlock::get_status(), + ) { + 0 + } else { + -1 + }; + #[cfg(not(target_os = "linux"))] + let status = -1; + std::process::exit(status) + }, + "--check-can-unshare-user-namespace-and-change-root" => { + #[cfg(target_os = "linux")] + let status = if security::unshare_user_namespace_and_change_root(&std::env::temp_dir()).is_ok() { + 0 + } else { + -1 + }; + #[cfg(not(target_os = "linux"))] + let status = -1; + std::process::exit(status) + }, + other => panic!("unknown subcommand: {}", other), }; + let mut worker_dir_path = None; let mut node_version = None; - let mut socket_path = None; - let mut cache_path = None; + let mut can_enable_landlock = false; + let mut can_unshare_user_namespace_and_change_root = false; - for i in (2..args.len()).step_by(2) { + let mut i = 2; + while i < args.len() { match args[i].as_ref() { - "--socket-path" => socket_path = Some(args[i + 1].as_str()), - "--node-impl-version" => node_version = Some(args[i + 1].as_str()), - "--cache-path" => cache_path = Some(args[i + 1].as_str()), + "--worker-dir-path" => { + worker_dir_path = Some(args[i + 1].as_str()); + i += 1 + }, + "--node-impl-version" => { + node_version = Some(args[i + 1].as_str()); + i += 1 + }, + "--can-enable-landlock" => can_enable_landlock = true, + "--can-unshare-user-namespace-and-change-root" => + can_unshare_user_namespace_and_change_root = true, arg => panic!("Unexpected argument found: {}", arg), } + i += 1; } - let socket_path = socket_path.expect("the --socket-path argument is required"); - let cache_path = cache_path.expect("the --cache-path argument is required"); + let worker_dir_path = + worker_dir_path.expect("the --worker-dir-path argument is required"); - let cache_path = &std::path::Path::new(cache_path); + let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned(); + let security_status = $crate::SecurityStatus { + can_enable_landlock, + can_unshare_user_namespace_and_change_root, + }; - entrypoint(&socket_path, node_version, None, cache_path); + entrypoint(worker_dir_path, node_version, None, security_status); } }; } diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index e020a0af2aec..e04a6cd6bf2b 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -20,6 +20,7 @@ use crate::LOG_TARGET; use futures::FutureExt as _; use futures_timer::Delay; use pin_project::pin_project; +use polkadot_node_core_pvf_common::{worker_dir, SecurityStatus}; use rand::Rng; use std::{ fmt, mem, @@ -46,9 +47,6 @@ pub const JOB_TIMEOUT_WALL_CLOCK_FACTOR: u32 = 4; /// /// - `program_path`: The path to the program. /// -/// - `socket_dir_path`: An optional path to the dir where the socket should be created, if `None` -/// use a temp dir. -/// /// - `extra_args`: Optional extra CLI arguments to the program. NOTE: Should only contain data /// required before the handshake, like node/worker versions for the version check. Other data /// should go through the handshake. @@ -58,12 +56,14 @@ pub const JOB_TIMEOUT_WALL_CLOCK_FACTOR: u32 = 4; pub async fn spawn_with_program_path( debug_id: &'static str, program_path: impl Into, - socket_dir_path: Option<&Path>, extra_args: &[&str], spawn_timeout: Duration, + security_status: SecurityStatus, ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { let program_path = program_path.into(); - with_transient_socket_path(debug_id, socket_dir_path, |socket_path| { + let worker_dir = WorkerDir::new(debug_id).await?; + + with_transient_socket_path(&worker_dir.path.clone(), |socket_path| { let socket_path = socket_path.to_owned(); let extra_args: Vec = extra_args.iter().map(|arg| arg.to_string()).collect(); @@ -74,6 +74,8 @@ pub async fn spawn_with_program_path( %debug_id, ?program_path, ?extra_args, + ?worker_dir, + ?socket_path, "cannot bind unix socket: {:?}", err, ); @@ -81,18 +83,22 @@ pub async fn spawn_with_program_path( })?; let handle = - WorkerHandle::spawn(&program_path, &extra_args, socket_path).map_err(|err| { - gum::warn!( - target: LOG_TARGET, - %debug_id, - ?program_path, - ?extra_args, - "cannot spawn a worker: {:?}", - err, - ); - SpawnErr::ProcessSpawn - })?; + WorkerHandle::spawn(&program_path, &extra_args, &worker_dir.path, security_status) + .map_err(|err| { + gum::warn!( + target: LOG_TARGET, + %debug_id, + ?program_path, + ?extra_args, + ?worker_dir.path, + ?socket_path, + "cannot spawn a worker: {:?}", + err, + ); + SpawnErr::ProcessSpawn + })?; + let worker_dir_path = worker_dir.path.clone(); futures::select! { accept_result = listener.accept().fuse() => { let (stream, _) = accept_result.map_err(|err| { @@ -101,12 +107,14 @@ pub async fn spawn_with_program_path( %debug_id, ?program_path, ?extra_args, + ?worker_dir_path, + ?socket_path, "cannot accept a worker: {:?}", err, ); SpawnErr::Accept })?; - Ok((IdleWorker { stream, pid: handle.id() }, handle)) + Ok((IdleWorker { stream, pid: handle.id(), worker_dir }, handle)) } _ = Delay::new(spawn_timeout).fuse() => { gum::warn!( @@ -114,6 +122,8 @@ pub async fn spawn_with_program_path( %debug_id, ?program_path, ?extra_args, + ?worker_dir_path, + ?socket_path, ?spawn_timeout, "spawning and connecting to socket timed out", ); @@ -125,22 +135,12 @@ pub async fn spawn_with_program_path( .await } -async fn with_transient_socket_path( - debug_id: &'static str, - socket_dir_path: Option<&Path>, - f: F, -) -> Result +async fn with_transient_socket_path(worker_dir_path: &Path, f: F) -> Result where F: FnOnce(&Path) -> Fut, Fut: futures::Future> + 'static, { - let socket_prefix = format!("pvf-host-{}-", debug_id); - let socket_path = if let Some(socket_dir_path) = socket_dir_path { - tmpfile_in(&socket_prefix, socket_dir_path).await - } else { - tmpfile(&socket_prefix).await - } - .map_err(|_| SpawnErr::TmpFile)?; + let socket_path = worker_dir::socket(worker_dir_path); let result = f(&socket_path).await; @@ -151,12 +151,12 @@ where result } -/// Returns a path under the given `dir`. The file name will start with the given prefix. +/// Returns a path under the given `dir`. The path name will start with the given prefix. /// /// There is only a certain number of retries. If exceeded this function will give up and return an /// error. -pub async fn tmpfile_in(prefix: &str, dir: &Path) -> io::Result { - fn tmppath(prefix: &str, dir: &Path) -> PathBuf { +pub async fn tmppath_in(prefix: &str, dir: &Path) -> io::Result { + fn make_tmppath(prefix: &str, dir: &Path) -> PathBuf { use rand::distributions::Alphanumeric; const DESCRIMINATOR_LEN: usize = 10; @@ -168,27 +168,27 @@ pub async fn tmpfile_in(prefix: &str, dir: &Path) -> io::Result { let s = std::str::from_utf8(&buf) .expect("the string is collected from a valid utf-8 sequence; qed"); - let mut file = dir.to_owned(); - file.push(s); - file + let mut path = dir.to_owned(); + path.push(s); + path } const NUM_RETRIES: usize = 50; for _ in 0..NUM_RETRIES { - let candidate_path = tmppath(prefix, dir); - if !candidate_path.exists() { - return Ok(candidate_path) + let tmp_path = make_tmppath(prefix, dir); + if !tmp_path.exists() { + return Ok(tmp_path) } } - Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary file")) + Err(io::Error::new(io::ErrorKind::Other, "failed to create a temporary path")) } -/// The same as [`tmpfile_in`], but uses [`std::env::temp_dir`] as the directory. -pub async fn tmpfile(prefix: &str) -> io::Result { +/// The same as [`tmppath_in`], but uses [`std::env::temp_dir`] as the directory. +pub async fn tmppath(prefix: &str) -> io::Result { let temp_dir = PathBuf::from(std::env::temp_dir()); - tmpfile_in(prefix, &temp_dir).await + tmppath_in(prefix, &temp_dir).await } /// A struct that represents an idle worker. @@ -202,13 +202,19 @@ pub struct IdleWorker { /// The identifier of this process. Used to reset the niceness. pub pid: u32, + + /// The temporary per-worker path. We clean up the worker dir between jobs and delete it when + /// the worker dies. + pub worker_dir: WorkerDir, } /// An error happened during spawning a worker process. #[derive(Clone, Debug)] pub enum SpawnErr { - /// Cannot obtain a temporary file location. - TmpFile, + /// Cannot obtain a temporary path location. + TmpPath, + /// An FS error occurred. + Fs(String), /// Cannot bind the socket to the given path. Bind, /// An error happened during accepting a connection to the socket. @@ -246,25 +252,24 @@ impl WorkerHandle { fn spawn( program: impl AsRef, extra_args: &[String], - socket_path: impl AsRef, + worker_dir_path: impl AsRef, + security_status: SecurityStatus, ) -> io::Result { - // Pass the socket path relative to the cache_path (what the child thinks is root). - let socket_path = if security_config.can_unshare_user_namespace_and_change_root { - Path::new(".").with_file_name( - socket_path - .as_ref() - .file_name() - .expect("socket paths are created with a filename; qed"), - ) - } else { - // We are unable to pivot-root, so pass the socket path as-is. - socket_path.as_ref().as_os_str() + let security_args = { + let mut args = vec![]; + if security_status.can_enable_landlock { + args.push("--can-enable-landlock".to_string()); + } + if security_status.can_unshare_user_namespace_and_change_root { + args.push("--can-unshare-user-namespace-and-change-root".to_string()); + } + args }; - let mut child = process::Command::new(program.as_ref()) .args(extra_args) - .arg("--socket-path") - .arg(socket_path) + .arg("--worker-dir-path") + .arg(worker_dir_path.as_ref().as_os_str()) + .args(&security_args) .stdout(std::process::Stdio::piped()) .kill_on_drop(true) .spawn()?; @@ -346,16 +351,6 @@ impl fmt::Debug for WorkerHandle { } } -/// Convert the given path into a byte buffer. -pub fn path_to_bytes(path: &Path) -> &[u8] { - // Ideally, we take the `OsStr` of the path, send that and reconstruct this on the other side. - // However, libstd doesn't provide us with such an option. There are crates out there that - // allow for extraction of a path, but TBH it doesn't seem to be a real issue. - // - // However, should be there reports we can incorporate such a crate here. - path.to_str().expect("non-UTF-8 path").as_bytes() -} - /// Write some data prefixed by its length into `w`. pub async fn framed_send(w: &mut (impl AsyncWrite + Unpin), buf: &[u8]) -> io::Result<()> { let len_buf = buf.len().to_le_bytes(); @@ -373,3 +368,80 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result r.read_exact(&mut buf).await?; Ok(buf) } + +/// A temporary worker cache that contains only files needed by the worker. The worker will change +/// its root (the `/` directory) to this cache directory; it should have access to no other paths on +/// its filesystem. The worker cache should live in a tmp directory in the host's filesystem. +/// +/// NOTE: This struct cleans up its associated directory when it is dropped. Therefore it should not +/// implement `Clone`. +/// +/// # File structure +/// +/// The overall file structure for the PVF system is as follows. The `worker-dir`s are managed by +/// this struct. +/// +/// ```nocompile +/// + /[...]/cache_path/ +/// - artifact-1 +/// - artifact-2 +/// - [...] +/// + /tmp/ +/// - worker-dir-1/ +/// + socket (created by host) +/// + tmp-artifact (created by worker) (prepare-only) +/// + artifact (symlink -> artifact-1) (created by host) (execute-only) +/// - worker-dir-2/ +/// + [...] +/// ``` +#[derive(Debug)] +pub struct WorkerDir { + pub path: PathBuf, +} + +impl WorkerDir { + /// Creates a new, empty worker cache with a random name in a tmp location. + pub async fn new(debug_id: &'static str) -> Result { + let prefix = format!("worker-dir-{}-", debug_id); + let path = tmppath(&prefix).await.map_err(|_| SpawnErr::TmpPath)?; + tokio::fs::create_dir(&path) + .await + .map_err(|err| SpawnErr::Fs(err.to_string()))?; + Ok(Self { path }) + } +} + +// Try to clean up the temporary worker cache at the end of the worker's lifetime. It should be in a +// temporary directory location, but we make a best effort not to leave it around. +impl Drop for WorkerDir { + fn drop(&mut self) { + let _ = std::fs::remove_dir_all(&self.path); + } +} + +// Not async since Rust has trouble with async recursion. There should be few files here anyway. +// +// TODO: Can a lingering malicious job still access future files in the cache? +/// Clear the worker cache without deleting it. This is important because the worker has +/// mounted its own separate filesystem here. +/// +/// Should be called right after a job has finished. We don't want jobs to have access to +/// artifacts from previous jobs. +pub fn clear_worker_dir_path(worker_dir_path: &Path) -> io::Result<()> { + fn remove_dir_contents(path: &Path) -> io::Result<()> { + for entry in std::fs::read_dir(&path)? { + let entry = entry?; + let path = entry.path(); + + if entry.file_type()?.is_dir() { + remove_dir_contents(&path)?; + std::fs::remove_dir(path)?; + } else { + std::fs::remove_file(path)?; + } + } + Ok(()) + } + + remove_dir_contents(worker_dir_path) +} diff --git a/polkadot/node/core/pvf/tests/it/worker_common.rs b/polkadot/node/core/pvf/tests/it/worker_common.rs index 4184c68fe8be..a0ab8eceafb6 100644 --- a/polkadot/node/core/pvf/tests/it/worker_common.rs +++ b/polkadot/node/core/pvf/tests/it/worker_common.rs @@ -16,7 +16,7 @@ use std::time::Duration; -use polkadot_node_core_pvf::testing::{spawn_with_program_path, SpawnErr}; +use polkadot_node_core_pvf::{testing::{spawn_with_program_path, SpawnErr}, SecurityStatus}; use crate::PUPPET_EXE; @@ -26,9 +26,9 @@ async fn spawn_immediate_exit() { let result = spawn_with_program_path( "integration-test", PUPPET_EXE, - None, &["exit"], Duration::from_secs(2), + SecurityStatus::default(), ) .await; assert!(matches!(result, Err(SpawnErr::AcceptTimeout))); @@ -39,23 +39,9 @@ async fn spawn_timeout() { let result = spawn_with_program_path( "integration-test", PUPPET_EXE, - None, &["sleep"], Duration::from_secs(2), - ) - .await; - assert!(matches!(result, Err(SpawnErr::AcceptTimeout))); -} - -#[tokio::test] -async fn should_fail_without_cache_path() { - // --socket-path is handled by `spawn_with_program_path` so we don't pass it here. - let result = spawn_with_program_path( - "integration-test", - PUPPET_EXE, - None, - &["prepare-worker"], - Duration::from_secs(2), + SecurityStatus::default(), ) .await; assert!(matches!(result, Err(SpawnErr::AcceptTimeout))); @@ -63,15 +49,12 @@ async fn should_fail_without_cache_path() { #[tokio::test] async fn should_connect() { - let cache_path = tempfile::tempdir().unwrap(); - let cache_path_str = cache_path.path().to_str().unwrap(); - let _ = spawn_with_program_path( "integration-test", PUPPET_EXE, - Some(cache_path.path()), - &["prepare-worker", "--cache-path", cache_path_str], + &["prepare-worker"], Duration::from_secs(2), + SecurityStatus::default(), ) .await .unwrap(); From 9d2ce42ed0863e0d555ea94db71179090fe31bee Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 3 Sep 2023 16:15:37 +0200 Subject: [PATCH 03/18] Some fixes --- Cargo.lock | 1 - polkadot/node/core/pvf/common/Cargo.toml | 3 +- .../node/core/pvf/common/src/worker/mod.rs | 2 - .../core/pvf/common/src/worker/security.rs | 125 +++++------------- .../node/core/pvf/common/src/worker_dir.rs | 2 +- .../node/core/pvf/execute-worker/src/lib.rs | 7 +- .../node/core/pvf/prepare-worker/src/lib.rs | 9 +- .../node/core/pvf/src/execute/worker_intf.rs | 33 ++--- polkadot/node/core/pvf/src/host.rs | 10 +- .../node/core/pvf/src/prepare/worker_intf.rs | 25 ++-- polkadot/node/core/pvf/src/worker_intf.rs | 27 ++-- .../src/node/utility/pvf-host-and-workers.md | 8 +- 12 files changed, 93 insertions(+), 159 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3c8372f6b066..f3bbc2faf296 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12117,7 +12117,6 @@ dependencies = [ "parity-scale-codec", "polkadot-parachain-primitives", "polkadot-primitives", - "rand 0.8.5", "sc-executor", "sc-executor-common", "sc-executor-wasmtime", diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 0450e2b978e4..621f7e24f72b 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -25,11 +25,10 @@ sc-executor-wasmtime = { path = "../../../../../substrate/client/executor/wasmti sp-core = { path = "../../../../../substrate/primitives/core" } sp-externalities = { path = "../../../../../substrate/primitives/externalities" } sp-io = { path = "../../../../../substrate/primitives/io" } -sp-tracing = { path = "../../../../../substrate/primitives/tracing", optional = true } +sp-tracing = { path = "../../../../../substrate/primitives/tracing" } [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.2.0" -rand = "0.8.5" [dev-dependencies] assert_matches = "1.4.0" diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 737aa311a683..33da8d80a907 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -205,8 +205,6 @@ pub fn worker_event_loop( security::remove_env_vars(debug_id); } - gum::info!(target: LOG_TARGET, "5. {:?}", std::fs::read_dir(".").unwrap().map(|entry| entry.unwrap().path()).collect::>()); - // Run the main worker loop. let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); let err = rt diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index d106114e94e4..a8212f3ad695 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -17,35 +17,35 @@ //! Functionality for securing workers. //! //! This is needed because workers are used to compile and execute untrusted code (PVFs). +//! +//! We currently employ the following security measures: +//! +//! - Restrict filesystem +//! - Use Landlock to remove all unnecessary FS access rights. +//! - Unshare the user and mount namespaces. +//! - Change the root directory to a worker-specific temporary directory. +//! - Remove env vars use crate::LOG_TARGET; -#[cfg(target_os = "linux")] -use std::path::{Path, PathBuf}; /// Unshare the user namespace and change root to be the artifact directory. #[cfg(target_os = "linux")] -pub fn unshare_user_namespace_and_change_root(worker_dir_path: &Path) -> Result<(), &'static str> { - use rand::{distributions::Alphanumeric, Rng}; +pub fn unshare_user_namespace_and_change_root( + worker_dir_path: &std::path::Path, +) -> Result<(), &'static str> { use std::{ffi::CString, os::unix::ffi::OsStrExt, ptr}; - const RANDOM_LEN: usize = 10; - let mut buf = Vec::with_capacity(RANDOM_LEN); - buf.extend(rand::thread_rng().sample_iter(&Alphanumeric).take(RANDOM_LEN)); - let s = std::str::from_utf8(&buf) - .expect("the string is collected from a valid utf-8 sequence; qed"); - - let worker_dir_path_str = - worker_dir_path.to_str().ok_or("worker dir path is not valid UTF-8")?; let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()).unwrap(); - let root_absolute_c = CString::new("/").unwrap(); - // Append a random string to prevent races and to avoid dealing with the dir already existing. - let oldroot_relative_c = - CString::new(format!("{}/oldroot-{}", worker_dir_path_str, s)).unwrap(); - let oldroot_absolute_c = CString::new(format!("/oldroot-{}", s)).unwrap(); + let root_c = CString::new("/").unwrap(); + let dot_c = CString::new(".").unwrap(); - // SAFETY: TODO + // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps (2) + // and (3) are adapted from the example in pivot_root(2), with the additional change + // described in the `pivot_root(".", ".")` section. unsafe { // 1. `unshare` the user and the mount namespaces. + // + // Separate calls: in case one flag succeeds but other fails, we give a more precise error. if libc::unshare(libc::CLONE_NEWUSER) < 0 { return Err("unshare user namespace") } @@ -53,12 +53,14 @@ pub fn unshare_user_namespace_and_change_root(worker_dir_path: &Path) -> Result< return Err("unshare mount namespace") } - // 2. `pivot_root` to the artifact directory. + // 2. Setup mounts. // - // Ensure that 'new_root' and its parent mount don't have shared propagation. + // Ensure that new root and its parent mount don't have shared propagation (which would + // cause pivot_root() to return an error), and prevent propagation of mount events to the + // initial mount namespace. if libc::mount( ptr::null(), - root_absolute_c.as_ptr(), + root_c.as_ptr(), ptr::null(), libc::MS_REC | libc::MS_PRIVATE, ptr::null(), @@ -66,9 +68,13 @@ pub fn unshare_user_namespace_and_change_root(worker_dir_path: &Path) -> Result< { return Err("mount MS_PRIVATE") } + if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { + return Err("chdir to worker dir path") + } + // Ensure that the new root is a mount point. if libc::mount( - worker_dir_path_c.as_ptr(), - worker_dir_path_c.as_ptr(), + dot_c.as_ptr(), + dot_c.as_ptr(), ptr::null(), // ignored when MS_BIND is used libc::MS_BIND | libc::MS_REC | libc::MS_NOEXEC | libc::MS_NODEV | libc::MS_NOSUID, ptr::null(), // ignored when MS_BIND is used @@ -76,27 +82,13 @@ pub fn unshare_user_namespace_and_change_root(worker_dir_path: &Path) -> Result< { return Err("mount MS_BIND") } - if libc::mkdir(oldroot_relative_c.as_ptr(), 0755) < 0 { - return Err("mkdir oldroot") - } - if libc::syscall( - libc::SYS_pivot_root, - worker_dir_path_c.as_ptr(), - oldroot_relative_c.as_ptr(), - ) < 0 - { - return Err("pivot_root") - } - // 3. Change to the new root, `unmount2` and remove the old root. - if libc::chdir(root_absolute_c.as_ptr()) < 0 { - return Err("chdir to new root") - } - if libc::umount2(oldroot_absolute_c.as_ptr(), libc::MNT_DETACH) < 0 { - return Err("umount2 the oldroot") + // 3. `pivot_root` to the artifact directory. + if libc::syscall(libc::SYS_pivot_root, &dot_c, &dot_c) < 0 { + return Err("pivot_root") } - if libc::rmdir(oldroot_absolute_c.as_ptr()) < 0 { - return Err("rmdir the oldroot") + if libc::umount2(dot_c.as_ptr(), libc::MNT_DETACH) < 0 { + return Err("umount the old root mount point") } } @@ -386,54 +378,5 @@ pub mod landlock { assert!(handle.join().is_ok()); } - - #[test] - fn restricted_thread_can_read_files_but_not_list_dir() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread can read files but not list directory contents. - let handle = - thread::spawn(|| { - // Create, write to and read a tmp file. This should succeed before any landlock - // restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile = tempfile::NamedTempFile::new().unwrap(); - let filepath = tmpfile.path(); - let dirpath = filepath.parent().unwrap(); - - fs::write(filepath, TEXT).unwrap(); - let s = fs::read_to_string(filepath).unwrap(); - assert_eq!(s, TEXT); - - // Apply Landlock with a general read exception for the directory, *without* the - // `ReadDir` exception. - let status = try_restrict(path_beneath_rules( - &[dirpath], - AccessFs::from_read(LANDLOCK_ABI) ^ AccessFs::ReadDir, - )); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to read file, should still be able to. - let result = fs::read_to_string(filepath); - assert!(matches!( - result, - Ok(s) if s == TEXT - )); - - // Try to list dir contents, should fail. - let result = fs::read_dir(dirpath); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); - - assert!(handle.join().is_ok()); - } } } diff --git a/polkadot/node/core/pvf/common/src/worker_dir.rs b/polkadot/node/core/pvf/common/src/worker_dir.rs index b1c36f0afc07..c2610a4d1128 100644 --- a/polkadot/node/core/pvf/common/src/worker_dir.rs +++ b/polkadot/node/core/pvf/common/src/worker_dir.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -//! Functions for getting the worker cache files. +//! Shared functions for getting the known worker files. use std::path::{Path, PathBuf}; diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index fb6e2ab4766a..bbb57be8d607 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -112,16 +112,15 @@ async fn send_response(stream: &mut UnixStream, response: Response) -> io::Resul /// /// # Parameters /// -/// - `socket_path` specifies the path to the socket used to communicate with the host. +/// - `worker_dir_path`: specifies the path to the worker-specific temporary directory. /// -/// - `node_version`, if `Some`, is checked against the `worker_version`. A mismatch results in +/// - `node_version`: if `Some`, is checked against the `worker_version`. A mismatch results in /// immediate worker termination. `None` is used for tests and in other situations when version /// check is not necessary. /// /// - `worker_version`: see above /// -/// - `cache_path` contains the expected cache path for artifacts and is used to provide a sandbox -/// exception for landlock. +/// - `security_status`: contains the detected status of security features. pub fn worker_entrypoint( worker_dir_path: PathBuf, node_version: Option<&str>, diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 4e427bd49425..a55f73b4a27b 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -88,16 +88,15 @@ async fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Re /// /// # Parameters /// -/// - `socket_path` specifies the path to the socket used to communicate with the host. +/// - `worker_dir_path`: specifies the path to the worker-specific temporary directory. /// -/// - `node_version`, if `Some`, is checked against the `worker_version`. A mismatch results in +/// - `node_version`: if `Some`, is checked against the `worker_version`. A mismatch results in /// immediate worker termination. `None` is used for tests and in other situations when version /// check is not necessary. /// /// - `worker_version`: see above /// -/// - `cache_path` contains the expected cache path for artifacts and is used to provide a sandbox -/// exception for landlock. +/// - `security_status`: contains the detected status of security features. /// /// # Flow /// @@ -142,7 +141,7 @@ pub fn worker_entrypoint( }; // Allow an exception for writing to the known file in the worker cache. - try_restrict(path_beneath_rules(&[temp_artifact_dest], AccessFs::WriteFile)) + try_restrict(path_beneath_rules(&[&temp_artifact_dest], AccessFs::WriteFile)) .map(LandlockStatus::from_ruleset_status) .map_err(|e| e.to_string()) }; diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 6ede67e7687e..e61a11bdcc8d 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -218,12 +218,10 @@ where Fut: futures::Future, F: FnOnce(WorkerDir) -> Fut, { - let worker_dir_path = worker_dir.path.clone(); - // Cheaply create a hard link to the artifact. The artifact is always at a known location in the // worker cache, and the child can't access any other artifacts or gain any information from the // original filename. - let link_path = worker_dir::execute_artifact(&worker_dir_path); + let link_path = worker_dir::execute_artifact(&worker_dir.path); if let Err(err) = tokio::fs::hard_link(artifact_path, link_path).await { gum::warn!( target: LOG_TARGET, @@ -237,26 +235,23 @@ where } } + let worker_dir_path = worker_dir.path.clone(); let outcome = f(worker_dir).await; // Try to clear the worker dir. - // - // Note that it may not exist anymore because of the worker dying and being cleaned up. if let Err(err) = clear_worker_dir_path(&worker_dir_path) { - if !matches!(err.kind(), io::ErrorKind::NotFound) { - gum::warn!( - target: LOG_TARGET, - worker_pid = %pid, - ?worker_dir_path, - "failed to clear worker cache after the job: {:?}", - err, - ); - return Outcome::InternalError { - err: InternalValidationError::CouldNotClearWorkerDir { - err: format!("{:?}", err), - path: worker_dir_path.to_str().map(String::from), - }, - } + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + ?worker_dir_path, + "failed to clear worker cache after the job: {:?}", + err, + ); + return Outcome::InternalError { + err: InternalValidationError::CouldNotClearWorkerDir { + err: format!("{:?}", err), + path: worker_dir_path.to_str().map(String::from), + }, } } diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index d43b6738b575..2efb7e76e61a 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -883,9 +883,9 @@ fn pulse_every(interval: std::time::Duration) -> impl futures::Stream /// Check if we can sandbox the root and emit a warning if not. /// /// We do this check by spawning a new process and trying to sandbox it. The process must be -/// single-threaded, so we can't just fork here. To get as close as possible to running unshare and -/// pivot_root in a worker, we try them... in a worker. The expected return status is 0 on success -/// and -1 on failure. +/// single-threaded, so we can't just fork here. To get as close as possible to running the check in +/// a worker, we try it... in a worker. The expected return status is 0 on success and -1 on +/// failure. fn check_can_unshare_user_namespace_and_change_root( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, @@ -930,7 +930,9 @@ fn check_can_unshare_user_namespace_and_change_root( /// Check if landlock is supported and emit a warning if not. /// -/// TODO: Run in child process. +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. fn check_landlock( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 55f40306922f..2b97371a15ea 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -269,11 +269,9 @@ where Fut: futures::Future, F: FnOnce(PathBuf, UnixStream, WorkerDir) -> Fut, { - let worker_dir_path = worker_dir.path.clone(); - // Create the tmp file here so that the child doesn't need any file creation rights. This will // be cleared at the end of this function. - let tmp_file = worker_dir::prepare_tmp_artifact(&worker_dir_path); + let tmp_file = worker_dir::prepare_tmp_artifact(&worker_dir.path); match tokio::fs::write(&tmp_file, &[]).await { Ok(()) => (), Err(err) => { @@ -291,22 +289,19 @@ where }, }; + let worker_dir_path = worker_dir.path.clone(); let outcome = f(tmp_file, stream, worker_dir).await; // Try to clear the worker dir. - // - // Note that it may not exist anymore because of the worker dying and being cleaned up. if let Err(err) = clear_worker_dir_path(&worker_dir_path) { - if !matches!(err.kind(), io::ErrorKind::NotFound) { - gum::warn!( - target: LOG_TARGET, - worker_pid = %pid, - ?worker_dir_path, - "failed to clear worker cache after the job: {:?}", - err, - ); - return Outcome::ClearWorkerDir { err: format!("{:?}", err) } - } + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + ?worker_dir_path, + "failed to clear worker cache after the job: {:?}", + err, + ); + return Outcome::ClearWorkerDir { err: format!("{:?}", err) } } outcome diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index e04a6cd6bf2b..8a4e664a4e72 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -52,6 +52,8 @@ pub const JOB_TIMEOUT_WALL_CLOCK_FACTOR: u32 = 4; /// should go through the handshake. /// /// - `spawn_timeout`: The amount of time to wait for the child process to spawn. +/// +/// - `security_status`: contains the detected status of security features. #[doc(hidden)] pub async fn spawn_with_program_path( debug_id: &'static str, @@ -225,8 +227,6 @@ pub enum SpawnErr { AcceptTimeout, /// Failed to send handshake after successful spawning was signaled Handshake, - /// Cache path is not a valid UTF-8 str. - InvalidCachePath(PathBuf), } /// This is a representation of a potentially running worker. Drop it and the process will be @@ -387,11 +387,11 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result /// - artifact-2 /// - [...] /// + /tmp/ -/// - worker-dir-1/ -/// + socket (created by host) -/// + tmp-artifact (created by worker) (prepare-only) -/// + artifact (symlink -> artifact-1) (created by host) (execute-only) -/// - worker-dir-2/ +/// - worker-dir-1/ (new `/` for worker-1) +/// + socket (created by host) +/// + tmp-artifact (created by host) (prepare-only) +/// + artifact (link -> artifact-1) (created by host) (execute-only) +/// - worker-dir-2/ (new `/` for worker-2) /// + [...] /// ``` #[derive(Debug)] @@ -421,9 +421,10 @@ impl Drop for WorkerDir { // Not async since Rust has trouble with async recursion. There should be few files here anyway. // -// TODO: Can a lingering malicious job still access future files in the cache? -/// Clear the worker cache without deleting it. This is important because the worker has -/// mounted its own separate filesystem here. +// TODO: A lingering malicious job can still access future files in this dir. See +// for how to fully secure this. +/// Clear the temporary worker dir without deleting it. Not deleting is important because the worker +/// has mounted its own separate filesystem here. /// /// Should be called right after a job has finished. We don't want jobs to have access to /// artifacts from previous jobs. @@ -443,5 +444,9 @@ pub fn clear_worker_dir_path(worker_dir_path: &Path) -> io::Result<()> { Ok(()) } - remove_dir_contents(worker_dir_path) + // Note the worker dir may not exist anymore because of the worker dying and being cleaned up. + match remove_dir_contents(worker_dir_path) { + Err(err) if matches!(err.kind(), io::ErrorKind::NotFound) => Ok(()), + result => result, + } } diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index bcf01b61f217..6a14a3a013d4 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -121,10 +121,10 @@ So what are we actually worried about? Things that come to mind: ### Restricting file-system access -A basic security mechanism is to make sure that any thread directly interfacing -with untrusted code does not have access to the file-system. This provides some -protection against attackers accessing sensitive data or modifying data on the -host machine. +A basic security mechanism is to make sure that any process directly interfacing +with untrusted code does not have unnecessary access to the file-system. This +provides some protection against attackers accessing sensitive data or modifying +data on the host machine. ### Clearing env vars From f92650546471f7533b8044bcc00c06400c6d0903 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 4 Sep 2023 10:55:57 +0200 Subject: [PATCH 04/18] cargo fmt --- polkadot/node/core/pvf/common/src/worker/mod.rs | 6 ++++-- polkadot/node/core/pvf/src/testing.rs | 4 +++- polkadot/node/core/pvf/tests/it/worker_common.rs | 5 ++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 33da8d80a907..fa5cb6791232 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -79,7 +79,9 @@ macro_rules! decl_worker_main { }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] - let status = if security::unshare_user_namespace_and_change_root(&std::env::temp_dir()).is_ok() { + let status = if security::unshare_user_namespace_and_change_root(&std::env::temp_dir()) + .is_ok() + { 0 } else { -1 @@ -129,7 +131,7 @@ macro_rules! decl_worker_main { let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned(); let security_status = $crate::SecurityStatus { can_enable_landlock, - can_unshare_user_namespace_and_change_root + can_unshare_user_namespace_and_change_root, }; $entrypoint(worker_dir_path, node_version, Some($worker_version), security_status); diff --git a/polkadot/node/core/pvf/src/testing.rs b/polkadot/node/core/pvf/src/testing.rs index 304ef2bc2b6a..ee0e26b55fce 100644 --- a/polkadot/node/core/pvf/src/testing.rs +++ b/polkadot/node/core/pvf/src/testing.rs @@ -90,7 +90,9 @@ macro_rules! decl_puppet_worker_main { }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] - let status = if security::unshare_user_namespace_and_change_root(&std::env::temp_dir()).is_ok() { + let status = if security::unshare_user_namespace_and_change_root(&std::env::temp_dir()) + .is_ok() + { 0 } else { -1 diff --git a/polkadot/node/core/pvf/tests/it/worker_common.rs b/polkadot/node/core/pvf/tests/it/worker_common.rs index a0ab8eceafb6..d9e801e4b84d 100644 --- a/polkadot/node/core/pvf/tests/it/worker_common.rs +++ b/polkadot/node/core/pvf/tests/it/worker_common.rs @@ -16,7 +16,10 @@ use std::time::Duration; -use polkadot_node_core_pvf::{testing::{spawn_with_program_path, SpawnErr}, SecurityStatus}; +use polkadot_node_core_pvf::{ + testing::{spawn_with_program_path, SpawnErr}, + SecurityStatus, +}; use crate::PUPPET_EXE; From 32cfbcb2dc6cfcaee68e373edd622eedcf1b806b Mon Sep 17 00:00:00 2001 From: Marcin S Date: Tue, 5 Sep 2023 11:44:54 +0200 Subject: [PATCH 05/18] Fix clippy error --- polkadot/node/core/pvf/src/prepare/worker_intf.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 2b97371a15ea..a1e8031d44c0 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -54,14 +54,8 @@ pub async fn spawn( extra_args.extend_from_slice(&["--node-impl-version", node_version]); } - Ok(spawn_with_program_path( - "prepare", - program_path, - &extra_args, - spawn_timeout, - security_status, - ) - .await?) + spawn_with_program_path("prepare", program_path, &extra_args, spawn_timeout, security_status) + .await } /// Outcome of PVF preparation. From eacb956718d61902fb480a0ca953adcd6d64c2cb Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 10 Sep 2023 15:22:29 +0200 Subject: [PATCH 06/18] Address review comments; refactor/simplify Landlock code --- .../node/core/pvf/common/src/worker/mod.rs | 77 +++++++++---- .../core/pvf/common/src/worker/security.rs | 101 +++++++++--------- .../node/core/pvf/execute-worker/src/lib.rs | 39 +------ .../node/core/pvf/prepare-worker/src/lib.rs | 38 +------ polkadot/node/core/pvf/src/testing.rs | 15 ++- 5 files changed, 115 insertions(+), 155 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index fa5cb6791232..69bd2f920e26 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -23,6 +23,7 @@ use cpu_time::ProcessTime; use futures::never::Never; use std::{ any::Any, + fmt, path::PathBuf, sync::mpsc::{Receiver, RecvTimeoutError}, time::Duration, @@ -66,21 +67,18 @@ macro_rules! decl_worker_main { "--check-can-enable-landlock" => { #[cfg(target_os = "linux")] - let status = if security::landlock::status_is_fully_enabled( - &security::landlock::get_status(), - ) { - 0 - } else { - -1 - }; + let status = if security::landlock::check_is_fully_enabled() { 0 } else { -1 }; #[cfg(not(target_os = "linux"))] let status = -1; std::process::exit(status) }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] - let status = if security::unshare_user_namespace_and_change_root(&std::env::temp_dir()) - .is_ok() + let status = if security::unshare_user_namespace_and_change_root( + WorkerKind::Execute, + &std::env::temp_dir(), + ) + .is_ok() { 0 } else { @@ -143,10 +141,25 @@ macro_rules! decl_worker_main { /// child process. pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50); +#[derive(Debug, Clone, Copy)] +pub enum WorkerKind { + Prepare, + Execute, +} + +impl fmt::Display for WorkerKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Prepare => write!(f, "prepare"), + Self::Execute => write!(f, "execute"), + } + } +} + // The worker version must be passed in so that we accurately get the version of the worker, and not // the version that this crate was compiled with. pub fn worker_event_loop( - debug_id: &'static str, + worker_kind: WorkerKind, #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf, node_version: Option<&str>, worker_version: Option<&str>, @@ -157,14 +170,14 @@ pub fn worker_event_loop( Fut: futures::Future>, { let worker_pid = std::process::id(); - gum::debug!(target: LOG_TARGET, %worker_pid, ?worker_dir_path, "starting pvf worker ({})", debug_id); + gum::debug!(target: LOG_TARGET, %worker_pid, ?worker_dir_path, "starting pvf worker ({})", worker_kind); // Check for a mismatch between the node and worker versions. if let (Some(node_version), Some(worker_version)) = (node_version, worker_version) { if node_version != worker_version { gum::error!( target: LOG_TARGET, - %debug_id, + %worker_kind, %worker_pid, %node_version, %worker_version, @@ -172,39 +185,38 @@ pub fn worker_event_loop( ); kill_parent_node_in_emergency(); let err = io::Error::new(io::ErrorKind::Unsupported, "Version mismatch"); - worker_shutdown_message(debug_id, worker_pid, err); + worker_shutdown_message(worker_kind, worker_pid, err); return } } // Enable some security features. // - // Landlock is enabled in the prepare- or execute-worker-specific code since we restrict the - // access rights based on whether we are preparing or executing. We also need to remove the - // socket before applying Landlock restrictions. + // Landlock is enabled a bit later after the socket has been removed. { // Call based on whether we can change root. Error out if it should work but fails. #[cfg(target_os = "linux")] if security_status.can_unshare_user_namespace_and_change_root { - if let Err(err_ctx) = security::unshare_user_namespace_and_change_root(&worker_dir_path) + if let Err(err_ctx) = + security::unshare_user_namespace_and_change_root(worker_kind, &worker_dir_path) { let err = io::Error::last_os_error(); gum::error!( target: LOG_TARGET, - %debug_id, + %worker_kind, %worker_pid, %err_ctx, ?worker_dir_path, "Could not change root to be the worker cache path: {}", err ); - worker_shutdown_message(debug_id, worker_pid, err); + worker_shutdown_message(worker_kind, worker_pid, err); return } worker_dir_path = std::path::Path::new("/").to_owned(); } - security::remove_env_vars(debug_id); + security::remove_env_vars(worker_kind); } // Run the main worker loop. @@ -215,6 +227,25 @@ pub fn worker_event_loop( let stream = UnixStream::connect(&socket_path).await?; let _ = tokio::fs::remove_file(&socket_path).await; + #[cfg(target_os = "linux")] + if security_status.can_enable_landlock { + let landlock_status = + security::landlock::enable_for_worker(worker_kind, &worker_dir_path); + if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) { + // We previously were able to enable landlock, so this should never happen. + // + // TODO: Make this a real error in secure-mode. See: + // + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", + landlock_status + ); + } + } + let result = event_loop(stream, worker_dir_path).await; result @@ -222,7 +253,7 @@ pub fn worker_event_loop( // It's never `Ok` because it's `Ok(Never)`. .unwrap_err(); - worker_shutdown_message(debug_id, worker_pid, err); + worker_shutdown_message(worker_kind, worker_pid, err); // We don't want tokio to wait for the tasks to finish. We want to bring down the worker as fast // as possible and not wait for stalled validation to finish. This isn't strictly necessary now, @@ -231,8 +262,8 @@ pub fn worker_event_loop( } /// Provide a consistent message on worker shutdown. -fn worker_shutdown_message(debug_id: &'static str, worker_pid: u32, err: io::Error) { - gum::debug!(target: LOG_TARGET, %worker_pid, "quitting pvf worker ({}): {:?}", debug_id, err); +fn worker_shutdown_message(worker_kind: WorkerKind, worker_pid: u32, err: io::Error) { + gum::debug!(target: LOG_TARGET, %worker_pid, "quitting pvf worker ({}): {:?}", worker_kind, err); } /// Loop that runs in the CPU time monitor thread on prepare and execute jobs. Continuously wakes up diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index a8212f3ad695..ec3ac80ba661 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -26,18 +26,21 @@ //! - Change the root directory to a worker-specific temporary directory. //! - Remove env vars -use crate::LOG_TARGET; +use crate::{worker::WorkerKind, LOG_TARGET}; +use std::path::Path; /// Unshare the user namespace and change root to be the artifact directory. #[cfg(target_os = "linux")] pub fn unshare_user_namespace_and_change_root( - worker_dir_path: &std::path::Path, + worker_kind: WorkerKind, + worker_dir_path: &Path, ) -> Result<(), &'static str> { use std::{ffi::CString, os::unix::ffi::OsStrExt, ptr}; - let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()).unwrap(); - let root_c = CString::new("/").unwrap(); - let dot_c = CString::new(".").unwrap(); + let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) + .expect("on unix; the path will never contain 0 bytes; qed"); + let root_c = CString::new("/").expect("input contains no 0 bytes; qed"); + let dot_c = CString::new(".").expect("input contains no 0 bytes; qed"); // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps (2) // and (3) are adapted from the example in pivot_root(2), with the additional change @@ -72,11 +75,16 @@ pub fn unshare_user_namespace_and_change_root( return Err("chdir to worker dir path") } // Ensure that the new root is a mount point. + let additional_flags = + if let WorkerKind::Execute = worker_kind { libc::MS_RDONLY } else { 0 }; if libc::mount( dot_c.as_ptr(), dot_c.as_ptr(), ptr::null(), // ignored when MS_BIND is used - libc::MS_BIND | libc::MS_REC | libc::MS_NOEXEC | libc::MS_NODEV | libc::MS_NOSUID, + libc::MS_BIND | + libc::MS_REC | libc::MS_NOEXEC | + libc::MS_NODEV | libc::MS_NOSUID | + libc::MS_NOATIME | additional_flags, ptr::null(), // ignored when MS_BIND is used ) < 0 { @@ -96,7 +104,7 @@ pub fn unshare_user_namespace_and_change_root( } /// Delete all env vars to prevent malicious code from accessing them. -pub fn remove_env_vars(debug_id: &'static str) { +pub fn remove_env_vars(worker_kind: WorkerKind) { for (key, value) in std::env::vars_os() { // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of // randomness for malicious code. In the future we can remove it also and log in the host; @@ -125,7 +133,7 @@ pub fn remove_env_vars(debug_id: &'static str) { if !err_reasons.is_empty() { gum::warn!( target: LOG_TARGET, - %debug_id, + %worker_kind, ?key, ?value, "Attempting to remove badly-formatted env var, this may cause the PVF worker to crash. Please remove it yourself. Reasons: {:?}", @@ -137,30 +145,7 @@ pub fn remove_env_vars(debug_id: &'static str) { } } -/// To what degree landlock is enabled. It's a separate struct from `RulesetStatus` because that is -/// only available on Linux, plus this has a nicer name. -#[derive(Debug)] -pub enum LandlockStatus { - FullyEnforced, - PartiallyEnforced, - NotEnforced, - /// Thread panicked, we don't know what the status is. - Unavailable, -} - -impl LandlockStatus { - #[cfg(target_os = "linux")] - pub fn from_ruleset_status(ruleset_status: ::landlock::RulesetStatus) -> Self { - use ::landlock::RulesetStatus::*; - match ruleset_status { - FullyEnforced => LandlockStatus::FullyEnforced, - PartiallyEnforced => LandlockStatus::PartiallyEnforced, - NotEnforced => LandlockStatus::NotEnforced, - } - } -} - -/// The [landlock] docs say it best: +/// The [landlock] docs say it best: /// /// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict /// ambient rights (e.g., global filesystem access) for a set of processes by creating safe security @@ -174,10 +159,12 @@ impl LandlockStatus { pub mod landlock { pub use landlock::{path_beneath_rules, Access, AccessFs}; + use crate::worker::WorkerKind; use landlock::{ PathBeneath, PathFd, Ruleset, RulesetAttr, RulesetCreatedAttr, RulesetError, RulesetStatus, ABI, }; + use std::path::Path; /// Landlock ABI version. We use ABI V1 because: /// @@ -208,29 +195,43 @@ pub mod landlock { /// supports it or if it introduces some new feature that is beneficial to security. pub const LANDLOCK_ABI: ABI = ABI::V1; - // TODO: - /// Returns to what degree landlock is enabled with the given ABI on the current Linux - /// environment. - pub fn get_status() -> Result> { - match std::thread::spawn(|| try_restrict(std::iter::empty())).join() { - Ok(Ok(status)) => Ok(status), - Ok(Err(ruleset_err)) => Err(ruleset_err.into()), - Err(_err) => Err("a panic occurred in try_restrict".into()), + /// Tried to enable landlock for the given kind of worker. + pub fn enable_for_worker( + worker_kind: WorkerKind, + worker_dir_path: &Path, + ) -> Result { + use crate::worker_dir; + + match worker_kind { + WorkerKind::Prepare => { + let temp_artifact_dest = worker_dir::prepare_tmp_artifact(worker_dir_path); + + // Allow an exception for writing to the known file in the worker cache. + try_restrict(path_beneath_rules(&[&temp_artifact_dest], AccessFs::WriteFile)) + .map_err(|e| e.to_string()) + }, + WorkerKind::Execute => { + let artifact_path = worker_dir::execute_artifact(worker_dir_path); + + // Allow an exception for reading from the known artifact path. + try_restrict(path_beneath_rules(&[&artifact_path], AccessFs::ReadFile)) + .map_err(|e| e.to_string()) + }, } } - /// Based on the given `status`, returns a single bool indicating whether the given landlock - /// ABI is fully enabled on the current Linux environment. - pub fn status_is_fully_enabled( - status: &Result>, - ) -> bool { - matches!(status, Ok(RulesetStatus::FullyEnforced)) - } - + // TODO: /// Runs a check for landlock and returns a single bool indicating whether the given landlock /// ABI is fully enabled on the current Linux environment. pub fn check_is_fully_enabled() -> bool { - status_is_fully_enabled(&get_status()) + let status_from_thread: Result> = + match std::thread::spawn(|| try_restrict(std::iter::empty())).join() { + Ok(Ok(status)) => Ok(status), + Ok(Err(ruleset_err)) => Err(ruleset_err.into()), + Err(_err) => Err("a panic occurred in try_restrict".into()), + }; + + matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced)) } /// Tries to restrict the current thread (should only be called in a process' main thread) with @@ -244,7 +245,7 @@ pub mod landlock { /// # Returns /// /// The status of the restriction (whether it was fully, partially, or not-at-all enforced). - pub fn try_restrict( + fn try_restrict( fs_exceptions: impl Iterator, RulesetError>>, ) -> Result { let status = Ruleset::new() diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index bbb57be8d607..57bc4b58f1ed 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -31,10 +31,9 @@ use polkadot_node_core_pvf_common::{ framed_recv, framed_send, worker::{ cpu_time_monitor_loop, - security::LandlockStatus, stringify_panic_payload, thread::{self, WaitOutcome}, - worker_event_loop, + worker_event_loop, WorkerKind, }, }; use polkadot_parachain_primitives::primitives::ValidationResult; @@ -128,7 +127,7 @@ pub fn worker_entrypoint( security_status: SecurityStatus, ) { worker_event_loop( - "execute", + WorkerKind::Execute, worker_dir_path, node_version, worker_version, @@ -142,40 +141,6 @@ pub fn worker_entrypoint( io::Error::new(io::ErrorKind::Other, format!("cannot create executor: {}", e)) })?; - // Try to enable landlock. - { - #[cfg(target_os = "linux")] - let landlock_status = { - use polkadot_node_core_pvf_common::worker::security::landlock::{ - path_beneath_rules, try_restrict, AccessFs, - }; - - // Allow an exception for reading from the known artifact path. - try_restrict(path_beneath_rules(&[&artifact_path], AccessFs::ReadFile)) - .map(LandlockStatus::from_ruleset_status) - .map_err(|e| e.to_string()) - }; - #[cfg(not(target_os = "linux"))] - let landlock_status: Result = Ok(LandlockStatus::NotEnforced); - - // Error if the host determined that landlock is fully enabled and we couldn't fully - // enforce it here. - if security_status.can_enable_landlock && - !matches!(landlock_status, Ok(LandlockStatus::FullyEnforced)) - { - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "could not fully enable landlock: {:?}", - landlock_status - ); - return Err(io::Error::new( - io::ErrorKind::Other, - format!("could not fully enable landlock: {:?}", landlock_status), - )) - } - } - loop { let (params, execution_timeout) = recv_request(&mut stream).await?; gum::debug!( diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index a55f73b4a27b..2c0dd255a59a 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -37,8 +37,8 @@ use polkadot_node_core_pvf_common::{ prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ + WorkerKind, cpu_time_monitor_loop, - security::LandlockStatus, stringify_panic_payload, thread::{self, WaitOutcome}, worker_event_loop, @@ -123,7 +123,7 @@ pub fn worker_entrypoint( security_status: SecurityStatus, ) { worker_event_loop( - "prepare", + WorkerKind::Prepare, worker_dir_path, node_version, worker_version, @@ -132,40 +132,6 @@ pub fn worker_entrypoint( let worker_pid = std::process::id(); let temp_artifact_dest = worker_dir::prepare_tmp_artifact(&worker_dir_path); - // Try to enable landlock. - { - #[cfg(target_os = "linux")] - let landlock_status = { - use polkadot_node_core_pvf_common::worker::security::landlock::{ - path_beneath_rules, try_restrict, AccessFs, - }; - - // Allow an exception for writing to the known file in the worker cache. - try_restrict(path_beneath_rules(&[&temp_artifact_dest], AccessFs::WriteFile)) - .map(LandlockStatus::from_ruleset_status) - .map_err(|e| e.to_string()) - }; - #[cfg(not(target_os = "linux"))] - let landlock_status: Result = Ok(LandlockStatus::NotEnforced); - - // Error if the host determined that landlock is fully enabled and we couldn't fully - // enforce it here. - if security_status.can_enable_landlock && - !matches!(landlock_status, Ok(LandlockStatus::FullyEnforced)) - { - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "could not fully enable landlock: {:?}", - landlock_status - ); - return Err(io::Error::new( - io::ErrorKind::Other, - format!("could not fully enable landlock: {:?}", landlock_status), - )) - } - } - loop { let pvf = recv_request(&mut stream).await?; gum::debug!( diff --git a/polkadot/node/core/pvf/src/testing.rs b/polkadot/node/core/pvf/src/testing.rs index ee0e26b55fce..dc97812edfd7 100644 --- a/polkadot/node/core/pvf/src/testing.rs +++ b/polkadot/node/core/pvf/src/testing.rs @@ -77,21 +77,18 @@ macro_rules! decl_puppet_worker_main { "--check-can-enable-landlock" => { #[cfg(target_os = "linux")] - let status = if security::landlock::status_is_fully_enabled( - &security::landlock::get_status(), - ) { - 0 - } else { - -1 - }; + let status = if security::landlock::check_is_fully_enabled() { 0 } else { -1 }; #[cfg(not(target_os = "linux"))] let status = -1; std::process::exit(status) }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] - let status = if security::unshare_user_namespace_and_change_root(&std::env::temp_dir()) - .is_ok() + let status = if security::unshare_user_namespace_and_change_root( + polkadot_node_core_pvf_common::worker::WorkerKind::Execute, + &std::env::temp_dir(), + ) + .is_ok() { 0 } else { From dc6fe0413495a001232ea9ba5e4d7abf7d246eef Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 10 Sep 2023 15:31:59 +0200 Subject: [PATCH 07/18] cargo fmt --- polkadot/node/core/pvf/execute-worker/src/lib.rs | 3 +-- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 57bc4b58f1ed..b2bf59915a99 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -30,8 +30,7 @@ use polkadot_node_core_pvf_common::{ executor_intf::NATIVE_STACK_MAX, framed_recv, framed_send, worker::{ - cpu_time_monitor_loop, - stringify_panic_payload, + cpu_time_monitor_loop, stringify_panic_payload, thread::{self, WaitOutcome}, worker_event_loop, WorkerKind, }, diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 2c0dd255a59a..5db7c0ce299b 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -37,11 +37,9 @@ use polkadot_node_core_pvf_common::{ prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ - WorkerKind, - cpu_time_monitor_loop, - stringify_panic_payload, + cpu_time_monitor_loop, stringify_panic_payload, thread::{self, WaitOutcome}, - worker_event_loop, + worker_event_loop, WorkerKind, }, worker_dir, ProcessTime, SecurityStatus, }; From ed344abc85778f0b518025797b3523c71444e2d2 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 11 Sep 2023 13:27:34 +0200 Subject: [PATCH 08/18] Address most review comments (will do the last one after lunch) --- polkadot/node/core/pvf/common/src/execute.rs | 2 +- .../node/core/pvf/common/src/worker/mod.rs | 19 ++- .../core/pvf/common/src/worker/security.rs | 126 ++++++++++-------- polkadot/node/core/pvf/src/host.rs | 7 +- 4 files changed, 84 insertions(+), 70 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index 1f38cff88379..399b847791a9 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -21,7 +21,7 @@ use polkadot_primitives::ExecutorParams; use std::time::Duration; /// The payload of the one-time handshake that is done when a worker process is created. Carries -/// data from the host to the worker that would be too large for CLI parameters.. +/// data from the host to the worker. #[derive(Encode, Decode)] pub struct Handshake { /// The executor parameters. diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 69bd2f920e26..f16eb76aaf91 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -76,6 +76,8 @@ macro_rules! decl_worker_main { #[cfg(target_os = "linux")] let status = if security::unshare_user_namespace_and_change_root( WorkerKind::Execute, + // We're not accessing any files, so we can try to pivot_root in the temp + // dir without conflicts with other processes. &std::env::temp_dir(), ) .is_ok() @@ -184,7 +186,7 @@ pub fn worker_event_loop( "Node and worker version mismatch, node needs restarting, forcing shutdown", ); kill_parent_node_in_emergency(); - let err = io::Error::new(io::ErrorKind::Unsupported, "Version mismatch"); + let err = String::from("Version mismatch"); worker_shutdown_message(worker_kind, worker_pid, err); return } @@ -195,17 +197,20 @@ pub fn worker_event_loop( // Landlock is enabled a bit later after the socket has been removed. { // Call based on whether we can change root. Error out if it should work but fails. + // + // NOTE: This should not be called in a multi-threaded context (i.e. inside the tokio + // runtime). `unshare(2)`: + // + // > CLONE_NEWUSER requires that the calling process is not threaded. #[cfg(target_os = "linux")] if security_status.can_unshare_user_namespace_and_change_root { - if let Err(err_ctx) = + if let Err(err) = security::unshare_user_namespace_and_change_root(worker_kind, &worker_dir_path) { - let err = io::Error::last_os_error(); gum::error!( target: LOG_TARGET, %worker_kind, %worker_pid, - %err_ctx, ?worker_dir_path, "Could not change root to be the worker cache path: {}", err @@ -253,7 +258,7 @@ pub fn worker_event_loop( // It's never `Ok` because it's `Ok(Never)`. .unwrap_err(); - worker_shutdown_message(worker_kind, worker_pid, err); + worker_shutdown_message(worker_kind, worker_pid, err.to_string()); // We don't want tokio to wait for the tasks to finish. We want to bring down the worker as fast // as possible and not wait for stalled validation to finish. This isn't strictly necessary now, @@ -262,8 +267,8 @@ pub fn worker_event_loop( } /// Provide a consistent message on worker shutdown. -fn worker_shutdown_message(worker_kind: WorkerKind, worker_pid: u32, err: io::Error) { - gum::debug!(target: LOG_TARGET, %worker_pid, "quitting pvf worker ({}): {:?}", worker_kind, err); +fn worker_shutdown_message(worker_kind: WorkerKind, worker_pid: u32, err: String) { + gum::debug!(target: LOG_TARGET, %worker_pid, "quitting pvf worker ({}): {}", worker_kind, err); } /// Loop that runs in the CPU time monitor thread on prepare and execute jobs. Continuously wakes up diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index ec3ac80ba661..6a17888154b8 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -27,14 +27,17 @@ //! - Remove env vars use crate::{worker::WorkerKind, LOG_TARGET}; -use std::path::Path; /// Unshare the user namespace and change root to be the artifact directory. +/// +/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`: +/// +/// > CLONE_NEWUSER requires that the calling process is not threaded. #[cfg(target_os = "linux")] pub fn unshare_user_namespace_and_change_root( worker_kind: WorkerKind, - worker_dir_path: &Path, -) -> Result<(), &'static str> { + worker_dir_path: &std::path::Path, +) -> Result<(), String> { use std::{ffi::CString, os::unix::ffi::OsStrExt, ptr}; let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) @@ -42,65 +45,72 @@ pub fn unshare_user_namespace_and_change_root( let root_c = CString::new("/").expect("input contains no 0 bytes; qed"); let dot_c = CString::new(".").expect("input contains no 0 bytes; qed"); - // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps (2) - // and (3) are adapted from the example in pivot_root(2), with the additional change - // described in the `pivot_root(".", ".")` section. - unsafe { - // 1. `unshare` the user and the mount namespaces. - // - // Separate calls: in case one flag succeeds but other fails, we give a more precise error. - if libc::unshare(libc::CLONE_NEWUSER) < 0 { - return Err("unshare user namespace") - } - if libc::unshare(libc::CLONE_NEWNS) < 0 { - return Err("unshare mount namespace") - } + // Wrapper around all the work to prevent repetitive error handling. + // + // # Errors + // + // It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does + // not give the context of which call failed, so we return a &str error. + || -> Result<(), &'static str> { + // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps + // (2) and (3) are adapted from the example in pivot_root(2), with the additional + // change described in the `pivot_root(".", ".")` section. + unsafe { + // 1. `unshare` the user and the mount namespaces. + if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 { + return Err("unshare user and mount namespaces") + } - // 2. Setup mounts. - // - // Ensure that new root and its parent mount don't have shared propagation (which would - // cause pivot_root() to return an error), and prevent propagation of mount events to the - // initial mount namespace. - if libc::mount( - ptr::null(), - root_c.as_ptr(), - ptr::null(), - libc::MS_REC | libc::MS_PRIVATE, - ptr::null(), - ) < 0 - { - return Err("mount MS_PRIVATE") - } - if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { - return Err("chdir to worker dir path") - } - // Ensure that the new root is a mount point. - let additional_flags = - if let WorkerKind::Execute = worker_kind { libc::MS_RDONLY } else { 0 }; - if libc::mount( - dot_c.as_ptr(), - dot_c.as_ptr(), - ptr::null(), // ignored when MS_BIND is used - libc::MS_BIND | - libc::MS_REC | libc::MS_NOEXEC | - libc::MS_NODEV | libc::MS_NOSUID | - libc::MS_NOATIME | additional_flags, - ptr::null(), // ignored when MS_BIND is used - ) < 0 - { - return Err("mount MS_BIND") - } + // 2. Setup mounts. + // + // Ensure that new root and its parent mount don't have shared propagation (which would + // cause pivot_root() to return an error), and prevent propagation of mount events to + // the initial mount namespace. + if libc::mount( + ptr::null(), + root_c.as_ptr(), + ptr::null(), + libc::MS_REC | libc::MS_PRIVATE, + ptr::null(), + ) < 0 + { + return Err("mount MS_PRIVATE") + } + if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { + return Err("chdir to worker dir path") + } + // Ensure that the new root is a mount point. + let additional_flags = + if let WorkerKind::Execute = worker_kind { libc::MS_RDONLY } else { 0 }; + if libc::mount( + dot_c.as_ptr(), + dot_c.as_ptr(), + ptr::null(), // ignored when MS_BIND is used + libc::MS_BIND | + libc::MS_REC | libc::MS_NOEXEC | + libc::MS_NODEV | libc::MS_NOSUID | + libc::MS_NOATIME | additional_flags, + ptr::null(), // ignored when MS_BIND is used + ) < 0 + { + return Err("mount MS_BIND") + } - // 3. `pivot_root` to the artifact directory. - if libc::syscall(libc::SYS_pivot_root, &dot_c, &dot_c) < 0 { - return Err("pivot_root") - } - if libc::umount2(dot_c.as_ptr(), libc::MNT_DETACH) < 0 { - return Err("umount the old root mount point") + // 3. `pivot_root` to the artifact directory. + if libc::syscall(libc::SYS_pivot_root, &dot_c, &dot_c) < 0 { + return Err("pivot_root") + } + if libc::umount2(dot_c.as_ptr(), libc::MNT_DETACH) < 0 { + return Err("umount the old root mount point") + } } - } - Ok(()) + Ok(()) + }() + .map_err(|err_ctx| { + let err = std::io::Error::last_os_error(); + format!("{}: {}", err_ctx, err) + }) } /// Delete all env vars to prevent malicious code from accessing them. diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 2efb7e76e61a..292771cb4cc6 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -882,10 +882,9 @@ fn pulse_every(interval: std::time::Duration) -> impl futures::Stream /// Check if we can sandbox the root and emit a warning if not. /// -/// We do this check by spawning a new process and trying to sandbox it. The process must be -/// single-threaded, so we can't just fork here. To get as close as possible to running the check in -/// a worker, we try it... in a worker. The expected return status is 0 on success and -1 on -/// failure. +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. fn check_can_unshare_user_namespace_and_change_root( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, From 8cedd7bb0578f5193b4e7f36fc2250a895d55b22 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 11 Sep 2023 16:03:33 +0200 Subject: [PATCH 09/18] Use cache as location for worker dirs --- polkadot/node/core/pvf/src/artifacts.rs | 5 ++-- polkadot/node/core/pvf/src/execute/queue.rs | 8 ++++++ .../node/core/pvf/src/execute/worker_intf.rs | 2 ++ polkadot/node/core/pvf/src/host.rs | 2 ++ polkadot/node/core/pvf/src/prepare/pool.rs | 9 +++++++ .../node/core/pvf/src/prepare/worker_intf.rs | 12 +++++++-- polkadot/node/core/pvf/src/worker_intf.rs | 27 ++++++++++--------- polkadot/node/core/pvf/tests/it/main.rs | 8 ++++-- .../node/core/pvf/tests/it/worker_common.rs | 5 +++- 9 files changed, 59 insertions(+), 19 deletions(-) diff --git a/polkadot/node/core/pvf/src/artifacts.rs b/polkadot/node/core/pvf/src/artifacts.rs index 112fc76e7e39..5a1767af75b7 100644 --- a/polkadot/node/core/pvf/src/artifacts.rs +++ b/polkadot/node/core/pvf/src/artifacts.rs @@ -172,9 +172,10 @@ impl Artifacts { /// /// The recognized artifacts will be filled in the table and unrecognized will be removed. pub async fn new(cache_path: &Path) -> Self { - // Make sure that the cache path directory and all its parents are created. - // First delete the entire cache. Nodes are long-running so this should populate shortly. + // First delete the entire cache. This includes artifacts and any leftover worker dirs (see + // [`WorkerDir`]). Nodes are long-running so this should populate shortly. let _ = tokio::fs::remove_dir_all(cache_path).await; + // Make sure that the cache path directory and all its parents are created. let _ = tokio::fs::create_dir_all(cache_path).await; Self { artifacts: HashMap::new() } diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index 3729700caf00..aca604f0de21 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -140,6 +140,7 @@ struct Queue { // Some variables related to the current session. program_path: PathBuf, + cache_path: PathBuf, spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, @@ -154,6 +155,7 @@ impl Queue { fn new( metrics: Metrics, program_path: PathBuf, + cache_path: PathBuf, worker_capacity: usize, spawn_timeout: Duration, node_version: Option, @@ -163,6 +165,7 @@ impl Queue { Self { metrics, program_path, + cache_path, spawn_timeout, node_version, security_status, @@ -409,6 +412,7 @@ fn spawn_extra_worker(queue: &mut Queue, job: ExecuteJob) { queue.mux.push( spawn_worker_task( queue.program_path.clone(), + queue.cache_path.clone(), job, queue.spawn_timeout, queue.node_version.clone(), @@ -428,6 +432,7 @@ fn spawn_extra_worker(queue: &mut Queue, job: ExecuteJob) { /// execute other jobs with a compatible execution environment. async fn spawn_worker_task( program_path: PathBuf, + cache_path: PathBuf, job: ExecuteJob, spawn_timeout: Duration, node_version: Option, @@ -438,6 +443,7 @@ async fn spawn_worker_task( loop { match super::worker_intf::spawn( &program_path, + &cache_path, job.executor_params.clone(), spawn_timeout, node_version.as_deref(), @@ -503,6 +509,7 @@ fn assign(queue: &mut Queue, worker: Worker, job: ExecuteJob) { pub fn start( metrics: Metrics, program_path: PathBuf, + cache_path: PathBuf, worker_capacity: usize, spawn_timeout: Duration, node_version: Option, @@ -512,6 +519,7 @@ pub fn start( let run = Queue::new( metrics, program_path, + cache_path, worker_capacity, spawn_timeout, node_version, diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index e61a11bdcc8d..95a6e26fa583 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -42,6 +42,7 @@ use tokio::{io, net::UnixStream}; /// Sends a handshake message to the worker as soon as it is spawned. pub async fn spawn( program_path: &Path, + cache_path: &Path, executor_params: ExecutorParams, spawn_timeout: Duration, node_version: Option<&str>, @@ -55,6 +56,7 @@ pub async fn spawn( let (mut idle_worker, worker_handle) = spawn_with_program_path( "execute", program_path, + cache_path, &extra_args, spawn_timeout, security_status, diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 292771cb4cc6..091d780e4944 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -218,6 +218,7 @@ pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future (ValidationHost, impl Future>; struct Pool { // Some variables related to the current session. program_path: PathBuf, + cache_path: PathBuf, spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, @@ -131,6 +132,7 @@ struct Fatal; async fn run( Pool { program_path, + cache_path, spawn_timeout, node_version, security_status, @@ -159,6 +161,7 @@ async fn run( handle_to_pool( &metrics, &program_path, + &cache_path, spawn_timeout, node_version.clone(), security_status.clone(), @@ -206,6 +209,7 @@ async fn purge_dead( fn handle_to_pool( metrics: &Metrics, program_path: &Path, + cache_path: &Path, spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, @@ -220,6 +224,7 @@ fn handle_to_pool( mux.push( spawn_worker_task( program_path.to_owned(), + cache_path.to_owned(), spawn_timeout, node_version, security_status, @@ -265,6 +270,7 @@ fn handle_to_pool( async fn spawn_worker_task( program_path: PathBuf, + cache_path: PathBuf, spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, @@ -274,6 +280,7 @@ async fn spawn_worker_task( loop { match worker_intf::spawn( &program_path, + &cache_path, spawn_timeout, node_version.as_deref(), security_status.clone(), @@ -460,6 +467,7 @@ fn handle_concluded_no_rip( pub fn start( metrics: Metrics, program_path: PathBuf, + cache_path: PathBuf, spawn_timeout: Duration, node_version: Option, security_status: SecurityStatus, @@ -470,6 +478,7 @@ pub fn start( let run = run(Pool { metrics, program_path, + cache_path, spawn_timeout, node_version, security_status, diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index a1e8031d44c0..7a3a543eac43 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -45,6 +45,7 @@ use tokio::{io, net::UnixStream}; /// Sends a handshake message to the worker as soon as it is spawned. pub async fn spawn( program_path: &Path, + cache_path: &Path, spawn_timeout: Duration, node_version: Option<&str>, security_status: SecurityStatus, @@ -54,8 +55,15 @@ pub async fn spawn( extra_args.extend_from_slice(&["--node-impl-version", node_version]); } - spawn_with_program_path("prepare", program_path, &extra_args, spawn_timeout, security_status) - .await + spawn_with_program_path( + "prepare", + program_path, + cache_path, + &extra_args, + spawn_timeout, + security_status, + ) + .await } /// Outcome of PVF preparation. diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index 8a4e664a4e72..13cd6fb2c171 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -47,6 +47,8 @@ pub const JOB_TIMEOUT_WALL_CLOCK_FACTOR: u32 = 4; /// /// - `program_path`: The path to the program. /// +/// - `cache_path`: The path to the artifact cache. +/// /// - `extra_args`: Optional extra CLI arguments to the program. NOTE: Should only contain data /// required before the handshake, like node/worker versions for the version check. Other data /// should go through the handshake. @@ -58,12 +60,13 @@ pub const JOB_TIMEOUT_WALL_CLOCK_FACTOR: u32 = 4; pub async fn spawn_with_program_path( debug_id: &'static str, program_path: impl Into, + cache_path: &Path, extra_args: &[&str], spawn_timeout: Duration, security_status: SecurityStatus, ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { let program_path = program_path.into(); - let worker_dir = WorkerDir::new(debug_id).await?; + let worker_dir = WorkerDir::new(debug_id, cache_path).await?; with_transient_socket_path(&worker_dir.path.clone(), |socket_path| { let socket_path = socket_path.to_owned(); @@ -188,6 +191,7 @@ pub async fn tmppath_in(prefix: &str, dir: &Path) -> io::Result { } /// The same as [`tmppath_in`], but uses [`std::env::temp_dir`] as the directory. +#[cfg(test)] pub async fn tmppath(prefix: &str) -> io::Result { let temp_dir = PathBuf::from(std::env::temp_dir()); tmppath_in(prefix, &temp_dir).await @@ -369,24 +373,23 @@ pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result Ok(buf) } -/// A temporary worker cache that contains only files needed by the worker. The worker will change -/// its root (the `/` directory) to this cache directory; it should have access to no other paths on -/// its filesystem. The worker cache should live in a tmp directory in the host's filesystem. +/// A temporary worker dir that contains only files needed by the worker. The worker will change its +/// root (the `/` directory) to this directory; it should have access to no other paths on its +/// filesystem. /// /// NOTE: This struct cleans up its associated directory when it is dropped. Therefore it should not /// implement `Clone`. /// /// # File structure /// -/// The overall file structure for the PVF system is as follows. The `worker-dir`s are managed by +/// The overall file structure for the PVF system is as follows. The `worker-dir-X`s are managed by /// this struct. /// /// ```nocompile -/// + /[...]/cache_path/ +/// + // /// - artifact-1 /// - artifact-2 /// - [...] -/// + /tmp/ /// - worker-dir-1/ (new `/` for worker-1) /// + socket (created by host) /// + tmp-artifact (created by host) (prepare-only) @@ -400,10 +403,10 @@ pub struct WorkerDir { } impl WorkerDir { - /// Creates a new, empty worker cache with a random name in a tmp location. - pub async fn new(debug_id: &'static str) -> Result { + /// Creates a new, empty worker dir with a random name in the given cache dir. + pub async fn new(debug_id: &'static str, cache_dir: &Path) -> Result { let prefix = format!("worker-dir-{}-", debug_id); - let path = tmppath(&prefix).await.map_err(|_| SpawnErr::TmpPath)?; + let path = tmppath_in(&prefix, cache_dir).await.map_err(|_| SpawnErr::TmpPath)?; tokio::fs::create_dir(&path) .await .map_err(|err| SpawnErr::Fs(err.to_string()))?; @@ -411,8 +414,8 @@ impl WorkerDir { } } -// Try to clean up the temporary worker cache at the end of the worker's lifetime. It should be in a -// temporary directory location, but we make a best effort not to leave it around. +// Try to clean up the temporary worker dir at the end of the worker's lifetime. It should be wiped +// on startup, but we make a best effort not to leave it around. impl Drop for WorkerDir { fn drop(&mut self) { let _ = std::fs::remove_dir_all(&self.path); diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 12d87ee44262..085387beba2e 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -282,8 +282,12 @@ async fn deleting_prepared_artifact_does_not_dispute() { { // Get the artifact path (asserting it exists). let mut cache_dir: Vec<_> = std::fs::read_dir(cache_dir).unwrap().collect(); - assert_eq!(cache_dir.len(), 1); - let artifact_path = cache_dir.pop().unwrap().unwrap(); + // Should contain the artifact and the worker dir. + assert_eq!(cache_dir.len(), 2); + let mut artifact_path = cache_dir.pop().unwrap().unwrap(); + if artifact_path.path().is_dir() { + artifact_path = cache_dir.pop().unwrap().unwrap(); + } // Delete the artifact. std::fs::remove_file(artifact_path.path()).unwrap(); diff --git a/polkadot/node/core/pvf/tests/it/worker_common.rs b/polkadot/node/core/pvf/tests/it/worker_common.rs index d9e801e4b84d..a3bb41a6da80 100644 --- a/polkadot/node/core/pvf/tests/it/worker_common.rs +++ b/polkadot/node/core/pvf/tests/it/worker_common.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use std::time::Duration; +use std::{env, time::Duration}; use polkadot_node_core_pvf::{ testing::{spawn_with_program_path, SpawnErr}, @@ -29,6 +29,7 @@ async fn spawn_immediate_exit() { let result = spawn_with_program_path( "integration-test", PUPPET_EXE, + &env::temp_dir(), &["exit"], Duration::from_secs(2), SecurityStatus::default(), @@ -42,6 +43,7 @@ async fn spawn_timeout() { let result = spawn_with_program_path( "integration-test", PUPPET_EXE, + &env::temp_dir(), &["sleep"], Duration::from_secs(2), SecurityStatus::default(), @@ -55,6 +57,7 @@ async fn should_connect() { let _ = spawn_with_program_path( "integration-test", PUPPET_EXE, + &env::temp_dir(), &["prepare-worker"], Duration::from_secs(2), SecurityStatus::default(), From ccc329e7453045427c2f9bb583b6932a69da7081 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Tue, 12 Sep 2023 17:00:23 +0200 Subject: [PATCH 10/18] Clear env vars when spawning process --- .../node/core/pvf/common/src/worker/mod.rs | 20 ++++++-- .../core/pvf/common/src/worker/security.rs | 51 ++++++++----------- polkadot/node/core/pvf/src/worker_intf.rs | 10 +++- 3 files changed, 45 insertions(+), 36 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 322548d2037a..333e1002c4b0 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -191,8 +191,7 @@ pub fn worker_event_loop( "Node and worker version mismatch, node needs restarting, forcing shutdown", ); kill_parent_node_in_emergency(); - let err = String::from("Version mismatch"); - worker_shutdown_message(worker_kind, worker_pid, err); + worker_shutdown_message(worker_kind, worker_pid, "Version mismatch"); return } } @@ -226,7 +225,18 @@ pub fn worker_event_loop( worker_dir_path = std::path::Path::new("/").to_owned(); } - security::remove_env_vars(worker_kind); + if !security::check_env_vars_were_cleared(worker_kind, worker_pid) { + let err = "not all env vars were cleared when spawning the process"; + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "{}", + err + ); + worker_shutdown_message(worker_kind, worker_pid, err); + return + } } // Run the main worker loop. @@ -263,7 +273,7 @@ pub fn worker_event_loop( // It's never `Ok` because it's `Ok(Never)`. .unwrap_err(); - worker_shutdown_message(worker_kind, worker_pid, err.to_string()); + worker_shutdown_message(worker_kind, worker_pid, &err.to_string()); // We don't want tokio to wait for the tasks to finish. We want to bring down the worker as fast // as possible and not wait for stalled validation to finish. This isn't strictly necessary now, @@ -272,7 +282,7 @@ pub fn worker_event_loop( } /// Provide a consistent message on worker shutdown. -fn worker_shutdown_message(worker_kind: WorkerKind, worker_pid: u32, err: String) { +fn worker_shutdown_message(worker_kind: WorkerKind, worker_pid: u32, err: &str) { gum::debug!(target: LOG_TARGET, %worker_pid, "quitting pvf worker ({}): {}", worker_kind, err); } diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index 6a17888154b8..41f854f5a682 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -113,8 +113,11 @@ pub fn unshare_user_namespace_and_change_root( }) } -/// Delete all env vars to prevent malicious code from accessing them. -pub fn remove_env_vars(worker_kind: WorkerKind) { +/// Require env vars to have been removed when spawning the process, to prevent malicious code from +/// accessing them. +pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool { + let mut ok = true; + for (key, value) in std::env::vars_os() { // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of // randomness for malicious code. In the future we can remove it also and log in the host; @@ -122,37 +125,25 @@ pub fn remove_env_vars(worker_kind: WorkerKind) { if key == "RUST_LOG" { continue } - - // In case of a key or value that would cause [`env::remove_var` to - // panic](https://doc.rust-lang.org/std/env/fn.remove_var.html#panics), we first log a - // warning and then proceed to attempt to remove the env var. - let mut err_reasons = vec![]; - let (key_str, value_str) = (key.to_str(), value.to_str()); - if key.is_empty() { - err_reasons.push("key is empty"); - } - if key_str.is_some_and(|s| s.contains('=')) { - err_reasons.push("key contains '='"); - } - if key_str.is_some_and(|s| s.contains('\0')) { - err_reasons.push("key contains null character"); - } - if value_str.is_some_and(|s| s.contains('\0')) { - err_reasons.push("value contains null character"); - } - if !err_reasons.is_empty() { - gum::warn!( - target: LOG_TARGET, - %worker_kind, - ?key, - ?value, - "Attempting to remove badly-formatted env var, this may cause the PVF worker to crash. Please remove it yourself. Reasons: {:?}", - err_reasons - ); + // An exception for MacOS. This is not a secure platform anyway, so we let it slide. + #[cfg(target_os = "macos")] + if key == "__CF_USER_TEXT_ENCODING" { + continue } - std::env::remove_var(key); + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?key, + ?value, + "env var was present that should have been removed", + ); + + ok = false; } + + ok } /// The [landlock] docs say it best: diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index 13cd6fb2c171..82bc912dc9ba 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -269,7 +269,15 @@ impl WorkerHandle { } args }; - let mut child = process::Command::new(program.as_ref()) + + // Clear all env vars from the spawned process. + let mut command = process::Command::new(program.as_ref()); + command.env_clear(); + // Add back any env vars we want to keep. + if let Ok(value) = std::env::var("RUST_LOG") { + command.env("RUST_LOG", value); + } + let mut child = command .args(extra_args) .arg("--worker-dir-path") .arg(worker_dir_path.as_ref().as_os_str()) From 2e6bb65b735827fe19cd3c737086b056e3ab7864 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 14 Sep 2023 10:43:44 +0200 Subject: [PATCH 11/18] Fix compiler error, add comment --- polkadot/node/core/pvf/common/src/worker/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 333e1002c4b0..006da4965856 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -219,7 +219,7 @@ pub fn worker_event_loop( "Could not change root to be the worker cache path: {}", err ); - worker_shutdown_message(worker_kind, worker_pid, err); + worker_shutdown_message(worker_kind, worker_pid, &err); return } worker_dir_path = std::path::Path::new("/").to_owned(); @@ -247,6 +247,7 @@ pub fn worker_event_loop( let stream = UnixStream::connect(&socket_path).await?; let _ = tokio::fs::remove_file(&socket_path).await; + // Enable landlock now so we don't need an exception for the socket. #[cfg(target_os = "linux")] if security_status.can_enable_landlock { let landlock_status = From a5efc37fea93cdef9684aed2451a68356ac46eda Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 14 Sep 2023 11:52:57 +0200 Subject: [PATCH 12/18] Rearrange worker startup --- .../node/core/pvf/common/src/worker/mod.rs | 78 +++++++++++-------- 1 file changed, 46 insertions(+), 32 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 006da4965856..7497c21af2f6 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -75,7 +75,7 @@ macro_rules! decl_worker_main { "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] let status = if security::unshare_user_namespace_and_change_root( - WorkerKind::Execute, + $crate::worker::WorkerKind::Execute, // We're not accessing any files, so we can try to pivot_root in the temp // dir without conflicts with other processes. &std::env::temp_dir(), @@ -179,6 +179,30 @@ pub fn worker_event_loop( let worker_pid = std::process::id(); gum::debug!(target: LOG_TARGET, %worker_pid, ?worker_dir_path, "starting pvf worker ({})", worker_kind); + // Connect to the socket. + let stream = || -> std::io::Result { + let socket_path = worker_dir::socket(&worker_dir_path); + let std_stream = std::os::unix::net::UnixStream::connect(&socket_path)?; + std_stream.set_nonblocking(true)?; // See note for `from_std`. + let stream = UnixStream::from_std(std_stream); + std::fs::remove_file(&socket_path)?; + stream + }(); + let stream = match stream { + Ok(s) => s, + Err(err) => { + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "{}", + err + ); + worker_shutdown_message(worker_kind, worker_pid, &err.to_string()); + return + }, + }; + // Check for a mismatch between the node and worker versions. if let (Some(node_version), Some(worker_version)) = (node_version, worker_version) { if node_version != worker_version { @@ -197,8 +221,6 @@ pub fn worker_event_loop( } // Enable some security features. - // - // Landlock is enabled a bit later after the socket has been removed. { // Call based on whether we can change root. Error out if it should work but fails. // @@ -211,6 +233,7 @@ pub fn worker_event_loop( if let Err(err) = security::unshare_user_namespace_and_change_root(worker_kind, &worker_dir_path) { + // The filesystem may be in an inconsistent state, bail out. gum::error!( target: LOG_TARGET, %worker_kind, @@ -225,6 +248,25 @@ pub fn worker_event_loop( worker_dir_path = std::path::Path::new("/").to_owned(); } + #[cfg(target_os = "linux")] + if security_status.can_enable_landlock { + let landlock_status = + security::landlock::enable_for_worker(worker_kind, &worker_dir_path); + if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) { + // We previously were able to enable, so this should never happen. + // + // TODO: Make this a real error in secure-mode. See: + // + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", + landlock_status + ); + } + } + if !security::check_env_vars_were_cleared(worker_kind, worker_pid) { let err = "not all env vars were cleared when spawning the process"; gum::error!( @@ -242,35 +284,7 @@ pub fn worker_event_loop( // Run the main worker loop. let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); let err = rt - .block_on(async move { - let socket_path = worker_dir::socket(&worker_dir_path); - let stream = UnixStream::connect(&socket_path).await?; - let _ = tokio::fs::remove_file(&socket_path).await; - - // Enable landlock now so we don't need an exception for the socket. - #[cfg(target_os = "linux")] - if security_status.can_enable_landlock { - let landlock_status = - security::landlock::enable_for_worker(worker_kind, &worker_dir_path); - if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) { - // We previously were able to enable landlock, so this should never happen. - // - // TODO: Make this a real error in secure-mode. See: - // - gum::error!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", - landlock_status - ); - } - } - - let result = event_loop(stream, worker_dir_path).await; - - result - }) + .block_on(event_loop(stream, worker_dir_path)) // It's never `Ok` because it's `Ok(Never)`. .unwrap_err(); From b413c27eb7f0a6b3caa88b9e80a1d5d135e81a9c Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 14 Sep 2023 14:08:28 +0200 Subject: [PATCH 13/18] Fix runtime crash (tokio socket not created in context of a runtime) --- polkadot/node/core/pvf/common/src/worker/mod.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 7497c21af2f6..67a23440455b 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -24,6 +24,7 @@ use futures::never::Never; use std::{ any::Any, fmt, + os::unix::net::UnixStream as StdUnixStream, path::PathBuf, sync::mpsc::{Receiver, RecvTimeoutError}, time::Duration, @@ -180,13 +181,12 @@ pub fn worker_event_loop( gum::debug!(target: LOG_TARGET, %worker_pid, ?worker_dir_path, "starting pvf worker ({})", worker_kind); // Connect to the socket. - let stream = || -> std::io::Result { + let stream = || -> std::io::Result { let socket_path = worker_dir::socket(&worker_dir_path); - let std_stream = std::os::unix::net::UnixStream::connect(&socket_path)?; - std_stream.set_nonblocking(true)?; // See note for `from_std`. - let stream = UnixStream::from_std(std_stream); + let stream = StdUnixStream::connect(&socket_path)?; + stream.set_nonblocking(true)?; // See note for `from_std`. std::fs::remove_file(&socket_path)?; - stream + Ok(stream) }(); let stream = match stream { Ok(s) => s, @@ -284,7 +284,10 @@ pub fn worker_event_loop( // Run the main worker loop. let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); let err = rt - .block_on(event_loop(stream, worker_dir_path)) + .block_on(async move { + let stream = UnixStream::from_std(std_stream)?; + event_loop(stream, worker_dir_path).await + }) // It's never `Ok` because it's `Ok(Never)`. .unwrap_err(); From 70e62d8753a697748c58585476a5d1a2e02c4646 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 14 Sep 2023 14:23:34 +0200 Subject: [PATCH 14/18] Fix compiler error --- polkadot/node/core/pvf/common/src/worker/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 67a23440455b..aba7dba28fbc 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -285,7 +285,7 @@ pub fn worker_event_loop( let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); let err = rt .block_on(async move { - let stream = UnixStream::from_std(std_stream)?; + let stream = UnixStream::from_std(stream)?; event_loop(stream, worker_dir_path).await }) // It's never `Ok` because it's `Ok(Never)`. From ea1b48d1d31823cb20e7800e8b0bb47058a9c102 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 17 Sep 2023 10:07:39 +0200 Subject: [PATCH 15/18] Fix pivot_root and add assertions - Fix `libc::syscall` call (use `as_ptr()` instead of references) - Use absolute path for MS_BIND mount --- .../core/pvf/common/src/worker/security.rs | 83 +++++++++++++++---- 1 file changed, 68 insertions(+), 15 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index 41f854f5a682..229826f3273b 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -31,19 +31,57 @@ use crate::{worker::WorkerKind, LOG_TARGET}; /// Unshare the user namespace and change root to be the artifact directory. /// /// NOTE: This should not be called in a multi-threaded context. `unshare(2)`: -/// -/// > CLONE_NEWUSER requires that the calling process is not threaded. +/// "CLONE_NEWUSER requires that the calling process is not threaded." #[cfg(target_os = "linux")] pub fn unshare_user_namespace_and_change_root( worker_kind: WorkerKind, + worker_pid: u32, worker_dir_path: &std::path::Path, ) -> Result<(), String> { - use std::{ffi::CString, os::unix::ffi::OsStrExt, ptr}; + use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr}; + + // The following was copied from the `cstr_core` crate. + // + // TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723 + #[inline] + #[doc(hidden)] + const fn cstr_is_valid(bytes: &[u8]) -> bool { + if bytes.is_empty() || bytes[bytes.len() - 1] != 0 { + return false + } + + let mut index = 0; + while index < bytes.len() - 1 { + if bytes[index] == 0 { + return false + } + index += 1; + } + true + } + + macro_rules! cstr { + ($e:expr) => {{ + const STR: &[u8] = concat!($e, "\0").as_bytes(); + const STR_VALID: bool = cstr_is_valid(STR); + let _ = [(); 0 - (!(STR_VALID) as usize)]; + #[allow(unused_unsafe)] + unsafe { + core::ffi::CStr::from_bytes_with_nul_unchecked(STR) + } + }} + } + + gum::debug!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "unsharing the user namespace and calling pivot_root", + ); let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) .expect("on unix; the path will never contain 0 bytes; qed"); - let root_c = CString::new("/").expect("input contains no 0 bytes; qed"); - let dot_c = CString::new(".").expect("input contains no 0 bytes; qed"); // Wrapper around all the work to prevent repetitive error handling. // @@ -68,7 +106,7 @@ pub fn unshare_user_namespace_and_change_root( // the initial mount namespace. if libc::mount( ptr::null(), - root_c.as_ptr(), + cstr!("/").as_ptr(), ptr::null(), libc::MS_REC | libc::MS_PRIVATE, ptr::null(), @@ -76,15 +114,16 @@ pub fn unshare_user_namespace_and_change_root( { return Err("mount MS_PRIVATE") } - if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { - return Err("chdir to worker dir path") - } // Ensure that the new root is a mount point. let additional_flags = - if let WorkerKind::Execute = worker_kind { libc::MS_RDONLY } else { 0 }; + if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind { + libc::MS_RDONLY + } else { + 0 + }; if libc::mount( - dot_c.as_ptr(), - dot_c.as_ptr(), + worker_dir_path_c.as_ptr(), + worker_dir_path_c.as_ptr(), ptr::null(), // ignored when MS_BIND is used libc::MS_BIND | libc::MS_REC | libc::MS_NOEXEC | @@ -97,10 +136,13 @@ pub fn unshare_user_namespace_and_change_root( } // 3. `pivot_root` to the artifact directory. - if libc::syscall(libc::SYS_pivot_root, &dot_c, &dot_c) < 0 { + if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { + return Err("chdir to worker dir path") + } + if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 { return Err("pivot_root") } - if libc::umount2(dot_c.as_ptr(), libc::MNT_DETACH) < 0 { + if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 { return Err("umount the old root mount point") } } @@ -110,7 +152,18 @@ pub fn unshare_user_namespace_and_change_root( .map_err(|err_ctx| { let err = std::io::Error::last_os_error(); format!("{}: {}", err_ctx, err) - }) + })?; + + // Do some assertions. + if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { + return Err("expected current dir after pivot_root to be `/`".into()) + } + env::set_current_dir("..").map_err(|err| err.to_string())?; + if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { + return Err("expected not to be able to break out of new root by doing `..`".into()) + } + + Ok(()) } /// Require env vars to have been removed when spawning the process, to prevent malicious code from From a696d406b0ada3eb8664a1eff5e7410e11e81b35 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 17 Sep 2023 18:37:34 +0200 Subject: [PATCH 16/18] Add a couple of tests to de-befuddle myself --- polkadot/node/core/pvf/tests/it/adder.rs | 36 +++++++++++++++++- polkadot/node/core/pvf/tests/it/main.rs | 47 +++++++++++++++++++++++- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/polkadot/node/core/pvf/tests/it/adder.rs b/polkadot/node/core/pvf/tests/it/adder.rs index bad7a66054c9..8bdd09db208a 100644 --- a/polkadot/node/core/pvf/tests/it/adder.rs +++ b/polkadot/node/core/pvf/tests/it/adder.rs @@ -100,7 +100,7 @@ async fn execute_bad_block_on_parent() { let host = TestHost::new(); - let _ret = host + let _err = host .validate_candidate( adder::wasm_binary_unwrap(), ValidationParams { @@ -145,3 +145,37 @@ async fn stress_spawn() { futures::future::join_all((0..100).map(|_| execute(host.clone()))).await; } + +// With one worker, run multiple execution jobs serially. They should not conflict. +#[tokio::test] +async fn execute_can_run_serially() { + let host = std::sync::Arc::new(TestHost::new_with_config(|cfg| { + cfg.execute_workers_max_num = 1; + })); + + async fn execute(host: std::sync::Arc) { + let parent_head = HeadData { number: 0, parent_hash: [0; 32], post_state: hash_state(0) }; + let block_data = BlockData { state: 0, add: 512 }; + let ret = host + .validate_candidate( + adder::wasm_binary_unwrap(), + ValidationParams { + parent_head: GenericHeadData(parent_head.encode()), + block_data: GenericBlockData(block_data.encode()), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ) + .await + .unwrap(); + + let new_head = HeadData::decode(&mut &ret.head_data.0[..]).unwrap(); + + assert_eq!(new_head.number, 1); + assert_eq!(new_head.parent_hash, parent_head.hash()); + assert_eq!(new_head.post_state, hash_state(512)); + } + + futures::future::join_all((0..5).map(|_| execute(host.clone()))).await; +} diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 517561387077..f699b5840d8f 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -18,8 +18,8 @@ use assert_matches::assert_matches; use parity_scale_codec::Encode as _; use polkadot_node_core_pvf::{ - start, Config, InvalidCandidate, Metrics, PrepareJobKind, PvfPrepData, ValidationError, - ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, + start, Config, InvalidCandidate, Metrics, PrepareError, PrepareJobKind, PrepareStats, + PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, }; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::ExecutorParams; @@ -70,6 +70,33 @@ impl TestHost { Self { cache_dir, host: Mutex::new(host) } } + async fn precheck_pvf( + &self, + code: &[u8], + executor_params: ExecutorParams, + ) -> Result { + let (result_tx, result_rx) = futures::channel::oneshot::channel(); + + let code = sp_maybe_compressed_blob::decompress(code, 16 * 1024 * 1024) + .expect("Compression works"); + + self.host + .lock() + .await + .precheck_pvf( + PvfPrepData::from_code( + code.into(), + executor_params, + TEST_PREPARATION_TIMEOUT, + PrepareJobKind::Prechecking, + ), + result_tx, + ) + .await + .unwrap(); + result_rx.await.unwrap() + } + async fn validate_candidate( &self, code: &[u8], @@ -321,3 +348,19 @@ async fn deleting_prepared_artifact_does_not_dispute() { r => panic!("{:?}", r), } } + +// With one worker, run multiple preparation jobs serially. They should not conflict. +#[tokio::test] +async fn prepare_can_run_serially() { + let host = TestHost::new_with_config(|cfg| { + cfg.prepare_workers_hard_max_num = 1; + }); + + let _stats = host + .precheck_pvf(::adder::wasm_binary_unwrap(), Default::default()) + .await + .unwrap(); + + // Prepare a different wasm blob to prevent skipping work. + let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap(); +} From ad0ed8a3571432890d7509f3b22e335aeaf9309a Mon Sep 17 00:00:00 2001 From: Marcin S Date: Sun, 17 Sep 2023 18:43:52 +0200 Subject: [PATCH 17/18] Move socket to beginning of worker startup - Moved socket connection to beginning of worker startup. - Seemed to be causing landlock violations that didn't happen before. - Removed the conversion to a tokio socket, to rule this out as a cause (we have to eventually remove the tokio dependency anyway). - Realized we were still removing the socket on the host-side, which wasn't needed (on failed rendezvous the host will just wipe the whole worker dir). Removed this removal, to remove a potential cause. - Thought we needed a landlock exception for the socket, so I expanded `try_restrict` to allow for multiple exceptions (wasn't needed in the end, but left it in because why not) - Saw that Landlock was ignoring exceptions for files that didn't exist. I added a check for this case. It clued me in to the actual problem. - Added a bunch of logging and some tests to try to narrow down this befuddling issue. - The socket was fine. Tthe issue turned out to be that landlock exceptions are based on fd's and not paths. We were creating new files for each new job, so except for the first excepted files, no exceptions were accepted. - Applied an exception to the whole worker dir and called it a day. --- polkadot/node/core/pvf/common/src/execute.rs | 2 +- polkadot/node/core/pvf/common/src/lib.rs | 25 ++-- .../node/core/pvf/common/src/worker/mod.rs | 119 ++++++++++------ .../core/pvf/common/src/worker/security.rs | 115 +++++++++------ .../node/core/pvf/execute-worker/src/lib.rs | 33 +++-- .../node/core/pvf/prepare-worker/src/lib.rs | 23 +-- .../node/core/pvf/src/execute/worker_intf.rs | 6 +- polkadot/node/core/pvf/src/host.rs | 18 ++- .../node/core/pvf/src/prepare/worker_intf.rs | 32 ++--- polkadot/node/core/pvf/src/worker_intf.rs | 132 ++++++++---------- 10 files changed, 288 insertions(+), 217 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index 399b847791a9..b89ab089af1c 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -29,7 +29,7 @@ pub struct Handshake { } /// The response from an execution job on the worker. -#[derive(Encode, Decode)] +#[derive(Debug, Encode, Decode)] pub enum Response { /// The job completed successfully. Ok { diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs index 393697c340dc..53c287ea9709 100644 --- a/polkadot/node/core/pvf/common/src/lib.rs +++ b/polkadot/node/core/pvf/common/src/lib.rs @@ -31,8 +31,11 @@ pub use sp_tracing; const LOG_TARGET: &str = "parachain::pvf-common"; -use std::mem; -use tokio::io::{self, AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _}; +use std::{ + io::{Read, Write}, + mem, +}; +use tokio::io; #[cfg(feature = "test-utils")] pub mod tests { @@ -51,20 +54,22 @@ pub struct SecurityStatus { pub can_unshare_user_namespace_and_change_root: bool, } -/// Write some data prefixed by its length into `w`. -pub async fn framed_send(w: &mut (impl AsyncWrite + Unpin), buf: &[u8]) -> io::Result<()> { +/// Write some data prefixed by its length into `w`. Sync version of `framed_send` to avoid +/// dependency on tokio. +pub fn framed_send_blocking(w: &mut (impl Write + Unpin), buf: &[u8]) -> io::Result<()> { let len_buf = buf.len().to_le_bytes(); - w.write_all(&len_buf).await?; - w.write_all(buf).await?; + w.write_all(&len_buf)?; + w.write_all(buf)?; Ok(()) } -/// Read some data prefixed by its length from `r`. -pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result> { +/// Read some data prefixed by its length from `r`. Sync version of `framed_recv` to avoid +/// dependency on tokio. +pub fn framed_recv_blocking(r: &mut (impl Read + Unpin)) -> io::Result> { let mut len_buf = [0u8; mem::size_of::()]; - r.read_exact(&mut len_buf).await?; + r.read_exact(&mut len_buf)?; let len = usize::from_le_bytes(len_buf); let mut buf = vec![0; len]; - r.read_exact(&mut buf).await?; + r.read_exact(&mut buf)?; Ok(buf) } diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index aba7dba28fbc..59973f6cbbc6 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -24,12 +24,12 @@ use futures::never::Never; use std::{ any::Any, fmt, - os::unix::net::UnixStream as StdUnixStream, + os::unix::net::UnixStream, path::PathBuf, sync::mpsc::{Receiver, RecvTimeoutError}, time::Duration, }; -use tokio::{io, net::UnixStream, runtime::Runtime}; +use tokio::{io, runtime::Runtime}; /// Use this macro to declare a `fn main() {}` that will create an executable that can be used for /// spawning the desired worker. @@ -50,6 +50,8 @@ macro_rules! decl_worker_main { // See . $crate::sp_tracing::try_init_simple(); + let worker_pid = std::process::id(); + let args = std::env::args().collect::>(); if args.len() == 1 { print_help($expected_command); @@ -75,20 +77,25 @@ macro_rules! decl_worker_main { }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] - let status = if security::unshare_user_namespace_and_change_root( - $crate::worker::WorkerKind::Execute, + let status = if let Err(err) = security::unshare_user_namespace_and_change_root( + $crate::worker::WorkerKind::CheckPivotRoot, + worker_pid, // We're not accessing any files, so we can try to pivot_root in the temp // dir without conflicts with other processes. &std::env::temp_dir(), - ) - .is_ok() - { - 0 - } else { + ) { + // Write the error to stderr, log it on the host-side. + eprintln!("{}", err); -1 + } else { + 0 }; #[cfg(not(target_os = "linux"))] - let status = -1; + let status = { + // Write the error to stderr, log it on the host-side. + eprintln!("not available on macos"); + -1 + }; std::process::exit(status) }, @@ -153,6 +160,7 @@ pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50); pub enum WorkerKind { Prepare, Execute, + CheckPivotRoot, } impl fmt::Display for WorkerKind { @@ -160,6 +168,7 @@ impl fmt::Display for WorkerKind { match self { Self::Prepare => write!(f, "prepare"), Self::Execute => write!(f, "execute"), + Self::CheckPivotRoot => write!(f, "check pivot root"), } } } @@ -178,13 +187,61 @@ pub fn worker_event_loop( Fut: futures::Future>, { let worker_pid = std::process::id(); - gum::debug!(target: LOG_TARGET, %worker_pid, ?worker_dir_path, "starting pvf worker ({})", worker_kind); + gum::debug!( + target: LOG_TARGET, + %worker_pid, + ?worker_dir_path, + ?security_status, + "starting pvf worker ({})", + worker_kind + ); + + // Check for a mismatch between the node and worker versions. + if let (Some(node_version), Some(worker_version)) = (node_version, worker_version) { + if node_version != worker_version { + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + %node_version, + %worker_version, + "Node and worker version mismatch, node needs restarting, forcing shutdown", + ); + kill_parent_node_in_emergency(); + worker_shutdown_message(worker_kind, worker_pid, "Version mismatch"); + return + } + } + + // Make sure that we can read the worker dir path, and log its contents. + let entries = || -> Result, io::Error> { + std::fs::read_dir(&worker_dir_path)? + .map(|res| res.map(|e| e.file_name())) + .collect() + }(); + match entries { + Ok(entries) => + gum::trace!(target: LOG_TARGET, %worker_pid, ?worker_dir_path, "content of worker dir: {:?}", entries), + Err(err) => { + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "Could not read worker dir: {}", + err.to_string() + ); + worker_shutdown_message(worker_kind, worker_pid, &err.to_string()); + return + }, + } // Connect to the socket. - let stream = || -> std::io::Result { - let socket_path = worker_dir::socket(&worker_dir_path); - let stream = StdUnixStream::connect(&socket_path)?; - stream.set_nonblocking(true)?; // See note for `from_std`. + let socket_path = worker_dir::socket(&worker_dir_path); + let stream = || -> std::io::Result { + let stream = UnixStream::connect(&socket_path)?; + // Remove the socket here. We don't also need to do this on the host-side; on failed + // rendezvous, the host will delete the whole worker dir. std::fs::remove_file(&socket_path)?; Ok(stream) }(); @@ -203,23 +260,6 @@ pub fn worker_event_loop( }, }; - // Check for a mismatch between the node and worker versions. - if let (Some(node_version), Some(worker_version)) = (node_version, worker_version) { - if node_version != worker_version { - gum::error!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - %node_version, - %worker_version, - "Node and worker version mismatch, node needs restarting, forcing shutdown", - ); - kill_parent_node_in_emergency(); - worker_shutdown_message(worker_kind, worker_pid, "Version mismatch"); - return - } - } - // Enable some security features. { // Call based on whether we can change root. Error out if it should work but fails. @@ -230,9 +270,11 @@ pub fn worker_event_loop( // > CLONE_NEWUSER requires that the calling process is not threaded. #[cfg(target_os = "linux")] if security_status.can_unshare_user_namespace_and_change_root { - if let Err(err) = - security::unshare_user_namespace_and_change_root(worker_kind, &worker_dir_path) - { + if let Err(err) = security::unshare_user_namespace_and_change_root( + worker_kind, + worker_pid, + &worker_dir_path, + ) { // The filesystem may be in an inconsistent state, bail out. gum::error!( target: LOG_TARGET, @@ -251,7 +293,7 @@ pub fn worker_event_loop( #[cfg(target_os = "linux")] if security_status.can_enable_landlock { let landlock_status = - security::landlock::enable_for_worker(worker_kind, &worker_dir_path); + security::landlock::enable_for_worker(worker_kind, worker_pid, &worker_dir_path); if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) { // We previously were able to enable, so this should never happen. // @@ -284,10 +326,7 @@ pub fn worker_event_loop( // Run the main worker loop. let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); let err = rt - .block_on(async move { - let stream = UnixStream::from_std(stream)?; - event_loop(stream, worker_dir_path).await - }) + .block_on(event_loop(stream, worker_dir_path)) // It's never `Ok` because it's `Ok(Never)`. .unwrap_err(); diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index 229826f3273b..b7abf028f941 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -211,14 +211,14 @@ pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> /// [landlock]: https://docs.rs/landlock/latest/landlock/index.html #[cfg(target_os = "linux")] pub mod landlock { - pub use landlock::{path_beneath_rules, Access, AccessFs}; + pub use landlock::RulesetStatus; - use crate::worker::WorkerKind; - use landlock::{ - PathBeneath, PathFd, Ruleset, RulesetAttr, RulesetCreatedAttr, RulesetError, RulesetStatus, - ABI, + use crate::{worker::WorkerKind, LOG_TARGET}; + use landlock::*; + use std::{ + fmt, + path::{Path, PathBuf}, }; - use std::path::Path; /// Landlock ABI version. We use ABI V1 because: /// @@ -249,29 +249,56 @@ pub mod landlock { /// supports it or if it introduces some new feature that is beneficial to security. pub const LANDLOCK_ABI: ABI = ABI::V1; - /// Tried to enable landlock for the given kind of worker. + #[derive(Debug)] + pub enum TryRestrictError { + InvalidExceptionPath(PathBuf), + RulesetError(RulesetError), + } + + impl From for TryRestrictError { + fn from(err: RulesetError) -> Self { + Self::RulesetError(err) + } + } + + impl fmt::Display for TryRestrictError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::InvalidExceptionPath(path) => write!(f, "invalid exception path: {:?}", path), + Self::RulesetError(err) => write!(f, "ruleset error: {}", err.to_string()), + } + } + } + + impl std::error::Error for TryRestrictError {} + + /// Try to enable landlock for the given kind of worker. pub fn enable_for_worker( worker_kind: WorkerKind, + worker_pid: u32, worker_dir_path: &Path, - ) -> Result { - use crate::worker_dir; - - match worker_kind { + ) -> Result> { + let exceptions: Vec<(PathBuf, BitFlags)> = match worker_kind { WorkerKind::Prepare => { - let temp_artifact_dest = worker_dir::prepare_tmp_artifact(worker_dir_path); - - // Allow an exception for writing to the known file in the worker cache. - try_restrict(path_beneath_rules(&[&temp_artifact_dest], AccessFs::WriteFile)) - .map_err(|e| e.to_string()) + vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())] }, WorkerKind::Execute => { - let artifact_path = worker_dir::execute_artifact(worker_dir_path); - - // Allow an exception for reading from the known artifact path. - try_restrict(path_beneath_rules(&[&artifact_path], AccessFs::ReadFile)) - .map_err(|e| e.to_string()) + vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())] }, - } + WorkerKind::CheckPivotRoot => + panic!("this should only be passed for checking pivot_root; qed"), + }; + + gum::debug!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "enabling landlock with exceptions: {:?}", + exceptions, + ); + + Ok(try_restrict(exceptions)?) } // TODO: @@ -279,7 +306,9 @@ pub mod landlock { /// ABI is fully enabled on the current Linux environment. pub fn check_is_fully_enabled() -> bool { let status_from_thread: Result> = - match std::thread::spawn(|| try_restrict(std::iter::empty())).join() { + match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())) + .join() + { Ok(Ok(status)) => Ok(status), Ok(Err(ruleset_err)) => Err(ruleset_err.into()), Err(_err) => Err("a panic occurred in try_restrict".into()), @@ -299,14 +328,24 @@ pub mod landlock { /// # Returns /// /// The status of the restriction (whether it was fully, partially, or not-at-all enforced). - fn try_restrict( - fs_exceptions: impl Iterator, RulesetError>>, - ) -> Result { - let status = Ruleset::new() - .handle_access(AccessFs::from_all(LANDLOCK_ABI))? - .create()? - .add_rules(fs_exceptions)? - .restrict_self()?; + fn try_restrict(fs_exceptions: I) -> Result + where + I: IntoIterator, + P: AsRef, + A: Into>, + { + let mut ruleset = + Ruleset::new().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?; + for (fs_path, access_bits) in fs_exceptions { + let paths = &[fs_path.as_ref().to_owned()]; + let mut rules = path_beneath_rules(paths, access_bits).peekable(); + if rules.peek().is_none() { + // `path_beneath_rules` silently ignores missing paths, so check for it manually. + return Err(TryRestrictError::InvalidExceptionPath(fs_path.as_ref().to_owned())) + } + ruleset = ruleset.add_rules(rules)?; + } + let status = ruleset.restrict_self()?; Ok(status.ruleset) } @@ -341,10 +380,7 @@ pub mod landlock { assert_eq!(s, TEXT); // Apply Landlock with a read exception for only one of the files. - let status = try_restrict(path_beneath_rules( - &[path1], - AccessFs::from_read(LANDLOCK_ABI), - )); + let status = try_restrict(vec![(path1, AccessFs::ReadFile)]); if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); } @@ -362,7 +398,7 @@ pub mod landlock { )); // Apply Landlock for all files. - let status = try_restrict(std::iter::empty()); + let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); } @@ -400,10 +436,7 @@ pub mod landlock { fs::write(path2, TEXT).unwrap(); // Apply Landlock with a write exception for only one of the files. - let status = try_restrict(path_beneath_rules( - &[path1], - AccessFs::from_write(LANDLOCK_ABI), - )); + let status = try_restrict(vec![(path1, AccessFs::WriteFile)]); if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); } @@ -418,7 +451,7 @@ pub mod landlock { )); // Apply Landlock for all files. - let status = try_restrict(std::iter::empty()); + let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); } diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index b2bf59915a99..9d7bfdf28669 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -28,7 +28,7 @@ use polkadot_node_core_pvf_common::{ error::InternalValidationError, execute::{Handshake, Response}, executor_intf::NATIVE_STACK_MAX, - framed_recv, framed_send, + framed_recv_blocking, framed_send_blocking, worker::{ cpu_time_monitor_loop, stringify_panic_payload, thread::{self, WaitOutcome}, @@ -37,11 +37,12 @@ use polkadot_node_core_pvf_common::{ }; use polkadot_parachain_primitives::primitives::ValidationResult; use std::{ + os::unix::net::UnixStream, path::PathBuf, sync::{mpsc::channel, Arc}, time::Duration, }; -use tokio::{io, net::UnixStream}; +use tokio::io; // Wasmtime powers the Substrate Executor. It compiles the wasm bytecode into native code. // That native code does not create any stacks and just reuses the stack of the thread that @@ -79,8 +80,8 @@ use tokio::{io, net::UnixStream}; /// The stack size for the execute thread. pub const EXECUTE_THREAD_STACK_SIZE: usize = 2 * 1024 * 1024 + NATIVE_STACK_MAX as usize; -async fn recv_handshake(stream: &mut UnixStream) -> io::Result { - let handshake_enc = framed_recv(stream).await?; +fn recv_handshake(stream: &mut UnixStream) -> io::Result { + let handshake_enc = framed_recv_blocking(stream)?; let handshake = Handshake::decode(&mut &handshake_enc[..]).map_err(|_| { io::Error::new( io::ErrorKind::Other, @@ -90,9 +91,9 @@ async fn recv_handshake(stream: &mut UnixStream) -> io::Result { Ok(handshake) } -async fn recv_request(stream: &mut UnixStream) -> io::Result<(Vec, Duration)> { - let params = framed_recv(stream).await?; - let execution_timeout = framed_recv(stream).await?; +fn recv_request(stream: &mut UnixStream) -> io::Result<(Vec, Duration)> { + let params = framed_recv_blocking(stream)?; + let execution_timeout = framed_recv_blocking(stream)?; let execution_timeout = Duration::decode(&mut &execution_timeout[..]).map_err(|_| { io::Error::new( io::ErrorKind::Other, @@ -102,8 +103,8 @@ async fn recv_request(stream: &mut UnixStream) -> io::Result<(Vec, Duration) Ok((params, execution_timeout)) } -async fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> { - framed_send(stream, &response.encode()).await +fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> { + framed_send_blocking(stream, &response.encode()) } /// The entrypoint that the spawned execute worker should start with. @@ -135,13 +136,13 @@ pub fn worker_entrypoint( let worker_pid = std::process::id(); let artifact_path = worker_dir::execute_artifact(&worker_dir_path); - let Handshake { executor_params } = recv_handshake(&mut stream).await?; + let Handshake { executor_params } = recv_handshake(&mut stream)?; let executor = Executor::new(executor_params).map_err(|e| { io::Error::new(io::ErrorKind::Other, format!("cannot create executor: {}", e)) })?; loop { - let (params, execution_timeout) = recv_request(&mut stream).await?; + let (params, execution_timeout) = recv_request(&mut stream)?; gum::debug!( target: LOG_TARGET, %worker_pid, @@ -156,7 +157,7 @@ pub fn worker_entrypoint( let response = Response::InternalError( InternalValidationError::CouldNotOpenFile(err.to_string()), ); - send_response(&mut stream, response).await?; + send_response(&mut stream, response)?; continue }, }; @@ -238,7 +239,13 @@ pub fn worker_entrypoint( ), }; - send_response(&mut stream, response).await?; + gum::trace!( + target: LOG_TARGET, + %worker_pid, + "worker: sending response to host: {:?}", + response + ); + send_response(&mut stream, response)?; } }, ); diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 5db7c0ce299b..da22dd395131 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -33,7 +33,7 @@ use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, executor_intf::Executor, - framed_recv, framed_send, + framed_recv_blocking, framed_send_blocking, prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ @@ -45,11 +45,12 @@ use polkadot_node_core_pvf_common::{ }; use polkadot_primitives::ExecutorParams; use std::{ + os::unix::net::UnixStream, path::PathBuf, sync::{mpsc::channel, Arc}, time::Duration, }; -use tokio::{io, net::UnixStream}; +use tokio::io; /// Contains the bytes for a successfully compiled artifact. pub struct CompiledArtifact(Vec); @@ -67,8 +68,8 @@ impl AsRef<[u8]> for CompiledArtifact { } } -async fn recv_request(stream: &mut UnixStream) -> io::Result { - let pvf = framed_recv(stream).await?; +fn recv_request(stream: &mut UnixStream) -> io::Result { + let pvf = framed_recv_blocking(stream)?; let pvf = PvfPrepData::decode(&mut &pvf[..]).map_err(|e| { io::Error::new( io::ErrorKind::Other, @@ -78,8 +79,8 @@ async fn recv_request(stream: &mut UnixStream) -> io::Result { Ok(pvf) } -async fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Result<()> { - framed_send(stream, &result.encode()).await +fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Result<()> { + framed_send_blocking(stream, &result.encode()) } /// The entrypoint that the spawned prepare worker should start with. @@ -131,7 +132,7 @@ pub fn worker_entrypoint( let temp_artifact_dest = worker_dir::prepare_tmp_artifact(&worker_dir_path); loop { - let pvf = recv_request(&mut stream).await?; + let pvf = recv_request(&mut stream)?; gum::debug!( target: LOG_TARGET, %worker_pid, @@ -278,7 +279,13 @@ pub fn worker_entrypoint( ), }; - send_response(&mut stream, result).await?; + gum::trace!( + target: LOG_TARGET, + %worker_pid, + "worker: sending response to host: {:?}", + result + ); + send_response(&mut stream, result)?; } }, ); diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 95a6e26fa583..783c7c7abbc8 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -19,8 +19,8 @@ use crate::{ artifacts::ArtifactPathId, worker_intf::{ - clear_worker_dir_path, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, - WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, + clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker, + SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, }, LOG_TARGET, }; @@ -30,7 +30,7 @@ use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, execute::{Handshake, Response}, - framed_recv, framed_send, worker_dir, SecurityStatus, + worker_dir, SecurityStatus, }; use polkadot_parachain_primitives::primitives::ValidationResult; use polkadot_primitives::ExecutorParams; diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 091d780e4944..f0d533cb1894 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -893,16 +893,22 @@ fn check_can_unshare_user_namespace_and_change_root( ) -> bool { #[cfg(target_os = "linux")] { - match std::process::Command::new(prepare_worker_program_path) + let output = std::process::Command::new(prepare_worker_program_path) .arg("--check-can-unshare-user-namespace-and-change-root") - .status() - { - Ok(status) if status.success() => true, - Ok(status) => { + .output(); + + match output { + Ok(output) if output.status.success() => true, + Ok(output) => { + let stderr = std::str::from_utf8(&output.stderr) + .expect("child process writes a UTF-8 string to stderr; qed") + .trim(); gum::warn!( target: LOG_TARGET, ?prepare_worker_program_path, - ?status, + // Docs say to always print status using `Display` implementation. + status = %output.status, + %stderr, "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." ); false diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 7a3a543eac43..b66c36044343 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -19,15 +19,14 @@ use crate::{ metrics::Metrics, worker_intf::{ - clear_worker_dir_path, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, - WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, + clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker, + SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, }, LOG_TARGET, }; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, - framed_recv, framed_send, prepare::PrepareStats, pvf::PvfPrepData, worker_dir, SecurityStatus, @@ -274,21 +273,18 @@ where // Create the tmp file here so that the child doesn't need any file creation rights. This will // be cleared at the end of this function. let tmp_file = worker_dir::prepare_tmp_artifact(&worker_dir.path); - match tokio::fs::write(&tmp_file, &[]).await { - Ok(()) => (), - Err(err) => { - gum::warn!( - target: LOG_TARGET, - worker_pid = %pid, - ?worker_dir, - "failed to create a temp file for the artifact: {:?}", - err, - ); - return Outcome::CreateTmpFileErr { - worker: IdleWorker { stream, pid, worker_dir }, - err: format!("{:?}", err), - } - }, + if let Err(err) = tokio::fs::File::create(&tmp_file).await { + gum::warn!( + target: LOG_TARGET, + worker_pid = %pid, + ?worker_dir, + "failed to create a temp file for the artifact: {:?}", + err, + ); + return Outcome::CreateTmpFileErr { + worker: IdleWorker { stream, pid, worker_dir }, + err: format!("{:?}", err), + } }; let worker_dir_path = worker_dir.path.clone(); diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index 82bc912dc9ba..9825506ba88f 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -67,93 +67,71 @@ pub async fn spawn_with_program_path( ) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { let program_path = program_path.into(); let worker_dir = WorkerDir::new(debug_id, cache_path).await?; - - with_transient_socket_path(&worker_dir.path.clone(), |socket_path| { - let socket_path = socket_path.to_owned(); - let extra_args: Vec = extra_args.iter().map(|arg| arg.to_string()).collect(); - - async move { - let listener = UnixListener::bind(&socket_path).map_err(|err| { + let socket_path = worker_dir::socket(&worker_dir.path); + + let extra_args: Vec = extra_args.iter().map(|arg| arg.to_string()).collect(); + + let listener = UnixListener::bind(&socket_path).map_err(|err| { + gum::warn!( + target: LOG_TARGET, + %debug_id, + ?program_path, + ?extra_args, + ?worker_dir, + ?socket_path, + "cannot bind unix socket: {:?}", + err, + ); + SpawnErr::Bind + })?; + + let handle = WorkerHandle::spawn(&program_path, &extra_args, &worker_dir.path, security_status) + .map_err(|err| { + gum::warn!( + target: LOG_TARGET, + %debug_id, + ?program_path, + ?extra_args, + ?worker_dir.path, + ?socket_path, + "cannot spawn a worker: {:?}", + err, + ); + SpawnErr::ProcessSpawn + })?; + + let worker_dir_path = worker_dir.path.clone(); + futures::select! { + accept_result = listener.accept().fuse() => { + let (stream, _) = accept_result.map_err(|err| { gum::warn!( target: LOG_TARGET, %debug_id, ?program_path, ?extra_args, - ?worker_dir, + ?worker_dir_path, ?socket_path, - "cannot bind unix socket: {:?}", + "cannot accept a worker: {:?}", err, ); - SpawnErr::Bind + SpawnErr::Accept })?; - - let handle = - WorkerHandle::spawn(&program_path, &extra_args, &worker_dir.path, security_status) - .map_err(|err| { - gum::warn!( - target: LOG_TARGET, - %debug_id, - ?program_path, - ?extra_args, - ?worker_dir.path, - ?socket_path, - "cannot spawn a worker: {:?}", - err, - ); - SpawnErr::ProcessSpawn - })?; - - let worker_dir_path = worker_dir.path.clone(); - futures::select! { - accept_result = listener.accept().fuse() => { - let (stream, _) = accept_result.map_err(|err| { - gum::warn!( - target: LOG_TARGET, - %debug_id, - ?program_path, - ?extra_args, - ?worker_dir_path, - ?socket_path, - "cannot accept a worker: {:?}", - err, - ); - SpawnErr::Accept - })?; - Ok((IdleWorker { stream, pid: handle.id(), worker_dir }, handle)) - } - _ = Delay::new(spawn_timeout).fuse() => { - gum::warn!( - target: LOG_TARGET, - %debug_id, - ?program_path, - ?extra_args, - ?worker_dir_path, - ?socket_path, - ?spawn_timeout, - "spawning and connecting to socket timed out", - ); - Err(SpawnErr::AcceptTimeout) - } - } + Ok((IdleWorker { stream, pid: handle.id(), worker_dir }, handle)) } - }) - .await -} - -async fn with_transient_socket_path(worker_dir_path: &Path, f: F) -> Result -where - F: FnOnce(&Path) -> Fut, - Fut: futures::Future> + 'static, -{ - let socket_path = worker_dir::socket(worker_dir_path); - - let result = f(&socket_path).await; - - // Best effort to remove the socket file. Under normal circumstances the socket will be removed - // by the worker. We make sure that it is removed here, just in case a failed rendezvous. - let _ = tokio::fs::remove_file(socket_path).await; - - result + _ = Delay::new(spawn_timeout).fuse() => { + gum::warn!( + target: LOG_TARGET, + %debug_id, + ?program_path, + ?extra_args, + ?worker_dir_path, + ?socket_path, + ?spawn_timeout, + "spawning and connecting to socket timed out", + ); + Err(SpawnErr::AcceptTimeout) + } + } } /// Returns a path under the given `dir`. The path name will start with the given prefix. From 279db25fcfca37a3ccbe1131247cd93229296e87 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 25 Sep 2023 14:34:36 +0200 Subject: [PATCH 18/18] Use cfg_if macro There were a few places where this can be used for a slight improvement to code quality. --- Cargo.lock | 3 + polkadot/node/core/pvf/Cargo.toml | 1 + polkadot/node/core/pvf/common/Cargo.toml | 1 + .../node/core/pvf/prepare-worker/Cargo.toml | 1 + .../node/core/pvf/prepare-worker/src/lib.rs | 11 +- polkadot/node/core/pvf/src/host.rs | 150 +++++++++--------- 6 files changed, 86 insertions(+), 81 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eb13ab4b76a9..1774de8755d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11991,6 +11991,7 @@ version = "1.0.0" dependencies = [ "always-assert", "assert_matches", + "cfg-if", "futures", "futures-timer", "hex-literal", @@ -12047,6 +12048,7 @@ name = "polkadot-node-core-pvf-common" version = "1.0.0" dependencies = [ "assert_matches", + "cfg-if", "cpu-time", "futures", "landlock", @@ -12088,6 +12090,7 @@ dependencies = [ name = "polkadot-node-core-pvf-prepare-worker" version = "1.0.0" dependencies = [ + "cfg-if", "futures", "libc", "parity-scale-codec", diff --git a/polkadot/node/core/pvf/Cargo.toml b/polkadot/node/core/pvf/Cargo.toml index 478d1952d9d9..27f4df117e57 100644 --- a/polkadot/node/core/pvf/Cargo.toml +++ b/polkadot/node/core/pvf/Cargo.toml @@ -8,6 +8,7 @@ license.workspace = true [dependencies] always-assert = "0.1" +cfg-if = "1.0" futures = "0.3.21" futures-timer = "3.0.2" gum = { package = "tracing-gum", path = "../../gum" } diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 621f7e24f72b..0f7308396d80 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -7,6 +7,7 @@ edition.workspace = true license.workspace = true [dependencies] +cfg-if = "1.0" cpu-time = "1.0.0" futures = "0.3.21" gum = { package = "tracing-gum", path = "../../../gum" } diff --git a/polkadot/node/core/pvf/prepare-worker/Cargo.toml b/polkadot/node/core/pvf/prepare-worker/Cargo.toml index e7a12cd9a809..886209b78c32 100644 --- a/polkadot/node/core/pvf/prepare-worker/Cargo.toml +++ b/polkadot/node/core/pvf/prepare-worker/Cargo.toml @@ -7,6 +7,7 @@ edition.workspace = true license.workspace = true [dependencies] +cfg-if = "1.0" futures = "0.3.21" gum = { package = "tracing-gum", path = "../../../gum" } libc = "0.2.139" diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index da22dd395131..a24f5024722b 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -212,10 +212,13 @@ pub fn worker_entrypoint( Err(err) }, Ok(ok) => { - #[cfg(not(target_os = "linux"))] - let (artifact, cpu_time_elapsed) = ok; - #[cfg(target_os = "linux")] - let (artifact, cpu_time_elapsed, max_rss) = ok; + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + let (artifact, cpu_time_elapsed, max_rss) = ok; + } else { + let (artifact, cpu_time_elapsed) = ok; + } + } // Stop the memory stats worker and get its observed memory stats. #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index f0d533cb1894..81695829122b 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -891,48 +891,46 @@ fn check_can_unshare_user_namespace_and_change_root( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, ) -> bool { - #[cfg(target_os = "linux")] - { - let output = std::process::Command::new(prepare_worker_program_path) - .arg("--check-can-unshare-user-namespace-and-change-root") - .output(); - - match output { - Ok(output) if output.status.success() => true, - Ok(output) => { - let stderr = std::str::from_utf8(&output.stderr) - .expect("child process writes a UTF-8 string to stderr; qed") - .trim(); - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - // Docs say to always print status using `Display` implementation. - status = %output.status, - %stderr, - "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." - ); - false - }, - Err(err) => { - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - "Could not start child process: {}", - err - ); - false - }, + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + let output = std::process::Command::new(prepare_worker_program_path) + .arg("--check-can-unshare-user-namespace-and-change-root") + .output(); + + match output { + Ok(output) if output.status.success() => true, + Ok(output) => { + let stderr = std::str::from_utf8(&output.stderr) + .expect("child process writes a UTF-8 string to stderr; qed") + .trim(); + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + // Docs say to always print status using `Display` implementation. + status = %output.status, + %stderr, + "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security." + ); + false } } - - #[cfg(not(target_os = "linux"))] - { - gum::warn!( - target: LOG_TARGET, - "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security." - ); - false - } } /// Check if landlock is supported and emit a warning if not. @@ -944,45 +942,43 @@ fn check_landlock( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, ) -> bool { - #[cfg(target_os = "linux")] - { - match std::process::Command::new(prepare_worker_program_path) - .arg("--check-can-enable-landlock") - .status() - { - Ok(status) if status.success() => true, - Ok(status) => { - let abi = - polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8; - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - ?status, - %abi, - "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." - ); - false - }, - Err(err) => { - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - "Could not start child process: {}", - err - ); - false - }, + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match std::process::Command::new(prepare_worker_program_path) + .arg("--check-can-enable-landlock") + .status() + { + Ok(status) if status.success() => true, + Ok(status) => { + let abi = + polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8; + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + ?status, + %abi, + "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." + ); + false } } - - #[cfg(not(target_os = "linux"))] - { - gum::warn!( - target: LOG_TARGET, - "Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." - ); - false - } } #[cfg(test)]