diff --git a/src/cgroup.c b/src/cgroup.c index 48f611ae..3fcf7ec1 100644 --- a/src/cgroup.c +++ b/src/cgroup.c @@ -9,11 +9,13 @@ #include #include #include +#ifdef __linux__ #include #include #include #include #include +#endif #ifndef CGROUP2_SUPER_MAGIC #define CGROUP2_SUPER_MAGIC 0x63677270 @@ -24,6 +26,8 @@ int oom_event_fd = -1; int oom_cgroup_fd = -1; +#ifdef __linux__ + static char *process_cgroup_subsystem_path(int pid, bool cgroup2, const char *subsystem); static void setup_oom_handling_cgroup_v2(int pid); static void setup_oom_handling_cgroup_v1(int pid); @@ -314,3 +318,5 @@ static int write_oom_files() } return oom_fd >= 0 ? 0 : -1; } + +#endif diff --git a/src/cli.c b/src/cli.c index 14adff6a..33b28bd5 100644 --- a/src/cli.c +++ b/src/cli.c @@ -8,7 +8,9 @@ #include #include #include +#ifdef __linux__ #include +#endif gboolean opt_version = FALSE; gboolean opt_terminal = FALSE; diff --git a/src/close_fds.c b/src/close_fds.c index 8ae6ee1f..278ea67c 100644 --- a/src/close_fds.c +++ b/src/close_fds.c @@ -20,7 +20,6 @@ #include "close_fds.h" #include "runtime_args.h" -#include #include static int open_files_max_fd; diff --git a/src/cmsg.c b/src/cmsg.c index bb9dbda1..f14a94fa 100644 --- a/src/cmsg.c +++ b/src/cmsg.c @@ -32,6 +32,10 @@ #error cmsg.c requires C99 or later #endif +#ifdef __FreeBSD__ +#define ECOMM EINVAL +#endif + #define error(s) \ do { \ fprintf(stderr, "nsenter: %s %s\n", s, strerror(errno)); \ diff --git a/src/conmon.c b/src/conmon.c index 6c482324..77250fbc 100644 --- a/src/conmon.c +++ b/src/conmon.c @@ -21,8 +21,6 @@ #include "seccomp_notify.h" #include "runtime_args.h" -#include -#include #include #include @@ -134,7 +132,7 @@ int main(int argc, char *argv[]) * Set self as subreaper so we can wait for container process * and return its exit code. */ - int ret = prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); + int ret = set_subreaper(true); if (ret != 0) { pexit("Failed to set as subreaper"); } @@ -228,7 +226,7 @@ int main(int argc, char *argv[]) if (create_pid < 0) { pexit("Failed to fork the create command"); } else if (!create_pid) { - if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) + if (set_pdeathsig(SIGKILL) < 0) _pexit("Failed to set PDEATHSIG"); if (sigprocmask(SIG_SETMASK, &oldmask, NULL) < 0) _pexit("Failed to unblock signals"); @@ -238,21 +236,21 @@ int main(int argc, char *argv[]) workerfd_stdin = dev_null_r; if (dup2(workerfd_stdin, STDIN_FILENO) < 0) _pexit("Failed to dup over stdin"); - if (workerfd_stdin != dev_null_r && fchmod(STDIN_FILENO, 0777) < 0) + if (workerfd_stdin != dev_null_r && isatty(workerfd_stdin) && fchmod(STDIN_FILENO, 0777) < 0) nwarn("Failed to chmod stdin"); if (workerfd_stdout < 0) workerfd_stdout = dev_null_w; if (dup2(workerfd_stdout, STDOUT_FILENO) < 0) _pexit("Failed to dup over stdout"); - if (workerfd_stdout != dev_null_w && fchmod(STDOUT_FILENO, 0777) < 0) + if (workerfd_stdout != dev_null_w && isatty(workerfd_stdout) && fchmod(STDOUT_FILENO, 0777) < 0) nwarn("Failed to chmod stdout"); if (workerfd_stderr < 0) workerfd_stderr = workerfd_stdout; if (dup2(workerfd_stderr, STDERR_FILENO) < 0) _pexit("Failed to dup over stderr"); - if (workerfd_stderr != dev_null_w && fchmod(STDERR_FILENO, 0777) < 0) + if (workerfd_stderr != dev_null_w && isatty(workerfd_stderr) && fchmod(STDERR_FILENO, 0777) < 0) nwarn("Failed to chmod stderr"); } /* If LISTEN_PID env is set, we need to set the LISTEN_PID @@ -314,11 +312,7 @@ int main(int argc, char *argv[]) .pid_to_handler = pid_to_handler, .exit_status_cache = NULL, }; - sigset_t set; - sigemptyset(&set); - sigaddset(&set, SIGCHLD); - sigprocmask(SIG_BLOCK, &set, NULL); - int signal_fd = signalfd(-1, &set, SFD_CLOEXEC); + int signal_fd = get_signal_descriptor(SIGCHLD); if (signal_fd < 0) pexit("Failed to create signalfd for SIGCHLD"); int signal_fd_tag = g_unix_fd_add(signal_fd, G_IO_IN, on_signalfd_cb, &data); @@ -407,7 +401,9 @@ int main(int argc, char *argv[]) if ((opt_api_version >= 1 || !opt_exec) && sync_pipe_fd >= 0) write_sync_fd(sync_pipe_fd, container_pid, NULL); +#ifdef __linux__ setup_oom_handling(container_pid); +#endif if (mainfd_stdout >= 0) { g_unix_fd_add(mainfd_stdout, G_IO_IN, stdio_cb, GINT_TO_POINTER(STDOUT_PIPE)); @@ -457,7 +453,9 @@ int main(int argc, char *argv[]) g_main_loop_run(main_loop); } +#ifdef __linux__ check_cgroup2_oom(); +#endif /* Drain stdout and stderr only if a timeout doesn't occur */ if (!timed_out) diff --git a/src/conn_sock.c b/src/conn_sock.c index 8c7aa30d..11309ee9 100644 --- a/src/conn_sock.c +++ b/src/conn_sock.c @@ -85,10 +85,61 @@ char *setup_seccomp_socket(const char *socket) return setup_socket(&seccomp_socket_fd, socket); } +#ifdef __linux__ +static void bind_relative_to_dir(int dir_fd, int sock_fd, const char *path) +{ + struct sockaddr_un addr; + + /* + * To be able to access the location of the attach socket, without first creating the attach socket + * but also be able to handle arbitrary length paths, we open the parent dir (base_path), and then use + * the corresponding entry in `/proc/self/fd` to act as the path to base_path, then we use the socket_relative_name + * to actually refer to the file where the socket will be created below. + */ + addr.sun_family = AF_UNIX; + if (dir_fd == -1) { + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + } else { + snprintf(addr.sun_path, sizeof(addr.sun_path) - 1, "/proc/self/fd/%d/%s", dir_fd, path); + } + ndebugf("addr{sun_family=AF_UNIX, sun_path=%s}", addr.sun_path); + + if (fchmod(sock_fd, 0700)) + pexit("Failed to change console-socket permissions"); + if (bind(sock_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) + pexit("Failed to bind to console-socket"); +} +#endif + +#ifdef __FreeBSD__ + +// FreeBSD earlier than 13.1-RELEASE doesn't have O_PATH +#ifndef O_PATH +#define O_PATH 0 +#endif + +static void bind_relative_to_dir(int dir_fd, int sock_fd, const char *path) +{ + struct sockaddr_un addr; + + if (dir_fd == -1) { + dir_fd = AT_FDCWD; + } + + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + ndebugf("addr{sun_family=AF_UNIX, sun_path=%s}", addr.sun_path); + if (bindat(dir_fd, sock_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) + pexit("Failed to bind to console-socket"); + if (fchmodat(dir_fd, addr.sun_path, 0700, AT_SYMLINK_NOFOLLOW)) + pexit("Failed to change console-socket permissions"); +} +#endif + static char *setup_socket(int *fd, const char *path) { - struct sockaddr_un addr = {0}; char *csname = NULL; + char *bname = NULL; _cleanup_close_ int sfd = -1; if (path != NULL) { @@ -114,8 +165,6 @@ static char *setup_socket(int *fd, const char *path) bname = basename(bname_buf); if (bname == NULL) pexitf("Cannot get basename for %s", csname); - - snprintf(addr.sun_path, sizeof(addr.sun_path) - 1, "/proc/self/fd/%d/%s", sfd, bname); } else { _cleanup_free_ const char *tmpdir = g_get_tmp_dir(); @@ -132,20 +181,14 @@ static char *setup_socket(int *fd, const char *path) if (unlink(csname) < 0) pexit("Failed to unlink temporary random path"); - strncpy(addr.sun_path, csname, sizeof(addr.sun_path) - 1); + bname = csname; } - addr.sun_family = AF_UNIX; - ndebugf("addr{sun_family=AF_UNIX, sun_path=%s}", addr.sun_path); - /* Bind to the console socket path. */ *fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); if (*fd < 0) pexit("Failed to create socket"); - if (fchmod(*fd, 0700)) - pexit("Failed to change console-socket permissions"); - if (bind(*fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) - pexit("Failed to bind to console-socket"); + bind_relative_to_dir(sfd, *fd, bname); if (listen(*fd, 128) < 0) pexit("Failed to listen on console-socket"); @@ -186,16 +229,20 @@ void setup_notify_socket(char *socket_path) g_free(symlink_dir_path); } +static size_t max_socket_path_len() +{ + struct sockaddr_un addr; + return sizeof(addr.sun_path); +} + /* REMEMBER to g_free() the return value! */ static char *bind_unix_socket(char *socket_relative_name, int sock_type, mode_t perms, struct remote_sock_s *remote_sock, gboolean use_full_attach_path) { int socket_fd = -1; - struct sockaddr_un socket_addr = {0}; - socket_addr.sun_family = AF_UNIX; /* get the parent_dir of the socket. We'll use this to get the location of the socket. */ - char *parent_dir = socket_parent_dir(use_full_attach_path, sizeof(socket_addr.sun_path)); + char *parent_dir = socket_parent_dir(use_full_attach_path, max_socket_path_len()); /* * To be able to access the location of the attach socket, without first creating the attach socket @@ -207,11 +254,6 @@ static char *bind_unix_socket(char *socket_relative_name, int sock_type, mode_t if (parent_dir_fd < 0) pexitf("failed to open socket path parent dir %s", parent_dir); - _cleanup_free_ char *sock_proc_entry = g_strdup_printf("/proc/self/fd/%d/%s", parent_dir_fd, socket_relative_name); - strncpy(socket_addr.sun_path, sock_proc_entry, sizeof(socket_addr.sun_path) - 1); - ndebugf("addr{sun_family=AF_UNIX, sun_path=%s}", socket_addr.sun_path); - - /* * We use the fullpath for operations that aren't as limited in length as socket_addr.sun_path * Cleanup of this variable is up to the caller @@ -227,14 +269,10 @@ static char *bind_unix_socket(char *socket_relative_name, int sock_type, mode_t if (socket_fd == -1) pexitf("Failed to create socket %s", sock_fullpath); - if (fchmod(socket_fd, perms)) - pexitf("Failed to change socket permissions %s", sock_fullpath); - if (unlink(sock_fullpath) == -1 && errno != ENOENT) pexitf("Failed to remove existing socket: %s", sock_fullpath); - if (bind(socket_fd, (struct sockaddr *)&socket_addr, sizeof(struct sockaddr_un)) == -1) - pexitf("Failed to bind socket: %s", sock_fullpath); + bind_relative_to_dir(parent_dir_fd, socket_fd, socket_relative_name); if (chmod(sock_fullpath, perms)) pexitf("Failed to change socket permissions %s", sock_fullpath); diff --git a/src/ctr_exit.c b/src/ctr_exit.c index ffe432b9..a2eeed7c 100644 --- a/src/ctr_exit.c +++ b/src/ctr_exit.c @@ -13,8 +13,6 @@ #include #include #include -#include -#include #include volatile pid_t container_pid = -1; @@ -84,10 +82,8 @@ gboolean on_signalfd_cb(gint fd, G_GNUC_UNUSED GIOCondition condition, gpointer struct pid_check_data *data = (struct pid_check_data *)user_data; /* dequeue the signal from the signalfd */ - struct signalfd_siginfo siginfo; - ssize_t s = read(fd, &siginfo, sizeof siginfo); - g_assert_cmpint(s, ==, sizeof siginfo); - g_assert_cmpint(siginfo.ssi_signo, ==, SIGCHLD); + int sig = dequeue_signal_event(fd); + g_assert_cmpint(sig, ==, SIGCHLD); check_child_processes(data->pid_to_handler, data->exit_status_cache); return G_SOURCE_CONTINUE; @@ -154,7 +150,7 @@ void do_exit_command() * Unfortunately, that also means that any new subchildren from * still running processes could also get lost */ - if (prctl(PR_SET_CHILD_SUBREAPER, 0) != 0) { + if (set_subreaper(false) != 0) { nwarn("Failed to disable self subreaper attribute - might wait for indirect children a long time"); } diff --git a/src/oom.c b/src/oom.c index 0041a6b9..5ab56891 100644 --- a/src/oom.c +++ b/src/oom.c @@ -7,6 +7,7 @@ void attempt_oom_adjust(const char *const oom_score) { +#ifdef __linux__ int oom_score_fd = open("/proc/self/oom_score_adj", O_WRONLY); if (oom_score_fd < 0) { ndebugf("failed to open /proc/self/oom_score_adj: %s\n", strerror(errno)); @@ -16,4 +17,7 @@ void attempt_oom_adjust(const char *const oom_score) ndebugf("failed to write to /proc/self/oom_score_adj: %s\n", strerror(errno)); } close(oom_score_fd); +#else + (void)oom_score; +#endif } diff --git a/src/seccomp_notify.c b/src/seccomp_notify.c index 701d344b..bbc06ddb 100644 --- a/src/seccomp_notify.c +++ b/src/seccomp_notify.c @@ -6,10 +6,7 @@ #endif #include -#include #include -#include -#include #include #include #include @@ -19,10 +16,16 @@ #include "cli.h" // opt_bundle_path #include "utils.h" #include "cmsg.h" -#include "seccomp_notify.h" #ifdef USE_SECCOMP +#include +#include +#include + +#include "seccomp_notify.h" + + #ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE #define SECCOMP_USER_NOTIF_FLAG_CONTINUE (1UL << 0) #endif diff --git a/src/seccomp_notify_plugin.h b/src/seccomp_notify_plugin.h index b8c660a9..ab590b1d 100644 --- a/src/seccomp_notify_plugin.h +++ b/src/seccomp_notify_plugin.h @@ -1,9 +1,9 @@ #ifndef SECCOMP_NOTIFY_PLUGIN_H -#include - #ifdef USE_SECCOMP +#include + struct seccomp_notify_conf_s { const char *runtime_root_path; const char *name; diff --git a/src/utils.c b/src/utils.c index f0019a76..a417035e 100644 --- a/src/utils.c +++ b/src/utils.c @@ -1,6 +1,16 @@ +#define _GNU_SOURCE + #include "utils.h" #include #include +#ifdef __linux__ +#include +#include +#endif +#ifdef __FreeBSD__ +#include +#include +#endif log_level_t log_level = WARN_LEVEL; char *log_cid = NULL; @@ -39,6 +49,18 @@ void set_conmon_logs(char *level_name, char *cid_, gboolean syslog_, char *tag) nexitf("No such log level %s", level_name); } +#ifdef __FreeBSD__ +static bool retryable_error(int err) +{ + return err == EINTR || err == EAGAIN; +} +#else +static bool retryable_error(int err) +{ + return err == EINTR; +} +#endif + ssize_t write_all(int fd, const void *buf, size_t count) { size_t remaining = count; @@ -48,7 +70,7 @@ ssize_t write_all(int fd, const void *buf, size_t count) while (remaining > 0) { do { res = write(fd, p, remaining); - } while (res == -1 && errno == EINTR); + } while (res == -1 && retryable_error(errno)); if (res <= 0) return -1; @@ -59,3 +81,79 @@ ssize_t write_all(int fd, const void *buf, size_t count) return count; } + +#ifdef __linux__ + +int set_subreaper(gboolean enabled) +{ + return prctl(PR_SET_CHILD_SUBREAPER, enabled, 0, 0, 0); +} + +int set_pdeathsig(int sig) +{ + return prctl(PR_SET_PDEATHSIG, sig); +} + +int get_signal_descriptor(int sig) +{ + sigset_t set; + sigemptyset(&set); + sigaddset(&set, sig); + sigprocmask(SIG_BLOCK, &set, NULL); + return signalfd(-1, &set, SFD_CLOEXEC); +} + +int dequeue_signal_event(int fd) +{ + struct signalfd_siginfo siginfo; + ssize_t s = read(fd, &siginfo, sizeof siginfo); + g_assert_cmpint(s, ==, sizeof siginfo); + return siginfo.ssi_signo; +} + +#endif + +#ifdef __FreeBSD__ + +int set_subreaper(gboolean enabled) +{ + if (enabled) { + return procctl(P_PID, getpid(), PROC_REAP_ACQUIRE, NULL); + } else { + return procctl(P_PID, getpid(), PROC_REAP_RELEASE, NULL); + } +} + +int set_pdeathsig(int sig) +{ + return procctl(P_PID, getpid(), PROC_PDEATHSIG_CTL, &sig); +} + +int get_signal_descriptor(int sig) +{ + sigset_t set; + sigemptyset(&set); + sigaddset(&set, sig); + sigprocmask(SIG_BLOCK, &set, NULL); + + int kq = kqueue(); + fcntl(kq, F_SETFD, FD_CLOEXEC); + struct kevent kev; + EV_SET(&kev, sig, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL); + if (kevent(kq, &kev, 1, NULL, 0, NULL)) { + pexitf("failed to add kevent signal %d", sig); + } + return kq; +} + +int dequeue_signal_event(int kq) +{ + struct kevent kev; + int n = kevent(kq, NULL, 0, &kev, 1, NULL); + if (n != 1) { + pexit("failed to read signal event"); + } + return kev.ident; +} + +#endif diff --git a/src/utils.h b/src/utils.h index 5d5caf34..f3cf8534 100644 --- a/src/utils.h +++ b/src/utils.h @@ -225,4 +225,11 @@ static inline void hashtable_free_cleanup(GHashTable **tbl) ssize_t write_all(int fd, const void *buf, size_t count); +int set_subreaper(gboolean enabled); + +int set_pdeathsig(int sig); + +int get_signal_descriptor(int sig); +int dequeue_signal_event(int fd); + #endif /* !defined(UTILS_H) */