From 904c53ea311fd6fae945a55202b0a7ccf3783465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Pokorn=C3=BD?= Date: Tue, 16 Apr 2019 00:04:47 +0200 Subject: [PATCH] High: pacemakerd vs. IPC/procfs confused deputy authenticity issue (2/4) [2/4: pacemakerd to trust pre-existing processes via new checks instead] In pacemakerd in the context of entrusting pre-existing processes, we now resort to procfs-based solution only in boundary, fouled cases, and primarily examine the credentials of the processes already occupying known IPC end-points before adopting them. The commit applies the new helpers from 1/1 so as to close the both related sensitive problems, CVE-2018-16877 and CVE-2018-16878, in a unified manner, this time limited to the main daemon of pacemaker (pacemakerd). To be noted that it is clearly not 100% for this purpose for still allowing for TOCTTOU, but that's what commit (3/3) is meant to solve for the most part, plus there may be optimizations solving this concern as a side effect, but that requires an active assistance on the libqb side (https://github.com/ClusterLabs/libqb/issues/325) since any improvement on pacemaker side in isolation would be very cumbersome if generally possible at all, but either way means a new, soft compatibility encumberment. As a follow-up to what was put in preceding 1/3 commit, PID of 1 tracked as child's identification on FreeBSD (or when socket-based activation is used with systemd) is treated specially, incl. special precaution with child's PID discovered as 1 elsewhere. v2: courtesy of Yan Gao of SUSE for early discovery and report for what's primarily solved with 4/4 commit, in extension, child daemons in the initialization phase coinciding with IPC-feasibility based process scan in pacemakerd in a way that those are missed (although they are to come up fully just moments later only to interfere with naturally spawned ones) are now considered so that if any native children later fail for said clash, the pre-existing counterpart may get adopted instead of ending up with repeated spawn-bury loop ad nauseam without real progress (note that PCMK_fail_fast=true could possibly help, but that's rather a big hammer not suitable for all the use cases, not the ones we try to deal with gracefully here) --- mcp/pacemaker.c | 431 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 362 insertions(+), 69 deletions(-) diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c index 2986be65153..86df216bc80 100644 --- a/mcp/pacemaker.c +++ b/mcp/pacemaker.c @@ -1,5 +1,7 @@ /* - * Copyright 2010-2018 Andrew Beekhof + * Copyright 2010-2019 the Pacemaker project contributors + * + * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. @@ -10,17 +12,23 @@ #include #include +#include #include #include #include #include #include +#include /* indirectly: CRM_EX_* */ +#include /* cib_channel_ro */ #include #include #include #include #include + +#include /* PCMK__SPECIAL_PID*, ... */ + #ifdef SUPPORT_COROSYNC #include #endif @@ -31,6 +39,7 @@ gboolean pcmk_quorate = FALSE; gboolean fatal_error = FALSE; GMainLoop *mainloop = NULL; +static bool global_keep_tracking = false; #define PCMK_PROCESS_CHECK_INTERVAL 5 @@ -48,6 +57,7 @@ typedef struct pcmk_child_s { const char *name; const char *uid; const char *command; + const char *endpoint; /* IPC server name */ gboolean active_before_startup; } pcmk_child_t; @@ -59,17 +69,35 @@ typedef struct pcmk_child_s { static pcmk_child_t pcmk_children[] = { { 0, crm_proc_none, 0, 0, FALSE, "none", NULL, NULL }, { 0, crm_proc_plugin, 0, 0, FALSE, "ais", NULL, NULL }, - { 0, crm_proc_lrmd, 3, 0, TRUE, "lrmd", NULL, CRM_DAEMON_DIR"/lrmd" }, - { 0, crm_proc_cib, 1, 0, TRUE, "cib", CRM_DAEMON_USER, CRM_DAEMON_DIR"/cib" }, - { 0, crm_proc_crmd, 6, 0, TRUE, "crmd", CRM_DAEMON_USER, CRM_DAEMON_DIR"/crmd" }, - { 0, crm_proc_attrd, 4, 0, TRUE, "attrd", CRM_DAEMON_USER, CRM_DAEMON_DIR"/attrd" }, - { 0, crm_proc_stonithd, 0, 0, TRUE, "stonithd", NULL, NULL }, - { 0, crm_proc_pe, 5, 0, TRUE, "pengine", CRM_DAEMON_USER, CRM_DAEMON_DIR"/pengine" }, - { 0, crm_proc_mgmtd, 0, 0, TRUE, "mgmtd", NULL, HB_DAEMON_DIR"/mgmtd" }, - { 0, crm_proc_stonith_ng, 2, 0, TRUE, "stonith-ng", NULL, CRM_DAEMON_DIR"/stonithd" }, + { 0, crm_proc_lrmd, 3, 0, TRUE, "lrmd", NULL, CRM_DAEMON_DIR"/lrmd", + CRM_SYSTEM_LRMD + }, + { 0, crm_proc_cib, 1, 0, TRUE, "cib", CRM_DAEMON_USER, CRM_DAEMON_DIR"/cib", + cib_channel_ro + }, + { 0, crm_proc_crmd, 6, 0, TRUE, "crmd", CRM_DAEMON_USER, CRM_DAEMON_DIR"/crmd", + CRM_SYSTEM_CRMD + }, + { 0, crm_proc_attrd, 4, 0, TRUE, "attrd", CRM_DAEMON_USER, CRM_DAEMON_DIR"/attrd", + T_ATTRD + }, + { 0, crm_proc_stonithd, 0, 0, TRUE, "stonithd", NULL, NULL, + NULL + }, + { 0, crm_proc_pe, 5, 0, TRUE, "pengine", CRM_DAEMON_USER, CRM_DAEMON_DIR"/pengine", + CRM_SYSTEM_PENGINE + }, + { 0, crm_proc_mgmtd, 0, 0, TRUE, "mgmtd", NULL, HB_DAEMON_DIR"/mgmtd", + NULL + }, + { 0, crm_proc_stonith_ng, 2, 0, TRUE, "stonith-ng", NULL, CRM_DAEMON_DIR"/stonithd", + "stonith-ng" + }, }; /* *INDENT-ON* */ +static gboolean check_active_before_startup_processes(gpointer user_data); +static int pcmk_child_active(pcmk_child_t *child); static gboolean start_child(pcmk_child_t * child); void update_process_clients(crm_client_t *client); void update_process_peers(void); @@ -131,14 +159,31 @@ pcmk_process_exit(pcmk_child_t * child) } if (shutdown_trigger) { + /* resume step-wise shutdown (returned TRUE yields no parallelizing) */ mainloop_set_trigger(shutdown_trigger); + /* intended to speed up propagating expected lay-off of the daemons? */ update_node_processes(local_nodeid, NULL, get_process_list()); - } else if (child->respawn && crm_is_true(getenv("PCMK_fail_fast"))) { + } else if (!child->respawn) { + /* nothing to do */ + + } else if (crm_is_true(getenv("PCMK_fail_fast"))) { crm_err("Rebooting system because of %s", child->name); pcmk_panic(__FUNCTION__); - } else if (child->respawn) { + } else if (pcmk_child_active(child) == 1) { + crm_warn("One-off suppressing strict respawning of a child process %s," + " appears alright per %s IPC end-point", + child->name, child->endpoint); + /* need to monitor how it evolves, and start new process if badly */ + child->active_before_startup = TRUE; + if (!global_keep_tracking) { + global_keep_tracking = true; + g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, + check_active_before_startup_processes, NULL); + } + + } else { crm_notice("Respawning failed child process: %s", child->name); start_child(child); } @@ -215,8 +260,13 @@ stop_child(pcmk_child_t * child, int signal) signal = SIGTERM; } - if (child->command == NULL) { - crm_debug("Nothing to do for child \"%s\"", child->name); + /* why to skip PID of 1? + - FreeBSD ~ how untrackable process behind IPC is masqueraded as + - elsewhere: how "init" task is designated; in particular, in systemd + arrangement of socket-based activation, this is pretty real */ + if (child->command == NULL || child->pid == PCMK__SPECIAL_PID) { + crm_debug("Nothing to do for child \"%s\" (process %lld)", + child->name, (long long) PCMK__SPECIAL_PID_AS_0(child->pid)); return TRUE; } @@ -241,6 +291,11 @@ stop_child(pcmk_child_t * child, int signal) static char *opts_default[] = { NULL, NULL }; static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL }; +/* TODO once libqb is taught to juggle with IPC end-points carried over as + bare file descriptor (https://github.com/ClusterLabs/libqb/issues/325) + it shall hand over these descriptors here if/once they are successfully + pre-opened in (presumably) pcmk_child_active, to avoid any remaining + room for races */ static gboolean start_child(pcmk_child_t * child) { @@ -371,7 +426,10 @@ escalate_shutdown(gpointer data) pcmk_child_t *child = data; - if (child->pid) { + if (child->pid == PCMK__SPECIAL_PID) { + pcmk_process_exit(child); + + } else if (child->pid) { /* Use SIGSEGV instead of SIGKILL to create a core so we can see what it was up to */ crm_err("Child %s not terminating in a timely manner, forcing", child->name); stop_child(child, SIGSEGV); @@ -379,6 +437,8 @@ escalate_shutdown(gpointer data) return FALSE; } +#define SHUTDOWN_ESCALATION_PERIOD 180000 /* 3m */ + static gboolean pcmk_shutdown_worker(gpointer user_data) { @@ -407,11 +467,24 @@ pcmk_shutdown_worker(gpointer user_data) time_t now = time(NULL); if (child->respawn) { + if (child->pid == PCMK__SPECIAL_PID) { + crm_warn("The process behind %s IPC cannot be" + " terminated, so either wait the graceful" + " period of %ld s for its native termination" + " if it vitally depends on some other daemons" + " going down in a controlled way already," + " or locate and kill the correct %s process" + " on your own; set PCMK_fail_fast=1 to avoid" + " this altogether next time around", + child->name, (long) SHUTDOWN_ESCALATION_PERIOD, + child->command); + } next_log = now + 30; child->respawn = FALSE; stop_child(child, SIGTERM); if (phase < pcmk_children[pcmk_child_crmd].start_seq) { - g_timeout_add(180000 /* 3m */ , escalate_shutdown, child); + g_timeout_add(SHUTDOWN_ESCALATION_PERIOD, + escalate_shutdown, child); } } else if (now >= next_log) { @@ -696,7 +769,106 @@ mcp_chown(const char *path, uid_t uid, gid_t gid) } } -#if SUPPORT_PROCFS +/*! + * \internal + * \brief Check the liveness of the child based on IPC name and PID if tracked + * + * \param[inout] child Child tracked data + * + * \return 0 if no trace of child's liveness detected (while it is detectable + * to begin with, at least according to one of the two properties), + * 1 if everything is fine, 2 if it's up per PID, but not per IPC + * end-point (still starting?), -1 on error, and -2 when the child + * (its IPC) blocked with an unauthorized process (log message + * emitted in both latter cases) + * + * \note This function doesn't modify any of \p child members but \c pid, + * and is not actively toying with processes as such but invoking + * \c stop_child in one particular case (there's for some reason + * a different authentic holder of the IPC end-point). + */ +static int +pcmk_child_active(pcmk_child_t *child) { + static uid_t cl_uid = 0; + static gid_t cl_gid = 0; + const uid_t root_uid = 0; + const gid_t root_gid = 0; + const uid_t *ref_uid; + const gid_t *ref_gid; + int ret = 0; + pid_t ipc_pid = 0; + const char *use_name; + + if (child->endpoint == NULL + && (child->pid <= 0 || child->pid == PCMK__SPECIAL_PID)) { + crm_err("Cannot track child %s for missing both API end-point and PID", + child->name); + ret = -1; /* misuse of the function when child is not trackable */ + + } else if (child->endpoint != NULL) { + + ref_uid = (child->uid != NULL) ? &cl_uid : &root_uid; + ref_gid = (child->uid != NULL) ? &cl_gid : &root_gid; + + if (child->uid != NULL && !cl_uid && !cl_gid + && crm_user_lookup(CRM_DAEMON_USER, &cl_uid, &cl_gid) < 0) { + crm_err("Could not find user and group IDs for user %s", + CRM_DAEMON_USER); + ret = -1; + } else if ((ret = pcmk__ipc_is_authentic_process_active(child->endpoint, + *ref_uid, *ref_gid, + &ipc_pid)) < 0) { + /* game over */ + } else if (child->pid <= 0) { + /* hit new child to be initialized, or reset to zero + and investigate further for ret == 0 */ + child->pid = ipc_pid; + } else if (ipc_pid && child->pid != ipc_pid) { + /* ultimately strange for ret == 1; either way, investigate */ + ret = 0; + } + } + + if (!ret) { + use_name = (child->flag == crm_proc_stonith_ng) + ? "stonithd" /* compensate "simplification" 61fc951e9 */ + : child->name; + /* when no IPC based liveness detected (incl. if ever a child without + IPC is tracked), or detected for a different _authentic_ process; + safe on FreeBSD since the only change possible from a proper child's + PID into "special" PID of 1 behind more loosely related process */ + ret = crm_pid_active(child->pid, use_name); + if (ipc_pid && (ret != 1 + || ipc_pid == PCMK__SPECIAL_PID + || crm_pid_active(ipc_pid, use_name) == 1)) { + if (ret == 1) { + /* assume there's no forking-while-retaining-IPC-socket + involved in the "children's" lifecycle, hence that the + tracking got out of sync purely because of some external + (esotheric?) forces (user initiated process "refresh" by + force? or intentionally racing on start-up, even?), and + that switching over to this other detected, authentic + instance with an IPC already in possession is a better + trade-off than "neutralizing" it first so as to give + either the original or possibly a new to-be-spawned + daemon process a leeway for operation, which would + otherwise have to be carried out */ + /* not possessing IPC, afterall (what about corosync CPG?) */ + stop_child(child, SIGKILL); + } else { + ret = 1; + } + child->pid = ipc_pid; + } else if (ret == 1) { + ret = 2; /* up per PID, but not per IPC (still starting?) */ + } else if (!child->pid && ret == -1) { + ret = 0; /* correct -1 on FreeBSD from above back to 0 */ + } + } + + return ret; +} + static gboolean check_active_before_startup_processes(gpointer user_data) { @@ -713,15 +885,41 @@ check_active_before_startup_processes(gpointer user_data) continue; } else { const char *name = pcmk_children[lpc].name; - if (pcmk_children[lpc].flag == crm_proc_stonith_ng) { - name = "stonithd"; - } - - if (crm_pid_active(pcmk_children[lpc].pid, name) != 1) { - crm_notice("Process %s terminated (pid=%d)", - name, pcmk_children[lpc].pid); - pcmk_process_exit(&(pcmk_children[lpc])); - continue; + int ret; + + switch ((ret = pcmk_child_active(&pcmk_children[lpc]))) { + case 1: + break; + case 0: + case 2: /* this very case: it was OK once already */ + if (pcmk_children[lpc].respawn == TRUE) { + /* presumably after crash, hence critical */ + crm_crit("Process %s terminated (pid=%lld)%s", \ + name, (long long) + PCMK__SPECIAL_PID_AS_0(pcmk_children[lpc].pid), + ret ? ", at least per IPC end-point that went AWOL" + : ""); + } else { + /* orderly shutdown */ + crm_notice("Process %s terminated (pid=%lld)%s", \ + name, (long long) + PCMK__SPECIAL_PID_AS_0(pcmk_children[lpc].pid), + ret ? ", at least per IPC end-point that went AWOL" + : ""); + } + pcmk_process_exit(&(pcmk_children[lpc])); + continue; + default: + crm_crit("Unexpected value from pcmk_child_active:" + " %d (pid=%lld)", ret, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[lpc].pid)); + /* fall through */ + case -1: + case -2: + /* message(s) already emitted */ + crm_exit(DAEMON_RESPAWN_STOP); + break; /* static analysis/noreturn */ } } /* at least one of the processes found at startup @@ -730,61 +928,147 @@ check_active_before_startup_processes(gpointer user_data) } } + global_keep_tracking = keep_tracking; return keep_tracking; } -#endif // SUPPORT_PROCFS -static void +/*! + * \internal + * \brief Initial one-off check of the pre-existing "child" processes + * + * With "child" process, we mean the subdaemon that defines an API end-point + * (all of them do as of the comment) -- the possible complement is skipped + * as it is deemed it has no such shared resources to cause conflicts about, + * hence it can presumably be started anew without hesitation. + * If that won't hold true in the future, the concept of a shared resource + * will have to be generalized beyond the API end-point. + * + * For boundary cases that the "child" is still starting (IPC end-point is yet + * to be witnessed), or more rarely (practically FreeBSD only), when there's + * a pre-existing "untrackable" authentic process, we give the situation some + * time to possibly unfold in the right direction, meaning that said socket + * will appear or the unattainable process will disappear per the observable + * IPC, respectively. + * + * \return 0 if no such "child" process found, positive number X when X + * "children" detected, -1 on an internal error, -2 when any + * would-be-used IPC is blocked with an unauthorized process + * + * \note Since this gets run at the very start, \c respawn_count fields + * for particular children get temporarily overloaded with "rounds + * of waiting" tracking, restored once we are about to finish with + * success (i.e. returning value >=0) and will remain unrestored + * otherwise. One way to suppress liveness detection logic for + * particular child is to set the said value to a negative number. + */ +#define WAIT_TRIES 4 /* together with interleaved sleeps, worst case ~ 1s */ +static int find_and_track_existing_processes(void) { -#if SUPPORT_PROCFS - DIR *dp; - struct dirent *entry; - bool start_tracker = FALSE; - char entry_name[64]; - - dp = opendir("/proc"); - if (!dp) { - /* no proc directory to search through */ - crm_notice("Can not read /proc directory to track existing components"); - return; - } - - while ((entry = readdir(dp)) != NULL) { - int pid; - int max = SIZEOF(pcmk_children); - int i; - - if (crm_procfs_process_info(entry, entry_name, &pid) < 0) { - continue; - } - for (i = 0; i < max; i++) { - const char *name = pcmk_children[i].name; - - if (pcmk_children[i].start_seq == 0) { + unsigned tracking = 0U; + bool wait_in_progress; + int cur; + size_t i, rounds; + + for (rounds = 1; rounds <= WAIT_TRIES; rounds++) { + wait_in_progress = false; + for (i = 0; i < SIZEOF(pcmk_children); i++) { + if (!pcmk_children[i].endpoint + || pcmk_children[i].respawn_count < 0 + || !(cur = pcmk_child_active(&pcmk_children[i]))) { + /* as a speculation, don't give up in the context of + pcmk_child_active check if there are more rounds to + come for other reasons, but don't artificially wait just + because of this, since we would preferably start ASAP */ continue; } - if (pcmk_children[i].flag == crm_proc_stonith_ng) { - name = "stonithd"; - } - if (safe_str_eq(entry_name, name) && (crm_pid_active(pid, NULL) == 1)) { - crm_notice("Tracking existing %s process (pid=%d)", name, pid); - pcmk_children[i].pid = pid; - pcmk_children[i].active_before_startup = TRUE; - start_tracker = TRUE; - break; + pcmk_children[i].respawn_count = rounds; + switch (cur) { + case 1: + if (pcmk_children[i].pid == PCMK__SPECIAL_PID) { + if (crm_is_true(getenv("PCMK_fail_fast"))) { + crm_crit("Cannot reliably track pre-existing" + " authentic process behind %s IPC on this" + " platform and PCMK_fail_fast requested", + pcmk_children[i].endpoint); + return -1; + } else if (pcmk_children[i].respawn_count == WAIT_TRIES) { + crm_notice("Assuming pre-existing authentic, though" + " on this platform untrackable, process" + " behind %s IPC is stable (was in %d" + " previous samples) so rather than" + " bailing out (PCMK_fail_fast not" + " requested), we just switch to a less" + " optimal IPC liveness monitoring" + " (not very suitable for heavy load)", + pcmk_children[i].name, WAIT_TRIES - 1); + crm_warn("The process behind %s IPC cannot be" + " terminated, so the overall shutdown" + " will get delayed implicitly (%ld s)," + " which serves as a graceful period for" + " its native termination if it vitally" + " depends on some other daemons going" + " down in a controlled way already", + pcmk_children[i].name, + (long) SHUTDOWN_ESCALATION_PERIOD); + } else { + wait_in_progress = true; + crm_warn("Cannot reliably track pre-existing" + " authentic process behind %s IPC on this" + " platform, can still disappear in %d" + " attempt(s)", pcmk_children[i].endpoint, + WAIT_TRIES - pcmk_children[i].respawn_count); + continue; + } + } + crm_notice("Tracking existing %s process (pid=%lld)", + pcmk_children[i].name, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[i].pid)); + pcmk_children[i].respawn_count = -1; /* 0~keep watching */ + pcmk_children[i].active_before_startup = TRUE; + tracking++; + break; + case 2: + if (pcmk_children[i].respawn_count == WAIT_TRIES) { + crm_crit("%s IPC end-point for existing authentic" + " process %lld did not (re)appear", + pcmk_children[i].endpoint, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[i].pid)); + return -1; + } + wait_in_progress = true; + crm_warn("Cannot find %s IPC end-point for existing" + " authentic process %lld, can still (re)appear" + " in %d attempts (?)", + pcmk_children[i].endpoint, + (long long) PCMK__SPECIAL_PID_AS_0( + pcmk_children[i].pid), + WAIT_TRIES - pcmk_children[i].respawn_count); + continue; + case -1: + case -2: + return cur; /* messages already emitted */ + default: + crm_crit("Unexpected condition"CRM_XS"cur=%d", cur); + return -1; /* unexpected condition */ } } + if (!wait_in_progress) { + break; + } + (void) poll(NULL, 0, 250); /* a bit for changes to possibly happen */ + } + for (i = 0; i < SIZEOF(pcmk_children); i++) { + pcmk_children[i].respawn_count = 0; /* restore pristine state */ } - if (start_tracker) { - g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, check_active_before_startup_processes, - NULL); + if (tracking) { + g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, + check_active_before_startup_processes, NULL); } - closedir(dp); -#else - crm_notice("No procfs support, so skipping check for existing components"); -#endif // SUPPORT_PROCFS + return (tracking > INT_MAX) ? INT_MAX : tracking; } static void @@ -1106,7 +1390,16 @@ main(int argc, char **argv) setenv("PCMK_watchdog", "false", 1); } - find_and_track_existing_processes(); + switch (find_and_track_existing_processes()) { + case -1: + crm_crit("Internal fatality, see the log"); + crm_exit(DAEMON_RESPAWN_STOP); + case -2: + crm_crit("Blocked by foreign process, kill the offender"); + crm_exit(ENOLCK); + default: + break; + }; cluster.destroy = mcp_cpg_destroy; cluster.cpg.cpg_deliver_fn = mcp_cpg_deliver;