diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h index 2d345528a4e..35bd229cced 100644 --- a/crmd/crmd_utils.h +++ b/crmd/crmd_utils.h @@ -97,6 +97,7 @@ void crmd_join_phase_log(int level); const char *get_timer_desc(fsa_timer_t * timer); gboolean too_many_st_failures(void); +void reset_st_fail_count(const char * target); # define fsa_register_cib_callback(id, flag, data, fn) do { \ fsa_cib_conn->cmds->register_callback( \ diff --git a/crmd/lrm.c b/crmd/lrm.c index 1f514eba9e6..fe2d02b70d6 100644 --- a/crmd/lrm.c +++ b/crmd/lrm.c @@ -1014,7 +1014,7 @@ lrm_clear_last_failure(const char *rsc_id, const char *node_name) } } free(attr); - + g_list_free(lrm_state_list); } static gboolean diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c index 5e51f5e87f5..07cd67cafc5 100644 --- a/crmd/remote_lrmd_ra.c +++ b/crmd/remote_lrmd_ra.c @@ -396,7 +396,7 @@ handle_remote_ra_exec(gpointer user_data) fsa_cib_delete(XML_CIB_TAG_STATUS, status, cib_quorum_override, rc, NULL); crm_info("Forced a remote LRM refresh before connection start: call=%d", rc); crm_log_xml_trace(status, "CLEAR LRM"); - free(status); + free_xml(status); rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout); if (rc == 0) { diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c index 4c80431a9f2..fffc9729a9d 100644 --- a/crmd/te_callbacks.c +++ b/crmd/te_callbacks.c @@ -349,6 +349,20 @@ too_many_st_failures(void) return FALSE; } +void +reset_st_fail_count(const char *target) +{ + struct st_fail_rec *rec = NULL; + + if (stonith_failures) { + rec = g_hash_table_lookup(stonith_failures, target); + } + + if (rec) { + rec->count = 0; + } +} + void tengine_stonith_callback(stonith_t * stonith, stonith_callback_data_t * data) { diff --git a/crmd/te_events.c b/crmd/te_events.c index 112f056c531..f2b9b629e9d 100644 --- a/crmd/te_events.c +++ b/crmd/te_events.c @@ -135,6 +135,7 @@ update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int targe const char *value = NULL; const char *id = 
crm_element_value(event, XML_LRM_ATTR_TASK_KEY); const char *on_uname = get_uname_from_event(event); + const char *origin = crm_element_value(event, XML_ATTR_ORIGIN); if (rc == 99) { /* this is an internal code for "we're busy, try again" */ @@ -144,6 +145,12 @@ update_failcount(xmlNode * event, const char *event_node_uuid, int rc, int targe return FALSE; } + if (safe_str_eq(origin, "build_active_RAs")) { + crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh", + id, rc, on_uname); + return FALSE; + } + if (failed_stop_offset == NULL) { failed_stop_offset = strdup(INFINITY_S); } diff --git a/crmd/te_utils.c b/crmd/te_utils.c index aa22fa393c8..3dcbf471c7d 100644 --- a/crmd/te_utils.c +++ b/crmd/te_utils.c @@ -122,6 +122,11 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event) return; } + if (st_event->result == pcmk_ok && + safe_str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE)) { + reset_st_fail_count(st_event->target); + } + crm_notice("Peer %s was%s terminated (%s) by %s for %s: %s (ref=%s) by client %s", st_event->target, st_event->result == pcmk_ok ? "" : " not", st_event->operation, diff --git a/doc/Pacemaker_Explained/en-US/Ch-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Resources.txt index a3d862b5274..3436bf8152c 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Resources.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Resources.txt @@ -374,6 +374,22 @@ indexterm:[Resource,Option,target-role] indexterm:[multiple-active,Resource Option] indexterm:[Resource,Option,multiple-active] +|remote-node +|++ (disabled) +|The name of the remote-node this resource defines. This both enables the resource as a remote-node and defines the unique name used to identify the remote-node. If no other parameters are set, this value will also be assumed as the hostname to connect to at port 3121. +WARNING+ This value cannot overlap with any resource or node IDs. 
+ +|remote-port +|+3121+ +|Configure a custom port to use for the guest connection to pacemaker_remote. + +|remote-addr +|+remote-node+ value used as hostname +|The ip address or hostname to connect to if remote-node's name is not the hostname of the guest. + +|+remote-connect-timeout+ +|+60s+ +|How long before a pending guest connection will time out. + |========================================================= If you performed the following commands on the previous LSB Email resource diff --git a/doc/Pacemaker_Remote/en-US/Ch-Example.txt b/doc/Pacemaker_Remote/en-US/Ch-Example.txt index 33b70dfbb81..ca94044945f 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-Example.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-Example.txt @@ -66,7 +66,7 @@ Last updated: Wed Mar 13 13:52:39 2013 Last change: Wed Mar 13 13:25:17 2013 via crmd on node1 Stack: corosync Current DC: node1 (24815808) - partition with quorum -Version: 1.1.9 +Version: 1.1.10 2 Nodes configured, unknown expected votes 2 Resources configured. @@ -91,7 +91,7 @@ Last updated: Wed Mar 13 13:52:39 2013 Last change: Wed Mar 13 13:25:17 2013 via crmd on node1 Stack: corosync Current DC: node1 (24815808) - partition with quorum -Version: 1.1.9 +Version: 1.1.10 2 Nodes configured, unknown expected votes 2 Resources configured. diff --git a/doc/Pacemaker_Remote/en-US/Ch-Intro.txt b/doc/Pacemaker_Remote/en-US/Ch-Intro.txt index a9d575a0540..a505ea1916e 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-Intro.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-Intro.txt @@ -1,7 +1,7 @@ = Extending High Availability Cluster into Virtual Nodes = == Overview == -The recent addition of the +pacemaker_remote+ service supported by +Pacemaker version 1.1.9.1 and greater+ allows nodes not running the cluster stack (pacemaker+corosync) to integrate into the cluster and have the cluster manage their resources just as if they were a real cluster node. 
This means that pacemaker clusters are now capable of managing both launching virtual environments (KVM/LXC) as well as launching the resources that live withing those virtual environments without requiring the virtual environments to run pacemaker or corosync. +The recent addition of the +pacemaker_remote+ service supported by +Pacemaker version 1.1.10 and greater+ allows nodes not running the cluster stack (pacemaker+corosync) to integrate into the cluster and have the cluster manage their resources just as if they were a real cluster node. This means that pacemaker clusters are now capable of managing both launching virtual environments (KVM/LXC) as well as launching the resources that live within those virtual environments without requiring the virtual environments to run pacemaker or corosync. == Terms == +cluster-node+ - A baremetal hardware node running the High Availability stack (pacemaker + corosync) diff --git a/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt index a57d7d73f1a..fe0077524fe 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-KVM-Tutorial.txt @@ -6,7 +6,7 @@ == Step 1: Setup the Host == -This tutorial was created using Fedora 18 on the host and guest nodes. Anything that is capable of running libvirt and pacemaker v1.1.9.1 or greater will do though. An installation guide for installing Fedora 18 can be found here, http://docs.fedoraproject.org/en-US/Fedora/18/html/Installation_Guide/. +This tutorial was created using Fedora 18 on the host and guest nodes. Anything that is capable of running libvirt and pacemaker v1.1.10 or greater will do though. An installation guide for installing Fedora 18 can be found here, http://docs.fedoraproject.org/en-US/Fedora/18/html/Installation_Guide/. Fedora 18 (or similar distro) host preparation steps. @@ -94,7 +94,7 @@ Verify pacemaker status. 
At first the 'pcs cluster status' output will look lik Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host Stack: corosync Current DC: - Version: 1.1.9.1 + Version: 1.1.10 1 Nodes configured, unknown expected votes 0 Resources configured. ---- @@ -285,7 +285,7 @@ Last updated: Thu Mar 14 16:41:22 2013 Last change: Thu Mar 14 16:41:08 2013 via crmd on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum -Version: 1.1.9.1 +Version: 1.1.10 1 Nodes configured, unknown expected votes 0 Resources configured. @@ -344,7 +344,7 @@ Last updated: Fri Mar 15 09:30:30 2013 Last change: Thu Mar 14 17:21:35 2013 via cibadmin on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum -Version: 1.1.9.1 +Version: 1.1.10 2 Nodes configured, unknown expected votes 2 Resources configured. @@ -426,7 +426,7 @@ Last updated: Fri Mar 15 11:00:31 2013 Last change: Fri Mar 15 09:54:16 2013 via cibadmin on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum -Version: 1.1.9.1 +Version: 1.1.10 2 Nodes configured, unknown expected votes 7 Resources configured. @@ -455,7 +455,7 @@ Last updated: Fri Mar 15 11:03:17 2013 Last change: Fri Mar 15 09:54:16 2013 via cibadmin on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum -Version: 1.1.9.1 +Version: 1.1.10 2 Nodes configured, unknown expected votes 7 Resources configured. diff --git a/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt index bcb1c3cc48f..c3459c086a5 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt @@ -6,7 +6,7 @@ == Step 1: Setup LXC Host == -This tutorial was tested with Fedora 18. Anything that is capable of running libvirt and pacemaker v1.1.9.1 or greater will do though. 
An installation guide for installing Fedora 18 can be found here, http://docs.fedoraproject.org/en-US/Fedora/18/html/Installation_Guide/. +This tutorial was tested with Fedora 18. Anything that is capable of running libvirt and pacemaker v1.1.10 or greater will do though. An installation guide for installing Fedora 18 can be found here, http://docs.fedoraproject.org/en-US/Fedora/18/html/Installation_Guide/. Fedora 18 (or similar distro) host preparation steps. @@ -96,7 +96,7 @@ Verify pacemaker status. At first the 'pcs cluster status' output will look lik Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host Stack: corosync Current DC: - Version: 1.1.9.1 + Version: 1.1.10 1 Nodes configured, unknown expected votes 0 Resources configured. ---- @@ -201,7 +201,7 @@ Last updated: Thu Mar 14 16:41:22 2013 Last change: Thu Mar 14 16:41:08 2013 via crmd on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum -Version: 1.1.9.1 +Version: 1.1.10 1 Nodes configured, unknown expected votes 0 Resources configured. @@ -239,7 +239,7 @@ Last updated: Mon Mar 18 17:15:46 2013 Last change: Mon Mar 18 17:15:26 2013 via cibadmin on guest1 Stack: corosync Current DC: example-host (175810752) - partition WITHOUT quorum -Version: 1.1.9.1 +Version: 1.1.10 4 Nodes configured, unknown expected votes 6 Resources configured. @@ -277,7 +277,7 @@ Last updated: Mon Mar 18 17:31:54 2013 Last change: Mon Mar 18 17:31:05 2013 via cibadmin on example-host Stack: corosync Current DC: example=host (175810752) - partition WITHOUT quorum -Version: 1.1.9.1 +Version: 1.1.10 4 Nodes configured, unknown expected votes 11 Resources configured. 
diff --git a/doc/Pacemaker_Remote/en-US/Ch-Options.txt b/doc/Pacemaker_Remote/en-US/Ch-Options.txt index 3ca1800d377..17e8a34faf8 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-Options.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-Options.txt @@ -23,7 +23,7 @@ When configuring a virtual machine or lxc resource to act as a remote-node, thes |Configure a custom port to use for the guest connection to pacemaker_remote. |+remote-addr+ -|node name +|+remote-node+ value used as hostname |The ip address or hostname to connect to if remote-node's name is not the hostname of the guest. |+remote-connect-timeout+ diff --git a/lib/common/remote.c b/lib/common/remote.c index fd302cb5728..8b00f1660b9 100644 --- a/lib/common/remote.c +++ b/lib/common/remote.c @@ -728,11 +728,11 @@ check_connect_finished(gpointer userdata) } else { close(sock); } - free(cb_data); if (cb_data->callback) { cb_data->callback(cb_data->userdata, rc); } + free(cb_data); return FALSE; reschedule: @@ -821,12 +821,12 @@ int crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */ void *userdata, void (*callback) (void *userdata, int sock)) { - struct addrinfo *res; - struct addrinfo *rp; + struct addrinfo *res = NULL; + struct addrinfo *rp = NULL; struct addrinfo hints; const char *server = host; int ret_ga; - int sock; + int sock = -1; /* getaddrinfo */ memset(&hints, 0, sizeof(struct addrinfo)); @@ -843,7 +843,7 @@ crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */ if (!res || !res->ai_addr) { crm_err("getaddrinfo failed"); - return -1; + goto async_cleanup; } for (rp = res; rp != NULL; rp = rp->ai_next) { @@ -879,7 +879,8 @@ crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */ if (callback) { if (internal_tcp_connect_async (sock, rp->ai_addr, rp->ai_addrlen, timeout, userdata, callback) == 0) { - return 0; /* Success for now, we'll hear back later in the callback */ + sock = 0; + goto async_cleanup; /* Success for now, we'll hear back later 
in the callback */ } } else { @@ -891,8 +892,12 @@ crm_remote_tcp_connect_async(const char *host, int port, int timeout, /*ms */ close(sock); sock = -1; } - freeaddrinfo(res); +async_cleanup: + + if (res) { + freeaddrinfo(res); + } return sock; } diff --git a/lib/services/services_linux.c b/lib/services/services_linux.c index 233bd31dae9..6192ccf75f8 100644 --- a/lib/services/services_linux.c +++ b/lib/services/services_linux.c @@ -540,6 +540,7 @@ services_os_action_execute(svc_action_t * op, gboolean synchronous) close(op->opaque->stdout_fd); close(op->opaque->stderr_fd); + close(sfd); if (sigismember(&old_mask, SIGCHLD) == 0) { if (sigprocmask(SIG_UNBLOCK, &mask, NULL) < 0) { diff --git a/tools/cibadmin.c b/tools/cibadmin.c index 886fd9ca77b..479c8c21900 100644 --- a/tools/cibadmin.c +++ b/tools/cibadmin.c @@ -71,6 +71,8 @@ int request_id = 0; int operation_status = 0; cib_t *the_cib = NULL; gboolean force_flag = FALSE; +gboolean quiet = FALSE; +int bump_log_num = 0; /* *INDENT-OFF* */ static struct crm_option long_options[] = { @@ -227,7 +229,7 @@ main(int argc, char **argv) int option_index = 0; - crm_log_init(NULL, LOG_CRIT, FALSE, FALSE, argc, argv, FALSE); + crm_system_name = "cibadmin"; crm_set_options(NULL, "command [options] [data]", long_options, "Provides direct access to the cluster configuration." "\n\nAllows the configuration, or sections of it, to be queried, modified, replaced and deleted." 
@@ -266,6 +268,7 @@ main(int argc, char **argv) break; case 'Q': cib_action = CIB_OP_QUERY; + quiet = TRUE; break; case 'P': cib_action = CIB_OP_APPLY_DIFF; @@ -316,7 +319,7 @@ main(int argc, char **argv) break; case 'V': command_options = command_options | cib_verbose; - crm_bump_log_level(argc, argv); + bump_log_num++; break; case '?': case '$': @@ -384,6 +387,15 @@ main(int argc, char **argv) break; } } + + if (bump_log_num > 0) { + quiet = FALSE; + } + crm_log_init(NULL, LOG_CRIT, FALSE, FALSE, argc, argv, quiet); + while (bump_log_num > 0) { + crm_bump_log_level(argc, argv); + bump_log_num--; + } if (optind < argc) { printf("non-option ARGV-elements: ");