From ac102e53f3d5eb318e682101b2060cba7ae90936 Mon Sep 17 00:00:00 2001 From: Dianna Hohensee Date: Fri, 10 May 2024 08:33:28 -0400 Subject: [PATCH] Improve join NotMasterException response, and add class documentation (#108107) The NotMasterException response to a join request is difficult to use to diagnose a failed join attempt. Enhancing the NotMasterException to include what node is thought to be master and the current term. This additional information will help readers locate the real master, to go look at those logs. The additional class documentation on JoinHelper and ClusterFormationFailureHelper should improve comprehension of the circumstances of error message logs. --- .../ClusterFormationFailureHelper.java | 23 ++++++++++++++- .../cluster/coordination/JoinHelper.java | 29 +++++++++++++++++-- .../coordination/NodeJoinExecutor.java | 10 ++++++- 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java index c2cd403836593..e81d8d73af9a2 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/ClusterFormationFailureHelper.java @@ -43,9 +43,16 @@ import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING; import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY; +/** + * Handles periodic debug logging of information regarding why the cluster has failed to form. + * Periodic logging begins once {@link #start()} is called, and ceases on {@link #stop()}. + */ public class ClusterFormationFailureHelper { private static final Logger logger = LogManager.getLogger(ClusterFormationFailureHelper.class); + /** + * This time period controls how often warning log messages will be written if this node fails to join or form a cluster. + */ public static final Setting DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING = Setting.timeSetting( "discovery.cluster_formation_warning_timeout", TimeValue.timeValueMillis(10000), @@ -61,6 +68,16 @@ public class ClusterFormationFailureHelper { @Nullable // if no warning is scheduled private volatile WarningScheduler warningScheduler; + /** + * Works with the {@link JoinHelper} to log the latest node-join attempt failure and cluster state debug information. Must call + * {@link ClusterFormationState#start()} to begin. + * + * @param settings provides the period in which to log cluster formation errors. + * @param clusterFormationStateSupplier information about the current believed cluster state (See {@link ClusterFormationState}) + * @param threadPool the thread pool on which to run debug logging + * @param logLastFailedJoinAttempt invokes an instance of the JoinHelper to log the last encountered join failure + * (See {@link JoinHelper#logLastFailedJoinAttempt()}) + */ public ClusterFormationFailureHelper( Settings settings, Supplier clusterFormationStateSupplier, @@ -78,6 +95,10 @@ public boolean isRunning() { return warningScheduler != null; } + /** + * Schedules a warning debug message to be logged in 'clusterFormationWarningTimeout' time, and periodically thereafter, until + * {@link ClusterFormationState#stop()} has been called. + */ public void start() { assert warningScheduler == null; warningScheduler = new WarningScheduler(); @@ -129,7 +150,7 @@ public String toString() { } /** - * If this node believes that cluster formation has failed, this record provides information that can be used to determine why that is. + * This record provides node state information that can be used to determine why cluster formation has failed. */ public record ClusterFormationState( List initialMasterNodesSetting, diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java b/server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java index b960bb02ceb7f..059400ad81cfb 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/JoinHelper.java @@ -194,13 +194,23 @@ private void unregisterAndReleaseConnection(DiscoveryNode destination, Releasabl Releasables.close(connectionReference); } - // package-private for testing + /** + * Saves information about a join failure. The failure information may be logged later via either {@link FailedJoinAttempt#logNow} + * or {@link FailedJoinAttempt#lastFailedJoinAttempt}. + * + * Package-private for testing. + */ static class FailedJoinAttempt { private final DiscoveryNode destination; private final JoinRequest joinRequest; private final ElasticsearchException exception; private final long timestamp; + /** + * @param destination the master node targeted by the join request. + * @param joinRequest the join request that was sent to the perceived master node. + * @param exception the error response received in reply to the join request attempt. + */ FailedJoinAttempt(DiscoveryNode destination, JoinRequest joinRequest, ElasticsearchException exception) { this.destination = destination; this.joinRequest = joinRequest; @@ -208,10 +218,18 @@ static class FailedJoinAttempt { this.timestamp = System.nanoTime(); } + /** + * Logs the failed join attempt exception. + * {@link FailedJoinAttempt#getLogLevel(ElasticsearchException)} determines at what log-level the log is written. + */ void logNow() { logger.log(getLogLevel(exception), () -> format("failed to join %s with %s", destination, joinRequest), exception); } + /** + * Returns the appropriate log level based on the given exception. Every error is at least DEBUG, but unexpected errors are INFO. + * For example, NotMasterException and CircuitBreakingExceptions are DEBUG logs. + */ static Level getLogLevel(ElasticsearchException e) { Throwable cause = e.unwrapCause(); if (cause instanceof CoordinationStateRejectedException @@ -226,6 +244,10 @@ void logWarnWithTimestamp() { logger.warn( () -> format( "last failed join attempt was %s ago, failed to join %s with %s", + // 'timestamp' is when this error exception was received by the local node. If the time that has passed since the error + // was originally received is quite large, it could indicate that this is a stale error exception from some prior + // out-of-order request response (where a later sent request but earlier received response was successful); or + // alternatively an old error could indicate that this node did not retry the join request for a very long time. TimeValue.timeValueMillis(TimeValue.nsecToMSec(System.nanoTime() - timestamp)), destination, joinRequest @@ -235,6 +257,9 @@ void logWarnWithTimestamp() { } } + /** + * Logs a warning message if {@link #lastFailedJoinAttempt} has been set with a failure. + */ void logLastFailedJoinAttempt() { FailedJoinAttempt attempt = lastFailedJoinAttempt.get(); if (attempt != null) { @@ -247,7 +272,7 @@ public void sendJoinRequest(DiscoveryNode destination, long term, Optional assert destination.isMasterNode() : "trying to join master-ineligible " + destination; final StatusInfo statusInfo = nodeHealthService.getHealth(); if (statusInfo.getStatus() == UNHEALTHY) { - logger.debug("dropping join request to [{}]: [{}]", destination, statusInfo.getInfo()); + logger.debug("dropping join request to [{}], unhealthy status: [{}]", destination, statusInfo.getInfo()); return; } final JoinRequest joinRequest = new JoinRequest( diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java b/server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java index 2c024063e2399..9223e02fc946c 100644 --- a/server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java +++ b/server/src/main/java/org/elasticsearch/cluster/coordination/NodeJoinExecutor.java @@ -26,6 +26,7 @@ import org.elasticsearch.cluster.routing.allocation.AllocationService; import org.elasticsearch.cluster.version.CompatibilityVersions; import org.elasticsearch.common.Priority; +import org.elasticsearch.common.Strings; import org.elasticsearch.features.FeatureService; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; @@ -123,7 +124,14 @@ public ClusterState execute(BatchExecutionContext batchExecutionContex newState = ClusterState.builder(initialState); } else { logger.trace("processing node joins, but we are not the master. current master: {}", currentNodes.getMasterNode()); - throw new NotMasterException("Node [" + currentNodes.getLocalNode() + "] not master for join request"); + throw new NotMasterException( + Strings.format( + "Node [%s] not master for join request. Current known master [%s], current term [%d]", + currentNodes.getLocalNode(), + currentNodes.getMasterNode(), + term + ) + ); } DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder(newState.nodes());