Skip to content

Commit

Permalink
Improve join NotMasterException response, and add class documentation (
Browse files Browse the repository at this point in the history
…elastic#108107)

The NotMasterException response to a join request is difficult to use
to diagnose a failed join attempt. Enhancing the NotMasterException
to include what node is thought to be master and the current term.
This additional information will help readers locate the real master,
to go look at those logs.

The additional class documentation on JoinHelper and
ClusterFormationFailureHelper should improve comprehension of
the circumstances of error message logs.
  • Loading branch information
DiannaHohensee authored May 10, 2024
1 parent fed8088 commit ac102e5
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,16 @@
import static org.elasticsearch.cluster.coordination.ClusterBootstrapService.INITIAL_MASTER_NODES_SETTING;
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;

/**
* Handles periodic debug logging of information regarding why the cluster has failed to form.
* Periodic logging begins once {@link #start()} is called, and ceases on {@link #stop()}.
*/
public class ClusterFormationFailureHelper {
private static final Logger logger = LogManager.getLogger(ClusterFormationFailureHelper.class);

/**
* This time period controls how often warning log messages will be written if this node fails to join or form a cluster.
*/
public static final Setting<TimeValue> DISCOVERY_CLUSTER_FORMATION_WARNING_TIMEOUT_SETTING = Setting.timeSetting(
"discovery.cluster_formation_warning_timeout",
TimeValue.timeValueMillis(10000),
Expand All @@ -61,6 +68,16 @@ public class ClusterFormationFailureHelper {
@Nullable // if no warning is scheduled
private volatile WarningScheduler warningScheduler;

/**
* Works with the {@link JoinHelper} to log the latest node-join attempt failure and cluster state debug information. Must call
* {@link ClusterFormationState#start()} to begin.
*
* @param settings provides the period in which to log cluster formation errors.
* @param clusterFormationStateSupplier information about the current believed cluster state (See {@link ClusterFormationState})
* @param threadPool the thread pool on which to run debug logging
* @param logLastFailedJoinAttempt invokes an instance of the JoinHelper to log the last encountered join failure
* (See {@link JoinHelper#logLastFailedJoinAttempt()})
*/
public ClusterFormationFailureHelper(
Settings settings,
Supplier<ClusterFormationState> clusterFormationStateSupplier,
Expand All @@ -78,6 +95,10 @@ public boolean isRunning() {
return warningScheduler != null;
}

/**
* Schedules a warning debug message to be logged in 'clusterFormationWarningTimeout' time, and periodically thereafter, until
* {@link ClusterFormationState#stop()} has been called.
*/
public void start() {
assert warningScheduler == null;
warningScheduler = new WarningScheduler();
Expand Down Expand Up @@ -129,7 +150,7 @@ public String toString() {
}

/**
* If this node believes that cluster formation has failed, this record provides information that can be used to determine why that is.
* This record provides node state information that can be used to determine why cluster formation has failed.
*/
public record ClusterFormationState(
List<String> initialMasterNodesSetting,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,24 +194,42 @@ private void unregisterAndReleaseConnection(DiscoveryNode destination, Releasabl
Releasables.close(connectionReference);
}

// package-private for testing
/**
* Saves information about a join failure. The failure information may be logged later via either {@link FailedJoinAttempt#logNow}
* or {@link FailedJoinAttempt#lastFailedJoinAttempt}.
*
* Package-private for testing.
*/
static class FailedJoinAttempt {
private final DiscoveryNode destination;
private final JoinRequest joinRequest;
private final ElasticsearchException exception;
private final long timestamp;

/**
* @param destination the master node targeted by the join request.
* @param joinRequest the join request that was sent to the perceived master node.
* @param exception the error response received in reply to the join request attempt.
*/
FailedJoinAttempt(DiscoveryNode destination, JoinRequest joinRequest, ElasticsearchException exception) {
this.destination = destination;
this.joinRequest = joinRequest;
this.exception = exception;
this.timestamp = System.nanoTime();
}

/**
* Logs the failed join attempt exception.
* {@link FailedJoinAttempt#getLogLevel(ElasticsearchException)} determines at what log-level the log is written.
*/
void logNow() {
logger.log(getLogLevel(exception), () -> format("failed to join %s with %s", destination, joinRequest), exception);
}

/**
* Returns the appropriate log level based on the given exception. Every error is at least DEBUG, but unexpected errors are INFO.
* For example, NotMasterException and CircuitBreakingExceptions are DEBUG logs.
*/
static Level getLogLevel(ElasticsearchException e) {
Throwable cause = e.unwrapCause();
if (cause instanceof CoordinationStateRejectedException
Expand All @@ -226,6 +244,10 @@ void logWarnWithTimestamp() {
logger.warn(
() -> format(
"last failed join attempt was %s ago, failed to join %s with %s",
// 'timestamp' is when this error exception was received by the local node. If the time that has passed since the error
// was originally received is quite large, it could indicate that this is a stale error exception from some prior
// out-of-order request response (where a later sent request but earlier received response was successful); or
// alternatively an old error could indicate that this node did not retry the join request for a very long time.
TimeValue.timeValueMillis(TimeValue.nsecToMSec(System.nanoTime() - timestamp)),
destination,
joinRequest
Expand All @@ -235,6 +257,9 @@ void logWarnWithTimestamp() {
}
}

/**
* Logs a warning message if {@link #lastFailedJoinAttempt} has been set with a failure.
*/
void logLastFailedJoinAttempt() {
FailedJoinAttempt attempt = lastFailedJoinAttempt.get();
if (attempt != null) {
Expand All @@ -247,7 +272,7 @@ public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join>
assert destination.isMasterNode() : "trying to join master-ineligible " + destination;
final StatusInfo statusInfo = nodeHealthService.getHealth();
if (statusInfo.getStatus() == UNHEALTHY) {
logger.debug("dropping join request to [{}]: [{}]", destination, statusInfo.getInfo());
logger.debug("dropping join request to [{}], unhealthy status: [{}]", destination, statusInfo.getInfo());
return;
}
final JoinRequest joinRequest = new JoinRequest(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.cluster.version.CompatibilityVersions;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.Strings;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
Expand Down Expand Up @@ -123,7 +124,14 @@ public ClusterState execute(BatchExecutionContext<JoinTask> batchExecutionContex
newState = ClusterState.builder(initialState);
} else {
logger.trace("processing node joins, but we are not the master. current master: {}", currentNodes.getMasterNode());
throw new NotMasterException("Node [" + currentNodes.getLocalNode() + "] not master for join request");
throw new NotMasterException(
Strings.format(
"Node [%s] not master for join request. Current known master [%s], current term [%d]",
currentNodes.getLocalNode(),
currentNodes.getMasterNode(),
term
)
);
}

DiscoveryNodes.Builder nodesBuilder = DiscoveryNodes.builder(newState.nodes());
Expand Down

0 comments on commit ac102e5

Please sign in to comment.