diff --git a/docs/changelog/112330.yaml b/docs/changelog/112330.yaml new file mode 100644 index 0000000000000..498698f5175ba --- /dev/null +++ b/docs/changelog/112330.yaml @@ -0,0 +1,5 @@ +pr: 112330 +summary: Add links to network disconnect troubleshooting +area: Network +type: enhancement +issues: [] diff --git a/docs/reference/modules/transport.asciidoc b/docs/reference/modules/transport.asciidoc index d08da2cfc1d2f..fc7b6831ca848 100644 --- a/docs/reference/modules/transport.asciidoc +++ b/docs/reference/modules/transport.asciidoc @@ -185,16 +185,18 @@ configured, and defaults otherwise to `transport.tcp.reuse_address`. A transport connection between two nodes is made up of a number of long-lived TCP connections, some of which may be idle for an extended period of time. -Nonetheless, Elasticsearch requires these connections to remain open, and it -can disrupt the operation of your cluster if any inter-node connections are -closed by an external influence such as a firewall. It is important to -configure your network to preserve long-lived idle connections between -Elasticsearch nodes, for instance by leaving `*.tcp.keep_alive` enabled and -ensuring that the keepalive interval is shorter than any timeout that might -cause idle connections to be closed, or by setting `transport.ping_schedule` if -keepalives cannot be configured. Devices which drop connections when they reach -a certain age are a common source of problems to Elasticsearch clusters, and -must not be used. +Nonetheless, {es} requires these connections to remain open, and it can disrupt +the operation of your cluster if any inter-node connections are closed by an +external influence such as a firewall. 
It is important to configure your network +to preserve long-lived idle connections between {es} nodes, for instance by +leaving `*.tcp.keep_alive` enabled and ensuring that the keepalive interval is +shorter than any timeout that might cause idle connections to be closed, or by +setting `transport.ping_schedule` if keepalives cannot be configured. Devices +which drop connections when they reach a certain age are a common source of +problems to {es} clusters, and must not be used. + +For information about troubleshooting unexpected network disconnections, see +<<troubleshooting-unstable-cluster-network>>. [[request-compression]] ===== Request compression diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java index 59c55fb7b624a..f73425c42a1c2 100644 --- a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java +++ b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java @@ -43,6 +43,7 @@ public enum ReferenceDocs { UNSTABLE_CLUSTER_TROUBLESHOOTING, LAGGING_NODE_TROUBLESHOOTING, SHARD_LOCK_TROUBLESHOOTING, + NETWORK_DISCONNECT_TROUBLESHOOTING, CONCURRENT_REPOSITORY_WRITERS, ARCHIVE_INDICES, HTTP_TRACER, diff --git a/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java b/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java index 4d6a66b6ec075..da8f7b25e5197 100644 --- a/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java +++ b/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java @@ -12,6 +12,7 @@ import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.support.ContextPreservingActionListener; import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.common.ReferenceDocs; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.common.util.concurrent.ListenableFuture; 
@@ -237,7 +238,13 @@ private void connectToNodeOrRetry( if (connectingRefCounter.hasReferences() == false) { logger.trace("connection manager shut down, closing transport connection to [{}]", node); } else if (conn.hasReferences()) { - logger.info("transport connection to [{}] closed by remote", node.descriptionWithoutAttributes()); + logger.info( + """ + transport connection to [{}] closed by remote; \ + if unexpected, see [{}] for troubleshooting guidance""", + node.descriptionWithoutAttributes(), + ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING + ); // In production code we only close connections via ref-counting, so this message confirms that a // 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a // node leaves the cluster with "reason: disconnected" but without this message being logged then diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json index cc0bc5e2257c8..4da6de7f7b561 100644 --- a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json +++ b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json @@ -4,6 +4,7 @@ "UNSTABLE_CLUSTER_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html", "LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-lagging", "SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-shardlockobtainfailedexception", + "NETWORK_DISCONNECT_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-network", "CONCURRENT_REPOSITORY_WRITERS": "diagnosing-corrupted-repositories.html", "ARCHIVE_INDICES": "archive-indices.html", "HTTP_TRACER": "modules-network.html#http-rest-request-tracer", diff --git a/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java 
b/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java index 27874d4311cd2..675c3e63db7d5 100644 --- a/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java +++ b/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java @@ -188,7 +188,10 @@ public void testDisconnectLogging() { "remotely-triggered close message", ClusterConnectionManager.class.getCanonicalName(), Level.INFO, - "transport connection to [" + remoteClose.descriptionWithoutAttributes() + "] closed by remote" + "transport connection to [" + + remoteClose.descriptionWithoutAttributes() + + "] closed by remote; " + + "if unexpected, see [https://www.elastic.co/guide/en/elasticsearch/reference/*] for troubleshooting guidance" ) ); mockLog.addExpectation(