From 136b1a1adf1d9575701496be0053af12aab1851c Mon Sep 17 00:00:00 2001 From: Artem Prigoda Date: Wed, 28 Feb 2024 17:08:56 +0100 Subject: [PATCH] [test] Run cluster explanation if can't relocate a shard from a node (#105747) If we can't relocate a shard to a different node for some reason, print out a shard allocation explanation, so we have more debug information for diagnostics. Resolves #104807 See #105443 --- .../cluster/PrevalidateShardPathIT.java | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/PrevalidateShardPathIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/PrevalidateShardPathIT.java index 3a1fa8e5da272..560a525ec526c 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/PrevalidateShardPathIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/PrevalidateShardPathIT.java @@ -9,14 +9,18 @@ package org.elasticsearch.cluster; import org.apache.lucene.tests.util.LuceneTestCase; +import org.elasticsearch.action.admin.cluster.allocation.ClusterAllocationExplainRequest; +import org.elasticsearch.action.admin.cluster.allocation.TransportClusterAllocationExplainAction; import org.elasticsearch.action.admin.cluster.node.shutdown.NodePrevalidateShardPathResponse; import org.elasticsearch.action.admin.cluster.node.shutdown.PrevalidateShardPathRequest; import org.elasticsearch.action.admin.cluster.node.shutdown.PrevalidateShardPathResponse; import org.elasticsearch.action.admin.cluster.node.shutdown.TransportPrevalidateShardPathAction; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.UUIDs; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.ChunkedToXContent; import org.elasticsearch.index.shard.ShardId; import 
org.elasticsearch.test.ESIntegTestCase; @@ -77,7 +81,31 @@ public void testCheckShards() throws Exception { assertThat(resp2.getNodes().size(), equalTo(1)); assertThat(resp2.getNodes().get(0).getNode().getId(), equalTo(node2Id)); assertTrue("There should be no failures in the response", resp.failures().isEmpty()); - assertTrue("The relocation source node should have removed the shard(s)", resp2.getNodes().get(0).getShardIds().isEmpty()); + Set node2ShardIds = resp2.getNodes().get(0).getShardIds(); + if (node2ShardIds.size() > 0) { + for (var node2Shard : clusterService().state() + .routingTable() + .allShards() + .filter(s -> s.getIndexName().equals(indexName)) + .filter(s -> node2ShardIds.contains(s.shardId())) + .filter(s -> s.currentNodeId().equals(node2Id)) + .toList()) { + var explanation = client().execute( + TransportClusterAllocationExplainAction.TYPE, + new ClusterAllocationExplainRequest().setIndex(node2Shard.getIndexName()) + .setCurrentNode(node2Shard.currentNodeId()) + .setShard(node2Shard.id()) + .setPrimary(node2Shard.primary()) + ).get(); + logger.info( + "Shard: {} is still located on relocation source node: {}. Allocation explanation: {}", + node2Shard.shardId(), + node2, + Strings.toString(ChunkedToXContent.wrapAsToXContent(explanation), false, true) + ); + } + throw new AssertionError("The relocation source node should have removed the shard(s)"); + } } catch (AssertionError e) { // Removal of shards which are no longer allocated to the node is attempted on every cluster state change in IndicesStore. // If for whatever reason the removal is not triggered (e.g. not enough nodes reported that the shards are active) or it