diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java
index 30edea6551067..669e24f9fb555 100644
--- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java
+++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java
@@ -31,6 +31,9 @@
 import java.util.stream.Collectors;
 
 import static org.opensearch.cluster.routing.ShardRoutingState.STARTED;
+import static org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.PREFER_PRIMARY_SHARD_BALANCE;
+import static org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.PREFER_PRIMARY_SHARD_REBALANCE;
+import static org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.PRIMARY_SHARD_REBALANCE_BUFFER;
 import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked;
 
 @OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0)
@@ -58,6 +61,25 @@ public void enablePreferPrimaryBalance() {
         );
     }
 
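+    /**
+     * Helper that persists the cluster settings controlling primary shard placement: the
+     * prefer-primary-balance and prefer-primary-rebalance preferences and the primary shard
+     * rebalance buffer.
+     */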
+    public void setAllocationRelocationStrategy(boolean preferPrimaryBalance, boolean preferPrimaryRebalance, float buffer) {
+        assertAcked(
+            client().admin()
+                .cluster()
+                .prepareUpdateSettings()
+                .setPersistentSettings(
+                    Settings.builder()
+                        .put(PREFER_PRIMARY_SHARD_BALANCE.getKey(), preferPrimaryBalance)
+                        .put(PREFER_PRIMARY_SHARD_REBALANCE.getKey(), preferPrimaryRebalance)
+                        .put(PRIMARY_SHARD_REBALANCE_BUFFER.getKey(), buffer)
+                )
+        );
+    }
+
     /**
      * This test verifies that the overall primary balance is attained during allocation. This test verifies primary
      * balance per index and across all indices is maintained.
@@ -87,7 +109,7 @@ public void testGlobalPrimaryAllocation() throws Exception {
         state = client().admin().cluster().prepareState().execute().actionGet().getState();
         logger.info(ShardAllocations.printShardDistribution(state));
         verifyPerIndexPrimaryBalance();
-        verifyPrimaryBalance();
+        verifyPrimaryBalance(0.0f);
     }
 
     /**
@@ -224,6 +246,70 @@ public void testAllocationWithDisruption() throws Exception {
         verifyPerIndexPrimaryBalance();
     }
 
+    /**
+     * Similar to testSingleIndexShardAllocation, but creates multiple indices and repeatedly adds and
+     * removes nodes. After each such event the test asserts that the primary shard distribution is balanced
+     * for each index as well as across all nodes when PREFER_PRIMARY_SHARD_REBALANCE is set to true.
+     */
+    public void testAllocationAndRebalanceWithDisruption() throws Exception {
+        internalCluster().startClusterManagerOnlyNode();
+        final int maxReplicaCount = 2;
+        final int maxShardCount = 2;
+        // Create a higher number of nodes than shards to reduce the chance of SameShardAllocationDecider
+        // kicking in and preventing primary relocations
+        final int nodeCount = randomIntBetween(5, 10);
+        final int numberOfIndices = randomIntBetween(1, 10);
+        final float buffer = randomIntBetween(1, 4) * 0.10f;
+
+        logger.info("--> Creating {} nodes", nodeCount);
+        final List<String> nodeNames = new ArrayList<>();
+        for (int i = 0; i < nodeCount; i++) {
+            nodeNames.add(internalCluster().startNode());
+        }
+        setAllocationRelocationStrategy(true, true, buffer);
+
+        int shardCount, replicaCount;
+        ClusterState state;
+        for (int i = 0; i < numberOfIndices; i++) {
+            shardCount = randomIntBetween(1, maxShardCount);
+            replicaCount = randomIntBetween(1, maxReplicaCount);
+            logger.info("--> Creating index test{} with primary {} and replica {}", i, shardCount, replicaCount);
+            createIndex("test" + i, shardCount, replicaCount, i % 2 == 0);
+            ensureGreen(TimeValue.timeValueSeconds(60));
+            if (logger.isTraceEnabled()) {
+                state = client().admin().cluster().prepareState().execute().actionGet().getState();
+                logger.info(ShardAllocations.printShardDistribution(state));
+            }
+        }
+        state = client().admin().cluster().prepareState().execute().actionGet().getState();
+        logger.info(ShardAllocations.printShardDistribution(state));
+        verifyPerIndexPrimaryBalance();
+        verifyPrimaryBalance(buffer);
+
+        final int additionalNodeCount = randomIntBetween(1, 5);
+        logger.info("--> Adding {} nodes", additionalNodeCount);
+
+        internalCluster().startNodes(additionalNodeCount);
+        ensureGreen(TimeValue.timeValueSeconds(60));
+        state = client().admin().cluster().prepareState().execute().actionGet().getState();
+        logger.info(ShardAllocations.printShardDistribution(state));
+        verifyPerIndexPrimaryBalance();
+        verifyPrimaryBalance(buffer);
+
+        int nodeCountToStop = additionalNodeCount;
+        while (nodeCountToStop > 0) {
+            internalCluster().stopRandomDataNode();
+            // give the replica a chance to be promoted to primary before terminating the node containing it
+            ensureGreen(TimeValue.timeValueSeconds(60));
+            nodeCountToStop--;
+        }
+        state = client().admin().cluster().prepareState().execute().actionGet().getState();
+        logger.info("--> Cluster state post nodes stop {}", state);
+        logger.info(ShardAllocations.printShardDistribution(state));
+        verifyPerIndexPrimaryBalance();
+        verifyPrimaryBalance(buffer);
+    }
+
     /**
      * Utility method which ensures cluster has balanced primary shard distribution across a single index.
      * @throws Exception exception
@@ -263,7 +349,7 @@ private void verifyPerIndexPrimaryBalance() throws Exception {
         }, 60, TimeUnit.SECONDS);
     }
 
-    private void verifyPrimaryBalance() throws Exception {
+    private void verifyPrimaryBalance(float buffer) throws Exception {
         assertBusy(() -> {
             final ClusterState currentState = client().admin().cluster().prepareState().execute().actionGet().getState();
             RoutingNodes nodes = currentState.getRoutingNodes();
@@ -278,7 +364,8 @@ private void verifyPrimaryBalance() throws Exception {
                     .filter(ShardRouting::primary)
                     .collect(Collectors.toList())
                     .size();
-                assertTrue(primaryCount <= avgPrimaryShardsPerNode);
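+                // a node may hold at most (1 + buffer) times the average number of primaries per node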
+                assertTrue(primaryCount <= (avgPrimaryShardsPerNode * (1 + buffer)));
             }
         }, 60, TimeUnit.SECONDS);
     }