From f85ff03fa8eeb3ec92a8649b1679b1eb4b5df45c Mon Sep 17 00:00:00 2001
From: Rishikesh <62345295+Rishikesh1159@users.noreply.github.com>
Date: Thu, 5 Sep 2024 12:31:21 -0700
Subject: [PATCH 1/7] [Backport 2.x] Add support for randomizing Remote Store
 enabled testing (#13845)

* backport #12488 to 2.x branch.

Signed-off-by: Rishikesh1159 <rishireddy1159@gmail.com>

* Fix indentation.

Signed-off-by: Rishikesh1159 <rishireddy1159@gmail.com>

---------

Signed-off-by: Rishikesh1159 <rishireddy1159@gmail.com>
Signed-off-by: Rishikesh <62345295+Rishikesh1159@users.noreply.github.com>
---
 .../recovery/ReplicaToPrimaryPromotionIT.java |  7 +++-
 .../test/OpenSearchIntegTestCase.java         | 32 ++++++++++++++++---
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/recovery/ReplicaToPrimaryPromotionIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/recovery/ReplicaToPrimaryPromotionIT.java
index 3df4ecff5250c..a2543f0592145 100644
--- a/server/src/internalClusterTest/java/org/opensearch/indices/recovery/ReplicaToPrimaryPromotionIT.java
+++ b/server/src/internalClusterTest/java/org/opensearch/indices/recovery/ReplicaToPrimaryPromotionIT.java
@@ -56,6 +56,11 @@ protected int numberOfReplicas() {
         return 1;
     }
 
+    @Override
+    public boolean useRandomReplicationStrategy() {
+        return true;
+    }
+
     public void testPromoteReplicaToPrimary() throws Exception {
         final String indexName = randomAlphaOfLength(5).toLowerCase(Locale.ROOT);
         createIndex(indexName);
@@ -65,7 +70,7 @@ public void testPromoteReplicaToPrimary() throws Exception {
             try (BackgroundIndexer indexer = new BackgroundIndexer(indexName, "_doc", client(), numOfDocs)) {
                 waitForDocs(numOfDocs, indexer);
             }
-            refresh(indexName);
+            refreshAndWaitForReplication(indexName);
         }
 
         assertHitCount(client().prepareSearch(indexName).setSize(0).get(), numOfDocs);
diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
index 705610829cc9a..a63abdeb626b7 100644
--- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
+++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java
@@ -391,6 +391,14 @@ public abstract class OpenSearchIntegTestCase extends OpenSearchTestCase {
      */
     public static final String TESTS_CLUSTER_NAME = "tests.clustername";
 
+    protected static final String REMOTE_BACKED_STORAGE_REPOSITORY_NAME = "test-remote-store-repo";
+
+    private Path remoteStoreRepositoryPath;
+
+    private ReplicationType randomReplicationType;
+
+    private String randomStorageType;
+
     /**
      * The lucene_default {@link Codec} is not added to the list as it internally maps to Asserting {@link Codec}.
      * The override to fetch the {@link CompletionFieldMapper.CompletionFieldType} postings format is not available for this codec.
@@ -1985,11 +1993,19 @@ protected Settings nodeSettings(int nodeOrdinal) {
             builder.put(TelemetrySettings.TRACER_ENABLED_SETTING.getKey(), true);
         }
 
-        // Randomly set a replication strategy for the node. Replication Strategy can still be manually overridden by subclass if needed.
+        // Randomly set a Replication Strategy and storage type for the node. Both Replication Strategy and Storage Type can still be
+        // manually overridden by subclass if needed.
         if (useRandomReplicationStrategy()) {
-            ReplicationType replicationType = randomBoolean() ? ReplicationType.DOCUMENT : ReplicationType.SEGMENT;
-            logger.info("Randomly using Replication Strategy as {}.", replicationType.toString());
-            builder.put(CLUSTER_REPLICATION_TYPE_SETTING.getKey(), replicationType);
+            if (randomReplicationType.equals(ReplicationType.SEGMENT) && randomStorageType.equals("REMOTE_STORE")) {
+                logger.info("Randomly using Replication Strategy as {} and Storage Type as {}.", randomReplicationType, randomStorageType);
+                if (remoteStoreRepositoryPath == null) {
+                    remoteStoreRepositoryPath = randomRepoPath().toAbsolutePath();
+                }
+                builder.put(remoteStoreClusterSettings(REMOTE_BACKED_STORAGE_REPOSITORY_NAME, remoteStoreRepositoryPath));
+            } else {
+                logger.info("Randomly using Replication Strategy as {} and Storage Type as {}.", randomReplicationType, randomStorageType);
+                builder.put(CLUSTER_REPLICATION_TYPE_SETTING.getKey(), randomReplicationType);
+            }
         }
         return builder.build();
     }
@@ -2042,6 +2058,14 @@ protected boolean ignoreExternalCluster() {
     }
 
     protected TestCluster buildTestCluster(Scope scope, long seed) throws IOException {
+        if (useRandomReplicationStrategy()) {
+            randomReplicationType = randomBoolean() ? ReplicationType.DOCUMENT : ReplicationType.SEGMENT;
+            if (randomReplicationType.equals(ReplicationType.SEGMENT)) {
+                randomStorageType = randomBoolean() ? "REMOTE_STORE" : "LOCAL";
+            } else {
+                randomStorageType = "LOCAL";
+            }
+        }
         String clusterAddresses = System.getProperty(TESTS_CLUSTER);
         if (Strings.hasLength(clusterAddresses) && ignoreExternalCluster() == false) {
             if (scope == Scope.TEST) {

From f2ecd3e763598e72c604cc1a0c616bae7dda1940 Mon Sep 17 00:00:00 2001
From: "opensearch-trigger-bot[bot]"
 <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com>
Date: Thu, 5 Sep 2024 16:06:03 -0400
Subject: [PATCH 2/7] Bump com.azure:azure-identity from 1.13.0 to 1.13.2 in
 /plugins/repository-azure (#15578) (#15755)

* Bump com.azure:azure-identity in /plugins/repository-azure

Bumps [com.azure:azure-identity](https://github.com/Azure/azure-sdk-for-java) from 1.13.0 to 1.13.2.
- [Release notes](https://github.com/Azure/azure-sdk-for-java/releases)
- [Commits](https://github.com/Azure/azure-sdk-for-java/compare/azure-core_1.13.0...azure-identity_1.13.2)

---
updated-dependencies:
- dependency-name: com.azure:azure-identity
  dependency-type: direct:production
  update-type: version-update:semver-patch
...



* Updating SHAs



* Update changelog



---------




(cherry picked from commit e5e8504acdf1339f53dced4c00f5c9cd91b95f2c)

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: dependabot[bot] <dependabot[bot]@users.noreply.github.com>
---
 CHANGELOG.md                                                    | 1 +
 plugins/repository-azure/build.gradle                           | 2 +-
 .../repository-azure/licenses/azure-identity-1.13.0.jar.sha1    | 1 -
 .../repository-azure/licenses/azure-identity-1.13.2.jar.sha1    | 1 +
 4 files changed, 3 insertions(+), 2 deletions(-)
 delete mode 100644 plugins/repository-azure/licenses/azure-identity-1.13.0.jar.sha1
 create mode 100644 plugins/repository-azure/licenses/azure-identity-1.13.2.jar.sha1

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5a8391dd79d5f..4dbc667fcdbbf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -78,6 +78,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Bump `com.netflix.nebula.ospackage-base` from 11.9.1 to 11.10.0 ([#15419](https://github.com/opensearch-project/OpenSearch/pull/15419))
 - Bump `org.roaringbitmap:RoaringBitmap` from 1.1.0 to 1.2.1 ([#15423](https://github.com/opensearch-project/OpenSearch/pull/15423))
 - Bump `icu4j` from 70.1 to 75.1 ([#15469](https://github.com/opensearch-project/OpenSearch/pull/15469))
+- Bump `com.azure:azure-identity` from 1.13.0 to 1.13.2 ([#15578](https://github.com/opensearch-project/OpenSearch/pull/15578))
 
 ### Changed
 - Add lower limit for primary and replica batch allocators timeout ([#14979](https://github.com/opensearch-project/OpenSearch/pull/14979))
diff --git a/plugins/repository-azure/build.gradle b/plugins/repository-azure/build.gradle
index e5d3bd215f8cb..e3dd8c1bd0dfa 100644
--- a/plugins/repository-azure/build.gradle
+++ b/plugins/repository-azure/build.gradle
@@ -57,7 +57,7 @@ dependencies {
   api "io.netty:netty-transport-native-unix-common:${versions.netty}"
   implementation project(':modules:transport-netty4')
   api 'com.azure:azure-storage-blob:12.23.0'
-  api 'com.azure:azure-identity:1.13.0'
+  api 'com.azure:azure-identity:1.13.2'
   // Start of transitive dependencies for azure-identity
   api 'com.microsoft.azure:msal4j-persistence-extension:1.3.0'
   api "net.java.dev.jna:jna-platform:${versions.jna}"
diff --git a/plugins/repository-azure/licenses/azure-identity-1.13.0.jar.sha1 b/plugins/repository-azure/licenses/azure-identity-1.13.0.jar.sha1
deleted file mode 100644
index b59c2a3be5c92..0000000000000
--- a/plugins/repository-azure/licenses/azure-identity-1.13.0.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-54b44a74636322d06e9dc42d611a9f12a0966790
\ No newline at end of file
diff --git a/plugins/repository-azure/licenses/azure-identity-1.13.2.jar.sha1 b/plugins/repository-azure/licenses/azure-identity-1.13.2.jar.sha1
new file mode 100644
index 0000000000000..7c98a9ccba592
--- /dev/null
+++ b/plugins/repository-azure/licenses/azure-identity-1.13.2.jar.sha1
@@ -0,0 +1 @@
+50a1daef3eb5c6ab2e1351a3e3f5a7649a8fe464
\ No newline at end of file

From d5e86e13ec950fda97e99bd75b1e34d99e10fd45 Mon Sep 17 00:00:00 2001
From: Arpit-Bandejiya <abandeji@amazon.com>
Date: Fri, 6 Sep 2024 01:37:24 +0530
Subject: [PATCH 3/7] Schedule reroute after allocator timed out (#15565)
 (#15749)

* Schedule reroute after allocator timed out

Signed-off-by: Rishab Nahata <rnnahata@amazon.com>
(cherry picked from commit 4f50b4d705deaa971d802cd22fb120f75c722517)

Co-authored-by: Rishab Nahata <rnnahata@amazon.com>
---
 .../org/opensearch/cluster/ClusterModule.java |   4 +
 .../allocator/BalancedShardsAllocator.java    |  25 ++
 .../allocation/allocator/ShardsAllocator.java |   3 +
 .../gateway/ShardsBatchGatewayAllocator.java  |  30 +-
 .../main/java/org/opensearch/node/Node.java   |   1 +
 .../cluster/ClusterModuleTests.java           |  13 +
 ...TimeBoundBalancedShardsAllocatorTests.java | 257 +++++++++++++-----
 .../gateway/GatewayAllocatorTests.java        |  62 ++++-
 .../TestShardBatchGatewayAllocator.java       |   6 +-
 9 files changed, 321 insertions(+), 80 deletions(-)

diff --git a/server/src/main/java/org/opensearch/cluster/ClusterModule.java b/server/src/main/java/org/opensearch/cluster/ClusterModule.java
index 0d15158f31e34..9e2074c7aacf9 100644
--- a/server/src/main/java/org/opensearch/cluster/ClusterModule.java
+++ b/server/src/main/java/org/opensearch/cluster/ClusterModule.java
@@ -52,6 +52,7 @@
 import org.opensearch.cluster.metadata.RepositoriesMetadata;
 import org.opensearch.cluster.metadata.WeightedRoutingMetadata;
 import org.opensearch.cluster.routing.DelayedAllocationService;
+import org.opensearch.cluster.routing.RerouteService;
 import org.opensearch.cluster.routing.allocation.AllocationService;
 import org.opensearch.cluster.routing.allocation.ExistingShardsAllocator;
 import org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
@@ -476,4 +477,7 @@ public void setExistingShardsAllocators(GatewayAllocator gatewayAllocator, Shard
         allocationService.setExistingShardsAllocators(existingShardsAllocators);
     }
 
+    public void setRerouteServiceForAllocator(RerouteService rerouteService) {
+        shardsAllocator.setRerouteService(rerouteService);
+    }
 }
diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
index a5193ca602f04..785636fa7ff2a 100644
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
@@ -35,6 +35,7 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.util.IntroSorter;
+import org.opensearch.cluster.routing.RerouteService;
 import org.opensearch.cluster.routing.RoutingNode;
 import org.opensearch.cluster.routing.RoutingNodes;
 import org.opensearch.cluster.routing.ShardMovementStrategy;
@@ -49,12 +50,14 @@
 import org.opensearch.cluster.routing.allocation.RebalanceParameter;
 import org.opensearch.cluster.routing.allocation.RoutingAllocation;
 import org.opensearch.cluster.routing.allocation.ShardAllocationDecision;
+import org.opensearch.common.Priority;
 import org.opensearch.common.inject.Inject;
 import org.opensearch.common.settings.ClusterSettings;
 import org.opensearch.common.settings.Setting;
 import org.opensearch.common.settings.Setting.Property;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.common.unit.TimeValue;
+import org.opensearch.core.action.ActionListener;
 
 import java.util.HashMap;
 import java.util.HashSet;
@@ -202,6 +205,7 @@ public class BalancedShardsAllocator implements ShardsAllocator {
     private volatile boolean ignoreThrottleInRestore;
     private volatile TimeValue allocatorTimeout;
     private long startTime;
+    private RerouteService rerouteService;
 
     public BalancedShardsAllocator(Settings settings) {
         this(settings, new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS));
@@ -231,6 +235,12 @@ public BalancedShardsAllocator(Settings settings, ClusterSettings clusterSetting
         clusterSettings.addSettingsUpdateConsumer(ALLOCATOR_TIMEOUT_SETTING, this::setAllocatorTimeout);
     }
 
+    @Override
+    public void setRerouteService(RerouteService rerouteService) {
+        assert this.rerouteService == null : "RerouteService is already set";
+        this.rerouteService = rerouteService;
+    }
+
     /**
      * Changes in deprecated setting SHARD_MOVE_PRIMARY_FIRST_SETTING affect value of its replacement setting SHARD_MOVEMENT_STRATEGY_SETTING.
      */
@@ -342,6 +352,7 @@ public void allocate(RoutingAllocation allocation) {
         localShardsBalancer.allocateUnassigned();
         localShardsBalancer.moveShards();
         localShardsBalancer.balance();
+        scheduleRerouteIfAllocatorTimedOut();
 
         final ShardsBalancer remoteShardsBalancer = new RemoteShardsBalancer(logger, allocation);
         remoteShardsBalancer.allocateUnassigned();
@@ -404,6 +415,20 @@ private void failAllocationOfNewPrimaries(RoutingAllocation allocation) {
         }
     }
 
+    private void scheduleRerouteIfAllocatorTimedOut() {
+        if (allocatorTimedOut()) {
+            assert rerouteService != null : "RerouteService not set to schedule reroute after allocator time out";
+            rerouteService.reroute(
+                "reroute after balanced shards allocator timed out",
+                Priority.HIGH,
+                ActionListener.wrap(
+                    r -> logger.trace("reroute after balanced shards allocator timed out completed"),
+                    e -> logger.debug("reroute after balanced shards allocator timed out failed", e)
+                )
+            );
+        }
+    }
+
     /**
      * Returns the currently configured delta threshold
      */
diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/ShardsAllocator.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/ShardsAllocator.java
index 29e9acca4e6c2..38aafff6ce3e8 100644
--- a/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/ShardsAllocator.java
+++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/ShardsAllocator.java
@@ -32,6 +32,7 @@
 
 package org.opensearch.cluster.routing.allocation.allocator;
 
+import org.opensearch.cluster.routing.RerouteService;
 import org.opensearch.cluster.routing.ShardRouting;
 import org.opensearch.cluster.routing.allocation.AllocateUnassignedDecision;
 import org.opensearch.cluster.routing.allocation.MoveDecision;
@@ -73,4 +74,6 @@ public interface ShardsAllocator {
      * the cluster explain API, then this method should throw a {@code UnsupportedOperationException}.
      */
     ShardAllocationDecision decideShardAllocation(ShardRouting shard, RoutingAllocation allocation);
+
+    default void setRerouteService(RerouteService rerouteService) {}
 }
diff --git a/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java b/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java
index d18304ea73ed0..5e2dcbcd70b40 100644
--- a/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java
+++ b/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java
@@ -184,11 +184,11 @@ public void cleanCaches() {
 
     // for tests
     protected ShardsBatchGatewayAllocator() {
-        this(DEFAULT_SHARD_BATCH_SIZE);
+        this(DEFAULT_SHARD_BATCH_SIZE, null);
     }
 
-    protected ShardsBatchGatewayAllocator(long batchSize) {
-        this.rerouteService = null;
+    protected ShardsBatchGatewayAllocator(long batchSize, RerouteService rerouteService) {
+        this.rerouteService = rerouteService;
         this.batchStartedAction = null;
         this.primaryShardBatchAllocator = null;
         this.batchStoreAction = null;
@@ -297,6 +297,18 @@ public void run() {
                 public void onComplete() {
                     logger.trace("Triggering oncomplete after timeout for [{}] primary shards", timedOutPrimaryShardIds.size());
                     primaryBatchShardAllocator.allocateUnassignedBatchOnTimeout(timedOutPrimaryShardIds, allocation, true);
+                    if (timedOutPrimaryShardIds.isEmpty() == false) {
+                        logger.trace("scheduling reroute after existing shards allocator timed out for primary shards");
+                        assert rerouteService != null;
+                        rerouteService.reroute(
+                            "reroute after existing shards allocator timed out",
+                            Priority.HIGH,
+                            ActionListener.wrap(
+                                r -> logger.trace("reroute after existing shards allocator timed out completed"),
+                                e -> logger.debug("reroute after existing shards allocator timed out failed", e)
+                            )
+                        );
+                    }
                 }
             };
         } else {
@@ -320,6 +332,18 @@ public void run() {
                 public void onComplete() {
                     logger.trace("Triggering oncomplete after timeout for [{}] replica shards", timedOutReplicaShardIds.size());
                     replicaBatchShardAllocator.allocateUnassignedBatchOnTimeout(timedOutReplicaShardIds, allocation, false);
+                    if (timedOutReplicaShardIds.isEmpty() == false) {
+                        logger.trace("scheduling reroute after existing shards allocator timed out for replica shards");
+                        assert rerouteService != null;
+                        rerouteService.reroute(
+                            "reroute after existing shards allocator timed out",
+                            Priority.HIGH,
+                            ActionListener.wrap(
+                                r -> logger.trace("reroute after existing shards allocator timed out completed"),
+                                e -> logger.debug("reroute after existing shards allocator timed out failed", e)
+                            )
+                        );
+                    }
                 }
             };
         }
diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java
index 26150bdecdcf5..72531b98ec102 100644
--- a/server/src/main/java/org/opensearch/node/Node.java
+++ b/server/src/main/java/org/opensearch/node/Node.java
@@ -867,6 +867,7 @@ protected Node(
             final RerouteService rerouteService = new BatchedRerouteService(clusterService, clusterModule.getAllocationService()::reroute);
             rerouteServiceReference.set(rerouteService);
             clusterService.setRerouteService(rerouteService);
+            clusterModule.setRerouteServiceForAllocator(rerouteService);
 
             final RecoverySettings recoverySettings = new RecoverySettings(settings, settingsModule.getClusterSettings());
 
diff --git a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java
index 97706927ba857..f8240e775cfa5 100644
--- a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java
+++ b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java
@@ -337,6 +337,19 @@ public void testQueryGroupMetadataRegister() {
         );
     }
 
+    public void testRerouteServiceSetForBalancedShardsAllocator() {
+        ClusterModule clusterModule = new ClusterModule(
+            Settings.EMPTY,
+            clusterService,
+            Collections.emptyList(),
+            clusterInfoService,
+            null,
+            threadContext,
+            new ClusterManagerMetrics(NoopMetricsRegistry.INSTANCE)
+        );
+        clusterModule.setRerouteServiceForAllocator((reason, priority, listener) -> listener.onResponse(clusterService.state()));
+    }
+
     private static ClusterPlugin existingShardsAllocatorPlugin(final String allocatorName) {
         return new ClusterPlugin() {
             @Override
diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/allocator/TimeBoundBalancedShardsAllocatorTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/allocator/TimeBoundBalancedShardsAllocatorTests.java
index a10c305686638..45a0bd7b18afd 100644
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/allocator/TimeBoundBalancedShardsAllocatorTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/allocator/TimeBoundBalancedShardsAllocatorTests.java
@@ -8,6 +8,7 @@
 
 package org.opensearch.cluster.routing.allocation.allocator;
 
+import org.opensearch.OpenSearchException;
 import org.opensearch.Version;
 import org.opensearch.cluster.ClusterInfo;
 import org.opensearch.cluster.ClusterName;
@@ -17,6 +18,7 @@
 import org.opensearch.cluster.metadata.Metadata;
 import org.opensearch.cluster.node.DiscoveryNode;
 import org.opensearch.cluster.node.DiscoveryNodes;
+import org.opensearch.cluster.routing.RerouteService;
 import org.opensearch.cluster.routing.RoutingNodes;
 import org.opensearch.cluster.routing.RoutingTable;
 import org.opensearch.cluster.routing.ShardRouting;
@@ -26,14 +28,20 @@
 import org.opensearch.cluster.routing.allocation.decider.AllocationDeciders;
 import org.opensearch.cluster.routing.allocation.decider.Decision;
 import org.opensearch.cluster.routing.allocation.decider.SameShardAllocationDecider;
+import org.opensearch.cluster.service.ClusterService;
+import org.opensearch.common.Priority;
 import org.opensearch.common.settings.ClusterSettings;
 import org.opensearch.common.settings.Settings;
+import org.opensearch.test.ClusterServiceUtils;
+import org.opensearch.threadpool.TestThreadPool;
+import org.junit.After;
 
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import static org.opensearch.cluster.routing.ShardRoutingState.INITIALIZING;
 import static org.opensearch.cluster.routing.ShardRoutingState.STARTED;
@@ -41,26 +49,49 @@
 
 public class TimeBoundBalancedShardsAllocatorTests extends OpenSearchAllocationTestCase {
 
+    private TestThreadPool threadPool;
+    private ClusterService clusterService;
+    private ClusterState state;
+
     private final DiscoveryNode node1 = newNode("node1", "node1", Collections.singletonMap("zone", "1a"));
     private final DiscoveryNode node2 = newNode("node2", "node2", Collections.singletonMap("zone", "1b"));
     private final DiscoveryNode node3 = newNode("node3", "node3", Collections.singletonMap("zone", "1c"));
 
-    public void testAllUnassignedShardsAllocatedWhenNoTimeOut() {
+    @After
+    @Override
+    public void tearDown() throws Exception {
+        super.tearDown();
+        if (threadPool != null) {
+            final boolean terminated = terminate(threadPool);
+            assert terminated;
+        }
+        if (clusterService != null) {
+            clusterService.close();
+        }
+    }
+
+    public void setupStateAndService(Metadata metadata, RoutingTable routingTable) {
+        state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
+            .metadata(metadata)
+            .routingTable(routingTable)
+            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
+            .build();
+        threadPool = new TestThreadPool(getTestName());
+        clusterService = ClusterServiceUtils.createClusterService(state, threadPool);
+    }
+
+    public void testAllUnassignedShardsAllocatedWhenNoTimeOutAndRerouteNotScheduled() {
         int numberOfIndices = 2;
         int numberOfShards = 5;
         int numberOfReplicas = 1;
         int totalPrimaryCount = numberOfIndices * numberOfShards;
         int totalShardCount = numberOfIndices * (numberOfShards * (numberOfReplicas + 1));
         Settings.Builder settings = Settings.builder();
-        // passing total shard count for timed out latch such that no shard times out
-        BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(settings.build(), new CountDownLatch(totalShardCount));
+        // passing sufficiently high count for timeout latch to simulate no time out
+        BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(settings.build(), new CountDownLatch(Integer.MAX_VALUE));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         RoutingAllocation allocation = new RoutingAllocation(
             yesAllocationDeciders(),
             new RoutingNodes(state, false),
@@ -69,6 +100,18 @@ public void testAllUnassignedShardsAllocatedWhenNoTimeOut() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> initializingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING);
         int node1Recoveries = allocation.routingNodes().getInitialPrimariesIncomingRecoveries(node1.getId());
@@ -77,9 +120,10 @@ public void testAllUnassignedShardsAllocatedWhenNoTimeOut() {
         assertEquals(totalShardCount, initializingShards.size());
         assertEquals(0, allocation.routingNodes().unassigned().ignored().size());
         assertEquals(totalPrimaryCount, node1Recoveries + node2Recoveries + node3Recoveries);
+        assertFalse(rerouteScheduled.get());
     }
 
-    public void testAllUnassignedShardsIgnoredWhenTimedOut() {
+    public void testAllUnassignedShardsIgnoredWhenTimedOutAndRerouteScheduled() {
         int numberOfIndices = 2;
         int numberOfShards = 5;
         int numberOfReplicas = 1;
@@ -89,11 +133,7 @@ public void testAllUnassignedShardsIgnoredWhenTimedOut() {
         BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(settings.build(), new CountDownLatch(0));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         RoutingAllocation allocation = new RoutingAllocation(
             yesAllocationDeciders(),
             new RoutingNodes(state, false),
@@ -102,6 +142,18 @@ public void testAllUnassignedShardsIgnoredWhenTimedOut() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> initializingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING);
         int node1Recoveries = allocation.routingNodes().getInitialPrimariesIncomingRecoveries(node1.getId());
@@ -110,9 +162,10 @@ public void testAllUnassignedShardsIgnoredWhenTimedOut() {
         assertEquals(0, initializingShards.size());
         assertEquals(totalShardCount, allocation.routingNodes().unassigned().ignored().size());
         assertEquals(0, node1Recoveries + node2Recoveries + node3Recoveries);
+        assertTrue(rerouteScheduled.get());
     }
 
-    public void testAllocatePartialPrimaryShardsUntilTimedOut() {
+    public void testAllocatePartialPrimaryShardsUntilTimedOutAndRerouteScheduled() {
         int numberOfIndices = 2;
         int numberOfShards = 5;
         int numberOfReplicas = 1;
@@ -123,11 +176,7 @@ public void testAllocatePartialPrimaryShardsUntilTimedOut() {
         BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(settings.build(), new CountDownLatch(shardsToAllocate));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         RoutingAllocation allocation = new RoutingAllocation(
             yesAllocationDeciders(),
             new RoutingNodes(state, false),
@@ -136,6 +185,18 @@ public void testAllocatePartialPrimaryShardsUntilTimedOut() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> initializingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING);
         int node1Recoveries = allocation.routingNodes().getInitialPrimariesIncomingRecoveries(node1.getId());
@@ -144,9 +205,10 @@ public void testAllocatePartialPrimaryShardsUntilTimedOut() {
         assertEquals(shardsToAllocate, initializingShards.size());
         assertEquals(totalShardCount - shardsToAllocate, allocation.routingNodes().unassigned().ignored().size());
         assertEquals(shardsToAllocate, node1Recoveries + node2Recoveries + node3Recoveries);
+        assertTrue(rerouteScheduled.get());
     }
 
-    public void testAllocateAllPrimaryShardsAndPartialReplicaShardsUntilTimedOut() {
+    public void testAllocateAllPrimaryShardsAndPartialReplicaShardsUntilTimedOutAndRerouteScheduled() {
         int numberOfIndices = 2;
         int numberOfShards = 5;
         int numberOfReplicas = 1;
@@ -158,11 +220,7 @@ public void testAllocateAllPrimaryShardsAndPartialReplicaShardsUntilTimedOut() {
         BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(settings.build(), new CountDownLatch(shardsToAllocate));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         RoutingAllocation allocation = new RoutingAllocation(
             yesAllocationDeciders(),
             new RoutingNodes(state, false),
@@ -171,6 +229,18 @@ public void testAllocateAllPrimaryShardsAndPartialReplicaShardsUntilTimedOut() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> initializingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING);
         int node1Recoveries = allocation.routingNodes().getInitialPrimariesIncomingRecoveries(node1.getId());
@@ -179,20 +249,17 @@ public void testAllocateAllPrimaryShardsAndPartialReplicaShardsUntilTimedOut() {
         assertEquals(shardsToAllocate, initializingShards.size());
         assertEquals(totalShardCount - shardsToAllocate, allocation.routingNodes().unassigned().ignored().size());
         assertEquals(numberOfShards * numberOfIndices, node1Recoveries + node2Recoveries + node3Recoveries);
+        assertTrue(rerouteScheduled.get());
     }
 
-    public void testAllShardsMoveWhenExcludedAndTimeoutNotBreached() {
+    public void testAllShardsMoveWhenExcludedAndTimeoutNotBreachedAndRerouteNotScheduled() {
         int numberOfIndices = 3;
         int numberOfShards = 5;
         int numberOfReplicas = 1;
         int totalShardCount = numberOfIndices * (numberOfShards * (numberOfReplicas + 1));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         MockAllocationService allocationService = createAllocationService();
         state = applyStartedShardsUntilNoChange(state, allocationService);
         // check all shards allocated
@@ -200,8 +267,7 @@ public void testAllShardsMoveWhenExcludedAndTimeoutNotBreached() {
         assertEquals(totalShardCount, state.getRoutingNodes().shardsWithState(STARTED).size());
         int node1ShardCount = state.getRoutingNodes().node("node1").size();
         Settings settings = Settings.builder().put("cluster.routing.allocation.exclude.zone", "1a").build();
-        int shardsToMove = 10 + 1000; // such that time out is never breached
-        BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(settings, new CountDownLatch(shardsToMove));
+        BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(settings, new CountDownLatch(Integer.MAX_VALUE));
         RoutingAllocation allocation = new RoutingAllocation(
             allocationDecidersForExcludeAPI(settings),
             new RoutingNodes(state, false),
@@ -210,30 +276,39 @@ public void testAllShardsMoveWhenExcludedAndTimeoutNotBreached() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> relocatingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.RELOCATING);
         assertEquals(node1ShardCount, relocatingShards.size());
+        assertFalse(rerouteScheduled.get());
     }
 
-    public void testNoShardsMoveWhenExcludedAndTimeoutBreached() {
+    public void testNoShardsMoveWhenExcludedAndTimeoutBreachedAndRerouteScheduled() {
         int numberOfIndices = 3;
         int numberOfShards = 5;
         int numberOfReplicas = 1;
         int totalShardCount = numberOfIndices * (numberOfShards * (numberOfReplicas + 1));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         MockAllocationService allocationService = createAllocationService();
         state = applyStartedShardsUntilNoChange(state, allocationService);
         // check all shards allocated
         assertEquals(0, state.getRoutingNodes().shardsWithState(INITIALIZING).size());
         assertEquals(totalShardCount, state.getRoutingNodes().shardsWithState(STARTED).size());
         Settings settings = Settings.builder().put("cluster.routing.allocation.exclude.zone", "1a").build();
-        int shardsToMove = 0; // such that time out is never breached
+        int shardsToMove = 0; // such that time out is breached
         BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(settings, new CountDownLatch(shardsToMove));
         RoutingAllocation allocation = new RoutingAllocation(
             allocationDecidersForExcludeAPI(settings),
@@ -243,23 +318,32 @@ public void testNoShardsMoveWhenExcludedAndTimeoutBreached() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> relocatingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.RELOCATING);
         assertEquals(0, relocatingShards.size());
+        assertTrue(rerouteScheduled.get());
     }
 
-    public void testPartialShardsMoveWhenExcludedAndTimeoutBreached() {
+    public void testPartialShardsMoveWhenExcludedAndTimeoutBreachedAndRerouteScheduled() {
         int numberOfIndices = 3;
         int numberOfShards = 5;
         int numberOfReplicas = 1;
         int totalShardCount = numberOfIndices * (numberOfShards * (numberOfReplicas + 1));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         MockAllocationService allocationService = createAllocationService();
         state = applyStartedShardsUntilNoChange(state, allocationService);
         // check all shards allocated
@@ -279,23 +363,32 @@ public void testPartialShardsMoveWhenExcludedAndTimeoutBreached() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> relocatingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.RELOCATING);
         assertEquals(shardsToMove / 3, relocatingShards.size());
+        assertTrue(rerouteScheduled.get());
     }
 
-    public void testClusterRebalancedWhenNotTimedOut() {
+    public void testClusterRebalancedWhenNotTimedOutAndRerouteNotScheduled() {
         int numberOfIndices = 1;
         int numberOfShards = 15;
         int numberOfReplicas = 1;
         int totalShardCount = numberOfIndices * (numberOfShards * (numberOfReplicas + 1));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         MockAllocationService allocationService = createAllocationService(
             Settings.builder().put("cluster.routing.allocation.exclude.zone", "1a").build()
         ); // such that no shards are allocated to node1
@@ -306,8 +399,7 @@ public void testClusterRebalancedWhenNotTimedOut() {
         assertEquals(totalShardCount, state.getRoutingNodes().shardsWithState(STARTED).size());
         assertEquals(0, node1ShardCount);
         Settings newSettings = Settings.builder().put("cluster.routing.allocation.exclude.zone", "").build();
-        int shardsToMove = 1000; // such that time out is never breached
-        BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(newSettings, new CountDownLatch(shardsToMove));
+        BalancedShardsAllocator allocator = new TestBalancedShardsAllocator(newSettings, new CountDownLatch(Integer.MAX_VALUE));
         RoutingAllocation allocation = new RoutingAllocation(
             allocationDecidersForExcludeAPI(newSettings),
             new RoutingNodes(state, false),
@@ -316,23 +408,32 @@ public void testClusterRebalancedWhenNotTimedOut() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> relocatingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.RELOCATING);
         assertEquals(totalShardCount / 3, relocatingShards.size());
+        assertFalse(rerouteScheduled.get());
     }
 
-    public void testClusterNotRebalancedWhenTimedOut() {
+    public void testClusterNotRebalancedWhenTimedOutAndRerouteScheduled() {
         int numberOfIndices = 1;
         int numberOfShards = 15;
         int numberOfReplicas = 1;
         int totalShardCount = numberOfIndices * (numberOfShards * (numberOfReplicas + 1));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         MockAllocationService allocationService = createAllocationService(
             Settings.builder().put("cluster.routing.allocation.exclude.zone", "1a").build()
         ); // such that no shards are allocated to node1
@@ -353,23 +454,32 @@ public void testClusterNotRebalancedWhenTimedOut() {
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> relocatingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.RELOCATING);
         assertEquals(0, relocatingShards.size());
+        assertTrue(rerouteScheduled.get());
     }
 
-    public void testClusterPartialRebalancedWhenTimedOut() {
+    public void testClusterPartialRebalancedWhenTimedOutAndRerouteScheduled() {
         int numberOfIndices = 1;
         int numberOfShards = 15;
         int numberOfReplicas = 1;
         int totalShardCount = numberOfIndices * (numberOfShards * (numberOfReplicas + 1));
         Metadata metadata = buildMetadata(Metadata.builder(), numberOfIndices, numberOfShards, numberOfReplicas);
         RoutingTable routingTable = buildRoutingTable(metadata);
-        ClusterState state = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
-            .metadata(metadata)
-            .routingTable(routingTable)
-            .nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3))
-            .build();
+        setupStateAndService(metadata, routingTable);
         MockAllocationService allocationService = createAllocationService(
             Settings.builder().put("cluster.routing.allocation.exclude.zone", "1a").build()
         ); // such that no shards are allocated to node1
@@ -404,9 +514,22 @@ public Decision canRebalance(ShardRouting shardRouting, RoutingAllocation alloca
             null,
             System.nanoTime()
         );
+        AtomicBoolean rerouteScheduled = new AtomicBoolean(false);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            if (randomBoolean()) {
+                listener.onFailure(new OpenSearchException("simulated"));
+            } else {
+                listener.onResponse(clusterService.state());
+            }
+            assertEquals("reroute after balanced shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteScheduled.compareAndSet(false, true);
+        };
+        allocator.setRerouteService(rerouteService);
         allocator.allocate(allocation);
         List<ShardRouting> relocatingShards = allocation.routingNodes().shardsWithState(ShardRoutingState.RELOCATING);
         assertEquals(3, relocatingShards.size());
+        assertTrue(rerouteScheduled.get());
     }
 
     public void testAllocatorNeverTimedOutIfValueIsMinusOne() {
diff --git a/server/src/test/java/org/opensearch/gateway/GatewayAllocatorTests.java b/server/src/test/java/org/opensearch/gateway/GatewayAllocatorTests.java
index c7eae77d6deba..ebc2e59fa5a30 100644
--- a/server/src/test/java/org/opensearch/gateway/GatewayAllocatorTests.java
+++ b/server/src/test/java/org/opensearch/gateway/GatewayAllocatorTests.java
@@ -10,6 +10,7 @@
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.opensearch.OpenSearchException;
 import org.opensearch.Version;
 import org.opensearch.action.support.nodes.BaseNodeResponse;
 import org.opensearch.cluster.ClusterInfo;
@@ -22,6 +23,7 @@
 import org.opensearch.cluster.routing.IndexRoutingTable;
 import org.opensearch.cluster.routing.IndexShardRoutingTable;
 import org.opensearch.cluster.routing.RecoverySource;
+import org.opensearch.cluster.routing.RerouteService;
 import org.opensearch.cluster.routing.RoutingNodes;
 import org.opensearch.cluster.routing.RoutingTable;
 import org.opensearch.cluster.routing.ShardRouting;
@@ -30,6 +32,8 @@
 import org.opensearch.cluster.routing.UnassignedInfo;
 import org.opensearch.cluster.routing.allocation.RoutingAllocation;
 import org.opensearch.cluster.routing.allocation.decider.AllocationDeciders;
+import org.opensearch.cluster.service.ClusterService;
+import org.opensearch.common.Priority;
 import org.opensearch.common.collect.Tuple;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.common.unit.TimeValue;
@@ -37,7 +41,9 @@
 import org.opensearch.common.util.set.Sets;
 import org.opensearch.core.index.shard.ShardId;
 import org.opensearch.snapshots.SnapshotShardSizeInfo;
+import org.opensearch.test.ClusterServiceUtils;
 import org.opensearch.test.gateway.TestShardBatchGatewayAllocator;
+import org.opensearch.threadpool.TestThreadPool;
 import org.junit.Before;
 
 import java.util.ArrayList;
@@ -47,13 +53,13 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 
 import static org.opensearch.gateway.ShardsBatchGatewayAllocator.PRIMARY_BATCH_ALLOCATOR_TIMEOUT_SETTING;
 import static org.opensearch.gateway.ShardsBatchGatewayAllocator.PRIMARY_BATCH_ALLOCATOR_TIMEOUT_SETTING_KEY;
 import static org.opensearch.gateway.ShardsBatchGatewayAllocator.REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING;
 import static org.opensearch.gateway.ShardsBatchGatewayAllocator.REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING_KEY;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 
 public class GatewayAllocatorTests extends OpenSearchAllocationTestCase {
 
@@ -426,22 +432,62 @@ public void testReplicaAllocatorTimeout() {
         assertEquals(-1, REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING.get(build).getMillis());
     }
 
-    public void testCollectTimedOutShards() throws InterruptedException {
+    public void testCollectTimedOutShardsAndScheduleReroute_Success() throws InterruptedException {
         createIndexAndUpdateClusterState(2, 5, 2);
-        CountDownLatch latch = new CountDownLatch(10);
-        testShardsBatchGatewayAllocator = new TestShardBatchGatewayAllocator(latch);
+        TestThreadPool threadPool = new TestThreadPool(getTestName());
+        ClusterService clusterService = ClusterServiceUtils.createClusterService(clusterState, threadPool);
+        final CountDownLatch rerouteLatch = new CountDownLatch(2);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            listener.onResponse(clusterService.state());
+            assertThat(rerouteLatch.getCount(), greaterThanOrEqualTo(0L));
+            assertEquals("reroute after existing shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteLatch.countDown();
+        };
+        CountDownLatch timedOutShardsLatch = new CountDownLatch(20);
+        testShardsBatchGatewayAllocator = new TestShardBatchGatewayAllocator(timedOutShardsLatch, 1000, rerouteService);
         testShardsBatchGatewayAllocator.setPrimaryBatchAllocatorTimeout(TimeValue.ZERO);
         testShardsBatchGatewayAllocator.setReplicaBatchAllocatorTimeout(TimeValue.ZERO);
         BatchRunnableExecutor executor = testShardsBatchGatewayAllocator.allocateAllUnassignedShards(testAllocation, true);
         executor.run();
-        assertTrue(latch.await(1, TimeUnit.MINUTES));
-        latch = new CountDownLatch(10);
-        testShardsBatchGatewayAllocator = new TestShardBatchGatewayAllocator(latch);
+        assertEquals(timedOutShardsLatch.getCount(), 10);
+        assertEquals(1, rerouteLatch.getCount());
+        executor = testShardsBatchGatewayAllocator.allocateAllUnassignedShards(testAllocation, false);
+        executor.run();
+        assertEquals(timedOutShardsLatch.getCount(), 0);
+        assertEquals(0, rerouteLatch.getCount()); // even with failure it doesn't leak any listeners
+        final boolean terminated = terminate(threadPool);
+        assert terminated;
+        clusterService.close();
+    }
+
+    public void testCollectTimedOutShardsAndScheduleReroute_Failure() throws InterruptedException {
+        createIndexAndUpdateClusterState(2, 5, 2);
+        TestThreadPool threadPool = new TestThreadPool(getTestName());
+        ClusterService clusterService = ClusterServiceUtils.createClusterService(clusterState, threadPool);
+        final CountDownLatch rerouteLatch = new CountDownLatch(2);
+        final RerouteService rerouteService = (reason, priority, listener) -> {
+            listener.onFailure(new OpenSearchException("simulated"));
+            assertThat(rerouteLatch.getCount(), greaterThanOrEqualTo(0L));
+            assertEquals("reroute after existing shards allocator timed out", reason);
+            assertEquals(Priority.HIGH, priority);
+            rerouteLatch.countDown();
+        };
+        CountDownLatch timedOutShardsLatch = new CountDownLatch(20);
+        testShardsBatchGatewayAllocator = new TestShardBatchGatewayAllocator(timedOutShardsLatch, 1000, rerouteService);
         testShardsBatchGatewayAllocator.setPrimaryBatchAllocatorTimeout(TimeValue.ZERO);
         testShardsBatchGatewayAllocator.setReplicaBatchAllocatorTimeout(TimeValue.ZERO);
+        BatchRunnableExecutor executor = testShardsBatchGatewayAllocator.allocateAllUnassignedShards(testAllocation, true);
+        executor.run();
+        assertEquals(timedOutShardsLatch.getCount(), 10);
+        assertEquals(1, rerouteLatch.getCount());
         executor = testShardsBatchGatewayAllocator.allocateAllUnassignedShards(testAllocation, false);
         executor.run();
-        assertTrue(latch.await(1, TimeUnit.MINUTES));
+        assertEquals(timedOutShardsLatch.getCount(), 0);
+        assertEquals(0, rerouteLatch.getCount()); // even with failure it doesn't leak any listeners
+        final boolean terminated = terminate(threadPool);
+        assert terminated;
+        clusterService.close();
     }
 
     private void createIndexAndUpdateClusterState(int count, int numberOfShards, int numberOfReplicas) {
diff --git a/test/framework/src/main/java/org/opensearch/test/gateway/TestShardBatchGatewayAllocator.java b/test/framework/src/main/java/org/opensearch/test/gateway/TestShardBatchGatewayAllocator.java
index 156b1d7c620e6..c2ff228a6bf3a 100644
--- a/test/framework/src/main/java/org/opensearch/test/gateway/TestShardBatchGatewayAllocator.java
+++ b/test/framework/src/main/java/org/opensearch/test/gateway/TestShardBatchGatewayAllocator.java
@@ -10,6 +10,7 @@
 
 import org.opensearch.cluster.node.DiscoveryNode;
 import org.opensearch.cluster.node.DiscoveryNodes;
+import org.opensearch.cluster.routing.RerouteService;
 import org.opensearch.cluster.routing.ShardRouting;
 import org.opensearch.cluster.routing.allocation.AllocateUnassignedDecision;
 import org.opensearch.cluster.routing.allocation.RoutingAllocation;
@@ -39,12 +40,13 @@ public TestShardBatchGatewayAllocator() {
 
     }
 
-    public TestShardBatchGatewayAllocator(CountDownLatch latch) {
+    public TestShardBatchGatewayAllocator(CountDownLatch latch, long maxBatchSize, RerouteService rerouteService) {
+        super(maxBatchSize, rerouteService);
         this.latch = latch;
     }
 
     public TestShardBatchGatewayAllocator(long maxBatchSize) {
-        super(maxBatchSize);
+        super(maxBatchSize, null);
     }
 
     Map<String /* node id */, Map<ShardId, ShardRouting>> knownAllocations = new HashMap<>();

From e853216ab22e50057867e1c0a5ae8bd67a6daf6c Mon Sep 17 00:00:00 2001
From: Andriy Redko <andriy.redko@aiven.io>
Date: Thu, 5 Sep 2024 16:43:01 -0400
Subject: [PATCH 4/7] Add 2.17 release notes (#15762) (#15768)

Signed-off-by: Andriy Redko <andriy.redko@aiven.io>
---
 CHANGELOG.md                                  |  90 +--------------
 .../opensearch.release-notes-2.17.0.md        | 104 ++++++++++++++++++
 2 files changed, 105 insertions(+), 89 deletions(-)
 create mode 100644 release-notes/opensearch.release-notes-2.17.0.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4dbc667fcdbbf..435df7bda62a9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,108 +5,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased 2.x]
 ### Added
-- [Workload Management] Add Settings for Workload Management feature ([#15028](https://github.com/opensearch-project/OpenSearch/pull/15028))
-- Fix for hasInitiatedFetching to fix allocation explain and manual reroute APIs (([#14972](https://github.com/opensearch-project/OpenSearch/pull/14972))
-- [Workload Management] Add queryGroupId to Task ([14708](https://github.com/opensearch-project/OpenSearch/pull/14708))
-- Add setting to ignore throttling nodes for allocation of unassigned primaries in remote restore ([#14991](https://github.com/opensearch-project/OpenSearch/pull/14991))
-- [Workload Management] Add Delete QueryGroup API Logic ([#14735](https://github.com/opensearch-project/OpenSearch/pull/14735))
-- [Streaming Indexing] Enhance RestClient with a new streaming API support ([#14437](https://github.com/opensearch-project/OpenSearch/pull/14437))
-- Add basic aggregation support for derived fields ([#14618](https://github.com/opensearch-project/OpenSearch/pull/14618))
-- [Workload Management] Add Create QueryGroup API Logic ([#14680](https://github.com/opensearch-project/OpenSearch/pull/14680))- [Workload Management] Add Create QueryGroup API Logic ([#14680](https://github.com/opensearch-project/OpenSearch/pull/14680))
-- Add ThreadContextPermission for markAsSystemContext and allow core to perform the method ([#15016](https://github.com/opensearch-project/OpenSearch/pull/15016))
-- Add ThreadContextPermission for stashAndMergeHeaders and stashWithOrigin ([#15039](https://github.com/opensearch-project/OpenSearch/pull/15039))
-- [Concurrent Segment Search] Support composite aggregations with scripting ([#15072](https://github.com/opensearch-project/OpenSearch/pull/15072))
-- Add `rangeQuery` and `regexpQuery` for `constant_keyword` field type ([#14711](https://github.com/opensearch-project/OpenSearch/pull/14711))
-- Add took time to request nodes stats ([#15054](https://github.com/opensearch-project/OpenSearch/pull/15054))
-- [Workload Management] Add Get QueryGroup API Logic ([14709](https://github.com/opensearch-project/OpenSearch/pull/14709))
-- [Workload Management] Add Update QueryGroup API Logic ([#14775](https://github.com/opensearch-project/OpenSearch/pull/14775))
-- [Workload Management] QueryGroup resource tracking framework changes ([#13897](https://github.com/opensearch-project/OpenSearch/pull/13897))
-- Support filtering on a large list encoded by bitmap ([#14774](https://github.com/opensearch-project/OpenSearch/pull/14774))
-- Add slice execution listeners to SearchOperationListener interface ([#15153](https://github.com/opensearch-project/OpenSearch/pull/15153))
-- Make balanced shards allocator timebound ([#15239](https://github.com/opensearch-project/OpenSearch/pull/15239))
-- Add allowlist setting for ingest-geoip and ingest-useragent ([#15325](https://github.com/opensearch-project/OpenSearch/pull/15325))
-- Adding access to noSubMatches and noOverlappingMatches in Hyphenation ([#13895](https://github.com/opensearch-project/OpenSearch/pull/13895))
-- Star tree mapping changes ([#14605](https://github.com/opensearch-project/OpenSearch/pull/14605))
-- Add support for index level max slice count setting for concurrent segment search ([#15336](https://github.com/opensearch-project/OpenSearch/pull/15336))
-- Support cancellation for cat shards and node stats API.([#13966](https://github.com/opensearch-project/OpenSearch/pull/13966))
-- [Streaming Indexing] Introduce bulk HTTP API streaming flavor ([#15381](https://github.com/opensearch-project/OpenSearch/pull/15381))
-- Add support for centralize snapshot creation with pinned timestamp ([#15124](https://github.com/opensearch-project/OpenSearch/pull/15124))
-- Add concurrent search support for Derived Fields ([#15326](https://github.com/opensearch-project/OpenSearch/pull/15326))
-- [Workload Management] Add query group stats constructs ([#15343](https://github.com/opensearch-project/OpenSearch/pull/15343)))
-- Add limit on number of processors for Ingest pipeline([#15460](https://github.com/opensearch-project/OpenSearch/pull/15465)).
-- Add runAs to Subject interface and introduce IdentityAwarePlugin extension point ([#14630](https://github.com/opensearch-project/OpenSearch/pull/14630))
-- [Workload Management] Add rejection logic for co-ordinator and shard level requests ([#15428](https://github.com/opensearch-project/OpenSearch/pull/15428)))
-- Adding translog durability validation in index templates ([#15494](https://github.com/opensearch-project/OpenSearch/pull/15494))
-- [Range Queries] Add new approximateable query framework to short-circuit range queries ([#13788](https://github.com/opensearch-project/OpenSearch/pull/13788))
-- [Workload Management] Add query group level failure tracking ([#15227](https://github.com/opensearch-project/OpenSearch/pull/15527))
-- [Reader Writer Separation] Add experimental search replica shard type to achieve reader writer separation ([#15237](https://github.com/opensearch-project/OpenSearch/pull/15237))
-- Add index creation using the context field ([#15290](https://github.com/opensearch-project/OpenSearch/pull/15290))
-- [Remote Publication] Add remote download stats ([#15291](https://github.com/opensearch-project/OpenSearch/pull/15291)))
-- Add support to upload snapshot shard blobs with hashed prefix ([#15426](https://github.com/opensearch-project/OpenSearch/pull/15426))
-- Add prefix support to hashed prefix & infix path types on remote store ([#15557](https://github.com/opensearch-project/OpenSearch/pull/15557))
-- Add canRemain method to TargetPoolAllocationDecider to move shards from local to remote pool for hot to warm tiering ([#15010](https://github.com/opensearch-project/OpenSearch/pull/15010))
-- Add support for pluggable deciders for concurrent search ([#15363](https://github.com/opensearch-project/OpenSearch/pull/15363))
-- Add support for comma-separated list of index names to be used with Snapshot Status API ([#15409](https://github.com/opensearch-project/OpenSearch/pull/15409))[SnapshotV2] Snapshot Status API changes (#15409))
-- Optimise snapshot deletion to speed up snapshot deletion and creation ([#15568](https://github.com/opensearch-project/OpenSearch/pull/15568))
-- [Remote Publication] Added checksum validation for cluster state behind a cluster setting ([#15218](https://github.com/opensearch-project/OpenSearch/pull/15218))
-- Relax the join validation for Remote State publication ([#15471](https://github.com/opensearch-project/OpenSearch/pull/15471))
-- Optimize NodeIndicesStats output behind flag ([#14454](https://github.com/opensearch-project/OpenSearch/pull/14454))
 - MultiTermQueries in keyword fields now default to `indexed` approach and gated behind cluster setting ([#15637](https://github.com/opensearch-project/OpenSearch/pull/15637))
 
 ### Dependencies
-- Bump `netty` from 4.1.111.Final to 4.1.112.Final ([#15081](https://github.com/opensearch-project/OpenSearch/pull/15081))
-- Bump `org.apache.commons:commons-lang3` from 3.14.0 to 3.15.0 ([#14861](https://github.com/opensearch-project/OpenSearch/pull/14861))
-- OpenJDK Update (July 2024 Patch releases) ([#14998](https://github.com/opensearch-project/OpenSearch/pull/14998))
-- Bump `com.microsoft.azure:msal4j` from 1.16.1 to 1.17.0 ([#14995](https://github.com/opensearch-project/OpenSearch/pull/14995), [#15420](https://github.com/opensearch-project/OpenSearch/pull/15420))
-- Bump `actions/github-script` from 6 to 7 ([#14997](https://github.com/opensearch-project/OpenSearch/pull/14997))
-- Bump `org.tukaani:xz` from 1.9 to 1.10 ([#15110](https://github.com/opensearch-project/OpenSearch/pull/15110))
-- Bump `org.apache.avro:avro` from 1.11.3 to 1.12.0 in /plugins/repository-hdfs ([#15119](https://github.com/opensearch-project/OpenSearch/pull/15119))
-- Bump `org.bouncycastle:bcpg-fips` from 1.0.7.1 to 2.0.9 ([#15103](https://github.com/opensearch-project/OpenSearch/pull/15103), [#15299](https://github.com/opensearch-project/OpenSearch/pull/15299))
-- Bump `com.azure:azure-core` from 1.49.1 to 1.51.0 ([#15111](https://github.com/opensearch-project/OpenSearch/pull/15111))
-- Bump `org.xerial.snappy:snappy-java` from 1.1.10.5 to 1.1.10.6 ([#15207](https://github.com/opensearch-project/OpenSearch/pull/15207))
-- Bump `com.azure:azure-xml` from 1.0.0 to 1.1.0 ([#15206](https://github.com/opensearch-project/OpenSearch/pull/15206))
-- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
-- Bump `reactor-netty` from 1.1.21 to 1.1.22 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
-- Bump `org.apache.kerby:kerb-admin` from 2.0.3 to 2.1.0 ([#15301](https://github.com/opensearch-project/OpenSearch/pull/15301))
-- Bump `com.azure:azure-core-http-netty` from 1.15.1 to 1.15.3 ([#15300](https://github.com/opensearch-project/OpenSearch/pull/15300))
-- Bump `com.gradle.develocity` from 3.17.6 to 3.18 ([#15297](https://github.com/opensearch-project/OpenSearch/pull/15297))
-- Bump `commons-cli:commons-cli` from 1.8.0 to 1.9.0 ([#15298](https://github.com/opensearch-project/OpenSearch/pull/15298))
-- Bump `opentelemetry` from 1.40.0 to 1.41.0 ([#15361](https://github.com/opensearch-project/OpenSearch/pull/15361))
-- Bump `opentelemetry-semconv` from 1.26.0-alpha to 1.27.0-alpha ([#15361](https://github.com/opensearch-project/OpenSearch/pull/15361))
-- Bump `tj-actions/changed-files` from 44 to 45 ([#15422](https://github.com/opensearch-project/OpenSearch/pull/15422))
-- Bump `dnsjava:dnsjava` from 3.6.0 to 3.6.1 ([#15418](https://github.com/opensearch-project/OpenSearch/pull/15418))
-- Bump `com.netflix.nebula.ospackage-base` from 11.9.1 to 11.10.0 ([#15419](https://github.com/opensearch-project/OpenSearch/pull/15419))
-- Bump `org.roaringbitmap:RoaringBitmap` from 1.1.0 to 1.2.1 ([#15423](https://github.com/opensearch-project/OpenSearch/pull/15423))
-- Bump `icu4j` from 70.1 to 75.1 ([#15469](https://github.com/opensearch-project/OpenSearch/pull/15469))
 - Bump `com.azure:azure-identity` from 1.13.0 to 1.13.2 ([#15578](https://github.com/opensearch-project/OpenSearch/pull/15578))
 
 ### Changed
-- Add lower limit for primary and replica batch allocators timeout ([#14979](https://github.com/opensearch-project/OpenSearch/pull/14979))
-- Optimize regexp-based include/exclude on aggregations when pattern matches prefixes ([#14371](https://github.com/opensearch-project/OpenSearch/pull/14371))
-- Replace and block usages of org.apache.logging.log4j.util.Strings ([#15238](https://github.com/opensearch-project/OpenSearch/pull/15238))
-- Remote publication using minimum node version for backward compatibility ([#15216](https://github.com/opensearch-project/OpenSearch/pull/15216))
 
 
 ### Deprecated
 
 ### Removed
-- Remove some unused code in the search backpressure package ([#15518](https://github.com/opensearch-project/OpenSearch/pull/15518))
 
 ### Fixed
-- Fix constraint bug which allows more primary shards than average primary shards per index ([#14908](https://github.com/opensearch-project/OpenSearch/pull/14908))
-- Fix NPE when bulk ingest with empty pipeline ([#15033](https://github.com/opensearch-project/OpenSearch/pull/15033))
-- Fix missing value of FieldSort for unsigned_long ([#14963](https://github.com/opensearch-project/OpenSearch/pull/14963))
-- Fix delete index template failed when the index template matches a data stream but is unused ([#15080](https://github.com/opensearch-project/OpenSearch/pull/15080))
-- Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126))
-- Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620))
-- Fix incorrect parameter names in MinHash token filter configuration handling ([#15233](https://github.com/opensearch-project/OpenSearch/pull/15233))
-- Fix range aggregation optimization ignoring top level queries ([#15287](https://github.com/opensearch-project/OpenSearch/pull/15287))
-- Fix indexing error when flat_object field is explicitly null ([#15375](https://github.com/opensearch-project/OpenSearch/pull/15375))
-- Fix split response processor not included in allowlist ([#15393](https://github.com/opensearch-project/OpenSearch/pull/15393))
-- Fix unchecked cast in dynamic action map getter ([#15394](https://github.com/opensearch-project/OpenSearch/pull/15394))
-- Fix null values indexed as "null" strings in flat_object field ([#14069](https://github.com/opensearch-project/OpenSearch/pull/14069))
-- Fix terms query on wildcard field returns nothing ([#15607](https://github.com/opensearch-project/OpenSearch/pull/15607))
 
 ### Security
 
-[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.16...2.x
+[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.17...2.x
diff --git a/release-notes/opensearch.release-notes-2.17.0.md b/release-notes/opensearch.release-notes-2.17.0.md
new file mode 100644
index 0000000000000..18b7c9ac7cd68
--- /dev/null
+++ b/release-notes/opensearch.release-notes-2.17.0.md
@@ -0,0 +1,104 @@
+## 2024-09-17 Version 2.17.0 Release Notes
+
+## [2.17.0]
+### Added
+- [Workload Management] Add Settings for Workload Management feature ([#15028](https://github.com/opensearch-project/OpenSearch/pull/15028))
+- Fix for hasInitiatedFetching to fix allocation explain and manual reroute APIs (([#14972](https://github.com/opensearch-project/OpenSearch/pull/14972))
+- [Workload Management] Add queryGroupId to Task ([14708](https://github.com/opensearch-project/OpenSearch/pull/14708))
+- Add setting to ignore throttling nodes for allocation of unassigned primaries in remote restore ([#14991](https://github.com/opensearch-project/OpenSearch/pull/14991))
+- [Workload Management] Add Delete QueryGroup API Logic ([#14735](https://github.com/opensearch-project/OpenSearch/pull/14735))
+- [Streaming Indexing] Enhance RestClient with a new streaming API support ([#14437](https://github.com/opensearch-project/OpenSearch/pull/14437))
+- Add basic aggregation support for derived fields ([#14618](https://github.com/opensearch-project/OpenSearch/pull/14618))
+- [Workload Management] Add Create QueryGroup API Logic ([#14680](https://github.com/opensearch-project/OpenSearch/pull/14680))- [Workload Management] Add Create QueryGroup API Logic ([#14680](https://github.com/opensearch-project/OpenSearch/pull/14680))
+- Add ThreadContextPermission for markAsSystemContext and allow core to perform the method ([#15016](https://github.com/opensearch-project/OpenSearch/pull/15016))
+- Add ThreadContextPermission for stashAndMergeHeaders and stashWithOrigin ([#15039](https://github.com/opensearch-project/OpenSearch/pull/15039))
+- [Concurrent Segment Search] Support composite aggregations with scripting ([#15072](https://github.com/opensearch-project/OpenSearch/pull/15072))
+- Add `rangeQuery` and `regexpQuery` for `constant_keyword` field type ([#14711](https://github.com/opensearch-project/OpenSearch/pull/14711))
+- Add took time to request nodes stats ([#15054](https://github.com/opensearch-project/OpenSearch/pull/15054))
+- [Workload Management] Add Get QueryGroup API Logic ([14709](https://github.com/opensearch-project/OpenSearch/pull/14709))
+- [Workload Management] Add Update QueryGroup API Logic ([#14775](https://github.com/opensearch-project/OpenSearch/pull/14775))
+- [Workload Management] QueryGroup resource tracking framework changes ([#13897](https://github.com/opensearch-project/OpenSearch/pull/13897))
+- Support filtering on a large list encoded by bitmap ([#14774](https://github.com/opensearch-project/OpenSearch/pull/14774))
+- Add slice execution listeners to SearchOperationListener interface ([#15153](https://github.com/opensearch-project/OpenSearch/pull/15153))
+- Make balanced shards allocator timebound ([#15239](https://github.com/opensearch-project/OpenSearch/pull/15239))
+- Add allowlist setting for ingest-geoip and ingest-useragent ([#15325](https://github.com/opensearch-project/OpenSearch/pull/15325))
+- Adding access to noSubMatches and noOverlappingMatches in Hyphenation ([#13895](https://github.com/opensearch-project/OpenSearch/pull/13895))
+- Star tree mapping changes ([#14605](https://github.com/opensearch-project/OpenSearch/pull/14605))
+- Add support for index level max slice count setting for concurrent segment search ([#15336](https://github.com/opensearch-project/OpenSearch/pull/15336))
+- Support cancellation for cat shards and node stats API.([#13966](https://github.com/opensearch-project/OpenSearch/pull/13966))
+- [Streaming Indexing] Introduce bulk HTTP API streaming flavor ([#15381](https://github.com/opensearch-project/OpenSearch/pull/15381))
+- Add support for centralize snapshot creation with pinned timestamp ([#15124](https://github.com/opensearch-project/OpenSearch/pull/15124))
+- Add concurrent search support for Derived Fields ([#15326](https://github.com/opensearch-project/OpenSearch/pull/15326))
+- [Workload Management] Add query group stats constructs ([#15343](https://github.com/opensearch-project/OpenSearch/pull/15343)))
+- Add limit on number of processors for Ingest pipeline([#15460](https://github.com/opensearch-project/OpenSearch/pull/15465)).
+- Add runAs to Subject interface and introduce IdentityAwarePlugin extension point ([#14630](https://github.com/opensearch-project/OpenSearch/pull/14630))
+- [Workload Management] Add rejection logic for co-ordinator and shard level requests ([#15428](https://github.com/opensearch-project/OpenSearch/pull/15428)))
+- Adding translog durability validation in index templates ([#15494](https://github.com/opensearch-project/OpenSearch/pull/15494))
+- [Range Queries] Add new approximateable query framework to short-circuit range queries ([#13788](https://github.com/opensearch-project/OpenSearch/pull/13788))
+- [Workload Management] Add query group level failure tracking ([#15227](https://github.com/opensearch-project/OpenSearch/pull/15527))
+- [Reader Writer Separation] Add experimental search replica shard type to achieve reader writer separation ([#15237](https://github.com/opensearch-project/OpenSearch/pull/15237))
+- Add index creation using the context field ([#15290](https://github.com/opensearch-project/OpenSearch/pull/15290))
+- [Remote Publication] Add remote download stats ([#15291](https://github.com/opensearch-project/OpenSearch/pull/15291))
+- Add support to upload snapshot shard blobs with hashed prefix ([#15426](https://github.com/opensearch-project/OpenSearch/pull/15426))
+- Add prefix support to hashed prefix & infix path types on remote store ([#15557](https://github.com/opensearch-project/OpenSearch/pull/15557))
+- Add canRemain method to TargetPoolAllocationDecider to move shards from local to remote pool for hot to warm tiering ([#15010](https://github.com/opensearch-project/OpenSearch/pull/15010))
+- Add support for pluggable deciders for concurrent search ([#15363](https://github.com/opensearch-project/OpenSearch/pull/15363))
+- Optimise snapshot deletion to speed up snapshot deletion and creation ([#15568](https://github.com/opensearch-project/OpenSearch/pull/15568))
+- [Remote Publication] Added checksum validation for cluster state behind a cluster setting ([#15218](https://github.com/opensearch-project/OpenSearch/pull/15218))
+- Optimize NodeIndicesStats output behind flag ([#14454](https://github.com/opensearch-project/OpenSearch/pull/14454))
+- Add support for comma-separated list of index names to be used with Snapshot Status API ([#15409](https://github.com/opensearch-project/OpenSearch/pull/15409))[SnapshotV2] Snapshot Status API changes (#15409))
+- ClusterManagerTaskThrottler Improvements ([#15508](https://github.com/opensearch-project/OpenSearch/pull/15508))
+- Relax the join validation for Remote State publication ([#15471](https://github.com/opensearch-project/OpenSearch/pull/15471))
+- Reset DiscoveryNodes in all transport node actions request ([#15131](https://github.com/opensearch-project/OpenSearch/pull/15131))
+
+### Dependencies
+- Bump `netty` from 4.1.111.Final to 4.1.112.Final ([#15081](https://github.com/opensearch-project/OpenSearch/pull/15081))
+- Bump `org.apache.commons:commons-lang3` from 3.14.0 to 3.15.0 ([#14861](https://github.com/opensearch-project/OpenSearch/pull/14861))
+- OpenJDK Update (July 2024 Patch releases) ([#14998](https://github.com/opensearch-project/OpenSearch/pull/14998))
+- Bump `com.microsoft.azure:msal4j` from 1.16.1 to 1.17.0 ([#14995](https://github.com/opensearch-project/OpenSearch/pull/14995), [#15420](https://github.com/opensearch-project/OpenSearch/pull/15420))
+- Bump `actions/github-script` from 6 to 7 ([#14997](https://github.com/opensearch-project/OpenSearch/pull/14997))
+- Bump `org.tukaani:xz` from 1.9 to 1.10 ([#15110](https://github.com/opensearch-project/OpenSearch/pull/15110))
+- Bump `org.apache.avro:avro` from 1.11.3 to 1.12.0 in /plugins/repository-hdfs ([#15119](https://github.com/opensearch-project/OpenSearch/pull/15119))
+- Bump `org.bouncycastle:bcpg-fips` from 1.0.7.1 to 2.0.9 ([#15103](https://github.com/opensearch-project/OpenSearch/pull/15103), [#15299](https://github.com/opensearch-project/OpenSearch/pull/15299))
+- Bump `com.azure:azure-core` from 1.49.1 to 1.51.0 ([#15111](https://github.com/opensearch-project/OpenSearch/pull/15111))
+- Bump `org.xerial.snappy:snappy-java` from 1.1.10.5 to 1.1.10.6 ([#15207](https://github.com/opensearch-project/OpenSearch/pull/15207))
+- Bump `com.azure:azure-xml` from 1.0.0 to 1.1.0 ([#15206](https://github.com/opensearch-project/OpenSearch/pull/15206))
+- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
+- Bump `reactor-netty` from 1.1.21 to 1.1.22 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
+- Bump `org.apache.kerby:kerb-admin` from 2.0.3 to 2.1.0 ([#15301](https://github.com/opensearch-project/OpenSearch/pull/15301))
+- Bump `com.azure:azure-core-http-netty` from 1.15.1 to 1.15.3 ([#15300](https://github.com/opensearch-project/OpenSearch/pull/15300))
+- Bump `com.gradle.develocity` from 3.17.6 to 3.18 ([#15297](https://github.com/opensearch-project/OpenSearch/pull/15297))
+- Bump `commons-cli:commons-cli` from 1.8.0 to 1.9.0 ([#15298](https://github.com/opensearch-project/OpenSearch/pull/15298))
+- Bump `opentelemetry` from 1.40.0 to 1.41.0 ([#15361](https://github.com/opensearch-project/OpenSearch/pull/15361))
+- Bump `opentelemetry-semconv` from 1.26.0-alpha to 1.27.0-alpha ([#15361](https://github.com/opensearch-project/OpenSearch/pull/15361))
+- Bump `tj-actions/changed-files` from 44 to 45 ([#15422](https://github.com/opensearch-project/OpenSearch/pull/15422))
+- Bump `dnsjava:dnsjava` from 3.6.0 to 3.6.1 ([#15418](https://github.com/opensearch-project/OpenSearch/pull/15418))
+- Bump `com.netflix.nebula.ospackage-base` from 11.9.1 to 11.10.0 ([#15419](https://github.com/opensearch-project/OpenSearch/pull/15419))
+- Bump `org.roaringbitmap:RoaringBitmap` from 1.1.0 to 1.2.1 ([#15423](https://github.com/opensearch-project/OpenSearch/pull/15423))
+- Bump `icu4j` from 70.1 to 75.1 ([#15469](https://github.com/opensearch-project/OpenSearch/pull/15469))
+
+### Changed
+- Add lower limit for primary and replica batch allocators timeout ([#14979](https://github.com/opensearch-project/OpenSearch/pull/14979))
+- Optimize regexp-based include/exclude on aggregations when pattern matches prefixes ([#14371](https://github.com/opensearch-project/OpenSearch/pull/14371))
+- Replace and block usages of org.apache.logging.log4j.util.Strings ([#15238](https://github.com/opensearch-project/OpenSearch/pull/15238))
+- Remote publication using minimum node version for backward compatibility ([#15216](https://github.com/opensearch-project/OpenSearch/pull/15216))
+
+### Deprecated
+
+### Removed
+- Remove some unused code in the search backpressure package ([#15518](https://github.com/opensearch-project/OpenSearch/pull/15518))
+
+### Fixed
+- Fix constraint bug which allows more primary shards than average primary shards per index ([#14908](https://github.com/opensearch-project/OpenSearch/pull/14908))
+- Fix NPE when bulk ingest with empty pipeline ([#15033](https://github.com/opensearch-project/OpenSearch/pull/15033))
+- Fix missing value of FieldSort for unsigned_long ([#14963](https://github.com/opensearch-project/OpenSearch/pull/14963))
+- Fix delete index template failed when the index template matches a data stream but is unused ([#15080](https://github.com/opensearch-project/OpenSearch/pull/15080))
+- Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126))
+- Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620))
+- Fix incorrect parameter names in MinHash token filter configuration handling ([#15233](https://github.com/opensearch-project/OpenSearch/pull/15233))
+- Fix range aggregation optimization ignoring top level queries ([#15287](https://github.com/opensearch-project/OpenSearch/pull/15287))
+- Fix indexing error when flat_object field is explicitly null ([#15375](https://github.com/opensearch-project/OpenSearch/pull/15375))
+- Fix split response processor not included in allowlist ([#15393](https://github.com/opensearch-project/OpenSearch/pull/15393))
+- Fix unchecked cast in dynamic action map getter ([#15394](https://github.com/opensearch-project/OpenSearch/pull/15394))
+- Fix null values indexed as "null" strings in flat_object field ([#14069](https://github.com/opensearch-project/OpenSearch/pull/14069))
+- Fix terms query on wildcard field returns nothing ([#15607](https://github.com/opensearch-project/OpenSearch/pull/15607))

From e6862cdc868b20afed6ac23696d5e9ae7df4906e Mon Sep 17 00:00:00 2001
From: "opensearch-trigger-bot[bot]"
 <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com>
Date: Thu, 5 Sep 2024 17:58:14 -0400
Subject: [PATCH 5/7] Adding release notes (#15771) (#15779)

(cherry picked from commit e892f23c443ddfbfd82bf0e447636231d34ecd2f)

Signed-off-by: Harsha Vamsi Kalluri <harshavamsi096@gmail.com>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 release-notes/opensearch.release-notes-2.17.0.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/release-notes/opensearch.release-notes-2.17.0.md b/release-notes/opensearch.release-notes-2.17.0.md
index 18b7c9ac7cd68..41c7a89ea634c 100644
--- a/release-notes/opensearch.release-notes-2.17.0.md
+++ b/release-notes/opensearch.release-notes-2.17.0.md
@@ -50,6 +50,7 @@
 - ClusterManagerTaskThrottler Improvements ([#15508](https://github.com/opensearch-project/OpenSearch/pull/15508))
 - Relax the join validation for Remote State publication ([#15471](https://github.com/opensearch-project/OpenSearch/pull/15471))
 - Reset DiscoveryNodes in all transport node actions request ([#15131](https://github.com/opensearch-project/OpenSearch/pull/15131))
+- MultiTermQueries in keyword fields now default to `indexed` approach and gated behind cluster setting ([#15637](https://github.com/opensearch-project/OpenSearch/pull/15637))
 
 ### Dependencies
 - Bump `netty` from 4.1.111.Final to 4.1.112.Final ([#15081](https://github.com/opensearch-project/OpenSearch/pull/15081))

From 93b0755bd64fb35c61aef0f5b223c30bfd9a84b2 Mon Sep 17 00:00:00 2001
From: "opensearch-trigger-bot[bot]"
 <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com>
Date: Thu, 5 Sep 2024 18:00:25 -0400
Subject: [PATCH 6/7] Fixing MacOS 13 assemble workflows (#15747) (#15773)

(cherry picked from commit fe61e4f299d4b832dc232cb223ec946585177f52)

Signed-off-by: Andriy Redko <andriy.redko@aiven.io>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 .github/workflows/assemble.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/assemble.yml b/.github/workflows/assemble.yml
index 294627622a136..b3838b8e5ae97 100644
--- a/.github/workflows/assemble.yml
+++ b/.github/workflows/assemble.yml
@@ -30,8 +30,11 @@ jobs:
       - name: Setup docker (missing on MacOS)
         id: setup_docker
         if: runner.os == 'macos'
+        continue-on-error: true
         run: |
-          exit 0;
+          brew install docker colima coreutils
+          gtimeout 15m colima start
+        shell: bash
       - name: Run Gradle (assemble)
         if: runner.os == 'macos' && steps.setup_docker.outcome != 'success'
         run: |
@@ -45,4 +48,4 @@ jobs:
       - name: Run Gradle (assemble)
         if: runner.os == 'macos' && steps.setup_docker.outcome == 'success'
         run: |
-          exit 0;
+          ./gradlew assemble --parallel --no-build-cache -PDISABLE_BUILD_CACHE -Druntime.java=${{ matrix.java }}

From 3b8a741a8547c21317449b7d783edd378a8eecb7 Mon Sep 17 00:00:00 2001
From: Harsha Vamsi Kalluri <harshavamsi096@gmail.com>
Date: Thu, 5 Sep 2024 15:51:01 -0700
Subject: [PATCH 7/7] Remove un-necessary changelog entry (#15781)

Signed-off-by: Harsha Vamsi Kalluri <harshavamsi096@gmail.com>
---
 CHANGELOG.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 435df7bda62a9..6397b81a5f1c3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased 2.x]
 ### Added
-- MultiTermQueries in keyword fields now default to `indexed` approach and gated behind cluster setting ([#15637](https://github.com/opensearch-project/OpenSearch/pull/15637))
 
 ### Dependencies
 - Bump `com.azure:azure-identity` from 1.13.0 to 1.13.2 ([#15578](https://github.com/opensearch-project/OpenSearch/pull/15578))