diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index f769f8729c25b..0afe8617e156f 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -276,6 +276,7 @@ public void apply(Settings value, Settings current, Settings previous) { BalancedShardsAllocator.THRESHOLD_SETTING, BalancedShardsAllocator.IGNORE_THROTTLE_FOR_REMOTE_RESTORE, BalancedShardsAllocator.ALLOCATOR_TIMEOUT_SETTING, + BalancedShardsAllocator.FOLLOW_UP_REROUTE_PRIORITY_SETTING, BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING, BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING, BreakerSettings.CIRCUIT_BREAKER_TYPE, @@ -353,6 +354,7 @@ public void apply(Settings value, Settings current, Settings previous) { ShardsBatchGatewayAllocator.GATEWAY_ALLOCATOR_BATCH_SIZE, ShardsBatchGatewayAllocator.PRIMARY_BATCH_ALLOCATOR_TIMEOUT_SETTING, ShardsBatchGatewayAllocator.REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING, + ShardsBatchGatewayAllocator.FOLLOW_UP_REROUTE_PRIORITY_SETTING, PersistedClusterStateService.SLOW_WRITE_LOGGING_THRESHOLD, NetworkModule.HTTP_DEFAULT_TYPE_SETTING, NetworkModule.TRANSPORT_DEFAULT_TYPE_SETTING, diff --git a/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java b/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java index da00a0fb686d3..8e63133e87806 100644 --- a/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java +++ b/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java @@ -53,6 +53,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Set; @@ -82,6 +83,7 @@ public class ShardsBatchGatewayAllocator implements ExistingShardsAllocator { private TimeValue primaryShardsBatchGatewayAllocatorTimeout; private TimeValue replicaShardsBatchGatewayAllocatorTimeout; + private volatile Priority followUpRerouteTaskPriority; public static final TimeValue MIN_ALLOCATOR_TIMEOUT = TimeValue.timeValueSeconds(20); private final ClusterManagerMetrics clusterManagerMetrics; @@ -145,6 +147,32 @@ public void validate(TimeValue timeValue) { Setting.Property.Dynamic ); + /** + * Adjusts the priority of the followup reroute task when current round times out. NORMAL is right for reasonable clusters, + * but for a cluster in a messed up state which is starving NORMAL priority tasks, it might be necessary to raise this higher + * to allocate existing shards. + */ + public static final Setting FOLLOW_UP_REROUTE_PRIORITY_SETTING = new Setting<>( + "cluster.routing.allocation.shards_batch_gateway_allocator.schedule_reroute.priority", + Priority.NORMAL.toString(), + ShardsBatchGatewayAllocator::parseReroutePriority, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + + private static Priority parseReroutePriority(String priorityString) { + final Priority priority = Priority.valueOf(priorityString.toUpperCase(Locale.ROOT)); + switch (priority) { + case NORMAL: + case HIGH: + case URGENT: + return priority; + } + throw new IllegalArgumentException( + "priority [" + priority + "] not supported for [" + FOLLOW_UP_REROUTE_PRIORITY_SETTING.getKey() + "]" + ); + } + private final RerouteService rerouteService; private final PrimaryShardBatchAllocator primaryShardBatchAllocator; private final ReplicaShardBatchAllocator replicaShardBatchAllocator; @@ -179,6 +207,8 @@ public ShardsBatchGatewayAllocator( this.replicaShardsBatchGatewayAllocatorTimeout = REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING.get(settings); clusterSettings.addSettingsUpdateConsumer(REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING, this::setReplicaBatchAllocatorTimeout); this.clusterManagerMetrics = clusterManagerMetrics; + setFollowUpRerouteTaskPriority(FOLLOW_UP_REROUTE_PRIORITY_SETTING.get(settings)); + clusterSettings.addSettingsUpdateConsumer(FOLLOW_UP_REROUTE_PRIORITY_SETTING, this::setFollowUpRerouteTaskPriority); } @Override @@ -309,7 +339,7 @@ public void onComplete() { assert rerouteService != null; rerouteService.reroute( "reroute after existing shards allocator [P] timed out", - Priority.NORMAL, + followUpRerouteTaskPriority, ActionListener.wrap( r -> logger.trace("reroute after existing shards allocator timed out completed"), e -> logger.debug("reroute after existing shards allocator timed out failed", e) @@ -344,7 +374,7 @@ public void onComplete() { assert rerouteService != null; rerouteService.reroute( "reroute after existing shards allocator [R] timed out", - Priority.NORMAL, + followUpRerouteTaskPriority, ActionListener.wrap( r -> logger.trace("reroute after existing shards allocator timed out completed"), e -> logger.debug("reroute after existing shards allocator timed out failed", e) @@ -920,4 +950,8 @@ protected void setPrimaryBatchAllocatorTimeout(TimeValue primaryShardsBatchGatew protected void setReplicaBatchAllocatorTimeout(TimeValue replicaShardsBatchGatewayAllocatorTimeout) { this.replicaShardsBatchGatewayAllocatorTimeout = replicaShardsBatchGatewayAllocatorTimeout; } + + private void setFollowUpRerouteTaskPriority(Priority followUpRerouteTaskPriority) { + this.followUpRerouteTaskPriority = followUpRerouteTaskPriority; + } }