Skip to content

Commit

Permalink
PrimaryShardAllocator refactor to abstract out shard state and method…
Browse files Browse the repository at this point in the history
… calls (opensearch-project#9760)

* PrimaryShardAllocator refactor to abstract out shard state and method calls

Signed-off-by: Shivansh Arora <[email protected]>
Signed-off-by: Shivansh Arora <[email protected]>
  • Loading branch information
shiv0408 committed Apr 25, 2024
1 parent f0aa97a commit a6fc9d2
Showing 1 changed file with 83 additions and 36 deletions.
119 changes: 83 additions & 36 deletions server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,27 +81,28 @@ public abstract class PrimaryShardAllocator extends BaseGatewayShardAllocator {
/**
* Is the allocator responsible for allocating the given {@link ShardRouting}?
*/
private static boolean isResponsibleFor(final ShardRouting shard) {
protected static boolean isResponsibleFor(final ShardRouting shard) {
return shard.primary() // must be primary
&& shard.unassigned() // must be unassigned
// only handle either an existing store or a snapshot recovery
&& (shard.recoverySource().getType() == RecoverySource.Type.EXISTING_STORE
|| shard.recoverySource().getType() == RecoverySource.Type.SNAPSHOT);
}

@Override
public AllocateUnassignedDecision makeAllocationDecision(
final ShardRouting unassignedShard,
final RoutingAllocation allocation,
final Logger logger
) {
/**
* Skip doing fetchData call for a shard if recovery mode is snapshot. Also do not take decision if allocator is
* not responsible for this particular shard.
*
* @param unassignedShard unassigned shard routing
* @param allocation routing allocation object
* @return allocation decision taken for this shard
*/
protected AllocateUnassignedDecision getInEligibleShardDecision(ShardRouting unassignedShard, RoutingAllocation allocation) {
if (isResponsibleFor(unassignedShard) == false) {
// this allocator is not responsible for allocating this shard
return AllocateUnassignedDecision.NOT_TAKEN;
}

final boolean explain = allocation.debugDecision();

if (unassignedShard.recoverySource().getType() == RecoverySource.Type.SNAPSHOT
&& allocation.snapshotShardSizeInfo().getShardSize(unassignedShard) == null) {
List<NodeAllocationResult> nodeDecisions = null;
Expand All @@ -110,17 +111,52 @@ public AllocateUnassignedDecision makeAllocationDecision(
}
return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.FETCHING_SHARD_DATA, nodeDecisions);
}
return null;
}

@Override
public AllocateUnassignedDecision makeAllocationDecision(
final ShardRouting unassignedShard,
final RoutingAllocation allocation,
final Logger logger
) {
AllocateUnassignedDecision decision = getInEligibleShardDecision(unassignedShard, allocation);
if (decision != null) {
return decision;
}
final FetchResult<NodeGatewayStartedShards> shardState = fetchData(unassignedShard, allocation);
if (shardState.hasData() == false) {
List<NodeGatewayStartedShards> nodeShardStates = adaptToNodeStartedShardList(shardState);
return getAllocationDecision(unassignedShard, allocation, nodeShardStates, logger);
}

/**
* Transforms {@link FetchResult} of {@link NodeGatewayStartedShards} to {@link List} of {@link NodeGatewayStartedShards}
* Returns null if {@link FetchResult} does not have any data.
*/
private static List<NodeGatewayStartedShards> adaptToNodeStartedShardList(FetchResult<NodeGatewayStartedShards> shardsState) {
if (!shardsState.hasData()) {
return null;
}
List<NodeGatewayStartedShards> nodeShardStates = new ArrayList<>();
shardsState.getData().forEach((node, nodeGatewayStartedShard) -> { nodeShardStates.add(nodeGatewayStartedShard); });
return nodeShardStates;
}

protected AllocateUnassignedDecision getAllocationDecision(
ShardRouting unassignedShard,
RoutingAllocation allocation,
List<NodeGatewayStartedShards> shardState,
Logger logger
) {
final boolean explain = allocation.debugDecision();
if (shardState == null) {
allocation.setHasPendingAsyncFetch();
List<NodeAllocationResult> nodeDecisions = null;
if (explain) {
nodeDecisions = buildDecisionsForAllNodes(unassignedShard, allocation);
}
return AllocateUnassignedDecision.no(AllocationStatus.FETCHING_SHARD_DATA, nodeDecisions);
}

// don't create a new IndexSetting object for every shard as this could cause a lot of garbage
// on cluster restart if we allocate a boat load of shards
final IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(unassignedShard.index());
Expand Down Expand Up @@ -260,11 +296,11 @@ public AllocateUnassignedDecision makeAllocationDecision(
*/
private static List<NodeAllocationResult> buildNodeDecisions(
NodesToAllocate nodesToAllocate,
FetchResult<NodeGatewayStartedShards> fetchedShardData,
List<NodeGatewayStartedShards> fetchedShardData,
Set<String> inSyncAllocationIds
) {
List<NodeAllocationResult> nodeResults = new ArrayList<>();
Collection<NodeGatewayStartedShards> ineligibleShards;
Collection<NodeGatewayStartedShards> ineligibleShards = new ArrayList<>();
if (nodesToAllocate != null) {
final Set<DiscoveryNode> discoNodes = new HashSet<>();
nodeResults.addAll(
Expand All @@ -280,15 +316,13 @@ private static List<NodeAllocationResult> buildNodeDecisions(
})
.collect(Collectors.toList())
);
ineligibleShards = fetchedShardData.getData()
.values()
.stream()
ineligibleShards = fetchedShardData.stream()
.filter(shardData -> discoNodes.contains(shardData.getNode()) == false)
.collect(Collectors.toList());
} else {
// there were no shard copies that were eligible for being assigned the allocation,
// so all fetched shard data are ineligible shards
ineligibleShards = fetchedShardData.getData().values();
ineligibleShards = fetchedShardData;
}

nodeResults.addAll(
Expand Down Expand Up @@ -328,12 +362,12 @@ protected static NodeShardsResult buildNodeShardsResult(
boolean matchAnyShard,
Set<String> ignoreNodes,
Set<String> inSyncAllocationIds,
FetchResult<NodeGatewayStartedShards> shardState,
List<NodeGatewayStartedShards> shardState,
Logger logger
) {
List<NodeGatewayStartedShards> nodeShardStates = new ArrayList<>();
int numberOfAllocationsFound = 0;
for (NodeGatewayStartedShards nodeShardState : shardState.getData().values()) {
for (NodeGatewayStartedShards nodeShardState : shardState) {
DiscoveryNode node = nodeShardState.getNode();
String allocationId = nodeShardState.allocationId();

Expand Down Expand Up @@ -386,11 +420,27 @@ protected static NodeShardsResult buildNodeShardsResult(
}
}

/*
Orders the active shards copies based on below comparators
1. No store exception i.e. shard copy is readable
2. Prefer previous primary shard
3. Prefer shard copy with the highest replication checkpoint. It is NO-OP for doc rep enabled indices.
nodeShardStates.sort(createActiveShardComparator(matchAnyShard, inSyncAllocationIds));

if (logger.isTraceEnabled()) {
logger.trace(
"{} candidates for allocation: {}",
shard,
nodeShardStates.stream().map(s -> s.getNode().getName()).collect(Collectors.joining(", "))
);
}
return new NodeShardsResult(nodeShardStates, numberOfAllocationsFound);
}

private static Comparator<NodeGatewayStartedShards> createActiveShardComparator(
boolean matchAnyShard,
Set<String> inSyncAllocationIds
) {
/**
* Orders the active shards copies based on below comparators
* 1. No store exception i.e. shard copy is readable
* 2. Prefer previous primary shard
* 3. Prefer shard copy with the highest replication checkpoint. It is NO-OP for doc rep enabled indices.
*/
final Comparator<NodeGatewayStartedShards> comparator; // allocation preference
if (matchAnyShard) {
Expand All @@ -406,16 +456,7 @@ protected static NodeShardsResult buildNodeShardsResult(
.thenComparing(HIGHEST_REPLICATION_CHECKPOINT_FIRST_COMPARATOR);
}

nodeShardStates.sort(comparator);

if (logger.isTraceEnabled()) {
logger.trace(
"{} candidates for allocation: {}",
shard,
nodeShardStates.stream().map(s -> s.getNode().getName()).collect(Collectors.joining(", "))
);
}
return new NodeShardsResult(nodeShardStates, numberOfAllocationsFound);
return comparator;
}

/**
Expand Down Expand Up @@ -457,7 +498,10 @@ private static NodesToAllocate buildNodesToAllocate(

protected abstract FetchResult<NodeGatewayStartedShards> fetchData(ShardRouting shard, RoutingAllocation allocation);

private static class NodeShardsResult {
/**
* This class encapsulates the result of a call to {@link #buildNodeShardsResult}
*/
static class NodeShardsResult {
final List<NodeGatewayStartedShards> orderedAllocationCandidates;
final int allocationsFound;

Expand All @@ -467,7 +511,10 @@ private static class NodeShardsResult {
}
}

static class NodesToAllocate {
/**
* This class encapsulates the result of a call to {@link #buildNodesToAllocate}
*/
protected static class NodesToAllocate {
final List<DecidedNode> yesNodeShards;
final List<DecidedNode> throttleNodeShards;
final List<DecidedNode> noNodeShards;
Expand Down

0 comments on commit a6fc9d2

Please sign in to comment.