From 5164fe2e4d7ebb3aa6d952837fc2dabc45c1e171 Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Thu, 27 Jul 2023 16:07:24 -0700 Subject: [PATCH] Fix SegmentReplicationIT.testReplicaHasDiffFilesThanPrimary for node-node replication (#8912) (#8936) * Fix SegmentReplicationIT.testReplicahasDiffFilesThanPrimary This test is now failing for node-node replication. On the primary shard the prepareSegmentReplication method should cancel any ongoing replication if it is running and start a new sync. Thisis incorrectly using Map.compute which will not replace the existing handler entry in the allocationIdToHandlers map. It will only cancel the existing source handler. As a result this can leave the copyState map with an entry and hold an index commit while the test is cleaning up. The copyState is only cleared when a handler is cancelled directly or from a cluster state update. * PR feedback. --------- (cherry picked from commit 4ad418210a51c518119a4c9c565fbf7e9bc4b5c1) Signed-off-by: Marc Handalian Signed-off-by: github-actions[bot] Co-authored-by: github-actions[bot] --- .../OngoingSegmentReplications.java | 26 ++++++++++---- .../OngoingSegmentReplicationsTests.java | 34 +++++++++++++++++++ 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java b/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java index e0e356f1531e1..4712ae6c18759 100644 --- a/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java +++ b/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java @@ -139,13 +139,25 @@ void startSegmentCopy(GetSegmentFilesRequest request, ActionListener { - if (segrepHandler != null) { - logger.warn("Override handler for allocation id {}", request.getTargetAllocationId()); - cancelHandlers(handler -> handler.getAllocationId().equals(request.getTargetAllocationId()), "cancel due to retry"); - } - return createTargetHandler(request.getTargetNode(), copyState, request.getTargetAllocationId(), fileChunkWriter); - }); + final SegmentReplicationSourceHandler newHandler = createTargetHandler( + request.getTargetNode(), + copyState, + request.getTargetAllocationId(), + fileChunkWriter + ); + final SegmentReplicationSourceHandler existingHandler = allocationIdToHandlers.putIfAbsent( + request.getTargetAllocationId(), + newHandler + ); + // If we are already replicating to this allocation Id, cancel the old and replace with a new execution. + // This will clear the old handler & referenced copy state holding an incref'd indexCommit. + if (existingHandler != null) { + logger.warn("Override handler for allocation id {}", request.getTargetAllocationId()); + cancelHandlers(handler -> handler.getAllocationId().equals(request.getTargetAllocationId()), "cancel due to retry"); + assert allocationIdToHandlers.containsKey(request.getTargetAllocationId()) == false; + allocationIdToHandlers.put(request.getTargetAllocationId(), newHandler); + } + assert allocationIdToHandlers.containsKey(request.getTargetAllocationId()); return copyState; } diff --git a/server/src/test/java/org/opensearch/indices/replication/OngoingSegmentReplicationsTests.java b/server/src/test/java/org/opensearch/indices/replication/OngoingSegmentReplicationsTests.java index 3b289114f5ca1..84a53ae22a6bc 100644 --- a/server/src/test/java/org/opensearch/indices/replication/OngoingSegmentReplicationsTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/OngoingSegmentReplicationsTests.java @@ -403,4 +403,38 @@ public void testCancelForMissingIds() throws IOException { assertEquals(0, replications.cachedCopyStateSize()); closeShards(replica_2); } + + public void testPrepareForReplicationAlreadyReplicating() throws IOException { + OngoingSegmentReplications replications = new OngoingSegmentReplications(mockIndicesService, recoverySettings); + final String replicaAllocationId = replica.routingEntry().allocationId().getId(); + final CheckpointInfoRequest request = new CheckpointInfoRequest(1L, replicaAllocationId, primaryDiscoveryNode, testCheckpoint); + + final CopyState copyState = replications.prepareForReplication(request, mock(FileChunkWriter.class)); + + final SegmentReplicationSourceHandler handler = replications.getHandlers().get(replicaAllocationId); + assertEquals(handler.getCopyState(), copyState); + assertEquals(1, copyState.refCount()); + + ReplicationCheckpoint secondCheckpoint = new ReplicationCheckpoint( + testCheckpoint.getShardId(), + testCheckpoint.getPrimaryTerm(), + testCheckpoint.getSegmentsGen(), + testCheckpoint.getSegmentInfosVersion() + 1, + testCheckpoint.getCodec() + ); + + final CheckpointInfoRequest secondRequest = new CheckpointInfoRequest( + 1L, + replicaAllocationId, + primaryDiscoveryNode, + secondCheckpoint + ); + + final CopyState secondCopyState = replications.prepareForReplication(secondRequest, mock(FileChunkWriter.class)); + final SegmentReplicationSourceHandler secondHandler = replications.getHandlers().get(replicaAllocationId); + assertEquals(secondHandler.getCopyState(), secondCopyState); + assertEquals("New copy state is incref'd", 1, secondCopyState.refCount()); + assertEquals("Old copy state is cleaned up", 0, copyState.refCount()); + + } }