From cc013431cbc8d6aa3f44a7e2a15517cf1bf07986 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 26 Jul 2023 00:26:39 -0700 Subject: [PATCH 1/8] Fix test testDropPrimaryDuringReplication and clean up ReplicationCheckpoint validation. This test is now occasionally failing with replicas having 0 documents. This occurs in a couple of ways: 1. After dropping the old primary the new primary is not publishing a checkpoint to replicas unless it indexes docs from translog after flipping to primary mode. If there is nothing to index, it will not publish a checkpoint, but the other replica could have never sync'd with the original primary and be left out of date. - This PR fixes this by force publishing a checkpoint after the new primary flips to primary mode. 2. The replica receives a checkpoint post failover and cancels its sync with the former primary that is still active, recognizing a primary term bump. However this cancellation is async and immediately starting a new replication event could fail as its still replicating. - This PR fixes this by attempting to process the latest received checkpoint on failure, if the shard is not failed and still behind. This PR also introduces a few changes to ensure the accuracy of the ReplicationCheckpoint tracked on primary & replicas. - Ensure the checkpoint stored in SegmentReplicationTarget is the checkpoint passed from the primary and not locally computed. This ensures checks for primary term are accurate and not using a locally compued operationPrimaryTerm. - Introduces a refresh listener for both primary & replica to update the ReplicationCheckpoint and store it in replicationTracker post refresh rather than redundantly computing when accessed. - Removes unnecessary onCheckpointPublished method used to start replication timers manually. This will happen automatically on primaries once its local cp is updated. Signed-off-by: Marc Handalian --- .../replication/SegmentReplicationIT.java | 10 +- .../index/engine/NRTReplicationEngine.java | 15 +++ .../opensearch/index/shard/IndexShard.java | 64 +++++++---- .../shard/RemoteStoreRefreshListener.java | 1 - .../replication/SegmentReplicationTarget.java | 20 +++- .../SegmentReplicationTargetService.java | 107 +++++++++++------- ...SegmentReplicationCheckpointPublisher.java | 1 - .../engine/NRTReplicationEngineTests.java | 67 +++++++++++ .../SegmentReplicationIndexShardTests.java | 12 +- .../SegmentReplicationTargetServiceTests.java | 63 +++++++---- .../SegmentReplicationTargetTests.java | 14 +-- .../recovery/ReplicationCollectionTests.java | 2 + .../index/shard/IndexShardTestCase.java | 6 +- 13 files changed, 275 insertions(+), 107 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java index 08186bf3f9362..72b6a0296e3bb 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java @@ -44,6 +44,8 @@ import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.cluster.routing.ShardRoutingState; import org.opensearch.cluster.routing.allocation.command.CancelAllocationCommand; +import org.opensearch.common.collect.Tuple; +import org.opensearch.common.concurrent.GatedCloseable; import org.opensearch.core.common.io.stream.NamedWriteableRegistry; import org.opensearch.common.lucene.index.OpenSearchDirectoryReader; import org.opensearch.common.settings.Settings; @@ -60,6 +62,7 @@ import org.opensearch.index.shard.IndexShard; import org.opensearch.core.index.shard.ShardId; import org.opensearch.indices.recovery.FileChunkRequest; +import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; import org.opensearch.indices.replication.common.ReplicationType; import org.opensearch.search.SearchService; import org.opensearch.search.builder.PointInTimeBuilder; @@ -983,8 +986,11 @@ public void testScrollCreatedOnReplica() throws Exception { ) ); final IndexShard replicaShard = getIndexShard(replica, INDEX_NAME); - final SegmentInfos segmentInfos = replicaShard.getLatestSegmentInfosAndCheckpoint().v1().get(); - final Collection snapshottedSegments = segmentInfos.files(false); + final Tuple, ReplicationCheckpoint> tuple = replicaShard.getLatestSegmentInfosAndCheckpoint(); + final Collection snapshottedSegments; + try (final GatedCloseable closeable = tuple.v1()) { + snapshottedSegments = closeable.get().files(false); + } // opens a scrolled query before a flush is called. // this is for testing scroll segment consistency between refresh and flush SearchResponse searchResponse = client(replica).prepareSearch() diff --git a/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java b/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java index e852658d7b3ba..e9987591aac30 100644 --- a/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java @@ -34,6 +34,7 @@ import java.io.Closeable; import java.io.IOException; import java.util.Arrays; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Objects; @@ -445,6 +446,20 @@ protected SegmentInfos getLatestSegmentInfos() { return readerManager.getSegmentInfos(); } + @Override + public synchronized GatedCloseable getSegmentInfosSnapshot() { + // get reference to latest infos + final SegmentInfos latestSegmentInfos = getLatestSegmentInfos(); + // incref all files + try { + final Collection files = latestSegmentInfos.files(true); + store.incRefFileDeleter(files); + return new GatedCloseable<>(latestSegmentInfos, () -> store.decRefFileDeleter(files)); + } catch (IOException e) { + throw new EngineException(shardId, e.getMessage(), e); + } + } + protected LocalCheckpointTracker getLocalCheckpointTracker() { return localCheckpointTracker; } diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 2b85193275a13..408eb149509a8 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -679,6 +679,12 @@ public void updateShardState( resetEngineToGlobalCheckpoint(); } replicationTracker.activatePrimaryMode(getLocalCheckpoint()); + + if (indexSettings.isSegRepEnabled()) { + // force publish a checkpoint now that shard is in primary mode. + checkpointPublisher.publish(this, getLatestReplicationCheckpoint()); + } + ensurePeerRecoveryRetentionLeasesExist(); /* * If this shard was serving as a replica shard when another shard was promoted to primary then @@ -1551,15 +1557,7 @@ public GatedCloseable acquireSafeIndexCommit() throws EngineExcepti * @return EMPTY checkpoint before the engine is opened and null for non-segrep enabled indices */ public ReplicationCheckpoint getLatestReplicationCheckpoint() { - final Tuple, ReplicationCheckpoint> infosAndCheckpoint = getLatestSegmentInfosAndCheckpoint(); - if (infosAndCheckpoint == null) { - return null; - } - try (final GatedCloseable ignored = infosAndCheckpoint.v1()) { - return infosAndCheckpoint.v2(); - } catch (IOException e) { - throw new OpenSearchException("Error Closing SegmentInfos Snapshot", e); - } + return replicationTracker.getLatestReplicationCheckpoint(); } /** @@ -1573,13 +1571,11 @@ public ReplicationCheckpoint getLatestReplicationCheckpoint() { * */ public Tuple, ReplicationCheckpoint> getLatestSegmentInfosAndCheckpoint() { - if (indexSettings.isSegRepEnabled() == false) { - return null; - } + assert indexSettings.isSegRepEnabled(); Tuple, ReplicationCheckpoint> nullSegmentInfosEmptyCheckpoint = new Tuple<>( new GatedCloseable<>(null, () -> {}), - ReplicationCheckpoint.empty(shardId, getDefaultCodecName()) + getLatestReplicationCheckpoint() ); if (getEngineOrNull() == null) { @@ -1598,11 +1594,7 @@ public Tuple, ReplicationCheckpoint> getLatestSegme getOperationPrimaryTerm(), segmentInfos.getGeneration(), segmentInfos.getVersion(), - // TODO: Update replicas to compute length from SegmentInfos. Replicas do not yet incref segments with - // getSegmentInfosSnapshot, so computing length from SegmentInfos can cause issues. - shardRouting.primary() - ? store.getSegmentMetadataMap(segmentInfos).values().stream().mapToLong(StoreFileMetadata::length).sum() - : store.stats(StoreStats.UNKNOWN_RESERVED_BYTES).getSizeInBytes(), + store.getSegmentMetadataMap(segmentInfos).values().stream().mapToLong(StoreFileMetadata::length).sum(), getEngine().config().getCodec().getName() ) ); @@ -1858,10 +1850,6 @@ public void resetToWriteableEngine() throws IOException, InterruptedException, T indexShardOperationPermits.blockOperations(30, TimeUnit.MINUTES, () -> { resetEngineToGlobalCheckpoint(); }); } - public void onCheckpointPublished(ReplicationCheckpoint checkpoint) { - replicationTracker.setLatestReplicationCheckpoint(checkpoint); - } - /** * Wrapper for a non-closing reader * @@ -2342,6 +2330,11 @@ private void innerOpenEngineAndTranslog(LongSupplier globalCheckpointSupplier, b final Engine newEngine = engineFactory.newReadWriteEngine(config); onNewEngine(newEngine); currentEngineReference.set(newEngine); + + if (indexSettings.isSegRepEnabled()) { + // set initial replication checkpoints into tracker. + updateReplicationCheckpoint(); + } // We set active because we are now writing operations to the engine; this way, // we can flush if we go idle after some time and become inactive. active.set(true); @@ -3667,6 +3660,9 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) thro internalRefreshListener.clear(); internalRefreshListener.add(new RefreshMetricUpdater(refreshMetric)); + if (indexSettings.isSegRepEnabled()) { + internalRefreshListener.add(new ReplicationCheckpointUpdater()); + } if (this.checkpointPublisher != null && shardRouting.primary() && indexSettings.isSegRepLocalEnabled()) { internalRefreshListener.add(new CheckpointRefreshListener(this, this.checkpointPublisher)); } @@ -4471,6 +4467,30 @@ public void afterRefresh(boolean didRefresh) throws IOException { } } + /** + * Refresh listener to update the Shard's ReplicationCheckpoint post refresh. + */ + private class ReplicationCheckpointUpdater implements ReferenceManager.RefreshListener { + @Override + public void beforeRefresh() throws IOException {} + + @Override + public void afterRefresh(boolean didRefresh) throws IOException { + if (didRefresh) { + updateReplicationCheckpoint(); + } + } + } + + private void updateReplicationCheckpoint() { + final Tuple, ReplicationCheckpoint> tuple = getLatestSegmentInfosAndCheckpoint(); + try (final GatedCloseable ignored = tuple.v1()) { + replicationTracker.setLatestReplicationCheckpoint(tuple.v2()); + } catch (IOException e) { + throw new OpenSearchException("Error Closing SegmentInfos Snapshot", e); + } + } + private EngineConfig.TombstoneDocSupplier tombstoneDocSupplier() { final RootObjectMapper.Builder noopRootMapper = new RootObjectMapper.Builder("__noop"); final DocumentMapper noopDocumentMapper = mapperService != null diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index 8dd0c8b9d4405..f401ab2ddff85 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -185,7 +185,6 @@ private synchronized boolean syncSegments() { return true; } ReplicationCheckpoint checkpoint = indexShard.getLatestReplicationCheckpoint(); - indexShard.onCheckpointPublished(checkpoint); beforeSegmentsSync(); long refreshTimeMs = segmentTracker.getLocalRefreshTimeMs(), refreshClockTimeMs = segmentTracker.getLocalRefreshClockTimeMs(); long refreshSeqNo = segmentTracker.getLocalRefreshSeqNo(); diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java index c22701dfc94ce..3a84163bb979d 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java @@ -51,9 +51,14 @@ public class SegmentReplicationTarget extends ReplicationTarget { public final static String REPLICATION_PREFIX = "replication."; - public SegmentReplicationTarget(IndexShard indexShard, SegmentReplicationSource source, ReplicationListener listener) { + public SegmentReplicationTarget( + IndexShard indexShard, + ReplicationCheckpoint checkpoint, + SegmentReplicationSource source, + ReplicationListener listener + ) { super("replication_target", indexShard, new ReplicationLuceneIndex(), listener); - this.checkpoint = indexShard.getLatestReplicationCheckpoint(); + this.checkpoint = checkpoint; this.source = source; this.state = new SegmentReplicationState( indexShard.routingEntry(), @@ -90,12 +95,19 @@ public SegmentReplicationState state() { } public SegmentReplicationTarget retryCopy() { - return new SegmentReplicationTarget(indexShard, source, listener); + return new SegmentReplicationTarget(indexShard, checkpoint, source, listener); } @Override public String description() { - return String.format(Locale.ROOT, "Id:[%d] Shard:[%s] Source:[%s]", getId(), shardId(), source.getDescription()); + return String.format( + Locale.ROOT, + "Id:[%d] Checkpoint [%s] Shard:[%s] Source:[%s]", + getId(), + getCheckpoint(), + shardId(), + source.getDescription() + ); } @Override diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java index 7c35c4f07598e..298280250afac 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java @@ -234,7 +234,7 @@ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedChe logger.trace( () -> new ParameterizedMessage( "Ignoring new replication checkpoint - shard is currently replicating to checkpoint {}", - replicaShard.getLatestReplicationCheckpoint() + ongoingReplicationTarget.getCheckpoint() ) ); return; @@ -242,7 +242,7 @@ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedChe } final Thread thread = Thread.currentThread(); if (replicaShard.shouldProcessCheckpoint(receivedCheckpoint)) { - startReplication(replicaShard, new SegmentReplicationListener() { + startReplication(replicaShard, receivedCheckpoint, new SegmentReplicationListener() { @Override public void onReplicationDone(SegmentReplicationState state) { logger.trace( @@ -280,6 +280,8 @@ public void onReplicationFailure( ); if (sendShardFailure == true) { failShard(e, replicaShard); + } else { + processLatestReceivedCheckpoint(replicaShard, thread); } } }); @@ -396,8 +398,24 @@ protected void updateLatestReceivedCheckpoint(ReplicationCheckpoint receivedChec } } - public SegmentReplicationTarget startReplication(final IndexShard indexShard, final SegmentReplicationListener listener) { - final SegmentReplicationTarget target = new SegmentReplicationTarget(indexShard, sourceFactory.get(indexShard), listener); + /** + * Start a round of replication and sync to at least the given checkpoint. + * @param indexShard - {@link IndexShard} replica shard + * @param checkpoint - {@link ReplicationCheckpoint} checkpoint to sync to + * @param listener - {@link ReplicationListener} + * @return {@link SegmentReplicationTarget} target event orchestrating the event. + */ + public SegmentReplicationTarget startReplication( + final IndexShard indexShard, + final ReplicationCheckpoint checkpoint, + final SegmentReplicationListener listener + ) { + final SegmentReplicationTarget target = new SegmentReplicationTarget( + indexShard, + checkpoint, + sourceFactory.get(indexShard), + listener + ); startReplication(target); return target; } @@ -529,50 +547,59 @@ private void forceReplication(ForceSyncRequest request, ActionListener new ParameterizedMessage( + "[shardId {}] [replication id {}] Force replication Sync complete to {}, timing data: {}", + shardId, + state.getReplicationId(), + indexShard.getLatestReplicationCheckpoint(), + state.getTimingData() + ) + ); + // Promote engine type for primary target + if (indexShard.recoveryState().getPrimary() == true) { + indexShard.resetToWriteableEngine(); + } else { + // Update the replica's checkpoint on primary's replication tracker. + updateVisibleCheckpoint(state.getReplicationId(), indexShard); + } + listener.onResponse(TransportResponse.Empty.INSTANCE); + } catch (Exception e) { + logger.error("Error while marking replication completed", e); + listener.onFailure(e); + } + } + + @Override + public void onReplicationFailure( + SegmentReplicationState state, + ReplicationFailedException e, + boolean sendShardFailure + ) { + logger.error( () -> new ParameterizedMessage( - "[shardId {}] [replication id {}] Force replication Sync complete to {}, timing data: {}", - shardId, + "[shardId {}] [replication id {}] Force replication Sync failed, timing data: {}", + indexShard.shardId().getId(), state.getReplicationId(), - indexShard.getLatestReplicationCheckpoint(), state.getTimingData() - ) + ), + e ); - // Promote engine type for primary target - if (indexShard.recoveryState().getPrimary() == true) { - indexShard.resetToWriteableEngine(); - } else { - // Update the replica's checkpoint on primary's replication tracker. - updateVisibleCheckpoint(state.getReplicationId(), indexShard); + if (sendShardFailure) { + failShard(e, indexShard); } - listener.onResponse(TransportResponse.Empty.INSTANCE); - } catch (Exception e) { - logger.error("Error while marking replication completed", e); listener.onFailure(e); } } - - @Override - public void onReplicationFailure(SegmentReplicationState state, ReplicationFailedException e, boolean sendShardFailure) { - logger.error( - () -> new ParameterizedMessage( - "[shardId {}] [replication id {}] Replication failed, timing data: {}", - indexShard.shardId().getId(), - state.getReplicationId(), - state.getTimingData() - ), - e - ); - if (sendShardFailure) { - failShard(e, indexShard); - } - listener.onFailure(e); - } - }); + ); } } diff --git a/server/src/main/java/org/opensearch/indices/replication/checkpoint/SegmentReplicationCheckpointPublisher.java b/server/src/main/java/org/opensearch/indices/replication/checkpoint/SegmentReplicationCheckpointPublisher.java index b4bcdc92e539a..f5cb32b741862 100644 --- a/server/src/main/java/org/opensearch/indices/replication/checkpoint/SegmentReplicationCheckpointPublisher.java +++ b/server/src/main/java/org/opensearch/indices/replication/checkpoint/SegmentReplicationCheckpointPublisher.java @@ -34,7 +34,6 @@ public SegmentReplicationCheckpointPublisher(PublishAction publishAction) { public void publish(IndexShard indexShard, ReplicationCheckpoint checkpoint) { publishAction.publish(indexShard, checkpoint); - indexShard.onCheckpointPublished(checkpoint); } /** diff --git a/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java b/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java index 64fe42493c686..116f80d9e89ed 100644 --- a/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java +++ b/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java @@ -12,7 +12,9 @@ import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.search.ReferenceManager; +import org.apache.lucene.store.IOContext; import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.common.UUIDs; import org.opensearch.common.concurrent.GatedCloseable; import org.opensearch.common.lucene.Lucene; import org.opensearch.common.lucene.index.OpenSearchDirectoryReader; @@ -28,6 +30,7 @@ import java.io.IOException; import java.nio.file.Path; +import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Set; @@ -367,4 +370,68 @@ private NRTReplicationEngine buildNrtReplicaEngine(AtomicLong globalCheckpoint, private NRTReplicationEngine buildNrtReplicaEngine(AtomicLong globalCheckpoint, Store store) throws IOException { return buildNrtReplicaEngine(globalCheckpoint, store, defaultSettings); } + + public void testGetSegmentInfosSnapshotPreservesFilesUntilRelease() throws Exception { + final AtomicLong globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED); + + // TODO: Remove this divergent commit logic and copy Segments_N from primary with node-node. + // randomly toggle commit / no commit. + IndexSettings settings = REMOTE_STORE_INDEX_SETTINGS; + final boolean shouldCommit = randomBoolean(); + if (shouldCommit) { + settings = INDEX_SETTINGS; + } + try ( + final Store nrtEngineStore = createStore(REMOTE_STORE_INDEX_SETTINGS, newDirectory()); + final NRTReplicationEngine nrtEngine = buildNrtReplicaEngine(globalCheckpoint, nrtEngineStore, settings) + ) { + List operations = generateHistoryOnReplica( + between(10, 20), + randomBoolean(), + randomBoolean(), + randomBoolean() + ); + for (Engine.Operation op : operations) { + applyOperation(engine, op); + applyOperation(nrtEngine, op); + // refresh to create a lot of segments. + engine.refresh("test"); + } + // wipe the nrt directory initially so we can sync with primary. + Lucene.cleanLuceneIndex(nrtEngineStore.directory()); + for (String file : engine.getLatestSegmentInfos().files(true)) { + nrtEngineStore.directory().copyFrom(store.directory(), file, file, IOContext.DEFAULT); + } + nrtEngine.updateSegments(engine.getLatestSegmentInfos()); + assertEquals(engine.getLatestSegmentInfos(), nrtEngine.getLatestSegmentInfos()); + final GatedCloseable snapshot = nrtEngine.getSegmentInfosSnapshot(); + final Collection replica_snapshotFiles = snapshot.get().files(true); + List replicaFiles = List.of(nrtEngine.store.directory().listAll()); + + // merge primary down to 1 segment + engine.forceMerge(true, 1, false, false, false, UUIDs.randomBase64UUID()); + final Collection files = engine.getLatestSegmentInfos().files(true); + + // copy new segments in and load reader. + for (String file : files) { + if (replicaFiles.contains(file) == false) { + nrtEngineStore.directory().copyFrom(store.directory(), file, file, IOContext.DEFAULT); + } + } + nrtEngine.updateSegments(engine.getLatestSegmentInfos()); + + replicaFiles = List.of(nrtEngine.store.directory().listAll()); + assertTrue(replicaFiles.containsAll(replica_snapshotFiles)); + + // close snapshot, files should be cleaned up + snapshot.close(); + + replicaFiles = List.of(nrtEngine.store.directory().listAll()); + assertFalse(replicaFiles.containsAll(replica_snapshotFiles)); + + // Ensure we still have all the active files. Note - we exclude the infos file here if we aren't committing + // the nrt reader will still reference segments_n-1 after being loaded until a local commit occurs. + assertTrue(replicaFiles.containsAll(nrtEngine.getLatestSegmentInfos().files(shouldCommit))); + } + } } diff --git a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java index 12b7341349442..5e582b2c5268a 100644 --- a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java @@ -481,7 +481,7 @@ public void testRejectCheckpointOnShardRoutingPrimary() throws IOException { spy.onNewCheckpoint(new ReplicationCheckpoint(primaryShard.shardId(), 0L, 0L, 0L, Codec.getDefault().getName()), spyShard); // Verify that checkpoint is not processed as shard routing is primary. - verify(spy, times(0)).startReplication(any(), any()); + verify(spy, times(0)).startReplication(any(), any(), any()); closeShards(primaryShard); } @@ -655,7 +655,7 @@ public void cancel() { } }; when(sourceFactory.get(any())).thenReturn(source); - startReplicationAndAssertCancellation(replica, targetService); + startReplicationAndAssertCancellation(replica, primary, targetService); shards.removeReplica(replica); closeShards(replica); @@ -700,11 +700,15 @@ protected void resolveCheckpointInfoResponseListener(ActionListener { - ((SegmentReplicationTargetService.SegmentReplicationListener) i.getArgument(1)).onReplicationDone(state); + ((SegmentReplicationTargetService.SegmentReplicationListener) i.getArgument(2)).onReplicationDone(state); latch.countDown(); return null; - }).when(spy).startReplication(any(), any()); + }).when(spy).startReplication(any(), any(), any()); doNothing().when(spy).updateVisibleCheckpoint(eq(0L), any()); spy.afterIndexShardStarted(replicaShard); @@ -422,14 +440,14 @@ public void testStartReplicationListenerFailure() throws InterruptedException { SegmentReplicationTargetService spy = spy(sut); CountDownLatch latch = new CountDownLatch(1); doAnswer(i -> { - ((SegmentReplicationTargetService.SegmentReplicationListener) i.getArgument(1)).onReplicationFailure( + ((SegmentReplicationTargetService.SegmentReplicationListener) i.getArgument(2)).onReplicationFailure( state, new ReplicationFailedException(replicaShard, null), false ); latch.countDown(); return null; - }).when(spy).startReplication(any(), any()); + }).when(spy).startReplication(any(), any(), any()); doNothing().when(spy).updateVisibleCheckpoint(eq(0L), any()); spy.afterIndexShardStarted(replicaShard); @@ -570,6 +588,7 @@ public void testForceSegmentSyncHandlerWithFailure_AlreadyClosedException_swallo public void testTargetCancelledBeforeStartInvoked() { final SegmentReplicationTarget target = new SegmentReplicationTarget( replicaShard, + primaryShard.getLatestReplicationCheckpoint(), mock(SegmentReplicationSource.class), new SegmentReplicationTargetService.SegmentReplicationListener() { @Override diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java index 176954b6d6b3d..5b996fd774baf 100644 --- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java @@ -141,7 +141,7 @@ public void getSegmentFiles( SegmentReplicationTargetService.SegmentReplicationListener segRepListener = mock( SegmentReplicationTargetService.SegmentReplicationListener.class ); - segrepTarget = new SegmentReplicationTarget(spyIndexShard, segrepSource, segRepListener); + segrepTarget = new SegmentReplicationTarget(spyIndexShard, repCheckpoint, segrepSource, segRepListener); segrepTarget.startReplication(new ActionListener() { @Override @@ -189,7 +189,7 @@ public void getSegmentFiles( SegmentReplicationTargetService.SegmentReplicationListener segRepListener = mock( SegmentReplicationTargetService.SegmentReplicationListener.class ); - segrepTarget = new SegmentReplicationTarget(spyIndexShard, segrepSource, segRepListener); + segrepTarget = new SegmentReplicationTarget(spyIndexShard, repCheckpoint, segrepSource, segRepListener); segrepTarget.startReplication(new ActionListener() { @Override @@ -232,7 +232,7 @@ public void getSegmentFiles( SegmentReplicationTargetService.SegmentReplicationListener segRepListener = mock( SegmentReplicationTargetService.SegmentReplicationListener.class ); - segrepTarget = new SegmentReplicationTarget(spyIndexShard, segrepSource, segRepListener); + segrepTarget = new SegmentReplicationTarget(spyIndexShard, repCheckpoint, segrepSource, segRepListener); segrepTarget.startReplication(new ActionListener() { @Override @@ -275,7 +275,7 @@ public void getSegmentFiles( SegmentReplicationTargetService.SegmentReplicationListener segRepListener = mock( SegmentReplicationTargetService.SegmentReplicationListener.class ); - segrepTarget = new SegmentReplicationTarget(spyIndexShard, segrepSource, segRepListener); + segrepTarget = new SegmentReplicationTarget(spyIndexShard, repCheckpoint, segrepSource, segRepListener); doThrow(exception).when(spyIndexShard).finalizeReplication(any()); @@ -320,7 +320,7 @@ public void getSegmentFiles( SegmentReplicationTargetService.SegmentReplicationListener segRepListener = mock( SegmentReplicationTargetService.SegmentReplicationListener.class ); - segrepTarget = new SegmentReplicationTarget(spyIndexShard, segrepSource, segRepListener); + segrepTarget = new SegmentReplicationTarget(spyIndexShard, repCheckpoint, segrepSource, segRepListener); doThrow(exception).when(spyIndexShard).finalizeReplication(any()); @@ -364,7 +364,7 @@ public void getSegmentFiles( SegmentReplicationTargetService.SegmentReplicationListener segRepListener = mock( SegmentReplicationTargetService.SegmentReplicationListener.class ); - segrepTarget = new SegmentReplicationTarget(spyIndexShard, segrepSource, segRepListener); + segrepTarget = new SegmentReplicationTarget(spyIndexShard, repCheckpoint, segrepSource, segRepListener); when(spyIndexShard.getSegmentMetadataMap()).thenReturn(SI_SNAPSHOT_DIFFERENT); segrepTarget.startReplication(new ActionListener() { @Override @@ -417,7 +417,7 @@ public void getSegmentFiles( SegmentReplicationTargetService.SegmentReplicationListener.class ); - segrepTarget = new SegmentReplicationTarget(spyIndexShard, segrepSource, segRepListener); + segrepTarget = new SegmentReplicationTarget(spyIndexShard, repCheckpoint, segrepSource, segRepListener); when(spyIndexShard.getSegmentMetadataMap()).thenReturn(storeMetadataSnapshots.get(0).asMap()); segrepTarget.startReplication(new ActionListener() { @Override diff --git a/server/src/test/java/org/opensearch/recovery/ReplicationCollectionTests.java b/server/src/test/java/org/opensearch/recovery/ReplicationCollectionTests.java index 776173f73ce5c..9c38c5848e297 100644 --- a/server/src/test/java/org/opensearch/recovery/ReplicationCollectionTests.java +++ b/server/src/test/java/org/opensearch/recovery/ReplicationCollectionTests.java @@ -120,11 +120,13 @@ public void testStartMultipleReplicationsForSingleShard() throws Exception { shards.recoverReplica(shard); final SegmentReplicationTarget target1 = new SegmentReplicationTarget( shard, + shards.getPrimary().getLatestReplicationCheckpoint(), mock(SegmentReplicationSource.class), mock(ReplicationListener.class) ); final SegmentReplicationTarget target2 = new SegmentReplicationTarget( shard, + shards.getPrimary().getLatestReplicationCheckpoint(), mock(SegmentReplicationSource.class), mock(ReplicationListener.class) ); diff --git a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java index 66e5459cfea3b..25b83fc76e6cf 100644 --- a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java +++ b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java @@ -1483,10 +1483,7 @@ public void getCheckpointMetadata( ActionListener listener ) { try { - final CopyState copyState = new CopyState( - ReplicationCheckpoint.empty(primaryShard.shardId, primaryShard.getLatestReplicationCheckpoint().getCodec()), - primaryShard - ); + final CopyState copyState = new CopyState(primaryShard.getLatestReplicationCheckpoint(), primaryShard); listener.onResponse( new CheckpointInfoResponse(copyState.getCheckpoint(), copyState.getMetadataMap(), copyState.getInfosBytes()) ); @@ -1541,6 +1538,7 @@ protected final List replicateSegments(IndexShard prim final SegmentReplicationTargetService targetService = prepareForReplication(primaryShard, replica); final SegmentReplicationTarget target = targetService.startReplication( replica, + primaryShard.getLatestReplicationCheckpoint(), getTargetListener(primaryShard, replica, primaryMetadata, countDownLatch) ); ids.add(target); From 6aa232bb1ab7e120841e54e868939d49313bd34a Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 26 Jul 2023 01:09:26 -0700 Subject: [PATCH 2/8] Handle NoSuchFileException when attempting to delete decref'd files. To avoid divergent logic with remote store, we always incref/decref the segmentinfos.files(true) which includes the segments_n file. Decref to 0 will attempt to delete the file from the store and its possible this _n file does not yet exist. This change will ignore if we get a noSuchFile while attempting to delete. Signed-off-by: Marc Handalian --- server/src/main/java/org/opensearch/index/store/Store.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/src/main/java/org/opensearch/index/store/Store.java b/server/src/main/java/org/opensearch/index/store/Store.java index 921deae41946a..257f5188aa09b 100644 --- a/server/src/main/java/org/opensearch/index/store/Store.java +++ b/server/src/main/java/org/opensearch/index/store/Store.java @@ -824,6 +824,10 @@ private void cleanupFiles(Collection filesToConsiderForCleanup, String r try { directory.deleteFile(reason, existingFile); } catch (IOException ex) { + if (ex instanceof NoSuchFileException) { + // file doesn't exist, nothing to do. + return; + } if (existingFile.startsWith(IndexFileNames.SEGMENTS) || existingFile.startsWith(CORRUPTED_MARKER_NAME_PREFIX)) { // TODO do we need to also fail this if we can't delete the pending commit file? // if one of those files can't be deleted we better fail the cleanup otherwise we might leave an old commit From e3a536637fba162334f3407992bf62816e84c487 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Mon, 31 Jul 2023 17:58:22 -0700 Subject: [PATCH 3/8] Add more unit tests. Signed-off-by: Marc Handalian --- .../index/shard/IndexShardTests.java | 48 +------------------ .../SegmentReplicationIndexShardTests.java | 34 ++++++++++++- .../SegmentReplicationTargetServiceTests.java | 27 +++++------ .../index/shard/IndexShardTestCase.java | 47 ++++++++++++++++++ 4 files changed, 94 insertions(+), 62 deletions(-) diff --git a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java index 96fa53fbf0fc2..60ba75b105d7e 100644 --- a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java @@ -3752,7 +3752,7 @@ public void testReadSnapshotConcurrently() throws IOException, InterruptedExcept */ public void testCheckpointRefreshListener() throws IOException { final SegmentReplicationCheckpointPublisher mock = mock(SegmentReplicationCheckpointPublisher.class); - IndexShard shard = newStartedShard(p -> newShard(mock), true); + IndexShard shard = newStartedShard(p -> newShard(true, mock), true); List refreshListeners = shard.getEngine().config().getInternalRefreshListener(); assertTrue(refreshListeners.stream().anyMatch(e -> e instanceof CheckpointRefreshListener)); closeShards(shard); @@ -3768,52 +3768,6 @@ public void testCheckpointRefreshListenerWithNull() throws IOException { closeShards(shard); } - /** - * creates a new initializing shard. The shard will be put in its proper path under the - * current node id the shard is assigned to. - * @param checkpointPublisher Segment Replication Checkpoint Publisher to publish checkpoint - */ - private IndexShard newShard(SegmentReplicationCheckpointPublisher checkpointPublisher) throws IOException { - final ShardId shardId = new ShardId("index", "_na_", 0); - final ShardRouting shardRouting = TestShardRouting.newShardRouting( - shardId, - randomAlphaOfLength(10), - true, - ShardRoutingState.INITIALIZING, - RecoverySource.EmptyStoreRecoverySource.INSTANCE - ); - final NodeEnvironment.NodePath nodePath = new NodeEnvironment.NodePath(createTempDir()); - ShardPath shardPath = new ShardPath(false, nodePath.resolve(shardId), nodePath.resolve(shardId), shardId); - - Settings indexSettings = Settings.builder() - .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) - .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) - .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) - .put(IndexMetadata.SETTING_REPLICATION_TYPE, "SEGMENT") - .put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), between(0, 1000)) - .put(Settings.EMPTY) - .build(); - IndexMetadata metadata = IndexMetadata.builder(shardRouting.getIndexName()) - .settings(indexSettings) - .primaryTerm(0, primaryTerm) - .putMapping("{ \"properties\": {} }") - .build(); - return newShard( - shardRouting, - shardPath, - metadata, - null, - null, - new InternalEngineFactory(), - new EngineConfigFactory(new IndexSettings(metadata, metadata.getSettings())), - () -> {}, - RetentionLeaseSyncer.EMPTY, - EMPTY_EVENT_LISTENER, - checkpointPublisher, - null - ); - } - public void testIndexCheckOnStartup() throws Exception { final IndexShard indexShard = newStartedShard(true); diff --git a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java index 5e582b2c5268a..02286a8889ade 100644 --- a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java @@ -19,6 +19,7 @@ import org.opensearch.action.index.IndexRequest; import org.opensearch.action.support.PlainActionFuture; import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.cluster.routing.IndexShardRoutingTable; import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.cluster.routing.ShardRoutingHelper; import org.opensearch.common.collect.Tuple; @@ -422,7 +423,38 @@ public void testShardIdleWithNoReplicas() throws Exception { /** * here we are starting a new primary shard in PrimaryMode and testing if the shard publishes checkpoint after refresh. */ - public void testPublishCheckpointOnPrimaryMode() throws IOException { + public void testPublishCheckpointOnPrimaryMode() throws IOException, InterruptedException { + final SegmentReplicationCheckpointPublisher mock = mock(SegmentReplicationCheckpointPublisher.class); + IndexShard shard = newStartedShard(p -> newShard(false, mock), false); + + final ShardRouting shardRouting = shard.routingEntry(); + promoteReplica( + shard, + Collections.singleton(shardRouting.allocationId().getId()), + new IndexShardRoutingTable.Builder(shardRouting.shardId()).addShard(shardRouting).build() + ); + + final CountDownLatch latch = new CountDownLatch(1); + shard.acquirePrimaryOperationPermit(new ActionListener() { + @Override + public void onResponse(Releasable releasable) { + releasable.close(); + latch.countDown(); + } + + @Override + public void onFailure(Exception e) { + throw new RuntimeException(e); + } + }, ThreadPool.Names.GENERIC, ""); + + latch.await(); + // verify checkpoint is published + verify(mock, times(1)).publish(any(), any()); + closeShards(shard); + } + + public void testPublishCheckpointPostFailover() throws IOException { final SegmentReplicationCheckpointPublisher mock = mock(SegmentReplicationCheckpointPublisher.class); IndexShard shard = newStartedShard(true); CheckpointRefreshListener refreshListener = new CheckpointRefreshListener(shard, mock); diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java index b62de1d026003..757f401c9735b 100644 --- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java @@ -84,6 +84,7 @@ public class SegmentReplicationTargetServiceTests extends IndexShardTestCase { private IndicesService indicesService; private SegmentReplicationState state; + private ReplicationCheckpoint initialCheckpoint; private static final long TRANSPORT_TIMEOUT = 30000;// 30sec @@ -276,24 +277,22 @@ public void getSegmentFiles( } }; final SegmentReplicationTarget target = spy( - new SegmentReplicationTarget(replicaShard, source, mock(SegmentReplicationTargetService.SegmentReplicationListener.class)) + new SegmentReplicationTarget( + replicaShard, + primaryShard.getLatestReplicationCheckpoint(), + source, + mock(SegmentReplicationTargetService.SegmentReplicationListener.class) + ) ); + + final SegmentReplicationTargetService spy = spy(sut); + doReturn(false).when(spy).processLatestReceivedCheckpoint(eq(replicaShard), any()); // Start first round of segment replication. - sut.startReplication(target); + spy.startReplication(target); // Start second round of segment replication, this should fail to start as first round is still in-progress - sut.startReplication(replicaShard, new SegmentReplicationTargetService.SegmentReplicationListener() { - @Override - public void onReplicationDone(SegmentReplicationState state) { - Assert.fail("Should not succeed"); - } - - @Override - public void onReplicationFailure(SegmentReplicationState state, ReplicationFailedException e, boolean sendShardFailure) { - assertEquals("Shard " + replicaShard.shardId() + " is already replicating", e.getMessage()); - assertFalse(sendShardFailure); - } - }); + spy.onNewCheckpoint(newPrimaryCheckpoint, replicaShard); + verify(spy, times(1)).processLatestReceivedCheckpoint(eq(replicaShard), any()); blockGetCheckpointMetadata.countDown(); } diff --git a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java index 25b83fc76e6cf..7a188f7e28de4 100644 --- a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java +++ b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java @@ -91,6 +91,7 @@ import org.opensearch.index.engine.EngineFactory; import org.opensearch.index.engine.EngineTestCase; import org.opensearch.index.engine.InternalEngineFactory; +import org.opensearch.index.engine.NRTReplicationEngineFactory; import org.opensearch.index.mapper.MapperService; import org.opensearch.index.mapper.SourceToParse; import org.opensearch.index.remote.RemoteRefreshSegmentPressureService; @@ -526,6 +527,52 @@ protected IndexShard newShard( ); } + /** + * creates a new initializing shard. The shard will be put in its proper path under the + * current node id the shard is assigned to. + * @param checkpointPublisher Segment Replication Checkpoint Publisher to publish checkpoint + */ + protected IndexShard newShard(boolean primary, SegmentReplicationCheckpointPublisher checkpointPublisher) throws IOException { + final ShardId shardId = new ShardId("index", "_na_", 0); + final ShardRouting shardRouting = TestShardRouting.newShardRouting( + shardId, + randomAlphaOfLength(10), + primary, + ShardRoutingState.INITIALIZING, + primary ? RecoverySource.EmptyStoreRecoverySource.INSTANCE : RecoverySource.PeerRecoverySource.INSTANCE + ); + final NodeEnvironment.NodePath nodePath = new NodeEnvironment.NodePath(createTempDir()); + ShardPath shardPath = new ShardPath(false, nodePath.resolve(shardId), nodePath.resolve(shardId), shardId); + + Settings indexSettings = Settings.builder() + .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_REPLICATION_TYPE, "SEGMENT") + .put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), between(0, 1000)) + .put(Settings.EMPTY) + .build(); + IndexMetadata metadata = IndexMetadata.builder(shardRouting.getIndexName()) + .settings(indexSettings) + .primaryTerm(0, primaryTerm) + .putMapping("{ \"properties\": {} }") + .build(); + return newShard( + shardRouting, + shardPath, + metadata, + null, + null, + new NRTReplicationEngineFactory(), + new EngineConfigFactory(new IndexSettings(metadata, metadata.getSettings())), + () -> {}, + RetentionLeaseSyncer.EMPTY, + EMPTY_EVENT_LISTENER, + checkpointPublisher, + null + ); + } + /** * creates a new initializing shard. * @param routing shard routing to use From 95ab3b0f08d9e39b8fafbed50277cb8094e49491 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Mon, 31 Jul 2023 20:38:42 -0700 Subject: [PATCH 4/8] Clean up IndexShardTests.testCheckpointReffreshListenerWithNull Signed-off-by: Marc Handalian --- .../test/java/org/opensearch/index/shard/IndexShardTests.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java index 60ba75b105d7e..b6f8272aa3144 100644 --- a/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/IndexShardTests.java @@ -3762,7 +3762,8 @@ public void testCheckpointRefreshListener() throws IOException { * here we are passing null in place of SegmentReplicationCheckpointPublisher and testing on index shard if CheckpointRefreshListener is not added to the InternalrefreshListerners List */ public void testCheckpointRefreshListenerWithNull() throws IOException { - IndexShard shard = newStartedShard(p -> newShard(null), true); + final SegmentReplicationCheckpointPublisher publisher = null; + IndexShard shard = newStartedShard(p -> newShard(true, publisher), true); List refreshListeners = shard.getEngine().config().getInternalRefreshListener(); assertFalse(refreshListeners.stream().anyMatch(e -> e instanceof CheckpointRefreshListener)); closeShards(shard); From 54bbbd343a1ebaa410a11cf7509eca286a6d553b Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Mon, 31 Jul 2023 22:44:38 -0700 Subject: [PATCH 5/8] Remove unnecessary catch for NoSuchFileException. Signed-off-by: Marc Handalian --- .../opensearch/index/engine/NRTReplicationEngine.java | 2 +- .../java/org/opensearch/index/shard/IndexShard.java | 6 ++++-- .../main/java/org/opensearch/index/store/Store.java | 4 ---- .../index/engine/NRTReplicationEngineTests.java | 11 +++-------- 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java b/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java index e9987591aac30..6b09b8d86dc6c 100644 --- a/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java @@ -452,7 +452,7 @@ public synchronized GatedCloseable getSegmentInfosSnapshot() { final SegmentInfos latestSegmentInfos = getLatestSegmentInfos(); // incref all files try { - final Collection files = latestSegmentInfos.files(true); + final Collection files = latestSegmentInfos.files(false); store.incRefFileDeleter(files); return new GatedCloseable<>(latestSegmentInfos, () -> store.decRefFileDeleter(files)); } catch (IOException e) { diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 408eb149509a8..1a61209957ba2 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -678,10 +678,12 @@ public void updateShardState( assert newRouting.primary() && currentRouting.primary() == false; resetEngineToGlobalCheckpoint(); } + replicationTracker.activatePrimaryMode(getLocalCheckpoint()); - if (indexSettings.isSegRepEnabled()) { - // force publish a checkpoint now that shard is in primary mode. + if (checkpointPublisher != null) { + // force publish a checkpoint once in primary mode so that replicas not caught up to previous primary + // are brought up to date. checkpointPublisher.publish(this, getLatestReplicationCheckpoint()); } diff --git a/server/src/main/java/org/opensearch/index/store/Store.java b/server/src/main/java/org/opensearch/index/store/Store.java index 257f5188aa09b..921deae41946a 100644 --- a/server/src/main/java/org/opensearch/index/store/Store.java +++ b/server/src/main/java/org/opensearch/index/store/Store.java @@ -824,10 +824,6 @@ private void cleanupFiles(Collection filesToConsiderForCleanup, String r try { directory.deleteFile(reason, existingFile); } catch (IOException ex) { - if (ex instanceof NoSuchFileException) { - // file doesn't exist, nothing to do. - return; - } if (existingFile.startsWith(IndexFileNames.SEGMENTS) || existingFile.startsWith(CORRUPTED_MARKER_NAME_PREFIX)) { // TODO do we need to also fail this if we can't delete the pending commit file? // if one of those files can't be deleted we better fail the cleanup otherwise we might leave an old commit diff --git a/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java b/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java index 116f80d9e89ed..37edb96230a2b 100644 --- a/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java +++ b/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java @@ -385,12 +385,7 @@ public void testGetSegmentInfosSnapshotPreservesFilesUntilRelease() throws Excep final Store nrtEngineStore = createStore(REMOTE_STORE_INDEX_SETTINGS, newDirectory()); final NRTReplicationEngine nrtEngine = buildNrtReplicaEngine(globalCheckpoint, nrtEngineStore, settings) ) { - List operations = generateHistoryOnReplica( - between(10, 20), - randomBoolean(), - randomBoolean(), - randomBoolean() - ); + List operations = generateHistoryOnReplica(between(5, 10), randomBoolean(), randomBoolean(), randomBoolean()); for (Engine.Operation op : operations) { applyOperation(engine, op); applyOperation(nrtEngine, op); @@ -405,12 +400,12 @@ public void testGetSegmentInfosSnapshotPreservesFilesUntilRelease() throws Excep nrtEngine.updateSegments(engine.getLatestSegmentInfos()); assertEquals(engine.getLatestSegmentInfos(), nrtEngine.getLatestSegmentInfos()); final GatedCloseable snapshot = nrtEngine.getSegmentInfosSnapshot(); - final Collection replica_snapshotFiles = snapshot.get().files(true); + final Collection replica_snapshotFiles = snapshot.get().files(false); List replicaFiles = List.of(nrtEngine.store.directory().listAll()); // merge primary down to 1 segment engine.forceMerge(true, 1, false, false, false, UUIDs.randomBase64UUID()); - final Collection files = engine.getLatestSegmentInfos().files(true); + final Collection files = engine.getLatestSegmentInfos().files(false); // copy new segments in and load reader. for (String file : files) { From ad3d7bbcaffde623837b7d414957e4a49825909b Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Tue, 1 Aug 2023 16:29:40 -0700 Subject: [PATCH 6/8] Add another test for non segrep. Signed-off-by: Marc Handalian --- .../opensearch/index/shard/IndexShard.java | 2 +- .../SegmentReplicationIndexShardTests.java | 35 ++++++++++++++++++- .../index/shard/IndexShardTestCase.java | 11 ++++-- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 1a61209957ba2..f72cc53fb9d5d 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -681,7 +681,7 @@ public void updateShardState( replicationTracker.activatePrimaryMode(getLocalCheckpoint()); - if (checkpointPublisher != null) { + if (indexSettings.isSegRepEnabled()) { // force publish a checkpoint once in primary mode so that replicas not caught up to previous primary // are brought up to date. checkpointPublisher.publish(this, getLatestReplicationCheckpoint()); diff --git a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java index 02286a8889ade..57602d96745f6 100644 --- a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java @@ -64,6 +64,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.concurrent.CountDownLatch; @@ -425,7 +426,7 @@ public void testShardIdleWithNoReplicas() throws Exception { */ public void testPublishCheckpointOnPrimaryMode() throws IOException, InterruptedException { final SegmentReplicationCheckpointPublisher mock = mock(SegmentReplicationCheckpointPublisher.class); - IndexShard shard = newStartedShard(p -> newShard(false, mock), false); + IndexShard shard = newStartedShard(p -> newShard(false, mock, settings), false); final ShardRouting shardRouting = shard.routingEntry(); promoteReplica( @@ -454,6 +455,38 @@ public void onFailure(Exception e) { closeShards(shard); } + public void testPublishCheckpointOnPrimaryMode_segrep_off() throws IOException, InterruptedException { + final SegmentReplicationCheckpointPublisher mock = mock(SegmentReplicationCheckpointPublisher.class); + final Settings settings = Settings.builder().put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.DOCUMENT).build(); + IndexShard shard = newStartedShard(p -> newShard(false, mock, settings), false); + + final ShardRouting shardRouting = shard.routingEntry(); + promoteReplica( + shard, + Collections.singleton(shardRouting.allocationId().getId()), + new IndexShardRoutingTable.Builder(shardRouting.shardId()).addShard(shardRouting).build() + ); + + final CountDownLatch latch = new CountDownLatch(1); + shard.acquirePrimaryOperationPermit(new ActionListener() { + @Override + public void onResponse(Releasable releasable) { + releasable.close(); + latch.countDown(); + } + + @Override + public void onFailure(Exception e) { + throw new RuntimeException(e); + } + }, ThreadPool.Names.GENERIC, ""); + + latch.await(); + // verify checkpoint is published + verify(mock, times(0)).publish(any(), any()); + closeShards(shard); + } + public void testPublishCheckpointPostFailover() throws IOException { final SegmentReplicationCheckpointPublisher mock = mock(SegmentReplicationCheckpointPublisher.class); IndexShard shard = newStartedShard(true); diff --git a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java index 7a188f7e28de4..9dc114eb923d3 100644 --- a/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java +++ b/test/framework/src/main/java/org/opensearch/index/shard/IndexShardTestCase.java @@ -139,6 +139,7 @@ import org.opensearch.indices.replication.common.ReplicationFailedException; import org.opensearch.indices.replication.common.ReplicationListener; import org.opensearch.indices.replication.common.ReplicationState; +import org.opensearch.indices.replication.common.ReplicationType; import org.opensearch.repositories.IndexId; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.Repository; @@ -527,12 +528,18 @@ protected IndexShard newShard( ); } + protected IndexShard newShard(boolean primary, SegmentReplicationCheckpointPublisher checkpointPublisher) throws IOException { + final Settings settings = Settings.builder().put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT).build(); + return newShard(primary, checkpointPublisher, settings); + } + /** * creates a new initializing shard. The shard will be put in its proper path under the * current node id the shard is assigned to. * @param checkpointPublisher Segment Replication Checkpoint Publisher to publish checkpoint */ - protected IndexShard newShard(boolean primary, SegmentReplicationCheckpointPublisher checkpointPublisher) throws IOException { + protected IndexShard newShard(boolean primary, SegmentReplicationCheckpointPublisher checkpointPublisher, Settings settings) + throws IOException { final ShardId shardId = new ShardId("index", "_na_", 0); final ShardRouting shardRouting = TestShardRouting.newShardRouting( shardId, @@ -545,10 +552,10 @@ protected IndexShard newShard(boolean primary, SegmentReplicationCheckpointPubli ShardPath shardPath = new ShardPath(false, nodePath.resolve(shardId), nodePath.resolve(shardId), shardId); Settings indexSettings = Settings.builder() + .put(settings) .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) - .put(IndexMetadata.SETTING_REPLICATION_TYPE, "SEGMENT") .put(IndexSettings.INDEX_SOFT_DELETES_RETENTION_OPERATIONS_SETTING.getKey(), between(0, 1000)) .put(Settings.EMPTY) .build(); From 804c2039e0d36e78073bc7d45091879e2336efa5 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 2 Aug 2023 11:56:02 -0700 Subject: [PATCH 7/8] PR Feedback. Signed-off-by: Marc Handalian --- .../opensearch/index/shard/IndexShard.java | 1 - .../engine/NRTReplicationEngineTests.java | 22 ++++++++++++++----- ...licationWithNodeToNodeIndexShardTests.java | 16 +++++++++----- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index f72cc53fb9d5d..0086c6cc355c0 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -680,7 +680,6 @@ public void updateShardState( } replicationTracker.activatePrimaryMode(getLocalCheckpoint()); - if (indexSettings.isSegRepEnabled()) { // force publish a checkpoint once in primary mode so that replicas not caught up to previous primary // are brought up to date. diff --git a/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java b/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java index 37edb96230a2b..4c87df48f583f 100644 --- a/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java +++ b/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java @@ -30,6 +30,7 @@ import java.io.IOException; import java.nio.file.Path; +import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Map; @@ -385,30 +386,39 @@ public void testGetSegmentInfosSnapshotPreservesFilesUntilRelease() throws Excep final Store nrtEngineStore = createStore(REMOTE_STORE_INDEX_SETTINGS, newDirectory()); final NRTReplicationEngine nrtEngine = buildNrtReplicaEngine(globalCheckpoint, nrtEngineStore, settings) ) { - List operations = generateHistoryOnReplica(between(5, 10), randomBoolean(), randomBoolean(), randomBoolean()); + // only index 2 docs here, this will create segments _0 and _1 and after forcemerge into _2. + final int docCount = 2; + List operations = generateHistoryOnReplica(docCount, randomBoolean(), randomBoolean(), randomBoolean()); for (Engine.Operation op : operations) { applyOperation(engine, op); applyOperation(nrtEngine, op); // refresh to create a lot of segments. engine.refresh("test"); } + assertEquals(2, engine.segmentsStats(false, false).getCount()); // wipe the nrt directory initially so we can sync with primary. Lucene.cleanLuceneIndex(nrtEngineStore.directory()); + assertFalse( + Arrays.stream(nrtEngineStore.directory().listAll()) + .anyMatch(file -> file.equals("write.lock") == false && file.equals("extra0") == false) + ); for (String file : engine.getLatestSegmentInfos().files(true)) { nrtEngineStore.directory().copyFrom(store.directory(), file, file, IOContext.DEFAULT); } nrtEngine.updateSegments(engine.getLatestSegmentInfos()); assertEquals(engine.getLatestSegmentInfos(), nrtEngine.getLatestSegmentInfos()); final GatedCloseable snapshot = nrtEngine.getSegmentInfosSnapshot(); - final Collection replica_snapshotFiles = snapshot.get().files(false); + final Collection replicaSnapshotFiles = snapshot.get().files(false); List replicaFiles = List.of(nrtEngine.store.directory().listAll()); // merge primary down to 1 segment engine.forceMerge(true, 1, false, false, false, UUIDs.randomBase64UUID()); - final Collection files = engine.getLatestSegmentInfos().files(false); + // we expect a 3rd segment to be created after merge. + assertEquals(3, engine.segmentsStats(false, false).getCount()); + final Collection latestPrimaryFiles = engine.getLatestSegmentInfos().files(false); // copy new segments in and load reader. - for (String file : files) { + for (String file : latestPrimaryFiles) { if (replicaFiles.contains(file) == false) { nrtEngineStore.directory().copyFrom(store.directory(), file, file, IOContext.DEFAULT); } @@ -416,13 +426,13 @@ public void testGetSegmentInfosSnapshotPreservesFilesUntilRelease() throws Excep nrtEngine.updateSegments(engine.getLatestSegmentInfos()); replicaFiles = List.of(nrtEngine.store.directory().listAll()); - assertTrue(replicaFiles.containsAll(replica_snapshotFiles)); + assertTrue(replicaFiles.containsAll(replicaSnapshotFiles)); // close snapshot, files should be cleaned up snapshot.close(); replicaFiles = List.of(nrtEngine.store.directory().listAll()); - assertFalse(replicaFiles.containsAll(replica_snapshotFiles)); + assertFalse(replicaFiles.containsAll(replicaSnapshotFiles)); // Ensure we still have all the active files. Note - we exclude the infos file here if we aren't committing // the nrt reader will still reference segments_n-1 after being loaded until a local commit occurs. diff --git a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationWithNodeToNodeIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationWithNodeToNodeIndexShardTests.java index 69846fbbe1dd4..a6634c0741cd4 100644 --- a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationWithNodeToNodeIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationWithNodeToNodeIndexShardTests.java @@ -95,7 +95,7 @@ public void getSegmentFiles( } }; when(sourceFactory.get(any())).thenReturn(source); - startReplicationAndAssertCancellation(replica, targetService); + startReplicationAndAssertCancellation(replica, primary, targetService); shards.removeReplica(replica); closeShards(replica); @@ -137,7 +137,7 @@ public void getSegmentFiles( } }; when(sourceFactory.get(any())).thenReturn(source); - startReplicationAndAssertCancellation(replica, targetService); + startReplicationAndAssertCancellation(replica, primary, targetService); shards.removeReplica(replica); closeShards(replica); @@ -189,7 +189,7 @@ public void cancel() { } }; when(sourceFactory.get(any())).thenReturn(source); - startReplicationAndAssertCancellation(replica, targetService); + startReplicationAndAssertCancellation(replica, primary, targetService); shards.removeReplica(replica); closeShards(replica); @@ -227,7 +227,7 @@ public void getSegmentFiles( ) {} }; when(sourceFactory.get(any())).thenReturn(source); - startReplicationAndAssertCancellation(replica, targetService); + startReplicationAndAssertCancellation(replica, primary, targetService); shards.removeReplica(replica); closeShards(replica); @@ -275,7 +275,7 @@ public void getSegmentFiles( } }; when(sourceFactory.get(any())).thenReturn(source); - startReplicationAndAssertCancellation(nextPrimary, targetService); + startReplicationAndAssertCancellation(nextPrimary, oldPrimary, targetService); // wait for replica to finish being promoted, and assert doc counts. final CountDownLatch latch = new CountDownLatch(1); nextPrimary.acquirePrimaryOperationPermit(new ActionListener<>() { @@ -422,7 +422,11 @@ public void testTemporaryFilesNotCleanup() throws Exception { runnablePostGetFiles ); when(sourceFactory.get(any())).thenReturn(segmentReplicationSource); - targetService.startReplication(replica, getTargetListener(primaryShard, replica, primaryMetadata, countDownLatch)); + targetService.startReplication( + replica, + primaryShard.getLatestReplicationCheckpoint(), + getTargetListener(primaryShard, replica, primaryMetadata, countDownLatch) + ); countDownLatch.await(30, TimeUnit.SECONDS); assertEquals("Replication failed", 0, countDownLatch.getCount()); shards.assertAllEqual(numDocs); From d0e15b8c4622d7ca9bb03432957c58a4998b5186 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 2 Aug 2023 17:59:03 -0700 Subject: [PATCH 8/8] re-compute replication checkpoint on primary promotion. Signed-off-by: Marc Handalian --- .../src/main/java/org/opensearch/index/shard/IndexShard.java | 4 ++++ .../opensearch/index/shard/RemoteStoreRefreshListener.java | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 0086c6cc355c0..ace6ed56c007c 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -677,6 +677,10 @@ public void updateShardState( // this Shard's engine was read only, we need to update its engine before restoring local history from xlog. assert newRouting.primary() && currentRouting.primary() == false; resetEngineToGlobalCheckpoint(); + // It is possible an engine can open with a SegmentInfos on a higher gen but the reader does not refresh to + // trigger our refresh listener. + // Force update the checkpoint post engine reset. + updateReplicationCheckpoint(); } replicationTracker.activatePrimaryMode(getLocalCheckpoint()); diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index f401ab2ddff85..d56054dd1c42b 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -206,6 +206,10 @@ private synchronized boolean syncSegments() { try (GatedCloseable segmentInfosGatedCloseable = indexShard.getSegmentInfosSnapshot()) { SegmentInfos segmentInfos = segmentInfosGatedCloseable.get(); + assert segmentInfos.getGeneration() == checkpoint.getSegmentsGen() : "SegmentInfos generation: " + + segmentInfos.getGeneration() + + " does not match metadata generation: " + + checkpoint.getSegmentsGen(); // Capture replication checkpoint before uploading the segments as upload can take some time and checkpoint can // move. long lastRefreshedCheckpoint = ((InternalEngine) indexShard.getEngine()).lastRefreshedCheckpoint();