diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 5b6257084e440..352d4efc95269 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -187,6 +187,7 @@ import org.opensearch.indices.recovery.RecoveryTarget; import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; import org.opensearch.indices.replication.checkpoint.SegmentReplicationCheckpointPublisher; +import org.opensearch.indices.replication.common.ReplicationTimer; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.Repository; import org.opensearch.search.suggest.completion.CompletionStats; @@ -698,7 +699,16 @@ public void updateShardState( if (indexSettings.isSegRepEnabled()) { // this Shard's engine was read only, we need to update its engine before restoring local history from xlog. assert newRouting.primary() && currentRouting.primary() == false; + ReplicationTimer timer = new ReplicationTimer(); + timer.start(); + logger.debug( + "Resetting engine on promotion of shard [{}] to primary, startTime {}\n", + shardId, + timer.startTime() + ); resetEngineToGlobalCheckpoint(); + timer.stop(); + logger.info("Completed engine failover for shard [{}] in: {} ms", shardId, timer.time()); // It is possible an engine can open with a SegmentInfos on a higher gen but the reader does not refresh to // trigger our refresh listener. // Force update the checkpoint post engine reset. diff --git a/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java b/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java index 305c3a3acbf75..81d8bccb86c60 100644 --- a/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java +++ b/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java @@ -40,7 +40,6 @@ import org.apache.logging.log4j.core.LogEvent; import org.apache.logging.log4j.core.appender.AbstractAppender; import org.apache.logging.log4j.core.filter.RegexFilter; -import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.document.Field; import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.LongPoint; @@ -3237,22 +3236,10 @@ public void testUnreferencedFileCleanUpOnSegmentMergeFailureWithCleanUpEnabled() MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; - @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - - // Fail segment merge with diskfull during merging terms. - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + // Fail segment merge with diskfull during merging terms + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3325,7 +3312,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // IndexWriter can throw either IOException or IllegalStateException depending on whether tragedy is set or not. expectThrowsAnyOf( Arrays.asList(IOException.class, IllegalStateException.class), @@ -3345,20 +3331,10 @@ public void testUnreferencedFileCleanUpOnSegmentMergeFailureWithCleanUpDisabled( MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3439,7 +3415,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // IndexWriter can throw either IOException or IllegalStateException depending on whether tragedy is set or not. expectThrowsAnyOf( Arrays.asList(IOException.class, IllegalStateException.class), @@ -3459,20 +3434,10 @@ public void testUnreferencedFileCleanUpFailsOnSegmentMergeFailureWhenDirectoryCl MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3537,7 +3502,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // Close the store so that unreferenced file cleanup will fail. store.close(); diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java index 2ce0bdc607189..20cec90d79e3e 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java @@ -362,6 +362,7 @@ public void testPrimaryRestart() throws Exception { * prevent FileAlreadyExistsException. It does so by only copying files in first round of segment replication without * committing locally so that in next round of segment replication those files are not considered for download again */ + @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/10885") public void testSegRepSucceedsOnPreviousCopiedFiles() throws Exception { try (ReplicationGroup shards = createGroup(1, getIndexSettings(), new NRTReplicationEngineFactory())) { shards.startAll(); @@ -453,6 +454,7 @@ public void onReplicationFailure( * blocking update of reader. Once this is done, it corrupts one segment file and ensure that file is deleted in next * round of segment replication by ensuring doc count. */ + @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/10885") public void testNoFailuresOnFileReads() throws Exception { try (ReplicationGroup shards = createGroup(1, getIndexSettings(), new NRTReplicationEngineFactory())) { shards.startAll();