From 5ae93338e786173e39f5906ea1b2a5129f98ab66 Mon Sep 17 00:00:00 2001 From: Suraj Singh Date: Thu, 26 Oct 2023 07:42:23 -0700 Subject: [PATCH 1/3] Mute testSegRepSucceedsOnPreviousCopiedFiles and testNoFailuresOnFileReads unit tests (#10942) Signed-off-by: Suraj Singh --- .../java/org/opensearch/index/shard/RemoteIndexShardTests.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java index 2ce0bdc607189..20cec90d79e3e 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteIndexShardTests.java @@ -362,6 +362,7 @@ public void testPrimaryRestart() throws Exception { * prevent FileAlreadyExistsException. It does so by only copying files in first round of segment replication without * committing locally so that in next round of segment replication those files are not considered for download again */ + @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/10885") public void testSegRepSucceedsOnPreviousCopiedFiles() throws Exception { try (ReplicationGroup shards = createGroup(1, getIndexSettings(), new NRTReplicationEngineFactory())) { shards.startAll(); @@ -453,6 +454,7 @@ public void onReplicationFailure( * blocking update of reader. Once this is done, it corrupts one segment file and ensure that file is deleted in next * round of segment replication by ensuring doc count. */ + @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/10885") public void testNoFailuresOnFileReads() throws Exception { try (ReplicationGroup shards = createGroup(1, getIndexSettings(), new NRTReplicationEngineFactory())) { shards.startAll(); From 0c9fc21ae78babd6820479ea940a4c986d82f10f Mon Sep 17 00:00:00 2001 From: Poojita Raj Date: Thu, 26 Oct 2023 16:11:19 -0700 Subject: [PATCH 2/3] Add log for failover time (#10952) Signed-off-by: Poojita Raj --- .../java/org/opensearch/index/shard/IndexShard.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 5b6257084e440..352d4efc95269 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -187,6 +187,7 @@ import org.opensearch.indices.recovery.RecoveryTarget; import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; import org.opensearch.indices.replication.checkpoint.SegmentReplicationCheckpointPublisher; +import org.opensearch.indices.replication.common.ReplicationTimer; import org.opensearch.repositories.RepositoriesService; import org.opensearch.repositories.Repository; import org.opensearch.search.suggest.completion.CompletionStats; @@ -698,7 +699,16 @@ public void updateShardState( if (indexSettings.isSegRepEnabled()) { // this Shard's engine was read only, we need to update its engine before restoring local history from xlog. assert newRouting.primary() && currentRouting.primary() == false; + ReplicationTimer timer = new ReplicationTimer(); + timer.start(); + logger.debug( + "Resetting engine on promotion of shard [{}] to primary, startTime {}\n", + shardId, + timer.startTime() + ); resetEngineToGlobalCheckpoint(); + timer.stop(); + logger.info("Completed engine failover for shard [{}] in: {} ms", shardId, timer.time()); // It is possible an engine can open with a SegmentInfos on a higher gen but the reader does not refresh to // trigger our refresh listener. // Force update the checkpoint post engine reset. From e9affeab3494f4c2ed96a3efd647530b38a315fc Mon Sep 17 00:00:00 2001 From: rishavz_sagar Date: Fri, 27 Oct 2023 14:34:13 +0530 Subject: [PATCH 3/3] Fixing unreferenced file cleanup flaky tests (#10801) Signed-off-by: RS146BIJAY --- .../index/engine/InternalEngineTests.java | 44 ++----------------- 1 file changed, 4 insertions(+), 40 deletions(-) diff --git a/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java b/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java index 305c3a3acbf75..81d8bccb86c60 100644 --- a/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java +++ b/server/src/test/java/org/opensearch/index/engine/InternalEngineTests.java @@ -40,7 +40,6 @@ import org.apache.logging.log4j.core.LogEvent; import org.apache.logging.log4j.core.appender.AbstractAppender; import org.apache.logging.log4j.core.filter.RegexFilter; -import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.document.Field; import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.LongPoint; @@ -3237,22 +3236,10 @@ public void testUnreferencedFileCleanUpOnSegmentMergeFailureWithCleanUpEnabled() MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; - @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - - // Fail segment merge with diskfull during merging terms. - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + // Fail segment merge with diskfull during merging terms + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3325,7 +3312,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // IndexWriter can throw either IOException or IllegalStateException depending on whether tragedy is set or not. expectThrowsAnyOf( Arrays.asList(IOException.class, IllegalStateException.class), @@ -3345,20 +3331,10 @@ public void testUnreferencedFileCleanUpOnSegmentMergeFailureWithCleanUpDisabled( MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3439,7 +3415,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // IndexWriter can throw either IOException or IllegalStateException depending on whether tragedy is set or not. expectThrowsAnyOf( Arrays.asList(IOException.class, IllegalStateException.class), @@ -3459,20 +3434,10 @@ public void testUnreferencedFileCleanUpFailsOnSegmentMergeFailureWhenDirectoryCl MockDirectoryWrapper wrapper = newMockDirectory(); final CountDownLatch cleanupCompleted = new CountDownLatch(1); MockDirectoryWrapper.Failure fail = new MockDirectoryWrapper.Failure() { - public boolean didFail1; - public boolean didFail2; @Override public void eval(MockDirectoryWrapper dir) throws IOException { - if (!doFail) { - return; - } - if (callStackContainsAnyOf("mergeTerms") && !didFail1) { - didFail1 = true; - throw new IOException("No space left on device"); - } - if (callStackContains(LiveDocsFormat.class, "writeLiveDocs") && !didFail2) { - didFail2 = true; + if (callStackContainsAnyOf("mergeTerms")) { throw new IOException("No space left on device"); } } @@ -3537,7 +3502,6 @@ public void onFailedEngine(String reason, Exception e) { segments = engine.segments(false); assertThat(segments.size(), equalTo(2)); - fail.setDoFail(); // Close the store so that unreferenced file cleanup will fail. store.close();