From bc3dffb2098d7a11609e51aaaec056f94a9afa90 Mon Sep 17 00:00:00 2001 From: Ahmad AbuKhalil <105249973+aabukhalil@users.noreply.github.com> Date: Fri, 17 Jun 2022 11:32:10 -0700 Subject: [PATCH] Fix false positive query timeouts due to using cached time (#3454) * Fix false positive query timeouts due to using cached time Signed-off-by: Ahmad AbuKhalil * delegate nanoTime call to SearchContext Signed-off-by: Ahmad AbuKhalil * add override to SearchContext getRelativeTimeInMillis to force non cached time Signed-off-by: Ahmad AbuKhalil --- .../NanoTimeVsCurrentTimeMillisBenchmark.java | 42 +++++++++++++ .../search/internal/SearchContext.java | 16 ++++- .../opensearch/search/query/QueryPhase.java | 32 +++++++--- .../search/DefaultSearchContextTests.java | 4 ++ .../search/query/QueryPhaseTests.java | 59 +++++++++++++++++++ 5 files changed, 143 insertions(+), 10 deletions(-) create mode 100644 benchmarks/src/main/java/org/opensearch/benchmark/time/NanoTimeVsCurrentTimeMillisBenchmark.java diff --git a/benchmarks/src/main/java/org/opensearch/benchmark/time/NanoTimeVsCurrentTimeMillisBenchmark.java b/benchmarks/src/main/java/org/opensearch/benchmark/time/NanoTimeVsCurrentTimeMillisBenchmark.java new file mode 100644 index 0000000000000..4e4662a8f0427 --- /dev/null +++ b/benchmarks/src/main/java/org/opensearch/benchmark/time/NanoTimeVsCurrentTimeMillisBenchmark.java @@ -0,0 +1,42 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.benchmark.time; + +import org.openjdk.jmh.annotations.*; + +import java.util.concurrent.TimeUnit; + +@Fork(3) +@Warmup(iterations = 10) +@Measurement(iterations = 20) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Benchmark) +@SuppressWarnings("unused") // invoked by benchmarking framework +public class NanoTimeVsCurrentTimeMillisBenchmark { + private volatile long var = 0; + + @Benchmark + public long currentTimeMillis() { + return System.currentTimeMillis(); + } + + @Benchmark + public long nanoTime() { + return System.nanoTime(); + } + + /* + * this acts as upper bound of how time is cached in org.opensearch.threadpool.ThreadPool + * */ + @Benchmark + public long accessLongVar() { + return var++; + } +} diff --git a/server/src/main/java/org/opensearch/search/internal/SearchContext.java b/server/src/main/java/org/opensearch/search/internal/SearchContext.java index 60438014f8d53..76d0d7b72c6b4 100644 --- a/server/src/main/java/org/opensearch/search/internal/SearchContext.java +++ b/server/src/main/java/org/opensearch/search/internal/SearchContext.java @@ -388,10 +388,24 @@ public final boolean hasOnlySuggest() { /** * Returns time in milliseconds that can be used for relative time calculations. - * WARN: This is not the epoch time. + * WARN: This is not the epoch time and can be a cached time. */ public abstract long getRelativeTimeInMillis(); + /** + * Returns time in milliseconds that can be used for relative time calculations. this method will fall back to + * {@link SearchContext#getRelativeTimeInMillis()} (which might be a cached time) if useCache was set to true else it will be just be a + * wrapper of {@link System#nanoTime()} converted to milliseconds. + * @param useCache to allow using cached time if true or forcing calling {@link System#nanoTime()} if false + * @return Returns time in milliseconds that can be used for relative time calculations. + */ + public long getRelativeTimeInMillis(boolean useCache) { + if (useCache) { + return getRelativeTimeInMillis(); + } + return TimeValue.nsecToMSec(System.nanoTime()); + } + /** Return a view of the additional query collector managers that should be run for this context. */ public abstract Map, CollectorManager> queryCollectorManagers(); diff --git a/server/src/main/java/org/opensearch/search/query/QueryPhase.java b/server/src/main/java/org/opensearch/search/query/QueryPhase.java index 8de44a7448023..94e1524cccb7d 100644 --- a/server/src/main/java/org/opensearch/search/query/QueryPhase.java +++ b/server/src/main/java/org/opensearch/search/query/QueryPhase.java @@ -257,15 +257,7 @@ static boolean executeInternal(SearchContext searchContext, QueryPhaseSearcher q final Runnable timeoutRunnable; if (timeoutSet) { - final long startTime = searchContext.getRelativeTimeInMillis(); - final long timeout = searchContext.timeout().millis(); - final long maxTime = startTime + timeout; - timeoutRunnable = searcher.addQueryCancellation(() -> { - final long time = searchContext.getRelativeTimeInMillis(); - if (time > maxTime) { - throw new TimeExceededException(); - } - }); + timeoutRunnable = searcher.addQueryCancellation(createQueryTimeoutChecker(searchContext)); } else { timeoutRunnable = null; } @@ -309,6 +301,28 @@ static boolean executeInternal(SearchContext searchContext, QueryPhaseSearcher q } } + /** + * Create runnable which throws {@link TimeExceededException} when the runnable is called after timeout + runnable creation time + * exceeds currentTime + * @param searchContext to extract timeout from and to get relative time from + * @return the created runnable + */ + static Runnable createQueryTimeoutChecker(final SearchContext searchContext) { + /* for startTime, relative non-cached precise time must be used to prevent false positive timeouts. + * Using cached time for startTime will fail and produce false positive timeouts when maxTime = (startTime + timeout) falls in + * next time cache slot(s) AND time caching lifespan > passed timeout */ + final long startTime = searchContext.getRelativeTimeInMillis(false); + final long maxTime = startTime + searchContext.timeout().millis(); + return () -> { + /* As long as startTime is non cached time, using cached time here might only produce false negative timeouts within the time + * cache life span which is acceptable */ + final long time = searchContext.getRelativeTimeInMillis(); + if (time > maxTime) { + throw new TimeExceededException(); + } + }; + } + private static boolean searchWithCollector( SearchContext searchContext, ContextIndexSearcher searcher, diff --git a/server/src/test/java/org/opensearch/search/DefaultSearchContextTests.java b/server/src/test/java/org/opensearch/search/DefaultSearchContextTests.java index 3c83f899dd1b5..0ec7d090f1a04 100644 --- a/server/src/test/java/org/opensearch/search/DefaultSearchContextTests.java +++ b/server/src/test/java/org/opensearch/search/DefaultSearchContextTests.java @@ -368,6 +368,10 @@ protected Engine.Searcher acquireSearcherInternal(String source) { ParsedQuery parsedQuery = ParsedQuery.parsedMatchAllQuery(); context3.sliceBuilder(null).parsedQuery(parsedQuery).preProcess(false); assertEquals(context3.query(), context3.buildFilteredQuery(parsedQuery.query())); + // make sure getPreciseRelativeTimeInMillis is same as System.nanoTime() + long timeToleranceInMs = 10; + long currTime = TimeValue.nsecToMSec(System.nanoTime()); + assertTrue(Math.abs(context3.getRelativeTimeInMillis(false) - currTime) <= timeToleranceInMs); when(queryShardContext.getIndexSettings()).thenReturn(indexSettings); when(queryShardContext.fieldMapper(anyString())).thenReturn(mock(MappedFieldType.class)); diff --git a/server/src/test/java/org/opensearch/search/query/QueryPhaseTests.java b/server/src/test/java/org/opensearch/search/query/QueryPhaseTests.java index 2234c8a980923..9b2edccff82ee 100644 --- a/server/src/test/java/org/opensearch/search/query/QueryPhaseTests.java +++ b/server/src/test/java/org/opensearch/search/query/QueryPhaseTests.java @@ -85,6 +85,7 @@ import org.apache.lucene.util.FixedBitSet; import org.opensearch.action.search.SearchShardTask; import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; import org.opensearch.index.mapper.DateFieldMapper; import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.index.mapper.MapperService; @@ -105,6 +106,7 @@ import org.opensearch.search.sort.SortAndFormats; import org.opensearch.tasks.TaskCancelledException; import org.opensearch.test.TestSearchContext; +import org.opensearch.threadpool.ThreadPool; import java.io.IOException; import java.util.ArrayList; @@ -117,9 +119,14 @@ import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.lessThanOrEqualTo; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.when; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.verifyNoMoreInteractions; import static org.opensearch.search.query.TopDocsCollectorContext.hasInfMaxScore; public class QueryPhaseTests extends IndexShardTestCase { @@ -1079,6 +1086,58 @@ public void testCancellationDuringPreprocess() throws IOException { } } + public void testQueryTimeoutChecker() throws Exception { + long timeCacheLifespan = ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.get(Settings.EMPTY).millis(); + long timeTolerance = timeCacheLifespan / 20; + + // should throw time exceed exception for sure after timeCacheLifespan*2+timeTolerance (next's next cached time is available) + assertThrows( + QueryPhase.TimeExceededException.class, + () -> createTimeoutCheckerThenWaitThenRun(timeCacheLifespan, timeCacheLifespan * 2 + timeTolerance, true, false) + ); + + // should not throw time exceed exception after timeCacheLifespan+timeTolerance because new cached time - init time < timeout + createTimeoutCheckerThenWaitThenRun(timeCacheLifespan, timeCacheLifespan + timeTolerance, true, false); + + // should not throw time exceed exception after timeout < timeCacheLifespan when cached time didn't change + createTimeoutCheckerThenWaitThenRun(timeCacheLifespan / 2, timeCacheLifespan / 2 + timeTolerance, false, true); + createTimeoutCheckerThenWaitThenRun(timeCacheLifespan / 4, timeCacheLifespan / 2 + timeTolerance, false, true); + } + + private void createTimeoutCheckerThenWaitThenRun( + long timeout, + long sleepAfterCreation, + boolean checkCachedTimeChanged, + boolean checkCachedTimeHasNotChanged + ) throws Exception { + long timeCacheLifespan = ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.get(Settings.EMPTY).millis(); + long timeTolerance = timeCacheLifespan / 20; + long currentTimeDiffWithCachedTime = TimeValue.nsecToMSec(System.nanoTime()) - threadPool.relativeTimeInMillis(); + // need to run this test approximately at the start of cached time window + long timeToAlignTimeWithCachedTimeOffset = timeCacheLifespan - currentTimeDiffWithCachedTime + timeTolerance; + Thread.sleep(timeToAlignTimeWithCachedTimeOffset); + + long initialRelativeCachedTime = threadPool.relativeTimeInMillis(); + SearchContext mockedSearchContext = mock(SearchContext.class); + when(mockedSearchContext.timeout()).thenReturn(TimeValue.timeValueMillis(timeout)); + when(mockedSearchContext.getRelativeTimeInMillis()).thenAnswer(invocation -> threadPool.relativeTimeInMillis()); + when(mockedSearchContext.getRelativeTimeInMillis(eq(false))).thenCallRealMethod(); + Runnable queryTimeoutChecker = QueryPhase.createQueryTimeoutChecker(mockedSearchContext); + // make sure next time slot become available + Thread.sleep(sleepAfterCreation); + if (checkCachedTimeChanged) { + assertNotEquals(initialRelativeCachedTime, threadPool.relativeTimeInMillis()); + } + if (checkCachedTimeHasNotChanged) { + assertEquals(initialRelativeCachedTime, threadPool.relativeTimeInMillis()); + } + queryTimeoutChecker.run(); + verify(mockedSearchContext, times(1)).timeout(); + verify(mockedSearchContext, times(1)).getRelativeTimeInMillis(eq(false)); + verify(mockedSearchContext, atLeastOnce()).getRelativeTimeInMillis(); + verifyNoMoreInteractions(mockedSearchContext); + } + private static class TestSearchContextWithRewriteAndCancellation extends TestSearchContext { private TestSearchContextWithRewriteAndCancellation(