From f639259ca570bab5e503e7ead00b541c7761fef9 Mon Sep 17 00:00:00 2001 From: Ruirui Zhang Date: Tue, 26 Mar 2024 13:29:05 -0700 Subject: [PATCH] Add a counter to node stat (and _cat/shards) api to track shard going from idle to non-idle (#12768) Signed-off-by: Ruirui Zhang --- CHANGELOG.md | 1 + .../test/cat.shards/10_basic.yml | 99 ++++++++++++++++++- .../index/search/stats/SearchStats.java | 26 ++++- .../index/search/stats/ShardSearchStats.java | 9 +- .../opensearch/index/shard/IndexShard.java | 5 + .../index/shard/SearchOperationListener.java | 16 +++ .../rest/action/cat/RestShardsAction.java | 5 + .../index/search/stats/SearchStatsTests.java | 7 +- .../shard/SearchOperationListenerTests.java | 31 ++++++ .../action/cat/RestShardsActionTests.java | 8 +- 10 files changed, 196 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f04f8641396f..5063c5e24e55c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [Concurrent Segment Search] Perform buildAggregation concurrently and support Composite Aggregations ([#12697](https://github.com/opensearch-project/OpenSearch/pull/12697)) - Convert ingest processor supports ip type ([#12818](https://github.com/opensearch-project/OpenSearch/pull/12818)) - Allow setting KEYSTORE_PASSWORD through env variable ([#12865](https://github.com/opensearch-project/OpenSearch/pull/12865)) +- Add a counter to node stat (and _cat/shards) api to track shard going from idle to non-idle ([#12768](https://github.com/opensearch-project/OpenSearch/pull/12768)) ### Dependencies - Bump `org.apache.commons:commons-configuration2` from 2.10.0 to 2.10.1 ([#12896](https://github.com/opensearch-project/OpenSearch/pull/12896)) diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/cat.shards/10_basic.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/cat.shards/10_basic.yml index 29fbf55417961..c309f19b454e7 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/cat.shards/10_basic.yml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/cat.shards/10_basic.yml @@ -1,13 +1,108 @@ "Help": - skip: - version: " - 2.11.99" + version: " - 2.99.99" + reason: search idle reactivate count total is only added in 3.0.0 + features: node_selector + - do: + cat.shards: + help: true + node_selector: + version: "3.0.0 - " + + - match: + $body: | + /^ index .+ \n + shard .+ \n + prirep .+ \n + state .+ \n + docs .+ \n + store .+ \n + ip .+ \n + id .+ \n + node .+ \n + sync_id .+ \n + unassigned.reason .+ \n + unassigned.at .+ \n + unassigned.for .+ \n + unassigned.details .+ \n + recoverysource.type .+ \n + completion.size .+ \n + fielddata.memory_size .+ \n + fielddata.evictions .+ \n + query_cache.memory_size .+ \n + query_cache.evictions .+ \n + flush.total .+ \n + flush.total_time .+ \n + get.current .+ \n + get.time .+ \n + get.total .+ \n + get.exists_time .+ \n + get.exists_total .+ \n + get.missing_time .+ \n + get.missing_total .+ \n + indexing.delete_current .+ \n + indexing.delete_time .+ \n + indexing.delete_total .+ \n + indexing.index_current .+ \n + indexing.index_time .+ \n + indexing.index_total .+ \n + indexing.index_failed .+ \n + merges.current .+ \n + merges.current_docs .+ \n + merges.current_size .+ \n + merges.total .+ \n + merges.total_docs .+ \n + merges.total_size .+ \n + merges.total_time .+ \n + refresh.total .+ \n + refresh.time .+ \n + refresh.external_total .+ \n + refresh.external_time .+ \n + refresh.listeners .+ \n + search.fetch_current .+ \n + search.fetch_time .+ \n + search.fetch_total .+ \n + search.open_contexts .+ \n + search.query_current .+ \n + search.query_time .+ \n + search.query_total .+ \n + search.concurrent_query_current .+ \n + search.concurrent_query_time .+ \n + search.concurrent_query_total .+ \n + search.concurrent_avg_slice_count .+ \n + search.scroll_current .+ \n + search.scroll_time .+ \n + search.scroll_total .+ \n + search.point_in_time_current .+ \n + search.point_in_time_time .+ \n + search.point_in_time_total .+ \n + search.search_idle_reactivate_count_total .+ \n + segments.count .+ \n + segments.memory .+ \n + segments.index_writer_memory .+ \n + segments.version_map_memory .+ \n + segments.fixed_bitset_memory .+ \n + seq_no.max .+ \n + seq_no.local_checkpoint .+ \n + seq_no.global_checkpoint .+ \n + warmer.current .+ \n + warmer.total .+ \n + warmer.total_time .+ \n + path.data .+ \n + path.state .+ \n + docs.deleted .+ \n + $/ +--- +"Help from 2.12.0 to 2.99.99": + - skip: + version: " - 2.11.99 , 3.0.0 - " reason: deleted docs and concurrent search are added in 2.12.0 features: node_selector - do: cat.shards: help: true node_selector: - version: "2.12.0 - " + version: "2.12.0 - 2.99.99" - match: $body: | diff --git a/server/src/main/java/org/opensearch/index/search/stats/SearchStats.java b/server/src/main/java/org/opensearch/index/search/stats/SearchStats.java index 576e00f8f30d1..d3a53fcc0e2d8 100644 --- a/server/src/main/java/org/opensearch/index/search/stats/SearchStats.java +++ b/server/src/main/java/org/opensearch/index/search/stats/SearchStats.java @@ -163,6 +163,8 @@ public static class Stats implements Writeable, ToXContentFragment { private long pitTimeInMillis; private long pitCurrent; + private long searchIdleReactivateCount; + @Nullable private RequestStatsLongHolder requestStatsLongHolder; @@ -193,7 +195,8 @@ public Stats( long pitCurrent, long suggestCount, long suggestTimeInMillis, - long suggestCurrent + long suggestCurrent, + long searchIdleReactivateCount ) { this.requestStatsLongHolder = new RequestStatsLongHolder(); this.queryCount = queryCount; @@ -220,6 +223,8 @@ public Stats( this.pitCount = pitCount; this.pitTimeInMillis = pitTimeInMillis; this.pitCurrent = pitCurrent; + + this.searchIdleReactivateCount = searchIdleReactivateCount; } private Stats(StreamInput in) throws IOException { @@ -255,6 +260,10 @@ private Stats(StreamInput in) throws IOException { concurrentQueryCurrent = in.readVLong(); queryConcurrency = in.readVLong(); } + + if (in.getVersion().onOrAfter(Version.V_3_0_0)) { + searchIdleReactivateCount = in.readVLong(); + } } public void add(Stats stats) { @@ -282,6 +291,8 @@ public void add(Stats stats) { pitCount += stats.pitCount; pitTimeInMillis += stats.pitTimeInMillis; pitCurrent += stats.pitCurrent; + + searchIdleReactivateCount += stats.searchIdleReactivateCount; } public void addForClosingShard(Stats stats) { @@ -306,6 +317,8 @@ public void addForClosingShard(Stats stats) { pitTimeInMillis += stats.pitTimeInMillis; pitCurrent += stats.pitCurrent; queryConcurrency += stats.queryConcurrency; + + searchIdleReactivateCount += stats.searchIdleReactivateCount; } public long getQueryCount() { @@ -412,6 +425,10 @@ public long getSuggestCurrent() { return suggestCurrent; } + public long getSearchIdleReactivateCount() { + return searchIdleReactivateCount; + } + public static Stats readStats(StreamInput in) throws IOException { return new Stats(in); } @@ -457,6 +474,10 @@ public void writeTo(StreamOutput out) throws IOException { out.writeVLong(concurrentQueryCurrent); out.writeVLong(queryConcurrency); } + + if (out.getVersion().onOrAfter(Version.V_3_0_0)) { + out.writeVLong(searchIdleReactivateCount); + } } @Override @@ -486,6 +507,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.humanReadableField(Fields.SUGGEST_TIME_IN_MILLIS, Fields.SUGGEST_TIME, getSuggestTime()); builder.field(Fields.SUGGEST_CURRENT, suggestCurrent); + builder.field(Fields.SEARCH_IDLE_REACTIVATE_COUNT_TOTAL, searchIdleReactivateCount); + if (requestStatsLongHolder != null) { builder.startObject(Fields.REQUEST); @@ -654,6 +677,7 @@ static final class Fields { static final String TIME = "time"; static final String CURRENT = "current"; static final String TOTAL = "total"; + static final String SEARCH_IDLE_REACTIVATE_COUNT_TOTAL = "search_idle_reactivate_count_total"; } diff --git a/server/src/main/java/org/opensearch/index/search/stats/ShardSearchStats.java b/server/src/main/java/org/opensearch/index/search/stats/ShardSearchStats.java index 99e3f8465c5db..3098986852cc1 100644 --- a/server/src/main/java/org/opensearch/index/search/stats/ShardSearchStats.java +++ b/server/src/main/java/org/opensearch/index/search/stats/ShardSearchStats.java @@ -213,6 +213,11 @@ public void onFreePitContext(ReaderContext readerContext) { totalStats.pitMetric.inc(TimeUnit.NANOSECONDS.toMicros(System.nanoTime() - readerContext.getStartTimeInNano())); } + @Override + public void onSearchIdleReactivation() { + totalStats.searchIdleMetric.inc(); + } + /** * Holder of statistics values * @@ -239,6 +244,7 @@ static final class StatsHolder { final CounterMetric scrollCurrent = new CounterMetric(); final CounterMetric pitCurrent = new CounterMetric(); final CounterMetric suggestCurrent = new CounterMetric(); + final CounterMetric searchIdleMetric = new CounterMetric(); SearchStats.Stats stats() { return new SearchStats.Stats( @@ -260,7 +266,8 @@ SearchStats.Stats stats() { pitCurrent.count(), suggestMetric.count(), TimeUnit.NANOSECONDS.toMillis(suggestMetric.sum()), - suggestCurrent.count() + suggestCurrent.count(), + searchIdleMetric.count() ); } } diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 410b3bae1bc65..73bea3e5567b1 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -4698,9 +4698,14 @@ public void afterRefresh(boolean didRefresh) { * true if the listener was registered to wait for a refresh. */ public final void awaitShardSearchActive(Consumer listener) { + boolean isSearchIdle = isSearchIdle(); markSearcherAccessed(); // move the shard into non-search idle final Translog.Location location = pendingRefreshLocation.get(); if (location != null) { + if (isSearchIdle) { + SearchOperationListener searchOperationListener = getSearchOperationListener(); + searchOperationListener.onSearchIdleReactivation(); + } addRefreshListener(location, (b) -> { pendingRefreshLocation.compareAndSet(location, null); listener.accept(true); diff --git a/server/src/main/java/org/opensearch/index/shard/SearchOperationListener.java b/server/src/main/java/org/opensearch/index/shard/SearchOperationListener.java index 849a4f9c15318..94079db468f9c 100644 --- a/server/src/main/java/org/opensearch/index/shard/SearchOperationListener.java +++ b/server/src/main/java/org/opensearch/index/shard/SearchOperationListener.java @@ -145,6 +145,11 @@ default void onNewPitContext(ReaderContext readerContext) {} */ default void onFreePitContext(ReaderContext readerContext) {} + /** + * Executed when a shard goes from idle to non-idle state + */ + default void onSearchIdleReactivation() {} + /** * A Composite listener that multiplexes calls to each of the listeners methods. */ @@ -310,5 +315,16 @@ public void onFreePitContext(ReaderContext readerContext) { } } } + + @Override + public void onSearchIdleReactivation() { + for (SearchOperationListener listener : listeners) { + try { + listener.onSearchIdleReactivation(); + } catch (Exception e) { + logger.warn(() -> new ParameterizedMessage("onNewSearchIdleReactivation listener [{}] failed", listener), e); + } + } + } } } diff --git a/server/src/main/java/org/opensearch/rest/action/cat/RestShardsAction.java b/server/src/main/java/org/opensearch/rest/action/cat/RestShardsAction.java index bdbc1c800d388..0205c610e6b1a 100644 --- a/server/src/main/java/org/opensearch/rest/action/cat/RestShardsAction.java +++ b/server/src/main/java/org/opensearch/rest/action/cat/RestShardsAction.java @@ -253,6 +253,10 @@ protected Table getTableWithHeader(final RestRequest request) { "search.point_in_time_total", "alias:spto,searchPointInTimeTotal;default:false;text-align:right;desc:completed point in time contexts" ); + table.addCell( + "search.search_idle_reactivate_count_total", + "alias:ssirct,searchSearchIdleReactivateCountTotal;default:false;text-align:right;desc:number of times a shard reactivated" + ); table.addCell("segments.count", "alias:sc,segmentsCount;default:false;text-align:right;desc:number of segments"); table.addCell("segments.memory", "alias:sm,segmentsMemory;default:false;text-align:right;desc:memory used by segments"); @@ -427,6 +431,7 @@ Table buildTable(RestRequest request, ClusterStateResponse state, IndicesStatsRe table.addCell(getOrNull(commonStats, CommonStats::getSearch, i -> i.getTotal().getPitCurrent())); table.addCell(getOrNull(commonStats, CommonStats::getSearch, i -> i.getTotal().getPitTime())); table.addCell(getOrNull(commonStats, CommonStats::getSearch, i -> i.getTotal().getPitCount())); + table.addCell(getOrNull(commonStats, CommonStats::getSearch, i -> i.getTotal().getSearchIdleReactivateCount())); table.addCell(getOrNull(commonStats, CommonStats::getSegments, SegmentsStats::getCount)); table.addCell(getOrNull(commonStats, CommonStats::getSegments, SegmentsStats::getZeroMemory)); diff --git a/server/src/test/java/org/opensearch/index/search/stats/SearchStatsTests.java b/server/src/test/java/org/opensearch/index/search/stats/SearchStatsTests.java index 5656b77445772..594700ea60b3e 100644 --- a/server/src/test/java/org/opensearch/index/search/stats/SearchStatsTests.java +++ b/server/src/test/java/org/opensearch/index/search/stats/SearchStatsTests.java @@ -57,9 +57,9 @@ public void testShardLevelSearchGroupStats() throws Exception { // let's create two dummy search stats with groups Map groupStats1 = new HashMap<>(); Map groupStats2 = new HashMap<>(); - groupStats2.put("group1", new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)); - SearchStats searchStats1 = new SearchStats(new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 0, groupStats1); - SearchStats searchStats2 = new SearchStats(new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 0, groupStats2); + groupStats2.put("group1", new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)); + SearchStats searchStats1 = new SearchStats(new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 0, groupStats1); + SearchStats searchStats2 = new SearchStats(new Stats(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 0, groupStats2); // adding these two search stats and checking group stats are correct searchStats1.add(searchStats2); @@ -128,6 +128,7 @@ private static void assertStats(Stats stats, long equalTo) { assertEquals(equalTo, stats.getSuggestCount()); assertEquals(equalTo, stats.getSuggestTimeInMillis()); assertEquals(equalTo, stats.getSuggestCurrent()); + assertEquals(equalTo, stats.getSearchIdleReactivateCount()); // avg_concurrency is not summed up across stats assertEquals(1, stats.getConcurrentAvgSliceCount(), 0); } diff --git a/server/src/test/java/org/opensearch/index/shard/SearchOperationListenerTests.java b/server/src/test/java/org/opensearch/index/shard/SearchOperationListenerTests.java index 98f86758ea2ca..c61c13eecf2c3 100644 --- a/server/src/test/java/org/opensearch/index/shard/SearchOperationListenerTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SearchOperationListenerTests.java @@ -64,6 +64,7 @@ public void testListenersAreExecuted() { AtomicInteger newScrollContext = new AtomicInteger(); AtomicInteger freeScrollContext = new AtomicInteger(); AtomicInteger validateSearchContext = new AtomicInteger(); + AtomicInteger searchIdleReactivateCount = new AtomicInteger(); AtomicInteger timeInNanos = new AtomicInteger(randomIntBetween(0, 10)); SearchOperationListener listener = new SearchOperationListener() { @Override @@ -133,6 +134,11 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertNotNull(readerContext); validateSearchContext.incrementAndGet(); } + + @Override + public void onSearchIdleReactivation() { + searchIdleReactivateCount.incrementAndGet(); + } }; SearchOperationListener throwingListener = (SearchOperationListener) Proxy.newProxyInstance( @@ -169,6 +175,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(0, newScrollContext.get()); assertEquals(0, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onFetchPhase(ctx, timeInNanos.get()); @@ -182,6 +189,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(0, newScrollContext.get()); assertEquals(0, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onPreQueryPhase(ctx); @@ -195,6 +203,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(0, newScrollContext.get()); assertEquals(0, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onPreFetchPhase(ctx); @@ -208,6 +217,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(0, newScrollContext.get()); assertEquals(0, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onFailedFetchPhase(ctx); @@ -221,6 +231,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(0, newScrollContext.get()); assertEquals(0, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onFailedQueryPhase(ctx); @@ -234,6 +245,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(0, newScrollContext.get()); assertEquals(0, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onNewReaderContext(mock(ReaderContext.class)); @@ -247,6 +259,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(0, newScrollContext.get()); assertEquals(0, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onNewScrollContext(mock(ReaderContext.class)); @@ -260,6 +273,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(2, newScrollContext.get()); assertEquals(0, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onFreeReaderContext(mock(ReaderContext.class)); @@ -273,6 +287,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(2, newScrollContext.get()); assertEquals(2, freeContext.get()); assertEquals(0, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); compositeListener.onFreeScrollContext(mock(ReaderContext.class)); @@ -286,6 +301,21 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(2, newScrollContext.get()); assertEquals(2, freeContext.get()); assertEquals(2, freeScrollContext.get()); + assertEquals(0, searchIdleReactivateCount.get()); + assertEquals(0, validateSearchContext.get()); + + compositeListener.onSearchIdleReactivation(); + assertEquals(2, preFetch.get()); + assertEquals(2, preQuery.get()); + assertEquals(2, failedFetch.get()); + assertEquals(2, failedQuery.get()); + assertEquals(2, onQuery.get()); + assertEquals(2, onFetch.get()); + assertEquals(2, newContext.get()); + assertEquals(2, newScrollContext.get()); + assertEquals(2, freeContext.get()); + assertEquals(2, freeScrollContext.get()); + assertEquals(2, searchIdleReactivateCount.get()); assertEquals(0, validateSearchContext.get()); if (throwingListeners == 0) { @@ -311,6 +341,7 @@ public void validateReaderContext(ReaderContext readerContext, TransportRequest assertEquals(2, newScrollContext.get()); assertEquals(2, freeContext.get()); assertEquals(2, freeScrollContext.get()); + assertEquals(2, searchIdleReactivateCount.get()); assertEquals(2, validateSearchContext.get()); } } diff --git a/server/src/test/java/org/opensearch/rest/action/cat/RestShardsActionTests.java b/server/src/test/java/org/opensearch/rest/action/cat/RestShardsActionTests.java index fa13ec2036797..883df7da5d717 100644 --- a/server/src/test/java/org/opensearch/rest/action/cat/RestShardsActionTests.java +++ b/server/src/test/java/org/opensearch/rest/action/cat/RestShardsActionTests.java @@ -125,7 +125,7 @@ public void testBuildTable() { assertThat(headers.get(6).value, equalTo("ip")); assertThat(headers.get(7).value, equalTo("id")); assertThat(headers.get(8).value, equalTo("node")); - assertThat(headers.get(78).value, equalTo("docs.deleted")); + assertThat(headers.get(79).value, equalTo("docs.deleted")); final List> rows = table.getRows(); assertThat(rows.size(), equalTo(numShards)); @@ -141,9 +141,9 @@ public void testBuildTable() { assertThat(row.get(4).value, equalTo(shardStats.getStats().getDocs().getCount())); assertThat(row.get(6).value, equalTo(localNode.getHostAddress())); assertThat(row.get(7).value, equalTo(localNode.getId())); - assertThat(row.get(76).value, equalTo(shardStats.getDataPath())); - assertThat(row.get(77).value, equalTo(shardStats.getStatePath())); - assertThat(row.get(78).value, equalTo(shardStats.getStats().getDocs().getDeleted())); + assertThat(row.get(77).value, equalTo(shardStats.getDataPath())); + assertThat(row.get(78).value, equalTo(shardStats.getStatePath())); + assertThat(row.get(79).value, equalTo(shardStats.getStats().getDocs().getDeleted())); } } }