From 0fd93bbace3eea222ae1243e51f5ba36c0d023f0 Mon Sep 17 00:00:00 2001 From: patsonluk Date: Thu, 10 Oct 2024 12:48:32 -0700 Subject: [PATCH] SAI-5048: Some enhancements for node level metrics (#220) (#229) (cherry-picked from commit ad302eefb90340b51769155d4f5914a31262078d of fs/branch_9_3) --- .../servlet/PrometheusMetricsServlet.java | 584 +++++++++++++----- .../org/apache/solr/update/CommitTracker.java | 1 + .../solr/update/CommitUpdateCommand.java | 1 + .../solr/update/DirectUpdateHandler2.java | 22 +- .../apache/solr/update/SolrIndexConfig.java | 3 + .../apache/solr/update/SolrIndexWriter.java | 16 +- .../servlet/PrometheusMetricsServletTest.java | 84 ++- 7 files changed, 506 insertions(+), 205 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/servlet/PrometheusMetricsServlet.java b/solr/core/src/java/org/apache/solr/servlet/PrometheusMetricsServlet.java index ee7d6d7f13e..ee9628dd52f 100644 --- a/solr/core/src/java/org/apache/solr/servlet/PrometheusMetricsServlet.java +++ b/solr/core/src/java/org/apache/solr/servlet/PrometheusMetricsServlet.java @@ -33,12 +33,14 @@ import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Supplier; +import java.util.stream.Collectors; import javax.servlet.ServletOutputStream; import javax.servlet.UnavailableException; import javax.servlet.WriteListener; @@ -67,15 +69,20 @@ public final class PrometheusMetricsServlet extends BaseSolrServlet { // failing the call. private static final Integer INVALID_NUMBER = -1; - private final List callers = - Collections.unmodifiableList( - Arrays.asList( - new GarbageCollectorMetricsApiCaller(), - new MemoryMetricsApiCaller(), - new OsMetricsApiCaller(), - new ThreadMetricsApiCaller(), - new StatusCodeMetricsApiCaller(), - new CoresMetricsApiCaller())); + private final List callers = getCallers(); + + private List getCallers() { + AggregateMetricsApiCaller aggregateMetricsApiCaller = new AggregateMetricsApiCaller(); + return List.of( + new GarbageCollectorMetricsApiCaller(), + new MemoryMetricsApiCaller(), + new OsMetricsApiCaller(), + new ThreadMetricsApiCaller(), + new StatusCodeMetricsApiCaller(), + aggregateMetricsApiCaller, + new CoresMetricsApiCaller( + Collections.unmodifiableList(aggregateMetricsApiCaller.missingCoreMetrics))); + } private final Map cacheMetricTypes = Map.of( @@ -249,7 +256,7 @@ static void getSharedCacheMetrics( } } - static class GarbageCollectorMetricsApiCaller extends MetricsApiCaller { + static class GarbageCollectorMetricsApiCaller extends MetricsByPrefixApiCaller { GarbageCollectorMetricsApiCaller() { super("jvm", "gc.G1-,memory.pools.G1-", ""); @@ -347,7 +354,7 @@ protected void handle(List results, JsonNode metrics) throws I } } - static class MemoryMetricsApiCaller extends MetricsApiCaller { + static class MemoryMetricsApiCaller extends MetricsByPrefixApiCaller { MemoryMetricsApiCaller() { super("jvm", "memory.heap.,memory.non-heap.", ""); @@ -397,7 +404,7 @@ protected void handle(List results, JsonNode metrics) throws I } } - static class OsMetricsApiCaller extends MetricsApiCaller { + static class OsMetricsApiCaller extends MetricsByPrefixApiCaller { OsMetricsApiCaller() { super("jvm", "os.", ""); @@ -440,7 +447,7 @@ protected void handle(List results, JsonNode metrics) throws I } } - static class ThreadMetricsApiCaller extends MetricsApiCaller { + static class ThreadMetricsApiCaller extends MetricsByPrefixApiCaller { ThreadMetricsApiCaller() { super("jvm", "threads.", ""); @@ -508,7 +515,7 @@ protected void handle(List results, JsonNode metrics) throws I } } - static class StatusCodeMetricsApiCaller extends MetricsApiCaller { + static class StatusCodeMetricsApiCaller extends MetricsByPrefixApiCaller { StatusCodeMetricsApiCaller() { super("jetty", "org.eclipse.jetty.server.handler.DefaultHandler.", "count"); @@ -559,15 +566,350 @@ protected void handle(List results, JsonNode metrics) throws I } } - // Aggregating across all the cores on the node. - // Report only local requests, excluding forwarded requests to other nodes. + enum CoreMetric { + MAJOR_MERGE( + "INDEX.merge.major", "merges_major", "cumulative number of major merges across cores"), + MAJOR_MERGE_RUNNING_DOCS( + "INDEX.merge.major.running.docs", + "merges_major_current_docs", + "current number of docs in major merges across cores", + null, + PrometheusMetricType.GAUGE), + MINOR_MERGE( + "INDEX.merge.minor", "merges_minor", "cumulative number of minor merges across cores"), + MINOR_MERGE_RUNNING_DOCS( + "INDEX.merge.minor.running.docs", + "merges_minor_current_docs", + "current number of docs in minor merges across cores", + null, + PrometheusMetricType.GAUGE), + GET( + "QUERY./get.requestTimes", + "top_level_requests_get", + "cumulative number of top-level gets across cores"), + GET_DURATION_P50( + "QUERY./get.requestTimes", + "top_level_requests_get_duration_p50", + "top-level gets p50 duration", + "median_ms", + PrometheusMetricType.GAUGE), + GET_DURATION_P95( + "QUERY./get.requestTimes", + "top_level_requests_get_duration_p95", + "top-level gets p95 duration", + "p95_ms", + PrometheusMetricType.GAUGE), + GET_DURATION_P99( + "QUERY./get.requestTimes", + "top_level_requests_get_duration_p99", + "top-level gets p99 duration", + "p99_ms", + PrometheusMetricType.GAUGE), + GET_SUBSHARD( + "QUERY./get[shard].requestTimes", + "sub_shard_requests_get", + "cumulative number of sub (spawned by re-distributing a top-level req) gets across cores"), + GET_SUBSHARD_DURATION_P50( + "QUERY./get[shard].requestTimes", + "sub_shard_requests_get_duration_p50", + "sub shard gets p50 duration", + "median_ms", + PrometheusMetricType.GAUGE), + GET_SUBSHARD_DURATION_P95( + "QUERY./get[shard].requestTimes", + "sub_shard_requests_get_duration_p95", + "sub shard gets p95 duration", + "p95_ms", + PrometheusMetricType.GAUGE), + GET_SUBSHARD_DURATION_P99( + "QUERY./get[shard].requestTimes", + "sub_shard_requests_get_duration_p99", + "sub shard gets p99 duration", + "p99_ms", + PrometheusMetricType.GAUGE), + SELECT( + "QUERY./select.requestTimes", + "top_level_requests_select", + "cumulative number of top-level selects across cores"), + SELECT_DURATION_P50( + "QUERY./select.requestTimes", + "top_level_requests_select_duration_p50", + "top-level selects p50 duration", + "median_ms", + PrometheusMetricType.GAUGE), + SELECT_DURATION_P95( + "QUERY./select.requestTimes", + "top_level_requests_select_duration_p95", + "top-level selects p95 duration", + "p95_ms", + PrometheusMetricType.GAUGE), + SELECT_DURATION_P99( + "QUERY./select.requestTimes", + "top_level_requests_select_duration_p99", + "top-level selects p99 duration", + "p99_ms", + PrometheusMetricType.GAUGE), + SELECT_SUBSHARD( + "QUERY./select[shard].requestTimes", + "sub_shard_requests_select", + "cumulative number of sub (spawned by re-distributing a top-level req) selects across cores"), + SELECT_SUBSHARD_DURATION_P50( + "QUERY./select[shard].requestTimes", + "sub_shard_requests_select_duration_p50", + "sub shard selects p50 duration", + "median_ms", + PrometheusMetricType.GAUGE), + SELECT_SUBSHARD_DURATION_P95( + "QUERY./select[shard].requestTimes", + "sub_shard_requests_select_duration_p95", + "sub shard selects p95 duration", + "p95_ms", + PrometheusMetricType.GAUGE), + SELECT_SUBSHARD_DURATION_P99( + "QUERY./select[shard].requestTimes", + "sub_shard_requests_select_duration_p99", + "sub shard selects p99 duration", + "p99_ms", + PrometheusMetricType.GAUGE), + UPDATE( + "UPDATE./update.requestTimes", + "distributed_requests_update", + "cumulative number of distributed updates across cores"), + UPDATE_DURATION_P50( + "UPDATE./update.requestTimes", + "distributed_requests_update_duration_p50", + "distributed updates p50 duration", + "median_ms", + PrometheusMetricType.GAUGE), + UPDATE_DURATION_P95( + "UPDATE./update.requestTimes", + "distributed_requests_update_duration_p95", + "distributed updates p95 duration", + "p95_ms", + PrometheusMetricType.GAUGE), + UPDATE_DURATION_P99( + "UPDATE./update.requestTimes", + "distributed_requests_update_duration_p99", + "distributed updates p99 duration", + "p99_ms", + PrometheusMetricType.GAUGE), + LOCAL_UPDATE( + "UPDATE./update[local].requestTimes", + "local_requests_update", + "cumulative number of local updates across cores"), + LOCAL_UPDATE_DURATION_P50( + "UPDATE./update[local].requestTimes", + "local_requests_update_duration_p50", + "local updates p50 duration", + "median_ms", + PrometheusMetricType.GAUGE), + LOCAL_UPDATE_DURATION_P95( + "UPDATE./update[local].requestTimes", + "local_requests_update_duration_p95", + "local updates p95 duration", + "p95_ms", + PrometheusMetricType.GAUGE), + LOCAL_UPDATE_DURATION_P99( + "UPDATE./update[local].requestTimes", + "local_requests_update_duration_p99", + "local updates p99 duration", + "p99_ms", + PrometheusMetricType.GAUGE), + AUTOCOMMIT( + "UPDATE.updateHandler.autoCommits", + "auto_commits_hard", + "cumulative number of hard auto commits across cores"), + SOFT_AUTOCOMMIT( + "UPDATE.updateHandler.softAutoCommits", + "auto_commits_soft", + "cumulative number of soft auto commits across cores"), + COMMITS("UPDATE.updateHandler.commits", "commits", "cumulative number of commits across cores"), + CUMULATIVE_DEL_BY_ID( + "UPDATE.updateHandler.cumulativeDeletesById", + "deletes_by_id", + "cumulative number of deletes by id across cores"), + CUMULATIVE_DEL_BY_Q( + "UPDATE.updateHandler.cumulativeDeletesByQuery", + "deletes_by_query", + "cumulative number of deletes by query across cores"), + CUMULATIVE_DOC_ADDS( + "UPDATE.updateHandler.cumulativeAdds", + "doc_adds", + "cumulative number of docs added across cores"), + CUMULATIVE_ERRS( + "UPDATE.updateHandler.cumulativeErrors", + "update_errors", + "cumulative number of errors during updates across cores"), + MERGES("UPDATE.updateHandler.merges", "merges", "cumulative number of merges across cores"), + OPTIMIZE( + "UPDATE.updateHandler.optimizes", + "optimizes", + "cumulative number of optimizes across cores"), + + SPLITS("UPDATE.updateHandler.optimizes", "splits", "cumulative number of splits across cores"), + + EXPUNGE_DEL( + "UPDATE.updateHandler.expungeDeletes", + "expunge_deletes", + "cumulative number of expungeDeletes across cores"); + final String key, metricName, desc, property; + private final PrometheusMetricType metricType; + private static final Map lookup = new HashMap<>(); + + static { + for (CoreMetric e : CoreMetric.values()) { + lookup.put(e.key, e); + } + } + + CoreMetric(String key, String metricName, String desc) { + this(key, metricName, desc, "count", PrometheusMetricType.COUNTER); + } + + CoreMetric( + String key, + String metricName, + String desc, + String property, + PrometheusMetricType metricType) { + this.key = key; + this.metricName = metricName; + this.desc = desc; + this.property = property; + this.metricType = metricType; + } + + PrometheusMetric createPrometheusMetric(Number value) { + return createPrometheusMetric(value, null); + } + + PrometheusMetric createPrometheusMetric(Number value, String descriptionSuffix) { + return new PrometheusMetric( + metricName, + metricType, + desc + (descriptionSuffix != null ? descriptionSuffix : ""), + value.longValue()); + } + } + + /** + * A caller that fetch metrics from both groups "solr.node" (node aggregated metrics) and "core" + * (per core metrics) and match it to all the values in enum CoreMetric. The goal is to provide + * node level metrics on the CoreMetric values. + * + *

It first iterates on the "solr.node" metrics, if a core metric is not found there, then it + * will look it up per core and sum them up as the node metrics. + */ + static class AggregateMetricsApiCaller extends MetricsByPrefixApiCaller { + /* + "metrics":{ + "solr.node":{ //node aggregated metrics + "QUERY./select.requestTimes":{"count":2}, + "QUERY./select[shard].requestTimes":{"count":0}, + "UPDATE./update.requestTimes":{"count":2}, + "UPDATE./update[local].requestTimes":{"count":0} + ... + }, + "solr.core.loadtest.shard1_1.replica_n8":{ //pre core metrics + "QUERY./select.requestTimes":{"count":1}, + "QUERY./select[shard].requestTimes":{"count":0}, + "UPDATE./update.requestTimes":{"count":1}, + "UPDATE./update[local].requestTimes":{"count":0} + ... + }, + "solr.core.loadtest.shard2_1.replica_n10":{ + "QUERY./select.requestTimes":{"count":0}, + "QUERY./select[shard].requestTimes":{"count":0}, + "UPDATE./update.requestTimes":{"count":1}, + "UPDATE./update[local].requestTimes":{"count":0} + ... + }, + ... + */ + List missingCoreMetrics = new ArrayList<>(); + + AggregateMetricsApiCaller() { + super("solr.node", buildPrefix(), buildProperty()); + } + + private static String buildPrefix() { + return String.join( + ",", Arrays.stream(CoreMetric.values()).map(m -> m.key).toArray(String[]::new)); + } + + private static String buildProperty() { + return String.join( + ",", + Arrays.stream(CoreMetric.values()) + .filter(m -> m.property != null) + .map(m -> m.property) + .collect(Collectors.toSet())); + } + + @Override + protected void handle(List results, JsonNode metricsNode) throws IOException { + missingCoreMetrics.clear(); + JsonNode nodeMetricNode = metricsNode.get("solr.node"); + + if (nodeMetricNode != null) { + for (CoreMetric metric : CoreMetric.values()) { + Number value = + metric.property != null + ? getNumber(nodeMetricNode, metric.key, metric.property) + : getNumber(nodeMetricNode, metric.key); + if (!INVALID_NUMBER.equals(value)) { + results.add(metric.createPrometheusMetric(value, "[node aggregated]")); + } else { + missingCoreMetrics.add(metric); + } + } + } else { + log.warn( + "Cannot find the solr.node metrics, going to fall back to getting metrics from all cores"); + missingCoreMetrics.addAll(Arrays.asList(CoreMetric.values())); + } + } + } + + /** + * Collector that get metrics from all the cores and then sum those metrics by CoreMetric key. + * + *

This runs after AggregateMetricsApiCaller and pick up whatever is missing from it by reading + * missingCoreMetricsView. + * + *

Therefore, this has dependency on AggregateMetricsApiCaller and should not be executed + * concurrently with it. + */ static class CoresMetricsApiCaller extends MetricsApiCaller { + private final List missingCoreMetricsView; + + CoresMetricsApiCaller(List missingCoreMetricsView) { + this.missingCoreMetricsView = missingCoreMetricsView; + } + + @Override + protected String buildQueryString() { + List prefixes = new ArrayList<>(); + List properties = new ArrayList<>(); + for (CoreMetric missingMetric : missingCoreMetricsView) { + prefixes.add(missingMetric.key); + if (missingMetric.property != null) { + properties.add(missingMetric.property); + } + } - CoresMetricsApiCaller() { - super( + String propertyClause = + String.join( + "&property=", + properties.stream() + .map(p -> URLEncoder.encode(p, StandardCharsets.UTF_8)) + .collect(Collectors.toSet())); + return String.format( + Locale.ROOT, + "wt=json&indent=false&compact=true&group=%s&prefix=%s%s", "core", - "INDEX.merge.,QUERY./get.requestTimes,QUERY./get[shard].requestTimes,QUERY./select.requestTimes,QUERY./select[shard].requestTimes,UPDATE./update.requestTimes,UPDATE./update[local].requestTimes,UPDATE.updateHandler.autoCommits,UPDATE.updateHandler.commits,UPDATE.updateHandler.cumulativeDeletesBy,UPDATE.updateHandler.softAutoCommits", - "count"); + URLEncoder.encode(String.join(",", prefixes), StandardCharsets.UTF_8), + propertyClause); } /* @@ -595,132 +937,29 @@ static class CoresMetricsApiCaller extends MetricsApiCaller { "UPDATE.updateHandler.softAutoCommits":0}, ... */ + @Override protected void handle(List results, JsonNode metrics) throws IOException { - long mergeMajor = 0; - long mergeMajorDocs = 0; - long mergeMinor = 0; - long mergeMinorDocs = 0; - long distribGet = 0; - long localGet = 0; - long distribSelect = 0; - long localSelect = 0; - long distribUpdate = 0; - long localUpdate = 0; - long hardAutoCommit = 0; - long commit = 0; - long deleteById = 0; - long deleteByQuery = 0; - long softAutoCommit = 0; - for (JsonNode core : metrics) { - mergeMajor += getNumber(core, "INDEX.merge.major", property).longValue(); - mergeMajorDocs += getNumber(core, "INDEX.merge.major.running.docs").longValue(); - mergeMinor += getNumber(core, "INDEX.merge.minor", property).longValue(); - mergeMinorDocs += getNumber(core, "INDEX.merge.minor.running.docs").longValue(); - distribGet += getNumber(core, "QUERY./get.requestTimes", property).longValue(); - localGet += getNumber(core, "QUERY./get[shard].requestTimes", property).longValue(); - distribSelect += getNumber(core, "QUERY./select.requestTimes", property).longValue(); - localSelect += getNumber(core, "QUERY./select[shard].requestTimes", property).longValue(); - distribUpdate += getNumber(core, "UPDATE./update.requestTimes", property).longValue(); - localUpdate += getNumber(core, "UPDATE./update[local].requestTimes", property).longValue(); - hardAutoCommit += getNumber(core, "UPDATE.updateHandler.autoCommits").longValue(); - commit += getNumber(core, "UPDATE.updateHandler.commits", property).longValue(); - deleteById += - getNumber(core, "UPDATE.updateHandler.cumulativeDeletesById", property).longValue(); - deleteByQuery += - getNumber(core, "UPDATE.updateHandler.cumulativeDeletesByQuery", property).longValue(); - softAutoCommit += getNumber(core, "UPDATE.updateHandler.softAutoCommits").longValue(); + Map accumulative = new LinkedHashMap<>(); + for (CoreMetric missingCoreMetric : missingCoreMetricsView) { + for (JsonNode coreMetricNode : metrics) { + Number val = + missingCoreMetric.property != null + ? getNumber(coreMetricNode, missingCoreMetric.key, missingCoreMetric.property) + : getNumber(coreMetricNode, missingCoreMetric.key); + if (!val.equals(INVALID_NUMBER)) { + accumulative.put( + missingCoreMetric, + accumulative.getOrDefault(missingCoreMetric, 0L) + val.longValue()); + } + } + } + + for (Map.Entry coreMetricEntry : accumulative.entrySet()) { + CoreMetric coreMetric = coreMetricEntry.getKey(); + Long accumulativeVal = coreMetricEntry.getValue(); + results.add(coreMetric.createPrometheusMetric(accumulativeVal)); } - results.add( - new PrometheusMetric( - "merges_major", - PrometheusMetricType.COUNTER, - "cumulative number of major merges across cores", - mergeMajor)); - results.add( - new PrometheusMetric( - "merges_major_current_docs", - PrometheusMetricType.GAUGE, - "current number of docs in major merges across cores", - mergeMajorDocs)); - results.add( - new PrometheusMetric( - "merges_minor", - PrometheusMetricType.COUNTER, - "cumulative number of minor merges across cores", - mergeMinor)); - results.add( - new PrometheusMetric( - "merges_minor_current_docs", - PrometheusMetricType.GAUGE, - "current number of docs in minor merges across cores", - mergeMinorDocs)); - results.add( - new PrometheusMetric( - "top_level_requests_get", - PrometheusMetricType.COUNTER, - "cumulative number of top-level gets across cores", - distribGet)); - results.add( - new PrometheusMetric( - "sub_shard_requests_get", - PrometheusMetricType.COUNTER, - "cumulative number of sub (spawned by re-distributing a top-level req) gets across cores", - localGet)); - results.add( - new PrometheusMetric( - "top_level_requests_select", - PrometheusMetricType.COUNTER, - "cumulative number of top-level selects across cores", - distribSelect)); - results.add( - new PrometheusMetric( - "sub_shard_requests_select", - PrometheusMetricType.COUNTER, - "cumulative number of sub (spawned by re-distributing a top-level req) selects across cores", - localSelect)); - results.add( - new PrometheusMetric( - "distributed_requests_update", - PrometheusMetricType.COUNTER, - "cumulative number of distributed updates across cores", - distribUpdate)); - results.add( - new PrometheusMetric( - "local_requests_update", - PrometheusMetricType.COUNTER, - "cumulative number of local updates across cores", - localUpdate)); - results.add( - new PrometheusMetric( - "auto_commits_hard", - PrometheusMetricType.COUNTER, - "cumulative number of hard auto commits across cores", - hardAutoCommit)); - results.add( - new PrometheusMetric( - "auto_commits_soft", - PrometheusMetricType.COUNTER, - "cumulative number of soft auto commits across cores", - softAutoCommit)); - results.add( - new PrometheusMetric( - "commits", - PrometheusMetricType.COUNTER, - "cumulative number of commits across cores", - commit)); - results.add( - new PrometheusMetric( - "deletes_by_id", - PrometheusMetricType.COUNTER, - "cumulative number of deletes by id across cores", - deleteById)); - results.add( - new PrometheusMetric( - "deletes_by_query", - PrometheusMetricType.COUNTER, - "cumulative number of deletes by query across cores", - deleteByQuery)); } } @@ -785,7 +1024,9 @@ static Number getNumber(JsonNode node, String... names) throws IOException { for (String name : names) { node = node.path(name); } - if (node.isNumber()) { + if (node.isMissingNode()) { + return INVALID_NUMBER; + } else if (node.isNumber()) { return node.numberValue(); } else { log.warn("node {} does not have a number at the path {}.", originalNode, names); @@ -805,23 +1046,13 @@ static SolrDispatchFilter getSolrDispatchFilter(HttpServletRequest request) thro abstract static class MetricsApiCaller { - protected final String group; - protected final String prefix; - protected final String property; - - MetricsApiCaller(String group, String prefix, String property) { - this.group = group; - this.prefix = prefix; - this.property = property; - } - // use HttpSolrCall to simulate a call to the metrics api. void call( AtomicInteger qTime, List results, HttpServletRequest originalRequest) throws IOException, UnavailableException { SolrDispatchFilter filter = getSolrDispatchFilter(originalRequest); CoreContainer cores = filter.getCores(); - HttpServletRequest request = new MetricsApiRequest(originalRequest, group, prefix, property); + HttpServletRequest request = new MetricsApiRequest(originalRequest, buildQueryString()); MetricsApiResponse response = new MetricsApiResponse(); SolrDispatchFilter.Action action = new HttpSolrCall(filter, cores, request, response, false).call(); @@ -850,6 +1081,38 @@ void handleResponse(AtomicInteger qTime, List results, JsonNod protected abstract void handle(List results, JsonNode metrics) throws IOException; + + protected abstract String buildQueryString(); + } + + private abstract static class MetricsByPrefixApiCaller extends MetricsApiCaller { + protected final String group; + protected final String prefix; + protected final String[] properties; + protected final String property; // for backward compatibility + + MetricsByPrefixApiCaller(String group, String prefix, String... properties) { + this.group = group; + this.prefix = prefix; + this.properties = properties; + this.property = properties.length > 0 ? properties[0] : null; + } + + @Override + protected String buildQueryString() { + String propertyClause = + String.join( + "&property=", + Arrays.stream(properties) + .map(p -> URLEncoder.encode(p, StandardCharsets.UTF_8)) + .collect(Collectors.toSet())); + return String.format( + Locale.ROOT, + "wt=json&indent=false&compact=true&group=%s&prefix=%s%s", + URLEncoder.encode(group, StandardCharsets.UTF_8), + URLEncoder.encode(prefix, StandardCharsets.UTF_8), + propertyClause); + } } // represents a request to e.g., @@ -860,16 +1123,9 @@ static class MetricsApiRequest extends HttpServletRequestWrapper { private final String queryString; private final Map attributes = new HashMap<>(); - MetricsApiRequest(HttpServletRequest request, String group, String prefix, String property) - throws IOException { + MetricsApiRequest(HttpServletRequest request, String queryString) throws IOException { super(request); - queryString = - String.format( - Locale.ROOT, - "wt=json&indent=false&compact=true&group=%s&prefix=%s&property=%s", - URLEncoder.encode(group, StandardCharsets.UTF_8.name()), - URLEncoder.encode(prefix, StandardCharsets.UTF_8.name()), - URLEncoder.encode(property, StandardCharsets.UTF_8.name())); + this.queryString = queryString; } @Override diff --git a/solr/core/src/java/org/apache/solr/update/CommitTracker.java b/solr/core/src/java/org/apache/solr/update/CommitTracker.java index 02bf13c55e6..659d38f6bc7 100644 --- a/solr/core/src/java/org/apache/solr/update/CommitTracker.java +++ b/solr/core/src/java/org/apache/solr/update/CommitTracker.java @@ -267,6 +267,7 @@ public void run() { command.openSearcher = openSearcher; command.waitSearcher = WAIT_SEARCHER; command.softCommit = softCommit; + command.autoCommit = true; if (core.getCoreDescriptor().getCloudDescriptor() != null && core.getCoreDescriptor().getCloudDescriptor().isLeader() && !softCommit) { diff --git a/solr/core/src/java/org/apache/solr/update/CommitUpdateCommand.java b/solr/core/src/java/org/apache/solr/update/CommitUpdateCommand.java index bb18b770e5d..f4a588d856f 100644 --- a/solr/core/src/java/org/apache/solr/update/CommitUpdateCommand.java +++ b/solr/core/src/java/org/apache/solr/update/CommitUpdateCommand.java @@ -27,6 +27,7 @@ public class CommitUpdateCommand extends UpdateCommand { public boolean expungeDeletes = false; public boolean softCommit = false; public boolean prepareCommit = false; + boolean autoCommit = false; /** * User provided commit data. Can be let to null if there is none. It is possible to commit this diff --git a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java index d730cc137fb..9309fea60cc 100644 --- a/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java +++ b/solr/core/src/java/org/apache/solr/update/DirectUpdateHandler2.java @@ -96,6 +96,8 @@ public class DirectUpdateHandler2 extends UpdateHandler Meter splitCommands; Meter optimizeCommands; Meter rollbackCommands; + Meter hardAutoCommitCounts; + Meter softAutoCommitCounts; LongAdder numDocsPending = new LongAdder(); LongAdder numErrors = new LongAdder(); Meter numErrorsCumulative; @@ -217,14 +219,10 @@ public void initializeMetrics(SolrMetricsContext parentContext, String scope) { this.solrMetricsContext = parentContext.getChildContext(this); } commitCommands = solrMetricsContext.meter("commits", getCategory().toString(), scope); - solrMetricsContext.gauge( - () -> commitTracker.getCommitCount(), true, "autoCommits", getCategory().toString(), scope); - solrMetricsContext.gauge( - () -> softCommitTracker.getCommitCount(), - true, - "softAutoCommits", - getCategory().toString(), - scope); + hardAutoCommitCounts = solrMetricsContext.meter("autoCommits", getCategory().toString(), scope); + softAutoCommitCounts = + solrMetricsContext.meter("softAutoCommits", getCategory().toString(), scope); + if (commitTracker.getDocsUpperBound() > 0) { solrMetricsContext.gauge( () -> commitTracker.getDocsUpperBound(), @@ -710,6 +708,14 @@ public void commit(CommitUpdateCommand cmd) throws IOException { if (cmd.expungeDeletes) expungeDeleteCommands.mark(); } + if (cmd.autoCommit) { + if (cmd.softCommit) { + softAutoCommitCounts.mark(); + } else { + hardAutoCommitCounts.mark(); + } + } + @SuppressWarnings("unchecked") Future[] waitSearcher = cmd.waitSearcher ? (Future[]) Array.newInstance(Future.class, 1) : null; diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexConfig.java b/solr/core/src/java/org/apache/solr/update/SolrIndexConfig.java index 94c5e63aa37..0b080169af0 100644 --- a/solr/core/src/java/org/apache/solr/update/SolrIndexConfig.java +++ b/solr/core/src/java/org/apache/solr/update/SolrIndexConfig.java @@ -92,6 +92,7 @@ public class SolrIndexConfig implements MapSerializable { public final PluginInfo mergedSegmentWarmerInfo; public InfoStream infoStream = InfoStream.NO_OUTPUT; + public final boolean aggregateNodeLevelMetricsEnabled; private ConfigNode node; /** Internal constructor for setting defaults based on Lucene Version */ @@ -108,6 +109,7 @@ private SolrIndexConfig() { mergedSegmentWarmerInfo = null; // enable coarse-grained metrics by default metricsInfo = new PluginInfo("metrics", Collections.emptyMap(), null, null); + this.aggregateNodeLevelMetricsEnabled = false; } private ConfigNode get(String s) { @@ -196,6 +198,7 @@ public SolrIndexConfig(ConfigNode cfg, SolrIndexConfig def) { "Beginning with Solr 5.0, option is no longer supported and should be removed from solrconfig.xml (these integrity checks are now automatic)", get("checkIntegrityAtMerge").isNull(), true); + aggregateNodeLevelMetricsEnabled = node.boolAttr("aggregateNodeLevelMetricsEnabled", false); } @Override diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java index 7b215e74fab..7d3534a0d1d 100644 --- a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java +++ b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java @@ -41,6 +41,9 @@ import org.apache.solr.core.DirectoryFactory.DirContext; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrInfoBean; +import org.apache.solr.metrics.SolrDelegateRegistryMetricsContext; +import org.apache.solr.metrics.SolrMetricManager; +import org.apache.solr.metrics.SolrMetricProducer; import org.apache.solr.metrics.SolrMetricsContext; import org.apache.solr.schema.IndexSchema; import org.slf4j.Logger; @@ -155,7 +158,18 @@ private SolrIndexWriter( infoStream = getConfig().getInfoStream(); this.directory = directory; numOpens.incrementAndGet(); - solrMetricsContext = core.getSolrMetricsContext().getChildContext(this); + SolrMetricsContext ctx = core.getSolrMetricsContext(); + if (config.aggregateNodeLevelMetricsEnabled) { + solrMetricsContext = + new SolrDelegateRegistryMetricsContext( + ctx.getMetricManager(), + ctx.getRegistryName(), + SolrMetricProducer.getUniqueMetricTag(this, ctx.getTag()), + SolrMetricManager.getRegistryName(SolrInfoBean.Group.node)); + } else { + solrMetricsContext = core.getSolrMetricsContext().getChildContext(this); + } + if (config.metricsInfo != null && config.metricsInfo.initArgs != null) { Object v = config.metricsInfo.initArgs.get("majorMergeDocs"); if (v != null) { diff --git a/solr/core/src/test/org/apache/solr/servlet/PrometheusMetricsServletTest.java b/solr/core/src/test/org/apache/solr/servlet/PrometheusMetricsServletTest.java index 6ce88f41776..5f44cd655d9 100644 --- a/solr/core/src/test/org/apache/solr/servlet/PrometheusMetricsServletTest.java +++ b/solr/core/src/test/org/apache/solr/servlet/PrometheusMetricsServletTest.java @@ -20,6 +20,7 @@ import java.io.PrintWriter; import java.io.StringWriter; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import org.junit.Assert; @@ -303,11 +304,13 @@ public void testCoresMetricsApiCaller() throws Exception { + " \"QUERY./select[shard].requestTimes\":{\"count\":11},\n" + " \"UPDATE./update.requestTimes\":{\"count\":37},\n" + " \"UPDATE./update[local].requestTimes\":{\"count\":12},\n" - + " \"UPDATE.updateHandler.autoCommits\":13,\n" + + " \"UPDATE.updateHandler.autoCommits\":{\"count\":13},\n" + " \"UPDATE.updateHandler.commits\":{\"count\":33}," + " \"UPDATE.updateHandler.cumulativeDeletesById\":{\"count\":14},\n" + " \"UPDATE.updateHandler.cumulativeDeletesByQuery\":{\"count\":15},\n" - + " \"UPDATE.updateHandler.softAutoCommits\":16},\n" + + " \"UPDATE.updateHandler.cumulativeAdds\":{\"count\":150},\n" + + " \"UPDATE.updateHandler.cumulativeErrors\":{\"count\":1},\n" + + " \"UPDATE.updateHandler.softAutoCommits\":{\"count\":16}},\n" + " \"solr.core.testdrive.shard1.replica_n1\":{\n" + " \"INDEX.merge.errors\":17,\n" + " \"INDEX.merge.major\":{\"count\":18},\n" @@ -324,11 +327,13 @@ public void testCoresMetricsApiCaller() throws Exception { + " \"QUERY./select[shard].requestTimes\":{\"count\":27},\n" + " \"UPDATE./update.requestTimes\":{\"count\":40},\n" + " \"UPDATE./update[local].requestTimes\":{\"count\":28},\n" - + " \"UPDATE.updateHandler.autoCommits\":29,\n" + + " \"UPDATE.updateHandler.autoCommits\":{\"count\":29},\n" + " \"UPDATE.updateHandler.commits\":{\"count\":34}," + " \"UPDATE.updateHandler.cumulativeDeletesById\":{\"count\":30},\n" + " \"UPDATE.updateHandler.cumulativeDeletesByQuery\":{\"count\":31},\n" - + " \"UPDATE.updateHandler.softAutoCommits\":32}}}"; + + " \"UPDATE.updateHandler.cumulativeAdds\":{\"count\":191},\n" + + " \"UPDATE.updateHandler.cumulativeErrors\":{\"count\":3},\n" + + " \"UPDATE.updateHandler.softAutoCommits\":{\"count\":32}}}}"; String output = "# HELP merges_major cumulative number of major merges across cores\n" + "# TYPE merges_major counter\n" @@ -374,8 +379,19 @@ public void testCoresMetricsApiCaller() throws Exception { + "deletes_by_id 44\n" + "# HELP deletes_by_query cumulative number of deletes by query across cores\n" + "# TYPE deletes_by_query counter\n" - + "deletes_by_query 46\n"; - assertMetricsApiCaller(new PrometheusMetricsServlet.CoresMetricsApiCaller(), json, 14, output); + + "deletes_by_query 46\n" + + "# HELP doc_adds cumulative number of docs added across cores\n" + + "# TYPE doc_adds counter\n" + + "doc_adds 341\n" + + "# HELP update_errors cumulative number of errors during updates across cores\n" + + "# TYPE update_errors counter\n" + + "update_errors 4\n"; + assertMetricsApiCaller( + new PrometheusMetricsServlet.CoresMetricsApiCaller( + Arrays.asList(PrometheusMetricsServlet.CoreMetric.values())), + json, + 14, + output); } @Test @@ -393,10 +409,12 @@ public void testCoresMetricsApiCallerMissingIndex() throws Exception { + " \"QUERY./select[shard].requestTimes\":{\"count\":2},\n" + " \"UPDATE./update.requestTimes\":{\"count\":31},\n" + " \"UPDATE./update[local].requestTimes\":{\"count\":3},\n" - + " \"UPDATE.updateHandler.autoCommits\":4,\n" + + " \"UPDATE.updateHandler.autoCommits\":{\"count\":4},\n" + " \"UPDATE.updateHandler.cumulativeDeletesById\":{\"count\":5},\n" + " \"UPDATE.updateHandler.cumulativeDeletesByQuery\":{\"count\":6},\n" - + " \"UPDATE.updateHandler.softAutoCommits\":7},\n" + + " \"UPDATE.updateHandler.cumulativeAdds\":{\"count\":0},\n" + + " \"UPDATE.updateHandler.cumulativeErrors\":{\"count\":0},\n" + + " \"UPDATE.updateHandler.softAutoCommits\":{\"count\":7}},\n" + " \"solr.core.testdrive.shard1.replica_n1\":{\n" + " \"QUERY./get.requestTimes\":{\"count\":32},\n" + " \"QUERY./get[shard].requestTimes\":{\"count\":8},\n" @@ -404,10 +422,12 @@ public void testCoresMetricsApiCallerMissingIndex() throws Exception { + " \"QUERY./select[shard].requestTimes\":{\"count\":9},\n" + " \"UPDATE./update.requestTimes\":{\"count\":34},\n" + " \"UPDATE./update[local].requestTimes\":{\"count\":10},\n" - + " \"UPDATE.updateHandler.autoCommits\":11,\n" + + " \"UPDATE.updateHandler.autoCommits\":{\"count\":11},\n" + " \"UPDATE.updateHandler.cumulativeDeletesById\":{\"count\":12},\n" + " \"UPDATE.updateHandler.cumulativeDeletesByQuery\":{\"count\":13},\n" - + " \"UPDATE.updateHandler.softAutoCommits\":14},\n" + + " \"UPDATE.updateHandler.cumulativeAdds\":{\"count\":0},\n" + + " \"UPDATE.updateHandler.cumulativeErrors\":{\"count\":0},\n" + + " \"UPDATE.updateHandler.softAutoCommits\":{\"count\":14}},\n" + " \"solr.core.loadtest.shard1_0.replica_n7\":{\n" + " \"QUERY./get.requestTimes\":{\"count\":35},\n" + " \"QUERY./get[shard].requestTimes\":{\"count\":15},\n" @@ -415,10 +435,12 @@ public void testCoresMetricsApiCallerMissingIndex() throws Exception { + " \"QUERY./select[shard].requestTimes\":{\"count\":16},\n" + " \"UPDATE./update.requestTimes\":{\"count\":37},\n" + " \"UPDATE./update[local].requestTimes\":{\"count\":17},\n" - + " \"UPDATE.updateHandler.autoCommits\":18,\n" + + " \"UPDATE.updateHandler.autoCommits\":{\"count\":18},\n" + " \"UPDATE.updateHandler.cumulativeDeletesById\":{\"count\":19},\n" + " \"UPDATE.updateHandler.cumulativeDeletesByQuery\":{\"count\":20},\n" - + " \"UPDATE.updateHandler.softAutoCommits\":21},\n" + + " \"UPDATE.updateHandler.cumulativeAdds\":{\"count\":0},\n" + + " \"UPDATE.updateHandler.cumulativeErrors\":{\"count\":0},\n" + + " \"UPDATE.updateHandler.softAutoCommits\":{\"count\":21}},\n" + " \"solr.core.local.shard1.replica_n1\":{\n" + " \"QUERY./get.requestTimes\":{\"count\":38},\n" + " \"QUERY./get[shard].requestTimes\":{\"count\":22},\n" @@ -426,24 +448,14 @@ public void testCoresMetricsApiCallerMissingIndex() throws Exception { + " \"QUERY./select[shard].requestTimes\":{\"count\":23},\n" + " \"UPDATE./update.requestTimes\":{\"count\":40},\n" + " \"UPDATE./update[local].requestTimes\":{\"count\":24},\n" - + " \"UPDATE.updateHandler.autoCommits\":25,\n" + + " \"UPDATE.updateHandler.autoCommits\":{\"count\":25},\n" + " \"UPDATE.updateHandler.cumulativeDeletesById\":{\"count\":26},\n" + " \"UPDATE.updateHandler.cumulativeDeletesByQuery\":{\"count\":27},\n" - + " \"UPDATE.updateHandler.softAutoCommits\":28}}}"; + + " \"UPDATE.updateHandler.cumulativeAdds\":{\"count\":0},\n" + + " \"UPDATE.updateHandler.cumulativeErrors\":{\"count\":0},\n" + + " \"UPDATE.updateHandler.softAutoCommits\":{\"count\":28}}}}"; String output = - "# HELP merges_major cumulative number of major merges across cores\n" - + "# TYPE merges_major counter\n" - + "merges_major -4\n" - + "# HELP merges_major_current_docs current number of docs in major merges across cores\n" - + "# TYPE merges_major_current_docs gauge\n" - + "merges_major_current_docs -4\n" - + "# HELP merges_minor cumulative number of minor merges across cores\n" - + "# TYPE merges_minor counter\n" - + "merges_minor -4\n" - + "# HELP merges_minor_current_docs current number of docs in minor merges across cores\n" - + "# TYPE merges_minor_current_docs gauge\n" - + "merges_minor_current_docs -4\n" - + "# HELP top_level_requests_get cumulative number of top-level gets across cores\n" + "# HELP top_level_requests_get cumulative number of top-level gets across cores\n" + "# TYPE top_level_requests_get counter\n" + "top_level_requests_get 134\n" + "# HELP sub_shard_requests_get cumulative number of sub (spawned by re-distributing a top-level req) gets across cores\n" @@ -467,15 +479,23 @@ public void testCoresMetricsApiCallerMissingIndex() throws Exception { + "# HELP auto_commits_soft cumulative number of soft auto commits across cores\n" + "# TYPE auto_commits_soft counter\n" + "auto_commits_soft 70\n" - + "# HELP commits cumulative number of commits across cores\n" - + "# TYPE commits counter\n" - + "commits -4\n" + "# HELP deletes_by_id cumulative number of deletes by id across cores\n" + "# TYPE deletes_by_id counter\n" + "deletes_by_id 62\n" + "# HELP deletes_by_query cumulative number of deletes by query across cores\n" + "# TYPE deletes_by_query counter\n" - + "deletes_by_query 66\n"; - assertMetricsApiCaller(new PrometheusMetricsServlet.CoresMetricsApiCaller(), json, 25, output); + + "deletes_by_query 66\n" + + "# HELP doc_adds cumulative number of docs added across cores\n" + + "# TYPE doc_adds counter\n" + + "doc_adds 0\n" + + "# HELP update_errors cumulative number of errors during updates across cores\n" + + "# TYPE update_errors counter\n" + + "update_errors 0\n"; + assertMetricsApiCaller( + new PrometheusMetricsServlet.CoresMetricsApiCaller( + Arrays.asList(PrometheusMetricsServlet.CoreMetric.values())), + json, + 25, + output); } }