diff --git a/solr/core/src/java/org/apache/solr/servlet/PrometheusMetricsServlet.java b/solr/core/src/java/org/apache/solr/servlet/PrometheusMetricsServlet.java index 364148ee849..782af4061d2 100644 --- a/solr/core/src/java/org/apache/solr/servlet/PrometheusMetricsServlet.java +++ b/solr/core/src/java/org/apache/solr/servlet/PrometheusMetricsServlet.java @@ -40,6 +40,7 @@ import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Supplier; +import java.util.stream.Collectors; import javax.servlet.ServletOutputStream; import javax.servlet.UnavailableException; import javax.servlet.WriteListener; @@ -71,16 +72,13 @@ public final class PrometheusMetricsServlet extends BaseSolrServlet { private final List callers = getCallers(); private List getCallers() { - AggregateMetricsApiCaller aggregateMetricsApiCaller = new AggregateMetricsApiCaller(); return List.of( new GarbageCollectorMetricsApiCaller(), new MemoryMetricsApiCaller(), new OsMetricsApiCaller(), new ThreadMetricsApiCaller(), new StatusCodeMetricsApiCaller(), - aggregateMetricsApiCaller, - new CoresMetricsApiCaller( - Collections.unmodifiableList(aggregateMetricsApiCaller.missingCoreMetrics))); + new AggregateMetricsApiCaller()); } private final Map cacheMetricTypes = @@ -566,6 +564,66 @@ protected void handle(List results, JsonNode metrics) throws I } enum CoreMetric { + MAJOR_MERGE( + "INDEX.merge.major", "merges_major", "cumulative number of major merges across cores"), + MAJOR_MERGE_RUNNING_DOCS( + "INDEX.merge.major.running.docs", + "merges_major_current_docs", + "current number of docs in major merges across cores", + null, + PrometheusMetricType.GAUGE), + MINOR_MERGE( + "INDEX.merge.minor", "merges_minor", "cumulative number of minor merges across cores"), + MINOR_MERGE_RUNNING_DOCS( + "INDEX.merge.minor.running.docs", + "merges_minor_current_docs", + "current number of docs in minor merges across cores", + null, + PrometheusMetricType.GAUGE), + GET( + "QUERY./get.requestTimes", + "top_level_requests_get", + "cumulative number of top-level gets across cores"), + GET_DURATION_P50( + "QUERY./get.requestTimes", + "top_level_requests_get_duration_p50", + "top-level gets p50 duration", + "median_ms", + PrometheusMetricType.GAUGE), + GET_DURATION_P95( + "QUERY./get.requestTimes", + "top_level_requests_get_duration_p95", + "top-level gets p95 duration", + "p95_ms", + PrometheusMetricType.GAUGE), + GET_DURATION_P99( + "QUERY./get.requestTimes", + "top_level_requests_get_duration_p99", + "top-level gets p99 duration", + "p99_ms", + PrometheusMetricType.GAUGE), + GET_SUBSHARD( + "QUERY./get[shard].requestTimes", + "sub_shard_requests_get", + "cumulative number of sub (spawned by re-distributing a top-level req) gets across cores"), + GET_SUBSHARD_DURATION_P50( + "QUERY./get[shard].requestTimes", + "sub_shard_requests_get_duration_p50", + "sub shard gets p50 duration", + "median_ms", + PrometheusMetricType.GAUGE), + GET_SUBSHARD_DURATION_P95( + "QUERY./get[shard].requestTimes", + "sub_shard_requests_get_duration_p95", + "sub shard gets p95 duration", + "p95_ms", + PrometheusMetricType.GAUGE), + GET_SUBSHARD_DURATION_P99( + "QUERY./get[shard].requestTimes", + "sub_shard_requests_get_duration_p99", + "sub shard gets p99 duration", + "p99_ms", + PrometheusMetricType.GAUGE), SELECT( "QUERY./select.requestTimes", "top_level_requests_select", @@ -654,59 +712,6 @@ enum CoreMetric { "local updates p99 duration", "p99_ms", PrometheusMetricType.GAUGE), - GET( - "QUERY./get.requestTimes", - "top_level_requests_get", - "cumulative number of top-level gets across cores"), - GET_DURATION_P50( - "QUERY./get.requestTimes", - "top_level_requests_get_duration_p50", - "top-level gets p50 duration", - "median_ms", - PrometheusMetricType.GAUGE), - GET_DURATION_P95( - "QUERY./get.requestTimes", - "top_level_requests_get_duration_p95", - "top-level gets p95 duration", - "p95_ms", - PrometheusMetricType.GAUGE), - GET_DURATION_P99( - "QUERY./get.requestTimes", - "top_level_requests_get_duration_p99", - "top-level gets p99 duration", - "p99_ms", - PrometheusMetricType.GAUGE), - GET_SUBSHARD( - "QUERY./get[shard].requestTimes", - "sub_shard_requests_get", - "cumulative number of sub (spawned by re-distributing a top-level req) gets across cores"), - GET_SUBSHARD_DURATION_P50( - "QUERY./get[shard].requestTimes", - "sub_shard_requests_get_duration_p50", - "sub shard gets p50 duration", - "median_ms", - PrometheusMetricType.GAUGE), - GET_SUBSHARD_DURATION_P95( - "QUERY./get[shard].requestTimes", - "sub_shard_requests_get_duration_p95", - "sub shard gets p95 duration", - "p95_ms", - PrometheusMetricType.GAUGE), - GET_SUBSHARD_DURATION_P99( - "QUERY./get[shard].requestTimes", - "sub_shard_requests_get_duration_p99", - "sub shard gets p99 duration", - "p99_ms", - PrometheusMetricType.GAUGE), - COMMITS("UPDATE.updateHandler.commits", "commits", "cumulative number of commits across cores"), - DEL_BY_ID( - "UPDATE.updateHandler.cumulativeDeletesById", - "deletes_by_id", - "cumulative number of deletes by id across cores"), - DEL_BY_Q( - "UPDATE.updateHandler.cumulativeDeletesByQuery", - "deletes_by_query", - "cumulative number of deletes by query across cores"), AUTOCOMMIT( "UPDATE.updateHandler.autoCommits", "auto_commits_hard", @@ -719,23 +724,15 @@ enum CoreMetric { "cumulative number of soft auto commits across cores", null, PrometheusMetricType.COUNTER), - - MAJOR_MERGE( - "INDEX.merge.major", "merges_major", "cumulative number of major merges across cores"), - MAJOR_MERGE_RUNNING_DOCS( - "INDEX.merge.major.running.docs", - "merges_major_current_docs", - "current number of docs in major merges across cores", - null, - PrometheusMetricType.GAUGE), - MINOR_MERGE( - "INDEX.merge.minor", "merges_minor", "cumulative number of minor merges across cores"), - MINOR_MERGE_RUNNING_DOCS( - "INDEX.merge.minor.running.docs", - "merges_minor_current_docs", - "current number of docs in minor merges across cores", - null, - PrometheusMetricType.GAUGE), + COMMITS("UPDATE.updateHandler.commits", "commits", "cumulative number of commits across cores"), + CUMULATIVE_DEL_BY_ID( + "UPDATE.updateHandler.cumulativeDeletesById", + "deletes_by_id", + "cumulative number of deletes by id across cores"), + CUMULATIVE_DEL_BY_Q( + "UPDATE.updateHandler.cumulativeDeletesByQuery", + "deletes_by_query", + "cumulative number of deletes by query across cores"), CUMULATIVE_DOC_ADDS( "UPDATE.updateHandler.cumulativeAdds", "doc_adds", @@ -744,16 +741,6 @@ enum CoreMetric { "UPDATE.updateHandler.cumulativeErrors", "update_errors", "cumulative number of errors during updates across cores"), - CUMULATIVE_DEL_BY_ID( - "UPDATE.updateHandler.cumulativeDeletesById", - "cumulative_delete_by_id", - "cumulative number delete by id across cores"), - - CUMULATIVE_DEL_BY_Q( - "UPDATE.updateHandler.cumulativeDeletesByQuery", - "cumulative_delete_by_q", - "cumulative number delete by queries across cores"), - MERGES("UPDATE.updateHandler.merges", "merges", "cumulative number of merges across cores"), OPTIMIZE( "UPDATE.updateHandler.optimizes", @@ -806,108 +793,89 @@ PrometheusMetric createPrometheusMetric(Number value, String descriptionSuffix) } } - static class AggregateMetricsApiCaller extends MetricsByKeyApiCaller { - List missingCoreMetrics = new ArrayList<>(); - - /*"metrics":{ - "solr.node":{ + /** + * A caller that fetch metrics from both groups "solr.node" (node aggregated metrics) and "core" + * (per core metrics) and match it to all the values in enum CoreMetric. The goal is to provide + * node level metrics on the CoreMetric values. + * + *

It first iterates on the "solr.node" metrics, if a core metric is not found there, then it + * will look it up per core and sum them up as the node metrics. + */ + static class AggregateMetricsApiCaller extends MetricsByPrefixApiCaller { + /* + "metrics":{ + "solr.node":{ //node aggregated metrics "QUERY./select.requestTimes":{"count":2}, "QUERY./select[shard].requestTimes":{"count":0}, "UPDATE./update.requestTimes":{"count":2}, - "UPDATE./update[local].requestTimes":{"count":0}}}}*/ + "UPDATE./update[local].requestTimes":{"count":0} + ... + }, + "solr.core.loadtest.shard1_1.replica_n8":{ //pre core metrics + "QUERY./select.requestTimes":{"count":1}, + "QUERY./select[shard].requestTimes":{"count":0}, + "UPDATE./update.requestTimes":{"count":1}, + "UPDATE./update[local].requestTimes":{"count":0} + ... + }, + "solr.core.loadtest.shard2_1.replica_n10":{ + "QUERY./select.requestTimes":{"count":0}, + "QUERY./select[shard].requestTimes":{"count":0}, + "UPDATE./update.requestTimes":{"count":1}, + "UPDATE./update[local].requestTimes":{"count":0} + ... + }, + ... + */ AggregateMetricsApiCaller() { - super("solr.node", buildKeys()); + super("solr.node,core", buildPrefix(), buildProperty()); } - private static String buildQueryKey(CoreMetric metric) { - return "solr.node:" + metric.key + (metric.property != null ? (":" + metric.property) : ""); + private static String buildPrefix() { + return String.join( + ",", Arrays.stream(CoreMetric.values()).map(m -> m.key).toArray(String[]::new)); } - private static String[] buildKeys() { - return Arrays.stream(CoreMetric.values()).map(m -> buildQueryKey(m)).toArray(String[]::new); + private static String buildProperty() { + return String.join( + ",", + Arrays.stream(CoreMetric.values()) + .filter(m -> m.property != null) + .map(m -> m.property) + .collect(Collectors.toSet())); } @Override protected void handle(List results, JsonNode metricsNode) throws IOException { - missingCoreMetrics.clear(); - - for (CoreMetric metric : CoreMetric.values()) { - Number value = getNumber(metricsNode, buildQueryKey(metric)); - if (!INVALID_NUMBER.equals(value)) { - results.add(metric.createPrometheusMetric(value, "[node aggregated]")); - } else { - missingCoreMetrics.add(metric); - } - } - } - } - - /** - * Collector that get metrics from all the cores and then sum those metrics by CoreMetric key. - * - *

This runs after AggregateMetricsApiCaller and pick up whatever is missing from it by reading - * missingCoreMetricsView. - * - *

Therefore, this has dependency on AggregateMetricsApiCaller and should not be executed - * concurrently with it. - */ - static class CoresMetricsApiCaller extends MetricsApiCaller { - private final List missingCoreMetricsView; - - CoresMetricsApiCaller(List missingCoreMetricsView) { - this.missingCoreMetricsView = missingCoreMetricsView; - } - - @Override - protected String buildQueryString() { - List prefixes = new ArrayList<>(); - List properties = new ArrayList<>(); - for (CoreMetric missingMetric : missingCoreMetricsView) { - prefixes.add(missingMetric.key); - if (missingMetric.property != null) { - properties.add(missingMetric.property); + List missingCoreMetrics = new ArrayList<>(); + JsonNode nodeMetricNode = metricsNode.get("solr.node"); + + if (nodeMetricNode != null) { + for (CoreMetric metric : CoreMetric.values()) { + Number value = + metric.property != null + ? getNumber(nodeMetricNode, metric.key, metric.property) + : getNumber(nodeMetricNode, metric.key); + if (!INVALID_NUMBER.equals(value)) { + results.add(metric.createPrometheusMetric(value, "[node aggregated]")); + } else { + missingCoreMetrics.add(metric); + } } + } else { + log.warn( + "Cannot find the solr.node metrics, going to fall back to getting metrics from all cores"); + missingCoreMetrics.addAll(Arrays.asList(CoreMetric.values())); } - return String.format( - Locale.ROOT, - "wt=json&indent=false&compact=true&group=%s&prefix=%s&property=%s", - "core", - URLEncoder.encode(String.join(",", prefixes), StandardCharsets.UTF_8), - URLEncoder.encode(String.join(",", properties), StandardCharsets.UTF_8)); - } - - /* - "metrics":{ - "solr.core.loadtest.shard1_1.replica_n8":{ - "INDEX.merge.errors":0, - "INDEX.merge.major":{"count":0}, - "INDEX.merge.major.running":0, - "INDEX.merge.major.running.docs":0, - "INDEX.merge.major.running.segments":0, - "INDEX.merge.minor":{"count":0}, - "INDEX.merge.minor.running":0, - "INDEX.merge.minor.running.docs":0, - "INDEX.merge.minor.running.segments":0, - "QUERY./get.requestTimes":{"count":0}, - "QUERY./get[shard].requestTimes":{"count":0}, - "QUERY./select.requestTimes":{"count":2}, - "QUERY./select[shard].requestTimes":{"count":0}, - "UPDATE./update.requestTimes":{"count":0}, - "UPDATE./update[local].requestTimes":{"count":0}, - "UPDATE.updateHandler.autoCommits":0, - "UPDATE.updateHandler.commits":{"count":14877}, - "UPDATE.updateHandler.cumulativeDeletesById":{"count":0}, - "UPDATE.updateHandler.cumulativeDeletesByQuery":{"count":0}, - "UPDATE.updateHandler.softAutoCommits":0}, - ... - */ - - @Override - protected void handle(List results, JsonNode metrics) throws IOException { Map accumulative = new LinkedHashMap<>(); - for (CoreMetric missingCoreMetric : missingCoreMetricsView) { - for (JsonNode coreMetricNode : metrics) { + for (Map.Entry entry : metricsNode.properties()) { + if ("solr.node".equals(entry.getKey())) { // this one is not a core + continue; + } + JsonNode coreMetricNode = entry.getValue(); + for (CoreMetric missingCoreMetric : + missingCoreMetrics) { // only iterate on those that aren't accounted for by "solr.node" Number val = missingCoreMetric.property != null ? getNumber(coreMetricNode, missingCoreMetric.key, missingCoreMetric.property) @@ -1050,47 +1018,33 @@ protected abstract void handle(List results, JsonNode metrics) protected abstract String buildQueryString(); } - private abstract static class MetricsByKeyApiCaller extends MetricsApiCaller { - private final String group; - private final String[] keys; - - private MetricsByKeyApiCaller(String group, String[] keys) { - this.group = group; - this.keys = keys; - } - - @Override - protected String buildQueryString() { - String keyClause = - Arrays.stream(keys) - .reduce("", (s, key) -> s + "&key=" + URLEncoder.encode(key, StandardCharsets.UTF_8)); - return String.format( - Locale.ROOT, - "wt=json&indent=false&compact=true&group=%s%s", - URLEncoder.encode(group, StandardCharsets.UTF_8), - keyClause); - } - } - private abstract static class MetricsByPrefixApiCaller extends MetricsApiCaller { protected final String group; protected final String prefix; - protected final String property; + protected final String[] properties; + protected final String property; // for backward compatibility - MetricsByPrefixApiCaller(String group, String prefix, String property) { + MetricsByPrefixApiCaller(String group, String prefix, String... properties) { this.group = group; this.prefix = prefix; - this.property = property; + this.properties = properties; + this.property = properties.length > 1 ? properties[0] : null; } @Override protected String buildQueryString() { + String propertyClause = + String.join( + "&property=", + Arrays.stream(properties) + .map(p -> URLEncoder.encode(p, StandardCharsets.UTF_8)) + .collect(Collectors.toSet())); return String.format( Locale.ROOT, - "wt=json&indent=false&compact=true&group=%s&prefix=%s&property=%s", + "wt=json&indent=false&compact=true&group=%s&prefix=%s%s", URLEncoder.encode(group, StandardCharsets.UTF_8), URLEncoder.encode(prefix, StandardCharsets.UTF_8), - URLEncoder.encode(property, StandardCharsets.UTF_8)); + propertyClause); } } diff --git a/solr/core/src/test/org/apache/solr/servlet/PrometheusMetricsServletTest.java b/solr/core/src/test/org/apache/solr/servlet/PrometheusMetricsServletTest.java index f8079b060e4..c153e919948 100644 --- a/solr/core/src/test/org/apache/solr/servlet/PrometheusMetricsServletTest.java +++ b/solr/core/src/test/org/apache/solr/servlet/PrometheusMetricsServletTest.java @@ -16,24 +16,6 @@ */ package org.apache.solr.servlet; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.AUTOCOMMIT; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.COMMITS; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.CUMULATIVE_DOC_ADDS; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.CUMULATIVE_ERRS; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.DEL_BY_ID; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.DEL_BY_Q; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.GET; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.GET_SUBSHARD; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.LOCAL_UPDATE; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.MAJOR_MERGE; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.MAJOR_MERGE_RUNNING_DOCS; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.MINOR_MERGE; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.MINOR_MERGE_RUNNING_DOCS; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.SELECT; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.SELECT_SUBSHARD; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.SOFT_AUTOCOMMIT; -import static org.apache.solr.servlet.PrometheusMetricsServlet.CoreMetric.UPDATE; - import com.fasterxml.jackson.databind.ObjectMapper; import java.io.PrintWriter; import java.io.StringWriter; @@ -404,28 +386,7 @@ public void testCoresMetricsApiCaller() throws Exception { + "# TYPE update_errors counter\n" + "update_errors 4\n"; assertMetricsApiCaller( - new PrometheusMetricsServlet.CoresMetricsApiCaller( - List.of( - MAJOR_MERGE, - MAJOR_MERGE_RUNNING_DOCS, - MINOR_MERGE, - MINOR_MERGE_RUNNING_DOCS, - GET, - GET_SUBSHARD, - SELECT, - SELECT_SUBSHARD, - UPDATE, - LOCAL_UPDATE, - AUTOCOMMIT, - SOFT_AUTOCOMMIT, - COMMITS, - DEL_BY_ID, - DEL_BY_Q, - CUMULATIVE_DOC_ADDS, - CUMULATIVE_ERRS)), - json, - 14, - output); + new PrometheusMetricsServlet.AggregateMetricsApiCaller(), json, 14, output); } @Test @@ -526,27 +487,6 @@ public void testCoresMetricsApiCallerMissingIndex() throws Exception { + "# TYPE update_errors counter\n" + "update_errors 0\n"; assertMetricsApiCaller( - new PrometheusMetricsServlet.CoresMetricsApiCaller( - List.of( - MAJOR_MERGE, - MAJOR_MERGE_RUNNING_DOCS, - MINOR_MERGE, - MINOR_MERGE_RUNNING_DOCS, - GET, - GET_SUBSHARD, - SELECT, - SELECT_SUBSHARD, - UPDATE, - LOCAL_UPDATE, - AUTOCOMMIT, - SOFT_AUTOCOMMIT, - COMMITS, - DEL_BY_ID, - DEL_BY_Q, - CUMULATIVE_DOC_ADDS, - CUMULATIVE_ERRS)), - json, - 25, - output); + new PrometheusMetricsServlet.AggregateMetricsApiCaller(), json, 25, output); } }