From 3566ee9a0129394cb23cc40c6af1f19a7fa68193 Mon Sep 17 00:00:00 2001 From: Artem Prigoda Date: Thu, 20 Jun 2024 15:22:41 +0200 Subject: [PATCH] Track RequestedRangeNotSatisfiedException separately in S3 Metrics (#109657) Due to RCO changes, we started getting a lot of `RequestedRangeNotSatisfiedExceptions` which are expected. We would like track them separately. This change adds two new metrics to track all client errors analogous to other S3 errors. * es.repositories.exceptions.request_range_not_satisfied.total * es.repositories.exceptions.request_range_not_satisfied.histogram In the future, we can add the error code as an attribute to the metrics, so we can adapt it to all client errors. --- docs/changelog/109657.yaml | 5 ++ .../s3/S3BlobStoreRepositoryMetricsTests.java | 68 ++++++++++++++++--- .../repositories/s3/S3BlobStore.java | 18 +++++ .../repositories/RepositoriesMetrics.java | 8 +++ 4 files changed, 88 insertions(+), 11 deletions(-) create mode 100644 docs/changelog/109657.yaml diff --git a/docs/changelog/109657.yaml b/docs/changelog/109657.yaml new file mode 100644 index 0000000000000..35b315b7568c9 --- /dev/null +++ b/docs/changelog/109657.yaml @@ -0,0 +1,5 @@ +pr: 109657 +summary: Track `RequestedRangeNotSatisfiedException` separately in S3 Metrics +area: Snapshot/Restore +type: enhancement +issues: [] diff --git a/modules/repository-s3/src/internalClusterTest/java/org/elasticsearch/repositories/s3/S3BlobStoreRepositoryMetricsTests.java b/modules/repository-s3/src/internalClusterTest/java/org/elasticsearch/repositories/s3/S3BlobStoreRepositoryMetricsTests.java index f8503bca3ec67..640293ecb80b0 100644 --- a/modules/repository-s3/src/internalClusterTest/java/org/elasticsearch/repositories/s3/S3BlobStoreRepositoryMetricsTests.java +++ b/modules/repository-s3/src/internalClusterTest/java/org/elasticsearch/repositories/s3/S3BlobStoreRepositoryMetricsTests.java @@ -14,7 +14,6 @@ import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.common.blobstore.BlobContainer; import org.elasticsearch.common.blobstore.BlobPath; -import org.elasticsearch.common.blobstore.BlobStore; import org.elasticsearch.common.blobstore.OperationPurpose; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.collect.Iterators; @@ -23,6 +22,7 @@ import org.elasticsearch.plugins.PluginsService; import org.elasticsearch.repositories.RepositoriesService; import org.elasticsearch.repositories.blobstore.BlobStoreRepository; +import org.elasticsearch.repositories.blobstore.RequestedRangeNotSatisfiedException; import org.elasticsearch.repositories.s3.S3BlobStore.Operation; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.telemetry.Measurement; @@ -39,6 +39,7 @@ import static org.elasticsearch.repositories.RepositoriesMetrics.HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM; import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_HISTOGRAM; +import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL; import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_TOTAL; import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_OPERATIONS_TOTAL; import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_REQUESTS_TOTAL; @@ -47,8 +48,10 @@ import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL; import static org.elasticsearch.rest.RestStatus.INTERNAL_SERVER_ERROR; import static org.elasticsearch.rest.RestStatus.NOT_FOUND; +import static org.elasticsearch.rest.RestStatus.REQUESTED_RANGE_NOT_SATISFIED; import static org.elasticsearch.rest.RestStatus.TOO_MANY_REQUESTS; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.instanceOf; @SuppressForbidden(reason = "this test uses a HttpServer to emulate an S3 endpoint") // Need to set up a new cluster for each test because cluster settings use randomized authentication settings @@ -80,22 +83,29 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { .build(); } - public void testMetricsWithErrors() throws IOException { - final String repository = createRepository(randomRepositoryName()); - - final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData); - final var blobStoreRepository = (BlobStoreRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName) - .repository(repository); - final BlobStore blobStore = blobStoreRepository.blobStore(); - final TestTelemetryPlugin plugin = internalCluster().getInstance(PluginsService.class, dataNodeName) + private static TestTelemetryPlugin getPlugin(String dataNodeName) { + var plugin = internalCluster().getInstance(PluginsService.class, dataNodeName) .filterPlugins(TestTelemetryPlugin.class) .findFirst() .orElseThrow(); - plugin.resetMeter(); + return plugin; + } + + private static BlobContainer getBlobContainer(String dataNodeName, String repository) { + final var blobStoreRepository = (BlobStoreRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName) + .repository(repository); + return blobStoreRepository.blobStore().blobContainer(BlobPath.EMPTY.add(randomIdentifier())); + } + + public void testMetricsWithErrors() throws IOException { + final String repository = createRepository(randomRepositoryName()); + + final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData); + final TestTelemetryPlugin plugin = getPlugin(dataNodeName); final OperationPurpose purpose = randomFrom(OperationPurpose.values()); - final BlobContainer blobContainer = blobStore.blobContainer(BlobPath.EMPTY.add(randomIdentifier())); + final BlobContainer blobContainer = getBlobContainer(dataNodeName, repository); final String blobName = randomIdentifier(); // Put a blob @@ -132,6 +142,9 @@ public void testMetricsWithErrors() throws IOException { assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch)); assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch)); assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch)); + + // Make sure we don't hit the request range not satisfied counters + assertThat(getLongCounterValue(plugin, METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, Operation.GET_OBJECT), equalTo(0L)); } // List retry exhausted @@ -166,6 +179,39 @@ public void testMetricsWithErrors() throws IOException { assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(1L)); } + public void testMetricsForRequestRangeNotSatisfied() { + final String repository = createRepository(randomRepositoryName()); + final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData); + final BlobContainer blobContainer = getBlobContainer(dataNodeName, repository); + final TestTelemetryPlugin plugin = getPlugin(dataNodeName); + + final OperationPurpose purpose = randomFrom(OperationPurpose.values()); + final String blobName = randomIdentifier(); + + for (int i = 0; i < randomIntBetween(1, 3); i++) { + final long batch = i + 1; + addErrorStatus(TOO_MANY_REQUESTS, TOO_MANY_REQUESTS, REQUESTED_RANGE_NOT_SATISFIED); + try { + blobContainer.readBlob(purpose, blobName).close(); + } catch (Exception e) { + assertThat(e, instanceOf(RequestedRangeNotSatisfiedException.class)); + } + + assertThat(getLongCounterValue(plugin, METRIC_REQUESTS_TOTAL, Operation.GET_OBJECT), equalTo(3 * batch)); + assertThat(getLongCounterValue(plugin, METRIC_OPERATIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch)); + assertThat(getLongCounterValue(plugin, METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch)); + assertThat(getLongCounterValue(plugin, METRIC_EXCEPTIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch)); + assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch)); + assertThat( + getLongCounterValue(plugin, METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, Operation.GET_OBJECT), + equalTo(batch) + ); + assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.GET_OBJECT), equalTo(2 * batch)); + assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(2 * batch)); + assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch)); + } + } + private void addErrorStatus(RestStatus... statuses) { errorStatusQueue.addAll(Arrays.asList(statuses)); } diff --git a/modules/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobStore.java b/modules/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobStore.java index 2aff610dc82e9..5af53364fb765 100644 --- a/modules/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobStore.java +++ b/modules/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobStore.java @@ -52,6 +52,7 @@ import java.util.stream.Collectors; import static org.elasticsearch.core.Strings.format; +import static org.elasticsearch.rest.RestStatus.REQUESTED_RANGE_NOT_SATISFIED; class S3BlobStore implements BlobStore { @@ -177,6 +178,23 @@ public final void collectMetrics(Request request, Response response) { .map(List::size) .orElse(0); + if (exceptionCount > 0) { + final List statusCodes = Objects.requireNonNullElse( + awsRequestMetrics.getProperty(AWSRequestMetrics.Field.StatusCode), + List.of() + ); + // REQUESTED_RANGE_NOT_SATISFIED errors are expected errors due to RCO + // TODO Add more expected client error codes? + final long amountOfRequestRangeNotSatisfiedErrors = statusCodes.stream() + .filter(e -> (Integer) e == REQUESTED_RANGE_NOT_SATISFIED.getStatus()) + .count(); + if (amountOfRequestRangeNotSatisfiedErrors > 0) { + s3RepositoriesMetrics.common() + .requestRangeNotSatisfiedExceptionCounter() + .incrementBy(amountOfRequestRangeNotSatisfiedErrors, attributes); + } + } + s3RepositoriesMetrics.common().operationCounter().incrementBy(1, attributes); if (numberOfAwsErrors == requestCount) { s3RepositoriesMetrics.common().unsuccessfulOperationCounter().incrementBy(1, attributes); diff --git a/server/src/main/java/org/elasticsearch/repositories/RepositoriesMetrics.java b/server/src/main/java/org/elasticsearch/repositories/RepositoriesMetrics.java index 50aa7881cd2b6..85f06580cee79 100644 --- a/server/src/main/java/org/elasticsearch/repositories/RepositoriesMetrics.java +++ b/server/src/main/java/org/elasticsearch/repositories/RepositoriesMetrics.java @@ -16,6 +16,7 @@ public record RepositoriesMetrics( MeterRegistry meterRegistry, LongCounter requestCounter, LongCounter exceptionCounter, + LongCounter requestRangeNotSatisfiedExceptionCounter, LongCounter throttleCounter, LongCounter operationCounter, LongCounter unsuccessfulOperationCounter, @@ -28,6 +29,8 @@ public record RepositoriesMetrics( public static final String METRIC_REQUESTS_TOTAL = "es.repositories.requests.total"; public static final String METRIC_EXCEPTIONS_TOTAL = "es.repositories.exceptions.total"; + public static final String METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL = + "es.repositories.exceptions.request_range_not_satisfied.total"; public static final String METRIC_THROTTLES_TOTAL = "es.repositories.throttles.total"; public static final String METRIC_OPERATIONS_TOTAL = "es.repositories.operations.total"; public static final String METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL = "es.repositories.operations.unsuccessful.total"; @@ -40,6 +43,11 @@ public RepositoriesMetrics(MeterRegistry meterRegistry) { meterRegistry, meterRegistry.registerLongCounter(METRIC_REQUESTS_TOTAL, "repository request counter", "unit"), meterRegistry.registerLongCounter(METRIC_EXCEPTIONS_TOTAL, "repository request exception counter", "unit"), + meterRegistry.registerLongCounter( + METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, + "repository request RequestedRangeNotSatisfiedException counter", + "unit" + ), meterRegistry.registerLongCounter(METRIC_THROTTLES_TOTAL, "repository request throttle counter", "unit"), meterRegistry.registerLongCounter(METRIC_OPERATIONS_TOTAL, "repository operation counter", "unit"), meterRegistry.registerLongCounter(METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL, "repository unsuccessful operation counter", "unit"),