Skip to content

Commit

Permalink
File-based settings health indicator (elastic#117081)
Browse files Browse the repository at this point in the history
* Add FileSettingsService health indicator

* spotless

* YELLOW for any failure, plus most_recent_failure
  • Loading branch information
prdoyle authored Nov 21, 2024
1 parent 06840ba commit 1a4b3d3
Show file tree
Hide file tree
Showing 5 changed files with 230 additions and 12 deletions.
14 changes: 10 additions & 4 deletions server/src/main/java/org/elasticsearch/node/NodeConstruction.java
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@
import org.elasticsearch.reservedstate.ReservedClusterStateHandlerProvider;
import org.elasticsearch.reservedstate.action.ReservedClusterSettingsAction;
import org.elasticsearch.reservedstate.service.FileSettingsService;
import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService;
import org.elasticsearch.rest.action.search.SearchResponseMetrics;
import org.elasticsearch.script.ScriptModule;
import org.elasticsearch.script.ScriptService;
Expand Down Expand Up @@ -1032,10 +1033,12 @@ private void construct(
actionModule.getReservedClusterStateService().installStateHandler(new ReservedRepositoryAction(repositoriesService));
actionModule.getReservedClusterStateService().installStateHandler(new ReservedPipelineAction());

FileSettingsHealthIndicatorService fileSettingsHealthIndicatorService = new FileSettingsHealthIndicatorService();
FileSettingsService fileSettingsService = new FileSettingsService(
clusterService,
actionModule.getReservedClusterStateService(),
environment
environment,
fileSettingsHealthIndicatorService
);

RestoreService restoreService = new RestoreService(
Expand Down Expand Up @@ -1129,7 +1132,8 @@ private void construct(
featureService,
threadPool,
telemetryProvider,
repositoriesService
repositoriesService,
fileSettingsHealthIndicatorService
)
);

Expand Down Expand Up @@ -1301,7 +1305,8 @@ private Module loadDiagnosticServices(
FeatureService featureService,
ThreadPool threadPool,
TelemetryProvider telemetryProvider,
RepositoriesService repositoriesService
RepositoriesService repositoriesService,
FileSettingsHealthIndicatorService fileSettingsHealthIndicatorService
) {

MasterHistoryService masterHistoryService = new MasterHistoryService(transportService, threadPool, clusterService);
Expand All @@ -1316,7 +1321,8 @@ private Module loadDiagnosticServices(
new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService),
new RepositoryIntegrityHealthIndicatorService(clusterService, featureService),
new DiskHealthIndicatorService(clusterService, featureService),
new ShardsCapacityHealthIndicatorService(clusterService, featureService)
new ShardsCapacityHealthIndicatorService(clusterService, featureService),
fileSettingsHealthIndicatorService
);
var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)
.flatMap(plugin -> plugin.getHealthIndicatorServices().stream());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,27 @@
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.file.MasterNodeFileWatchingService;
import org.elasticsearch.env.Environment;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
import org.elasticsearch.health.HealthIndicatorService;
import org.elasticsearch.health.SimpleHealthIndicatorDetails;
import org.elasticsearch.health.node.HealthInfo;
import org.elasticsearch.xcontent.XContentParseException;
import org.elasticsearch.xcontent.XContentParserConfiguration;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

import static org.elasticsearch.health.HealthStatus.GREEN;
import static org.elasticsearch.health.HealthStatus.YELLOW;
import static org.elasticsearch.health.ImpactArea.DEPLOYMENT_MANAGEMENT;
import static org.elasticsearch.reservedstate.service.ReservedStateVersionCheck.HIGHER_OR_SAME_VERSION;
import static org.elasticsearch.reservedstate.service.ReservedStateVersionCheck.HIGHER_VERSION_ONLY;
import static org.elasticsearch.xcontent.XContentType.JSON;
Expand All @@ -53,17 +66,29 @@ public class FileSettingsService extends MasterNodeFileWatchingService implement
public static final String NAMESPACE = "file_settings";
public static final String OPERATOR_DIRECTORY = "operator";
private final ReservedClusterStateService stateService;
private final FileSettingsHealthIndicatorService healthIndicatorService;

/**
* Constructs the {@link FileSettingsService}
*
* @param clusterService so we can register ourselves as a cluster state change listener
* @param stateService an instance of the immutable cluster state controller, so we can perform the cluster state changes
* @param environment we need the environment to pull the location of the config and operator directories
* @param healthIndicatorService tracks the success or failure of file-based settings
*/
public FileSettingsService(ClusterService clusterService, ReservedClusterStateService stateService, Environment environment) {
public FileSettingsService(
    ClusterService clusterService,
    ReservedClusterStateService stateService,
    Environment environment,
    FileSettingsHealthIndicatorService healthIndicatorService
) {
    // The watched file is the settings file inside the operator directory of the node's config directory.
    super(
        clusterService,
        environment.configFile().toAbsolutePath().resolve(OPERATOR_DIRECTORY).resolve(SETTINGS_FILE_NAME)
    );
    // Kept so file processing can report changes, successes and failures to the health indicator.
    this.healthIndicatorService = healthIndicatorService;
    // Kept so the cluster state changes can be performed.
    this.stateService = stateService;
}

/**
 * Returns the health indicator that this service reports file-settings changes, successes and failures to.
 *
 * @return the indicator instance that was supplied to the constructor
 */
public FileSettingsHealthIndicatorService healthIndicatorService() {
    return this.healthIndicatorService;
}

/**
Expand Down Expand Up @@ -121,6 +146,7 @@ protected boolean shouldRefreshFileState(ClusterState clusterState) {
@Override
protected void processFileChanges() throws ExecutionException, InterruptedException, IOException {
logger.info("processing path [{}] for [{}]", watchedFile(), NAMESPACE);
// Count the change before attempting it, so the health indicator no longer reports
// "no changes" even if the processing below fails.
healthIndicatorService.changeOccurred();
// NOTE(review): HIGHER_VERSION_ONLY presumably rejects a file whose version is not newer than
// what is already applied; confirm against ReservedStateVersionCheck.
processFileChanges(HIGHER_VERSION_ONLY);
}

Expand All @@ -131,6 +157,7 @@ protected void processFileChanges() throws ExecutionException, InterruptedExcept
@Override
protected void processFileOnServiceStart() throws IOException, ExecutionException, InterruptedException {
logger.info("processing path [{}] for [{}] on service start", watchedFile(), NAMESPACE);
// Processing the file at service start counts as a change for the health indicator as well.
healthIndicatorService.changeOccurred();
// NOTE(review): HIGHER_OR_SAME_VERSION presumably also accepts a file whose version equals the
// one already applied; confirm against ReservedStateVersionCheck.
processFileChanges(HIGHER_OR_SAME_VERSION);
}

Expand All @@ -146,6 +173,16 @@ private void processFileChanges(ReservedStateVersionCheck versionCheck) throws I
completion.get();
}

/**
 * Records the outcome of one file-processing attempt in the health indicator and then completes the
 * caller's future.
 *
 * <p>The health indicator is updated <em>before</em> the future is completed on both the success and the
 * failure path, so a thread released by {@code completion.get()} always observes health state that
 * already reflects this outcome.
 *
 * @param e          the failure that ended processing, or {@code null} if processing succeeded; its
 *                   {@code toString()} is reported as the most recent failure
 * @param completion the future the caller is waiting on; failed with {@code e}, or completed with
 *                   {@code null} on success
 */
private void completeProcessing(Exception e, PlainActionFuture<Void> completion) {
    if (e == null) {
        // Reset the failure streak first; completing the future may immediately wake a waiter.
        healthIndicatorService.successOccurred();
        completion.onResponse(null);
        return;
    }
    healthIndicatorService.failureOccurred(e.toString());
    completion.onFailure(e);
}

@Override
protected void onProcessFileChangesException(Exception e) {
if (e instanceof ExecutionException) {
Expand All @@ -172,11 +209,61 @@ protected void processInitialFileMissing() throws ExecutionException, Interrupte
completion.get();
}

private static void completeProcessing(Exception e, PlainActionFuture<Void> completion) {
if (e != null) {
completion.onFailure(e);
} else {
completion.onResponse(null);
/**
 * Health indicator that reports whether the most recent file-based settings change was applied.
 *
 * <p>The owner calls {@link #changeOccurred()} when it starts processing a file, and then either
 * {@link #successOccurred()} or {@link #failureOccurred(String)} with the outcome. The reported status is:
 * <ul>
 *   <li>{@code GREEN} with {@link #NO_CHANGES_SYMPTOM} while no change has been counted;</li>
 *   <li>{@code GREEN} with {@link #SUCCESS_SYMPTOM} while the current failure streak is zero;</li>
 *   <li>{@code YELLOW} with {@link #FAILURE_SYMPTOM} otherwise, with the streak length and the most
 *       recent failure description as details.</li>
 * </ul>
 *
 * <p>All state is held in atomics, so the recording methods and {@code calculate} may be called from
 * different threads.
 */
public static class FileSettingsHealthIndicatorService implements HealthIndicatorService {
    static final String NAME = "file_settings";
    static final String NO_CHANGES_SYMPTOM = "No file-based setting changes have occurred";
    static final String SUCCESS_SYMPTOM = "The most recent file-based settings were applied successfully";
    static final String FAILURE_SYMPTOM = "The most recent file-based settings encountered an error";

    /** Impact attached to every {@code YELLOW} result. */
    static final List<HealthIndicatorImpact> STALE_SETTINGS_IMPACT = List.of(
        new HealthIndicatorImpact(
            NAME,
            "stale",
            3,
            "The most recent file-based settings changes have not been applied.",
            List.of(DEPLOYMENT_MANAGEMENT)
        )
    );

    /** Number of changes counted so far; zero means no file has been processed yet. */
    private final AtomicLong changeCount = new AtomicLong(0);
    /** Number of failures since the last success. */
    private final AtomicLong failureStreak = new AtomicLong(0);
    /** Description of the latest failure; it is not cleared by a later success. */
    private final AtomicReference<String> mostRecentFailure = new AtomicReference<>();

    /** Records that processing of a settings file has started. */
    public void changeOccurred() {
        changeCount.incrementAndGet();
    }

    /** Records that the latest change was applied, ending any failure streak. */
    public void successOccurred() {
        failureStreak.set(0);
    }

    /**
     * Records that the latest change failed.
     *
     * @param description text describing the failure; {@code null} is stored as the string {@code "null"}
     *                    so that {@code calculate} never has to put a null value into its details map
     */
    public void failureOccurred(String description) {
        // Publish the description BEFORE bumping the streak: a concurrent calculate() that observes a
        // non-zero streak is then guaranteed to find a non-null description (Map.of rejects null values).
        mostRecentFailure.set(String.valueOf(description));
        failureStreak.incrementAndGet();
    }

    @Override
    public String name() {
        return NAME;
    }

    /**
     * Computes the current health from the recorded counters.
     *
     * @param verbose                   not used by this indicator
     * @param maxAffectedResourcesCount not used by this indicator
     * @param healthInfo                not used by this indicator
     * @return a {@code GREEN} or {@code YELLOW} result as described on the class
     */
    @Override
    public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
        if (0 == changeCount.get()) {
            return createIndicator(GREEN, NO_CHANGES_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of());
        }
        long numFailures = failureStreak.get();
        if (0 == numFailures) {
            return createIndicator(GREEN, SUCCESS_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of());
        }
        // Read the description only after seeing a non-zero streak; see failureOccurred for the ordering.
        String failure = mostRecentFailure.get();
        return createIndicator(
            YELLOW,
            FAILURE_SYMPTOM,
            new SimpleHealthIndicatorDetails(Map.of("failure_streak", numFailures, "most_recent_failure", failure)),
            STALE_SETTINGS_IMPACT,
            List.of()
        );
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,12 @@ public void setup() {
);

fileSettingsService = spy(
new FileSettingsService(clusterService, mock(ReservedClusterStateService.class), newEnvironment(Settings.EMPTY))
new FileSettingsService(
clusterService,
mock(ReservedClusterStateService.class),
newEnvironment(Settings.EMPTY),
new FileSettingsService.FileSettingsHealthIndicatorService()
)
);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.reservedstate.service;

import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorResult;
import org.elasticsearch.health.SimpleHealthIndicatorDetails;
import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService;
import org.elasticsearch.test.ESTestCase;
import org.junit.Before;

import java.util.List;
import java.util.Map;

import static org.elasticsearch.health.HealthStatus.GREEN;
import static org.elasticsearch.health.HealthStatus.YELLOW;
import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.FAILURE_SYMPTOM;
import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.NO_CHANGES_SYMPTOM;
import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.STALE_SETTINGS_IMPACT;
import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.SUCCESS_SYMPTOM;

/**
 * Unit tests for {@link FileSettingsHealthIndicatorService} on its own.
 * Whether {@link FileSettingsService} drives the indicator correctly is not covered here.
 */
public class FileSettingsHealthIndicatorServiceTests extends ESTestCase {

    FileSettingsHealthIndicatorService healthIndicatorService;

    @Before
    public void initialize() {
        healthIndicatorService = new FileSettingsHealthIndicatorService();
    }

    public void testInitiallyGreen() {
        assertEquals(green(NO_CHANGES_SYMPTOM), healthIndicatorService.calculate(false, null));
    }

    public void testGreenYellowYellowGreen() {
        healthIndicatorService.changeOccurred();
        // Odd intermediate state: a change has been counted but no outcome has been reported yet.
        // While the change is still in flight the status must not move away from green.
        assertEquals(green(SUCCESS_SYMPTOM), healthIndicatorService.calculate(false, null));

        healthIndicatorService.failureOccurred("whoopsie 1");
        assertEquals(yellow(1L, "whoopsie 1"), healthIndicatorService.calculate(false, null));

        healthIndicatorService.failureOccurred("whoopsie #2");
        assertEquals(yellow(2L, "whoopsie #2"), healthIndicatorService.calculate(false, null));

        healthIndicatorService.successOccurred();
        assertEquals(green(SUCCESS_SYMPTOM), healthIndicatorService.calculate(false, null));
    }

    /** Builds the expected green result carrying the given symptom and no details, impacts or diagnoses. */
    private static HealthIndicatorResult green(String symptom) {
        return new HealthIndicatorResult("file_settings", GREEN, symptom, HealthIndicatorDetails.EMPTY, List.of(), List.of());
    }

    /** Builds the expected yellow result for the given failure streak and most recent failure text. */
    private static HealthIndicatorResult yellow(long streak, String failure) {
        return new HealthIndicatorResult(
            "file_settings",
            YELLOW,
            FAILURE_SYMPTOM,
            new SimpleHealthIndicatorDetails(Map.of("failure_streak", streak, "most_recent_failure", failure)),
            STALE_SETTINGS_IMPACT,
            List.of()
        );
    }
}
Loading

0 comments on commit 1a4b3d3

Please sign in to comment.