From 8052f03f61b2d89f20780caf5641691a1ccece90 Mon Sep 17 00:00:00 2001 From: Josh Dover <1813008+joshdover@users.noreply.github.com> Date: Mon, 20 Nov 2023 18:47:57 +0100 Subject: [PATCH] [Fleet] Cap setup attempts to 50 on Serverless (#171550) ## Summary If there is a bug in Fleet setup, we can retrigger rollovers on each attempt, causing shard explosion in Elasticsearch. We need a sane limit to prevent this from happening, which this PR introduces. The impact of Fleet setup failing to complete (Agents may not be able to be enrolled) is much smaller than causing shard explosion, so this seems like an acceptable tradeoff. This is only a small part of the overall solution - in the current incident we have, it's still unclear why we are failing to rollover the index and getting into this loop. ### Checklist Delete any items that are not applicable to this PR. - [ ] Any text added follows [EUI's writing guidelines](https://elastic.github.io/eui/#/guidelines/writing), uses sentence case text and includes [i18n support](https://github.com/elastic/kibana/blob/main/packages/kbn-i18n/README.md) - [ ] [Documentation](https://www.elastic.co/guide/en/kibana/master/development-documentation.html) was added for features that require explanation or tutorials - [ ] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios - [ ] Any UI touched in this PR is usable by keyboard only (learn more about [keyboard accessibility](https://webaim.org/techniques/keyboard/)) - [ ] Any UI touched in this PR does not create any new axe failures (run axe in browser: [FF](https://addons.mozilla.org/en-US/firefox/addon/axe-devtools/), [Chrome](https://chrome.google.com/webstore/detail/axe-web-accessibility-tes/lhdoppojpmngadmnindnejefpokejbdd?hl=en-US)) - [ ] If a plugin configuration key changed, check if it needs to be allowlisted in the cloud and added to the [docker list](https://github.com/elastic/kibana/blob/main/src/dev/build/tasks/os_packages/docker_generator/resources/base/bin/kibana-docker) - [ ] This renders correctly on smaller devices using a responsive layout. (You can test this [in your browser](https://www.browserstack.com/guide/responsive-testing-on-local-server)) - [ ] This was checked for [cross-browser compatibility](https://www.elastic.co/support/matrix#matrix_browsers) --- x-pack/plugins/fleet/server/plugin.ts | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/x-pack/plugins/fleet/server/plugin.ts b/x-pack/plugins/fleet/server/plugin.ts index 0373971c664e2..8333d0257a8f0 100644 --- a/x-pack/plugins/fleet/server/plugin.ts +++ b/x-pack/plugins/fleet/server/plugin.ts @@ -524,6 +524,9 @@ export class FleetPlugin this.policyWatcher.start(licenseService); + // We only retry when this feature flag is enabled (Serverless) + const setupAttempts = this.configInitialValue.internal?.retrySetupOnBoot ? 25 : 1; + const fleetSetupPromise = (async () => { try { // Fleet remains `available` during setup as to excessively delay Kibana's boot process. @@ -555,10 +558,9 @@ export class FleetPlugin ); }, { - // We only retry when this feature flag is enabled - numOfAttempts: this.configInitialValue.internal?.retrySetupOnBoot ? Infinity : 1, - // 250ms initial backoff - startingDelay: 250, + numOfAttempts: setupAttempts, + // 1s initial backoff + startingDelay: 1000, // 5m max backoff maxDelay: 60000 * 5, timeMultiple: 2, @@ -566,7 +568,7 @@ export class FleetPlugin jitter: 'full', retry: (error: any, attemptCount: number) => { const summary = `Fleet setup attempt ${attemptCount} failed, will retry after backoff`; - logger.debug(summary, { error: { message: error } }); + logger.warn(summary, { error: { message: error } }); this.fleetStatus$.next({ level: ServiceStatusLevels.available, @@ -586,7 +588,9 @@ export class FleetPlugin summary: 'Fleet is available', }); } catch (error) { - logger.warn('Fleet setup failed', { error: { message: error } }); + logger.warn(`Fleet setup failed after ${setupAttempts} attempts`, { + error: { message: error }, + }); this.fleetStatus$.next({ // As long as Fleet has a dependency on EPR, we can't reliably set Kibana status to `unavailable` here.