[8.x] Handle cluster_block_exception during reindexing the TM index (#201297) (#203609)

# Backport

This will backport the following commits from `main` to `8.x`:

- [Handle cluster_block_exception during reindexing the TM index (#201297)](#201297)

### Questions?

Please refer to the [Backport tool documentation](https://github.com/sqren/backport)

Co-authored-by: Ersin Erdal <[email protected]>
elastic · Dec 10, 2024 · af76299 · af76299
1 parent ebd2ef3
commit af76299
Show file tree

Hide file tree

Showing 7 changed files with 241 additions and 91 deletions.
diff --git a/x-pack/plugins/task_manager/server/config.ts b/x-pack/plugins/task_manager/server/config.ts
@@ -36,6 +36,7 @@ export const CLAIM_STRATEGY_MGET = 'mget';
 export const DEFAULT_DISCOVERY_INTERVAL_MS = 1000 * 10; // 10 seconds
 const MIN_DISCOVERY_INTERVAL_MS = 1000; // 1 second
 const MAX_DISCOVERY_INTERVAL_MS = 1000 * 60 * 5; // 5 minutes
+export const DISCOVERY_INTERVAL_AFTER_BLOCK_EXCEPTION_MS = 6 * 1000 * 10; // 60 seconds
 
 export const DEFAULT_ACTIVE_NODES_LOOK_BACK_DURATION = '30s';
 const FIVE_MIN_IN_MS = 5 * 60 * 1000;

diff --git a/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts b/x-pack/plugins/task_manager/server/integration_tests/managed_configuration.test.ts
@@ -13,6 +13,7 @@ import { ADJUST_THROUGHPUT_INTERVAL } from '../lib/create_managed_configuration'
 import { TaskManagerPlugin, TaskManagerStartContract } from '../plugin';
 import { coreMock } from '@kbn/core/server/mocks';
 import { TaskManagerConfig } from '../config';
+import { BulkUpdateError } from '../lib/bulk_update_error';
 
 describe('managed configuration', () => {
   let taskManagerStart: TaskManagerStartContract;
@@ -130,14 +131,41 @@ describe('managed configuration', () => {
       clock.tick(ADJUST_THROUGHPUT_INTERVAL);
 
       expect(logger.warn).toHaveBeenCalledWith(
-        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
+        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" and/or "cluster_block_exception" error(s).'
       );
       expect(logger.debug).toHaveBeenCalledWith(
-        'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" error(s)'
+        'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" and/or "cluster_block_exception" error(s).'
       );
       expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms');
     });
 
+    test('should increase poll interval when Elasticsearch returns a cluster_block_exception error', async () => { // A schedule() call failing with cluster_block_exception should raise the poll interval.
+      savedObjectsClient.create.mockRejectedValueOnce( // only the next create() call rejects
+        new BulkUpdateError({
+          statusCode: 403,
+          message: 'index is blocked',
+          type: 'cluster_block_exception',
+        })
+      );
+
+      await expect(
+        taskManagerStart.schedule({
+          taskType: 'foo',
+          state: {},
+          params: {},
+        })
+      ).rejects.toThrowErrorMatchingInlineSnapshot(`"index is blocked"`); // the mocked rejection message reaches the caller of schedule()
+      clock.tick(100000); // NOTE(review): presumably lets the interval adjustment run on the fake clock — confirm
+
+      expect(logger.warn).toHaveBeenCalledWith(
+        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" and/or "cluster_block_exception" error(s).'
+      );
+      expect(logger.debug).toHaveBeenCalledWith(
+        'Poll interval configuration changing from 3000 to 61000 after seeing 1 "too many request" and/or "execute [inline] script" and/or "cluster_block_exception" error(s).'
+      );
+      expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 61000ms'); // same 61000 as in the "changing from 3000 to 61000" message
+    });
+
     test('should increase poll interval when Elasticsearch returns "cannot execute [inline] scripts" error', async () => {
       const childEsClient = esStart.client.asInternalUser.child({}) as jest.Mocked<Client>;
       childEsClient.search.mockImplementationOnce(async () => {
@@ -151,10 +179,10 @@ describe('managed configuration', () => {
       clock.tick(ADJUST_THROUGHPUT_INTERVAL);
 
       expect(logger.warn).toHaveBeenCalledWith(
-        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
+        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" and/or "cluster_block_exception" error(s).'
       );
       expect(logger.debug).toHaveBeenCalledWith(
-        'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" error(s)'
+        'Poll interval configuration changing from 3000 to 3600 after seeing 1 "too many request" and/or "execute [inline] script" and/or "cluster_block_exception" error(s).'
       );
       expect(logger.debug).toHaveBeenCalledWith('Task poller now using interval of 3600ms');
     });

diff --git a/x-pack/plugins/task_manager/server/kibana_discovery_service/kibana_discovery_service.test.ts b/x-pack/plugins/task_manager/server/kibana_discovery_service/kibana_discovery_service.test.ts
@@ -10,7 +10,11 @@ import { BACKGROUND_TASK_NODE_SO_NAME } from '../saved_objects';
 import { SavedObjectsBulkDeleteResponse, SavedObjectsUpdateResponse } from '@kbn/core/server';
 
 import { createFindResponse, createFindSO } from './mock_kibana_discovery_service';
-import { DEFAULT_ACTIVE_NODES_LOOK_BACK_DURATION, DEFAULT_DISCOVERY_INTERVAL_MS } from '../config';
+import {
+  DEFAULT_ACTIVE_NODES_LOOK_BACK_DURATION,
+  DEFAULT_DISCOVERY_INTERVAL_MS,
+  DISCOVERY_INTERVAL_AFTER_BLOCK_EXCEPTION_MS,
+} from '../config';
 
 const currentNode = 'current-node-id';
 const now = '2024-08-10T10:00:00.000Z';
@@ -199,6 +203,49 @@ describe('KibanaDiscoveryService', () => {
       );
     });
 
+    it('reschedules discovery job in case of cluster_block_exception', async () => { // After a cluster_block_exception the next upsert is scheduled with the longer block-exception interval.
+      savedObjectsRepository.update.mockResolvedValueOnce( // first update() call (made by start()) succeeds
+        {} as SavedObjectsUpdateResponse<unknown>
+      );
+
+      const kibanaDiscoveryService = new KibanaDiscoveryService({
+        savedObjectsRepository,
+        logger,
+        currentNode,
+        config: {
+          active_nodes_lookback: DEFAULT_ACTIVE_NODES_LOOK_BACK_DURATION,
+          interval: DEFAULT_DISCOVERY_INTERVAL_MS,
+        },
+      });
+      await kibanaDiscoveryService.start();
+
+      expect(kibanaDiscoveryService.isStarted()).toBe(true);
+      expect(setTimeout).toHaveBeenCalledTimes(1); // the successful run scheduled exactly one follow-up
+      expect(setTimeout).toHaveBeenNthCalledWith(
+        1,
+        expect.any(Function),
+        DEFAULT_DISCOVERY_INTERVAL_MS
+      );
+
+      savedObjectsRepository.update.mockRejectedValueOnce( // second update() call rejects; only the message mentions cluster_block_exception
+        new Error('failed due to cluster_block_exception, task_manager index')
+      );
+
+      await jest.advanceTimersByTimeAsync(15000); // 15s is past the 10s DEFAULT_DISCOVERY_INTERVAL_MS, so the first timer fires
+
+      expect(savedObjectsRepository.update).toHaveBeenCalledTimes(2);
+      expect(setTimeout).toHaveBeenCalledTimes(2);
+      expect(setTimeout).toHaveBeenNthCalledWith( // the retry uses the block-exception interval, not the default one
+        2,
+        expect.any(Function),
+        DISCOVERY_INTERVAL_AFTER_BLOCK_EXCEPTION_MS
+      );
+      expect(logger.error).toHaveBeenCalledTimes(1); // the failed update is logged once
+      expect(logger.error).toHaveBeenCalledWith(
+        "Kibana Discovery Service couldn't update this node's last_seen timestamp. id: current-node-id, last_seen: 2024-08-10T10:00:10.000Z, error:failed due to cluster_block_exception, task_manager index"
+      );
+    });
+
     it('does not schedule when Kibana is shutting down', async () => {
       savedObjectsRepository.update.mockResolvedValueOnce(
         {} as SavedObjectsUpdateResponse<unknown>

diff --git a/x-pack/plugins/task_manager/server/kibana_discovery_service/kibana_discovery_service.ts b/x-pack/plugins/task_manager/server/kibana_discovery_service/kibana_discovery_service.ts
@@ -9,7 +9,8 @@ import type { ISavedObjectsRepository } from '@kbn/core/server';
 import { Logger } from '@kbn/core/server';
 import { BACKGROUND_TASK_NODE_SO_NAME } from '../saved_objects';
 import { BackgroundTaskNode } from '../saved_objects/schemas/background_task_node';
-import { TaskManagerConfig } from '../config';
+import { DISCOVERY_INTERVAL_AFTER_BLOCK_EXCEPTION_MS, TaskManagerConfig } from '../config';
+import { isClusterBlockException } from '../lib/bulk_update_error';
 
 interface DiscoveryServiceParams {
   config: TaskManagerConfig['discovery'];
@@ -59,6 +60,7 @@ export class KibanaDiscoveryService {
   }
 
   private async scheduleUpsertCurrentNode() {
+    let retryInterval = this.discoveryInterval;
     if (!this.stopped) {
       const lastSeenDate = new Date();
       const lastSeen = lastSeenDate.toISOString();
@@ -69,9 +71,12 @@ export class KibanaDiscoveryService {
           this.started = true;
         }
       } catch (e) {
+        if (isClusterBlockException(e)) {
+          retryInterval = DISCOVERY_INTERVAL_AFTER_BLOCK_EXCEPTION_MS;
+        }
         if (!this.started) {
           this.logger.error(
-            `Kibana Discovery Service couldn't be started and will be retried in ${this.discoveryInterval}ms, error:${e.message}`
+            `Kibana Discovery Service couldn't be started and will be retried in ${retryInterval}ms, error:${e.message}`
           );
         } else {
           this.logger.error(
@@ -82,7 +87,7 @@ export class KibanaDiscoveryService {
         this.timer = setTimeout(
           async () => await this.scheduleUpsertCurrentNode(),
           // The timeout should not be less than the default timeout of two seconds
-          Math.max(this.discoveryInterval - (Date.now() - lastSeenDate.getTime()), DEFAULT_TIMEOUT)
+          Math.max(retryInterval - (Date.now() - lastSeenDate.getTime()), DEFAULT_TIMEOUT)
         );
       }
     }

diff --git a/x-pack/plugins/task_manager/server/lib/bulk_update_error.ts b/x-pack/plugins/task_manager/server/lib/bulk_update_error.ts
@@ -43,3 +43,10 @@ export function getBulkUpdateErrorType(error: Error | BulkUpdateError): string |
     return (error as BulkUpdateError).type;
   }
 }
+
+export function isClusterBlockException(error: Error | BulkUpdateError): boolean { // True when the error is, or mentions, a cluster_block_exception.
+  return (
+    getBulkUpdateErrorType(error) === 'cluster_block_exception' || // match on the type reported by getBulkUpdateErrorType
+    error.message.includes('cluster_block_exception') // otherwise fall back to a substring match on the message
+  );
+}
diff --git a/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts b/x-pack/plugins/task_manager/server/lib/create_managed_configuration.test.ts
@@ -11,6 +11,7 @@ import { SavedObjectsErrorHelpers } from '@kbn/core/server';
 import {
   createManagedConfiguration,
   ADJUST_THROUGHPUT_INTERVAL,
+  INTERVAL_AFTER_BLOCK_EXCEPTION,
 } from './create_managed_configuration';
 import { mockLogger } from '../test_utils';
 import { CLAIM_STRATEGY_UPDATE_BY_QUERY, CLAIM_STRATEGY_MGET, TaskManagerConfig } from '../config';
@@ -420,12 +421,29 @@ describe('createManagedConfiguration()', () => {
       expect(subscription).toHaveBeenNthCalledWith(2, 120);
     });
 
+    test('should increase configuration at the next interval when an error with cluster_block_exception type is emitted, then decreases back to normal', async () => { // A cluster_block_exception switches to INTERVAL_AFTER_BLOCK_EXCEPTION, then the starting value returns.
+      const { subscription, errors$ } = setupScenario(100); // 100 is the starting value asserted as the first emission
+      errors$.next(
+        new BulkUpdateError({
+          statusCode: 403,
+          message: 'index is blocked',
+          type: 'cluster_block_exception',
+        })
+      );
+      expect(subscription).toHaveBeenNthCalledWith(1, 100);
+      // The increased value is emitted immediately after the error, before any clock.tick
+      expect(subscription).toHaveBeenNthCalledWith(2, INTERVAL_AFTER_BLOCK_EXCEPTION);
+      clock.tick(INTERVAL_AFTER_BLOCK_EXCEPTION); // after that much fake time, a third emission is expected
+      expect(subscription).toHaveBeenCalledTimes(3);
+      expect(subscription).toHaveBeenNthCalledWith(3, 100); // back to the starting value
+    });
+
     test('should log a warning when the configuration changes from the starting value', async () => {
       const { errors$ } = setupScenario(100);
       errors$.next(SavedObjectsErrorHelpers.createTooManyRequestsError('a', 'b'));
       clock.tick(ADJUST_THROUGHPUT_INTERVAL);
       expect(logger.warn).toHaveBeenCalledWith(
-        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" error(s).'
+        'Poll interval configuration is temporarily increased after Elasticsearch returned 1 "too many request" and/or "execute [inline] script" and/or "cluster_block_exception" error(s).'
       );
     });