Skip to content

Commit

Permalink
[8.15] [Response Ops][Alerting] Deleting ad hoc run task if the last …
Browse files Browse the repository at this point in the history
…schedule entry ends in a timeout (#187496) (#187810)

# Backport

This will backport the following commits from `main` to `8.15`:
- [[Response Ops][Alerting] Deleting ad hoc run task if the last
schedule entry ends in a timeout
(#187496)](#187496)

<!--- Backport version: 9.4.3 -->

### Questions ?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Ying
Mao","email":"[email protected]"},"sourceCommit":{"committedDate":"2024-07-09T00:07:57Z","message":"[Response
Ops][Alerting] Deleting ad hoc run task if the last schedule entry ends
in a timeout (#187496)\n\n## Summary\r\n\r\nFixing bug where ad hoc run
task was not getting deleted if running the\r\nlast schedule entry and
the run times out.\r\n\r\n## To Verify\r\n1. Create a detection
rule\r\n2. Add a delay in the ad hoc task runner\r\n```\r\n---
a/x-pack/plugins/alerting/server/task_runner/ad_hoc_task_runner.ts\r\n+++
b/x-pack/plugins/alerting/server/task_runner/ad_hoc_task_runner.ts\r\n@@
-530,6 +530,7 @@ export class AdHocTaskRunner {\r\n } catch (err) {\r\n
runMetrics = asErr(err);\r\n }\r\n+ await new Promise((resolve) =>
setTimeout(resolve, 3100000));\r\n await
this.processAdHocRunResults(runMetrics);\r\n```\r\n3. Schedule a
backfill for the rule with only one schedule entry\r\n\r\n```\r\nPOST
https://localhost:5601/internal/alerting/rules/backfill/_schedule\r\n[\r\n
{\r\n \"rule_id\": <ruleId>,\r\n \"start\":
\"2024-07-03T13:05:00.000Z\"\r\n }\r\n]\r\n```\r\n\r\n4. Wait for the
run to get cancelled then verify that the task
was\r\ndeleted.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine
<[email protected]>","sha":"c2be810de37479d43661e722e7d7454dacc35c79","branchLabelMapping":{"^v8.16.0$":"main","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["Feature:Alerting","release_note:skip","Team:ResponseOps","v8.15.0","v8.16.0"],"title":"[Response
Ops][Alerting] Deleting ad hoc run task if the last schedule entry ends
in a
timeout","number":187496,"url":"https://github.com/elastic/kibana/pull/187496","mergeCommit":{"message":"[Response
Ops][Alerting] Deleting ad hoc run task if the last schedule entry ends
in a timeout (#187496)\n\n## Summary\r\n\r\nFixing bug where ad hoc run
task was not getting deleted if running the\r\nlast schedule entry and
the run times out.\r\n\r\n## To Verify\r\n1. Create a detection
rule\r\n2. Add a delay in the ad hoc task runner\r\n```\r\n---
a/x-pack/plugins/alerting/server/task_runner/ad_hoc_task_runner.ts\r\n+++
b/x-pack/plugins/alerting/server/task_runner/ad_hoc_task_runner.ts\r\n@@
-530,6 +530,7 @@ export class AdHocTaskRunner {\r\n } catch (err) {\r\n
runMetrics = asErr(err);\r\n }\r\n+ await new Promise((resolve) =>
setTimeout(resolve, 3100000));\r\n await
this.processAdHocRunResults(runMetrics);\r\n```\r\n3. Schedule a
backfill for the rule with only one schedule entry\r\n\r\n```\r\nPOST
https://localhost:5601/internal/alerting/rules/backfill/_schedule\r\n[\r\n
{\r\n \"rule_id\": <ruleId>,\r\n \"start\":
\"2024-07-03T13:05:00.000Z\"\r\n }\r\n]\r\n```\r\n\r\n4. Wait for the
run to get cancelled then verify that the task
was\r\ndeleted.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine
<[email protected]>","sha":"c2be810de37479d43661e722e7d7454dacc35c79"}},"sourceBranch":"main","suggestedTargetBranches":["8.15"],"targetPullRequestStates":[{"branch":"8.15","label":"v8.15.0","branchLabelMappingKey":"^v(\\d+).(\\d+).\\d+$","isSourceBranch":false,"state":"NOT_CREATED"},{"branch":"main","label":"v8.16.0","branchLabelMappingKey":"^v8.16.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/187496","number":187496,"mergeCommit":{"message":"[Response
Ops][Alerting] Deleting ad hoc run task if the last schedule entry ends
in a timeout (#187496)\n\n## Summary\r\n\r\nFixing bug where ad hoc run
task was not getting deleted if running the\r\nlast schedule entry and
the run times out.\r\n\r\n## To Verify\r\n1. Create a detection
rule\r\n2. Add a delay in the ad hoc task runner\r\n```\r\n---
a/x-pack/plugins/alerting/server/task_runner/ad_hoc_task_runner.ts\r\n+++
b/x-pack/plugins/alerting/server/task_runner/ad_hoc_task_runner.ts\r\n@@
-530,6 +530,7 @@ export class AdHocTaskRunner {\r\n } catch (err) {\r\n
runMetrics = asErr(err);\r\n }\r\n+ await new Promise((resolve) =>
setTimeout(resolve, 3100000));\r\n await
this.processAdHocRunResults(runMetrics);\r\n```\r\n3. Schedule a
backfill for the rule with only one schedule entry\r\n\r\n```\r\nPOST
https://localhost:5601/internal/alerting/rules/backfill/_schedule\r\n[\r\n
{\r\n \"rule_id\": <ruleId>,\r\n \"start\":
\"2024-07-03T13:05:00.000Z\"\r\n }\r\n]\r\n```\r\n\r\n4. Wait for the
run to get cancelled then verify that the task
was\r\ndeleted.\r\n\r\n---------\r\n\r\nCo-authored-by: Elastic Machine
<[email protected]>","sha":"c2be810de37479d43661e722e7d7454dacc35c79"}}]}]
BACKPORT-->

Co-authored-by: Ying Mao <[email protected]>
  • Loading branch information
kibanamachine and ymao1 authored Jul 9, 2024
1 parent 056d64a commit bcdf5e9
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1244,6 +1244,84 @@ describe('Ad Hoc Task Runner', () => {
expect(logger.error).not.toHaveBeenCalled();
});

test('should handle task cancellation signal due to timeout when for last schedule entry', async () => {
encryptedSavedObjectsClient.getDecryptedAsInternalUser.mockResolvedValue({
...mockedAdHocRunSO,
attributes: {
...mockedAdHocRunSO.attributes,
schedule: [{ ...schedule1, status: adHocRunStatus.COMPLETE }, schedule2],
},
});
const taskRunner = new AdHocTaskRunner({
context: taskRunnerFactoryInitializerParams,
internalSavedObjectsRepository,
taskInstance: mockedTaskInstance,
});

const promise = taskRunner.run();
await Promise.resolve();
await taskRunner.cancel();
await promise;
await taskRunner.cleanup();

expect(encryptedSavedObjectsClient.getDecryptedAsInternalUser).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
'abc',
{}
);
expect(ruleTypeRegistry.get).toHaveBeenCalledWith('siem.queryRule');
expect(ruleTypeRegistry.ensureRuleTypeEnabled).toHaveBeenCalledWith('siem.queryRule');
expect(mockValidateRuleTypeParams).toHaveBeenCalledWith(
mockedAdHocRunSO.attributes.rule.params,
ruleTypeWithAlerts.validate.params
);

// @ts-ignore - accessing private variable
// should run the first entry in the schedule
expect(taskRunner.scheduleToRunIndex).toEqual(1);
expect(RuleRunMetricsStore).toHaveBeenCalledTimes(1);
expect(ruleTypeWithAlerts.executor).toHaveBeenCalledTimes(1);

expect(internalSavedObjectsRepository.update).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
mockedAdHocRunSO.id,
{
schedule: [
{ ...schedule1, status: adHocRunStatus.COMPLETE },
{ ...schedule2, status: adHocRunStatus.TIMEOUT },
],
},
{ namespace: undefined, refresh: false }
);

expect(internalSavedObjectsRepository.delete).toHaveBeenCalledWith(
AD_HOC_RUN_SAVED_OBJECT_TYPE,
mockedAdHocRunSO.id,
{ namespace: undefined, refresh: false }
);

testAlertingEventLogCalls({
status: 'ok',
timeout: true,
backfillRunAt: schedule2.runAt,
backfillInterval: schedule2.interval,
});
expect(logger.debug).toHaveBeenCalledTimes(3);
expect(logger.debug).nthCalledWith(
1,
`Executing ad hoc run for rule test:rule-id for runAt ${schedule2.runAt}`
);
expect(logger.debug).nthCalledWith(
2,
`Cancelling execution for ad hoc run with id abc for rule type test with id rule-id - execution exceeded rule type timeout of 3m`
);
expect(logger.debug).nthCalledWith(
3,
`Aborting any in-progress ES searches for rule type test with id rule-id`
);
expect(logger.error).not.toHaveBeenCalled();
});

test('should handle task cancellation that leads to executor throwing error', async () => {
ruleTypeWithAlerts.executor.mockImplementationOnce(() => {
throw new Error('Search has been aborted due to cancelled execution');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import {
TaskErrorSource,
} from '@kbn/task-manager-plugin/server';
import { nanosToMillis } from '@kbn/event-log-plugin/common';
import { RunResult } from '@kbn/task-manager-plugin/server/task';
import { CancellableTask, RunResult } from '@kbn/task-manager-plugin/server/task';
import { AdHocRunStatus, adHocRunStatus } from '../../common/constants';
import { RuleRunnerErrorStackTraceLog, RuleTaskStateAndMetrics, TaskRunnerContext } from './types';
import { getExecutorServices } from './get_executor_services';
Expand Down Expand Up @@ -66,7 +66,7 @@ interface RunParams {
validatedParams: RuleTypeParams;
}

export class AdHocTaskRunner {
export class AdHocTaskRunner implements CancellableTask {
private readonly context: TaskRunnerContext;
private readonly executionId: string;
private readonly internalSavedObjectsRepository: ISavedObjectsRepository;
Expand Down Expand Up @@ -573,6 +573,10 @@ export class AdHocTaskRunner {
: undefined,
},
});
this.shouldDeleteTask = !this.hasAnyPendingRuns();

// cleanup function is not called for timed out tasks
await this.cleanup();
}

async cleanup() {
Expand Down

0 comments on commit bcdf5e9

Please sign in to comment.