Skip to content

Commit

Permalink
Improve task manager functional tests in preperation for mget task cl…
Browse files Browse the repository at this point in the history
…aimer being the default (elastic#196399)

Resolves elastic#184942
Resolves elastic#192023
Resolves elastic#195573

In this PR, I'm improving the flakiness found in our functional tests in
preperation for mget being the default task claimer that all these tests
run with (elastic#194625). Because the
mget task claimer works differently and also polls more frequently, we
end-up in situations where tasks run faster than they were with
update_by_query, creating more race conditions that are now fixed in
this PR.

Issues were surfaced via elastic#190148
where I set `mget` as the default task claiming strategy.

Flaky test runs (some of these failed on other tests that are flaky):
-
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/7151
-
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/7169
-
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/7172
-
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/7175
-
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/7176
-
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/7185
(for
elastic@0fcf1ae)
  • Loading branch information
mikecote authored Oct 21, 2024
1 parent 6bfe8a7 commit 3b8cf12
Show file tree
Hide file tree
Showing 18 changed files with 174 additions and 60 deletions.
2 changes: 1 addition & 1 deletion x-pack/plugins/actions/server/lib/retry_if_conflicts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export const RetryForConflictsAttempts = 2;
// note: we considered making this random, to help avoid a stampede, but
// with 1 retry it probably doesn't matter, and adding randomness could
// make it harder to diagnose issues
const RetryForConflictsDelay = 250;
const RetryForConflictsDelay = 100;

// retry an operation if it runs into 409 Conflict's, up to a limit
export async function retryIfConflicts<T>(
Expand Down
2 changes: 1 addition & 1 deletion x-pack/plugins/alerting/server/lib/retry_if_conflicts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export const RetryForConflictsAttempts = 2;
// note: we considered making this random, to help avoid a stampede, but
// with 1 retry it probably doesn't matter, and adding randomness could
// make it harder to diagnose issues
const RetryForConflictsDelay = 250;
const RetryForConflictsDelay = 100;

// retry an operation if it runs into 409 Conflict's, up to a limit
export async function retryIfConflicts<T>(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,10 @@ async function searchAvailableTasks({
// Task must be enabled
EnabledTask,
// a task type that's not excluded (may be removed or not)
OneOfTaskTypes('task.taskType', claimPartitions.unlimitedTypes),
OneOfTaskTypes(
'task.taskType',
claimPartitions.unlimitedTypes.concat(Array.from(removedTypes))
),
// Either a task with idle status and runAt <= now or
// status running or claiming with a retryAt <= now.
shouldBeOneOf(IdleTaskWithExpiredRunAt, RunningOrClaimingTaskWithExpiredRetryAt),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ function getIndexRecordActionType() {
secrets,
reference: params.reference,
source: 'action:test.index-record',
'@timestamp': new Date(),
},
});
return { status: 'ok', actionId };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/
import { omit } from 'lodash';
import type { Client } from '@elastic/elasticsearch';
import { DeleteByQueryRequest } from '@elastic/elasticsearch/lib/api/types';

Expand Down Expand Up @@ -61,6 +62,9 @@ export class ESTestIndexTool {
group: {
type: 'keyword',
},
'@timestamp': {
type: 'date',
},
host: {
properties: {
hostname: {
Expand Down Expand Up @@ -109,6 +113,7 @@ export class ESTestIndexTool {
async search(source: string, reference?: string) {
const body = reference
? {
sort: [{ '@timestamp': 'asc' }],
query: {
bool: {
must: [
Expand All @@ -127,6 +132,7 @@ export class ESTestIndexTool {
},
}
: {
sort: [{ '@timestamp': 'asc' }],
query: {
term: {
source,
Expand All @@ -138,7 +144,16 @@ export class ESTestIndexTool {
size: 1000,
body,
};
return await this.es.search(params, { meta: true });
const result = await this.es.search(params, { meta: true });
result.body.hits.hits = result.body.hits.hits.map((hit) => {
return {
...hit,
// Easier to remove @timestamp than to have all the downstream code ignore it
// in their assertions
_source: omit(hit._source as Record<string, unknown>, '@timestamp'),
};
});
return result;
}

async getAll(size: number = 10) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ export default function apiKeyBackfillTests({ getService }: FtrProviderContext)
}

it('should wait to invalidate API key until backfill for rule is complete', async () => {
const start = moment().utc().startOf('day').subtract(7, 'days').toISOString();
const start = moment().utc().startOf('day').subtract(13, 'days').toISOString();
const end = moment().utc().startOf('day').subtract(4, 'day').toISOString();
const spaceId = SuperuserAtSpace1.space.id;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1184,7 +1184,7 @@ instanceStateValue: true
reference,
overwrites: {
enabled: false,
schedule: { interval: '1s' },
schedule: { interval: '1m' },
},
});

Expand Down Expand Up @@ -1288,7 +1288,7 @@ instanceStateValue: true
);

// @ts-expect-error doesnt handle total: number
expect(searchResult.body.hits.total.value).to.eql(1);
expect(searchResult.body.hits.total.value).to.be.greaterThan(0);
// @ts-expect-error _source: unknown
expect(searchResult.body.hits.hits[0]._source.params.message).to.eql(
'Alerts, all:2, new:2 IDs:[1,2,], ongoing:0 IDs:[], recovered:0 IDs:[]'
Expand All @@ -1304,7 +1304,7 @@ instanceStateValue: true
const response = await alertUtils.createAlwaysFiringRuleWithSummaryAction({
reference,
overwrites: {
schedule: { interval: '1s' },
schedule: { interval: '1h' },
},
notifyWhen: 'onActiveAlert',
throttle: null,
Expand Down Expand Up @@ -1435,7 +1435,7 @@ instanceStateValue: true
const response = await alertUtils.createAlwaysFiringRuleWithSummaryAction({
reference,
overwrites: {
schedule: { interval: '1s' },
schedule: { interval: '3s' },
},
notifyWhen: 'onThrottleInterval',
throttle: '10s',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
.send(
getTestRuleData({
rule_type_id: 'test.patternFiring',
schedule: { interval: '1s' },
throttle: null,
schedule: { interval: '2s' },
throttle: '1s',
params: {
pattern,
},
Expand Down Expand Up @@ -665,6 +665,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 4,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand Down Expand Up @@ -763,6 +767,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 4,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand Down Expand Up @@ -871,6 +879,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 4,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand Down Expand Up @@ -964,6 +976,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 4,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand Down Expand Up @@ -1067,6 +1083,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 5,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand Down Expand Up @@ -1166,6 +1186,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 4,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand All @@ -1192,7 +1216,8 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
.send(
getTestRuleData({
rule_type_id: 'test.patternFiring',
schedule: { interval: '1s' },
schedule: { interval: '2s' },
notify_when: RuleNotifyWhen.THROTTLE,
throttle: '1s',
params: {
pattern,
Expand Down Expand Up @@ -1263,6 +1288,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 4,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand All @@ -1289,7 +1318,7 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
.send(
getTestRuleData({
rule_type_id: 'test.patternFiring',
schedule: { interval: '1s' },
schedule: { interval: '2s' },
throttle: null,
notify_when: null,
params: {
Expand All @@ -1302,8 +1331,7 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
params: {},
frequency: {
summary: false,
throttle: '1s',
notify_when: RuleNotifyWhen.THROTTLE,
notify_when: RuleNotifyWhen.ACTIVE,
},
},
{
Expand All @@ -1312,8 +1340,7 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
params: {},
frequency: {
summary: false,
throttle: '1s',
notify_when: RuleNotifyWhen.THROTTLE,
notify_when: RuleNotifyWhen.ACTIVE,
},
},
],
Expand Down Expand Up @@ -1371,6 +1398,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 4,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand All @@ -1396,7 +1427,7 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
.send(
getTestRuleData({
rule_type_id: 'test.patternFiring',
schedule: { interval: '1s' },
schedule: { interval: '2s' },
throttle: '1s',
params: {
pattern,
Expand Down Expand Up @@ -1463,6 +1494,10 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
status_change_threshold: 4,
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

const { body: createdAction } = await supertest
.post(`${getUrlPrefix(space.id)}/api/actions/connector`)
.set('kbn-xsrf', 'foo')
Expand All @@ -1488,7 +1523,7 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
.send(
getTestRuleData({
rule_type_id: 'test.patternFiring',
schedule: { interval: '1s' },
schedule: { interval: '2s' },
throttle: null,
notify_when: null,
params: {
Expand All @@ -1501,8 +1536,7 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
params: {},
frequency: {
summary: false,
throttle: '1s',
notify_when: RuleNotifyWhen.THROTTLE,
notify_when: RuleNotifyWhen.ACTIVE,
},
},
{
Expand All @@ -1511,8 +1545,7 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
params: {},
frequency: {
summary: false,
throttle: '1s',
notify_when: RuleNotifyWhen.THROTTLE,
notify_when: RuleNotifyWhen.ACTIVE,
},
},
],
Expand Down Expand Up @@ -1567,6 +1600,9 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

// flap and then recover, then active again
const instance = [true, false, true, false, true].concat(
...new Array(6).fill(false),
Expand Down Expand Up @@ -1709,8 +1745,8 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
.send(
getTestRuleData({
rule_type_id: 'test.patternFiring',
schedule: { interval: '1s' },
throttle: null,
schedule: { interval: '2s' },
throttle: '1s',
params: {
pattern,
},
Expand Down Expand Up @@ -1942,8 +1978,8 @@ export default function eventLogTests({ getService }: FtrProviderContext) {
provider: 'alerting',
actions: new Map([
// make sure the counts of the # of events per type are as expected
['execute-start', { equal: 6 }],
['execute', { equal: 6 }],
['execute-start', { gte: 6 }],
['execute', { gte: 6 }],
['new-instance', { equal: 1 }],
['active-instance', { equal: 2 }],
['recovered-instance', { equal: 1 }],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,7 @@ export default function createAlertsAsDataFlappingTest({ getService }: FtrProvid

const alertsAsDataIndex = '.alerts-test.patternfiring.alerts-default';

// FLAKY: https://github.com/elastic/kibana/issues/195573
// Failing: See https://github.com/elastic/kibana/issues/195573
describe.skip('alerts as data flapping', function () {
describe('alerts as data flapping', function () {
this.tags('skipFIPS');
beforeEach(async () => {
await es.deleteByQuery({
Expand Down Expand Up @@ -712,6 +710,9 @@ export default function createAlertsAsDataFlappingTest({ getService }: FtrProvid
})
.expect(200);

// wait so cache expires
await setTimeoutAsync(TEST_CACHE_EXPIRATION_TIME);

// Wait for the rule to run once
let run = 1;
let runWhichItFlapped = 0;
Expand Down Expand Up @@ -754,6 +755,11 @@ export default function createAlertsAsDataFlappingTest({ getService }: FtrProvid
const searchResult = await es.search({
index: alertsAsDataIndex,
body: {
sort: [
{
'@timestamp': 'desc',
},
],
query: {
bool: {
must: {
Expand Down
Loading

0 comments on commit 3b8cf12

Please sign in to comment.