Skip to content

Commit

Permalink
[Security Solution] Add retry to getMetrics to reduce flake (#180704)
Browse files Browse the repository at this point in the history
## Summary

API integration tests added in #180094 fail occasionally on main, but
pass consistently locally and in the flaky test runner. This PR adds a
retry to the `getMetrics` request in hopes of removing the flakiness.

Flake issues:
#180530
#180641

---------

Co-authored-by: kibanamachine <[email protected]>
  • Loading branch information
marshallmain and kibanamachine authored Apr 15, 2024
1 parent 9e6617d commit 59055c6
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ import {
} from '../../../../../../../common/utils/security_solution';
import { FtrProviderContext } from '../../../../../../ftr_provider_context';
import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder';
import { getMetricsRequest, getMetricsWithRetry } from './utils';

/**
* Specific AGENT_ID to use for some of the tests. If the archiver changes and you see errors
Expand All @@ -66,6 +67,7 @@ export default ({ getService }: FtrProviderContext) => {
const es = getService('es');
const log = getService('log');
const kibanaServer = getService('kibanaServer');
const retry = getService('retry');

// TODO: add a new service for loading archiver files similar to "getService('es')"
const config = getService('config');
Expand Down Expand Up @@ -205,15 +207,7 @@ export default ({ getService }: FtrProviderContext) => {
});

it('classifies verification_exception errors as user errors', async () => {
function getMetricsRequest(reset: boolean = false) {
return request
.get(`/api/task_manager/metrics${reset ? '' : '?reset=false'}`)
.set('kbn-xsrf', 'foo')
.expect(200)
.then((response) => response.body);
}

await getMetricsRequest(true);
await getMetricsRequest(request, true);
const rule: EqlRuleCreateProps = {
...getEqlRuleForAlertTesting(['auditbeat-*']),
query: 'file where field.doesnt.exist == true',
Expand All @@ -234,9 +228,15 @@ export default ({ getService }: FtrProviderContext) => {
ruleResponse.execution_summary.last_execution.message.includes('verification_exception')
).eql(true);

const metricsResponse = await getMetricsRequest();
const metricsResponse = await getMetricsWithRetry(
request,
retry,
false,
(metrics) =>
metrics.metrics?.task_run?.value.by_type['alerting:siem__eqlRule'].user_errors === 1
);
expect(
metricsResponse.metrics.task_run.value.by_type['alerting:siem__eqlRule'].user_errors
metricsResponse.metrics?.task_run?.value.by_type['alerting:siem__eqlRule'].user_errors
).eql(1);
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ import {
} from '../../../../../../../common/utils/security_solution';
import { FtrProviderContext } from '../../../../../../ftr_provider_context';
import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder';
import { getMetricsRequest, getMetricsWithRetry } from './utils';

export default ({ getService }: FtrProviderContext) => {
const supertest = getService('supertest');
Expand All @@ -69,6 +70,7 @@ export default ({ getService }: FtrProviderContext) => {
const isServerless = config.get('serverless');
const dataPathBuilder = new EsArchivePathBuilder(isServerless);
const auditPath = dataPathBuilder.getPath('auditbeat/hosts');
const retry = getService('retry');

const siemModule = 'security_linux_v3';
const mlJobId = 'v3_linux_anomalous_network_activity';
Expand Down Expand Up @@ -181,15 +183,7 @@ export default ({ getService }: FtrProviderContext) => {
});

it('classifies ml job missing errors as user errors', async () => {
function getMetricsRequest(reset: boolean = false) {
return request
.get(`/api/task_manager/metrics${reset ? '' : '?reset=false'}`)
.set('kbn-xsrf', 'foo')
.expect(200)
.then((response) => response.body);
}

await getMetricsRequest(true);
await getMetricsRequest(request, true);
const badRule: MachineLearningRuleCreateProps = {
...rule,
machine_learning_job_id: 'doesNotExist',
Expand All @@ -210,10 +204,14 @@ export default ({ getService }: FtrProviderContext) => {
true
);

const metricsResponse = await getMetricsRequest();
expect(
metricsResponse.metrics.task_run.value.by_type['alerting:siem__mlRule'].user_errors
).toEqual(1);
const metricsResponse = await getMetricsWithRetry(
request,
retry,
false,
(metrics) =>
metrics.metrics?.task_run?.value.by_type['alerting:siem__mlRule'].user_errors === 1
);
expect(metricsResponse.metrics?.task_run?.value.by_type['alerting:siem__mlRule']).toEqual(1);
});

it('@skipInQA generates max alerts warning when circuit breaker is exceeded', async () => {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import supertest from 'supertest';

import { NodeMetrics } from '@kbn/task-manager-plugin/server/routes/metrics';
import { RetryService } from '@kbn/ftr-common-functional-services';

export const getMetricsRequest = (
request: supertest.SuperTest<supertest.Test>,
reset: boolean = false
) => {
return request
.get(`/api/task_manager/metrics${reset ? '' : '?reset=false'}`)
.set('kbn-xsrf', 'foo')
.expect(200)
.then((response) => response.body);
};

export const getMetricsWithRetry = (
request: supertest.SuperTest<supertest.Test>,
retry: RetryService,
reset: boolean = false,
callback?: (metrics: NodeMetrics) => boolean
): Promise<NodeMetrics> => {
return retry.try(async () => {
const metrics = await getMetricsRequest(request, reset);

if (metrics.metrics) {
if ((callback && callback(metrics)) || !callback) {
return metrics;
}
}

throw new Error(`Expected metrics not received: ${JSON.stringify(metrics)}`);
});
};

0 comments on commit 59055c6

Please sign in to comment.