Skip to content

Commit

Permalink
[EDR Workflows][Serverless] E2E Endpoint creation fine tuning (elasti…
Browse files Browse the repository at this point in the history
…c#172463)

This PR addresses 3 known issues with environment setup on Serverless CI
pipelines.

### Setup task fails on:
### 1. Host recreation after first failed attempt

Test fails on: `AssertionError: Timed out retrying after 60000ms:
Expected to find content: 'test-host-2321' but never did.`

Failed job from before:
[here](https://buildkite.com/elastic/kibana-on-merge/builds/38728#018c219b-c0b2-41a6-9cba-2613fa85382c)

The endpoint creation task succeeds (the host is enrolled with Fleet),
but the host doesn't appear in Kibana. Since we use the /metadata
endpoint to list all agents, I've added a check in the task to see whether
the endpoint makes it into the metadata. If it doesn't, I delete the
endpoint and retry with an additional search of the .fleet-agents,
metadata-current and metadata-unified indices (thanks @joeypoon).

Successful job:
[here](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4284#018c462c-21ec-44c0-afea-a630253e7717)

### 2. Fleet server not coming up

Test fails on: `│ERROR Error: Timed out waiting for fleet server
[dev-fleet-server.8284.gns5] to register with Elasticsarch`

Failed job from before:
[here](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4166#018c368c-2434-4a2a-aaab-ae097ef26843)

If the first attempt at creating and enrolling the Fleet Server fails, we
retry.

Successful job:
[here](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4285#018c462c-487d-4a1c-8a37-71ca62915878)


### 3. Package policy creation fails

Test fails on: `CypressError: cy.task('indexFleetEndpointPolicy') failed
with the following error: Request failed with status code 500`

**Couldn't recreate in CI.**
Failed job from before:
[here](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4204#018c3f17-3a3a-43dd-a31b-6bc5109d4193)

Package installation fails with a `no_shard_available_action_exception`
error. We retry the API call.



closes elastic#170482 (agent creation)
closes elastic#172920 (agent creation)
closes elastic#172319 (agent creation)
closes elastic#172326 (package policy)

1000 test runs on a single test file (issues were occurring in setup tasks,
not in the test cases themselves):
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4496
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4497
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4498
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4499
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4500
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4501
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4502
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4503
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4504
https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/4505
  • Loading branch information
szwarckonrad authored Dec 21, 2023
1 parent 41375d7 commit ff0351e
Show file tree
Hide file tree
Showing 11 changed files with 226 additions and 156 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,23 @@ import {
API_VERSIONS,
} from '@kbn/fleet-plugin/common';
import { memoize } from 'lodash';
import type { ToolingLog } from '@kbn/tooling-log';
import { catchAxiosErrorFormatAndThrow } from '../format_axios_error';
import { usageTracker } from './usage_tracker';
import { getEndpointPackageInfo } from '../utils/package';
import type { PolicyData } from '../types';
import { policyFactory as policyConfigFactory } from '../models/policy_config';
import { wrapErrorAndRejectPromise } from './utils';
import { RETRYABLE_TRANSIENT_ERRORS, retryOnError, wrapErrorAndRejectPromise } from './utils';

/**
 * Shape of the value resolved by `indexFleetEndpointPolicy`: the Fleet policies
 * gathered while creating an endpoint integration policy.
 */
export interface IndexedFleetEndpointPolicyResponse {
/** Integration (package) policies; the created package policy item is appended here. */
integrationPolicies: PolicyData[];
/** Agent policies associated with the integration policies (NOTE(review): population is not visible in this hunk — confirm). */
agentPolicies: AgentPolicy[];
}

/** Durations, expressed in milliseconds, used by the package policy creation retry loop. */
enum TimeoutsInMS {
/** Delay between attempts when a create call did not yield a package policy. */
TEN_SECONDS = 10 * 1000,
/** Overall time budget after which the retry loop stops trying to create the package policy. */
FIVE_MINUTES = 5 * 60 * 1000,
}
/**
* Create an endpoint Integration Policy (and associated Agent Policy) via Fleet
* (NOTE: ensure that fleet is setup first before calling this loading function)
Expand All @@ -43,7 +49,8 @@ export const indexFleetEndpointPolicy = usageTracker.track(
kbnClient: KbnClient,
policyName: string,
endpointPackageVersion?: string,
agentPolicyName?: string
agentPolicyName?: string,
log?: ToolingLog
): Promise<IndexedFleetEndpointPolicyResponse> => {
const response: IndexedFleetEndpointPolicyResponse = {
integrationPolicies: [],
Expand Down Expand Up @@ -84,6 +91,7 @@ export const indexFleetEndpointPolicy = usageTracker.track(
// Create integration (package) policy
const newPackagePolicyData: CreatePackagePolicyRequest['body'] = {
name: policyName,
// skip_ensure_installed: true,
description: 'Protect the worlds data',
policy_id: agentPolicy.data.item.id,
enabled: true,
Expand All @@ -106,18 +114,48 @@ export const indexFleetEndpointPolicy = usageTracker.track(
version: packageVersion,
},
};
const packagePolicy = (await kbnClient
.request({
path: PACKAGE_POLICY_API_ROUTES.CREATE_PATTERN,
method: 'POST',
body: newPackagePolicyData,
headers: {
'elastic-api-version': API_VERSIONS.public.v1,
},
})
.catch(wrapErrorAndRejectPromise)) as AxiosResponse<CreatePackagePolicyResponse>;

response.integrationPolicies.push(packagePolicy.data.item as PolicyData);
const createPackagePolicy = async (): Promise<CreatePackagePolicyResponse> =>
kbnClient
.request<CreatePackagePolicyResponse>({
path: PACKAGE_POLICY_API_ROUTES.CREATE_PATTERN,
method: 'POST',
body: newPackagePolicyData,
headers: {
'elastic-api-version': API_VERSIONS.public.v1,
},
})
.catch(catchAxiosErrorFormatAndThrow)
.then((res) => res.data);

const started = new Date();
const hasTimedOut = (): boolean => {
const elapsedTime = Date.now() - started.getTime();
return elapsedTime > TimeoutsInMS.FIVE_MINUTES;
};

let packagePolicy: CreatePackagePolicyResponse | undefined;
log?.debug(`Creating integration policy with name: ${policyName}`);

while (!packagePolicy && !hasTimedOut()) {
packagePolicy = await retryOnError(
async () => createPackagePolicy(),
[...RETRYABLE_TRANSIENT_ERRORS, 'resource_not_found_exception'],
log
);

if (!packagePolicy) {
await new Promise((resolve) => setTimeout(resolve, TimeoutsInMS.TEN_SECONDS));
}
}

if (!packagePolicy) {
throw new Error(`Create package policy failed`);
}

log?.verbose(`Integration policy created:`, JSON.stringify(packagePolicy, null, 2));

response.integrationPolicies.push(packagePolicy.item as PolicyData);

return response;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
*/

import { kibanaPackageJson } from '@kbn/repo-info';
import type { Client } from '@elastic/elasticsearch';

import type { ToolingLog } from '@kbn/tooling-log';
import type { KbnClient } from '@kbn/test/src/kbn_client';
import { isFleetServerRunning } from '../../../../scripts/endpoint/common/fleet_server/fleet_server_services';
Expand All @@ -28,6 +30,7 @@ import {

export interface CreateAndEnrollEndpointHostCIOptions
extends Pick<BaseVmCreateOptions, 'disk' | 'cpus' | 'memory'> {
esClient: Client;
kbnClient: KbnClient;
log: ToolingLog;
/** The fleet Agent Policy ID to use for enrolling the agent */
Expand All @@ -51,6 +54,7 @@ export interface CreateAndEnrollEndpointHostCIResponse {
*/
export const createAndEnrollEndpointHostCI = async ({
kbnClient,
esClient,
log,
agentPolicyId,
cpus,
Expand Down Expand Up @@ -122,7 +126,13 @@ export const createAndEnrollEndpointHostCI = async ({

await hostVm.exec(agentEnrollCommand);

const { id: agentId } = await waitForHostToEnroll(kbnClient, log, hostVm.name, 240000);
const { id: agentId } = await waitForHostToEnroll(
kbnClient,
log,
hostVm.name,
5 * 60 * 1000,
esClient
);

return {
hostname: hostVm.name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,13 @@ export const dataLoaders = (
endpointPackageVersion?: string;
agentPolicyName?: string;
}) => {
const { kbnClient } = await stackServicesPromise;
const { kbnClient, log } = await stackServicesPromise;
return indexFleetEndpointPolicy(
kbnClient,
policyName,
endpointPackageVersion,
agentPolicyName
agentPolicyName,
log
);
},

Expand Down Expand Up @@ -390,7 +391,7 @@ ${s1Info.status}
createEndpointHost: async (
options: Omit<CreateAndEnrollEndpointHostCIOptions, 'log' | 'kbnClient'>
): Promise<CreateAndEnrollEndpointHostCIResponse> => {
const { kbnClient, log } = await stackServicesPromise;
const { kbnClient, log, esClient } = await stackServicesPromise;

let retryAttempt = 0;
const attemptCreateEndpointHost =
Expand All @@ -403,6 +404,7 @@ ${s1Info.status}
...options,
log,
kbnClient,
esClient,
})
: await createAndEnrollEndpointHost({
useClosestVersionMatch: true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ import type { KbnClient } from '@kbn/test';
import type { WriteResponseBase } from '@elastic/elasticsearch/lib/api/types';
import { clone, merge } from 'lodash';
import type { DeepPartial } from 'utility-types';
import { catchAxiosErrorFormatAndThrow } from './format_axios_error';
import {
RETRYABLE_TRANSIENT_ERRORS,
retryOnError,
} from '../../../common/endpoint/data_loaders/utils';
import { catchAxiosErrorFormatAndThrow } from '../../../common/endpoint/format_axios_error';
import type { GetMetadataListRequestQuery } from '../../../common/api/endpoint';
import { resolvePathVariables } from '../../../public/common/utils/resolve_path_variables';
import {
Expand All @@ -19,6 +23,7 @@ import {
METADATA_DATASTREAM,
} from '../../../common/endpoint/constants';
import type { HostInfo, HostMetadata, MetadataListResponse } from '../../../common/endpoint/types';
import { HostStatus } from '../../../common/endpoint/types';
import { EndpointDocGenerator } from '../../../common/endpoint/generate_data';

const endpointGenerator = new EndpointDocGenerator();
Expand Down Expand Up @@ -163,15 +168,15 @@ export const waitForEndpointToStreamData = async (
let found: HostInfo | undefined;

while (!found && !hasTimedOut()) {
found = await fetchEndpointMetadata(kbnClient, endpointAgentId).catch((error) => {
// Ignore `not found` (404) responses. Endpoint could be new and thus documents might not have
// been streamed yet.
if (error?.response?.status === 404) {
return undefined;
}

throw error;
});
found = await retryOnError(
async () =>
fetchEndpointMetadataList(kbnClient, {
kuery: `united.endpoint.agent.id: "${endpointAgentId}"`,
}).then((response) => {
return response.data.filter((record) => record.host_status === HostStatus.HEALTHY)[0];
}),
RETRYABLE_TRANSIENT_ERRORS
);

if (!found) {
// sleep and check again
Expand Down
Loading

0 comments on commit ff0351e

Please sign in to comment.