From 341772ea9486fd9e8500b5dd67f9e5fd22530e59 Mon Sep 17 00:00:00 2001 From: Julia Bardi <90178898+juliaElastic@users.noreply.github.com> Date: Mon, 18 Dec 2023 11:10:05 +0100 Subject: [PATCH] [Fleet] Upgrade details telemetry (#173356) Relates to https://github.com/elastic/kibana/issues/162448 Added upgrade details telemetry, publishing to the `fleet-agents` index in the telemetry cluster, with each bucket sent as a separate document. Implemented by doing a `multi_terms` aggregation to group the same `target_version, state, error_msg` values together. Do we also want to include the agent count in each bucket in the telemetry event? @jlind23 @ycombinator Note: since this task runs every hour, it will most likely capture the `UPG_FAILED` states, since the other (success) states are only temporarily on the agent docs and are removed once the upgrade is successful. E.g. two docs like the one below become one telemetry event: ``` // .fleet-agents upgrade_details: { target_version: '8.12.0', state: 'UPG_FAILED', metadata: { error_msg: 'Download failed', }, }, // telemetry event { target_version: '8.12.0', state: 'UPG_FAILED', error_msg: 'Download failed', } ``` To verify: - start kibana 8.13-SNAPSHOT locally - set an invalid agent download source in Fleet Settings - enroll an agent version 8.12-SNAPSHOT - upgrade to 8.13-SNAPSHOT with the API ``` POST kbn:/api/fleet/agents/{agentId}/upgrade { "version": "8.13.0-SNAPSHOT", "force": true } ``` - wait 15m so that the upgrade goes to failed state - wait up to 1h for the telemetry task to run (speed up locally by setting a shorter interval in FleetUsageSender in kibana) - verify in debug logs: ``` [2023-12-14T14:26:28.832+01:00][DEBUG][plugins.fleet] Agents upgrade details telemetry: [{"target_version":"8.13.0-SNAPSHOT","state":"UPG_FAILED","error_msg":"failed download of agent binary: unable to download package: 3 errors occurred:\n\t* package '/Library/Elastic/Agent/data/elastic-agent-f383c6/downloads/elastic-agent-8.13.0-SNAPSHOT-darwin-aarch64.tar.gz' not found: open 
/Library/Elastic/Agent/data/elastic-agent-f383c6/downloads/elastic-agent-8.13.0-SNAPSHOT-darwin-aarch64.tar.gz: no such file or directory\n\t* call to 'https://artifacts.elastic.co/downloads/dummy/beats/elastic-agent/elastic-agent-8.13.0-SNAPSHOT-darwin-aarch64.tar.gz' returned unsuccessful status code: 404\n\t* call to 'https://artifacts.elastic.co/downloads/dummy/beats/elastic-agent/elastic-agent-8.13.0-SNAPSHOT-darwin-aarch64.tar.gz' returned unsuccessful status code: 404\n\n"}] ``` - [x] [Unit or functional tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html) were updated or added to match the most common scenarios --- .../server/collectors/agent_collectors.ts | 34 +++++++++++++++++ .../server/collectors/agents_per_output.ts | 1 + .../fleet_usage_telemetry.test.ts | 38 +++++++++++++++++++ .../services/telemetry/fleet_usage_sender.ts | 10 ++++- .../services/telemetry/fleet_usages_schema.ts | 32 ++++++++++++++++ 5 files changed, 114 insertions(+), 1 deletion(-) diff --git a/x-pack/plugins/fleet/server/collectors/agent_collectors.ts b/x-pack/plugins/fleet/server/collectors/agent_collectors.ts index 09a4986e0e6f0..391114aa992b3 100644 --- a/x-pack/plugins/fleet/server/collectors/agent_collectors.ts +++ b/x-pack/plugins/fleet/server/collectors/agent_collectors.ts @@ -75,6 +75,12 @@ export interface AgentData { version: string; count: number; }>; + upgrade_details: Array<{ + target_version: string; + state: string; + error_msg: string; + agent_count: number; + }>; } const DEFAULT_AGENT_DATA = { @@ -82,6 +88,7 @@ const DEFAULT_AGENT_DATA = { agents_per_policy: [], agents_per_version: [], agents_per_os: [], + upgrade_details: [], }; export const getAgentData = async ( @@ -135,6 +142,23 @@ export const getAgentData = async ( ], }, }, + upgrade_details: { + multi_terms: { + size: 1000, + terms: [ + { + field: 'upgrade_details.target_version.keyword', + }, + { + field: 'upgrade_details.state', + }, + { + field: 
'upgrade_details.metadata.error_msg.keyword', + missing: '', + }, + ], + }, + }, }, }, { signal: abortController.signal } @@ -190,11 +214,21 @@ export const getAgentData = async ( count: bucket.doc_count, })); + const upgradeDetails = ((response?.aggregations?.upgrade_details as any).buckets ?? []).map( + (bucket: any) => ({ + target_version: bucket.key[0], + state: bucket.key[1], + error_msg: bucket.key[2], + agent_count: bucket.doc_count, + }) + ); + return { agent_checkin_status: statuses, agents_per_policy: agentsPerPolicy, agents_per_version: agentsPerVersion, agents_per_os: agentsPerOS, + upgrade_details: upgradeDetails, }; } catch (error) { if (error.statusCode === 404) { diff --git a/x-pack/plugins/fleet/server/collectors/agents_per_output.ts b/x-pack/plugins/fleet/server/collectors/agents_per_output.ts index 3ad09bcb51177..bfa4cb49ca8d7 100644 --- a/x-pack/plugins/fleet/server/collectors/agents_per_output.ts +++ b/x-pack/plugins/fleet/server/collectors/agents_per_output.ts @@ -61,5 +61,6 @@ export async function getAgentsPerOutput( } outputTypes[monitoringOutputType].count_as_monitoring += item.agents ?? 
0; }); + return Object.values(outputTypes); } diff --git a/x-pack/plugins/fleet/server/integration_tests/fleet_usage_telemetry.test.ts b/x-pack/plugins/fleet/server/integration_tests/fleet_usage_telemetry.test.ts index e2e7e9f7887e6..9c17e7b565162 100644 --- a/x-pack/plugins/fleet/server/integration_tests/fleet_usage_telemetry.test.ts +++ b/x-pack/plugins/fleet/server/integration_tests/fleet_usage_telemetry.test.ts @@ -146,6 +146,13 @@ describe('fleet usage telemetry', () => { status: 'HEALTHY', }, ], + upgrade_details: { + target_version: '8.12.0', + state: 'UPG_FAILED', + metadata: { + error_msg: 'Download failed', + }, + }, }, { create: { @@ -176,6 +183,13 @@ describe('fleet usage telemetry', () => { status: 'HEALTHY', }, ], + upgrade_details: { + target_version: '8.12.0', + state: 'UPG_FAILED', + metadata: { + error_msg: 'Agent crash detected', + }, + }, }, { create: { @@ -220,6 +234,11 @@ describe('fleet usage telemetry', () => { last_checkin: new Date(Date.now() - 1000 * 60 * 6).toISOString(), active: true, policy_id: 'policy2', + upgrade_details: { + target_version: '8.11.0', + state: 'UPG_ROLLBACK', + metadata: {}, + }, }, ], refresh: 'wait_for', @@ -498,5 +517,24 @@ describe('fleet usage telemetry', () => { fleet_server_logs_top_errors: ['failed to unenroll offline agents'], }) ); + expect(usage?.upgrade_details.length).toBe(3); + expect(usage?.upgrade_details).toContainEqual({ + target_version: '8.12.0', + state: 'UPG_FAILED', + error_msg: 'Download failed', + agent_count: 1, + }); + expect(usage?.upgrade_details).toContainEqual({ + target_version: '8.12.0', + state: 'UPG_FAILED', + error_msg: 'Agent crash detected', + agent_count: 1, + }); + expect(usage?.upgrade_details).toContainEqual({ + target_version: '8.11.0', + state: 'UPG_ROLLBACK', + error_msg: '', + agent_count: 1, + }); }); }); diff --git a/x-pack/plugins/fleet/server/services/telemetry/fleet_usage_sender.ts b/x-pack/plugins/fleet/server/services/telemetry/fleet_usage_sender.ts index 
4ee0a7dac4f89..1b90fe6d01e0b 100644 --- a/x-pack/plugins/fleet/server/services/telemetry/fleet_usage_sender.ts +++ b/x-pack/plugins/fleet/server/services/telemetry/fleet_usage_sender.ts @@ -24,7 +24,7 @@ const FLEET_AGENTS_EVENT_TYPE = 'fleet_agents'; export class FleetUsageSender { private taskManager?: TaskManagerStartContract; - private taskVersion = '1.1.3'; + private taskVersion = '1.1.4'; private taskType = 'Fleet-Usage-Sender'; private wasStarted: boolean = false; private interval = '1h'; @@ -83,6 +83,7 @@ export class FleetUsageSender { const { agents_per_version: agentsPerVersion, agents_per_output_type: agentsPerOutputType, + upgrade_details: upgradeDetails, ...fleetUsageData } = usageData; appContextService @@ -106,6 +107,13 @@ export class FleetUsageSender { agents_per_output_type: byOutputType, }); }); + + appContextService + .getLogger() + .debug('Agents upgrade details telemetry: ' + JSON.stringify(upgradeDetails)); + upgradeDetails.forEach((upgradeDetailsObj) => { + core.analytics.reportEvent(FLEET_AGENTS_EVENT_TYPE, { upgrade_details: upgradeDetailsObj }); + }); } catch (error) { appContextService .getLogger() diff --git a/x-pack/plugins/fleet/server/services/telemetry/fleet_usages_schema.ts b/x-pack/plugins/fleet/server/services/telemetry/fleet_usages_schema.ts index e59de684264bf..f7b580932ea0f 100644 --- a/x-pack/plugins/fleet/server/services/telemetry/fleet_usages_schema.ts +++ b/x-pack/plugins/fleet/server/services/telemetry/fleet_usages_schema.ts @@ -90,6 +90,38 @@ export const fleetAgentsSchema: RootSchema = { }, }, }, + upgrade_details: { + _meta: { + description: 'Agent upgrade details telemetry', + optional: true, + }, + properties: { + target_version: { + type: 'keyword', + _meta: { + description: 'Target version of the agent upgrade', + }, + }, + state: { + type: 'keyword', + _meta: { + description: 'State of the agent upgrade', + }, + }, + error_msg: { + type: 'keyword', + _meta: { + description: 'Error message of the agent upgrade if 
failed', + }, + }, + agent_count: { + type: 'long', + _meta: { + description: 'How many agents have this upgrade details', + }, + }, + }, + }, }; export const fleetUsagesSchema: RootSchema = {