From 40e7bc183de67a926a614ebae886aa1dcf26e388 Mon Sep 17 00:00:00 2001 From: Marco Antonio Ghiani Date: Thu, 28 Sep 2023 14:54:40 +0200 Subject: [PATCH] [Stack Monitoring] Update flows for cpu stats fetching (#167244) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 📓 Summary When retrieving the CPU stats for containerized (or non-container) clusters, we were not considering a scenario where the user could run in a cgroup but without limits set. These changes re-write the conditions to determine whether we allow treating limitless containers as non-containerized, covering the case where a user runs in a cgroup and for some reason hasn't set the limit. ## Testing > Taken from https://github.com/elastic/kibana/pull/159351 since it reproduced the same behaviours There are 3 main states to test: No limit set but Kibana configured to use container stats. Limit changed during lookback period (to/from real value, to/from no limit). Limit set and CPU usage crossing threshold and then falling down to recovery **Note: Please also test the non-container use case for this rule to ensure that didn't get broken during this refactor** **1. Start Elasticsearch in a container without setting the CPU limits:** ``` docker network create elastic docker run --name es01 --net elastic -p 9201:9200 -e xpack.license.self_generated.type=trial -it docker.elastic.co/elasticsearch/elasticsearch:master-SNAPSHOT ``` (We're using `master-SNAPSHOT` to include a recent fix to reporting for cgroup v2) Make note of the generated password for the `elastic` user. **2. Start another Elasticsearch instance to act as the monitoring cluster** **3. Configure Kibana to connect to the monitoring cluster and start it** **4. Configure Metricbeat to collect metrics from the Docker cluster and ship them to the monitoring cluster, then start it** Execute the below command next to the Metricbeat binary to grab the CA certificate from the Elasticsearch cluster. 
``` docker cp es01:/usr/share/elasticsearch/config/certs/http_ca.crt . ``` Use the `elastic` password and the CA certificate to configure the `elasticsearch` module: ``` - module: elasticsearch xpack.enabled: true period: 10s hosts: - "https://localhost:9201" username: "elastic" password: "PASSWORD" ssl.certificate_authorities: "PATH_TO_CERT/http_ca.crt" ``` **5. Configure an alert in Kibana with a chosen threshold** OBSERVE: Alert gets fired to inform you that there looks to be a misconfiguration, together with reporting the current value for the fallback metric (warning if the fallback metric is below threshold, danger if it is above). **6. Set limit** First stop ES using `docker stop es01`, then set the limit using `docker update --cpus=1 es01` and start it again using `docker start es01`. After a brief delay you should now see the alert change to a warning about the limits having changed during the alert lookback period and stating that the CPU usage could not be confidently calculated. Wait for change event to pass out of lookback window. **7. Generate load on the monitored cluster** [Slingshot](https://github.com/elastic/slingshot) is an option. After you clone it, you need to update the `package.json` to match [this change](https://github.com/elastic/slingshot/blob/8bfa8351deb0d89859548ee5241e34d0920927e5/package.json#L45-L46) before running `npm install`. Then you can modify the value for `elasticsearch` in the `configs/hosts.json` file like this: ``` "elasticsearch": { "node": "https://localhost:9201", "auth": { "username": "elastic", "password": "PASSWORD" }, "ssl": { "ca": "PATH_TO_CERT/http_ca.crt", "rejectUnauthorized": false } } ``` Then you can start one or more instances of Slingshot like this: `npx ts-node bin/slingshot load --config configs/hosts.json` **7. 
Observe the alert firing in the logs** Assuming you're using a connector for server log output, you should see a message like below once the threshold is breached: ``` `[2023-06-13T13:05:50.036+02:00][INFO ][plugins.actions.server-log] Server log: CPU usage alert is firing for node e76ce10526e2 in cluster: docker-cluster. [View node](/app/monitoring#/elasticsearch/nodes/OyDWTz1PS-aEwjqcPN2vNQ?_g=(cluster_uuid:kasJK8VyTG6xNZ2PFPAtYg))` ``` The alert should also be visible in the Stack Monitoring UI overview page. At this point you can stop Slingshot and confirm that the alert recovers once CPU usage goes back down below the threshold. **8. Stop the load and confirm that the rule recovers.** --------- Co-authored-by: Marco Antonio Ghiani Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com> (cherry picked from commit 833c07536267141ab9c53335e000698f74ce22d3) --- .../plugins/monitoring/common/types/alerts.ts | 1 - .../server/alerts/cpu_usage_rule.test.ts | 40 ++++++--- .../server/alerts/cpu_usage_rule.ts | 27 +----- .../alerts/fetch_cpu_usage_node_stats.test.ts | 83 +----------------- .../lib/alerts/fetch_cpu_usage_node_stats.ts | 86 +++++-------------- 5 files changed, 56 insertions(+), 181 deletions(-) diff --git a/x-pack/plugins/monitoring/common/types/alerts.ts b/x-pack/plugins/monitoring/common/types/alerts.ts index 71943f42dd21f..adf00789d4056 100644 --- a/x-pack/plugins/monitoring/common/types/alerts.ts +++ b/x-pack/plugins/monitoring/common/types/alerts.ts @@ -171,7 +171,6 @@ export interface AlertNodeStats { export interface AlertCpuUsageNodeStats extends AlertNodeStats { cpuUsage?: number; limitsChanged?: boolean; - missingLimits?: boolean; unexpectedLimits?: boolean; } diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts index 171daed9f9d64..dcf1e80583726 100644 --- a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts +++ 
b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts @@ -258,12 +258,7 @@ describe('CpuUsageRule', () => { it('should fire actions when resource limits are missing', async () => { (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { - return [ - { - ...stat, - missingLimits: true, - }, - ]; + return [stat]; }); const rule = new CpuUsageRule(); @@ -287,14 +282,39 @@ describe('CpuUsageRule', () => { nodeId, nodeName, threshold, - missingLimits: true, }, nodeId, nodeName, ui: { isFiring: true, message: { - text: `Kibana is configured for containerized workloads but node #start_linkmyNodeName#end_link does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%. Last checked at #absolute`, + text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`, + nextSteps: [ + { + text: '#start_linkCheck hot threads#end_link', + tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'docLink', + partialUrl: + '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/cluster-nodes-hot-threads.html', + }, + ], + }, + { + text: '#start_linkCheck long running tasks#end_link', + tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'docLink', + partialUrl: + '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html', + }, + ], + }, + ], tokens: [ { startToken: '#start_link', @@ -319,8 +339,8 @@ describe('CpuUsageRule', () => { ], }); expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. 
[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`, + internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`, action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, actionPlain: 'Verify CPU usage of node.', clusterName, diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts index 58265dbfdbad7..49ab66f2ce10d 100644 --- a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts +++ b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts @@ -100,12 +100,7 @@ export class CpuUsageRule extends BaseRule { stat: AlertCpuUsageNodeStats, threshold: number ): { shouldFire: boolean; severity: AlertSeverity } { - if ( - stat.missingLimits || - stat.limitsChanged || - stat.unexpectedLimits || - stat.cpuUsage === undefined - ) { + if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) { let severity = AlertSeverity.Warning; if (stat.cpuUsage && stat.cpuUsage > threshold) { severity = AlertSeverity.Danger; @@ -149,19 +144,6 @@ export class CpuUsageRule extends BaseRule { } as AlertMessageTimeToken, ]; - if (stat.missingLimits) { - return { - text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.missingLimits', { - defaultMessage: `Kibana is configured for containerized workloads but node #start_link{nodeName}#end_link does not have resource limits configured. Fallback metric reports usage of {cpuUsage}%. 
Last checked at #absolute`, - values: { - nodeName: stat.nodeName, - cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT), - }, - }), - tokens, - }; - } - if (stat.unexpectedLimits) { return { text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', { @@ -273,12 +255,7 @@ export class CpuUsageRule extends BaseRule { private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) { const stat = state.meta as AlertCpuUsageNodeStats; - if ( - stat.missingLimits || - stat.limitsChanged || - stat.unexpectedLimits || - stat.cpuUsage === undefined - ) { + if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) { return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', { defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`, values: { diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts index 9551b30d1c2d2..214a7c04005f5 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts @@ -126,10 +126,10 @@ describe('fetchCpuUsageNodeStats', () => { value: 45, }, quota_micros_max: { - value: -1, + value: 2000, }, quota_micros_min: { - value: -1, + value: 2000, }, name: { buckets: [ @@ -366,7 +366,6 @@ describe('fetchCpuUsageNodeStats', () => { expect(stats).toEqual([ { - missingLimits: true, clusterUuid: 'my-test-cluster', nodeId: 'my-test-node', nodeName: 'test-node', @@ -454,83 +453,5 @@ describe('fetchCpuUsageNodeStats', () => { }, ]); }); - - it('warns about failing to compute usage due to values missing', async () => { - esClient.search.mockResponse({ - aggregations: { - clusters: { - buckets: [ - { - key: 'my-test-cluster', - nodes: { - buckets: [ - { - key: 
'my-test-node', - min_usage_nanos: { - value: null, - }, - max_usage_nanos: { - value: null, - }, - min_periods: { - value: null, - }, - max_periods: { - value: null, - }, - quota_micros_min: { - value: 10000, - }, - quota_micros_max: { - value: 10000, - }, - average_cpu_usage_percent: { - value: 45, - }, - name: { - buckets: [ - { - key: 'test-node', - }, - ], - }, - index: { - buckets: [ - { - key: 'a-local-index', - }, - ], - }, - }, - ], - }, - }, - ], - }, - }, - } as any); - - const stats = await fetchCpuUsageNodeStats( - { - esClient, - clusterUuids: ['my-test-cluster'], - startMs: 0, - endMs: 10, - filterQuery, - logger: loggerMock.create(), - }, - configSlice - ); - - expect(stats).toEqual([ - { - clusterUuid: 'my-test-cluster', - nodeId: 'my-test-node', - nodeName: 'test-node', - ccs: undefined, - cpuUsage: undefined, - }, - ]); - }); }); }); diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts index 014c38f447e1e..5ccaa522c7368 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts @@ -14,14 +14,6 @@ import { MonitoringConfig } from '../../config'; import { getElasticsearchDataset, getIndexPatterns } from '../cluster/get_index_patterns'; import { createDatasetFilter } from './create_dataset_query_filter'; -interface CpuUsageFieldsWithValues { - 'max of node_stats.os.cgroup.cpu.cfs_quota_micros': number | null; - 'max of node_stats.os.cgroup.cpuacct.usage_nanos': number | null; - 'min of node_stats.os.cgroup.cpuacct.usage_nanos': number | null; - 'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null; - 'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null; -} - interface Options { esClient: ElasticsearchClient; clusterUuids: string[]; @@ -45,7 +37,7 @@ export async function 
fetchCpuUsageNodeStats( } async function fetchContainerStats( - { esClient, startMs, endMs, clusterUuids, filterQuery, logger }: Options, + { esClient, startMs, endMs, clusterUuids, filterQuery }: Options, config: MonitoringConfig ) { const indexPatterns = getIndexPatterns({ @@ -178,58 +170,34 @@ async function fetchContainerStats( ccs = index.includes(':') ? index.split(':')[0] : undefined; } - const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1; - const notRunningInAContainer = - node.quota_micros_min.value === null && node.quota_micros_max.value === null; - if (limitsNotSet || notRunningInAContainer) { - return { - missingLimits: true, - clusterUuid: cluster.key as string, - nodeId: node.key as string, - cpuUsage: node.average_cpu_usage_percent.value ?? undefined, - nodeName, - ccs, - }; - } + const nodeStats = { + clusterUuid: cluster.key as string, + nodeId: node.key as string, + nodeName, + ccs, + }; - if (node.quota_micros_min.value !== node.quota_micros_max.value) { - return { - limitsChanged: true, - clusterUuid: cluster.key as string, - nodeId: node.key as string, - cpuUsage: undefined, - nodeName, - ccs, - }; - } + const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1; if ( + limitsNotSet || node.max_usage_nanos.value === null || node.min_usage_nanos.value === null || node.max_periods.value === null || node.min_periods.value === null || node.quota_micros_max.value === null ) { - logger.warn( - `CPU usage rule: Some aggregated values needed for container CPU usage calculation was empty: ${findEmptyValues( - { - 'max of node_stats.os.cgroup.cpu.cfs_quota_micros': node.quota_micros_max.value, - 'max of node_stats.os.cgroup.cpuacct.usage_nanos': node.max_usage_nanos.value, - 'min of node_stats.os.cgroup.cpuacct.usage_nanos': node.min_usage_nanos.value, - 'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': - node.max_periods.value, - 'min of 
node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': - node.min_periods.value, - } - )}` - ); + return { + ...nodeStats, + cpuUsage: node.average_cpu_usage_percent.value ?? undefined, + }; + } + if (node.quota_micros_min.value !== node.quota_micros_max.value) { return { - clusterUuid: cluster.key as string, - nodeId: node.key as string, + ...nodeStats, + limitsChanged: true, cpuUsage: undefined, - nodeName, - ccs, }; } @@ -243,24 +211,13 @@ async function fetchContainerStats( ); return { - clusterUuid: cluster.key as string, - nodeId: node.key as string, + ...nodeStats, cpuUsage: Math.round(cpuUsage * 100) / 100, - nodeName, - ccs, }; }); }); } -function findEmptyValues(fieldsWithValues: CpuUsageFieldsWithValues): string { - const entries: Array<[string, number | null]> = Object.entries(fieldsWithValues); - return entries - .filter(([, value]) => value === null) - .map(([key]) => key) - .join(', '); -} - function computeCfsPercentCpuUsage(usageNanos: number, quotaMicros: number, periods: number) { // See https://github.com/elastic/kibana/pull/159351 for an explanation of this formula const quotaNanos = quotaMicros * 1000; @@ -380,8 +337,9 @@ async function fetchNonContainerStats( ccs = index.includes(':') ? index.split(':')[0] : undefined; } - const runningInAContainer = - node.quota_micros_min.value !== null || node.quota_micros_max.value !== null; + const runningInAContainerWithLimits = + (node.quota_micros_min.value !== null && node.quota_micros_min.value !== -1) || + (node.quota_micros_max.value !== null && node.quota_micros_max.value !== -1); return { clusterUuid: cluster.key as string, @@ -389,7 +347,7 @@ async function fetchNonContainerStats( cpuUsage: node.average_cpu.value ?? undefined, nodeName, ccs, - unexpectedLimits: runningInAContainer, + unexpectedLimits: runningInAContainerWithLimits, }; }); });