diff --git a/x-pack/plugins/monitoring/common/types/alerts.ts b/x-pack/plugins/monitoring/common/types/alerts.ts index d00cc90c5516b..71943f42dd21f 100644 --- a/x-pack/plugins/monitoring/common/types/alerts.ts +++ b/x-pack/plugins/monitoring/common/types/alerts.ts @@ -169,10 +169,10 @@ export interface AlertNodeStats { } export interface AlertCpuUsageNodeStats extends AlertNodeStats { - cpuUsage: number; - containerUsage: number; - containerPeriods: number; - containerQuota: number; + cpuUsage?: number; + limitsChanged?: boolean; + missingLimits?: boolean; + unexpectedLimits?: boolean; } export interface AlertThreadPoolRejectionsStats { diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts index 6c5858d48e94e..171daed9f9d64 100644 --- a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts +++ b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts @@ -42,7 +42,7 @@ describe('CpuUsageRule', () => { expect(rule.ruleOptions.throttle).toBe('1d'); expect(rule.ruleOptions.defaultParams).toStrictEqual({ threshold: 85, duration: '5m' }); expect(rule.ruleOptions.actionVariables).toStrictEqual([ - { name: 'node', description: 'The node reporting high cpu usage.' }, + { name: 'node', description: 'The node reporting high CPU usage.' }, { name: 'internalShortMessage', description: 'The short internal message generated by Elastic.', @@ -114,7 +114,7 @@ describe('CpuUsageRule', () => { getState.mockReset(); }); - it('should fire actions', async () => { + it('should fire actions when threshold is exceeded', async () => { const rule = new CpuUsageRule(); const type = rule.getRuleType(); await type.executor({ @@ -122,6 +122,7 @@ describe('CpuUsageRule', () => { params: rule.ruleOptions.defaultParams, } as any); const count = 1; + const threshold = rule.ruleOptions.defaultParams?.threshold; expect(replaceState).toHaveBeenCalledWith({ alertStates: [ { @@ -134,13 +135,14 @@ describe('CpuUsageRule', () => { cpuUsage, nodeId, nodeName, + threshold, }, nodeId, nodeName, ui: { isFiring: true, message: { - text: `Node #start_link${nodeName}#end_link is reporting cpu usage of ${cpuUsage}% at #absolute`, + text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`, nextSteps: [ { text: '#start_linkCheck hot threads#end_link', @@ -168,6 +170,12 @@ describe('CpuUsageRule', () => { }, ], tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'link', + url: 'elasticsearch/nodes/myNodeId', + }, { startToken: '#absolute', type: 'time', @@ -175,12 +183,6 @@ describe('CpuUsageRule', () => { isRelative: false, timestamp: 1, }, - { - startToken: '#start_link', - endToken: '#end_link', - type: 'link', - url: 'elasticsearch/nodes/myNodeId', - }, ], }, severity: 'danger', @@ -191,10 +193,10 @@ describe('CpuUsageRule', () => { ], }); expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. Verify CPU level of node.`, + internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. 
[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`, action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - actionPlain: 'Verify CPU level of node.', + actionPlain: 'Verify CPU usage of node.', clusterName, count, nodes: `${nodeName}:${cpuUsage}`, @@ -242,10 +244,85 @@ describe('CpuUsageRule', () => { } as any); const count = 1; expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:${ccs}))`, - internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. Verify CPU level of node.`, + internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:${ccs}))`, + internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`, action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:testCluster))`, - actionPlain: 'Verify CPU level of node.', + actionPlain: 'Verify CPU usage of node.', + clusterName, + count, + nodes: `${nodeName}:${cpuUsage}`, + node: `${nodeName}:${cpuUsage}`, + state: 'firing', + }); + }); + + it('should fire actions when resource limits are missing', async () => { + (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { + return [ + { + ...stat, + missingLimits: true, + }, + ]; + }); + + const rule = new CpuUsageRule(); + const type = rule.getRuleType(); + await type.executor({ + ...executorOptions, + params: rule.ruleOptions.defaultParams, + } as any); + const count = 1; + const threshold = rule.ruleOptions.defaultParams?.threshold; + expect(replaceState).toHaveBeenCalledWith({ + alertStates: [ + { + ccs: undefined, + cluster: { clusterUuid, clusterName }, + cpuUsage, + itemLabel: undefined, + meta: { + clusterUuid, + cpuUsage, + nodeId, + nodeName, + threshold, + missingLimits: true, + }, + nodeId, + nodeName, + ui: { + isFiring: true, + message: { + text: `Kibana is configured for containerized workloads but node #start_linkmyNodeName#end_link does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%. Last checked at #absolute`, + tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'link', + url: 'elasticsearch/nodes/myNodeId', + }, + { + startToken: '#absolute', + type: 'time', + isAbsolute: true, + isRelative: false, + timestamp: 1, + }, + ], + }, + severity: 'danger', + triggeredMS: 1, + lastCheckedMS: 0, + }, + }, + ], + }); + expect(scheduleActions).toHaveBeenCalledWith('default', { + internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. 
Verify CPU usage of node.`, + action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + actionPlain: 'Verify CPU usage of node.', clusterName, count, nodes: `${nodeName}:${cpuUsage}`, @@ -253,5 +330,229 @@ describe('CpuUsageRule', () => { state: 'firing', }); }); + + it('should fire actions when resource limits have changed', async () => { + (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { + return [ + { + ...stat, + limitsChanged: true, + }, + ]; + }); + + const rule = new CpuUsageRule(); + const type = rule.getRuleType(); + await type.executor({ + ...executorOptions, + params: rule.ruleOptions.defaultParams, + } as any); + const count = 1; + const threshold = rule.ruleOptions.defaultParams?.threshold; + expect(replaceState).toHaveBeenCalledWith({ + alertStates: [ + { + ccs: undefined, + cluster: { clusterUuid, clusterName }, + cpuUsage, + itemLabel: undefined, + meta: { + clusterUuid, + cpuUsage, + nodeId, + nodeName, + threshold, + limitsChanged: true, + }, + nodeId, + nodeName, + ui: { + isFiring: true, + message: { + text: 'Resource limits for node #start_linkmyNodeName#end_link has changed within the look back window, unable to confidently calculate CPU usage for alerting. Please monitor the usage until the window has moved. Last checked at #absolute', + tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'link', + url: 'elasticsearch/nodes/myNodeId', + }, + { + startToken: '#absolute', + type: 'time', + isAbsolute: true, + isRelative: false, + timestamp: 1, + }, + ], + }, + severity: 'danger', + triggeredMS: 1, + lastCheckedMS: 0, + }, + }, + ], + }); + expect(scheduleActions).toHaveBeenCalledWith('default', { + internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`, + action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + actionPlain: 'Verify CPU usage of node.', + clusterName, + count, + nodes: `${nodeName}:${cpuUsage}`, + node: `${nodeName}:${cpuUsage}`, + state: 'firing', + }); + }); + + it('should fire actions when resource limits are set but not expected', async () => { + (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { + return [ + { + ...stat, + unexpectedLimits: true, + }, + ]; + }); + + const rule = new CpuUsageRule(); + const type = rule.getRuleType(); + await type.executor({ + ...executorOptions, + params: rule.ruleOptions.defaultParams, + } as any); + const count = 1; + const threshold = rule.ruleOptions.defaultParams?.threshold; + expect(replaceState).toHaveBeenCalledWith({ + alertStates: [ + { + ccs: undefined, + cluster: { clusterUuid, clusterName }, + cpuUsage, + itemLabel: undefined, + meta: { + clusterUuid, + cpuUsage, + nodeId, + nodeName, + threshold, + unexpectedLimits: true, + }, + nodeId, + nodeName, + ui: { + isFiring: true, + message: { + text: `Kibana is configured for non-containerized workloads but node #start_linkmyNodeName#end_link has resource limits configured. Node reports usage of ${cpuUsage}%. 
Last checked at #absolute`, + tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'link', + url: 'elasticsearch/nodes/myNodeId', + }, + { + startToken: '#absolute', + type: 'time', + isAbsolute: true, + isRelative: false, + timestamp: 1, + }, + ], + }, + severity: 'danger', + triggeredMS: 1, + lastCheckedMS: 0, + }, + }, + ], + }); + expect(scheduleActions).toHaveBeenCalledWith('default', { + internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`, + action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + actionPlain: 'Verify CPU usage of node.', + clusterName, + count, + nodes: `${nodeName}:${cpuUsage}`, + node: `${nodeName}:${cpuUsage}`, + state: 'firing', + }); + }); + + it('should fire actions when it fails to calculate CPU usage', async () => { + (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { + return [ + { + ...stat, + cpuUsage: undefined, + }, + ]; + }); + + const rule = new CpuUsageRule(); + const type = rule.getRuleType(); + await type.executor({ + ...executorOptions, + params: rule.ruleOptions.defaultParams, + } as any); + const count = 1; + const threshold = rule.ruleOptions.defaultParams?.threshold; + expect(replaceState).toHaveBeenCalledWith({ + alertStates: [ + { + ccs: undefined, + cluster: { clusterUuid, clusterName }, + cpuUsage: undefined, + itemLabel: undefined, + meta: { + clusterUuid, + cpuUsage: undefined, + nodeId, + nodeName, + threshold, + }, + nodeId, + nodeName, + ui: { + isFiring: true, + message: { + text: 'Failed to compute CPU usage for node #start_linkmyNodeName#end_link. Please check the Kibana logs for more details. Last checked at #absolute', + tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'link', + url: 'elasticsearch/nodes/myNodeId', + }, + { + startToken: '#absolute', + type: 'time', + isAbsolute: true, + isRelative: false, + timestamp: 1, + }, + ], + }, + severity: 'warning', + triggeredMS: 1, + lastCheckedMS: 0, + }, + }, + ], + }); + expect(scheduleActions).toHaveBeenCalledWith('default', { + internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. 
Verify CPU usage of node.`, + action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + actionPlain: 'Verify CPU usage of node.', + clusterName, + count, + nodes: `${nodeName}:undefined`, + node: `${nodeName}:undefined`, + state: 'firing', + }); + }); }); }); diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts index 92c45c9e61ae2..58265dbfdbad7 100644 --- a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts +++ b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts @@ -11,6 +11,7 @@ import { ElasticsearchClient } from '@kbn/core/server'; import { Alert } from '@kbn/alerting-plugin/server'; import { RawAlertInstance, SanitizedRule } from '@kbn/alerting-plugin/common'; import { parseDuration } from '@kbn/alerting-plugin/common/parse_duration'; +import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types'; import { BaseRule } from './base_rule'; import { AlertData, @@ -46,7 +47,7 @@ export class CpuUsageRule extends BaseRule { { name: 'node', description: i18n.translate('xpack.monitoring.alerts.cpuUsage.actionVariables.node', { - defaultMessage: 'The node reporting high cpu usage.', + defaultMessage: 'The node reporting high CPU usage.', }), }, ...Object.values(AlertingDefaults.ALERT_TYPE.context), @@ -62,28 +63,57 @@ export class CpuUsageRule extends BaseRule { const duration = parseDuration(params.duration); const endMs = +new Date(); const startMs = endMs - duration; + + let filterQuery; + if (params.filterQuery) { + try { + filterQuery = JSON.parse(params.filterQuery) as QueryDslQueryContainer; + } catch (error) { + throw new Error(`Failed to parse filter query in CPU usage rule ${error}`); + } + } + const stats = await fetchCpuUsageNodeStats( - esClient, - clusters, - startMs, - endMs, - Globals.app.config.ui.max_bucket_size, - params.filterQuery + { + esClient, + clusterUuids: clusters.map((cluster) => cluster.clusterUuid), + startMs, + endMs, + filterQuery, + logger: this.scopedLogger, + }, + Globals.app.config ); - return stats.map((stat) => { - if (Globals.app.config.ui.container.elasticsearch.enabled) { - stat.cpuUsage = - (stat.containerUsage / (stat.containerPeriods * stat.containerQuota * 1000)) * 100; + + return stats.map((stat) => ({ + clusterUuid: stat.clusterUuid, + ...this.outcomeAndSeverity(stat, params.threshold!), + meta: { + ...stat, + threshold: params.threshold!, + }, + ccs: stat.ccs, + })); + } + + private outcomeAndSeverity( + stat: AlertCpuUsageNodeStats, + threshold: number + ): { shouldFire: boolean; severity: AlertSeverity } { + if ( + stat.missingLimits || + stat.limitsChanged || + stat.unexpectedLimits || + stat.cpuUsage === undefined + ) { + let severity = AlertSeverity.Warning; + if (stat.cpuUsage && stat.cpuUsage > threshold) { + severity = AlertSeverity.Danger; } + return { shouldFire: true, severity }; + } - return { - clusterUuid: stat.clusterUuid, - shouldFire: stat.cpuUsage > params.threshold!, - severity: AlertSeverity.Danger, - meta: stat, - ccs: stat.ccs, - }; - }); + return { shouldFire: stat.cpuUsage > threshold, severity: AlertSeverity.Danger }; } protected filterAlertInstance(alertInstance: RawAlertInstance, filters: CommonAlertFilter[]) { @@ -102,13 +132,80 @@ export class CpuUsageRule extends BaseRule { } protected getUiMessage(alertState: AlertState, item: AlertData): AlertMessage { - const stat = item.meta as AlertCpuUsageNodeStats; + const stat = item.meta as 
AlertCpuUsageNodeStats & Pick; + const tokens = [ + { + startToken: '#start_link', + endToken: '#end_link', + type: AlertMessageTokenType.Link, + url: `elasticsearch/nodes/${stat.nodeId}`, + } as AlertMessageLinkToken, + { + startToken: '#absolute', + type: AlertMessageTokenType.Time, + isAbsolute: true, + isRelative: false, + timestamp: alertState.ui.triggeredMS, + } as AlertMessageTimeToken, + ]; + + if (stat.missingLimits) { + return { + text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.missingLimits', { + defaultMessage: `Kibana is configured for containerized workloads but node #start_link{nodeName}#end_link does not have resource limits configured. Fallback metric reports usage of {cpuUsage}%. Last checked at #absolute`, + values: { + nodeName: stat.nodeName, + cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT), + }, + }), + tokens, + }; + } + + if (stat.unexpectedLimits) { + return { + text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', { + defaultMessage: `Kibana is configured for non-containerized workloads but node #start_link{nodeName}#end_link has resource limits configured. Node reports usage of {cpuUsage}%. Last checked at #absolute`, + values: { + nodeName: stat.nodeName, + cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT), + }, + }), + tokens, + }; + } + + if (stat.limitsChanged) { + return { + text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.limitsChanged', { + defaultMessage: `Resource limits for node #start_link{nodeName}#end_link has changed within the look back window, unable to confidently calculate CPU usage for alerting. Please monitor the usage until the window has moved. Last checked at #absolute`, + values: { + nodeName: stat.nodeName, + }, + }), + tokens, + }; + } + + if (stat.cpuUsage === undefined) { + return { + text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage', { + defaultMessage: `Failed to compute CPU usage for node #start_link{nodeName}#end_link. Please check the Kibana logs for more details. Last checked at #absolute`, + values: { + nodeName: stat.nodeName, + }, + }), + tokens, + }; + } + return { text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.firingMessage', { - defaultMessage: `Node #start_link{nodeName}#end_link is reporting cpu usage of {cpuUsage}% at #absolute`, + defaultMessage: `Node #start_link{nodeName}#end_link is reporting CPU usage of {cpuUsage}% which is above the configured threshold of {threshold}%. 
Last checked at #absolute`, values: { nodeName: stat.nodeName, cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT), + threshold: stat.threshold, }, }), nextSteps: [ @@ -125,21 +222,7 @@ export class CpuUsageRule extends BaseRule { `{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html` ), ], - tokens: [ - { - startToken: '#absolute', - type: AlertMessageTokenType.Time, - isAbsolute: true, - isRelative: false, - timestamp: alertState.ui.triggeredMS, - } as AlertMessageTimeToken, - { - startToken: '#start_link', - endToken: '#end_link', - type: AlertMessageTokenType.Link, - url: `elasticsearch/nodes/${stat.nodeId}`, - } as AlertMessageLinkToken, - ], + tokens, }; } @@ -157,7 +240,7 @@ export class CpuUsageRule extends BaseRule { return; } const shortActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.shortAction', { - defaultMessage: 'Verify CPU level of node.', + defaultMessage: 'Verify CPU usage of node.', }); const fullActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.fullAction', { defaultMessage: 'View node', @@ -169,28 +252,8 @@ export class CpuUsageRule extends BaseRule { ccs ); const action = `[${fullActionText}](${globalStateLink})`; - const internalShortMessage = i18n.translate( - 'xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage', - { - defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {shortActionText}`, - values: { - clusterName: cluster.clusterName, - nodeName: firingNode.nodeName, - shortActionText, - }, - } - ); - const internalFullMessage = i18n.translate( - 'xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage', - { - defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {action}`, - values: { - clusterName: cluster.clusterName, - nodeName: firingNode.nodeName, - action, - }, - } - ); + const internalShortMessage = this.getMessage(firingNode, cluster.clusterName, shortActionText); + const internalFullMessage = this.getMessage(firingNode, cluster.clusterName, action); instance.scheduleActions('default', { internalShortMessage, internalFullMessage: Globals.app.isCloud ? internalShortMessage : internalFullMessage, @@ -206,4 +269,33 @@ export class CpuUsageRule extends BaseRule { actionPlain: shortActionText, }); } + + private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) { + const stat = state.meta as AlertCpuUsageNodeStats; + + if ( + stat.missingLimits || + stat.limitsChanged || + stat.unexpectedLimits || + stat.cpuUsage === undefined + ) { + return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', { + defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`, + values: { + clusterName, + nodeName: state.nodeName, + action, + }, + }); + } + + return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessage', { + defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster {clusterName}. 
{action}`, + values: { + clusterName, + nodeName: state.nodeName, + action, + }, + }); + } } diff --git a/x-pack/plugins/monitoring/server/lib/alerts/__snapshots__/fetch_cpu_usage_node_stats.test.ts.snap b/x-pack/plugins/monitoring/server/lib/alerts/__snapshots__/fetch_cpu_usage_node_stats.test.ts.snap new file mode 100644 index 0000000000000..9a06dcd7263d2 --- /dev/null +++ b/x-pack/plugins/monitoring/server/lib/alerts/__snapshots__/fetch_cpu_usage_node_stats.test.ts.snap @@ -0,0 +1,247 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`fetchCpuUsageNodeStats when running in a container calculates the containerized CPU usage 1`] = ` +Object { + "aggs": Object { + "clusters": Object { + "aggs": Object { + "nodes": Object { + "aggs": Object { + "average_cpu_usage_percent": Object { + "avg": Object { + "field": "node_stats.process.cpu.percent", + }, + }, + "index": Object { + "terms": Object { + "field": "_index", + "size": 1, + }, + }, + "max_periods": Object { + "max": Object { + "field": "node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods", + }, + }, + "max_usage_nanos": Object { + "max": Object { + "field": "node_stats.os.cgroup.cpuacct.usage_nanos", + }, + }, + "min_periods": Object { + "min": Object { + "field": "node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods", + }, + }, + "min_usage_nanos": Object { + "min": Object { + "field": "node_stats.os.cgroup.cpuacct.usage_nanos", + }, + }, + "name": Object { + "terms": Object { + "field": "source_node.name", + "size": 1, + }, + }, + "quota_micros_max": Object { + "max": Object { + "field": "node_stats.os.cgroup.cpu.cfs_quota_micros", + }, + }, + "quota_micros_min": Object { + "min": Object { + "field": "node_stats.os.cgroup.cpu.cfs_quota_micros", + }, + }, + }, + "terms": Object { + "field": "node_stats.node_id", + "size": 10, + }, + }, + }, + "terms": Object { + "field": "cluster_uuid", + "size": 10, + }, + }, + }, + "filter_path": Array [ + "aggregations", + ], + "index": ".monitoring-es-*,metrics-elasticsearch.stack_monitoring.node_stats-*", + "query": Object { + "bool": Object { + "filter": Array [ + Object { + "bool": Object { + "minimum_should_match": 1, + "should": Array [ + Object { + "term": Object { + "type": "node_stats", + }, + }, + Object { + "term": Object { + "metricset.name": "node_stats", + }, + }, + Object { + "term": Object { + "data_stream.dataset": "elasticsearch.stack_monitoring.node_stats", + }, + }, + ], + }, + }, + Object { + "terms": Object { + "cluster_uuid": Array [ + "my-test-cluster", + ], + }, + }, + Object { + "range": Object { + "timestamp": Object { + "format": "epoch_millis", + "gte": 0, + "lte": 10, + }, + }, + }, + Object { + "bool": Object { + "minimum_should_match": 1, + "should": Array [ + Object { + "term": Object { + "cluster_uuid": Object { + "value": "my-test-cluster", + }, + }, + }, + ], + }, + }, + ], + }, + }, + "size": 0, +} +`; + +exports[`fetchCpuUsageNodeStats when running outside a container calculates the CPU usage 1`] = ` +Object { + "aggs": Object { + "clusters": Object { + "aggs": Object { + "nodes": Object { + "aggs": Object { + "average_cpu": Object { + "avg": Object { + "field": "node_stats.process.cpu.percent", + }, + }, + "index": Object { + "terms": Object { + "field": "_index", + "size": 1, + }, + }, + "name": Object { + "terms": Object { + "field": "source_node.name", + "size": 1, + }, + }, + "quota_micros_max": Object { + "max": Object { + "field": "node_stats.os.cgroup.cpu.cfs_quota_micros", + }, + }, + "quota_micros_min": Object { + "min": Object { + 
"field": "node_stats.os.cgroup.cpu.cfs_quota_micros", + }, + }, + }, + "terms": Object { + "field": "node_stats.node_id", + "size": 10, + }, + }, + }, + "terms": Object { + "field": "cluster_uuid", + "size": 10, + }, + }, + }, + "filter_path": Array [ + "aggregations", + ], + "index": ".monitoring-es-*,metrics-elasticsearch.stack_monitoring.node_stats-*", + "query": Object { + "bool": Object { + "filter": Array [ + Object { + "bool": Object { + "minimum_should_match": 1, + "should": Array [ + Object { + "term": Object { + "type": "node_stats", + }, + }, + Object { + "term": Object { + "metricset.name": "node_stats", + }, + }, + Object { + "term": Object { + "data_stream.dataset": "elasticsearch.stack_monitoring.node_stats", + }, + }, + ], + }, + }, + Object { + "terms": Object { + "cluster_uuid": Array [ + "my-test-cluster", + ], + }, + }, + Object { + "range": Object { + "timestamp": Object { + "format": "epoch_millis", + "gte": 0, + "lte": 10, + }, + }, + }, + Object { + "bool": Object { + "minimum_should_match": 1, + "should": Array [ + Object { + "term": Object { + "cluster_uuid": Object { + "value": "my-test-cluster", + }, + }, + }, + ], + }, + }, + ], + }, + }, + "size": 0, +} +`; diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts index 77c96e8b6138a..9551b30d1c2d2 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts @@ -5,64 +5,75 @@ * 2.0. */ -import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; import { elasticsearchClientMock } from '@kbn/core-elasticsearch-client-server-mocks'; +import { loggerMock } from '@kbn/logging-mocks'; import { fetchCpuUsageNodeStats } from './fetch_cpu_usage_node_stats'; -jest.mock('../../static_globals', () => ({ - Globals: { - app: { - config: { - ui: { - ccs: { enabled: true }, +describe('fetchCpuUsageNodeStats', () => { + describe('when running outside a container', () => { + const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser; + + const configSlice: any = { + ui: { + ccs: { enabled: false }, + container: { + elasticsearch: { + enabled: false, + }, }, + max_bucket_size: 10, }, - }, - }, -})); + }; -describe('fetchCpuUsageNodeStats', () => { - const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser; - const clusters = [ - { - clusterUuid: 'abc123', - clusterName: 'test', - }, - ]; - const startMs = 0; - const endMs = 0; - const size = 10; + const filterQuery = { + bool: { + should: [ + { + term: { + cluster_uuid: { + value: 'my-test-cluster', + }, + }, + }, + ], + minimum_should_match: 1, + }, + }; - it('fetch normal stats', async () => { - esClient.search.mockResponse( - // @ts-expect-error not full response interface - { + it('calculates the CPU usage', async () => { + esClient.search.mockResponse({ aggregations: { clusters: { buckets: [ { - key: clusters[0].clusterUuid, + key: 'my-test-cluster', nodes: { buckets: [ { - key: 'theNodeId', - index: { + key: 'my-test-node', + average_cpu: { + value: 45, + }, + quota_micros_max: { + value: null, + }, + quota_micros_min: { + value: null, + }, + name: { buckets: [ { - key: '.monitoring-es-TODAY', + key: 'test-node', }, ], }, - name: { + index: { buckets: [ { - key: 'theNodeName', + key: 'a-local-index', }, ], }, - average_cpu: { - value: 10, - }, }, ], }, @@ -70,66 +81,186 @@ 
describe('fetchCpuUsageNodeStats', () => { ], }, }, - } - ); - const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size); - expect(result).toEqual([ - { - clusterUuid: clusters[0].clusterUuid, - nodeName: 'theNodeName', - nodeId: 'theNodeId', - cpuUsage: 10, - containerUsage: undefined, - containerPeriods: undefined, - containerQuota: undefined, - ccs: null, - }, - ]); - }); + } as any); + + const stats = await fetchCpuUsageNodeStats( + { + esClient, + clusterUuids: ['my-test-cluster'], + startMs: 0, + endMs: 10, + filterQuery, + logger: loggerMock.create(), + }, + configSlice + ); + + expect(stats).toEqual([ + { + clusterUuid: 'my-test-cluster', + nodeId: 'my-test-node', + nodeName: 'test-node', + ccs: undefined, + cpuUsage: 45, + unexpectedLimits: false, + }, + ]); - it('fetch container stats', async () => { - esClient.search.mockResponse( - // @ts-expect-error not full response interface - { + // If this check fails, it means the query has changed which `might` mean the response shape has changed and + // the test data needs to be updated to reflect the new format. + expect(esClient.search.mock.calls[0][0]).toMatchSnapshot(); + }); + + it('warns about container metrics being present', async () => { + esClient.search.mockResponse({ aggregations: { clusters: { buckets: [ { - key: clusters[0].clusterUuid, + key: 'my-test-cluster', nodes: { buckets: [ { - key: 'theNodeId', + key: 'my-test-node', + average_cpu: { + value: 45, + }, + quota_micros_max: { + value: -1, + }, + quota_micros_min: { + value: -1, + }, + name: { + buckets: [ + { + key: 'test-node', + }, + ], + }, index: { buckets: [ { - key: '.monitoring-es-TODAY', + key: 'a-local-index', }, ], }, + }, + ], + }, + }, + ], + }, + }, + } as any); + + const stats = await fetchCpuUsageNodeStats( + { + esClient, + clusterUuids: ['my-test-cluster'], + startMs: 0, + endMs: 10, + filterQuery, + logger: loggerMock.create(), + }, + configSlice + ); + + expect(stats).toEqual([ + { + unexpectedLimits: true, + clusterUuid: 'my-test-cluster', + nodeId: 'my-test-node', + nodeName: 'test-node', + ccs: undefined, + cpuUsage: 45, + }, + ]); + }); + }); + + describe('when running in a container', () => { + const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser; + + const configSlice: any = { + ui: { + ccs: { enabled: false }, + container: { + elasticsearch: { + enabled: true, + }, + }, + max_bucket_size: 10, + }, + }; + + const filterQuery = { + bool: { + should: [ + { + term: { + cluster_uuid: { + value: 'my-test-cluster', + }, + }, + }, + ], + minimum_should_match: 1, + }, + }; + + it('calculates the containerized CPU usage', async () => { + // 45% CPU usage + const maxPeriods = 1000; + const quotaMicros = 100000; + const usageLimitNanos = maxPeriods * quotaMicros * 1000; + const maxUsageNanos = 0.45 * usageLimitNanos; + + esClient.search.mockResponse({ + aggregations: { + clusters: { + buckets: [ + { + key: 'my-test-cluster', + nodes: { + buckets: [ + { + key: 'my-test-node', + min_usage_nanos: { + value: 0, + }, + max_usage_nanos: { + value: maxUsageNanos, + }, + min_periods: { + value: 0, + }, + max_periods: { + value: maxPeriods, + }, + quota_micros_min: { + value: quotaMicros, + }, + quota_micros_max: { + value: quotaMicros, + }, + average_cpu_usage_percent: { + value: 45, + }, name: { buckets: [ { - key: 'theNodeName', + key: 'test-node', }, ], }, - histo: { + index: { buckets: [ - null, { - usage_deriv: { - normalized_value: 10, - }, - periods_deriv: { - normalized_value: 5, - }, + key: 
'a-local-index', }, ], }, - average_quota: { - value: 50, - }, }, ], }, @@ -137,59 +268,81 @@ describe('fetchCpuUsageNodeStats', () => { ], }, }, - } - ); - const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size); - expect(result).toEqual([ - { - clusterUuid: clusters[0].clusterUuid, - nodeName: 'theNodeName', - nodeId: 'theNodeId', - cpuUsage: undefined, - containerUsage: 10, - containerPeriods: 5, - containerQuota: 50, - ccs: null, - }, - ]); - }); + } as any); + + const stats = await fetchCpuUsageNodeStats( + { + esClient, + clusterUuids: ['my-test-cluster'], + startMs: 0, + endMs: 10, + filterQuery, + logger: loggerMock.create(), + }, + configSlice + ); + + expect(stats).toEqual([ + { + clusterUuid: 'my-test-cluster', + nodeId: 'my-test-node', + nodeName: 'test-node', + ccs: undefined, + cpuUsage: 45, + }, + ]); + + // If this check fails, it means the query has changed which `might` mean the response shape has changed and + // the test data needs to be updated to reflect the new format. + expect(esClient.search.mock.calls[0][0]).toMatchSnapshot(); + }); - it('fetch properly return ccs', async () => { - esClient.search.mockResponse( - // @ts-expect-error not full response interface - { + it('warns about resource usage limits not being set', async () => { + esClient.search.mockResponse({ aggregations: { clusters: { buckets: [ { - key: clusters[0].clusterUuid, + key: 'my-test-cluster', nodes: { buckets: [ { - key: 'theNodeId', - index: { + key: 'my-test-node', + min_usage_nanos: { + value: 0, + }, + max_usage_nanos: { + value: 1000, + }, + min_periods: { + value: 0, + }, + max_periods: { + value: 100, + }, + quota_micros_min: { + value: -1, + }, + quota_micros_max: { + value: -1, + }, + average_cpu_usage_percent: { + value: 45, + }, + name: { buckets: [ { - key: 'foo:.monitoring-es-TODAY', + key: 'test-node', }, ], }, - name: { + index: { buckets: [ { - key: 'theNodeName', + key: 'a-local-index', }, ], }, - average_usage: { - value: 10, - }, - average_periods: { - value: 5, - }, - average_quota: { - value: 50, - }, }, ], }, @@ -197,90 +350,187 @@ describe('fetchCpuUsageNodeStats', () => { ], }, }, - } - ); - const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size); - expect(result[0].ccs).toBe('foo'); - }); + } as any); - it('should use consistent params', async () => { - let params = null; - esClient.search.mockImplementation((...args) => { - params = args[0]; - return Promise.resolve({} as estypes.SearchResponse); + const stats = await fetchCpuUsageNodeStats( + { + esClient, + clusterUuids: ['my-test-cluster'], + startMs: 0, + endMs: 10, + filterQuery, + logger: loggerMock.create(), + }, + configSlice + ); + + expect(stats).toEqual([ + { + missingLimits: true, + clusterUuid: 'my-test-cluster', + nodeId: 'my-test-node', + nodeName: 'test-node', + ccs: undefined, + cpuUsage: 45, + }, + ]); }); - const filterQuery = - '{"bool":{"should":[{"exists":{"field":"cluster_uuid"}}],"minimum_should_match":1}}'; - await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size, filterQuery); - expect(params).toStrictEqual({ - index: - '*:.monitoring-es-*,.monitoring-es-*,*:metrics-elasticsearch.stack_monitoring.node_stats-*,metrics-elasticsearch.stack_monitoring.node_stats-*', - filter_path: ['aggregations'], - body: { - size: 0, - query: { - bool: { - filter: [ - { terms: { cluster_uuid: ['abc123'] } }, + + it('warns about resource usage limits being changed', async () => { + esClient.search.mockResponse({ + aggregations: { + 
clusters: { + buckets: [ { - bool: { - should: [ - { term: { type: 'node_stats' } }, - { term: { 'metricset.name': 'node_stats' } }, + key: 'my-test-cluster', + nodes: { + buckets: [ { - term: { 'data_stream.dataset': 'elasticsearch.stack_monitoring.node_stats' }, + key: 'my-test-node', + min_usage_nanos: { + value: 0, + }, + max_usage_nanos: { + value: 1000, + }, + min_periods: { + value: 0, + }, + max_periods: { + value: 100, + }, + quota_micros_min: { + value: -1, + }, + quota_micros_max: { + value: 10000, + }, + average_cpu_usage_percent: { + value: 45, + }, + name: { + buckets: [ + { + key: 'test-node', + }, + ], + }, + index: { + buckets: [ + { + key: 'a-local-index', + }, + ], + }, }, ], - minimum_should_match: 1, }, }, - { range: { timestamp: { format: 'epoch_millis', gte: 0, lte: 0 } } }, - { - bool: { should: [{ exists: { field: 'cluster_uuid' } }], minimum_should_match: 1 }, - }, ], }, }, - aggs: { + } as any); + + const stats = await fetchCpuUsageNodeStats( + { + esClient, + clusterUuids: ['my-test-cluster'], + startMs: 0, + endMs: 10, + filterQuery, + logger: loggerMock.create(), + }, + configSlice + ); + + expect(stats).toEqual([ + { + limitsChanged: true, + clusterUuid: 'my-test-cluster', + nodeId: 'my-test-node', + nodeName: 'test-node', + ccs: undefined, + cpuUsage: undefined, + }, + ]); + }); + + it('warns about failing to compute usage due to values missing', async () => { + esClient.search.mockResponse({ + aggregations: { clusters: { - terms: { field: 'cluster_uuid', size: 10, include: ['abc123'] }, - aggs: { - nodes: { - terms: { field: 'node_stats.node_id', size: 10 }, - aggs: { - index: { terms: { field: '_index', size: 1 } }, - average_cpu: { avg: { field: 'node_stats.process.cpu.percent' } }, - average_quota: { avg: { field: 'node_stats.os.cgroup.cpu.cfs_quota_micros' } }, - name: { terms: { field: 'source_node.name', size: 1 } }, - histo: { - date_histogram: { field: 'timestamp', fixed_interval: '0m' }, - aggs: { - average_periods: { - max: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' }, - }, - average_usage: { max: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } }, - usage_deriv: { - derivative: { - buckets_path: 'average_usage', - gap_policy: 'skip', - unit: '1s', - }, - }, - periods_deriv: { - derivative: { - buckets_path: 'average_periods', - gap_policy: 'skip', - unit: '1s', - }, + buckets: [ + { + key: 'my-test-cluster', + nodes: { + buckets: [ + { + key: 'my-test-node', + min_usage_nanos: { + value: null, + }, + max_usage_nanos: { + value: null, + }, + min_periods: { + value: null, + }, + max_periods: { + value: null, + }, + quota_micros_min: { + value: 10000, + }, + quota_micros_max: { + value: 10000, + }, + average_cpu_usage_percent: { + value: 45, + }, + name: { + buckets: [ + { + key: 'test-node', + }, + ], + }, + index: { + buckets: [ + { + key: 'a-local-index', + }, + ], }, }, - }, + ], }, }, - }, + ], }, }, - }, + } as any); + + const stats = await fetchCpuUsageNodeStats( + { + esClient, + clusterUuids: ['my-test-cluster'], + startMs: 0, + endMs: 10, + filterQuery, + logger: loggerMock.create(), + }, + configSlice + ); + + expect(stats).toEqual([ + { + clusterUuid: 'my-test-cluster', + nodeId: 'my-test-node', + nodeName: 'test-node', + ccs: undefined, + cpuUsage: undefined, + }, + ]); }); }); }); diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts index 8037ad94e6764..014c38f447e1e 100644 --- 
a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts @@ -5,139 +5,346 @@ * 2.0. */ -import { ElasticsearchClient } from '@kbn/core/server'; -import { get } from 'lodash'; -import moment from 'moment'; -import { NORMALIZED_DERIVATIVE_UNIT } from '../../../common/constants'; -import { AlertCluster, AlertCpuUsageNodeStats } from '../../../common/types/alerts'; -import { createDatasetFilter } from './create_dataset_query_filter'; -import { getIndexPatterns, getElasticsearchDataset } from '../cluster/get_index_patterns'; -import { Globals } from '../../static_globals'; +import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types'; +import { ElasticsearchClient, Logger } from '@kbn/core/server'; +import { InferSearchResponseOf } from '@kbn/es-types'; import { CCS_REMOTE_PATTERN } from '../../../common/constants'; +import { AlertCpuUsageNodeStats } from '../../../common/types/alerts'; +import { MonitoringConfig } from '../../config'; +import { getElasticsearchDataset, getIndexPatterns } from '../cluster/get_index_patterns'; +import { createDatasetFilter } from './create_dataset_query_filter'; -interface NodeBucketESResponse { - key: string; - average_cpu: { value: number }; +interface CpuUsageFieldsWithValues { + 'max of node_stats.os.cgroup.cpu.cfs_quota_micros': number | null; + 'max of node_stats.os.cgroup.cpuacct.usage_nanos': number | null; + 'min of node_stats.os.cgroup.cpuacct.usage_nanos': number | null; + 'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null; + 'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null; } -interface ClusterBucketESResponse { - key: string; - nodes: { - buckets: NodeBucketESResponse[]; - }; +interface Options { + esClient: ElasticsearchClient; + clusterUuids: string[]; + startMs: number; + endMs: number; + filterQuery?: QueryDslQueryContainer; + logger: Logger; } export async function fetchCpuUsageNodeStats( - esClient: ElasticsearchClient, - clusters: AlertCluster[], - startMs: number, - endMs: number, - size: number, - filterQuery?: string + options: Options, + config: MonitoringConfig ): Promise { - // Using pure MS didn't seem to work well with the date_histogram interval - // but minutes does - const intervalInMinutes = moment.duration(endMs - startMs).asMinutes(); + if (config.ui.container.elasticsearch.enabled) { + options.logger.debug('CPU usage rule: Computing usage for containerized clusters'); + return fetchContainerStats(options, config); + } + + options.logger.debug('CPU usage rule: Computing usage for non-containerized clusters'); + return fetchNonContainerStats(options, config); +} +async function fetchContainerStats( + { esClient, startMs, endMs, clusterUuids, filterQuery, logger }: Options, + config: MonitoringConfig +) { const indexPatterns = getIndexPatterns({ - config: Globals.app.config, + config, moduleType: 'elasticsearch', dataset: 'node_stats', ccs: CCS_REMOTE_PATTERN, }); + const params = { index: indexPatterns, filter_path: ['aggregations'], - body: { - size: 0, - query: { - bool: { - filter: [ - { - terms: { - cluster_uuid: clusters.map((cluster) => cluster.clusterUuid), + size: 0, + query: { + bool: { + filter: [ + createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')), + { + terms: { + cluster_uuid: clusterUuids, + }, + }, + { + range: { + timestamp: { + format: 'epoch_millis', + gte: startMs, + lte: endMs, }, }, - 
createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')), - { - range: { - timestamp: { - format: 'epoch_millis', - gte: startMs, - lte: endMs, + }, + ], + }, + }, + aggs: { + clusters: { + terms: { + field: 'cluster_uuid', + size: config.ui.max_bucket_size, + }, + aggs: { + nodes: { + terms: { + field: 'node_stats.node_id', + size: config.ui.max_bucket_size, + }, + aggs: { + name: { + terms: { + field: 'source_node.name', + size: 1, + }, + }, + // Used to check for CCS and get the remote cluster name + index: { + terms: { + field: '_index', + size: 1, + }, + }, + // Fallback value in case container limits are not specified + average_cpu_usage_percent: { + avg: { + field: 'node_stats.process.cpu.percent', + }, + }, + // Container limit min and max, to calculate usage and detect config changes + quota_micros_max: { + max: { + field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', + }, + }, + quota_micros_min: { + min: { + field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', + }, + }, + // Usage to calculate delta + max_usage_nanos: { + max: { + field: 'node_stats.os.cgroup.cpuacct.usage_nanos', + }, + }, + min_usage_nanos: { + min: { + field: 'node_stats.os.cgroup.cpuacct.usage_nanos', + }, + }, + // Periods to calculate delta + max_periods: { + max: { + field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods', + }, + }, + min_periods: { + min: { + field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods', }, }, }, - ], + }, }, }, - aggs: { - clusters: { - terms: { - field: 'cluster_uuid', - size, - include: clusters.map((cluster) => cluster.clusterUuid), + }, + }; + + if (filterQuery) { + (params.query!.bool!.filter! as QueryDslQueryContainer[]).push(filterQuery); + } + + const response = (await esClient.search(params)) as unknown as InferSearchResponseOf< + unknown, + typeof params + >; + + if (!response.aggregations) { + throw new Error('Failed to resolve needed aggregations for CPU Usage Rule'); + } + + return response.aggregations.clusters.buckets.flatMap((cluster) => { + return cluster.nodes.buckets.map((node): AlertCpuUsageNodeStats => { + let nodeName; + if (node.name.buckets.length) { + nodeName = node.name.buckets[0].key as string; + } + + let ccs; + if (node.index.buckets.length) { + const index = node.index.buckets[0].key as string; + ccs = index.includes(':') ? index.split(':')[0] : undefined; + } + + const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1; + const notRunningInAContainer = + node.quota_micros_min.value === null && node.quota_micros_max.value === null; + if (limitsNotSet || notRunningInAContainer) { + return { + missingLimits: true, + clusterUuid: cluster.key as string, + nodeId: node.key as string, + cpuUsage: node.average_cpu_usage_percent.value ?? 
undefined, + nodeName, + ccs, + }; + } + + if (node.quota_micros_min.value !== node.quota_micros_max.value) { + return { + limitsChanged: true, + clusterUuid: cluster.key as string, + nodeId: node.key as string, + cpuUsage: undefined, + nodeName, + ccs, + }; + } + + if ( + node.max_usage_nanos.value === null || + node.min_usage_nanos.value === null || + node.max_periods.value === null || + node.min_periods.value === null || + node.quota_micros_max.value === null + ) { + logger.warn( + `CPU usage rule: Some aggregated values needed for container CPU usage calculation was empty: ${findEmptyValues( + { + 'max of node_stats.os.cgroup.cpu.cfs_quota_micros': node.quota_micros_max.value, + 'max of node_stats.os.cgroup.cpuacct.usage_nanos': node.max_usage_nanos.value, + 'min of node_stats.os.cgroup.cpuacct.usage_nanos': node.min_usage_nanos.value, + 'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': + node.max_periods.value, + 'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': + node.min_periods.value, + } + )}` + ); + + return { + clusterUuid: cluster.key as string, + nodeId: node.key as string, + cpuUsage: undefined, + nodeName, + ccs, + }; + } + + const usageDeltaNanos = node.max_usage_nanos.value - node.min_usage_nanos.value; + const periodsDelta = node.max_periods.value - node.min_periods.value; + + const cpuUsage = computeCfsPercentCpuUsage( + usageDeltaNanos, + node.quota_micros_max.value, + periodsDelta + ); + + return { + clusterUuid: cluster.key as string, + nodeId: node.key as string, + cpuUsage: Math.round(cpuUsage * 100) / 100, + nodeName, + ccs, + }; + }); + }); +} + +function findEmptyValues(fieldsWithValues: CpuUsageFieldsWithValues): string { + const entries: Array<[string, number | null]> = Object.entries(fieldsWithValues); + return entries + .filter(([, value]) => value === null) + .map(([key]) => key) + .join(', '); +} + +function computeCfsPercentCpuUsage(usageNanos: number, quotaMicros: number, periods: number) { + // See https://github.com/elastic/kibana/pull/159351 for an explanation of this formula + const quotaNanos = quotaMicros * 1000; + const limitNanos = quotaNanos * periods; + const usageAsFactor = usageNanos / limitNanos; + return usageAsFactor * 100; +} + +async function fetchNonContainerStats( + { esClient, startMs, endMs, clusterUuids, filterQuery }: Options, + config: MonitoringConfig +) { + const indexPatterns = getIndexPatterns({ + config, + moduleType: 'elasticsearch', + dataset: 'node_stats', + ccs: CCS_REMOTE_PATTERN, + }); + + const params = { + index: indexPatterns, + filter_path: ['aggregations'], + size: 0, + query: { + bool: { + filter: [ + createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')), + { + terms: { + cluster_uuid: clusterUuids, + }, + }, + { + range: { + timestamp: { + format: 'epoch_millis', + gte: startMs, + lte: endMs, + }, + }, }, - aggs: { - nodes: { - terms: { - field: 'node_stats.node_id', - size, - }, - aggs: { - index: { - terms: { - field: '_index', - size: 1, - }, + ], + }, + }, + aggs: { + clusters: { + terms: { + field: 'cluster_uuid', + size: config.ui.max_bucket_size, + }, + aggs: { + nodes: { + terms: { + field: 'node_stats.node_id', + size: config.ui.max_bucket_size, + }, + aggs: { + name: { + terms: { + field: 'source_node.name', + size: 1, }, - average_cpu: { - avg: { - field: 'node_stats.process.cpu.percent', - }, + }, + // Used to check for CCS and get the remote cluster name + index: { + terms: { + field: '_index', + size: 1, }, - average_quota: { - avg: { - 
field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', - }, + }, + average_cpu: { + avg: { + field: 'node_stats.process.cpu.percent', }, - name: { - terms: { - field: 'source_node.name', - size: 1, - }, + }, + // Container limit min and max, to detect possible config errors + quota_micros_max: { + max: { + field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', }, - histo: { - date_histogram: { - field: 'timestamp', - fixed_interval: `${intervalInMinutes}m`, - }, - aggs: { - average_periods: { - max: { - field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods', - }, - }, - average_usage: { - max: { - field: 'node_stats.os.cgroup.cpuacct.usage_nanos', - }, - }, - usage_deriv: { - derivative: { - buckets_path: 'average_usage', - gap_policy: 'skip' as const, - unit: NORMALIZED_DERIVATIVE_UNIT, - }, - }, - periods_deriv: { - derivative: { - buckets_path: 'average_periods', - gap_policy: 'skip' as const, - unit: NORMALIZED_DERIVATIVE_UNIT, - }, - }, - }, + }, + quota_micros_min: { + min: { + field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', }, }, }, @@ -147,38 +354,43 @@ export async function fetchCpuUsageNodeStats( }, }; - try { - if (filterQuery) { - const filterQueryObject = JSON.parse(filterQuery); - params.body.query.bool.filter.push(filterQueryObject); - } - } catch (e) { - // meh + if (filterQuery) { + (params.query!.bool!.filter! as QueryDslQueryContainer[]).push(filterQuery); } - const response = await esClient.search(params); - const stats: AlertCpuUsageNodeStats[] = []; - const clusterBuckets = get( - response, - 'aggregations.clusters.buckets', - [] - ) as ClusterBucketESResponse[]; - for (const clusterBucket of clusterBuckets) { - for (const node of clusterBucket.nodes.buckets) { - const lastBucket = get(node, 'histo.buckets[1]', {}); - const indexName = get(node, 'index.buckets[0].key', ''); - const stat = { - clusterUuid: clusterBucket.key, - nodeId: node.key, - nodeName: get(node, 'name.buckets[0].key'), - cpuUsage: get(node, 'average_cpu.value'), - containerUsage: get(lastBucket, 'usage_deriv.normalized_value'), - containerPeriods: get(lastBucket, 'periods_deriv.normalized_value'), - containerQuota: get(node, 'average_quota.value'), - ccs: indexName.includes(':') ? indexName.split(':')[0] : null, - }; - stats.push(stat); - } + const response = (await esClient.search(params)) as unknown as InferSearchResponseOf< + unknown, + typeof params + >; + + if (!response.aggregations) { + throw new Error('Failed to resolve needed aggregations for CPU Usage Rule'); } - return stats; + + return response.aggregations.clusters.buckets.flatMap((cluster) => { + return cluster.nodes.buckets.map((node): AlertCpuUsageNodeStats => { + let nodeName; + if (node.name.buckets.length) { + nodeName = node.name.buckets[0].key as string; + } + + let ccs; + if (node.index.buckets.length) { + const index = node.index.buckets[0].key as string; + ccs = index.includes(':') ? index.split(':')[0] : undefined; + } + + const runningInAContainer = + node.quota_micros_min.value !== null || node.quota_micros_max.value !== null; + + return { + clusterUuid: cluster.key as string, + nodeId: node.key as string, + cpuUsage: node.average_cpu.value ?? 
undefined, + nodeName, + ccs, + unexpectedLimits: runningInAContainer, + }; + }); + }); } diff --git a/x-pack/plugins/monitoring/tsconfig.json b/x-pack/plugins/monitoring/tsconfig.json index 00ca962568141..d70d8b51fcd08 100644 --- a/x-pack/plugins/monitoring/tsconfig.json +++ b/x-pack/plugins/monitoring/tsconfig.json @@ -41,6 +41,7 @@ "@kbn/shared-ux-router", "@kbn/observability-shared-plugin", "@kbn/shared-ux-link-redirect-app", + "@kbn/es-types", "@kbn/logs-shared-plugin", ], "exclude": [ diff --git a/x-pack/plugins/translations/translations/fr-FR.json b/x-pack/plugins/translations/translations/fr-FR.json index 8d312cf306da9..28590adc654b8 100644 --- a/x-pack/plugins/translations/translations/fr-FR.json +++ b/x-pack/plugins/translations/translations/fr-FR.json @@ -24904,9 +24904,6 @@ "xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "L'alerte d'intégrité de cluster se déclenche pour {clusterName}. L'intégrité actuelle est {health}. {actionText}", "xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "L'intégrité du cluster Elasticsearch est {health}.", "xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. #start_linkView now#end_link", - "xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "L'alerte d'utilisation CPU se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {action}", - "xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "L'alerte d'utilisation CPU se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {shortActionText}", - "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "Le nœud #start_link{nodeName}#end_link signale une utilisation CPU de {cpuUsage} % à #absolute", "xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "L'alerte d'utilisation du disque se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {action}", "xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "L'alerte d'utilisation du disque se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {shortActionText}", "xpack.monitoring.alerts.diskUsage.ui.firingMessage": "Le nœud #start_link{nodeName}#end_link signale une utilisation du disque de {diskUsage} % à #absolute", diff --git a/x-pack/plugins/translations/translations/ja-JP.json b/x-pack/plugins/translations/translations/ja-JP.json index d9ca725bbd4dd..d87d2be8ee3d3 100644 --- a/x-pack/plugins/translations/translations/ja-JP.json +++ b/x-pack/plugins/translations/translations/ja-JP.json @@ -24890,9 +24890,6 @@ "xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "クラスター正常性アラートが{clusterName}に対して作動しています。現在のヘルスは{health}です。{actionText}", "xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "Elasticsearchクラスターの正常性は{health}です。", "xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. 
#start_linkView now#end_link", - "xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "クラスター{clusterName}のノード{nodeName}について、CPU使用率のアラートが発生しています。{action}", - "xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "クラスター{clusterName}のノード{nodeName}について、CPU使用率のアラートが発生しています。{shortActionText}", - "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "ノード#start_link{nodeName}#end_linkは、#absoluteでCPU使用率{cpuUsage}%を報告しています", "xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "クラスター{clusterName}のノード{nodeName}について、ディスク使用率のアラートが発生しています。{action}", "xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "クラスター{clusterName}のノード{nodeName}について、ディスク使用率のアラートが発生しています。{shortActionText}", "xpack.monitoring.alerts.diskUsage.ui.firingMessage": "ノード#start_link{nodeName}#end_linkは、#absoluteでディスク使用率{diskUsage}%を報告しています", diff --git a/x-pack/plugins/translations/translations/zh-CN.json b/x-pack/plugins/translations/translations/zh-CN.json index a778e8193a149..5d3c8e37ad605 100644 --- a/x-pack/plugins/translations/translations/zh-CN.json +++ b/x-pack/plugins/translations/translations/zh-CN.json @@ -24889,9 +24889,6 @@ "xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "为 {clusterName} 触发了集群运行状况告警。当前运行状况为 {health}。{actionText}", "xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "Elasticsearch 集群运行状况为 {health}。", "xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. #start_linkView now#end_link", - "xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了 CPU 使用率告警。{action}", - "xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了 CPU 使用率告警。{shortActionText}", - "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "节点 #start_link{nodeName}#end_link 于 #absolute报告 cpu 使用率为 {cpuUsage}%", "xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了磁盘使用率告警。{action}", "xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了磁盘使用率告警。{shortActionText}", "xpack.monitoring.alerts.diskUsage.ui.firingMessage": "节点 #start_link{nodeName}#end_link 于 #absolute 报告磁盘使用率为 {diskUsage}%",