Skip to content

Commit

Permalink
[monitoring] Revert CPU Usage rule changes (#172913)
Browse files Browse the repository at this point in the history
Reverts #159351
Reverts #167244

Due to the many unexpected issues that these changes introduced, we've
decided to revert them until we have better solutions for the problems
we've learnt about.

Problems:
- Gaps in data cause alerts to fire (see next point)
- Normal CPU rescaling causes alerts to fire (see #160905)
- Any error fires an alert (since there is no other way to inform the
user about the problems faced by the rule executor)
- Many of the assumptions that cgroups are only used by container users
are wrong

To address some of these issues, we also need the alerting framework to
support registering secondary actions, so that we can trigger
non-on-call workflows when a rule runs into problems while evaluating
the stats.

Original issue #116128
  • Loading branch information
miltonhultgren authored Dec 8, 2023
1 parent 7f61e2a commit 55bc6d5
Show file tree
Hide file tree
Showing 10 changed files with 398 additions and 1,385 deletions.
7 changes: 4 additions & 3 deletions x-pack/plugins/monitoring/common/types/alerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,10 @@ export interface AlertNodeStats {
}

export interface AlertCpuUsageNodeStats extends AlertNodeStats {
cpuUsage?: number;
limitsChanged?: boolean;
unexpectedLimits?: boolean;
cpuUsage: number;
containerUsage: number;
containerPeriods: number;
containerQuota: number;
}

export interface AlertThreadPoolRejectionsStats {
Expand Down
351 changes: 15 additions & 336 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts

Large diffs are not rendered by default.

187 changes: 59 additions & 128 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import { ElasticsearchClient } from '@kbn/core/server';
import { Alert } from '@kbn/alerting-plugin/server';
import { RawAlertInstance, SanitizedRule } from '@kbn/alerting-plugin/common';
import { parseDuration } from '@kbn/alerting-plugin/common/parse_duration';
import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';
import { BaseRule } from './base_rule';
import {
AlertData,
Expand Down Expand Up @@ -47,7 +46,7 @@ export class CpuUsageRule extends BaseRule {
{
name: 'node',
description: i18n.translate('xpack.monitoring.alerts.cpuUsage.actionVariables.node', {
defaultMessage: 'The node reporting high CPU usage.',
defaultMessage: 'The node reporting high cpu usage.',
}),
},
...Object.values(AlertingDefaults.ALERT_TYPE.context),
Expand All @@ -63,52 +62,28 @@ export class CpuUsageRule extends BaseRule {
const duration = parseDuration(params.duration);
const endMs = +new Date();
const startMs = endMs - duration;

let filterQuery;
if (params.filterQuery) {
try {
filterQuery = JSON.parse(params.filterQuery) as QueryDslQueryContainer;
} catch (error) {
throw new Error(`Failed to parse filter query in CPU usage rule ${error}`);
}
}

const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: clusters.map((cluster) => cluster.clusterUuid),
startMs,
endMs,
filterQuery,
logger: this.scopedLogger,
},
Globals.app.config
esClient,
clusters,
startMs,
endMs,
Globals.app.config.ui.max_bucket_size,
params.filterQuery
);

return stats.map((stat) => ({
clusterUuid: stat.clusterUuid,
...this.outcomeAndSeverity(stat, params.threshold!),
meta: {
...stat,
threshold: params.threshold!,
},
ccs: stat.ccs,
}));
}

private outcomeAndSeverity(
stat: AlertCpuUsageNodeStats,
threshold: number
): { shouldFire: boolean; severity: AlertSeverity } {
if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
let severity = AlertSeverity.Warning;
if (stat.cpuUsage && stat.cpuUsage > threshold) {
severity = AlertSeverity.Danger;
return stats.map((stat) => {
if (Globals.app.config.ui.container.elasticsearch.enabled) {
stat.cpuUsage =
(stat.containerUsage / (stat.containerPeriods * stat.containerQuota * 1000)) * 100;
}
return { shouldFire: true, severity };
}

return { shouldFire: stat.cpuUsage > threshold, severity: AlertSeverity.Danger };
return {
clusterUuid: stat.clusterUuid,
shouldFire: stat.cpuUsage > params.threshold!,
severity: AlertSeverity.Danger,
meta: stat,
ccs: stat.ccs,
};
});
}

protected filterAlertInstance(alertInstance: RawAlertInstance, filters: CommonAlertFilter[]) {
Expand All @@ -127,67 +102,13 @@ export class CpuUsageRule extends BaseRule {
}

protected getUiMessage(alertState: AlertState, item: AlertData): AlertMessage {
const stat = item.meta as AlertCpuUsageNodeStats & Pick<CommonAlertParams, 'threshold'>;
const tokens = [
{
startToken: '#start_link',
endToken: '#end_link',
type: AlertMessageTokenType.Link,
url: `elasticsearch/nodes/${stat.nodeId}`,
} as AlertMessageLinkToken,
{
startToken: '#absolute',
type: AlertMessageTokenType.Time,
isAbsolute: true,
isRelative: false,
timestamp: alertState.ui.triggeredMS,
} as AlertMessageTimeToken,
];

if (stat.unexpectedLimits) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', {
defaultMessage: `Kibana is configured for non-containerized workloads but node #start_link{nodeName}#end_link has resource limits configured. Node reports usage of {cpuUsage}%. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
},
}),
tokens,
};
}

if (stat.limitsChanged) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.limitsChanged', {
defaultMessage: `Resource limits for node #start_link{nodeName}#end_link has changed within the look back window, unable to confidently calculate CPU usage for alerting. Please monitor the usage until the window has moved. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
},
}),
tokens,
};
}

if (stat.cpuUsage === undefined) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage', {
defaultMessage: `Failed to compute CPU usage for node #start_link{nodeName}#end_link. Please check the Kibana logs for more details. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
},
}),
tokens,
};
}

const stat = item.meta as AlertCpuUsageNodeStats;
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.firingMessage', {
defaultMessage: `Node #start_link{nodeName}#end_link is reporting CPU usage of {cpuUsage}% which is above the configured threshold of {threshold}%. Last checked at #absolute`,
defaultMessage: `Node #start_link{nodeName}#end_link is reporting cpu usage of {cpuUsage}% at #absolute`,
values: {
nodeName: stat.nodeName,
cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
threshold: stat.threshold,
},
}),
nextSteps: [
Expand All @@ -204,7 +125,21 @@ export class CpuUsageRule extends BaseRule {
`{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html`
),
],
tokens,
tokens: [
{
startToken: '#absolute',
type: AlertMessageTokenType.Time,
isAbsolute: true,
isRelative: false,
timestamp: alertState.ui.triggeredMS,
} as AlertMessageTimeToken,
{
startToken: '#start_link',
endToken: '#end_link',
type: AlertMessageTokenType.Link,
url: `elasticsearch/nodes/${stat.nodeId}`,
} as AlertMessageLinkToken,
],
};
}

Expand All @@ -222,7 +157,7 @@ export class CpuUsageRule extends BaseRule {
return;
}
const shortActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.shortAction', {
defaultMessage: 'Verify CPU usage of node.',
defaultMessage: 'Verify CPU level of node.',
});
const fullActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.fullAction', {
defaultMessage: 'View node',
Expand All @@ -234,8 +169,28 @@ export class CpuUsageRule extends BaseRule {
ccs
);
const action = `[${fullActionText}](${globalStateLink})`;
const internalShortMessage = this.getMessage(firingNode, cluster.clusterName, shortActionText);
const internalFullMessage = this.getMessage(firingNode, cluster.clusterName, action);
const internalShortMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage',
{
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {shortActionText}`,
values: {
clusterName: cluster.clusterName,
nodeName: firingNode.nodeName,
shortActionText,
},
}
);
const internalFullMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage',
{
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {action}`,
values: {
clusterName: cluster.clusterName,
nodeName: firingNode.nodeName,
action,
},
}
);
instance.scheduleActions('default', {
internalShortMessage,
internalFullMessage: Globals.app.isCloud ? internalShortMessage : internalFullMessage,
Expand All @@ -251,28 +206,4 @@ export class CpuUsageRule extends BaseRule {
actionPlain: shortActionText,
});
}

private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) {
const stat = state.meta as AlertCpuUsageNodeStats;

if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', {
defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`,
values: {
clusterName,
nodeName: state.nodeName,
action,
},
});
}

return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessage', {
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster {clusterName}. {action}`,
values: {
clusterName,
nodeName: state.nodeName,
action,
},
});
}
}
Loading

0 comments on commit 55bc6d5

Please sign in to comment.