diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet index 754346abb..3616d61f2 100644 --- a/alerts/apps_alerts.libsonnet +++ b/alerts/apps_alerts.libsonnet @@ -211,13 +211,13 @@ local utils = import '../lib/utils.libsonnet'; }, { expr: ||| - sum by (namespace, pod, container, %(clusterLabel)s) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0 + kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", %(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0 ||| % $._config, labels: { severity: 'warning', }, annotations: { - description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour.', + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").', summary: 'Pod container waiting longer than 1 hour', }, 'for': '1h', diff --git a/alerts/resource_alerts.libsonnet b/alerts/resource_alerts.libsonnet index 04a8e3a00..a8a355f76 100644 --- a/alerts/resource_alerts.libsonnet +++ b/alerts/resource_alerts.libsonnet @@ -199,9 +199,9 @@ { alert: 'CPUThrottlingHigh', expr: ||| - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace) + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node) / - sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace) + sum(increase(container_cpu_cfs_periods_total{%(cadvisorSelector)s, %(cpuThrottlingSelector)s}[5m])) without (id, metrics_path, name, image, endpoint, job, node) > ( %(cpuThrottlingPercent)s / 100 ) ||| % $._config, 'for': '15m',