From 215a7faf0eca1f0829eeb751e7149b3f3b4df368 Mon Sep 17 00:00:00 2001 From: d032408 Date: Wed, 12 Dec 2018 12:13:36 +0100 Subject: [PATCH] [kube-monitoring] bm disk errors are warnings because of raid redundancy --- .../charts/prometheus-frontend/metal-ironic.alerts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/system/kube-monitoring/charts/prometheus-frontend/metal-ironic.alerts b/system/kube-monitoring/charts/prometheus-frontend/metal-ironic.alerts index 47ad71d31d6..3083363250b 100644 --- a/system/kube-monitoring/charts/prometheus-frontend/metal-ironic.alerts +++ b/system/kube-monitoring/charts/prometheus-frontend/metal-ironic.alerts @@ -2,7 +2,7 @@ groups: - name: metal-ironic.alerts rules: - alert: MetalIronicSensorCritical - expr: count(ipmi_sensor_state{type=~"(Memory|Drive Slot|Processor|Critical Interrupt)", maintenance="false", provision_state=~"(deploy|active|available)"} == 2) by (instance, type, name, manufacturer, model, provision_state, server_id, project_id) + expr: count(ipmi_sensor_state{type=~"(Memory|Processor|Critical Interrupt)", maintenance="false", provision_state=~"(deploy|active|available)"} == 2) by (instance, type, name, manufacturer, model, provision_state, server_id, project_id) for: 15m labels: severity: critical @@ -16,7 +16,7 @@ groups: summary: "Hardware error for instance: {{ $labels.instance }}" - alert: MetalIronicSensorWarning - expr: count(ipmi_sensor_state{type=~"(Memory|Drive Slot|Processor|Critical Interrupt)", maintenance="false", provision_state=~"(manageable)"} == 2) by (instance, type, name, manufacturer, model, provision_state, server_id, project_id) + expr: count((ipmi_sensor_state{type=~"(Memory|Processor|Critical Interrupt)", maintenance="false", provision_state=~"(manageable)"} == 2) or (ipmi_sensor_state{type=~"(Drive Slot)", maintenance="false"} == 2)) by (instance, type, name, manufacturer, model, provision_state, server_id, project_id) for: 15m labels: severity: warning