diff --git a/class/defaults.yml b/class/defaults.yml index 19eaa7f9..af985f5f 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -133,6 +133,32 @@ parameters: - CephDeviceFailurePredicted - CephDeviceFailurePredictionTooHigh - CephDeviceFailureRelocationIncomplete + # Offline OSD host should be covered by cluster-level alerts + - CephOSDHostDown + # Clock skew should be covered by a cluster-level alert + - CephMonClockSkew + # Network packet drops/errors should be covered by cluster-level + # alerts; we don't need to have duplicates for Ceph nodes + - CephNodeNetworkPacketDrops + - CephNodeNetworkPacketErrors + # All the following alerts should be covered by CephHealthWarning + - CephOSDInternalDiskSizeMismatch + - CephFilesystemInsufficientStandby + - CephFilesystemMDSRanksLow + - CephOSDTooManyRepairs + - CephOSDBackfillFull + - CephOSDReadErrors + - CephOSDNearFull + - CephPGNotDeepScrubbed + - CephPGNotScrubbed + - CephPGsHighPerOSD + - CephPoolBackfillFull + - CephPoolNearFull + # The following alerts are not valuable enough to be active by + # default + - CephPGImbalance + - CephPoolGrowthWarning + - CephSlowOps # Alert rule patches. # Provide partial objects for alert rules that need to be tuned compared to diff --git a/component/alertrules.libsonnet b/component/alertrules.libsonnet index dcc10e19..3ce7bd2b 100644 --- a/component/alertrules.libsonnet +++ b/component/alertrules.libsonnet @@ -11,29 +11,6 @@ assert std.member(inv.applications, 'openshift4-monitoring') : 'Neither rancher-monitoring nor openshift4-monitoring is available'; -// Function to process an array which supports removing previously added -// elements by prefixing them with ~ -local render_array(arr) = - // extract real value of array entry - local realval(v) = std.lstripChars(v, '~'); - // Compute whether each element should be included by keeping track of - // whether its last occurrence in the input array was prefixed with ~ or - // not. - local val_state = std.foldl( - function(a, it) a + it, - [ - { [realval(v)]: !std.startsWith(v, '~') } - for v in arr - ], - {} - ); - // Return filtered array containing only elements whose last occurrence - // wasn't prefixed by ~. - std.filter( - function(val) val_state[val], - std.objectFields(val_state) - ); - // Keep only alerts from params.ceph_cluster.ignore_alerts for which the last // array entry wasn't prefixed with `~`. local user_ignore_alerts = @@ -46,7 +23,7 @@ local user_ignore_alerts = ) else []; - render_array( + com.renderArray( legacyIgnores + params.alerts.ignoreNames ); @@ -58,6 +35,16 @@ local ignore_alerts = std.set( // covered by `CephMgrIsAbsent`. 'CephMgrIsMissingReplicas', ] + + ( + // Drop CephOSDDownHigh for installations with < 10 nodes, since the alert + // fires if more than 10% of OSDs are down (i.e. 1 node/OSD for small + // clusters). The assumption here is that for clusters with >= 10 nodes, + // the likelihood of running >1 OSD per node is significant.
+ if params.ceph_cluster.node_count < 10 then + [ 'CephOSDDownHigh' ] + else + [] + ) + // Add set of upstream alerts that should be ignored from processed value of // `params.alerts.ignoreNames` user_ignore_alerts @@ -66,72 +53,37 @@ local ignore_alerts = std.set( local runbook(alertname) = 'https://hub.syn.tools/rook-ceph/runbooks/%s.html' % alertname; -/* FROM HERE: should be provided as library function by - * rancher-/openshift4-monitoring */ -// We shouldn't be expected to care how rancher-/openshift4-monitoring -// implement alert managmement and patching, instead we should be able to -// reuse their functionality as a black box to make sure our alerts work -// correctly in the environment into which we're deploying. - local on_openshift = inv.parameters.facts.distribution == 'openshift4'; - -local global_alert_params = +local alertpatching = if on_openshift then - inv.parameters.openshift4_monitoring.alerts + import 'lib/alert-patching.libsonnet' else - inv.parameters.rancher_monitoring.alerts; - -local filter_patch_rules(g) = - // combine our set of alerts to ignore with the monitoring component's - // set of ignoreNames. - local ignore_set = std.set(global_alert_params.ignoreNames + ignore_alerts); - g { - rules: std.map( - // Patch rules to make sure they match the requirements. - function(rule) - local rulepatch = com.makeMergeable( - com.getValueOrDefault( - params.alerts.patchRules, - rule.alert, - {} - ) - ); - local runbook_url = runbook(rule.alert); + local patchRule(rule) = + if !std.objectHas(rule, 'alert') then + rule + else rule { - // Change alert names so we don't get multiple alerts with the same - // name, as the rook-ceph operator deploys its own copy of these - // rules. alert: 'SYN_%s' % super.alert, - // add customAnnotations configured for all alerts on cluster - annotations+: global_alert_params.customAnnotations { - runbook_url: runbook_url, - }, labels+: { - // ensure the alerts are not silenced on OCP4 - // TODO: figure out how to ensure we don't get duplicate alerts on - // not-OCP4 syn: 'true', - // mark alert as belonging to rook-ceph - // can be used for inhibition rules - syn_component: 'rook-ceph', + syn_component: inv.parameters._instance, }, - } + rulepatch, - std.filter( - // Filter out unwanted rules - function(rule) - // only create duplicates of alert rules, we can use the recording - // rules which are deployed anyway when we enable monitoring on the - // CephCluster resource. 
- std.objectHas(rule, 'alert') && - // Drop rules which are in the ignore_set - !std.member(ignore_set, rule.alert), - super.rules - ), - ), - }; - -/* TO HERE */ + }; + std.trace( + 'Alert patching library not available on non-OCP4, alerts may be configured incorrectly', + { + patchRule: patchRule, + filterPatchRules(group, ignoreNames, patches): + group { + rules: [ + patchRule(r) + for r in super.rules + if !std.member(ignoreNames, r.alert) + ], + }, + } + ); local alert_rules_raw = helpers.load_manifest('prometheus-ceph-rules'); assert std.length(alert_rules_raw) >= 1; @@ -154,25 +106,36 @@ local additional_rules = [ { name: 'syn-rook-ceph-additional.alerts', rules: [ - { - alert: 'SYN_RookCephOperatorScaledDown', - expr: 'kube_deployment_spec_replicas{deployment="rook-ceph-operator", namespace="%s"} == 0' % params.namespace, - annotations: global_alert_params.customAnnotations { - summary: 'rook-ceph operator scaled to 0 for more than 1 hour.', - description: 'TODO', - runbook_url: runbook('RookCephOperatorScaledDown'), - }, - labels: { - severity: 'warning', - syn_component: 'rook-ceph', - syn: 'true', + alertpatching.patchRule( + { + alert: 'RookCephOperatorScaledDown', + expr: 'kube_deployment_spec_replicas{deployment="rook-ceph-operator", namespace="%s"} == 0' % params.namespace, + annotations: { + summary: 'rook-ceph operator scaled to 0 for more than 1 hour.', + description: 'TODO', + runbook_url: runbook('RookCephOperatorScaledDown'), + }, + labels: { + severity: 'warning', + }, + 'for': '1h', }, - 'for': '1h', - }, + ), ], }, ]; +local add_runbook_url = { + rules: [ + r { + annotations+: { + runbook_url: runbook(r.alert), + }, + } + for r in super.rules + ], +}; + local alert_rules = [ local gs = std.filter( function(it) !std.member(ignore_groups, it.name), @@ -187,7 +150,11 @@ local alert_rules = [ groups: std.filter( function(it) it != null, [ - local r = filter_patch_rules(g); + local r = alertpatching.filterPatchRules( + g + add_runbook_url, + ignore_alerts, + params.alerts.patchRules, + ); if std.length(r.rules) > 0 then r for g in gs ] diff --git a/tests/golden/defaults/rook-ceph/rook-ceph/40_alertrules.yaml b/tests/golden/defaults/rook-ceph/rook-ceph/40_alertrules.yaml index 9d9d4a97..f666b15f 100644 --- a/tests/golden/defaults/rook-ceph/rook-ceph/40_alertrules.yaml +++ b/tests/golden/defaults/rook-ceph/rook-ceph/40_alertrules.yaml @@ -85,63 +85,8 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephMonClockSkew - annotations: - description: | - Ceph monitors rely on closely synchronized time to maintain - quorum and cluster consistency. This event indicates that time on at least - one mon has drifted too far from the lead mon. - - Review cluster status with ceph -s. This will show which monitors - are affected. Check the time sync status on each monitor host with - "ceph time-sync-status" and the state and peers of your ntpd or chrony daemon. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephMonClockSkew.html - summary: Clock skew detected among monitors - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: osd rules: - - alert: SYN_CephOSDDownHigh - annotations: - description: | - {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . 
| first | value }}{{ end }} OSDs are down (>= 10%). - - The following OSDs are down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDDownHigh.html - summary: More than 10% of OSDs are down - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.4.1 - severity: critical - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephOSDHostDown - annotations: - description: | - The following OSDs are down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} - {{- end }} - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDHostDown.html - summary: An OSD host is offline - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 - for: 5m - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.4.8 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephOSDDown annotations: description: | @@ -162,24 +107,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephOSDNearFull - annotations: - description: | - One or more OSDs have reached the NEARFULL threshold - - Use 'ceph health detail' and 'ceph osd df' to identify the problem. - To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDNearFull.html - summary: OSD(s) running low on free space (NEARFULL) - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 - for: 5m - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.4.3 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephOSDFull annotations: description: | @@ -199,39 +126,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephOSDBackfillFull - annotations: - description: | - An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations - from completing. - Use 'ceph health detail' and 'ceph osd df' to identify the problem. - - To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDBackfillFull.html - summary: OSD(s) too full for backfill operations - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephOSDTooManyRepairs - annotations: - description: | - Reads from an OSD have used a secondary PG to return data to the client, indicating - a potential failing disk. 
- documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDTooManyRepairs.html - summary: OSD reports a high number of read errors - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1 - for: 30s - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephOSDTimeoutsPublicNetwork annotations: description: | @@ -260,21 +154,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephOSDInternalDiskSizeMismatch - annotations: - description: | - One or more OSDs have an internal inconsistency between metadata and the size of the device. - This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDInternalDiskSizeMismatch.html - summary: OSD size inconsistency error - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephOSDFlapping annotations: description: | @@ -293,39 +172,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephOSDReadErrors - annotations: - description: | - An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDReadErrors.html - summary: Device read errors detected - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 - for: 30s - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephPGImbalance - annotations: - description: | - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPGImbalance.html - summary: PGs are not balanced across OSDs - expr: | - abs( - ( - (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 - for: 5m - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.4.5 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: mds rules: - alert: SYN_CephFilesystemDamaged @@ -373,34 +219,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephFilesystemMDSRanksLow - annotations: - description: | - The filesystem's "max_mds" setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value. 
- documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephFilesystemMDSRanksLow.html - summary: MDS daemon count is lower than configured - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephFilesystemInsufficientStandby - annotations: - description: | - The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons. - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephFilesystemInsufficientStandby.html - summary: Ceph filesystem standby daemons too few - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephFilesystemFailureNoStandby annotations: description: | @@ -558,66 +376,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephPGNotScrubbed - annotations: - description: | - One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, - protecting against bit-rot. They check that metadata - is consistent across data replicas. When PGs miss their scrub interval, it may - indicate that the scrub window is too small, or PGs were not in a 'clean' state during the - scrub window. - - You can manually initiate a scrub with: ceph pg scrub - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPGNotScrubbed.html - summary: Placement group(s) have not been scrubbed - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephPGsHighPerOSD - annotations: - description: | - The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting). - - Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', - and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide - the autoscaler based on the expected relative size of the pool - ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler - mode to "warn" and adjust pg_num appropriately for one or more pools. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPGsHighPerOSD.html - summary: Placement groups per OSD is too high - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephPGNotDeepScrubbed - annotations: - description: | - One or more PGs have not been deep scrubbed recently. Deep scrubs - protect against bit-rot. They compare data - replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate - that the window is too small or PGs were not in a 'clean' state during the deep-scrub - window. 
- - You can manually initiate a deep scrub with: ceph pg deep-scrub - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPGNotDeepScrubbed.html - summary: Placement group(s) have not been deep scrubbed - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: nodes rules: - alert: SYN_CephNodeRootFilesystemFull @@ -635,81 +393,8 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephNodeNetworkPacketDrops - annotations: - description: | - Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephNodeNetworkPacketDrops.html - summary: One or more NICs reports packet drops - expr: | - ( - increase(node_network_receive_drop_total{device!="lo"}[1m]) + - increase(node_network_transmit_drop_total{device!="lo"}[1m]) - ) / ( - increase(node_network_receive_packets_total{device!="lo"}[1m]) + - increase(node_network_transmit_packets_total{device!="lo"}[1m]) - ) >= 0.0001 or ( - increase(node_network_receive_drop_total{device!="lo"}[1m]) + - increase(node_network_transmit_drop_total{device!="lo"}[1m]) - ) >= 10 - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.8.2 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephNodeNetworkPacketErrors - annotations: - description: | - Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephNodeNetworkPacketErrors.html - summary: One or more NICs reports packet errors - expr: | - ( - increase(node_network_receive_errs_total{device!="lo"}[1m]) + - increase(node_network_transmit_errs_total{device!="lo"}[1m]) - ) / ( - increase(node_network_receive_packets_total{device!="lo"}[1m]) + - increase(node_network_transmit_packets_total{device!="lo"}[1m]) - ) >= 0.0001 or ( - increase(node_network_receive_errs_total{device!="lo"}[1m]) + - increase(node_network_transmit_errs_total{device!="lo"}[1m]) - ) >= 10 - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.8.3 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: pools rules: - - alert: SYN_CephPoolGrowthWarning - annotations: - description: | - Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPoolGrowthWarning.html - summary: Pool growth rate may soon exceed capacity - expr: | - (predict_linear((max(ceph_pool_percent_used) without (pod, instance))[2d:1h], 3600 * 24 * 5) * on(pool_id) - group_right ceph_pool_metadata) >= 95 - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.9.2 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephPoolBackfillFull - annotations: - description: | - A pool is approaching the near full threshold, which will prevent recovery/backfill from completing. Consider adding more capacity. 
- runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPoolBackfillFull.html - summary: Free space in a pool is too low for recovery/backfill - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0 - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephPoolFull annotations: description: | @@ -733,44 +418,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephPoolNearFull - annotations: - description: | - A pool has exceeded the warning (percent full) threshold, or OSDs - supporting the pool have reached the NEARFULL threshold. Writes may - continue, but you are at risk of the pool going read-only if more capacity - isn't made available. - - Determine the affected pool with 'ceph df detail', looking - at QUOTA BYTES and STORED. Increase the pool's quota, or add - capacity to the cluster then increase the pool's quota - (e.g. ceph osd pool set quota max_bytes ). - Also ensure that the balancer is active. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPoolNearFull.html - summary: One or more Ceph pools are nearly full - expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - name: healthchecks - rules: - - alert: SYN_CephSlowOps - annotations: - description: | - {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded) - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephSlowOps.html - summary: OSD operations are slow to complete - expr: ceph_healthcheck_slow_ops > 0 - for: 30s - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: rados rules: - alert: SYN_CephObjectMissing diff --git a/tests/golden/openshift4/rook-ceph/rook-ceph/40_alertrules.yaml b/tests/golden/openshift4/rook-ceph/rook-ceph/40_alertrules.yaml index 9d9d4a97..f666b15f 100644 --- a/tests/golden/openshift4/rook-ceph/rook-ceph/40_alertrules.yaml +++ b/tests/golden/openshift4/rook-ceph/rook-ceph/40_alertrules.yaml @@ -85,63 +85,8 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephMonClockSkew - annotations: - description: | - Ceph monitors rely on closely synchronized time to maintain - quorum and cluster consistency. This event indicates that time on at least - one mon has drifted too far from the lead mon. - - Review cluster status with ceph -s. This will show which monitors - are affected. Check the time sync status on each monitor host with - "ceph time-sync-status" and the state and peers of your ntpd or chrony daemon. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephMonClockSkew.html - summary: Clock skew detected among monitors - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: osd rules: - - alert: SYN_CephOSDDownHigh - annotations: - description: | - {{ $value | humanize }}% or {{ with query "count(ceph_osd_up == 0)" }}{{ . | first | value }}{{ end }} of {{ with query "count(ceph_osd_up)" }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). 
- - The following OSDs are down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} - {{- end }} - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDDownHigh.html - summary: More than 10% of OSDs are down - expr: count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10 - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.4.1 - severity: critical - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephOSDHostDown - annotations: - description: | - The following OSDs are down: - {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} - {{- end }} - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDHostDown.html - summary: An OSD host is offline - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 - for: 5m - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.4.8 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephOSDDown annotations: description: | @@ -162,24 +107,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephOSDNearFull - annotations: - description: | - One or more OSDs have reached the NEARFULL threshold - - Use 'ceph health detail' and 'ceph osd df' to identify the problem. - To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDNearFull.html - summary: OSD(s) running low on free space (NEARFULL) - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 - for: 5m - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.4.3 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephOSDFull annotations: description: | @@ -199,39 +126,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephOSDBackfillFull - annotations: - description: | - An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations - from completing. - Use 'ceph health detail' and 'ceph osd df' to identify the problem. - - To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDBackfillFull.html - summary: OSD(s) too full for backfill operations - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephOSDTooManyRepairs - annotations: - description: | - Reads from an OSD have used a secondary PG to return data to the client, indicating - a potential failing disk. 
- documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDTooManyRepairs.html - summary: OSD reports a high number of read errors - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1 - for: 30s - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephOSDTimeoutsPublicNetwork annotations: description: | @@ -260,21 +154,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephOSDInternalDiskSizeMismatch - annotations: - description: | - One or more OSDs have an internal inconsistency between metadata and the size of the device. - This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDInternalDiskSizeMismatch.html - summary: OSD size inconsistency error - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephOSDFlapping annotations: description: | @@ -293,39 +172,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephOSDReadErrors - annotations: - description: | - An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephOSDReadErrors.html - summary: Device read errors detected - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 - for: 30s - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephPGImbalance - annotations: - description: | - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPGImbalance.html - summary: PGs are not balanced across OSDs - expr: | - abs( - ( - (ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) / on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 - for: 5m - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.4.5 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: mds rules: - alert: SYN_CephFilesystemDamaged @@ -373,34 +219,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephFilesystemMDSRanksLow - annotations: - description: | - The filesystem's "max_mds" setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value. 
- documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephFilesystemMDSRanksLow.html - summary: MDS daemon count is lower than configured - expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephFilesystemInsufficientStandby - annotations: - description: | - The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons. - documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephFilesystemInsufficientStandby.html - summary: Ceph filesystem standby daemons too few - expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephFilesystemFailureNoStandby annotations: description: | @@ -558,66 +376,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephPGNotScrubbed - annotations: - description: | - One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, - protecting against bit-rot. They check that metadata - is consistent across data replicas. When PGs miss their scrub interval, it may - indicate that the scrub window is too small, or PGs were not in a 'clean' state during the - scrub window. - - You can manually initiate a scrub with: ceph pg scrub - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPGNotScrubbed.html - summary: Placement group(s) have not been scrubbed - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephPGsHighPerOSD - annotations: - description: | - The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting). - - Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', - and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide - the autoscaler based on the expected relative size of the pool - ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler - mode to "warn" and adjust pg_num appropriately for one or more pools. - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPGsHighPerOSD.html - summary: Placement groups per OSD is too high - expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1 - for: 1m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephPGNotDeepScrubbed - annotations: - description: | - One or more PGs have not been deep scrubbed recently. Deep scrubs - protect against bit-rot. They compare data - replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate - that the window is too small or PGs were not in a 'clean' state during the deep-scrub - window. 
- - You can manually initiate a deep scrub with: ceph pg deep-scrub - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPGNotDeepScrubbed.html - summary: Placement group(s) have not been deep scrubbed - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: nodes rules: - alert: SYN_CephNodeRootFilesystemFull @@ -635,81 +393,8 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephNodeNetworkPacketDrops - annotations: - description: | - Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephNodeNetworkPacketDrops.html - summary: One or more NICs reports packet drops - expr: | - ( - increase(node_network_receive_drop_total{device!="lo"}[1m]) + - increase(node_network_transmit_drop_total{device!="lo"}[1m]) - ) / ( - increase(node_network_receive_packets_total{device!="lo"}[1m]) + - increase(node_network_transmit_packets_total{device!="lo"}[1m]) - ) >= 0.0001 or ( - increase(node_network_receive_drop_total{device!="lo"}[1m]) + - increase(node_network_transmit_drop_total{device!="lo"}[1m]) - ) >= 10 - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.8.2 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephNodeNetworkPacketErrors - annotations: - description: | - Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephNodeNetworkPacketErrors.html - summary: One or more NICs reports packet errors - expr: | - ( - increase(node_network_receive_errs_total{device!="lo"}[1m]) + - increase(node_network_transmit_errs_total{device!="lo"}[1m]) - ) / ( - increase(node_network_receive_packets_total{device!="lo"}[1m]) + - increase(node_network_transmit_packets_total{device!="lo"}[1m]) - ) >= 0.0001 or ( - increase(node_network_receive_errs_total{device!="lo"}[1m]) + - increase(node_network_transmit_errs_total{device!="lo"}[1m]) - ) >= 10 - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.8.3 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: pools rules: - - alert: SYN_CephPoolGrowthWarning - annotations: - description: | - Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPoolGrowthWarning.html - summary: Pool growth rate may soon exceed capacity - expr: | - (predict_linear((max(ceph_pool_percent_used) without (pod, instance))[2d:1h], 3600 * 24 * 5) * on(pool_id) - group_right ceph_pool_metadata) >= 95 - labels: - oid: 1.3.6.1.4.1.50495.1.2.1.9.2 - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - alert: SYN_CephPoolBackfillFull - annotations: - description: | - A pool is approaching the near full threshold, which will prevent recovery/backfill from completing. Consider adding more capacity. 
- runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPoolBackfillFull.html - summary: Free space in a pool is too low for recovery/backfill - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0 - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - alert: SYN_CephPoolFull annotations: description: | @@ -733,44 +418,6 @@ spec: syn: 'true' syn_component: rook-ceph type: ceph_default - - alert: SYN_CephPoolNearFull - annotations: - description: | - A pool has exceeded the warning (percent full) threshold, or OSDs - supporting the pool have reached the NEARFULL threshold. Writes may - continue, but you are at risk of the pool going read-only if more capacity - isn't made available. - - Determine the affected pool with 'ceph df detail', looking - at QUOTA BYTES and STORED. Increase the pool's quota, or add - capacity to the cluster then increase the pool's quota - (e.g. ceph osd pool set quota max_bytes ). - Also ensure that the balancer is active. - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephPoolNearFull.html - summary: One or more Ceph pools are nearly full - expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0 - for: 5m - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - - name: healthchecks - rules: - - alert: SYN_CephSlowOps - annotations: - description: | - {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded) - documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops - runbook_url: https://hub.syn.tools/rook-ceph/runbooks/CephSlowOps.html - summary: OSD operations are slow to complete - expr: ceph_healthcheck_slow_ops > 0 - for: 30s - labels: - severity: warning - syn: 'true' - syn_component: rook-ceph - type: ceph_default - name: rados rules: - alert: SYN_CephObjectMissing diff --git a/tests/openshift4.yml b/tests/openshift4.yml index 3523835a..2a9b1624 100644 --- a/tests/openshift4.yml +++ b/tests/openshift4.yml @@ -9,14 +9,12 @@ parameters: - type: https source: https://raw.githubusercontent.com/projectsyn/component-storageclass/v1.0.0/lib/storageclass.libsonnet output_path: vendor/lib/storageclass.libsonnet + - type: https + source: https://raw.githubusercontent.com/appuio/component-openshift4-monitoring/v3.1.0/lib/openshift4-monitoring-alert-patching.libsonnet + output_path: vendor/lib/alert-patching.libsonnet storageclass: defaults: {} defaultClass: "" rook_ceph: {} - - openshift4_monitoring: - alerts: - ignoreNames: [] - customAnnotations: {}