Merge pull request #125 from projectsyn/feat/streamline-alerts
Disable low-value alerts by default
simu authored Jun 23, 2023
2 parents 8b3d018 + 2a55ad9 commit 499b559
Showing 5 changed files with 93 additions and 807 deletions.
27 changes: 27 additions & 0 deletions class/defaults.yml
@@ -133,6 +133,33 @@ parameters:
- CephDeviceFailurePredicted
- CephDeviceFailurePredictionTooHigh
- CephDeviceFailureRelocationIncomplete
# Offline OSD host should be covered by cluster-level alerts
- CephOSDHostDown
# Clock skew should be covered by a cluster-level alert
- CephMonClockSkew
# Network packet drops/errors should be covered by cluster-level
# alerts; we don't need duplicates for Ceph nodes
- CephNodeNetworkPacketDrops
- CephNodeNetworkPacketErrors
# All the following alerts should be covered by CephHealthWarning
- CephOSDInternalDiskSizeMismatch
- CephOSDInternalDiskSizeMismatch
- CephFilesystemInsufficientStandby
- CephFilesystemMDSRanksLow
- CephOSDTooManyRepairs
- CephOSDBackfillFull
- CephOSDReadErrors
- CephOSDNearFull
- CephPGNotDeepScrubbed
- CephPGNotScrubbed
- CephPGsHighPerOSD
- CephPoolBackfillFull
- CephPoolNearFull
# The following alerts aren't valuable enough to keep active by
# default
- CephPGImbalance
- CephPoolGrowthWarning
- CephSlowOps
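
Since `ignoreNames` is rendered with `com.renderArray()`, downstream configurations can re-enable any of these alerts by listing the same name prefixed with `~`. A minimal sketch of such an override in a cluster's hierarchy, assuming the component's parameters key is `rook_ceph`:

parameters:
  rook_ceph:
    alerts:
      ignoreNames:
        # re-enable slow-ops alerting on this cluster
        - ~CephSlowOps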

# Alert rule patches.
# Provide partial objects for alert rules that need to be tuned compared to
159 changes: 63 additions & 96 deletions component/alertrules.libsonnet
@@ -11,29 +11,6 @@ assert
std.member(inv.applications, 'openshift4-monitoring')
: 'Neither rancher-monitoring nor openshift4-monitoring is available';

// Function to process an array which supports removing previously added
// elements by prefixing them with ~
local render_array(arr) =
// extract real value of array entry
local realval(v) = std.lstripChars(v, '~');
// Compute whether each element should be included by keeping track of
// whether its last occurrence in the input array was prefixed with ~ or
// not.
local val_state = std.foldl(
function(a, it) a + it,
[
{ [realval(v)]: !std.startsWith(v, '~') }
for v in arr
],
{}
);
// Return filtered array containing only elements whose last occurrence
// wasn't prefixed by ~.
std.filter(
function(val) val_state[val],
std.objectFields(val_state)
);
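
For illustration, a minimal sketch of the `~`-prefix semantics this helper implements (its replacement, `com.renderArray()`, behaves the same way); the alert names are hypothetical:

// render_array([ 'CephSlowOps', 'CephPGImbalance', '~CephSlowOps' ])
// evaluates to [ 'CephPGImbalance' ]: the last occurrence of 'CephSlowOps'
// is prefixed with '~', so it's dropped. Note that std.objectFields()
// returns fields in sorted order, so the result is ordered alphabetically,
// not by insertion.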

// Keep only alerts from params.ceph_cluster.ignore_alerts for which the last
// array entry wasn't prefixed with `~`.
local user_ignore_alerts =
@@ -46,7 +23,7 @@ local user_ignore_alerts
)
else
[];
render_array(
com.renderArray(
legacyIgnores + params.alerts.ignoreNames
);

@@ -58,6 +35,16 @@ local ignore_alerts = std.set(
// covered by `CephMgrIsAbsent`.
'CephMgrIsMissingReplicas',
] +
(
// Drop CephOSDDownHigh for installations with < 10 nodes, since the alert
// fires if more than 10% of OSDs are down (i.e. 1 node/OSD for small
// clusters). The assumption here is that for clusters with >= 10 nodes,
// the likelihood of running >1 OSD per node is significant.
if params.ceph_cluster.node_count < 10 then
[ 'CephOSDDownHigh' ]
else
[]
) +
// Add set of upstream alerts that should be ignored from processed value of
// `params.alerts.ignoreNames`
user_ignore_alerts
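
To make the 10% threshold concrete: on a hypothetical 9-node cluster with one OSD per node, a single down OSD already means 1/9 ≈ 11.1% of OSDs are down, so `CephOSDDownHigh` would fire for an outage a single-OSD alert already covers. With 10 or more OSDs, one down OSD is at most 10% and stays below the threshold.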
@@ -66,72 +53,37 @@ local ignore_alerts = std.set(
local runbook(alertname) =
'https://hub.syn.tools/rook-ceph/runbooks/%s.html' % alertname;
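
For example, the helper simply interpolates the alert name into the component's runbook base URL:

// runbook('CephMgrIsAbsent') evaluates to
// 'https://hub.syn.tools/rook-ceph/runbooks/CephMgrIsAbsent.html'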

/* FROM HERE: should be provided as library function by
* rancher-/openshift4-monitoring */
// We shouldn't be expected to care how rancher-/openshift4-monitoring
// implement alert management and patching; instead we should be able to
// reuse their functionality as a black box to make sure our alerts work
// correctly in the environment into which we're deploying.

local on_openshift =
inv.parameters.facts.distribution == 'openshift4';

local global_alert_params =
local alertpatching =
if on_openshift then
inv.parameters.openshift4_monitoring.alerts
import 'lib/alert-patching.libsonnet'
else
inv.parameters.rancher_monitoring.alerts;

local filter_patch_rules(g) =
// combine our set of alerts to ignore with the monitoring component's
// set of ignoreNames.
local ignore_set = std.set(global_alert_params.ignoreNames + ignore_alerts);
g {
rules: std.map(
// Patch rules to make sure they match the requirements.
function(rule)
local rulepatch = com.makeMergeable(
com.getValueOrDefault(
params.alerts.patchRules,
rule.alert,
{}
)
);
local runbook_url = runbook(rule.alert);
local patchRule(rule) =
if !std.objectHas(rule, 'alert') then
rule
else
rule {
// Change alert names so we don't get multiple alerts with the same
// name, as the rook-ceph operator deploys its own copy of these
// rules.
alert: 'SYN_%s' % super.alert,
// add customAnnotations configured for all alerts on cluster
annotations+: global_alert_params.customAnnotations {
runbook_url: runbook_url,
},
labels+: {
// ensure the alerts are not silenced on OCP4
// TODO: figure out how to ensure we don't get duplicate alerts on
// not-OCP4
syn: 'true',
// mark alert as belonging to rook-ceph
// can be used for inhibition rules
syn_component: 'rook-ceph',
syn_component: inv.parameters._instance,
},
} + rulepatch,
std.filter(
// Filter out unwanted rules
function(rule)
// only create duplicates of alert rules; we can use the recording
// rules which are deployed anyway when we enable monitoring on the
// CephCluster resource.
std.objectHas(rule, 'alert') &&
// Drop rules which are in the ignore_set
!std.member(ignore_set, rule.alert),
super.rules
),
),
};

/* TO HERE */
};
std.trace(
'Alert patching library not available on non-OCP4, alerts may be configured incorrectly',
{
patchRule: patchRule,
filterPatchRules(group, ignoreNames, patches):
group {
rules: [
patchRule(r)
for r in super.rules
if !std.member(ignoreNames, r.alert)
],
},
}
);
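
To illustrate the patching, here is the shape of the result for a hypothetical upstream rule (custom annotations omitted; the exact output depends on the patching implementation in use):

// patchRule({
//   alert: 'CephMgrIsAbsent',
//   expr: 'up{job="ceph-mgr"} == 0',  // hypothetical expression
//   labels: { severity: 'critical' },
// })
// returns roughly
// {
//   alert: 'SYN_CephMgrIsAbsent',
//   expr: 'up{job="ceph-mgr"} == 0',
//   annotations: {
//     runbook_url: 'https://hub.syn.tools/rook-ceph/runbooks/CephMgrIsAbsent.html',
//   },
//   labels: { severity: 'critical', syn: 'true', syn_component: 'rook-ceph' },
// }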

local alert_rules_raw = helpers.load_manifest('prometheus-ceph-rules');
assert std.length(alert_rules_raw) >= 1;
@@ -154,25 +106,36 @@ local additional_rules = [
{
name: 'syn-rook-ceph-additional.alerts',
rules: [
{
alert: 'SYN_RookCephOperatorScaledDown',
expr: 'kube_deployment_spec_replicas{deployment="rook-ceph-operator", namespace="%s"} == 0' % params.namespace,
annotations: global_alert_params.customAnnotations {
summary: 'rook-ceph operator scaled to 0 for more than 1 hour.',
description: 'TODO',
runbook_url: runbook('RookCephOperatorScaledDown'),
},
labels: {
severity: 'warning',
syn_component: 'rook-ceph',
syn: 'true',
alertpatching.patchRule(
{
alert: 'RookCephOperatorScaledDown',
expr: 'kube_deployment_spec_replicas{deployment="rook-ceph-operator", namespace="%s"} == 0' % params.namespace,
annotations: {
summary: 'rook-ceph operator scaled to 0 for more than 1 hour.',
description: 'TODO',
runbook_url: runbook('RookCephOperatorScaledDown'),
},
labels: {
severity: 'warning',
},
'for': '1h',
},
'for': '1h',
},
),
],
},
];
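
If this alert needs tuning on a specific cluster, `params.alerts.patchRules` can supply a partial object that's merged into the rule. A hypothetical hierarchy snippet, assuming `patchRules` is keyed by the unprefixed alert name as in the previous implementation:

parameters:
  rook_ceph:
    alerts:
      patchRules:
        RookCephOperatorScaledDown:
          # tolerate a scaled-down operator for longer before alerting
          for: 2h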

local add_runbook_url = {
rules: [
r {
annotations+: {
runbook_url: runbook(r.alert),
},
}
for r in super.rules
],
};

local alert_rules = [
local gs = std.filter(
function(it) !std.member(ignore_groups, it.name),
@@ -187,7 +150,11 @@ local alert_rules = [
groups: std.filter(
function(it) it != null,
[
local r = filter_patch_rules(g);
local r = alertpatching.filterPatchRules(
g + add_runbook_url,
ignore_alerts,
params.alerts.patchRules,
);
if std.length(r.rules) > 0 then r
for g in gs
]