From 5e1c10d45f8309299c8fda229372b17d4f6e85dc Mon Sep 17 00:00:00 2001 From: Philip Gough Date: Thu, 16 Jan 2025 15:51:22 +0000 Subject: [PATCH 1/3] jb:update alertmanager --- jsonnetfile.lock.json | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index b58d384d8b..745bd38b82 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -51,6 +51,26 @@ "version": "6db00c292d3a1c71661fc875f90e0ec7caa538c2", "sum": "gCtR9s/4D5fxU9aKXg0Bru+/njZhA0YjLjPiASc61FM=" }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" + } + }, + "version": "d20e609202733790caf5b554c9945d049f243ae3", + "sum": "V9vAj21qJOc2DlMPDgB1eEjSQU4A+sAA4AXuJ6bd4xc=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-v11.4.0" + } + }, + "version": "d20e609202733790caf5b554c9945d049f243ae3", + "sum": "aVAX09paQYNOoCSKVpuk1exVIyBoMt/C50QJI+Q/3nA=" + }, { "source": { "git": { @@ -171,6 +191,16 @@ "version": "9e5b48eee32913938d3cac30f183b49ecd9fe13a", "sum": "7pl3HQqiKg4zJ0dWFqMo9yMGDEvlVdxgPGr1rMm0/LE=" }, + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/xtd.git", + "subdir": "" + } + }, + "version": "1199b50e9d2ff53d4bb5fb2304ad1fb69d38e609", + "sum": "LfbgcJbilu4uBdKYZSvmkoOTPwEAzg10L3/VqKAIWtA=" + }, { "source": { "git": { @@ -344,8 +374,8 @@ "subdir": "doc/alertmanager-mixin" } }, - "version": "f958b8be84b870e363f7dafcbeb807b463269a75", - "sum": "f3iZDUXQ/YWB5yDCY7VLD5bs442+3CdJgXJhJyWhNf8=" + "version": "b4a2179d70ae0b95cd180e1cfb4e811b7b2de822", + "sum": "Mf4h1BYLle2nrgjf/HXrBbl0Zk8N+xaoEM017o0BC+k=" }, { "source": { From 9782543c41ff4e9eacd750b81730ca244a9fd04e Mon Sep 17 00:00:00 2001 From: Philip Gough Date: Mon, 20 Jan 2025 10:36:30 +0000 Subject: [PATCH 2/3] generate alertmanager dashboard --- lib/k.libsonnet | 2 +- ...orium-alertmanager-overview.configmap.yaml | 740 ++++++------------ 2 files changed, 253 insertions(+), 489 deletions(-) diff --git a/lib/k.libsonnet b/lib/k.libsonnet index ba846036c2..10db782ff8 100644 --- a/lib/k.libsonnet +++ b/lib/k.libsonnet @@ -1 +1 @@ -(import 'github.com/jsonnet-libs/k8s-libsonnet/1.26/main.libsonnet') +(import 'github.com/jsonnet-libs/k8s-libsonnet/1.32/main.libsonnet') diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml index 4bdfc0ed0f..e78c69c890 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml @@ -2,454 +2,270 @@ apiVersion: v1 data: alertmanager-overview.json: |- { - "__inputs": [ - - ], - "__requires": [ - - ], - "annotations": { - "list": [ - - ] - }, - "editable": false, - "gnetId": null, "graphTooltip": 1, - "hideControls": false, - "id": null, - "links": [ - - ], - "refresh": "30s", - "rows": [ + "panels": [ { - "collapse": false, "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, "panels": [ - { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 2, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(alertmanager_alerts{namespace=~\"$namespace\",job=~\"$job\"}) by (namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}}", - "refId": "A" + ], + "title": "Alerts", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "current set of alerts stored in the Alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never", + "stacking": { + "mode": "normal" } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Alerts", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "unit": "none" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "showLegend": false }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ { - "aliasColors": { - - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 3, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sideWidth": null, - "total": false, - "values": false + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ - - ], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(alertmanager_alerts_received_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Received", - "refId": "A" - }, - { - "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Invalid", - "refId": "B" + "expr": "sum(alertmanager_alerts{namespace=~\"$namespace\",job=~\"$job\"}) by (namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}}" + } + ], + "title": "Alerts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "rate of successful and invalid alerts received by the Alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never", + "stacking": { + "mode": "normal" } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "Alerts receive rate", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "unit": "ops" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "showLegend": false + }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "expr": "sum(rate(alertmanager_alerts_received_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}} Received" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(rate(alertmanager_alerts_invalid_total{namespace=~\"$namespace\",job=~\"$job\"}[$__rate_interval])) by (namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}} Invalid" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Alerts", - "titleSize": "h6", - "type": "row" + "title": "Alerts receive rate", + "type": "timeseries" }, { - "collapse": false, "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 4, "panels": [ - { - "aliasColors": { - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - - }, - "id": 4, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sideWidth": null, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "integration", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(alertmanager_notifications_total{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Total", - "refId": "A" - }, - { - "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,job,pod)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Failed", - "refId": "B" + ], + "title": "Notifications", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "rate of successful and invalid notifications sent by the Alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never", + "stacking": { + "mode": "normal" } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "$integration: Notifications Send Rate", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "unit": "ops" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 5, + "options": { + "legend": { + "showLegend": false + }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.4.0", + "repeat": "integration", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "expr": "sum(rate(alertmanager_notifications_total{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}} Total" }, { - "aliasColors": { - + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, - "gridPos": { - + "expr": "sum(rate(alertmanager_notifications_failed_total{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (integration,namespace,job,pod)", + "intervalFactor": 2, + "legendFormat": "{{pod}} Failed" + } + ], + "title": "$integration: Notifications Send Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "latency of notifications sent by the Alertmanager", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10, + "showPoints": "never", + "stacking": { + "mode": "normal" + } }, - "id": 5, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": false, - "sideWidth": null, - "total": false, - "values": false + "unit": "s" + } + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 6, + "options": { + "legend": { + "showLegend": false + }, + "tooltip": { + "mode": "multi" + } + }, + "pluginVersion": "v11.4.0", + "repeat": "integration", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "lines": true, - "linewidth": 1, - "links": [ - - ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "integration", - "seriesOverrides": [ - - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,job,pod)\n) \n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} 99th Percentile", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,job,pod)\n) \n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Median", - "refId": "B" - }, - { - "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (namespace,job,pod)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (namespace,job,pod)\n", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{pod}} Average", - "refId": "C" - } - ], - "thresholds": [ - - ], - "timeFrom": null, - "timeShift": null, - "title": "$integration: Notification Duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + "expr": "histogram_quantile(0.99,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,job,pod)\n)\n", + "intervalFactor": 2, + "legendFormat": "{{pod}} 99th Percentile" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - - ] + "expr": "histogram_quantile(0.50,\n sum(rate(alertmanager_notification_latency_seconds_bucket{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (le,namespace,job,pod)\n)\n", + "intervalFactor": 2, + "legendFormat": "{{pod}} Median" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "expr": "sum(rate(alertmanager_notification_latency_seconds_sum{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (namespace,job,pod)\n/\nsum(rate(alertmanager_notification_latency_seconds_count{namespace=~\"$namespace\",job=~\"$job\", integration=\"$integration\"}[$__rate_interval])) by (namespace,job,pod)\n", + "intervalFactor": 2, + "legendFormat": "{{pod}} Average" } ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Notifications", - "titleSize": "h6", - "type": "row" + "title": "$integration: Notification Duration", + "type": "timeseries" } ], - "schemaVersion": 14, - "style": "dark", + "schemaVersion": 39, "tags": [ "alertmanager-mixin", "observatorium" @@ -458,100 +274,69 @@ data: "list": [ { "current": { + "selected": false, "text": "Prometheus", "value": "Prometheus" }, "hide": 0, "label": "Data Source", "name": "datasource", - "options": [ - - ], "query": "prometheus", - "refresh": 1, - "regex": "", "type": "datasource" }, { - "allValue": null, "current": { + "selected": false, "text": "", "value": "" }, - "datasource": "$datasource", - "hide": 0, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "includeAll": false, "label": "namespace", - "multi": false, "name": "namespace", - "options": [ - - ], "query": "label_values(alertmanager_alerts, namespace)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { - "allValue": null, "current": { + "selected": false, "text": "", "value": "" }, - "datasource": "$datasource", - "hide": 0, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "includeAll": false, "label": "job", - "multi": false, "name": "job", - "options": [ - - ], "query": "label_values(alertmanager_alerts, job)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" }, { - "allValue": null, "current": { - "text": "all", + "selected": false, + "text": "$__all", "value": "$__all" }, - "datasource": "$datasource", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, "hide": 2, "includeAll": true, - "label": null, - "multi": false, "name": "integration", - "options": [ - - ], "query": "label_values(alertmanager_notifications_total{integration=~\"slack|pagerduty|email|webhook\"}, integration)", "refresh": 2, - "regex": "", "sort": 1, - "tagValuesQuery": "", - "tags": [ - - ], - "tagsQuery": "", - "type": "query", - "useTags": false + "type": "query" } ] }, @@ -561,33 +346,12 @@ data: }, "timepicker": { "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" + "30s" ] }, "timezone": "utc", "title": "Alertmanager / Overview", - "uid": "alertmanager-overview", - "version": 0 + "uid": "alertmanager-overview" } kind: ConfigMap metadata: From 03812a4ea0488edf20fcff680bf934910a77c8cd Mon Sep 17 00:00:00 2001 From: Philip Gough Date: Mon, 20 Jan 2025 11:32:19 +0000 Subject: [PATCH 3/3] jsonnet: Bump k8s lib --- jsonnetfile.json | 9 +++++++++ jsonnetfile.lock.json | 10 ---------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/jsonnetfile.json b/jsonnetfile.json index b12b3aef0d..68287119c1 100644 --- a/jsonnetfile.json +++ b/jsonnetfile.json @@ -1,6 +1,15 @@ { "version": 1, "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/jsonnet-libs/k8s-libsonnet.git", + "subdir": "1.32" + } + }, + "version": "main" + }, { "source": { "git": { diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 745bd38b82..faa6c7151a 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -181,16 +181,6 @@ "version": "2eae33a828320269c42acf38e808479a33e416db", "sum": "lppHbNARpG3YTpuSv94X9TyIE9TfV3CyTVceIHSRxpc=" }, - { - "source": { - "git": { - "remote": "https://github.com/jsonnet-libs/k8s-libsonnet.git", - "subdir": "1.26" - } - }, - "version": "9e5b48eee32913938d3cac30f183b49ecd9fe13a", - "sum": "7pl3HQqiKg4zJ0dWFqMo9yMGDEvlVdxgPGr1rMm0/LE=" - }, { "source": { "git": {