From c79f7e0af067c3c5d47d932012ce41587d4879f9 Mon Sep 17 00:00:00 2001
From: Itay Grudev
Date: Thu, 28 Mar 2024 00:49:32 +0200
Subject: [PATCH] Added additional PrometheusRule annotations

Signed-off-by: Itay Grudev
---
Review notes (placed after the `---` separator, so not part of the commit
message):
- This patch adds `namespace` and `cnpg_cluster` labels to every alert rule,
  moves the namespace prefix from the `cluster` template value into the
  annotation descriptions, and skips rule files that render to nothing.
- Line breaks and in-hunk indentation were reconstructed from the hunk
  headers; verify against the target tree before applying.
- The added label values are double-quoted so that a namespace or cluster
  name that looks like a number or boolean still renders as a YAML string.
  Because of that edit, the post-image blob ids on the `index` lines of the
  rule files no longer match the content.

 charts/cluster/prometheus_rules/cluster-ha-critical.yaml | 2 ++
 charts/cluster/prometheus_rules/cluster-ha-warning.yaml | 2 ++
 .../prometheus_rules/cluster-high_connection-critical.yaml | 4 +++-
 .../prometheus_rules/cluster-high_connection-warning.yaml | 4 +++-
 .../prometheus_rules/cluster-high_replication_lag.yaml | 4 +++-
 .../prometheus_rules/cluster-instances_on_same_node.yaml | 4 +++-
 .../prometheus_rules/cluster-low_disk_space-critical.yaml | 4 +++-
 .../prometheus_rules/cluster-low_disk_space-warning.yaml | 4 +++-
 charts/cluster/prometheus_rules/cluster-offline.yaml | 2 ++
 .../prometheus_rules/cluster-zone_spread-warning.yaml | 4 +++-
 charts/cluster/templates/prometheus-rule.yaml | 7 +++++--
 11 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/charts/cluster/prometheus_rules/cluster-ha-critical.yaml b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml
index 014e9ec448..246a5af6b0 100644
--- a/charts/cluster/prometheus_rules/cluster-ha-critical.yaml
+++ b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml
@@ -21,4 +21,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-ha-warning.yaml b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml
index 15a5d4d13c..736ddf393d 100644
--- a/charts/cluster/prometheus_rules/cluster-ha-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
index ac83376ab9..e5de952253 100644
--- a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance maximum number of connections critical!
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
 expr: |
@@ -12,4 +12,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
index 126abd863e..ae706ee0b0 100644
--- a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is approaching the maximum number of connections.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
 expr: |
@@ -12,4 +12,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
index 4cf1610d21..ab1c175a18 100644
--- a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster high replication lag
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is experiencing a high replication lag of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is experiencing a high replication lag of
     {{ .value }}ms.

     High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
index 39900cf207..b5a90742ee 100644
--- a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
+++ b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster instances are located on the same node.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" has {{ .value }}
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has {{ .value }}
     instances on the same node {{ .labels.node }}.

     A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml
index fcacab9bee..c211bc61a9 100644
--- a/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml
+++ b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is running out of disk space!
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
 expr: |
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.9 OR
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml
index 7f36f43514..5ed3653aad 100644
--- a/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is running out of disk space.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is running low on disk space. Check attached PVCs.
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running low on disk space. Check attached PVCs.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
 expr: |
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.7 OR
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml
index 75647f7cc1..4ac68ce35b 100644
--- a/charts/cluster/prometheus_rules/cluster-offline.yaml
+++ b/charts/cluster/prometheus_rules/cluster-offline.yaml
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
index 17183986e8..0959ae87bd 100644
--- a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster instances in the same zone.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" has instances in the same availability zone.
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has instances in the same availability zone.

     A disaster in one availability zone will lead to a potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
@@ -13,4 +13,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: "{{ .namespace }}"
+  cnpg_cluster: "{{ .cluster }}"
 {{- end -}}
diff --git a/charts/cluster/templates/prometheus-rule.yaml b/charts/cluster/templates/prometheus-rule.yaml
index 1836f51d9f..9b29bc692f 100644
--- a/charts/cluster/templates/prometheus-rule.yaml
+++ b/charts/cluster/templates/prometheus-rule.yaml
@@ -15,12 +15,15 @@ spec:
         {{- $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules -}}
         {{- $_ := set $dict "value" "{{ $value }}" -}}
         {{- $_ := set $dict "namespace" .Release.Namespace -}}
-        {{- $_ := set $dict "cluster" (printf "%s/%s" .Release.Namespace (include "cluster.fullname" .)) -}}
+        {{- $_ := set $dict "cluster" (include "cluster.fullname" .) -}}
         {{- $_ := set $dict "labels" (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}") -}}
         {{- $_ := set $dict "podSelector" (printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .)) -}}
         {{- $_ := set $dict "Values" .Values -}}
         {{- $_ := set $dict "Template" .Template -}}
         {{- range $path, $_ := .Files.Glob "prometheus_rules/**.yaml" }}
-        - {{ tpl ($.Files.Get $path) $dict | nindent 10 | trim -}}
+        {{- $tpl := tpl ($.Files.Get $path) $dict | nindent 10 | trim -}}
+        {{- with $tpl }}
+        - {{ $tpl }}
+        {{- end -}}
         {{- end -}}
 {{ end }}