Added additional PrometheusRule annotations

Signed-off-by: Itay Grudev <[email protected]>
cloudnative-pg · Mar 27, 2024 · c79f7e0 · c79f7e0
1 parent 0a85ff4
commit c79f7e0
Show file tree

Hide file tree

Showing 11 changed files with 32 additions and 9 deletions.
diff --git a/charts/cluster/prometheus_rules/cluster-ha-critical.yaml b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml
@@ -21,4 +21,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-ha-warning.yaml b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
@@ -4,12 +4,14 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance maximum number of connections critical!
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
 expr: |
   sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95
 for: 5m
 labels:
   severity: critical
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
@@ -4,12 +4,14 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is approaching the maximum number of connections.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
 expr: |
   sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster high replication lag
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is experiencing a high replication lag of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is experiencing a high replication lag of
     {{ .value }}ms.
 
     High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster instances are located on the same node.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" has {{ .value }}
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has {{ .value }}
     instances on the same node {{ .labels.node }}.
 
     A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is running out of disk space!
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
 expr: |
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.9 OR
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is running out of disk space.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is running low on disk space. Check attached PVCs.
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running low on disk space. Check attached PVCs.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
 expr: |
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.7 OR
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster instances in the same zone.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" has instances in the same availability zone.
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has instances in the same availability zone.
 
     A disaster in one availability zone will lead to a potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
@@ -13,4 +13,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/templates/prometheus-rule.yaml b/charts/cluster/templates/prometheus-rule.yaml
@@ -15,12 +15,15 @@ spec:
         {{- $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules -}}
         {{- $_ := set $dict "value"       "{{ $value }}" -}}
         {{- $_ := set $dict "namespace"   .Release.Namespace -}}
-        {{- $_ := set $dict "cluster"     (printf "%s/%s" .Release.Namespace (include "cluster.fullname" .)) -}}
+        {{- $_ := set $dict "cluster"     (include "cluster.fullname" .) -}}
         {{- $_ := set $dict "labels"      (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}") -}}
         {{- $_ := set $dict "podSelector" (printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .)) -}}
         {{- $_ := set $dict "Values"      .Values -}}
         {{- $_ := set $dict "Template"    .Template -}}
         {{- range $path, $_ := .Files.Glob  "prometheus_rules/**.yaml" }}
-        - {{ tpl ($.Files.Get $path) $dict | nindent 10 | trim -}}
+        {{- $tpl := tpl ($.Files.Get $path) $dict | nindent 10 | trim -}}
+        {{- with $tpl }}
+        - {{ $tpl }}
+        {{- end -}}
         {{- end -}}
 {{ end }}