Skip to content

Commit

Permalink
Added additional PrometheusRule annotations
Browse files: browse the repository at this point in the history
Signed-off-by: Itay Grudev <[email protected]>
  • Loading branch information
itay-grudev committed Mar 27, 2024
1 parent 0a85ff4 commit c79f7e0
Show file tree
Hide file tree
Showing 11 changed files with 32 additions and 9 deletions.
2 changes: 2 additions & 0 deletions charts/cluster/prometheus_rules/cluster-ha-critical.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@ expr: |
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
2 changes: 2 additions & 0 deletions charts/cluster/prometheus_rules/cluster-ha-warning.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,6 @@ expr: |
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ alert: {{ $alert }}
annotations:
summary: CNPG Instance maximum number of connections critical!
description: |-
CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
the maximum number of connections.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
expr: |
sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ alert: {{ $alert }}
annotations:
summary: CNPG Instance is approaching the maximum number of connections.
description: |-
CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
the maximum number of connections.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
expr: |
sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ alert: {{ $alert }}
annotations:
summary: CNPG Cluster high replication lag
description: |-
CloudNativePG Cluster "{{ .cluster }}" is experiencing a high replication lag of
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is experiencing a high replication lag of
{{ .value }}ms.
High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
Expand All @@ -14,4 +14,6 @@ expr: |
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ alert: {{ $alert }}
annotations:
summary: CNPG Cluster instances are located on the same node.
description: |-
CloudNativePG Cluster "{{ .cluster }}" has {{ .value }}
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has {{ .value }}
instances on the same node {{ .labels.node }}.
A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
Expand All @@ -14,4 +14,6 @@ expr: |
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ alert: {{ $alert }}
annotations:
summary: CNPG Instance is running out of disk space!
description: |-
CloudNativePG Cluster "{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
expr: |
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.9 OR
Expand All @@ -19,4 +19,6 @@ expr: |
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ alert: {{ $alert }}
annotations:
summary: CNPG Instance is running out of disk space.
description: |-
CloudNativePG Cluster "{{ .cluster }}" is running low on disk space. Check attached PVCs.
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running low on disk space. Check attached PVCs.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
expr: |
max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.7 OR
Expand All @@ -19,4 +19,6 @@ expr: |
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
2 changes: 2 additions & 0 deletions charts/cluster/prometheus_rules/cluster-offline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@ expr: |
for: 5m
labels:
severity: critical
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ alert: {{ $alert }}
annotations:
summary: CNPG Cluster instances in the same zone.
description: |-
CloudNativePG Cluster "{{ .cluster }}" has instances in the same availability zone.
CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has instances in the same availability zone.
A disaster in one availability zone will lead to a potential service disruption and/or data loss.
runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
Expand All @@ -13,4 +13,6 @@ expr: |
for: 5m
labels:
severity: warning
namespace: {{ .namespace }}
cnpg_cluster: {{ .cluster }}
{{- end -}}
7 changes: 5 additions & 2 deletions charts/cluster/templates/prometheus-rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@ spec:
{{- $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules -}}
{{- $_ := set $dict "value" "{{ $value }}" -}}
{{- $_ := set $dict "namespace" .Release.Namespace -}}
{{- $_ := set $dict "cluster" (printf "%s/%s" .Release.Namespace (include "cluster.fullname" .)) -}}
{{- $_ := set $dict "cluster" (include "cluster.fullname" .) -}}
{{- $_ := set $dict "labels" (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}") -}}
{{- $_ := set $dict "podSelector" (printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .)) -}}
{{- $_ := set $dict "Values" .Values -}}
{{- $_ := set $dict "Template" .Template -}}
{{- range $path, $_ := .Files.Glob "prometheus_rules/**.yaml" }}
- {{ tpl ($.Files.Get $path) $dict | nindent 10 | trim -}}
{{- $tpl := tpl ($.Files.Get $path) $dict | nindent 10 | trim -}}
{{- with $tpl }}
- {{ $tpl }}
{{- end -}}
{{- end -}}
{{ end }}

0 comments on commit c79f7e0

Please sign in to comment.