From 5a0f365bf323f8f6792b605965885a0771b42a3b Mon Sep 17 00:00:00 2001
From: Itay Grudev <itay.grudev@essentim.com>
Date: Fri, 24 May 2024 22:59:40 +0300
Subject: [PATCH 1/2] Bug Fix: Direct matching instead of pattern matching for
 namespace in PrometheusRules

Signed-off-by: Itay Grudev <itay.grudev@essentim.com>
---
 .../prometheus_rules/cluster-high_connection-critical.yaml      | 2 +-
 .../prometheus_rules/cluster-high_connection-warning.yaml       | 2 +-
 .../cluster/prometheus_rules/cluster-high_replication_lag.yaml  | 2 +-
 .../prometheus_rules/cluster-instances_on_same_node.yaml        | 2 +-
 charts/cluster/prometheus_rules/cluster-offline.yaml            | 2 +-
 .../cluster/prometheus_rules/cluster-zone_spread-warning.yaml   | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
index e5de95225..df13ce3b3 100644
--- a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
@@ -8,7 +8,7 @@ annotations:
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
 expr: |
-  sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95
+  sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95
 for: 5m
 labels:
   severity: critical
diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
index ae706ee0b..73cc78392 100644
--- a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
@@ -8,7 +8,7 @@ annotations:
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
 expr: |
-  sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80
+  sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80
 for: 5m
 labels:
   severity: warning
diff --git a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
index ab1c175a1..660db254f 100644
--- a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
@@ -10,7 +10,7 @@ annotations:
     High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md
 expr: |
-  max(cnpg_pg_replication_lag{namespace=~"{{ .namespace }}",pod=~"{{ .podSelector }}"}) * 1000 > 1000
+  max(cnpg_pg_replication_lag{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) * 1000 > 1000
 for: 5m
 labels:
   severity: warning
diff --git a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
index b5a90742e..aafcfab1e 100644
--- a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
+++ b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
@@ -10,7 +10,7 @@ annotations:
     A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md
 expr: |
-  count by (node) (kube_pod_info{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) > 1
+  count by (node) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) > 1
 for: 5m
 labels:
   severity: warning
diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml
index 4ac68ce35..6a1e59ddc 100644
--- a/charts/cluster/prometheus_rules/cluster-offline.yaml
+++ b/charts/cluster/prometheus_rules/cluster-offline.yaml
@@ -10,7 +10,7 @@ annotations:
     potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md
 expr: |
-  ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace=~"{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR vector(0)) > 0
+  ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR vector(0)) > 0
 for: 5m
 labels:
   severity: critical
diff --git a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
index 0959ae87b..41fa4002a 100644
--- a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
@@ -9,7 +9,7 @@ annotations:
     A disaster in one availability zone will lead to a potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
 expr: |
-  {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3
+  {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3
 for: 5m
 labels:
   severity: warning

From 2d482a596df8b48d403efc139b19da5090cf855f Mon Sep 17 00:00:00 2001
From: Itay Grudev <itay.grudev@essentim.com>
Date: Fri, 24 May 2024 23:00:00 +0300
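Subject: [PATCH 2/2] Bug Fix: CNPGClusterOffline false positives

The previous expression subtracted the number of instances reporting
cnpg_collector_up from .Values.cluster.instances and alerted when the
result was greater than 0, so it fired whenever any single instance was
missing rather than only when the whole cluster was offline.

Alert only when no instance reports cnpg_collector_up. count() returns
no series when nothing matches, so `OR on() vector(0)` supplies a 0 for
the `== 0` comparison in that case.
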
Signed-off-by: Itay Grudev <itay.grudev@essentim.com>
---
 charts/cluster/prometheus_rules/cluster-offline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml
index 6a1e59ddc..aa4dc12ee 100644
--- a/charts/cluster/prometheus_rules/cluster-offline.yaml
+++ b/charts/cluster/prometheus_rules/cluster-offline.yaml
@@ -10,7 +10,7 @@ annotations:
     potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md
 expr: |
-  ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR vector(0)) > 0
+  (count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR on() vector(0)) == 0
 for: 5m
 labels:
   severity: critical