From 5a0f365bf323f8f6792b605965885a0771b42a3b Mon Sep 17 00:00:00 2001 From: Itay Grudev Date: Fri, 24 May 2024 22:59:40 +0300 Subject: [PATCH 1/2] Bug Fix: Direct matching instead of pattern matching for namespace in PrometheusRules Signed-off-by: Itay Grudev --- .../prometheus_rules/cluster-high_connection-critical.yaml | 2 +- .../prometheus_rules/cluster-high_connection-warning.yaml | 2 +- .../cluster/prometheus_rules/cluster-high_replication_lag.yaml | 2 +- .../prometheus_rules/cluster-instances_on_same_node.yaml | 2 +- charts/cluster/prometheus_rules/cluster-offline.yaml | 2 +- .../cluster/prometheus_rules/cluster-zone_spread-warning.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml index e5de95225..df13ce3b3 100644 --- a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml +++ b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml @@ -8,7 +8,7 @@ annotations: the maximum number of connections. runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md expr: | - sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95 + sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95 for: 5m labels: severity: critical diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml index ae706ee0b..73cc78392 100644 --- a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml +++ b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml @@ -8,7 +8,7 @@ annotations: the maximum number of connections. runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md expr: | - sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80 + sum by (pod) (cnpg_backends_total{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80 for: 5m labels: severity: warning diff --git a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml index ab1c175a1..660db254f 100644 --- a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml +++ b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml @@ -10,7 +10,7 @@ annotations: High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration. runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md expr: | - max(cnpg_pg_replication_lag{namespace=~"{{ .namespace }}",pod=~"{{ .podSelector }}"}) * 1000 > 1000 + max(cnpg_pg_replication_lag{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) * 1000 > 1000 for: 5m labels: severity: warning diff --git a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml index b5a90742e..aafcfab1e 100644 --- a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml +++ b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml @@ -10,7 +10,7 @@ annotations: A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss. runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md expr: | - count by (node) (kube_pod_info{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) > 1 + count by (node) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"}) > 1 for: 5m labels: severity: warning diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml index 4ac68ce35..6a1e59ddc 100644 --- a/charts/cluster/prometheus_rules/cluster-offline.yaml +++ b/charts/cluster/prometheus_rules/cluster-offline.yaml @@ -10,7 +10,7 @@ annotations: potential service disruption and/or data loss. runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md expr: | - ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace=~"{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR vector(0)) > 0 + ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR vector(0)) > 0 for: 5m labels: severity: critical diff --git a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml index 0959ae87b..41fa4002a 100644 --- a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml +++ b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml @@ -9,7 +9,7 @@ annotations: A disaster in one availability zone will lead to a potential service disruption and/or data loss. runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md expr: | - {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3 + {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace="{{ .namespace }}", pod=~"{{ .podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3 for: 5m labels: severity: warning From 2d482a596df8b48d403efc139b19da5090cf855f Mon Sep 17 00:00:00 2001 From: Itay Grudev Date: Fri, 24 May 2024 23:00:00 +0300 Subject: [PATCH 2/2] Bug Fix: CNPGClusterOffline false positives Signed-off-by: Itay Grudev --- charts/cluster/prometheus_rules/cluster-offline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml index 6a1e59ddc..aa4dc12ee 100644 --- a/charts/cluster/prometheus_rules/cluster-offline.yaml +++ b/charts/cluster/prometheus_rules/cluster-offline.yaml @@ -10,7 +10,7 @@ annotations: potential service disruption and/or data loss. runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md expr: | - ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR vector(0)) > 0 + (count(cnpg_collector_up{namespace="{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR on() vector(0)) == 0 for: 5m labels: severity: critical