From 181e782ae7d6514f30d57c12bbce063291c86e84 Mon Sep 17 00:00:00 2001 From: Josh Date: Tue, 26 Mar 2024 08:26:44 -0700 Subject: [PATCH 1/5] Add config value endpointCA for private S3 such as MinIO (#230) * Add config value endpointCA for private S3 such as MinIO Signed-off-by: PseudoResonance --- charts/cluster/README.md | 4 +++ .../templates/_barman_object_store.tpl | 6 ++++ charts/cluster/templates/ca-bundle.yaml | 9 +++++ charts/cluster/values.schema.json | 34 +++++++++++++++++++ charts/cluster/values.yaml | 14 ++++++++ 5 files changed, 67 insertions(+) create mode 100644 charts/cluster/templates/ca-bundle.yaml diff --git a/charts/cluster/README.md b/charts/cluster/README.md index 85094a2ce..b21feedfe 100644 --- a/charts/cluster/README.md +++ b/charts/cluster/README.md @@ -125,6 +125,8 @@ refer to the [CloudNativePG Documentation](https://cloudnative-pg.io/documentat | backups.data.jobs | int | `2` | Number of data files to be archived or restored in parallel. | | backups.destinationPath | string | `""` | Overrides the provider specific default path. Defaults to: S3: s3:// Azure: https://..core.windows.net/ Google: gs:// | | backups.enabled | bool | `false` | You need to configure backups manually, so backups are disabled by default. | +| backups.endpointCA | object | `{"create":false,"key":"","name":"","value":""}` | Specifies a CA bundle to validate a privately signed certificate. | +| backups.endpointCA.create | bool | `false` | Creates a secret with the given value if true, otherwise uses an existing secret. | | backups.endpointURL | string | `""` | Overrides the provider specific default endpoint. Defaults to: S3: https://s3..amazonaws.com" | | backups.google.applicationCredentials | string | `""` | | | backups.google.bucket | string | `""` | | @@ -190,6 +192,8 @@ refer to the [CloudNativePG Documentation](https://cloudnative-pg.io/documentat | recovery.backupName | string | `""` | Backup Recovery Method | | recovery.clusterName | string | `""` | Object Store Recovery Method | | recovery.destinationPath | string | `""` | Overrides the provider specific default path. Defaults to: S3: s3:// Azure: https://..core.windows.net/ Google: gs:// | +| recovery.endpointCA | object | `{"create":false,"key":"","name":"","value":""}` | Specifies a CA bundle to validate a privately signed certificate. | +| recovery.endpointCA.create | bool | `false` | Creates a secret with the given value if true, otherwise uses an existing secret. | | recovery.endpointURL | string | `""` | Overrides the provider specific default endpoint. 
Defaults to: S3: https://s3..amazonaws.com" Leave empty if using the default S3 endpoint | | recovery.google.applicationCredentials | string | `""` | | | recovery.google.bucket | string | `""` | | diff --git a/charts/cluster/templates/_barman_object_store.tpl b/charts/cluster/templates/_barman_object_store.tpl index 96278f11a..f002800e5 100644 --- a/charts/cluster/templates/_barman_object_store.tpl +++ b/charts/cluster/templates/_barman_object_store.tpl @@ -4,6 +4,12 @@ endpointURL: {{ .scope.endpointURL }} {{- end }} +{{- if or (.scope.endpointCA.create) (.scope.endpointCA.name) }} + endpointCA: + name: {{ .chartFullname }}-ca-bundle + key: ca-bundle.crt +{{- end }} + {{- if .scope.destinationPath }} destinationPath: {{ .scope.destinationPath }} {{- end }} diff --git a/charts/cluster/templates/ca-bundle.yaml b/charts/cluster/templates/ca-bundle.yaml new file mode 100644 index 000000000..12991c163 --- /dev/null +++ b/charts/cluster/templates/ca-bundle.yaml @@ -0,0 +1,9 @@ +{{- if .Values.backups.endpointCA.create }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ .Values.backups.endpointCA.name | default (printf "%s-ca-bundle" (include "cluster.fullname" .)) | quote }} +data: + {{ .Values.backups.endpointCA.key | default "ca-bundle.crt" | quote }}: {{ .Values.backups.endpointCA.value }} + +{{- end }} diff --git a/charts/cluster/values.schema.json b/charts/cluster/values.schema.json index 3ee174fdc..5a1c46e28 100644 --- a/charts/cluster/values.schema.json +++ b/charts/cluster/values.schema.json @@ -54,6 +54,23 @@ "enabled": { "type": "boolean" }, + "endpointCA": { + "type": "object", + "properties": { + "create": { + "type": "boolean" + }, + "key": { + "type": "string" + }, + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + } + }, "endpointURL": { "type": "string" }, @@ -335,6 +352,23 @@ "destinationPath": { "type": "string" }, + "endpointCA": { + "type": "object", + "properties": { + "create": { + "type": "boolean" + }, + "key": { + "type": "string" + }, + "name": { + "type": "string" + }, + "value": { + "type": "string" + } + } + }, "endpointURL": { "type": "string" }, diff --git a/charts/cluster/values.yaml b/charts/cluster/values.yaml index 07d979a4c..d4db49e32 100644 --- a/charts/cluster/values.yaml +++ b/charts/cluster/values.yaml @@ -41,6 +41,13 @@ recovery: # S3: https://s3..amazonaws.com" # Leave empty if using the default S3 endpoint endpointURL: "" + # -- Specifies a CA bundle to validate a privately signed certificate. + endpointCA: + # -- Creates a secret with the given value if true, otherwise uses an existing secret. + create: false + name: "" + key: "" + value: "" # -- Overrides the provider specific default path. Defaults to: # S3: s3:// # Azure: https://..core.windows.net/ @@ -184,6 +191,13 @@ backups: # -- Overrides the provider specific default endpoint. Defaults to: # S3: https://s3..amazonaws.com" endpointURL: "" # Leave empty if using the default S3 endpoint + # -- Specifies a CA bundle to validate a privately signed certificate. + endpointCA: + # -- Creates a secret with the given value if true, otherwise uses an existing secret. + create: false + name: "" + key: "" + value: "" # -- Overrides the provider specific default path. 
Defaults to: # S3: s3:// From ac0a34ee73106e0df6f77394b3258920248d4b52 Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Tue, 26 Mar 2024 20:05:44 +0200 Subject: [PATCH 2/5] chore(deps): update actions/setup-python digest to 82c7e63 (#231) * chore(deps): update actions/setup-python digest to 82c7e63 Signed-off-by: Itay Grudev Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: Itay Grudev --- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 2f836614a..9e0fccaa5 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -20,7 +20,7 @@ jobs: with: version: v3.4.0 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: 3.7 From 741234645197be23cef35584f9bc17b6b63c7a80 Mon Sep 17 00:00:00 2001 From: Itay Grudev Date: Tue, 26 Mar 2024 23:01:58 +0200 Subject: [PATCH 3/5] Added the ability to exclude specific PrometheusRules (#232) * Added the ability to exclude specific PrometheusRules Signed-off-by: Itay Grudev --- charts/cluster/README.md | 1 + .../prometheus_rules/cluster-ha-critical.yaml | 24 +++ .../prometheus_rules/cluster-ha-warning.yaml | 22 +++ .../cluster-high_connection-critical.yaml | 15 ++ .../cluster-high_connection-warning.yaml | 15 ++ .../cluster-high_replication_lag.yaml | 17 ++ .../cluster-instances_on_same_node.yaml | 17 ++ .../cluster-low_disk_space-critical.yaml | 22 +++ .../cluster-low_disk_space-warning.yaml | 22 +++ .../prometheus_rules/cluster-offline.yaml | 17 ++ .../cluster-zone_spread-warning.yaml | 16 ++ charts/cluster/templates/prometheus-rule.yaml | 174 ++---------------- charts/cluster/values.schema.json | 3 + charts/cluster/values.yaml | 5 +- 14 files changed, 206 insertions(+), 164 deletions(-) create mode 100644 charts/cluster/prometheus_rules/cluster-ha-critical.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-ha-warning.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-offline.yaml create mode 100644 charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml diff --git a/charts/cluster/README.md b/charts/cluster/README.md index b21feedfe..af047fba9 100644 --- a/charts/cluster/README.md +++ b/charts/cluster/README.md @@ -160,6 +160,7 @@ refer to the [CloudNativePG Documentation](https://cloudnative-pg.io/documentat | cluster.monitoring.enabled | bool | `false` | Whether to enable monitoring | | cluster.monitoring.podMonitor.enabled | bool | `true` | Whether to enable the PodMonitor | | cluster.monitoring.prometheusRule.enabled | bool | `true` | Whether to enable the PrometheusRule automated alerts | +| cluster.monitoring.prometheusRule.excludeRules | list | `[]` | Exclude specified rules | | cluster.postgresGID | int | `26` | The GID of the postgres user inside the image, 
defaults to 26 | | cluster.postgresUID | int | `26` | The UID of the postgres user inside the image, defaults to 26 | | cluster.postgresql | object | `{}` | Configuration of the PostgreSQL server. See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration | diff --git a/charts/cluster/prometheus_rules/cluster-ha-critical.yaml b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml new file mode 100644 index 000000000..014e9ec44 --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml @@ -0,0 +1,24 @@ +{{- $alert := "CNPGClusterHACritical" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Cluster has no standby replicas! + description: |- + CloudNativePG Cluster "{{ .labels.job }}" has no ready standby replicas. Your cluster is at severe + risk of data loss and downtime if the primary instance fails. + + The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint + will fail. The `-r` endpoint is operating at reduced capacity and all traffic is being served by the primary. + + This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or fewer + instances. The replaced instance may need some time to catch up with the cluster primary instance. + + This alarm will always trigger if your cluster is configured to run with only 1 instance. In this + case you may want to silence it. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md +expr: | + max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 1 +for: 5m +labels: + severity: critical +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-ha-warning.yaml b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml new file mode 100644 index 000000000..15a5d4d13 --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml @@ -0,0 +1,22 @@ +{{- $alert := "CNPGClusterHAWarning" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Cluster has fewer than 2 standby replicas. + description: |- + CloudNativePG Cluster "{{ .labels.job }}" has only {{ .value }} standby replicas, putting + your cluster at risk if another instance fails. The cluster is still able to operate normally, although + the `-ro` and `-r` endpoints operate at reduced capacity. + + This can happen during a normal fail-over or automated minor version upgrades. The replaced instance may + need some time to catch up with the cluster primary instance. + + This alarm will be constantly triggered if your cluster is configured to run with fewer than 3 instances. + In this case you may want to silence it.
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md +expr: | + max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ .namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ .namespace }}"}) < 2 +for: 5m +labels: + severity: warning +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml new file mode 100644 index 000000000..ac83376ab --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml @@ -0,0 +1,15 @@ +{{- $alert := "CNPGClusterHighConnectionsCritical" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Instance maximum number of connections critical! + description: |- + CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md +expr: | + sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 95 +for: 5m +labels: + severity: critical +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml new file mode 100644 index 000000000..126abd863 --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml @@ -0,0 +1,15 @@ +{{- $alert := "CNPGClusterHighConnectionsWarning" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Instance is approaching the maximum number of connections. + description: |- + CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of + the maximum number of connections. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md +expr: | + sum by (pod) (cnpg_backends_total{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) * 100 > 80 +for: 5m +labels: + severity: warning +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml new file mode 100644 index 000000000..4cf1610d2 --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml @@ -0,0 +1,17 @@ +{{- $alert := "CNPGClusterHighReplicationLag" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Cluster high replication lag + description: |- + CloudNativePG Cluster "{{ .cluster }}" is experiencing a high replication lag of + {{ .value }}ms. + + High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration. 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md +expr: | + max(cnpg_pg_replication_lag{namespace=~"{{ .namespace }}",pod=~"{{ .podSelector }}"}) * 1000 > 1000 +for: 5m +labels: + severity: warning +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml new file mode 100644 index 000000000..39900cf20 --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml @@ -0,0 +1,17 @@ +{{- $alert := "CNPGClusterInstancesOnSameNode" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Cluster instances are located on the same node. + description: |- + CloudNativePG Cluster "{{ .cluster }}" has {{ .value }} + instances on the same node {{ .labels.node }}. + + A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md +expr: | + count by (node) (kube_pod_info{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"}) > 1 +for: 5m +labels: + severity: warning +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml new file mode 100644 index 000000000..fcacab9be --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml @@ -0,0 +1,22 @@ +{{- $alert := "CNPGClusterLowDiskSpaceCritical" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Instance is running out of disk space! + description: |- + CloudNativePG Cluster "{{ .cluster }}" is running extremely low on disk space. Check attached PVCs! 
+ runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md +expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.9 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"})) > 0.9 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ .podSelector }}"} + ) > 0.9 +for: 5m +labels: + severity: critical +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml new file mode 100644 index 000000000..7f36f4351 --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml @@ -0,0 +1,22 @@ +{{- $alert := "CNPGClusterLowDiskSpaceWarning" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Instance is running out of disk space. + description: |- + CloudNativePG Cluster "{{ .cluster }}" is running low on disk space. Check attached PVCs. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md +expr: | + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.7 OR + max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-wal"})) > 0.7 OR + max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"}) + / + sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}-tbs.*"}) + * + on(namespace, persistentvolumeclaim) group_left(volume) + kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ .podSelector }}"} + ) > 0.7 +for: 5m +labels: + severity: warning +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml new file mode 100644 index 000000000..75647f7cc --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-offline.yaml @@ -0,0 +1,17 @@ +{{- $alert := "CNPGClusterOffline" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Cluster has no running instances! 
+ description: |- + CloudNativePG Cluster "{{ .labels.job }}" has no ready instances. + + Having an offline cluster means your applications will not be able to access the database, leading to + potential service disruption and/or data loss. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md +expr: | + ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace=~"{{ .namespace }}",pod=~"{{ .podSelector }}"}) OR vector(0)) > 0 +for: 5m +labels: + severity: critical +{{- end -}} diff --git a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml new file mode 100644 index 000000000..17183986e --- /dev/null +++ b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml @@ -0,0 +1,16 @@ +{{- $alert := "CNPGClusterZoneSpreadWarning" -}} +{{- if not (has $alert .excludeRules) -}} +alert: {{ $alert }} +annotations: + summary: CNPG Cluster instances in the same zone. + description: |- + CloudNativePG Cluster "{{ .cluster }}" has instances in the same availability zone. + + A disaster in one availability zone will lead to a potential service disruption and/or data loss. + runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md +expr: | + {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace=~"{{ .namespace }}", pod=~"{{ .podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3 +for: 5m +labels: + severity: warning +{{- end -}} diff --git a/charts/cluster/templates/prometheus-rule.yaml b/charts/cluster/templates/prometheus-rule.yaml index bc3ffde84..380f90a7d 100644 --- a/charts/cluster/templates/prometheus-rule.yaml +++ b/charts/cluster/templates/prometheus-rule.yaml @@ -1,9 +1,4 @@ {{- if and .Values.cluster.monitoring.enabled .Values.cluster.monitoring.prometheusRule.enabled -}} -{{- $value := "{{ $value }}" -}} -{{- $namespace := .Release.Namespace -}} -{{- $cluster := printf "%s/%s" $namespace (include "cluster.fullname" .)}} -{{- $labels := dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}" -}} -{{- $podSelector := printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .) -}} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: @@ -16,162 +11,15 @@ metadata: spec: groups: - name: cloudnative-pg/{{ include "cluster.fullname" . }} - rules: - - alert: CNPGClusterHAWarning - annotations: - summary: CNPG Cluster less than 2 standby replicas. - description: |- - CloudNativePG Cluster "{{ $labels.job }}" has only {{ $value }} standby replicas, putting - your cluster at risk if another instance fails. The cluster is still able to operate normally, although - the `-ro` and `-r` endpoints operate at reduced capacity. - - This can happen during a normal fail-over or automated minor version upgrades. The replaced instance may - need some time to catch-up with the cluster primary instance. - - This alarm will be constantly triggered if your cluster is configured to run with less than 3 instances. - In this case you may want to silence it. 
- runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHAWarning.md - expr: | - max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ $namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ $namespace }}"}) < 2 - for: 5m - labels: - severity: warning - - alert: CNPGClusterHACritical - annotations: - summary: CNPG Cluster has no standby replicas! - description: |- - CloudNativePG Cluster "{{ $labels.job }}" has no ready standby replicas. Your cluster at a severe - risk of data loss and downtime if the primary instance fails. - - The primary instance is still online and able to serve queries, although connections to the `-ro` endpoint - will fail. The `-r` endpoint os operating at reduced capacity and all traffic is being served by the main. - - This can happen during a normal fail-over or automated minor version upgrades in a cluster with 2 or less - instances. The replaced instance may need some time to catch-up with the cluster primary instance. - - This alarm will be always trigger if your cluster is configured to run with only 1 instance. In this - case you may want to silence it. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHACritical.md - expr: | - max by (job) (cnpg_pg_replication_streaming_replicas{namespace="{{ $namespace }}"} - cnpg_pg_replication_is_wal_receiver_up{namespace="{{ $namespace }}"}) < 1 - for: 5m - labels: - severity: critical - - alert: CNPGClusterOffline - annotations: - summary: CNPG Cluster has no running instances! - description: |- - CloudNativePG Cluster "{{ $labels.job }}" has no ready instances. - - Having an offline cluster means your applications will not be able to access the database, leading to - potential service disruption and/or data loss. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterOffline.md - expr: | - ({{ .Values.cluster.instances }} - count(cnpg_collector_up{namespace=~"{{ $namespace }}",pod=~"{{ $podSelector }}"}) OR vector(0)) > 0 - for: 5m - labels: - severity: critical - - alert: CNPGClusterZoneSpreadWarning - annotations: - summary: CNPG Cluster instances in the same zone. - description: |- - CloudNativePG Cluster "{{ $cluster }}" has instances in the same availability zone. - - A disaster in one availability zone will lead to a potential service disruption and/or data loss. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md - expr: | - {{ .Values.cluster.instances }} > count(count by (label_topology_kubernetes_io_zone) (kube_pod_info{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"} * on(node,instance) group_left(label_topology_kubernetes_io_zone) kube_node_labels)) < 3 - for: 5m - labels: - severity: warning - - alert: CNPGClusterInstancesOnSameNode - annotations: - summary: CNPG Cluster instances are located on the same node. - description: |- - CloudNativePG Cluster "{{ $cluster }}" has {{ $value }} - instances on the same node {{ $labels.node }}. - - A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss. 
- runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterInstancesOnSameNode.md - expr: | - count by (node) (kube_pod_info{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) > 1 - for: 5m - labels: - severity: warning - - alert: CNPGClusterHighReplicationLag - annotations: - summary: CNPG Cluster high replication lag - description: |- - CloudNativePG Cluster "{{ $cluster }}" is experiencing a high replication lag of - {{ "{{ $value }}" }}ms. - - High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighReplicationLag.md - expr: | - max(cnpg_pg_replication_lag{namespace=~"{{ $namespace }}",pod=~"{{ $podSelector }}"}) * 1000 > 1000 - for: 5m - labels: - severity: warning - - alert: CNPGClusterHighConnectionsWarning - annotations: - summary: CNPG Instance is approaching the maximum number of connections. - description: |- - CloudNativePG Cluster "{{ $cluster }}" instance {{ $labels.pod }} is using {{ "{{ $value }}" }}% of - the maximum number of connections. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md - expr: | - sum by (pod) (cnpg_backends_total{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) * 100 > 80 - for: 5m - labels: - severity: warning - - alert: CNPGClusterHighConnectionsCritical - annotations: - summary: CNPG Instance maximum number of connections critical! - description: |- - CloudNativePG Cluster "{{ $cluster }}" instance {{ $labels.pod }} is using {{ "{{ $value }}" }}% of - the maximum number of connections. - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md - expr: | - sum by (pod) (cnpg_backends_total{namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) / max by (pod) (cnpg_pg_settings_setting{name="max_connections", namespace=~"{{ $namespace }}", pod=~"{{ $podSelector }}"}) * 100 > 95 - for: 5m - labels: - severity: critical - - alert: CNPGClusterLowDiskSpaceWarning - annotations: - summary: CNPG Instance is running out of disk space. - description: |- - CloudNativePG Cluster "{{ $cluster }}" is running low on disk space. Check attached PVCs. 
- runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md - expr: | - max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"})) > 0.7 OR - max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"})) > 0.7 OR - max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) - / - sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) - * - on(namespace, persistentvolumeclaim) group_left(volume) - kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $podSelector }}"} - ) > 0.7 - for: 5m - labels: - severity: warning - - alert: CNPGClusterLowDiskSpaceCritical - annotations: - summary: CNPG Instance is running out of disk space! - description: |- - CloudNativePG Cluster "{{ $cluster }}" is running extremely low on disk space. Check attached PVCs! - runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md - expr: | - max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}"})) > 0.9 OR - max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"} / kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-wal"})) > 0.9 OR - max(sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_used_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) - / - sum by (namespace,persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes{namespace="{{ $namespace }}", persistentvolumeclaim=~"{{ $podSelector }}-tbs.*"}) - * - on(namespace, persistentvolumeclaim) group_left(volume) - kube_pod_spec_volumes_persistentvolumeclaims_info{pod=~"{{ $podSelector }}"} - ) > 0.9 - for: 5m - labels: - severity: critical + rules: | + {{ $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules -}} + {{- $_ := set $dict "value" "{{ $value }}" -}} + {{- $_ := set $dict "namespace" .Release.Namespace -}} + {{- $_ := set $dict "cluster" (printf "%s/%s" .Release.Namespace (include "cluster.fullname" .)) -}} + {{- $_ := set $dict "labels" (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}") -}} + {{- $_ := set $dict "podSelector" (printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .)) -}} + {{- $_ := set $dict "Values" .Values -}} + {{- range $path, $_ := .Files.Glob "prometheus_rules/**.yaml" }} + - {{ tpl ($.Files.Get $path) $dict | nindent 10 | trim -}} + {{- end -}} {{ end }} diff --git a/charts/cluster/values.schema.json b/charts/cluster/values.schema.json index 5a1c46e28..0c7c1bbe0 100644 --- 
a/charts/cluster/values.schema.json +++ b/charts/cluster/values.schema.json @@ -213,6 +213,9 @@ "properties": { "enabled": { "type": "boolean" + }, + "excludeRules": { + "type": "array" } } } diff --git a/charts/cluster/values.yaml b/charts/cluster/values.yaml index d4db49e32..bff9d2202 100644 --- a/charts/cluster/values.yaml +++ b/charts/cluster/values.yaml @@ -151,8 +151,11 @@ cluster: # -- Whether to enable the PodMonitor enabled: true prometheusRule: - # -- Whether to enable the PrometheusRule automated alerts + # -- Whether to enable the PrometheusRule automated alerts enabled: true + # -- Exclude specified rules + excludeRules: [] + # - CNPGClusterZoneSpreadWarning # -- Custom Prometheus metrics customQueries: [] # - name: "pg_cache_hit_ratio" From f2d6ea7effe8c7220ddcd7f2001345abf46a14a8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:04:26 +0200 Subject: [PATCH 4/5] Release cluster-v0.0.4 (#234) * Added the ability to exclude specific PrometheusRules by @itay-grudev (#232) * Added config value endpointCA for private S3 such as MinIO by @PseudoResonance (#229, #230) * Bug Fix: Severity of CNPGClusterLowDiskSpaceCritical should be critical not warning by @baurmatt (#223) * New `backup.barmanObjectStore.wal` and `backup.barmanObjectStore.data` to support disabling encryption by @itay-grudev (#198, #221) Signed-off-by: Itay Grudev Co-authored-by: Itay Grudev --- RELEASE.md | 4 ++-- charts/cluster/Chart.yaml | 2 +- charts/cluster/README.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 984d1d550..d328344e0 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -32,7 +32,7 @@ In order to create a new release of the `cloudnative-pg` chart, follow these ste ``` 3. Create a branch named `release/cloudnative-pg-vX.Y.Z` and switch to it: ```bash - git checkout -b release/cloudnative-pg-v$NEW_VERSION + git switch --create release/cloudnative-pg-v$NEW_VERSION ``` 4. Update the `.version` in the [Chart.yaml](./charts/cloudnative-pg/Chart.yaml) file to `"X.Y.Z"` ```bash @@ -113,7 +113,7 @@ In order to create a new release of the `cluster` chart, follow these steps: ``` 3. Create a branch: named `release/cluster-vX.Y.Z` and switch to it ```bash - git checkout -b release/cluster-v$NEW_VERSION + git switch --create release/cluster-v$NEW_VERSION ``` 4. Update the `.version` in the [Chart.yaml](./charts/cluster/Chart.yaml) file to `"X.Y.Z"` ```bash diff --git a/charts/cluster/Chart.yaml b/charts/cluster/Chart.yaml index c57c3e6a9..dccec1ead 100644 --- a/charts/cluster/Chart.yaml +++ b/charts/cluster/Chart.yaml @@ -18,7 +18,7 @@ name: cluster description: Deploys and manages a CloudNativePG cluster and its associated resources. 
icon: https://raw.githubusercontent.com/cloudnative-pg/artwork/main/cloudnativepg-logo.svg type: application -version: 0.0.3 +version: 0.0.4 sources: - https://github.com/cloudnative-pg/charts keywords: diff --git a/charts/cluster/README.md b/charts/cluster/README.md index af047fba9..ce4d45567 100644 --- a/charts/cluster/README.md +++ b/charts/cluster/README.md @@ -1,6 +1,6 @@ # cluster -![Version: 0.0.3](https://img.shields.io/badge/Version-0.0.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) +![Version: 0.0.4](https://img.shields.io/badge/Version-0.0.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) > **Warning** > ### This chart is under active development. From 0a85ff4cc0d73f9bd494ae641d74fe29342b77b6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:48:51 +0200 Subject: [PATCH 5/5] Release cluster-v0.0.5 (#236) Bug Fix: PrometheusRules template issue prevents upgrading the chart Signed-off-by: Itay Grudev Co-authored-by: Itay Grudev --- charts/cluster/Chart.yaml | 2 +- charts/cluster/README.md | 2 +- charts/cluster/templates/prometheus-rule.yaml | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/charts/cluster/Chart.yaml b/charts/cluster/Chart.yaml index dccec1ead..fd2be2809 100644 --- a/charts/cluster/Chart.yaml +++ b/charts/cluster/Chart.yaml @@ -18,7 +18,7 @@ name: cluster description: Deploys and manages a CloudNativePG cluster and its associated resources. icon: https://raw.githubusercontent.com/cloudnative-pg/artwork/main/cloudnativepg-logo.svg type: application -version: 0.0.4 +version: 0.0.5 sources: - https://github.com/cloudnative-pg/charts keywords: diff --git a/charts/cluster/README.md b/charts/cluster/README.md index ce4d45567..fb2fdb814 100644 --- a/charts/cluster/README.md +++ b/charts/cluster/README.md @@ -1,6 +1,6 @@ # cluster -![Version: 0.0.4](https://img.shields.io/badge/Version-0.0.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) +![Version: 0.0.5](https://img.shields.io/badge/Version-0.0.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) > **Warning** > ### This chart is under active development. diff --git a/charts/cluster/templates/prometheus-rule.yaml b/charts/cluster/templates/prometheus-rule.yaml index 380f90a7d..1836f51d9 100644 --- a/charts/cluster/templates/prometheus-rule.yaml +++ b/charts/cluster/templates/prometheus-rule.yaml @@ -11,14 +11,15 @@ metadata: spec: groups: - name: cloudnative-pg/{{ include "cluster.fullname" . 
}} - rules: | - {{ $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules -}} + rules: + {{- $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules -}} {{- $_ := set $dict "value" "{{ $value }}" -}} {{- $_ := set $dict "namespace" .Release.Namespace -}} {{- $_ := set $dict "cluster" (printf "%s/%s" .Release.Namespace (include "cluster.fullname" .)) -}} {{- $_ := set $dict "labels" (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}") -}} {{- $_ := set $dict "podSelector" (printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .)) -}} {{- $_ := set $dict "Values" .Values -}} + {{- $_ := set $dict "Template" .Template -}} {{- range $path, $_ := .Files.Glob "prometheus_rules/**.yaml" }} - {{ tpl ($.Files.Get $path) $dict | nindent 10 | trim -}} {{- end -}}
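The `endpointCA` values introduced in PATCH 1/5 are easiest to read from a concrete override file. Below is a minimal sketch, not a configuration shipped with the chart: the MinIO hostname and the certificate value are illustrative placeholders, and bucket, credential, and other provider settings are omitted.

```yaml
# Hypothetical values override for a private S3-compatible store such as MinIO.
backups:
  enabled: true
  # Private S3-compatible endpoint presenting a privately signed certificate
  endpointURL: "https://minio.internal.example:9000"
  endpointCA:
    # true: the chart renders the Secret in templates/ca-bundle.yaml from `value`
    # false: an existing Secret referenced by `name`/`key` is used instead
    create: true
    name: ""  # defaults to <cluster fullname>-ca-bundle
    key: ""   # defaults to ca-bundle.crt
    # Secret `data` entries must already be base64-encoded PEM;
    # truncated placeholder shown here
    value: "LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUIuLi4="
```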
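Note that in this revision `_barman_object_store.tpl` always points the cluster at `{{ .chartFullname }}-ca-bundle` with key `ca-bundle.crt`, while `ca-bundle.yaml` honours the `name` and `key` overrides, so overriding either value would leave the cluster referencing a Secret entry that is never created; keeping both at their defaults avoids the mismatch.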
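The exclusion mechanism from PATCH 3/5 is driven purely by alert names: each rule file begins with `{{- if not (has $alert .excludeRules) -}}` and renders nothing when its name is listed. A minimal sketch of values for a single-instance development cluster follows; the scenario is illustrative, and the alert names are the ones declared at the top of the rule files above.

```yaml
cluster:
  instances: 1
  monitoring:
    enabled: true
    prometheusRule:
      enabled: true
      # Alerts that are expected to fire permanently on a 1-instance cluster,
      # per the guidance in their own descriptions, so drop them at render time.
      excludeRules:
        - CNPGClusterHAWarning
        - CNPGClusterHACritical
        - CNPGClusterZoneSpreadWarning
```

Rendering the chart with `helm template` before and after setting `excludeRules` is a quick way to confirm that the corresponding entries disappear from the generated PrometheusRule.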
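The rendering pipeline in `prometheus-rule.yaml` is worth spelling out, since PATCH 5/5 exists only because of it: `tpl` evaluates each rule file against a hand-built dict rather than the root context, so anything the rule files need (`.Values`, plus `.Template`, which `tpl` itself requires on recent Helm versions) must be copied into that dict explicitly, and literal Prometheus placeholders such as `{{ $value }}` are passed through as plain string values that Helm never re-evaluates. PATCH 5/5 also replaces `rules: |`, a block scalar that would have emitted the rules as one opaque string, with a genuine YAML list. A stripped-down sketch of the same pattern outside this chart follows; the file layout and key names are illustrative, not part of the chart.

```yaml
{{- /* templates/rendered-rules.yaml: illustrative standalone example */ -}}
{{- $ctx := dict "excludeRules" (list "ExampleExcludedAlert") -}}
{{- $_ := set $ctx "namespace" .Release.Namespace -}}
{{- /* a quoted string inside a template action is not re-scanned, so this
       survives as a literal Prometheus template expression in the output */ -}}
{{- $_ := set $ctx "value" "{{ $value }}" -}}
{{- $_ := set $ctx "Values" .Values -}}
{{- $_ := set $ctx "Template" .Template -}}
rules:
{{- range $path, $_ := .Files.Glob "rules/**.yaml" }}
  - {{ tpl ($.Files.Get $path) $ctx | nindent 4 | trim }}
{{- end }}
```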