From 36142ddfa538cb552f69a06f203bb50a6d1c393c Mon Sep 17 00:00:00 2001
From: Itay Grudev
Date: Wed, 3 Apr 2024 12:46:39 +0300
Subject: [PATCH 1/2] Added additional PrometheusRule labels (#238)

Signed-off-by: Itay Grudev
---
 charts/cluster/prometheus_rules/cluster-ha-critical.yaml | 2 ++
 charts/cluster/prometheus_rules/cluster-ha-warning.yaml | 2 ++
 .../prometheus_rules/cluster-high_connection-critical.yaml | 4 +++-
 .../prometheus_rules/cluster-high_connection-warning.yaml | 4 +++-
 .../prometheus_rules/cluster-high_replication_lag.yaml | 4 +++-
 .../prometheus_rules/cluster-instances_on_same_node.yaml | 4 +++-
 .../prometheus_rules/cluster-low_disk_space-critical.yaml | 4 +++-
 .../prometheus_rules/cluster-low_disk_space-warning.yaml | 4 +++-
 charts/cluster/prometheus_rules/cluster-offline.yaml | 2 ++
 .../prometheus_rules/cluster-zone_spread-warning.yaml | 4 +++-
 charts/cluster/templates/prometheus-rule.yaml | 7 +++++--
 11 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/charts/cluster/prometheus_rules/cluster-ha-critical.yaml b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml
index 014e9ec44..246a5af6b 100644
--- a/charts/cluster/prometheus_rules/cluster-ha-critical.yaml
+++ b/charts/cluster/prometheus_rules/cluster-ha-critical.yaml
@@ -21,4 +21,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-ha-warning.yaml b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml
index 15a5d4d13..736ddf393 100644
--- a/charts/cluster/prometheus_rules/cluster-ha-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-ha-warning.yaml
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
index ac83376ab..e5de95225 100644
--- a/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_connection-critical.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance maximum number of connections critical!
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsCritical.md
 expr: |
@@ -12,4 +12,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
index 126abd863..ae706ee0b 100644
--- a/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_connection-warning.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is approaching the maximum number of connections.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" instance {{ .labels.pod }} is using {{ .value }}% of
     the maximum number of connections.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterHighConnectionsWarning.md
 expr: |
@@ -12,4 +12,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
index 4cf1610d2..ab1c175a1 100644
--- a/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
+++ b/charts/cluster/prometheus_rules/cluster-high_replication_lag.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster high replication lag
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is experiencing a high replication lag of
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is experiencing a high replication lag of
     {{ .value }}ms.
 
     High replication lag indicates network issues, busy instances, slow queries or suboptimal configuration.
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
index 39900cf20..b5a90742e 100644
--- a/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
+++ b/charts/cluster/prometheus_rules/cluster-instances_on_same_node.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster instances are located on the same node.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" has {{ .value }}
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has {{ .value }}
     instances on the same node {{ .labels.node }}.
 
     A failure or scheduled downtime of a single node will lead to a potential service disruption and/or data loss.
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml
index fcacab9be..c211bc61a 100644
--- a/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml
+++ b/charts/cluster/prometheus_rules/cluster-low_disk_space-critical.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is running out of disk space!
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running extremely low on disk space. Check attached PVCs!
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceCritical.md
 expr: |
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.9 OR
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml
index 7f36f4351..5ed3653aa 100644
--- a/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-low_disk_space-warning.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Instance is running out of disk space.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" is running low on disk space. Check attached PVCs.
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" is running low on disk space. Check attached PVCs.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterLowDiskSpaceWarning.md
 expr: |
   max(max by(persistentvolumeclaim) (1 - kubelet_volume_stats_available_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"} / kubelet_volume_stats_capacity_bytes{namespace="{{ .namespace }}", persistentvolumeclaim=~"{{ .podSelector }}"})) > 0.7 OR
@@ -19,4 +19,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-offline.yaml b/charts/cluster/prometheus_rules/cluster-offline.yaml
index 75647f7cc..4ac68ce35 100644
--- a/charts/cluster/prometheus_rules/cluster-offline.yaml
+++ b/charts/cluster/prometheus_rules/cluster-offline.yaml
@@ -14,4 +14,6 @@ expr: |
 for: 5m
 labels:
   severity: critical
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
index 17183986e..0959ae87b 100644
--- a/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
+++ b/charts/cluster/prometheus_rules/cluster-zone_spread-warning.yaml
@@ -4,7 +4,7 @@ alert: {{ $alert }}
 annotations:
   summary: CNPG Cluster instances in the same zone.
   description: |-
-    CloudNativePG Cluster "{{ .cluster }}" has instances in the same availability zone.
+    CloudNativePG Cluster "{{ .namespace }}/{{ .cluster }}" has instances in the same availability zone.
 
     A disaster in one availability zone will lead to a potential service disruption and/or data loss.
   runbook_url: https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/runbooks/CNPGClusterZoneSpreadWarning.md
@@ -13,4 +13,6 @@ expr: |
 for: 5m
 labels:
   severity: warning
+  namespace: {{ .namespace }}
+  cnpg_cluster: {{ .cluster }}
 {{- end -}}
diff --git a/charts/cluster/templates/prometheus-rule.yaml b/charts/cluster/templates/prometheus-rule.yaml
index 1836f51d9..9b29bc692 100644
--- a/charts/cluster/templates/prometheus-rule.yaml
+++ b/charts/cluster/templates/prometheus-rule.yaml
@@ -15,12 +15,15 @@ spec:
         {{- $dict := dict "excludeRules" .Values.cluster.monitoring.prometheusRule.excludeRules -}}
         {{- $_ := set $dict "value" "{{ $value }}" -}}
         {{- $_ := set $dict "namespace" .Release.Namespace -}}
-        {{- $_ := set $dict "cluster" (printf "%s/%s" .Release.Namespace (include "cluster.fullname" .)) -}}
+        {{- $_ := set $dict "cluster" (include "cluster.fullname" .) -}}
         {{- $_ := set $dict "labels" (dict "job" "{{ $labels.job }}" "node" "{{ $labels.node }}" "pod" "{{ $labels.pod }}") -}}
         {{- $_ := set $dict "podSelector" (printf "%s-([1-9][0-9]*)$" (include "cluster.fullname" .)) -}}
         {{- $_ := set $dict "Values" .Values -}}
         {{- $_ := set $dict "Template" .Template -}}
         {{- range $path, $_ := .Files.Glob "prometheus_rules/**.yaml" }}
-        - {{ tpl ($.Files.Get $path) $dict | nindent 10 | trim -}}
+        {{- $tpl := tpl ($.Files.Get $path) $dict | nindent 10 | trim -}}
+        {{- with $tpl }}
+        - {{ $tpl }}
+        {{- end -}}
         {{- end -}}
 {{ end }}
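Note: the net effect of this patch is that every alert rendered by the cluster chart carries its origin as queryable labels. The "cluster" value handed to the rule templates now holds only the release fullname; since the rule descriptions interpolate {{ .namespace }}/{{ .cluster }} explicitly, the old printf would have doubled the namespace. The new {{- with $tpl }} guard also skips rule files that excludeRules reduced to an empty string, instead of emitting an empty list item. As a rough sketch (assuming a hypothetical release named "pg-main" deployed in namespace "databases"), a rendered rule would carry labels along these lines:

    labels:
      severity: critical
      namespace: databases
      cnpg_cluster: pg-main

which makes per-cluster routing in Alertmanager straightforward, for example with a hypothetical route such as:

    route:
      routes:
        - matchers:
            - cnpg_cluster="pg-main"
            - namespace="databases"
          receiver: pg-main-oncall

The receiver name here is illustrative; it would need a matching entry under receivers in the Alertmanager configuration.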
From b7e9e3e37216c4eb5a354d938774bcce1bdb2bba Mon Sep 17 00:00:00 2001
From: Rajeesh C V
Date: Wed, 3 Apr 2024 15:17:57 +0530
Subject: [PATCH 2/2] feat(cloudnative-pg): Support for additional labels in PodMonitor (#196)

Signed-off-by: Rajeesh
Signed-off-by: Itay Grudev
Co-authored-by: Itay Grudev
---
 charts/cloudnative-pg/README.md | 1 +
 charts/cloudnative-pg/templates/podmonitor.yaml | 5 ++++-
 charts/cloudnative-pg/values.schema.json | 3 +++
 charts/cloudnative-pg/values.yaml | 4 ++++
 4 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/charts/cloudnative-pg/README.md b/charts/cloudnative-pg/README.md
index a58caadb3..bc02f7165 100644
--- a/charts/cloudnative-pg/README.md
+++ b/charts/cloudnative-pg/README.md
@@ -49,6 +49,7 @@ CloudNativePG Operator Helm Chart
 | monitoring.grafanaDashboard.sidecarLabel | string | `"grafana_dashboard"` | Label that ConfigMaps should have to be loaded as dashboards. DEPRECATED: Use labels instead. |
 | monitoring.grafanaDashboard.sidecarLabelValue | string | `"1"` | Label value that ConfigMaps should have to be loaded as dashboards. DEPRECATED: Use labels instead. |
 | monitoring.podMonitorEnabled | bool | `false` | Specifies whether the monitoring should be enabled. Requires Prometheus Operator CRDs. |
+| monitoring.podMonitorAdditionalLabels | object | `{}` | Additional labels for the podMonitor |
 | monitoringQueriesConfigMap.name | string | `"cnpg-default-monitoring"` | The name of the default monitoring configmap. |
 | monitoringQueriesConfigMap.queries | string | `"backends:\n query: |\n SELECT sa.datname\n , sa.usename\n , sa.application_name\n , states.state\n , COALESCE(sa.count, 0) AS total\n , COALESCE(sa.max_tx_secs, 0) AS max_tx_duration_seconds\n FROM ( VALUES ('active')\n , ('idle')\n , ('idle in transaction')\n , ('idle in transaction (aborted)')\n , ('fastpath function call')\n , ('disabled')\n ) AS states(state)\n LEFT JOIN (\n SELECT datname\n , state\n , usename\n , COALESCE(application_name, '') AS application_name\n , COUNT(*)\n , COALESCE(EXTRACT (EPOCH FROM (max(now() - xact_start))), 0) AS max_tx_secs\n FROM pg_catalog.pg_stat_activity\n GROUP BY datname, state, usename, application_name\n ) sa ON states.state = sa.state\n WHERE sa.usename IS NOT NULL\n metrics:\n - datname:\n usage: \"LABEL\"\n description: \"Name of the database\"\n - usename:\n usage: \"LABEL\"\n description: \"Name of the user\"\n - application_name:\n usage: \"LABEL\"\n description: \"Name of the application\"\n - state:\n usage: \"LABEL\"\n description: \"State of the backend\"\n - total:\n usage: \"GAUGE\"\n description: \"Number of backends\"\n - max_tx_duration_seconds:\n usage: \"GAUGE\"\n description: \"Maximum duration of a transaction in seconds\"\n\nbackends_waiting:\n query: |\n SELECT count(*) AS total\n FROM pg_catalog.pg_locks blocked_locks\n JOIN pg_catalog.pg_locks blocking_locks\n ON blocking_locks.locktype = blocked_locks.locktype\n AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database\n AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation\n AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page\n AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple\n AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid\n AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid\n AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid\n AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid\n AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid\n AND blocking_locks.pid != blocked_locks.pid\n JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid\n WHERE NOT blocked_locks.granted\n metrics:\n - total:\n usage: \"GAUGE\"\n description: \"Total number of backends that are currently waiting on other queries\"\n\npg_database:\n query: |\n SELECT datname\n , pg_catalog.pg_database_size(datname) AS size_bytes\n , pg_catalog.age(datfrozenxid) AS xid_age\n , pg_catalog.mxid_age(datminmxid) AS mxid_age\n FROM pg_catalog.pg_database\n metrics:\n - datname:\n usage: \"LABEL\"\n description: \"Name of the database\"\n - size_bytes:\n usage: \"GAUGE\"\n description: \"Disk space used by the database\"\n - xid_age:\n usage: \"GAUGE\"\n description: \"Number of transactions from the frozen XID to the current one\"\n - mxid_age:\n usage: \"GAUGE\"\n description: \"Number of multiple transactions (Multixact) from the frozen XID to the current one\"\n\npg_postmaster:\n query: |\n SELECT EXTRACT(EPOCH FROM pg_postmaster_start_time) AS start_time\n FROM pg_catalog.pg_postmaster_start_time()\n metrics:\n - start_time:\n usage: \"GAUGE\"\n description: \"Time at which postgres started (based on epoch)\"\n\npg_replication:\n query: \"SELECT CASE WHEN (\n NOT pg_catalog.pg_is_in_recovery()\n OR pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn())\n THEN 0\n ELSE GREATEST (0,\n EXTRACT(EPOCH FROM (now() - pg_catalog.pg_last_xact_replay_timestamp())))\n END AS lag,\n pg_catalog.pg_is_in_recovery() AS in_recovery,\n EXISTS (TABLE pg_stat_wal_receiver) AS is_wal_receiver_up,\n (SELECT count(*) FROM pg_catalog.pg_stat_replication) AS streaming_replicas\"\n metrics:\n - lag:\n usage: \"GAUGE\"\n description: \"Replication lag behind primary in seconds\"\n - in_recovery:\n usage: \"GAUGE\"\n description: \"Whether the instance is in recovery\"\n - is_wal_receiver_up:\n usage: \"GAUGE\"\n description: \"Whether the instance wal_receiver is up\"\n - streaming_replicas:\n usage: \"GAUGE\"\n description: \"Number of streaming replicas connected to the instance\"\n\npg_replication_slots:\n query: |\n SELECT slot_name,\n slot_type,\n database,\n active,\n (CASE pg_catalog.pg_is_in_recovery()\n WHEN TRUE THEN pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), restart_lsn)\n ELSE pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), restart_lsn)\n END) as pg_wal_lsn_diff\n FROM pg_catalog.pg_replication_slots\n WHERE NOT temporary\n metrics:\n - slot_name:\n usage: \"LABEL\"\n description: \"Name of the replication slot\"\n - slot_type:\n usage: \"LABEL\"\n description: \"Type of the replication slot\"\n - database:\n usage: \"LABEL\"\n description: \"Name of the database\"\n - active:\n usage: \"GAUGE\"\n description: \"Flag indicating whether the slot is active\"\n - pg_wal_lsn_diff:\n usage: \"GAUGE\"\n description: \"Replication lag in bytes\"\n\npg_stat_archiver:\n query: |\n SELECT archived_count\n , failed_count\n , COALESCE(EXTRACT(EPOCH FROM (now() - last_archived_time)), -1) AS seconds_since_last_archival\n , COALESCE(EXTRACT(EPOCH FROM (now() - last_failed_time)), -1) AS seconds_since_last_failure\n , COALESCE(EXTRACT(EPOCH FROM last_archived_time), -1) AS last_archived_time\n , COALESCE(EXTRACT(EPOCH FROM last_failed_time), -1) AS last_failed_time\n , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_archived_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_archived_wal_start_lsn\n , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_failed_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_failed_wal_start_lsn\n , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time\n FROM pg_catalog.pg_stat_archiver\n metrics:\n - archived_count:\n usage: \"COUNTER\"\n description: \"Number of WAL files that have been successfully archived\"\n - failed_count:\n usage: \"COUNTER\"\n description: \"Number of failed attempts for archiving WAL files\"\n - seconds_since_last_archival:\n usage: \"GAUGE\"\n description: \"Seconds since the last successful archival operation\"\n - seconds_since_last_failure:\n usage: \"GAUGE\"\n description: \"Seconds since the last failed archival operation\"\n - last_archived_time:\n usage: \"GAUGE\"\n description: \"Epoch of the last time WAL archiving succeeded\"\n - last_failed_time:\n usage: \"GAUGE\"\n description: \"Epoch of the last time WAL archiving failed\"\n - last_archived_wal_start_lsn:\n usage: \"GAUGE\"\n description: \"Archived WAL start LSN\"\n - last_failed_wal_start_lsn:\n usage: \"GAUGE\"\n description: \"Last failed WAL LSN\"\n - stats_reset_time:\n usage: \"GAUGE\"\n description: \"Time at which these statistics were last reset\"\n\npg_stat_bgwriter:\n query: |\n SELECT checkpoints_timed\n , checkpoints_req\n , checkpoint_write_time\n , checkpoint_sync_time\n , buffers_checkpoint\n , buffers_clean\n , maxwritten_clean\n , buffers_backend\n , buffers_backend_fsync\n , buffers_alloc\n FROM pg_catalog.pg_stat_bgwriter\n metrics:\n - checkpoints_timed:\n usage: \"COUNTER\"\n description: \"Number of scheduled checkpoints that have been performed\"\n - checkpoints_req:\n usage: \"COUNTER\"\n description: \"Number of requested checkpoints that have been performed\"\n - checkpoint_write_time:\n usage: \"COUNTER\"\n description: \"Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds\"\n - checkpoint_sync_time:\n usage: \"COUNTER\"\n description: \"Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds\"\n - buffers_checkpoint:\n usage: \"COUNTER\"\n description: \"Number of buffers written during checkpoints\"\n - buffers_clean:\n usage: \"COUNTER\"\n description: \"Number of buffers written by the background writer\"\n - maxwritten_clean:\n usage: \"COUNTER\"\n description: \"Number of times the background writer stopped a cleaning scan because it had written too many buffers\"\n - buffers_backend:\n usage: \"COUNTER\"\n description: \"Number of buffers written directly by a backend\"\n - buffers_backend_fsync:\n usage: \"COUNTER\"\n description: \"Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)\"\n - buffers_alloc:\n usage: \"COUNTER\"\n description: \"Number of buffers allocated\"\n\npg_stat_database:\n query: |\n SELECT datname\n , xact_commit\n , xact_rollback\n , blks_read\n , blks_hit\n , tup_returned\n , tup_fetched\n , tup_inserted\n , tup_updated\n , tup_deleted\n , conflicts\n , temp_files\n , temp_bytes\n , deadlocks\n , blk_read_time\n , blk_write_time\n FROM pg_catalog.pg_stat_database\n metrics:\n - datname:\n usage: \"LABEL\"\n description: \"Name of this database\"\n - xact_commit:\n usage: \"COUNTER\"\n description: \"Number of transactions in this database that have been committed\"\n - xact_rollback:\n usage: \"COUNTER\"\n description: \"Number of transactions in this database that have been rolled back\"\n - blks_read:\n usage: \"COUNTER\"\n description: \"Number of disk blocks read in this database\"\n - blks_hit:\n usage: \"COUNTER\"\n description: \"Number of times disk blocks were found already in the buffer cache, so that a read was not necessary (this only includes hits in the PostgreSQL buffer cache, not the operating system's file system cache)\"\n - tup_returned:\n usage: \"COUNTER\"\n description: \"Number of rows returned by queries in this database\"\n - tup_fetched:\n usage: \"COUNTER\"\n description: \"Number of rows fetched by queries in this database\"\n - tup_inserted:\n usage: \"COUNTER\"\n description: \"Number of rows inserted by queries in this database\"\n - tup_updated:\n usage: \"COUNTER\"\n description: \"Number of rows updated by queries in this database\"\n - tup_deleted:\n usage: \"COUNTER\"\n description: \"Number of rows deleted by queries in this database\"\n - conflicts:\n usage: \"COUNTER\"\n description: \"Number of queries canceled due to conflicts with recovery in this database\"\n - temp_files:\n usage: \"COUNTER\"\n description: \"Number of temporary files created by queries in this database\"\n - temp_bytes:\n usage: \"COUNTER\"\n description: \"Total amount of data written to temporary files by queries in this database\"\n - deadlocks:\n usage: \"COUNTER\"\n description: \"Number of deadlocks detected in this database\"\n - blk_read_time:\n usage: \"COUNTER\"\n description: \"Time spent reading data file blocks by backends in this database, in milliseconds\"\n - blk_write_time:\n usage: \"COUNTER\"\n description: \"Time spent writing data file blocks by backends in this database, in milliseconds\"\n\npg_stat_replication:\n primary: true\n query: |\n SELECT usename\n , COALESCE(application_name, '') AS application_name\n , COALESCE(client_addr::text, '') AS client_addr\n , COALESCE(client_port::text, '') AS client_port\n , EXTRACT(EPOCH FROM backend_start) AS backend_start\n , COALESCE(pg_catalog.age(backend_xmin), 0) AS backend_xmin_age\n , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), sent_lsn) AS sent_diff_bytes\n , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), write_lsn) AS write_diff_bytes\n , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), flush_lsn) AS flush_diff_bytes\n , COALESCE(pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), replay_lsn),0) AS replay_diff_bytes\n , COALESCE((EXTRACT(EPOCH FROM write_lag)),0)::float AS write_lag_seconds\n , COALESCE((EXTRACT(EPOCH FROM flush_lag)),0)::float AS flush_lag_seconds\n , COALESCE((EXTRACT(EPOCH FROM replay_lag)),0)::float AS replay_lag_seconds\n FROM pg_catalog.pg_stat_replication\n metrics:\n - usename:\n usage: \"LABEL\"\n description: \"Name of the replication user\"\n - application_name:\n usage: \"LABEL\"\n description: \"Name of the application\"\n - client_addr:\n usage: \"LABEL\"\n description: \"Client IP address\"\n - client_port:\n usage: \"LABEL\"\n description: \"Client TCP port\"\n - backend_start:\n usage: \"COUNTER\"\n description: \"Time when this process was started\"\n - backend_xmin_age:\n usage: \"COUNTER\"\n description: \"The age of this standby's xmin horizon\"\n - sent_diff_bytes:\n usage: \"GAUGE\"\n description: \"Difference in bytes from the last write-ahead log location sent on this connection\"\n - write_diff_bytes:\n usage: \"GAUGE\"\n description: \"Difference in bytes from the last write-ahead log location written to disk by this standby server\"\n - flush_diff_bytes:\n usage: \"GAUGE\"\n description: \"Difference in bytes from the last write-ahead log location flushed to disk by this standby server\"\n - replay_diff_bytes:\n usage: \"GAUGE\"\n description: \"Difference in bytes from the last write-ahead log location replayed into the database on this standby server\"\n - write_lag_seconds:\n usage: \"GAUGE\"\n description: \"Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it\"\n - flush_lag_seconds:\n usage: \"GAUGE\"\n description: \"Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it\"\n - replay_lag_seconds:\n usage: \"GAUGE\"\n description: \"Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it\"\n\npg_settings:\n query: |\n SELECT name,\n CASE setting WHEN 'on' THEN '1' WHEN 'off' THEN '0' ELSE setting END AS setting\n FROM pg_catalog.pg_settings\n WHERE vartype IN ('integer', 'real', 'bool')\n ORDER BY 1\n metrics:\n - name:\n usage: \"LABEL\"\n description: \"Name of the setting\"\n - setting:\n usage: \"GAUGE\"\n description: \"Setting value\"\n"` | A string representation of a YAML defining monitoring queries. |
 | nameOverride | string | `""` | |
diff --git a/charts/cloudnative-pg/templates/podmonitor.yaml b/charts/cloudnative-pg/templates/podmonitor.yaml
index 89789127d..bae86ca8d 100644
--- a/charts/cloudnative-pg/templates/podmonitor.yaml
+++ b/charts/cloudnative-pg/templates/podmonitor.yaml
@@ -5,6 +5,9 @@ metadata:
   name: {{ include "cloudnative-pg.fullname" . }}
   labels:
     {{- include "cloudnative-pg.labels" . | nindent 4 }}
+    {{- with .Values.monitoring.podMonitorAdditionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end}}
   {{- with .Values.commonAnnotations }}
   annotations:
     {{- toYaml . | nindent 4 }}
@@ -15,4 +18,4 @@ spec:
       {{- include "cloudnative-pg.selectorLabels" . | nindent 6 }}
   podMetricsEndpoints:
     - port: metrics
-{{- end }}
\ No newline at end of file
+{{- end }}
diff --git a/charts/cloudnative-pg/values.schema.json b/charts/cloudnative-pg/values.schema.json
index 9d380c6ea..efce45658 100644
--- a/charts/cloudnative-pg/values.schema.json
+++ b/charts/cloudnative-pg/values.schema.json
@@ -123,6 +123,9 @@
         },
         "podMonitorEnabled": {
           "type": "boolean"
+        },
+        "podMonitorAdditionalLabels": {
+          "type": "object"
         }
       }
     },
diff --git a/charts/cloudnative-pg/values.yaml b/charts/cloudnative-pg/values.yaml
index 956f0a56e..17314ed7a 100644
--- a/charts/cloudnative-pg/values.yaml
+++ b/charts/cloudnative-pg/values.yaml
@@ -137,8 +137,12 @@ tolerations: []
 affinity: {}
 
 monitoring:
+  # -- Specifies whether the monitoring should be enabled. Requires Prometheus Operator CRDs.
   podMonitorEnabled: false
 
+  # -- Additional labels for the podMonitor
+  podMonitorAdditionalLabels: {}
+
   grafanaDashboard:
     create: false
     # -- Allows overriding the namespace where the ConfigMap will be created, defaulting to the same one as the Release.
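Note: a minimal usage sketch for the new value follows; the label key and value are hypothetical, since the right choice is whatever your Prometheus Operator's podMonitorSelector expects:

    monitoring:
      podMonitorEnabled: true
      podMonitorAdditionalLabels:
        release: kube-prometheus-stack

Many Prometheus Operator installations only select PodMonitors that carry a specific label (commonly the Helm release label of the monitoring stack), so this value is the knob that lets the operator's own metrics be picked up. The map is merged verbatim into the PodMonitor's metadata.labels by the toYaml block in podmonitor.yaml above.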