diff --git a/charts/cloud-native-postgresql/Chart.yaml b/charts/cloud-native-postgresql/Chart.yaml index 7d1c0f3..ec99cd5 100644 --- a/charts/cloud-native-postgresql/Chart.yaml +++ b/charts/cloud-native-postgresql/Chart.yaml @@ -18,13 +18,13 @@ description: Cloud Native Postgresql Helm Chart icon: https://www.enterprisedb.com/themes/custom/edb_bootstrap_sass/edb-logo-disc-dark-2.svg type: application -version: "0.8.0" +version: "0.9.0" # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning, they should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.10.0" +appVersion: "1.11.0" sources: - https://github.com/EnterpriseDB/cloud-native-postgresql-helm diff --git a/charts/cloud-native-postgresql/README.md b/charts/cloud-native-postgresql/README.md index b94ecdc..de0411b 100644 --- a/charts/cloud-native-postgresql/README.md +++ b/charts/cloud-native-postgresql/README.md @@ -1,6 +1,6 @@ # cloud-native-postgresql -![Version: 0.8.0](https://img.shields.io/badge/Version-0.8.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.10.0](https://img.shields.io/badge/AppVersion-1.10.0-informational?style=flat-square) +![Version: 0.9.0](https://img.shields.io/badge/Version-0.9.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.11.0](https://img.shields.io/badge/AppVersion-1.11.0-informational?style=flat-square) Cloud Native Postgresql Helm Chart @@ -32,6 +32,8 @@ Cloud Native Postgresql Helm Chart | image.repository | string | `"quay.io/enterprisedb/cloud-native-postgresql"` | | | image.tag | string | `""` | Overrides the image tag whose default is the chart appVersion. | | imagePullSecrets | list | `[]` | | +| monitoringQueriesConfigMap.name | string | `"postgresql-operator-default-monitoring"` | The name of the default monitoring configmap | +| monitoringQueriesConfigMap.queries | string | `"backends:\n query: |\n SELECT sa.datname\n , sa.usename\n , sa.application_name\n , states.state\n , COALESCE(sa.count, 0) AS total\n , COALESCE(sa.max_tx_secs, 0) AS max_tx_duration_seconds\n FROM ( VALUES ('active')\n , ('idle')\n , ('idle in transaction')\n , ('idle in transaction (aborted)')\n , ('fastpath function call')\n , ('disabled')\n ) AS states(state)\n LEFT JOIN (\n SELECT datname\n , state\n , usename\n , COALESCE(application_name, '') AS application_name\n , COUNT(*)\n , COALESCE(EXTRACT (EPOCH FROM (max(now() - xact_start))), 0) AS max_tx_secs\n FROM pg_catalog.pg_stat_activity\n GROUP BY datname, state, usename, application_name\n ) sa ON states.state = sa.state\n WHERE sa.usename IS NOT NULL\n metrics:\n - datname:\n usage: \"LABEL\"\n description: \"Name of the database\"\n - usename:\n usage: \"LABEL\"\n description: \"Name of the user\"\n - application_name:\n usage: \"LABEL\"\n description: \"Name of the application\"\n - state:\n usage: \"LABEL\"\n description: \"State of the backend\"\n - total:\n usage: \"GAUGE\"\n description: \"Number of backends\"\n - max_tx_duration_seconds:\n usage: \"GAUGE\"\n description: \"Maximum duration of a transaction in seconds\"\n\nbackends_waiting:\n query: |\n SELECT count(*) AS total\n FROM pg_catalog.pg_locks blocked_locks\n JOIN pg_catalog.pg_locks blocking_locks\n ON blocking_locks.locktype = blocked_locks.locktype\n AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database\n AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation\n AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page\n AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple\n AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid\n AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid\n AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid\n AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid\n AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid\n AND blocking_locks.pid != blocked_locks.pid\n JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid\n WHERE NOT blocked_locks.granted\n metrics:\n - total:\n usage: \"GAUGE\"\n description: \"Total number of backends that are currently waiting on other queries\"\n\npg_database:\n query: |\n SELECT datname\n , pg_catalog.pg_database_size(datname) AS size_bytes\n , pg_catalog.age(datfrozenxid) AS xid_age\n , pg_catalog.mxid_age(datminmxid) AS mxid_age\n FROM pg_catalog.pg_database\n metrics:\n - datname:\n usage: \"LABEL\"\n description: \"Name of the database\"\n - size_bytes:\n usage: \"GAUGE\"\n description: \"Disk space used by the database\"\n - xid_age:\n usage: \"GAUGE\"\n description: \"Number of transactions from the frozen XID to the current one\"\n - mxid_age:\n usage: \"GAUGE\"\n description: \"Number of multiple transactions (Multixact) from the frozen XID to the current one\"\n\npg_postmaster:\n query: |\n SELECT EXTRACT(EPOCH FROM pg_postmaster_start_time) AS start_time\n FROM pg_catalog.pg_postmaster_start_time()\n metrics:\n - start_time:\n usage: \"GAUGE\"\n description: \"Time at which postgres started (based on epoch)\"\n\npg_replication:\n query: \"SELECT CASE WHEN NOT pg_catalog.pg_is_in_recovery()\n THEN 0\n ELSE GREATEST (0,\n EXTRACT(EPOCH FROM (now() - pg_catalog.pg_last_xact_replay_timestamp())))\n END AS lag,\n pg_catalog.pg_is_in_recovery() AS in_recovery,\n EXISTS (TABLE pg_stat_wal_receiver) AS is_wal_receiver_up,\n (SELECT count(*) FROM pg_stat_replication) AS streaming_replicas\"\n metrics:\n - lag:\n usage: \"GAUGE\"\n description: \"Replication lag behind primary in seconds\"\n - in_recovery:\n usage: \"GAUGE\"\n description: \"Whether the instance is in recovery\"\n - is_wal_receiver_up:\n usage: \"GAUGE\"\n description: \"Whether the instance wal_receiver is up\"\n - streaming_replicas:\n usage: \"GAUGE\"\n description: \"Number of streaming replicas connected to the instance\"\n\npg_stat_archiver:\n query: |\n SELECT archived_count\n , failed_count\n , COALESCE(EXTRACT(EPOCH FROM (now() - last_archived_time)), -1) AS seconds_since_last_archival\n , COALESCE(EXTRACT(EPOCH FROM (now() - last_failed_time)), -1) AS seconds_since_last_failure\n , COALESCE(EXTRACT(EPOCH FROM last_archived_time), -1) AS last_archived_time\n , COALESCE(EXTRACT(EPOCH FROM last_failed_time), -1) AS last_failed_time\n , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_archived_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_archived_wal_start_lsn\n , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_failed_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_failed_wal_start_lsn\n , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time\n FROM pg_catalog.pg_stat_archiver\n metrics:\n - archived_count:\n usage: \"COUNTER\"\n description: \"Number of WAL files that have been successfully archived\"\n - failed_count:\n usage: \"COUNTER\"\n description: \"Number of failed attempts for archiving WAL files\"\n - seconds_since_last_archival:\n usage: \"GAUGE\"\n description: \"Seconds since the last successful archival operation\"\n - seconds_since_last_failure:\n usage: \"GAUGE\"\n description: \"Seconds since the last failed archival operation\"\n - last_archived_time:\n usage: \"GAUGE\"\n description: \"Epoch of the last time WAL archiving succeeded\"\n - last_failed_time:\n usage: \"GAUGE\"\n description: \"Epoch of the last time WAL archiving failed\"\n - last_archived_wal_start_lsn:\n usage: \"GAUGE\"\n description: \"Archived WAL start LSN\"\n - last_failed_wal_start_lsn:\n usage: \"GAUGE\"\n description: \"Last failed WAL LSN\"\n - stats_reset_time:\n usage: \"GAUGE\"\n description: \"Time at which these statistics were last reset\"\n\npg_stat_bgwriter:\n query: |\n SELECT checkpoints_timed\n , checkpoints_req\n , checkpoint_write_time\n , checkpoint_sync_time\n , buffers_checkpoint\n , buffers_clean\n , maxwritten_clean\n , buffers_backend\n , buffers_backend_fsync\n , buffers_alloc\n FROM pg_catalog.pg_stat_bgwriter\n metrics:\n - checkpoints_timed:\n usage: \"COUNTER\"\n description: \"Number of scheduled checkpoints that have been performed\"\n - checkpoints_req:\n usage: \"COUNTER\"\n description: \"Number of requested checkpoints that have been performed\"\n - checkpoint_write_time:\n usage: \"COUNTER\"\n description: \"Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds\"\n - checkpoint_sync_time:\n usage: \"COUNTER\"\n description: \"Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds\"\n - buffers_checkpoint:\n usage: \"COUNTER\"\n description: \"Number of buffers written during checkpoints\"\n - buffers_clean:\n usage: \"COUNTER\"\n description: \"Number of buffers written by the background writer\"\n - maxwritten_clean:\n usage: \"COUNTER\"\n description: \"Number of times the background writer stopped a cleaning scan because it had written too many buffers\"\n - buffers_backend:\n usage: \"COUNTER\"\n description: \"Number of buffers written directly by a backend\"\n - buffers_backend_fsync:\n usage: \"COUNTER\"\n description: \"Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)\"\n - buffers_alloc:\n usage: \"COUNTER\"\n description: \"Number of buffers allocated\"\n\npg_stat_database:\n query: |\n SELECT datname\n , xact_commit\n , xact_rollback\n , blks_read\n , blks_hit\n , tup_returned\n , tup_fetched\n , tup_inserted\n , tup_updated\n , tup_deleted\n , conflicts\n , temp_files\n , temp_bytes\n , deadlocks\n , blk_read_time\n , blk_write_time\n FROM pg_catalog.pg_stat_database\n metrics:\n - datname:\n usage: \"LABEL\"\n description: \"Name of this database\"\n - xact_commit:\n usage: \"COUNTER\"\n description: \"Number of transactions in this database that have been committed\"\n - xact_rollback:\n usage: \"COUNTER\"\n description: \"Number of transactions in this database that have been rolled back\"\n - blks_read:\n usage: \"COUNTER\"\n description: \"Number of disk blocks read in this database\"\n - blks_hit:\n usage: \"COUNTER\"\n description: \"Number of times disk blocks were found already in the buffer cache, so that a read was not necessary (this only includes hits in the PostgreSQL buffer cache, not the operating system's file system cache)\"\n - tup_returned:\n usage: \"COUNTER\"\n description: \"Number of rows returned by queries in this database\"\n - tup_fetched:\n usage: \"COUNTER\"\n description: \"Number of rows fetched by queries in this database\"\n - tup_inserted:\n usage: \"COUNTER\"\n description: \"Number of rows inserted by queries in this database\"\n - tup_updated:\n usage: \"COUNTER\"\n description: \"Number of rows updated by queries in this database\"\n - tup_deleted:\n usage: \"COUNTER\"\n description: \"Number of rows deleted by queries in this database\"\n - conflicts:\n usage: \"COUNTER\"\n description: \"Number of queries canceled due to conflicts with recovery in this database\"\n - temp_files:\n usage: \"COUNTER\"\n description: \"Number of temporary files created by queries in this database\"\n - temp_bytes:\n usage: \"COUNTER\"\n description: \"Total amount of data written to temporary files by queries in this database\"\n - deadlocks:\n usage: \"COUNTER\"\n description: \"Number of deadlocks detected in this database\"\n - blk_read_time:\n usage: \"COUNTER\"\n description: \"Time spent reading data file blocks by backends in this database, in milliseconds\"\n - blk_write_time:\n usage: \"COUNTER\"\n description: \"Time spent writing data file blocks by backends in this database, in milliseconds\"\n\npg_stat_replication:\n query: |\n SELECT usename\n , COALESCE(application_name, '') AS application_name\n , COALESCE(client_addr::text, '') AS client_addr\n , EXTRACT(EPOCH FROM backend_start) AS backend_start\n , COALESCE(pg_catalog.age(backend_xmin), 0) AS backend_xmin_age\n , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), sent_lsn) AS sent_diff_bytes\n , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), write_lsn) AS write_diff_bytes\n , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), flush_lsn) AS flush_diff_bytes\n , COALESCE(pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), replay_lsn),0) AS replay_diff_bytes\n , COALESCE((EXTRACT(EPOCH FROM write_lag)),0)::float AS write_lag_seconds\n , COALESCE((EXTRACT(EPOCH FROM flush_lag)),0)::float AS flush_lag_seconds\n , COALESCE((EXTRACT(EPOCH FROM replay_lag)),0)::float AS replay_lag_seconds\n FROM pg_catalog.pg_stat_replication\n metrics:\n - usename:\n usage: \"LABEL\"\n description: \"Name of the replication user\"\n - application_name:\n usage: \"LABEL\"\n description: \"Name of the application\"\n - client_addr:\n usage: \"LABEL\"\n description: \"Client IP address\"\n - backend_start:\n usage: \"COUNTER\"\n description: \"Time when this process was started\"\n - backend_xmin_age:\n usage: \"COUNTER\"\n description: \"The age of this standby's xmin horizon\"\n - sent_diff_bytes:\n usage: \"GAUGE\"\n description: \"Difference in bytes from the last write-ahead log location sent on this connection\"\n - write_diff_bytes:\n usage: \"GAUGE\"\n description: \"Difference in bytes from the last write-ahead log location written to disk by this standby server\"\n - flush_diff_bytes:\n usage: \"GAUGE\"\n description: \"Difference in bytes from the last write-ahead log location flushed to disk by this standby server\"\n - replay_diff_bytes:\n usage: \"GAUGE\"\n description: \"Difference in bytes from the last write-ahead log location replayed into the database on this standby server\"\n - write_lag_seconds:\n usage: \"GAUGE\"\n description: \"Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it\"\n - flush_lag_seconds:\n usage: \"GAUGE\"\n description: \"Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it\"\n - replay_lag_seconds:\n usage: \"GAUGE\"\n description: \"Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it\"\n\npg_settings:\n query: |\n SELECT name,\n CASE setting WHEN 'on' THEN '1' WHEN 'off' THEN '0' ELSE setting END AS setting\n FROM pg_catalog.pg_settings\n WHERE vartype IN ('integer', 'real', 'bool')\n ORDER BY 1\n metrics:\n - name:\n usage: \"LABEL\"\n description: \"Name of the setting\"\n - setting:\n usage: \"GAUGE\"\n description: \"Setting value\"\n"` | A string representation of a YAML defining monitoring queries | | nameOverride | string | `""` | | | nodeSelector | object | `{}` | Nodeselector for the operator to be installed | | podAnnotations | object | `{}` | Annotations to be added to the pod | @@ -51,5 +53,3 @@ Cloud Native Postgresql Helm Chart | webhook.validating.create | bool | `true` | | | webhook.validating.failurePolicy | string | `"Fail"` | | ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.5.0](https://github.com/norwoodj/helm-docs/releases/v1.5.0) diff --git a/charts/cloud-native-postgresql/templates/crds/crds.yaml b/charts/cloud-native-postgresql/templates/crds/crds.yaml index b32bb5a..8ce499d 100644 --- a/charts/cloud-native-postgresql/templates/crds/crds.yaml +++ b/charts/cloud-native-postgresql/templates/crds/crds.yaml @@ -1168,6 +1168,16 @@ spec: - AES256 - aws:kms type: string + maxParallel: + description: Number of WAL files to be either archived + in parallel (when the PostgreSQL instance is archiving + to a backup object store) or restored in parallel (when + a PostgreSQL standby is fetching WAL files from a recovery + object store). If not specified, WAL files will be processed + one at a time. It accepts a positive integer as a value + - with 1 being the minimum accepted value. + minimum: 1 + type: integer type: object required: - destinationPath @@ -1220,6 +1230,13 @@ spec: to be used by applications. Defaults to the value of the `database` key. type: string + postInitApplicationSQL: + description: List of SQL queries to be executed as a superuser + in the application database right after is created - to + be used with extreme care (by default empty) + items: + type: string + type: array postInitSQL: description: List of SQL queries to be executed as a superuser immediately after the cluster has been created - to be used @@ -1589,6 +1606,17 @@ spec: - AES256 - aws:kms type: string + maxParallel: + description: Number of WAL files to be either archived + in parallel (when the PostgreSQL instance is archiving + to a backup object store) or restored in parallel + (when a PostgreSQL standby is fetching WAL files from + a recovery object store). If not specified, WAL files + will be processed one at a time. It accepts a positive + integer as a value - with 1 being the minimum accepted + value. + minimum: 1 + type: integer type: object required: - destinationPath @@ -1741,9 +1769,10 @@ spec: logLevel: default: info description: 'The instances'' log level, one of the following values: - error, info (default), debug, trace' + error, warning, info (default), debug, trace' enum: - error + - warning - info - debug - trace @@ -1803,8 +1832,14 @@ spec: type: object type: array disableDefaultQueries: + default: false description: 'Whether the default queries should be injected. - Default: false.' + Set it to `true` if you don''t want to inject default queries + into the cluster. Default: false.' + type: boolean + enablePodMonitor: + default: false + description: Enable or disable the `PodMonitor` type: boolean type: object nodeMaintenanceWindow: @@ -1857,7 +1892,9 @@ spec: type: array promotionTimeout: description: Specifies the maximum number of seconds to wait when - promoting an instance to primary + promoting an instance to primary. Default value is 40000000, + greater than one year in seconds, big enough to simulate an + infinite timeout format: int32 type: integer shared_preload_libraries: @@ -1930,7 +1967,7 @@ spec: stopDelay: default: 30 description: The time in seconds that is allowed for a PostgreSQL - instance node to gracefully shutdown (default 30) + instance to gracefully shutdown (default 30) format: int32 type: integer storage: @@ -2125,6 +2162,14 @@ spec: required: - name type: object + switchoverDelay: + default: 40000000 + description: The time in seconds that is allowed for a primary PostgreSQL + instance to gracefully shutdown during a switchover. Default value + is 40000000, greater than one year in seconds, big enough to simulate + an infinite delay + format: int32 + type: integer required: - instances type: object @@ -2133,6 +2178,10 @@ spec: may not be up to date. Populated by the system. Read-only. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status' properties: + azurePVCUpdateEnabled: + description: AzurePVCUpdateEnabled shows if the PVC online upgrade + is enabled for this cluster + type: boolean certificates: description: The configuration for the CA and related certificates, initialized with defaults. @@ -2319,6 +2368,11 @@ spec: description: Total number of ready instances in the cluster format: int32 type: integer + resizingPVC: + description: List of all the PVCs that have ResizingPVC condition. + items: + type: string + type: array secretsResourceVersion: description: The list of resource versions of the secrets managed by the operator. Every change here is done in the interest of the @@ -2482,6 +2536,7 @@ spec: can configure type: object paused: + default: false description: When set to `true`, PgBouncer will disconnect from the PostgreSQL server, first waiting for all queries to complete, and pause all new client connections until this value is set @@ -2489,6 +2544,7 @@ spec: `PAUSE` and `RESUME` commands. type: boolean poolMode: + default: session description: The pool mode enum: - session @@ -9312,6 +9368,7 @@ spec: type: object type: object type: + default: rw description: Which instances we must forward traffic to? enum: - rw @@ -9326,6 +9383,10 @@ spec: status: description: PoolerStatus defines the observed state of Pooler properties: + instances: + description: The number of pods trying to be scheduled + format: int32 + type: integer secrets: description: The resource version of the config object properties: diff --git a/charts/cloud-native-postgresql/templates/deployment.yaml b/charts/cloud-native-postgresql/templates/deployment.yaml index 17a6a8a..1eb33b7 100644 --- a/charts/cloud-native-postgresql/templates/deployment.yaml +++ b/charts/cloud-native-postgresql/templates/deployment.yaml @@ -63,6 +63,8 @@ spec: valueFrom: fieldRef: fieldPath: metadata.namespace + - name: MONITORING_QUERIES_CONFIGMAP + value: "{{ .Values.monitoringQueriesConfigMap.name }}" image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} livenessProbe: diff --git a/charts/cloud-native-postgresql/templates/monitoring-configmap.yaml b/charts/cloud-native-postgresql/templates/monitoring-configmap.yaml new file mode 100644 index 0000000..9ad9bef --- /dev/null +++ b/charts/cloud-native-postgresql/templates/monitoring-configmap.yaml @@ -0,0 +1,28 @@ +# Copyright 2021 EnterpriseDB Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.monitoringQueriesConfigMap.name }} + labels: + {{- include "cloud-native-postgresql.labels" . | nindent 4 }} + k8s.enterprisedb.io/reload: "" + {{- with .Values.commonAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +data: + queries: {{- toYaml .Values.monitoringQueriesConfigMap.queries | nindent 4 }} diff --git a/charts/cloud-native-postgresql/templates/rbac.yaml b/charts/cloud-native-postgresql/templates/rbac.yaml index 819e740..36916d0 100644 --- a/charts/cloud-native-postgresql/templates/rbac.yaml +++ b/charts/cloud-native-postgresql/templates/rbac.yaml @@ -219,6 +219,17 @@ rules: - create - get - update +- apiGroups: + - monitoring.coreos.com + resources: + - podmonitors + verbs: + - create + - delete + - get + - list + - patch + - watch - apiGroups: - policy resources: diff --git a/charts/cloud-native-postgresql/values.schema.json b/charts/cloud-native-postgresql/values.schema.json index 46c6618..5321b99 100644 --- a/charts/cloud-native-postgresql/values.schema.json +++ b/charts/cloud-native-postgresql/values.schema.json @@ -48,6 +48,17 @@ "imagePullSecrets": { "type": "array" }, + "monitoringQueriesConfigMap": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "queries": { + "type": "string" + } + } + }, "nameOverride": { "type": "string" }, diff --git a/charts/cloud-native-postgresql/values.yaml b/charts/cloud-native-postgresql/values.yaml index 8ed9951..0929dca 100644 --- a/charts/cloud-native-postgresql/values.yaml +++ b/charts/cloud-native-postgresql/values.yaml @@ -43,10 +43,12 @@ config: create: true # -- Specifies whether it should be stored in a secret, instead of a configmap secret: false + # Examples: + # INHERITED_ANNOTATIONS: categories + # INHERITED_LABELS: environment, workload, app + # EDB_LICENSE_KEY: + data: {} - # INHERITED_ANNOTATIONS: categories - # INHERITED_LABELS: environment, workload, app - # EDB_LICENSE_KEY: name: postgresql-operator-controller-manager-config # -- Additinal arguments to be added to the operator's args list @@ -99,3 +101,354 @@ tolerations: [] # -- Affinity for the operator to be installed affinity: {} + +# Default monitoring queries +monitoringQueriesConfigMap: + # -- The name of the default monitoring configmap + name: postgresql-operator-default-monitoring + # -- A string representation of a YAML defining monitoring queries + queries: | + backends: + query: | + SELECT sa.datname + , sa.usename + , sa.application_name + , states.state + , COALESCE(sa.count, 0) AS total + , COALESCE(sa.max_tx_secs, 0) AS max_tx_duration_seconds + FROM ( VALUES ('active') + , ('idle') + , ('idle in transaction') + , ('idle in transaction (aborted)') + , ('fastpath function call') + , ('disabled') + ) AS states(state) + LEFT JOIN ( + SELECT datname + , state + , usename + , COALESCE(application_name, '') AS application_name + , COUNT(*) + , COALESCE(EXTRACT (EPOCH FROM (max(now() - xact_start))), 0) AS max_tx_secs + FROM pg_catalog.pg_stat_activity + GROUP BY datname, state, usename, application_name + ) sa ON states.state = sa.state + WHERE sa.usename IS NOT NULL + metrics: + - datname: + usage: "LABEL" + description: "Name of the database" + - usename: + usage: "LABEL" + description: "Name of the user" + - application_name: + usage: "LABEL" + description: "Name of the application" + - state: + usage: "LABEL" + description: "State of the backend" + - total: + usage: "GAUGE" + description: "Number of backends" + - max_tx_duration_seconds: + usage: "GAUGE" + description: "Maximum duration of a transaction in seconds" + + backends_waiting: + query: | + SELECT count(*) AS total + FROM pg_catalog.pg_locks blocked_locks + JOIN pg_catalog.pg_locks blocking_locks + ON blocking_locks.locktype = blocked_locks.locktype + AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database + AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation + AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page + AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple + AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid + AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid + AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid + AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid + AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid + AND blocking_locks.pid != blocked_locks.pid + JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid + WHERE NOT blocked_locks.granted + metrics: + - total: + usage: "GAUGE" + description: "Total number of backends that are currently waiting on other queries" + + pg_database: + query: | + SELECT datname + , pg_catalog.pg_database_size(datname) AS size_bytes + , pg_catalog.age(datfrozenxid) AS xid_age + , pg_catalog.mxid_age(datminmxid) AS mxid_age + FROM pg_catalog.pg_database + metrics: + - datname: + usage: "LABEL" + description: "Name of the database" + - size_bytes: + usage: "GAUGE" + description: "Disk space used by the database" + - xid_age: + usage: "GAUGE" + description: "Number of transactions from the frozen XID to the current one" + - mxid_age: + usage: "GAUGE" + description: "Number of multiple transactions (Multixact) from the frozen XID to the current one" + + pg_postmaster: + query: | + SELECT EXTRACT(EPOCH FROM pg_postmaster_start_time) AS start_time + FROM pg_catalog.pg_postmaster_start_time() + metrics: + - start_time: + usage: "GAUGE" + description: "Time at which postgres started (based on epoch)" + + pg_replication: + query: "SELECT CASE WHEN NOT pg_catalog.pg_is_in_recovery() + THEN 0 + ELSE GREATEST (0, + EXTRACT(EPOCH FROM (now() - pg_catalog.pg_last_xact_replay_timestamp()))) + END AS lag, + pg_catalog.pg_is_in_recovery() AS in_recovery, + EXISTS (TABLE pg_stat_wal_receiver) AS is_wal_receiver_up, + (SELECT count(*) FROM pg_stat_replication) AS streaming_replicas" + metrics: + - lag: + usage: "GAUGE" + description: "Replication lag behind primary in seconds" + - in_recovery: + usage: "GAUGE" + description: "Whether the instance is in recovery" + - is_wal_receiver_up: + usage: "GAUGE" + description: "Whether the instance wal_receiver is up" + - streaming_replicas: + usage: "GAUGE" + description: "Number of streaming replicas connected to the instance" + + pg_stat_archiver: + query: | + SELECT archived_count + , failed_count + , COALESCE(EXTRACT(EPOCH FROM (now() - last_archived_time)), -1) AS seconds_since_last_archival + , COALESCE(EXTRACT(EPOCH FROM (now() - last_failed_time)), -1) AS seconds_since_last_failure + , COALESCE(EXTRACT(EPOCH FROM last_archived_time), -1) AS last_archived_time + , COALESCE(EXTRACT(EPOCH FROM last_failed_time), -1) AS last_failed_time + , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_archived_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_archived_wal_start_lsn + , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_failed_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_failed_wal_start_lsn + , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time + FROM pg_catalog.pg_stat_archiver + metrics: + - archived_count: + usage: "COUNTER" + description: "Number of WAL files that have been successfully archived" + - failed_count: + usage: "COUNTER" + description: "Number of failed attempts for archiving WAL files" + - seconds_since_last_archival: + usage: "GAUGE" + description: "Seconds since the last successful archival operation" + - seconds_since_last_failure: + usage: "GAUGE" + description: "Seconds since the last failed archival operation" + - last_archived_time: + usage: "GAUGE" + description: "Epoch of the last time WAL archiving succeeded" + - last_failed_time: + usage: "GAUGE" + description: "Epoch of the last time WAL archiving failed" + - last_archived_wal_start_lsn: + usage: "GAUGE" + description: "Archived WAL start LSN" + - last_failed_wal_start_lsn: + usage: "GAUGE" + description: "Last failed WAL LSN" + - stats_reset_time: + usage: "GAUGE" + description: "Time at which these statistics were last reset" + + pg_stat_bgwriter: + query: | + SELECT checkpoints_timed + , checkpoints_req + , checkpoint_write_time + , checkpoint_sync_time + , buffers_checkpoint + , buffers_clean + , maxwritten_clean + , buffers_backend + , buffers_backend_fsync + , buffers_alloc + FROM pg_catalog.pg_stat_bgwriter + metrics: + - checkpoints_timed: + usage: "COUNTER" + description: "Number of scheduled checkpoints that have been performed" + - checkpoints_req: + usage: "COUNTER" + description: "Number of requested checkpoints that have been performed" + - checkpoint_write_time: + usage: "COUNTER" + description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds" + - checkpoint_sync_time: + usage: "COUNTER" + description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds" + - buffers_checkpoint: + usage: "COUNTER" + description: "Number of buffers written during checkpoints" + - buffers_clean: + usage: "COUNTER" + description: "Number of buffers written by the background writer" + - maxwritten_clean: + usage: "COUNTER" + description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" + - buffers_backend: + usage: "COUNTER" + description: "Number of buffers written directly by a backend" + - buffers_backend_fsync: + usage: "COUNTER" + description: "Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)" + - buffers_alloc: + usage: "COUNTER" + description: "Number of buffers allocated" + + pg_stat_database: + query: | + SELECT datname + , xact_commit + , xact_rollback + , blks_read + , blks_hit + , tup_returned + , tup_fetched + , tup_inserted + , tup_updated + , tup_deleted + , conflicts + , temp_files + , temp_bytes + , deadlocks + , blk_read_time + , blk_write_time + FROM pg_catalog.pg_stat_database + metrics: + - datname: + usage: "LABEL" + description: "Name of this database" + - xact_commit: + usage: "COUNTER" + description: "Number of transactions in this database that have been committed" + - xact_rollback: + usage: "COUNTER" + description: "Number of transactions in this database that have been rolled back" + - blks_read: + usage: "COUNTER" + description: "Number of disk blocks read in this database" + - blks_hit: + usage: "COUNTER" + description: "Number of times disk blocks were found already in the buffer cache, so that a read was not necessary (this only includes hits in the PostgreSQL buffer cache, not the operating system's file system cache)" + - tup_returned: + usage: "COUNTER" + description: "Number of rows returned by queries in this database" + - tup_fetched: + usage: "COUNTER" + description: "Number of rows fetched by queries in this database" + - tup_inserted: + usage: "COUNTER" + description: "Number of rows inserted by queries in this database" + - tup_updated: + usage: "COUNTER" + description: "Number of rows updated by queries in this database" + - tup_deleted: + usage: "COUNTER" + description: "Number of rows deleted by queries in this database" + - conflicts: + usage: "COUNTER" + description: "Number of queries canceled due to conflicts with recovery in this database" + - temp_files: + usage: "COUNTER" + description: "Number of temporary files created by queries in this database" + - temp_bytes: + usage: "COUNTER" + description: "Total amount of data written to temporary files by queries in this database" + - deadlocks: + usage: "COUNTER" + description: "Number of deadlocks detected in this database" + - blk_read_time: + usage: "COUNTER" + description: "Time spent reading data file blocks by backends in this database, in milliseconds" + - blk_write_time: + usage: "COUNTER" + description: "Time spent writing data file blocks by backends in this database, in milliseconds" + + pg_stat_replication: + query: | + SELECT usename + , COALESCE(application_name, '') AS application_name + , COALESCE(client_addr::text, '') AS client_addr + , EXTRACT(EPOCH FROM backend_start) AS backend_start + , COALESCE(pg_catalog.age(backend_xmin), 0) AS backend_xmin_age + , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), sent_lsn) AS sent_diff_bytes + , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), write_lsn) AS write_diff_bytes + , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), flush_lsn) AS flush_diff_bytes + , COALESCE(pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), replay_lsn),0) AS replay_diff_bytes + , COALESCE((EXTRACT(EPOCH FROM write_lag)),0)::float AS write_lag_seconds + , COALESCE((EXTRACT(EPOCH FROM flush_lag)),0)::float AS flush_lag_seconds + , COALESCE((EXTRACT(EPOCH FROM replay_lag)),0)::float AS replay_lag_seconds + FROM pg_catalog.pg_stat_replication + metrics: + - usename: + usage: "LABEL" + description: "Name of the replication user" + - application_name: + usage: "LABEL" + description: "Name of the application" + - client_addr: + usage: "LABEL" + description: "Client IP address" + - backend_start: + usage: "COUNTER" + description: "Time when this process was started" + - backend_xmin_age: + usage: "COUNTER" + description: "The age of this standby's xmin horizon" + - sent_diff_bytes: + usage: "GAUGE" + description: "Difference in bytes from the last write-ahead log location sent on this connection" + - write_diff_bytes: + usage: "GAUGE" + description: "Difference in bytes from the last write-ahead log location written to disk by this standby server" + - flush_diff_bytes: + usage: "GAUGE" + description: "Difference in bytes from the last write-ahead log location flushed to disk by this standby server" + - replay_diff_bytes: + usage: "GAUGE" + description: "Difference in bytes from the last write-ahead log location replayed into the database on this standby server" + - write_lag_seconds: + usage: "GAUGE" + description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it" + - flush_lag_seconds: + usage: "GAUGE" + description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it" + - replay_lag_seconds: + usage: "GAUGE" + description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it" + + pg_settings: + query: | + SELECT name, + CASE setting WHEN 'on' THEN '1' WHEN 'off' THEN '0' ELSE setting END AS setting + FROM pg_catalog.pg_settings + WHERE vartype IN ('integer', 'real', 'bool') + ORDER BY 1 + metrics: + - name: + usage: "LABEL" + description: "Name of the setting" + - setting: + usage: "GAUGE" + description: "Setting value"