diff --git a/build-sources.json b/build-sources.json index 2c422fc0328..120150259fe 100644 --- a/build-sources.json +++ b/build-sources.json @@ -28,5 +28,6 @@ "postgis": true, "repmgr": true, "slony": true, - "tde": true + "tde": true, + "tpa": true } diff --git a/gatsby-config.js b/gatsby-config.js index 7d4b3194e63..bd5c03d63a0 100644 --- a/gatsby-config.js +++ b/gatsby-config.js @@ -81,6 +81,7 @@ const sourceToPluginConfig = { repmgr: { name: "repmgr", path: "product_docs/docs/repmgr" }, slony: { name: "slony", path: "product_docs/docs/slony" }, tde: { name: "tde", path: "product_docs/docs/tde" }, + tpa: { name: "tpa", path: "product_docs/docs/tpa" }, }; const externalSourcePlugins = () => { diff --git a/product_docs/docs/pgd/4/bdr/appusage.mdx b/product_docs/docs/pgd/4/bdr/appusage.mdx index 92576eeb4a7..61436c6be7f 100644 --- a/product_docs/docs/pgd/4/bdr/appusage.mdx +++ b/product_docs/docs/pgd/4/bdr/appusage.mdx @@ -174,7 +174,7 @@ a cluster, you can't add a node with a minor version if the cluster uses a newer protocol version. This returns an error. Both of these features might be affected by specific restrictions. -See [Release notes](/pgd/latest/rel_notes/) for any known incompatibilities. +See [Release notes](/pgd/4/rel_notes/) for any known incompatibilities. ## Replicating between nodes with differences diff --git a/product_docs/docs/pgd/4/bdr/catalogs.mdx b/product_docs/docs/pgd/4/bdr/catalogs.mdx index 5ff7e1dd0db..fc386d92bcb 100644 --- a/product_docs/docs/pgd/4/bdr/catalogs.mdx +++ b/product_docs/docs/pgd/4/bdr/catalogs.mdx @@ -218,7 +218,7 @@ A view containing active global locks on this node. The [`bdr.global_locks`](#bd exposes BDR's shared-memory lock state tracking, giving administrators greater insight into BDR's global locking activity and progress. -See [Monitoring global locks](/pgd/latest/monitoring#monitoring-global-locks) +See [Monitoring global locks](/pgd/4/monitoring#monitoring-global-locks) for more information about global locking. #### `bdr.global_locks` columns @@ -481,7 +481,7 @@ Every node in the cluster regularly broadcasts its progress every is 60000 ms, i.e., 1 minute). Expect N \* (N-1) rows in this relation. You might be more interested in the `bdr.node_slots` view for monitoring -purposes. See also [Monitoring](/pgd/latest/monitoring). +purposes. See also [Monitoring](/pgd/4/monitoring). #### `bdr.node_peer_progress` columns @@ -543,7 +543,7 @@ given node. This view contains information about replication slots used in the current database by BDR. -See [Monitoring outgoing replication](/pgd/latest/monitoring#monitoring-outgoing-replication) +See [Monitoring outgoing replication](/pgd/4/monitoring#monitoring-outgoing-replication) for guidance on the use and interpretation of this view's fields. #### `bdr.node_slots` columns diff --git a/product_docs/docs/pgd/4/bdr/configuration.mdx b/product_docs/docs/pgd/4/bdr/configuration.mdx index 983a77f3c78..9d83e65ba28 100644 --- a/product_docs/docs/pgd/4/bdr/configuration.mdx +++ b/product_docs/docs/pgd/4/bdr/configuration.mdx @@ -39,7 +39,7 @@ which vary according to the size and scale of the cluster. - `max_replication_slots` — Same as `max_wal_senders`. - `wal_sender_timeout` and `wal_receiver_timeout` — Determines how quickly a node considers its CAMO partner as disconnected or - reconnected. See [CAMO failure scenarios](/pgd/latest/bdr/camo/#failure-scenarios) for + reconnected. See [CAMO failure scenarios](/pgd/4/bdr/camo/#failure-scenarios) for details. 
In normal running for a group with N peer nodes, BDR requires diff --git a/product_docs/docs/pgd/4/bdr/conflicts.mdx b/product_docs/docs/pgd/4/bdr/conflicts.mdx index e4fc82642a9..c48fa48065e 100644 --- a/product_docs/docs/pgd/4/bdr/conflicts.mdx +++ b/product_docs/docs/pgd/4/bdr/conflicts.mdx @@ -418,7 +418,7 @@ cost of doing this penalizes the majority of users, so at this time it simply logs `delete_missing`. Later releases will automatically resolve `INSERT`/`DELETE` anomalies -via rechecks using [LiveCompare](/latest/livecompare) when `delete_missing` conflicts occur. +via rechecks using [LiveCompare](/livecompare/latest) when `delete_missing` conflicts occur. These can be performed manually by applications by checking the `bdr.conflict_history_summary` view. diff --git a/product_docs/docs/pgd/4/bdr/ddl.mdx b/product_docs/docs/pgd/4/bdr/ddl.mdx index 8d999deda25..57161213b90 100644 --- a/product_docs/docs/pgd/4/bdr/ddl.mdx +++ b/product_docs/docs/pgd/4/bdr/ddl.mdx @@ -214,7 +214,7 @@ ALTER or DROP of an object created in the current transaction doesn't required global DML lock. Monitoring of global DDL locks and global DML locks is shown in -[Monitoring](/pgd/latest/monitoring). +[Monitoring](/pgd/4/monitoring). ## Minimizing the impact of DDL diff --git a/product_docs/docs/pgd/4/bdr/functions.mdx b/product_docs/docs/pgd/4/bdr/functions.mdx index d34da61a1dc..defa6d0b443 100644 --- a/product_docs/docs/pgd/4/bdr/functions.mdx +++ b/product_docs/docs/pgd/4/bdr/functions.mdx @@ -744,7 +744,7 @@ bdr.monitor_local_replslots() #### Notes This function returns a record with fields `status` and `message`, -as explained in [Monitoring replication slots](/pgd/latest/monitoring/#monitoring-replication-slots). +as explained in [Monitoring replication slots](/pgd/4/monitoring/#monitoring-replication-slots). ### bdr.wal_sender_stats @@ -794,7 +794,7 @@ bdr.get_decoding_worker_stat() #### Notes -For further details, see [Monitoring WAL senders using LCR](/pgd/latest/monitoring/#monitoring-wal-senders-using-lcr). +For further details, see [Monitoring WAL senders using LCR](/pgd/4/monitoring/#monitoring-wal-senders-using-lcr). ### bdr.lag_control diff --git a/product_docs/docs/pgd/4/bdr/index.mdx b/product_docs/docs/pgd/4/bdr/index.mdx index 6a040e70236..278829faa71 100644 --- a/product_docs/docs/pgd/4/bdr/index.mdx +++ b/product_docs/docs/pgd/4/bdr/index.mdx @@ -159,7 +159,7 @@ overhead of replication as the cluster grows and minimizing the bandwidth to oth BDR is compatible with Postgres, EDB Postgres Extended Server, and EDB Postgres Advanced Server distributions and can be deployed as a -standard Postgres extension. See the [Compatibility matrix](/pgd/latest/#compatibility-matrix) +standard Postgres extension. See the [Compatibility matrix](/pgd/4/#compatibility-matrix) for details of supported version combinations. Some key BDR features depend on certain core @@ -170,7 +170,7 @@ example, if having the BDR feature Commit At Most Once (CAMO) is mission critical to your use case, don't adopt the community PostgreSQL distribution because it doesn't have the core capability required to handle CAMO. See the full feature matrix compatibility in -[Choosing a Postgres distribution](/pgd/latest/choosing_server/). +[Choosing a Postgres distribution](/pgd/4/choosing_server/). BDR offers close to native Postgres compatibility. 
However, some access patterns don't necessarily work as well in multi-node setup as they do on a @@ -259,4 +259,4 @@ BDR places a limit that at most 10 databases in any one PostgreSQL instance can be BDR nodes across different BDR node groups. However, BDR works best if you use only one BDR database per PostgreSQL instance. -The minimum recommended number of nodes in a group is three to provide fault tolerance for BDR's consensus mechanism. With just two nodes, consensus would fail if one of the nodes was unresponsive. Consensus is required for some BDR operations such as distributed sequence generation. For more information about the consensus mechanism used by EDB Postgres Distributed, see [Architectural details](/pgd/latest/architectures/#architecture-details). +The minimum recommended number of nodes in a group is three to provide fault tolerance for BDR's consensus mechanism. With just two nodes, consensus would fail if one of the nodes was unresponsive. Consensus is required for some BDR operations such as distributed sequence generation. For more information about the consensus mechanism used by EDB Postgres Distributed, see [Architectural details](/pgd/4/architectures/#architecture-details). diff --git a/product_docs/docs/pgd/4/bdr/nodes.mdx b/product_docs/docs/pgd/4/bdr/nodes.mdx index ca9cf27986a..9a2ff5d812c 100644 --- a/product_docs/docs/pgd/4/bdr/nodes.mdx +++ b/product_docs/docs/pgd/4/bdr/nodes.mdx @@ -587,7 +587,7 @@ On EDB Postgres Extended Server and EDB Postgres Advanced Server, offline nodes also hold back freezing of data to prevent losing conflict-resolution data (see [Origin conflict detection](conflicts)). -Administrators must monitor for node outages (see [monitoring](/pgd/latest/monitoring/)) +Administrators must monitor for node outages (see [monitoring](/pgd/4/monitoring/)) and make sure nodes have enough free disk space. If the workload is predictable, you might be able to calculate how much space is used over time, allowing a prediction of the maximum time a node can be down before critical diff --git a/product_docs/docs/pgd/4/choosing_durability.mdx b/product_docs/docs/pgd/4/choosing_durability.mdx index eae6bf5cc83..9465ec9959b 100644 --- a/product_docs/docs/pgd/4/choosing_durability.mdx +++ b/product_docs/docs/pgd/4/choosing_durability.mdx @@ -6,9 +6,9 @@ EDB Postgres Distributed allows you to choose from several replication configura * Asynchronous * Synchronous (using `synchronous_standby_names`) -* [Commit at Most Once](/pgd/latest/bdr/camo) -* [Eager](/pgd/latest/bdr/eager) -* [Group Commit](/pgd/latest/bdr/group-commit) +* [Commit at Most Once](/pgd/4/bdr/camo) +* [Eager](/pgd/4/bdr/eager) +* [Group Commit](/pgd/4/bdr/group-commit) -For more information, see [Durability](/pgd/latest/bdr/durability). +For more information, see [Durability](/pgd/4/bdr/durability). diff --git a/product_docs/docs/pgd/4/cli/installing_cli.mdx b/product_docs/docs/pgd/4/cli/installing_cli.mdx index 6d2b46ef940..7f93a9bb37d 100644 --- a/product_docs/docs/pgd/4/cli/installing_cli.mdx +++ b/product_docs/docs/pgd/4/cli/installing_cli.mdx @@ -4,7 +4,7 @@ navTitle: "Installing PGD CLI" --- -TPAexec installs and configures PGD CLI on each BDR node, by default. If you wish to install PGD CLI on any non-BDR instance in the cluster, you simply attach the pgdcli role to that instance in TPAexec's configuration file before deploying. See [TPAexec](/pgd/latest/deployments/tpaexec) for more information. +TPAexec installs and configures PGD CLI on each BDR node, by default. 
If you wish to install PGD CLI on any non-BDR instance in the cluster, you simply attach the pgdcli role to that instance in TPAexec's configuration file before deploying. See [TPAexec](/pgd/4/deployments/tpaexec) for more information. ## Installing manually @@ -20,7 +20,7 @@ When the PGD CLI is configured by TPAexec, it connects automatically, but with a ### Specifying database connection strings -You can either use a configuration file to specify the database connection strings for your cluster (see following section) or pass the connection string directly to a command (see the [sample use case](/pgd/latest/cli/#passing-a-database-connection-string)). +You can either use a configuration file to specify the database connection strings for your cluster (see following section) or pass the connection string directly to a command (see the [sample use case](/pgd/4/cli/#passing-a-database-connection-string)). #### Using a configuration file @@ -43,5 +43,5 @@ The `pgd-config.yml`, is located in the `/etc/edb` directory, by default. The PG 2. `$HOME/.edb` 3. `.` (working directory) -If you rename the file or move it to another location, specify the new name and location using the optional `-f` or `--config-file` flag when entering a command. See the [sample use case](/pgd/latest/cli/#passing-a-database-connection-string). +If you rename the file or move it to another location, specify the new name and location using the optional `-f` or `--config-file` flag when entering a command. See the [sample use case](/pgd/4/cli/#passing-a-database-connection-string). diff --git a/product_docs/docs/pgd/4/deployments/tpaexec/using_tpaexec.mdx b/product_docs/docs/pgd/4/deployments/tpaexec/using_tpaexec.mdx index 9c2d211f04f..f6c133417c0 100644 --- a/product_docs/docs/pgd/4/deployments/tpaexec/using_tpaexec.mdx +++ b/product_docs/docs/pgd/4/deployments/tpaexec/using_tpaexec.mdx @@ -115,7 +115,7 @@ By default, `tpaexec configure` uses the names first, second, and so on for any Specify `--location-names` to provide more meaningful names for each location. ### Enable Commit At Most Once -Specify `--enable-camo` to set the pair of BDR primary instances in each region to be each other's Commit At Most Once (CAMO) partners. See [Commit At Most Once (CAMO)](/pgd/latest/bdr/camo/) for more information. +Specify `--enable-camo` to set the pair of BDR primary instances in each region to be each other's Commit At Most Once (CAMO) partners. See [Commit At Most Once (CAMO)](/pgd/4/bdr/camo/) for more information. ## Provision diff --git a/product_docs/docs/pgd/4/harp/03_installation.mdx b/product_docs/docs/pgd/4/harp/03_installation.mdx index 729b5f7409e..5d28d84ff13 100644 --- a/product_docs/docs/pgd/4/harp/03_installation.mdx +++ b/product_docs/docs/pgd/4/harp/03_installation.mdx @@ -1,6 +1,8 @@ --- navTitle: Installation title: Installation +redirects: + - /pgd/latest/harp/03_installation/ --- A standard installation of HARP includes two system services: diff --git a/product_docs/docs/pgd/4/harp/04_configuration.mdx b/product_docs/docs/pgd/4/harp/04_configuration.mdx index 10ac24fba06..acd31921516 100644 --- a/product_docs/docs/pgd/4/harp/04_configuration.mdx +++ b/product_docs/docs/pgd/4/harp/04_configuration.mdx @@ -1,6 +1,8 @@ --- navTitle: Configuration title: Configuring HARP for cluster management +redirects: + - /pgd/latest/harp/04_configuration/ --- The HARP configuration file follows a standard YAML-style formatting that was simplified for readability. 
This file is located in the `/etc/harp` diff --git a/product_docs/docs/pgd/4/harp/05_bootstrapping.mdx b/product_docs/docs/pgd/4/harp/05_bootstrapping.mdx index 55d78e8dac4..b845d152e92 100644 --- a/product_docs/docs/pgd/4/harp/05_bootstrapping.mdx +++ b/product_docs/docs/pgd/4/harp/05_bootstrapping.mdx @@ -1,6 +1,8 @@ --- navTitle: Bootstrapping title: Cluster bootstrapping +redirects: + - /pgd/latest/harp/05_bootstrapping/ --- To use HARP, a minimum amount of metadata must exist in the DCS. The diff --git a/product_docs/docs/pgd/4/harp/06_harp_manager.mdx b/product_docs/docs/pgd/4/harp/06_harp_manager.mdx index f13e87c24ec..c29559bf894 100644 --- a/product_docs/docs/pgd/4/harp/06_harp_manager.mdx +++ b/product_docs/docs/pgd/4/harp/06_harp_manager.mdx @@ -1,6 +1,8 @@ --- navTitle: HARP Manager title: HARP Manager +redirects: + - /pgd/latest/harp/06_harp_manager/ --- HARP Manager is a daemon that interacts with the local PostgreSQL/BDR node diff --git a/product_docs/docs/pgd/4/harp/07_harp_proxy.mdx b/product_docs/docs/pgd/4/harp/07_harp_proxy.mdx index f106da3b22b..691536ae244 100644 --- a/product_docs/docs/pgd/4/harp/07_harp_proxy.mdx +++ b/product_docs/docs/pgd/4/harp/07_harp_proxy.mdx @@ -1,6 +1,8 @@ --- navTitle: HARP Proxy title: HARP Proxy +redirects: + - /pgd/latest/harp/07_harp_proxy/ --- HARP Proxy is a daemon that acts as an abstraction layer between the client diff --git a/product_docs/docs/pgd/4/harp/08_harpctl.mdx b/product_docs/docs/pgd/4/harp/08_harpctl.mdx index bc39a94a9b1..b08515506a8 100644 --- a/product_docs/docs/pgd/4/harp/08_harpctl.mdx +++ b/product_docs/docs/pgd/4/harp/08_harpctl.mdx @@ -1,6 +1,8 @@ --- navTitle: harpctl title: harpctl command-line tool +redirects: + - /pgd/latest/harp/08_harpctl/ --- `harpctl` is a command-line tool for directly manipulating the consensus layer diff --git a/product_docs/docs/pgd/4/harp/09_consensus-layer.mdx b/product_docs/docs/pgd/4/harp/09_consensus-layer.mdx index 55a6643c208..233bf122417 100644 --- a/product_docs/docs/pgd/4/harp/09_consensus-layer.mdx +++ b/product_docs/docs/pgd/4/harp/09_consensus-layer.mdx @@ -1,6 +1,8 @@ --- navTitle: Consensus layer title: Consensus layer considerations +redirects: + - /pgd/latest/harp/09_consensus-layer/ --- HARP is designed so that it can work with different implementations of diff --git a/product_docs/docs/pgd/4/harp/10_security.mdx b/product_docs/docs/pgd/4/harp/10_security.mdx index a8c53c66fc3..e7cbbfef7e5 100644 --- a/product_docs/docs/pgd/4/harp/10_security.mdx +++ b/product_docs/docs/pgd/4/harp/10_security.mdx @@ -1,6 +1,8 @@ --- navTitle: Security title: Security and roles +redirects: + - /pgd/latest/harp/10_security/ --- Beyond basic package installation and configuration, HARP requires diff --git a/product_docs/docs/pgd/4/harp/index.mdx b/product_docs/docs/pgd/4/harp/index.mdx index 421d56855fd..acd29099098 100644 --- a/product_docs/docs/pgd/4/harp/index.mdx +++ b/product_docs/docs/pgd/4/harp/index.mdx @@ -5,6 +5,7 @@ directoryDefaults: description: "High Availability Routing for Postgres (HARP) is a cluster-management tool for EDB Postgres Distributed clusters." 
redirects: - /pgd/4/harp/02_overview +- /pgd/latest/harp/ --- High Availability Routing for Postgres (HARP) is a new approach for managing high availabiliity for diff --git a/product_docs/docs/pgd/4/index.mdx b/product_docs/docs/pgd/4/index.mdx index d8737a73e95..294b7009a06 100644 --- a/product_docs/docs/pgd/4/index.mdx +++ b/product_docs/docs/pgd/4/index.mdx @@ -32,8 +32,8 @@ EDB Postgres Distributed provides multi-master replication and data distribution By default EDB Postgres Distributed uses asynchronous replication, applying changes on the peer nodes only after the local commit. Additional levels of synchronicity can be configured between different nodes, groups of nodes or all nodes by configuring -[Group Commit](/pgd/latest/bdr/group-commit), [CAMO](/pgd/latest/bdr/camo), or -[Eager](/pgd/latest/bdr/eager) replication. +[Group Commit](/pgd/4/bdr/group-commit), [CAMO](/pgd/4/bdr/camo), or +[Eager](/pgd/4/bdr/eager) replication. ## Compatibility matrix diff --git a/product_docs/docs/pgd/4/known_issues.mdx b/product_docs/docs/pgd/4/known_issues.mdx index ad6ac57cf29..a60faa9de26 100644 --- a/product_docs/docs/pgd/4/known_issues.mdx +++ b/product_docs/docs/pgd/4/known_issues.mdx @@ -6,7 +6,7 @@ This section discusses currently known issues in EDB Postgres Distributed 4. ## Data Consistency -Read about [Conflicts](/pgd/latest/bdr/conflicts/) to understand +Read about [Conflicts](/pgd/4/bdr/conflicts/) to understand the implications of the asynchronous operation mode in terms of data consistency. @@ -33,7 +33,7 @@ release. concurrent updates of the same row are repeatedly applied on two different nodes, then one of the update statements might hang due to a deadlock with the BDR writer. As mentioned in the - [Conflicts](/pgd/latest/bdr/conflicts/) chapter, `skip` is not the default + [Conflicts](/pgd/4/bdr/conflicts/) chapter, `skip` is not the default resolver for the `update_origin_change` conflict, and this combination isn't intended to be used in production. It discards one of the two conflicting updates based on the order of arrival @@ -63,8 +63,8 @@ release. Adding or removing a pair doesn't need a restart of Postgres or even a reload of the configuration. -- Group Commit cannot be combined with [CAMO](/pgd/latest/bdr/camo/) or [Eager All Node - replication](/pgd/latest/bdr/eager/). Eager Replication currently only works by using the +- Group Commit cannot be combined with [CAMO](/pgd/4/bdr/camo/) or [Eager All Node + replication](/pgd/4/bdr/eager/). Eager Replication currently only works by using the "global" BDR commit scope. - Neither Eager replication nor Group Commit support @@ -82,9 +82,9 @@ release. - Parallel apply is not currently supported in combination with Group Commit, please make sure to disable it when using Group Commit by either setting `num_writers` to 1 for the node group (using - [`bdr.alter_node_group_config`](/pgd/latest/bdr/nodes#bdralter_node_group_config)) or + [`bdr.alter_node_group_config`](/pgd/4/bdr/nodes#bdralter_node_group_config)) or via the GUC `bdr.writers_per_subscription` (see - [Configuration of Generic Replication](/pgd/latest/bdr/configuration#generic-replication)). + [Configuration of Generic Replication](/pgd/4/bdr/configuration#generic-replication)). - There currently is no protection against altering or removing a commit scope. 
Running transactions in a commit scope that is concurrently diff --git a/product_docs/docs/pgd/4/monitoring.mdx b/product_docs/docs/pgd/4/monitoring.mdx index 3a2b23d5855..b07db2e01af 100644 --- a/product_docs/docs/pgd/4/monitoring.mdx +++ b/product_docs/docs/pgd/4/monitoring.mdx @@ -83,7 +83,7 @@ node_seq_id | 3 node_local_dbname | postgres ``` -Also, the table [`bdr.node_catchup_info`](/pgd/latest/bdr/catalogs/#bdrnode_catchup_info) will give information +Also, the table [`bdr.node_catchup_info`](/pgd/4/bdr/catalogs/#bdrnode_catchup_info) will give information on the catch-up state, which can be relevant to joining nodes or parting nodes. When a node is parted, it could be that some nodes in the cluster did not receive @@ -103,8 +103,8 @@ The `catchup_state` can be one of the following: There are two main views used for monitoring of replication activity: -- [`bdr.node_slots`](/pgd/latest/bdr/catalogs/#bdrnode_slots) for monitoring outgoing replication -- [`bdr.subscription_summary`](/pgd/latest/bdr/catalogs/#bdrsubscription_summary) for monitoring incoming replication +- [`bdr.node_slots`](/pgd/4/bdr/catalogs/#bdrnode_slots) for monitoring outgoing replication +- [`bdr.subscription_summary`](/pgd/4/bdr/catalogs/#bdrsubscription_summary) for monitoring incoming replication Most of the information provided by `bdr.node_slots` can be also obtained by querying the standard PostgreSQL replication monitoring views @@ -114,13 +114,13 @@ and Each node has one BDR group slot which should never have a connection to it and will very rarely be marked as active. This is normal, and does not imply -something is down or disconnected. See [`Replication Slots created by BDR`](/pgd/latest/bdr/nodes/#replication-slots-created-by-bdr). +something is down or disconnected. See [`Replication Slots created by BDR`](/pgd/4/bdr/nodes/#replication-slots-created-by-bdr). ### Monitoring Outgoing Replication There is an additional view used for monitoring of outgoing replication activity: -- [`bdr.node_replication_rates`](/pgd/latest/bdr/catalogs/#bdrnode_replication_rates) for monitoring outgoing replication +- [`bdr.node_replication_rates`](/pgd/4/bdr/catalogs/#bdrnode_replication_rates) for monitoring outgoing replication The `bdr.node_replication_rates` view gives an overall picture of the outgoing replication activity along with the catchup estimates for peer nodes, @@ -274,9 +274,9 @@ subscription_status | replicating ### Monitoring WAL senders using LCR -If the [Decoding Worker](/pgd/latest/bdr/nodes#decoding-worker) is enabled, information about the +If the [Decoding Worker](/pgd/4/bdr/nodes#decoding-worker) is enabled, information about the current LCR (`Logical Change Record`) file for each WAL sender can be monitored -via the function [bdr.wal_sender_stats](/pgd/latest/bdr/functions#bdrwal_sender_stats), +via the function [bdr.wal_sender_stats](/pgd/4/bdr/functions#bdrwal_sender_stats), e.g.: ``` @@ -291,10 +291,10 @@ postgres=# SELECT * FROM bdr.wal_sender_stats(); If `is_using_lcr` is `FALSE`, `decoder_slot_name`/`lcr_file_name` will be `NULL`. This will be the case if the Decoding Worker is not enabled, or the WAL sender is -serving a [logical standby](/pgd/latest/bdr/nodes#logical-standby-nodes). +serving a [logical standby](/pgd/4/bdr/nodes#logical-standby-nodes). 
Additionally, information about the Decoding Worker can be monitored via the function -[bdr.get_decoding_worker_stat](/pgd/latest/bdr/functions#bdrget_decoding_worker_stat), e.g.: +[bdr.get_decoding_worker_stat](/pgd/4/bdr/functions#bdrget_decoding_worker_stat), e.g.: ``` postgres=# SELECT * FROM bdr.get_decoding_worker_stat(); @@ -364,7 +364,7 @@ Either or both entry types may be created for the same transaction, depending on the type of DDL operation and the value of the `bdr.ddl_locking` setting. Global locks held on the local node are visible in [the `bdr.global_locks` -view](/pgd/latest/bdr/catalogs#bdrglobal_locks). This view shows the type of the lock; for +view](/pgd/4/bdr/catalogs#bdrglobal_locks). This view shows the type of the lock; for relation locks it shows which relation is being locked, the PID holding the lock (if local), and whether the lock has been globally granted or not. In case of global advisory locks, `lock_type` column shows `GLOBAL_LOCK_ADVISORY` and @@ -390,7 +390,7 @@ timing information. ## Monitoring Conflicts -Replication [conflicts](/pgd/latest/bdr/conflicts) can arise when multiple nodes make +Replication [conflicts](/pgd/4/bdr/conflicts) can arise when multiple nodes make changes that affect the same rows in ways that can interact with each other. The BDR system should be monitored to ensure that conflicts are identified and, where possible, application changes are made to eliminate them or make diff --git a/product_docs/docs/pgd/4/other_considerations.mdx b/product_docs/docs/pgd/4/other_considerations.mdx index d0ecc53cce8..9fd8f4de3f5 100644 --- a/product_docs/docs/pgd/4/other_considerations.mdx +++ b/product_docs/docs/pgd/4/other_considerations.mdx @@ -21,4 +21,4 @@ EDB Postgres Distributed has been designed to operate with nodes in multiple tim Server clocks should be synchronized using NTP or other solutions. Clock synchronization is not critical to performance, as is the case with some other solutions. Clock skew can impact Origin Conflict Detection, though -EDB Postgres Distributed provides controls to report and manage any skew that exists. EDB Postgres Distributed also provides Row Version Conflict Detection, as described in [Conflict Detection](/pgd/latest/bdr/conflicts). +EDB Postgres Distributed provides controls to report and manage any skew that exists. EDB Postgres Distributed also provides Row Version Conflict Detection, as described in [Conflict Detection](/pgd/4/bdr/conflicts). diff --git a/product_docs/docs/pgd/4/overview/index.mdx b/product_docs/docs/pgd/4/overview/index.mdx index cb56598b016..0f1e936ddf9 100644 --- a/product_docs/docs/pgd/4/overview/index.mdx +++ b/product_docs/docs/pgd/4/overview/index.mdx @@ -19,7 +19,7 @@ Three different Postgres distributions can be used: - [EDB Postgres Advanced Server](/epas/latest) - Oracle compatible, optimized for replication, and additional enterprise features What Postgres distribution and version is right for you depends on the features you need. -See the feature matrix in [Choosing a Postgres distribution](/pgd/latest/choosing_server) for detailed comparison. +See the feature matrix in [Choosing a Postgres distribution](/pgd/4/choosing_server) for detailed comparison. 
## [BDR](../bdr) diff --git a/product_docs/docs/pgd/4/rel_notes/index.mdx b/product_docs/docs/pgd/4/rel_notes/index.mdx index 6e801f34ef5..ff6d90ec2be 100644 --- a/product_docs/docs/pgd/4/rel_notes/index.mdx +++ b/product_docs/docs/pgd/4/rel_notes/index.mdx @@ -12,6 +12,18 @@ navigation: - pgd_4.0.2_rel_notes - pgd_4.0.1_rel_notes - pgd_4.0.0_rel_notes +redirects: + - /pgd/latest/rel_notes/pgd_4.0.0_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.0.1_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.0.2_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.0.3_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.1.0_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.1.1_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.2.0_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.2.1_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.2.2_rel_notes/ + - /pgd/latest/rel_notes/pgd_4.3.0_rel_notes/ + --- The EDB Postgres Distributed documentation describes the latest version of EDB Postgres Distributed 4, including minor releases and patches. The release notes provide information on what was new in each release. For new functionality introduced in a minor or patch release, the content also indicates the release that introduced the feature. diff --git a/product_docs/docs/pgd/4/upgrades/index.mdx b/product_docs/docs/pgd/4/upgrades/index.mdx index 120943b34fa..5d2a06a8c90 100644 --- a/product_docs/docs/pgd/4/upgrades/index.mdx +++ b/product_docs/docs/pgd/4/upgrades/index.mdx @@ -122,7 +122,7 @@ compatibility with previous releases. These may affect the Postgres configuration, deployment scripts, as well as applications using BDR. We recommend to consider and possibly adjust in advance of the upgrade. -Please see individual changes mentioned in [release notes](/pgd/latest/rel_notes/) and any version +Please see individual changes mentioned in [release notes](/pgd/4/rel_notes/) and any version specific upgrade notes in this topic. ## Server Software Upgrade diff --git a/product_docs/docs/pgd/5/appusage.mdx b/product_docs/docs/pgd/5/appusage.mdx new file mode 100644 index 00000000000..ed4eda333d7 --- /dev/null +++ b/product_docs/docs/pgd/5/appusage.mdx @@ -0,0 +1,710 @@ +--- +title: Application use +redirects: + - bdr/appusage +--- + +Learn about the BDR application from a user perspective. + +## Application behavior + +BDR supports replicating changes made on one node to other nodes. + +BDRs, by default, replicate all changes from INSERT, UPDATE, DELETE +and TRUNCATE operations from the source node to other nodes. Only the final changes +are sent, after all triggers and rules are processed. For example, +`INSERT ... ON CONFLICT UPDATE` sends either an insert or an update +depending on what occurred on the origin. If an update or delete affects +zero rows, then no changes are sent. + +INSERT can be replicated without any preconditions. + +For updates and deletes to replicate on other nodes, we must be able to +identify the unique rows affected. BDR requires that a table have either +a PRIMARY KEY defined, a UNIQUE constraint, or an explicit REPLICA IDENTITY +defined on specific columns. If one of those isn't defined, a warning is +generated, and later updates or deletes are explicitly blocked. +If REPLICA IDENTITY FULL is defined for a table, then a unique index isn't required. +In that case, updates and deletes are allowed and use the first non-unique +index that is live, valid, not deferred, and doesn't have expressions or WHERE +clauses. Otherwise, a sequential scan is used. + +You can use TRUNCATE even without a defined replication identity. 
+Replication of TRUNCATE commands is supported, but take care +when truncating groups of tables connected by foreign keys. When replicating +a truncate action, the subscriber truncates the same group of tables that +was truncated on the origin, either explicitly specified or implicitly +collected by CASCADE, except in cases where replication sets are defined. +See [Replication sets](repsets) for further details and examples. +This works correctly if all affected tables are part of the same +subscription. But if some tables to be truncated on the subscriber have +foreign-key links to tables that aren't part of the same (or any) +replication set, then applying the truncate action on the +subscriber fails. + +Row-level locks taken implicitly by INSERT, UPDATE, and DELETE commands are +replicated as the changes are made. +Table-level locks taken implicitly by INSERT, UPDATE, DELETE, and TRUNCATE +commands are also replicated. +Explicit row-level locking (`SELECT ... FOR UPDATE/FOR SHARE`) by user sessions +isn't replicated, nor are advisory locks. Information stored by transactions +running in SERIALIZABLE mode isn't replicated to other nodes. The +transaction isolation level of SERIALIAZABLE is supported, but transactions +aren't serialized across nodes in the presence of concurrent +transactions on multiple nodes. + +If DML is executed on multiple nodes concurrently, then potential conflicts +might occur if executing with asynchronous replication. These must be +either handled or avoided. Various avoidance mechanisms are possible, +discussed in [Conflicts](consistency/conflicts). + +Sequences need special handling, described in [Sequences](sequences). + +Binary data in BYTEA columns is replicated normally, allowing "blobs" of data +up to 1 GB in size. Use of the PostgreSQL "large object" facility isn't +supported in BDR. + +Rules execute only on the origin node so aren't executed during apply, +even if they're enabled for replicas. + +Replication is possible only from base tables to base tables. That is, the +tables on the source and target on the subscription side must be +tables, not views, materialized views, or foreign tables. Attempts to +replicate tables other than base tables result in an error. +DML changes that are made through updatable views are resolved to +base tables on the origin and then applied to the same base table name on +the target. + +BDR supports partitioned tables transparently, meaning that a partitioned +table can be added to a replication set and +changes that involve any of the partitions are replicated downstream. + +By default, triggers execute only on the origin node. For example, an INSERT +trigger executes on the origin node and is ignored when you apply the change on +the target node. You can specify for triggers to execute on both the origin +node at execution time and on the target when it's replicated ("apply time") +by using `ALTER TABLE ... ENABLE ALWAYS TRIGGER`, or use the `REPLICA` option +to execute only at apply time: `ALTER TABLE ... ENABLE REPLICA TRIGGER`. + +Some types of trigger aren't executed on apply, even if they exist on a +table and are currently enabled. Trigger types not executed are: + +- Statement-level triggers (`FOR EACH STATEMENT`) +- Per-column UPDATE triggers (`UPDATE OF column_name [, ...]`) + +BDR replication apply uses the system-level default search_path. Replica +triggers, stream triggers, and index expression functions can assume +other search_path settings that then fail when they execute on apply. 
+To prevent this from occurring, resolve object references clearly using +either only the default search_path, always use fully qualified references to +objects, e.g., schema.objectname, or set the search path for a function using +`ALTER FUNCTION ... SET search_path = ...` for the functions affected. + +BDR assumes that there are no issues related to text or other +collatable datatypes, i.e., all collations in use are available on all +nodes, and the default collation is the same on all nodes. Replication of +changes uses equality searches to locate Replica Identity values, so this +does't have any effect except where unique indexes are explicitly defined +with nonmatching collation qualifiers. Row filters might be affected +by differences in collations if collatable expressions were used. + +BDR handling of very long "toasted" data in PostgreSQL is transparent to +the user. The TOAST "chunkid" values likely differ between +the same row on different nodes, but that doesn't cause any problems. + +BDR can't work correctly if Replica Identity columns are marked as external. + +PostgreSQL allows CHECK() constraints that contain volatile functions. Since +BDR re-executes CHECK() constraints on apply, any subsequent re-execution that +doesn't return the same result as previously causes data divergence. + +BDR doesn't restrict the use of foreign keys. Cascading FKs are allowed. + +## Nonreplicated statements + +None of the following user commands are replicated by BDR, so their effects +occur on the local/origin node only: + +- Cursor operations (DECLARE, CLOSE, FETCH) +- Execution commands (DO, CALL, PREPARE, EXECUTE, EXPLAIN) +- Session management (DEALLOCATE, DISCARD, LOAD) +- Parameter commands (SET, SHOW) +- Constraint manipulation (SET CONSTRAINTS) +- Locking commands (LOCK) +- Table maintenance commands (VACUUM, ANALYZE, CLUSTER, REINDEX) +- Async operations (NOTIFY, LISTEN, UNLISTEN) + +Since the `NOTIFY` SQL command and the `pg_notify()` functions +aren't replicated, notifications aren't reliable in case of failover. +This means that notifications can easily be lost at failover if a +transaction is committed just when the server crashes. +Applications running `LISTEN` might miss notifications in case of failover. +This is true in standard PostgreSQL replication, and BDR doesn't +yet improve on this. CAMO and Eager Replication options don't +allow the `NOTIFY` SQL command or the `pg_notify()` function. + +## DML and DDL replication + +BDR doesn't replicate the DML statement. It replicates the changes +caused by the DML statement. For example, an UPDATE that changed +two rows replicates two changes, whereas a DELETE that didn't +remove any rows doesn't replicate anything. This means that the results +of executing volatile statements are replicated, ensuring there's no +divergence between nodes as might occur with statement-based replication. + +DDL replication works differently to DML. For DDL, BDR replicates the +statement, which then executes on all nodes. So a `DROP TABLE IF EXISTS` +might not replicate anything on the local node, but the statement is +still sent to other nodes for execution if DDL replication is enabled. +Full details are covered in [DDL replication](ddl). + +BDR works to ensure that intermixed DML and DDL +statements work correctly, even in the same transaction. + +## Replicating between different release levels + +BDR is designed to replicate between nodes that have different major +versions of PostgreSQL. 
This feature is designed to allow major +version upgrades without downtime. + +BDR is also designed to replicate between nodes that have different +versions of BDR software. This feature is designed to allow version +upgrades and maintenance without downtime. + +However, while it's possible to join a node with a major version in +a cluster, you can't add a node with a minor version if the cluster +uses a newer protocol version. This returns an error. + +Both of these features might be affected by specific restrictions. +See [Release notes](/pgd/latest/rel_notes/) for any known incompatibilities. + +## Replicating between nodes with differences + +By default, DDL is automatically sent to all nodes. You can control this manually, as described in [DDL Replication](ddl), and you could use it to create differences between database schemas across nodes. +BDR is designed to allow replication to continue even with minor +differences between nodes. These features are designed to allow +application schema migration without downtime or to allow logical +standby nodes for reporting or testing. + +Currently, replication requires the same table name on all nodes. A future +feature might allow a mapping between different table names. + +It is possible to replicate between tables with dissimilar partitioning +definitions, such as a source that is a normal table replicating to a +partitioned table, including support for updates that change partitions +on the target. It can be faster if the partitioning definition is the +same on the source and target since dynamic partition routing doesn't need to execute at apply time. +For details, see [Replication sets](repsets). + +By default, all columns are replicated. +BDR replicates data columns based on the column name. If a column +has the same name but a different datatype, we attempt to cast from the source +type to the target type, if casts were defined that allow that. + +BDR supports replicating between tables that have a different number of columns. + +If the target has missing columns from the source, then BDR raises +a `target_column_missing` conflict, for which the default conflict resolver +is `ignore_if_null`. This throws an error if a non-NULL value arrives. +Alternatively, you can also configure a node with a conflict resolver of `ignore`. +This setting doesn't throw an error but silently ignores any additional +columns. + +If the target has additional columns not seen in the source record, then BDR +raises a `source_column_missing` conflict, for which the default conflict resolver +is `use_default_value`. Replication proceeds if the additional columns +have a default, either NULL (if nullable) or a default expression, but +throws an error and halts replication if not. + +Transform triggers can also be used on tables to provide default values +or alter the incoming data in various ways before apply. + +If the source and the target have different constraints, then +replication is attempted, but it might fail if the rows from +source can't be applied to the target. Row filters can help here. + +Replicating data from one schema to a more relaxed schema won't cause failures. +Replicating data from a schema to a more restrictive schema can be a source of +potential failures. +The right way to solve this is to place a constraint on the more relaxed side, +so bad data can't be entered. That way, no bad data ever arrives +by replication, so it never fails the transform into the more restrictive +schema. 
For example, if one schema has a column of type TEXT and another schema +defines the same column as XML, add a CHECK constraint onto the TEXT column +to enforce that the text is XML. + +You can define a table with different indexes on each node. By default, the +index definitions are replicated. See [DDL replication](ddl) to +specify how to create an index only on a subset of nodes or just locally. + +Storage parameters, such as `fillfactor` and `toast_tuple_target`, can differ +between nodes for a table without problems. An exception to that is the +value of a table's storage parameter `user_catalog_table` must be identical +on all nodes. + +A table being replicated must be owned by the same user/role on each node. +See [Security and roles](security) for further discussion. + +Roles can have different passwords for connection on each node, although +by default changes to roles are replicated to each node. See +[DDL replication](ddl) to specify how to alter a role password only on a +subset of nodes or locally. + +## Comparison between nodes with differences + +LiveCompare is a tool for data comparison on a database, against BDR and +non-BDR nodes. It needs a minimum of two connections to compare against +and reach a final result. + +Since LiveCompare 1.3, you can configure with `all_bdr_nodes` set. This +saves you from specifying all the relevant DSNs for each separate node in the +cluster. An EDB Postgres Distributed cluster has N nodes with connection information, but +it's only the initial and output connection that LiveCompare 1.3+ needs +to complete its job. Setting `logical_replication_mode` states how all the +nodes are communicating. + +All the configuration is done in a `.ini` file, named `bdrLC.ini`, for example. +Find templates for this configuration file in +`/etc/2ndq-livecompare/`. + +While LiveCompare executes, you see N+1 progress bars, N being +the number of processes. Once all the tables are sourced, a time displays, +as the transactions per second (tps) was measured. This continues to +count the time, giving you an estimate and then a total execution time at the end. + +This tool offers a lot of customization and filters, such as tables, schemas, and +replication_sets. LiveCompare can use stop-start without losing context +information, so it can run at convenient times. After the comparison, a +summary and a DML script are generated so you can review it. Apply +the DML to fix any differences found. + +## General rules for applications + +BDR uses replica identity values to identify the rows to +change. +Applications can cause difficulties if they insert, delete, and then later +reuse the same unique identifiers. +This is known as the [ABA problem](https://en.wikipedia.org/wiki/ABA_problem). BDR can't know whether the rows are the +current row, the last row, or much older rows. + +Similarly, since BDR uses table names to identify the table against which +changes are replayed, a similar ABA problem exists with applications that +create, drop, and then later reuse the same object names. + +These issues give rise to some simple rules for applications to follow: + +- Use unique identifiers for rows (INSERT). +- Avoid modifying unique identifiers (UPDATE). +- Avoid reusing deleted unique identifiers. +- Avoid reusing dropped object names. + +In the general case, breaking those rules can lead to data anomalies and +divergence.
Applications can break those rules as long as certain conditions +are met, but use caution: while anomalies are unlikely, they aren't +impossible. For example, a row value can be reused as long as the DELETE was replayed on all nodes, including down nodes. This might normally occur in +less than a second but can take days if a severe issue occurred +on one node that prevented it from restarting correctly. + +## Timing considerations and synchronous replication + +Being asynchronous by default, peer nodes might lag behind, making it +possible for a client connected to multiple BDR nodes or switching +between them to read stale data. + +A [queue wait function](functions#bdrwait_for_apply_queue) is +provided for clients or proxies to prevent such stale reads. + +The synchronous replication features of Postgres are available to BDR +as well. In addition, BDR provides multiple variants for more synchronous +replication. See +[Durability and performance options](durability) for an overview and comparison of all variants available and +its different modes. + +## Application testing + +You can test BDR applications using the following programs, +in addition to other techniques. + +- [Trusted Postgres Architect](#trusted-postgres-architect) +- [pgbench with CAMO/Failover options](#pgbench-with-camofailover-options) +- [isolationtester with multi-node access](#isolationtester-with-multi-node-access) + +### Trusted Postgres Architect + +[Trusted Postgres Architect](/tpa/latest) is the system used by EDB to +deploy reference architectures, including those based on EDB Postgres Distributed. + +Trusted Postgres Architect includes test suites for each reference architecture. +It also simplifies creating and managing a local collection of tests to run +against a TPA cluster, using a syntax like the following: + +``` +tpaexec test mycluster mytest +``` + +We strongly recommend that developers write their own multi-node suite +of Trusted Postgres Architect tests that verify the main expected properties +of the application. + +### pgbench with CAMO/Failover options + +In EDB Postgres Extended, the pgbench was extended to allow users to +run failover tests while using CAMO or regular BDR deployments. The following options were added: + +``` +-m, --mode=regular|camo|failover +mode in which pgbench should run (default: regular) + +--retry +retry transactions on failover +``` + +In addition to these options, the connection information about the +peer node for failover must be specified in [DSN +form](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING). + +- Use `-m camo` or `-m failover` to specify the mode for pgbench. + You can use The `-m failover` specification to test failover in + regular BDR deployments. + +- Use `--retry` to specify whether to retry transactions when + failover happens with `-m failover` mode. This option is enabled by default + for `-m camo` mode. + +Here's an example in a CAMO environment: + +```sh + pgbench -m camo -p $node1_port -h $node1_host bdrdemo \ + "host=$node2_host user=postgres port=$node2_port dbname=bdrdemo" +``` + +This command runs in camo mode. It connects to node1 and runs the tests. If the +connection to node1 is lost, then pgbench connects to +node2. It queries node2 to get the status of in-flight transactions. +Aborted and in-flight transactions are retried in camo mode. + +In failover mode, if `--retry` is specified, then in-flight transactions are retried. In +this scenario there's no way to find the status of in-flight transactions. 
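+
+For example, to exercise the failover path with retries, a run might look like the
+following sketch. The hosts, ports, and the `bdrdemo` database name mirror the CAMO
+example above and are placeholders for your own cluster:
+
+```sh
+ pgbench -m failover --retry -p $node1_port -h $node1_host bdrdemo \
+   "host=$node2_host user=postgres port=$node2_port dbname=bdrdemo"
+```
+
+As in the CAMO example, the second argument is the DSN of the peer node that pgbench
+switches to if the connection to the first node is lost.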
+ +### isolationtester with multi-node access + +isolationtester was extended to allow users to run tests on multiple +sessions and on multiple nodes. This is used for internal BDR testing, +although it's also available for use with user application testing. + +``` +$ isolationtester \ + --outputdir=./iso_output \ + --create-role=logical \ + --dbname=postgres \ + --server 'd1=dbname=node1' \ + --server 'd2=dbname=node2' \ + --server 'd3=dbname=node3' +``` + +Isolation tests are a set of tests for examining concurrent behaviors in +PostgreSQL. These tests require running multiple interacting transactions, +which requires managing multiple concurrent connections and therefore +can't be tested using the normal `pg_regress` program. The name "isolation" +comes from the fact that the original motivation was to test the +serializable isolation level. Tests for other sorts of concurrent +behaviors were added as well. + +It's built using PGXS as an external module. +On installation, it creates the `isolationtester` binary file, which is run by +`pg_isolation_regress` to perform concurrent regression tests and observe +results. + +`pg_isolation_regress` is a tool similar to `pg_regress`, but instead of using +psql to execute a test, it uses isolationtester. It accepts all the same +command-line arguments as `pg_regress`. It was modified to accept multiple +hosts as parameters. It then passes the host conninfo along with server names +to the `isolationtester` binary. Isolation tester compares these server names with the +names specified in each session in the spec files and runs given tests on +respective servers. + +To define tests with overlapping transactions, we use test specification +files with a custom syntax. To add +a new test, place a spec file in the `specs/` subdirectory, add the expected +output in the `expected/` subdirectory, and add the test's name to the makefile. + +Isolationtester is a program that uses libpq to open multiple connections +and executes a test specified by a spec file. A libpq connection string +specifies the server and database to connect to. Defaults derived from +environment variables are used otherwise. + +Specification consists of five parts, tested in this order: + +`server ""` + + This defines the name of the servers for the sessions to run on. + There can be zero or more server `""` specifications. + The conninfo corresponding to the names is provided by the command to + run isolationtester. This is described in `quickstart_isolationtest.md`. + This part is optional. + +`setup { }` + + The given SQL block is executed once, in one session only, before running + the test. Create any test tables or other required objects here. This + part is optional. Multiple setup blocks are allowed if needed. Each is + run separately, in the given order. The reason for allowing multiple + setup blocks is that each block is run as a single PQexec submission, + and some statements such as VACUUM can't be combined with others in such + a block. + +`teardown { }` + + The teardown SQL block is executed once after the test is finished. Use + this to clean up in preparation for the next permutation, such as dropping + any test tables created by setup. This part is optional. + +`session ""` + + There are normally several "session" parts in a spec file. Each + session is executed in its own connection. A session part consists + of three parts: setup, teardown, and one or more "steps." 
The per-session + setup and teardown parts have the same syntax as the per-test setup and + teardown, but they are executed in each session. The + setup part typically contains a BEGIN command to begin a transaction. + + A session part also includes a `connect_to` specification. + This points to a server name specified in the beginning that + indicates the server on which this session runs. + + `connect_to "<servername>"` + + Each step has the syntax: + + `step "<name>" { <SQL> }` + + where `<name>` is a name identifying this step, and `<SQL>` is a SQL statement + (or statements, separated by semicolons) that's executed in the step. + Step names must be unique across the whole spec file. + +`permutation "<step name>"` + + A permutation line specifies a list of steps that are run in that order. + Any number of permutation lines can appear. If no permutation lines are + given, the test program automatically generates all possible orderings + of the steps from each session (running the steps of any one session in + order). The list of steps in a manually specified + "permutation" line doesn't actually have to be a permutation of the + available steps. It can, for instance, repeat some steps more than once + or leave others out. + +Lines beginning with a # are comments. + +For each permutation of the session steps (whether these are manually +specified in the spec file or automatically generated), the isolation +tester runs: + +1. The main setup part +1. Per-session setup parts +1. The selected session steps +1. Per-session teardown +1. The main teardown script + +Each selected step is sent to the connection associated +with its session. + +To run isolation tests in a BDR environment that ran all prerequisite make +commands: + +1. Run `make isolationcheck-install` to install the isolationtester submodule. + +2. You can run isolation regression tests using either + of the following commands from the bdr-private repo: + + `make isolationcheck-installcheck` + `make isolationcheck-makecheck` + +To run `isolationcheck-installcheck`, you need to have two or more PostgreSQL +servers running. Pass the conninfo of each server to `pg_isolation_regress` +in the BDR makefile. + Ex: `pg_isolation_regress --server 'd1=host=myhost dbname=mydb port=5434' + --server 'd2=host=myhost1 dbname=mydb port=5432'` + +Next, add a `.spec` file containing tests in the `specs/isolation` directory +of the `bdr-private/` repo. Add a `.out` file in the `expected/isolation` directory of +the `bdr-private/` repo. + +Then run + `make isolationcheck-installcheck` + +`isolationcheck-makecheck` currently supports running isolation tests on a +single instance by setting up BDR between multiple databases. + +You need to pass appropriate database names and the conninfo of bdr instances +to `pg_isolation_regress` in the BDR makefile as follows: + `pg_isolation_regress --dbname=db1,db2 --server 'd1=dbname=db1' + --server 'd2=dbname=db2'` + +Then run + `make isolationcheck-makecheck` + +Each step can contain commands that block until further action has been taken +(most likely, some other session runs a step that unblocks it or causes a +deadlock). A test that uses this ability must manually specify valid +permutations, that is, those that don't expect a blocked session to execute a +command. If a test doesn't follow that rule, isolationtester cancels it +after 300 seconds. If the cancel doesn't work, isolationtester exits +uncleanly after 375 seconds of wait time.
Avoid testing invalid +permutations because they can make the isolation tests take +a very long time to run, and they serve no useful testing purpose. + +isolationtester recognizes that a command has blocked by checking whether it is shown as waiting in the `pg_locks` view. Therefore, only +blocks on heavyweight locks are detected. + +## Performance testing and tuning + +BDR allows you to issue write transactions onto multiple master nodes. +Bringing those writes back together onto each node has a cost in +performance. + +First, replaying changes from another node has a CPU cost, an I/O cost, +and it generates WAL records. The resource use is usually less +than in the original transaction since CPU overheads are lower as a result +of not needing to reexecute SQL. In the case of UPDATE and DELETE +transactions, there might be I/O costs on replay if data isn't cached. + +Second, replaying changes holds table-level and row-level locks that can +produce contention against local workloads. The conflict-free replicated data types (CRDT) and column-level conflict detection (CLCD) features +ensure you get the correct answers even for concurrent updates, but they +don't remove the normal locking overheads. If you get locking contention, +try to avoid conflicting updates or keep transactions as short as +possible. A heavily updated row in a larger transaction causes +a bottleneck on performance for that transaction. Complex applications +require some thought to maintain scalability. + +If you think you're having performance problems, +develop performance tests using the benchmarking tools. pgbench +allows you to write custom test scripts specific to your use case +so you can understand the overheads of your SQL and measure the impact +of concurrent execution. + +If BDR is running slow, then we suggest the following: + +1. Write a custom test script for pgbench, as close as you can make it + to the production system's problem case. +2. Run the script on one node to give you a baseline figure. +3. Run the script on as many nodes as occurs in production, using the + same number of sessions in total as you did on one node. This + shows you the effect of moving to multiple nodes. +4. Increase the number of sessions for these two tests so you can + plot the effect of increased contention on your application. +5. Make sure your tests are long enough to account for replication delays. +6. Ensure that replication delay isn't growing during your tests. + +Use all of the normal Postgres tuning features to improve the speed +of critical parts of your application. + +## Assessing suitability + +BDR is compatible with PostgreSQL, but not all PostgreSQL applications are +suitable for use on distributed databases. Most applications are already or +can easily be modified to become BDR compliant. You can undertake an +assessment activity in which you can point your application to a BDR-enabled +setup. BDR provides a few knobs that can be set during the assessment period. +These aid in the process of deciding suitability of your application in +a BDR-enabled environment. + +### Assessing updates of primary key/replica identity + +BDR can't currently perform conflict resolution where the PRIMARY KEY is changed +by an UPDATE operation. You can update the primary key, but you must +ensure that no conflict with existing values is possible. + +BDR provides the following configuration parameter to assess how frequently +the primary key/replica identity of any table is being subjected to update +operations. 
+
+Use these configuration parameters only for assessment.
+You can use them on a single-node BDR instance, but don't use them on a production
+EDB Postgres Distributed cluster with two or more nodes replicating to each other. In fact, a node
+might fail to start or a new node fail to join the cluster if any of the
+assessment parameters are set to anything other than `IGNORE`.
+
+```sql
+bdr.assess_update_replica_identity = IGNORE (default) | LOG | WARNING | ERROR
+```
+
+By enabling this parameter during the assessment period, you can log updates to
+the key/replica identity values of a row. You can also potentially block such
+updates, if desired. For example:
+
+```sql
+CREATE TABLE public.test(g int primary key, h int);
+INSERT INTO test VALUES (1, 1);
+
+SET bdr.assess_update_replica_identity TO 'error';
+UPDATE test SET g = 4 WHERE g = 1;
+ERROR: bdr_assess: update of key/replica identity of table public.test
+```
+
+Apply worker processes always ignore any settings for this parameter.
+
+### Assessing use of LOCK on tables or in SELECT queries
+
+Because BDR writer processes operate much like normal user sessions, they're subject to
+the usual rules around row and table locking. This can sometimes lead to BDR writer
+processes waiting on locks held by user transactions or even by each other.
+
+BDR provides the following configuration parameter to assess if the application
+is taking explicit locks:
+
+```sql
+bdr.assess_lock_statement = IGNORE (default) | LOG | WARNING | ERROR
+```
+
+Two types of locks that you can track are:
+
+- Explicit table-level locking (`LOCK TABLE ...`) by user sessions
+- Explicit row-level locking (`SELECT ... FOR UPDATE/FOR SHARE`) by user sessions
+
+By enabling this parameter during the assessment period, you can track (or block) such explicit
+locking activity. For example:
+
+```sql
+CREATE TABLE public.test(g int primary key, h int);
+INSERT INTO test VALUES (1, 1);
+
+SET bdr.assess_lock_statement TO 'error';
+SELECT * FROM test FOR UPDATE;
+ERROR: bdr_assess: "SELECT FOR UPDATE" invoked on a BDR node
+
+SELECT * FROM test FOR SHARE;
+ERROR: bdr_assess: "SELECT FOR SHARE" invoked on a BDR node
+
+SET bdr.assess_lock_statement TO 'warning';
+LOCK TABLE test IN ACCESS SHARE MODE;
+WARNING: bdr_assess: "LOCK STATEMENT" invoked on a BDR node
+```
+
+## Use of table access methods (TAMs) in PGD
+
+PGD 5.0 supports two table access methods released with EDB Postgres 15.0.
+These two table access methods have been certified and are allowed in PGD 5.0:
+
+ * Auto cluster
+ * Ref data
+
+Any other TAM is restricted until certified by EDB.
+If you plan to use either of these table access methods on a table, you must
+configure that TAM on each participating node in the PGD cluster.
+To configure the Auto cluster or Ref data TAM, follow these two steps on each node:
+
+- Update `postgresql.conf` to add `autocluster` or `refdata` to the
+  `shared_preload_libraries` parameter.
+- Restart the server and execute `CREATE EXTENSION autocluster;` or
+  `CREATE EXTENSION refdata;`.
+
+Once the extension is created, you can use the TAM when creating a table, for
+example `CREATE TABLE test (...) USING autocluster;` or
+`CREATE TABLE test (...) USING refdata;`, as shown in the sketch below. The
+`CREATE TABLE` statement replicates to all the PGD nodes as usual.
+For more information on these table access methods, see the EDB documentation.
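+
+As a minimal sketch of that sequence (the table definition and the existing
+contents of `shared_preload_libraries` are hypothetical and depend on your
+installation), the steps on each node might look like this:
+
+```
+# postgresql.conf -- append the TAM to the existing shared_preload_libraries list
+shared_preload_libraries = 'bdr, autocluster'   # or 'bdr, refdata'
+```
+
+```sql
+-- After restarting the server, run on each node:
+CREATE EXTENSION autocluster;      -- or: CREATE EXTENSION refdata;
+
+-- Hypothetical table; the CREATE TABLE statement replicates to the other
+-- PGD nodes as usual.
+CREATE TABLE test (id int PRIMARY KEY, payload text) USING autocluster;
+```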
+
+
+
+
+
+
diff --git a/product_docs/docs/pgd/5/architectures.mdx b/product_docs/docs/pgd/5/architectures.mdx
new file mode 100644
index 00000000000..97decc7d94a
--- /dev/null
+++ b/product_docs/docs/pgd/5/architectures.mdx
@@ -0,0 +1,137 @@
+---
+title: "Choosing your architecture"
+redirects:
+  - /pgd/latest/architectures/bronze/
+  - /pgd/latest/architectures/gold/
+  - /pgd/latest/architectures/platinum/
+  - /pgd/latest/architectures/silver/
+---
+
+Always On architectures reflect EDB’s Trusted Postgres architectures that
+encapsulate practices and help you to achieve the highest possible service
+availability in multiple configurations. These configurations range from
+single-location architectures to complex distributed systems that protect from
+hardware failures and data center failures. The architectures leverage EDB
+Postgres Distributed’s multi-master capability and its ability to achieve
+99.999% availability, even during maintenance operations.
+
+You can use EDB Postgres Distributed for architectures beyond the examples
+described here. Use-case-specific variations have been successfully deployed in
+production. However, these variations must undergo rigorous architecture review
+first. Also, EDB’s standard deployment tool for Always On architectures, TPA,
+must be enabled to support the variations before they can be supported in
+production environments.
+
+## Standard EDB Always On architectures
+
+EDB has identified a set of standardized architectures to support single- or
+multi-location deployments with varying levels of redundancy depending on your
+RPO and RTO requirements.
+
+The Always On architectures use a three-node database node group as a basic
+building block (it's also possible to use a five-node group for extra
+redundancy).
+
+EDB Postgres Distributed consists of the following major building blocks:
+
+- Bi-Directional Replication (BDR) - a Postgres extension that creates the
+  multi-master mesh network
+- PGD-Proxy - a connection router that makes sure the application is connected
+  to the right data nodes.
+
+The Always On architectures protect against an increasing range of failure
+situations. For example, a single active location with 2 data nodes protects
+against local hardware failure but doesn't protect against the failure of the
+location itself (data center or availability zone). Extending that architecture
+with a backup at a different location ensures some protection in case of the
+catastrophic loss of a location, but the database still has to be restored from
+backup first, which might violate recovery time objective (RTO) requirements.
+Adding a second active location connected in a multi-master mesh network
+ensures that service remains available even if a location goes offline.
+Finally, adding a third location (which can be a witness-only location) allows
+global Raft functionality to keep working even if one location goes offline.
+Global Raft is primarily needed to run administrative commands, and some
+features such as DDL or sequence allocation might not work without it. DML
+replication continues to work even in the absence of global Raft.
+
+Each architecture can provide zero recovery point objective (RPO), as data can
+be streamed synchronously to at least one local master, thus guaranteeing zero
+data loss in case of local hardware failure.
+
+Increasing the availability guarantee always drives additional cost for hardware
+and licenses, networking requirements, and operational complexity.
Thus it is +important to carefully consider the availability and compliance requirements +before choosing an architecture. + + +## Architecture details + +By default, application transactions do not require cluster-wide consensus for +DML (selects, inserts, updates, and deletes) allowing for lower latency and +better performance. However, for certain operations such as generating new +global sequences or performing distributed DDL, EDB Postgres Distributed +requires an odd number of nodes to make decisions using a Raft +() based consensus model. Thus, even the simpler +architectures always have three nodes, even if not all of them are storing data. + +Applications connect to the standard Always On architectures via multi-host +connection strings, where each PGD-Proxy server is a distinct entry in the +multi-host connection string. There should always be at least two proxy nodes in +each location to ensure high availability. The proxy can be co-located with the +database instance, in which case it's recommended to put the proxy on every data +node. + +Other connection mechanisms have been successfully deployed in production, but +they are not part of the standard Always On architectures. + +### Always On Single Location + +![Always On 1 Location, 3 Nodes Diagram](images/always-on-1x3.png) + +* Redundant hardware to quickly restore from local failures + * 3 PGD nodes + * could be 3 data nodes (recommended), or 2 data nodes and 1 witness which does not hold data (depicted) + * A PGD-Proxy for each data node with affinity to the applications + * can be co-located with data node +* Barman for backup and recovery (not depicted) + * Offsite is optional, but recommended + * Can be shared by multiple clusters +* Postgres Enterprise Manager (PEM) for monitoring (not depicted) + * Can be shared by multiple clusters + +### Always On Multi-Location + +![Always On 2 Locations, 3 Nodes Per Location, Active/Active Diagram](images/always-on-2x3-aa.png) + +* Application can be Active/Active in each location, or Active/Passive or Active DR with only one location taking writes +* Additional replication between all nodes in Region A and Region B is not shown but occurs as part of the replication mesh +* Redundant hardware to quickly restore from local failures + * 6 PGD nodes total, 3 in each location + * could be 3 data nodes (recommended), or 2 data nodes and 1 witness which does not hold data (depicted) + * A PGD-Proxy for each data node with affinity to the applications + * can be co-located with data node +* Barman for backup and recovery (not depicted) + * Can be shared by multiple clusters +* Postgres Enterprise Manager (PEM) for monitoring (not depicted) + * Can be shared by multiple clusters +* An optional witness node should be placed in a third region to increase tolerance for location failure + * Otherwise, when a location fails, actions requiring global consensus will be blocked such as adding new nodes, distributed DDL, etc. + +## Choosing your architecture + +All architectures provide the following: +* Hardware failure protection +* Zero downtime upgrades +* Support for availability zones in public/private cloud + +Use these criteria to help you to select the appropriate Always On architecture. 
+ +| | Single  Data Location | Two  Data  Locations | Two  Data Locations  + Witness | Three or More Data Locations | +|------------------------------------------------------|----------------------------------------------------------|----------------------------------------------------------|----------------------------------------------------------|----------------------------------------------------------| +| Locations needed | 1 | 2 | 3 | 3 | +| Fast restoration of local HA after data node failure | Yes - if 3 PGD data nodes
No - if 2 PGD data nodes | Yes - if 3 PGD data nodes
No - if 2 PGD data nodes | Yes - if 3 PGD data nodes
No - if 2 PGD data nodes | Yes - if 3 PGD data nodes
No - if 2 PGD data nodes | +| Data protection in case of  location failure | No (unless offsite backup) | Yes | Yes | Yes | +| Global consensus in case of location failure | N/A | No | Yes | Yes | +| Data restore required after location failure | Yes | No | No | No | +| Immediate failover in case of location failure | No - requires data restore from backup | Yes - alternate Location | Yes - alternate Location | Yes - alternate Location | +| Cross Location Network Traffic | Only if backup is offsite | Full replication traffic | Full replication traffic | Full replication traffic | +| License Cost | 2 or 3 PGD data nodes | 4 or 6  PGD data nodes | 4 or 6 PGD data nodes | 6+ PGD data nodes | diff --git a/product_docs/docs/pgd/5/backup.mdx b/product_docs/docs/pgd/5/backup.mdx new file mode 100644 index 00000000000..fd61bdc2c20 --- /dev/null +++ b/product_docs/docs/pgd/5/backup.mdx @@ -0,0 +1,286 @@ +--- +title: Backup and recovery +originalFilePath: backup.md + +--- + +In this chapter we discuss the backup and restore of a EDB Postgres Distributed cluster. + +BDR is designed to be a distributed, highly available system. If +one or more nodes of a cluster are lost, the best way to replace them +is to clone new nodes directly from the remaining nodes. + +The role of backup and recovery in BDR is to provide for Disaster +Recovery (DR), such as in the following situations: + +- Loss of all nodes in the cluster +- Significant, uncorrectable data corruption across multiple nodes + as a result of data corruption, application error or + security breach + +## Backup + +### `pg_dump` + +`pg_dump`, sometimes referred to as "logical backup", can be used +normally with BDR. + +Note that `pg_dump` dumps both local and global sequences as if +they were local sequences. This is intentional, to allow a BDR +schema to be dumped and ported to other PostgreSQL databases. +This means that sequence kind metadata is lost at the time of dump, +so a restore would effectively reset all sequence kinds to +the value of `bdr.default_sequence_kind` at time of restore. + +To create a post-restore script to reset the precise sequence kind +for each sequence, you might want to use an SQL script like this: + +```sql +SELECT 'SELECT bdr.alter_sequence_set_kind('''|| + nspname||'.'||relname||''','''||seqkind||''');' +FROM bdr.sequences +WHERE seqkind != 'local'; +``` + +Note that if `pg_dump` is run using `bdr.crdt_raw_value = on` then the +dump can only be reloaded with `bdr.crdt_raw_value = on`. + +Technical Support recommends the use of physical backup techniques for +backup and recovery of BDR. + +### Physical Backup + +Physical backups of a node in a EDB Postgres Distributed cluster can be taken using +standard PostgreSQL software, such as +[Barman](https://www.enterprisedb.com/docs/supported-open-source/barman/). + +A physical backup of a BDR node can be performed with the same +procedure that applies to any PostgreSQL node: a BDR node is just a +PostgreSQL node running the BDR extension. + +There are some specific points that must be considered when applying +PostgreSQL backup techniques to BDR: + +- BDR operates at the level of a single database, while a physical + backup includes all the databases in the instance; you should plan + your databases to allow them to be easily backed-up and restored. + +- Backups will make a copy of just one node. In the simplest case, + every node has a copy of all data, so you would need to backup only + one node to capture all data. 
However, the goal of BDR will not be + met if the site containing that single copy goes down, so the + minimum should be at least one node backup per site (obviously with + many copies etc.). + +- However, each node may have un-replicated local data, and/or the + definition of replication sets may be complex so that all nodes do + not subscribe to all replication sets. In these cases, backup + planning must also include plans for how to backup any unreplicated + local data and a backup of at least one node that subscribes to each + replication set. + +### Eventual Consistency + +The nodes in a EDB Postgres Distributed cluster are *eventually consistent*, but not +entirely *consistent*; a physical backup of a given node will +provide Point-In-Time Recovery capabilities limited to the states +actually assumed by that node (see the [Example] below). + +The following example shows how two nodes in the same EDB Postgres Distributed cluster might not +(and usually do not) go through the same sequence of states. + +Consider a cluster with two nodes `N1` and `N2`, which is initially in +state `S`. If transaction `W1` is applied to node `N1`, and at the same +time a non-conflicting transaction `W2` is applied to node `N2`, then +node `N1` will go through the following states: + +``` +(N1) S --> S + W1 --> S + W1 + W2 +``` + +...while node `N2` will go through the following states: + +``` +(N2) S --> S + W2 --> S + W1 + W2 +``` + +That is: node `N1` will *never* assume state `S + W2`, and node `N2` +likewise will never assume state `S + W1`, but both nodes will end up +in the same state `S + W1 + W2`. Considering this situation might affect how +you decide upon your backup strategy. + +### Point-In-Time Recovery (PITR) + +In the example above, the changes are also inconsistent in time, since +`W1` and `W2` both occur at time `T1`, but the change `W1` is not +applied to `N2` until `T2`. + +PostgreSQL PITR is designed around the assumption of changes arriving +from a single master in COMMIT order. Thus, PITR is possible by simply +scanning through changes until one particular point-in-time (PIT) is reached. +With this scheme, you can restore one node to a single point-in-time +from its viewpoint, e.g. `T1`, but that state would not include other +data from other nodes that had committed near that time but had not yet +arrived on the node. As a result, the recovery might be considered to +be partially inconsistent, or at least consistent for only one +replication origin. + +To request this, use the standard syntax: + +``` +recovery_target_time = T1 +``` + +BDR allows for changes from multiple masters, all recorded within the +WAL log for one node, separately identified using replication origin +identifiers. + +BDR allows PITR of all or some replication origins to a specific point in time, +providing a fully consistent viewpoint across all subsets of nodes. + +Thus for multi-origins, we view the WAL stream as containing multiple +streams all mixed up into one larger stream. There is still just one PIT, +but that will be reached as different points for each origin separately. + +We read the WAL stream until requested origins have found their PIT. We +apply all changes up until that point, except that we do not mark as committed +any transaction records for an origin after the PIT on that origin has been +reached. + +We end up with one LSN "stopping point" in WAL, but we also have one single +timestamp applied consistently, just as we do with "single origin PITR". 
+ +Once we have reached the defined PIT, a later one may also be set to allow +the recovery to continue, as needed. + +After the desired stopping point has been reached, if the recovered server +will be promoted, shut it down first and move the LSN forwards using +`pg_resetwal` to an LSN value higher than used on any timeline on this server. +This ensures that there will be no duplicate LSNs produced by logical +decoding. + +In the specific example above, `N1` would be restored to `T1`, but +would also include changes from other nodes that have been committed +by `T1`, even though they were not applied on `N1` until later. + +To request multi-origin PITR, use the standard syntax in +the recovery.conf file: + +``` +recovery_target_time = T1 +``` + +The list of replication origins which would be restored to `T1` need either +to be specified in a separate multi_recovery.conf file via the use of +a new parameter `recovery_target_origins`: + +``` +recovery_target_origins = '*' +``` + +...or one can specify the origin subset as a list in `recovery_target_origins`. + +``` +recovery_target_origins = '1,3' +``` + +Note that the local WAL activity recovery to the specified +`recovery_target_time` is always performed implicitly. For origins +that are not specified in `recovery_target_origins`, recovery may +stop at any point, depending on when the target for the list +mentioned in `recovery_target_origins` is achieved. + +In the absence of the `multi_recovery.conf` file, the recovery defaults +to the original PostgreSQL PITR behaviour that is designed around the assumption +of changes arriving from a single master in COMMIT order. + +!!! Note + This is feature is only available on EDB Postgres Extended and + Barman does not currently automatically create a `multi_recovery.conf` file. + +## Restore + +While you can take a physical backup with the same procedure as a +standard PostgreSQL node, what is slightly more complex is +**restoring** the physical backup of a BDR node. + +### EDB Postgres Distributed Cluster Failure or Seeding a New Cluster from a Backup + +The most common use case for restoring a physical backup involves the failure +or replacement of all the BDR nodes in a cluster, for instance in the event of +a datacentre failure. + +You may also want to perform this procedure to clone the current contents of a +EDB Postgres Distributed cluster to seed a QA or development instance. + +In that case, BDR capabilities can be restored based on a physical backup +of a single BDR node, optionally plus WAL archives: + +- If you still have some BDR nodes live and running, fence off the host you + restored the BDR node to, so it cannot connect to any surviving BDR nodes. + This ensures that the new node does not confuse the existing cluster. +- Restore a single PostgreSQL node from a physical backup of one of + the BDR nodes. +- If you have WAL archives associated with the backup, create a suitable + `recovery.conf` and start PostgreSQL in recovery to replay up to the latest + state. You can specify a alternative `recovery_target` here if needed. +- Start the restored node, or promote it to read/write if it was in standby + recovery. Keep it fenced from any surviving nodes! +- Clean up any leftover BDR metadata that was included in the physical backup, + as described below. +- Fully stop and restart the PostgreSQL instance. +- Add further BDR nodes with the standard procedure based on the + `bdr.join_node_group()` function call. 
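+
+For the recovery step in the list above (creating a suitable `recovery.conf`
+and replaying WAL), a sketch of the settings might look like the following.
+The paths and timestamp are hypothetical and depend on how the backup was
+taken; on recent PostgreSQL versions these parameters live in
+`postgresql.conf`/`postgresql.auto.conf` together with a `recovery.signal`
+file rather than in `recovery.conf`:
+
+```
+restore_command = 'cp /path/to/wal_archive/%f %p'   # or a barman-wal-restore invocation
+recovery_target_time = '2023-01-31 12:00:00+00'     # optional alternative recovery target
+recovery_target_action = 'pause'
+```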
+ +#### Cleanup BDR Metadata + +The cleaning of leftover BDR metadata is achieved as follows: + +1. Drop the BDR node using `bdr.drop_node` +2. Fully stop and re-start PostgreSQL (important!). + +#### Cleanup of Replication Origins + +Replication origins must be explicitly removed with a separate step +because they are recorded persistently in a system catalog, and +therefore included in the backup and in the restored instance. They +are not removed automatically when dropping the BDR extension, because +they are not explicitly recorded as its dependencies. + +BDR creates one replication origin for each remote master node, to +track progress of incoming replication in a crash-safe way. Therefore +we need to run: + +``` +SELECT pg_replication_origin_drop('bdr_dbname_grpname_nodename'); +``` + +...once for each node in the (previous) cluster. Replication origins can +be listed as follows: + +``` +SELECT * FROM pg_replication_origin; +``` + +...and those created by BDR are easily recognized by their name, as in +the example shown above. + +#### Cleanup of Replication Slots + +If a physical backup was created with `pg_basebackup`, replication slots +will be omitted from the backup. + +Some other backup methods may preserve replications slots, likely in +outdated or invalid states. Once you restore the backup, just: + +``` +SELECT pg_drop_replication_slot(slot_name) +FROM pg_replication_slots; +``` + +...to drop *all* replication slots. If you have a reason to preserve some, +you can add a `WHERE slot_name LIKE 'bdr%'` clause, but this is rarely +useful. + +!!! Warning + Never run this on a live BDR node. diff --git a/product_docs/docs/pgd/5/catalogs.mdx b/product_docs/docs/pgd/5/catalogs.mdx new file mode 100644 index 00000000000..dcc1f6d4fad --- /dev/null +++ b/product_docs/docs/pgd/5/catalogs.mdx @@ -0,0 +1,1384 @@ +--- +title: Catalogs and views +redirects: + - bdr/catalogs + +--- +Catalogs and views are presented here in alphabetical order. + +## User-visible catalogs and views + +### `bdr.conflict_history` + +This table is the default table where conflicts are logged. The table is +RANGE partitioned on column `local_time` and is managed by Autopartition. +The default data retention period is 30 days. + +Access to this table is possible by any table owner, who can see all +conflicts for the tables they own, restricted by row-level security. + +For details, see [Logging conflicts to a table](consistency/conflicts). 
+ +#### `bdr.conflict_history` columns + +| Name | Type | Description | +| ----------------------- | ------------------------ | ------------------------------------------------------------------------------------------------------------------------ | +| sub_id | oid | Which subscription produced this conflict; can be joined to `bdr.subscription` table | +| local_xid | xid | Local transaction of the replication process at the time of conflict | +| local_lsn | pg_lsn | Local LSN at the time of conflict | +| local_time | timestamp with time zone | Local time of the conflict | +| remote_xid | xid | Transaction that produced the conflicting change on the remote node (an origin) | +| remote_commit_lsn | pg_lsn | Commit LSN of the transaction which produced the conflicting change on the remote node (an origin) | +| remote_commit_time | timestamp with time zone | Commit timestamp of the transaction that produced the conflicting change on the remote node (an origin) | +| conflict_type | text | Detected type of the conflict | +| conflict_resolution | text | Conflict resolution chosen | +| conflict_index | regclass | Conflicting index (valid only if the index wasn't dropped since) | +| reloid | oid | Conflicting relation (valid only if the index wasn't dropped since) | +| nspname | text | Name of the schema for the relation on which the conflict has occurred at the time of conflict (doesn't follow renames) | +| relname | text | Name of the relation on which the conflict has occurred at the time of conflict (does not follow renames) | +| key_tuple | json | Json representation of the key used for matching the row | +| remote_tuple | json | Json representation of an incoming conflicting row | +| local_tuple | json | Json representation of the local conflicting row | +| apply_tuple | json | Json representation of the resulting (the one that has been applied) row | +| local_tuple_xmin | xid | Transaction that produced the local conflicting row (if `local_tuple` is set and the row isn't frozen) | +| local_tuple_node_id | oid | Node that produced the local conflicting row (if `local_tuple` is set and the row isn't frozen) | +| local_tuple_commit_time | timestamp with time zone | Last known change timestamp of the local conflicting row (if `local_tuple` is set and the row isn't frozen) | + +### `bdr.conflict_history_summary` + +A view containing user-readable details on row conflict. + +#### `bdr.conflict_history_summary` columns + +| Name | Type | Description | +| ----------------------- | ------------------------ | -------------------------- | +| nspname | text | Name of the schema | +| relname | text | Name of the table | +| local_time | timestamp with time zone | Local time of the conflict | +| local_tuple_commit_time | timestamp with time zone | Time of local commit | +| remote_commit_time | timestamp with time zone | Time of remote commit | +| conflict_type | text | Type of conflict | +| conflict_resolution | text | Resolution adopted | + +### `bdr.consensus_kv_data` + +A persistent storage for the internal Raft-based KV store used by +`bdr.consensus_kv_store()` and `bdr.consensus_kv_fetch()` interfaces. 
+ +#### `bdr.consensus_kv_data` Columns + +| Name | Type | Description | +| ------------ | ----------- | ------------------------------------------------ | +| kv_key | text | Unique key | +| kv_val | json | Arbitrary value in json format | +| kv_create_ts | timestamptz | Last write timestamp | +| kv_ttl | int | Time to live for the value in milliseconds | +| kv_expire_ts | timestamptz | Expiration timestamp (`kv_create_ts` + `kv_ttl`) | + +### `bdr.camo_decision_journal` + +A persistent journal of decisions resolved by a CAMO partner node +after a failover, in case `bdr.logical_transaction_status` was +invoked. Unlike `bdr.node_pre_commit`, this doesn't cover +transactions processed under normal operational conditions (i.e., both +nodes of a CAMO pair are running and connected). Entries in this journal +aren't ever cleaned up automatically. This is a diagnostic +tool that the system doesn't depend on. + +#### `bdr.camo_decision_journal` columns + +| Name | Type | Description | +| -------------- | ----------- | ---------------------------------------------- | +| origin_node_id | oid | OID of the node where the transaction executed | +| origin_xid | oid | Transaction ID on the remote origin node | +| decision | char | 'c' for commit, 'a' for abort | +| decision_ts | timestamptz | Decision time | + +### `bdr.crdt_handlers` + +This table lists merge ("handlers") functions for all CRDT data types. + +#### `bdr.crdt_handlers` Columns + +| Name | Type | Description | +| ------------- | ------- | --------------------------------- | +| crdt_type_id | regtype | CRDT data type ID | +| crdt_merge_id | regproc | Merge function for this data type | + +### `bdr.ddl_replication` + +This view lists DDL replication configuration as set up by current [DDL filters](repsets#ddl-replication-filtering). + +#### `bdr.ddl_replication` columns + +| Name | Type | Description | +| ------------ | ---- | ------------------------------------------------------------ | +| set_ddl_name | name | Name of DDL filter | +| set_ddl_tag | text | The command tags it applies on (regular expression) | +| set_ddl_role | text | The roles it applies to (regular expression) | +| set_name | name | Name of the replication set for which this filter is defined | + +### `bdr.depend` + +This table tracks internal object dependencies inside BDR catalogs. + +### `bdr.global_consensus_journal` + +This catalog table logs all the Raft messages that were sent while +managing global consensus. + +As for the `bdr.global_consensus_response_journal` catalog, the +payload is stored in a binary encoded format, which can be decoded +with the `bdr.decode_message_payload()` function. See the +[`bdr.global_consensus_journal_details`](#bdrglobal_consensus_journal_details) view for more details. + +#### `bdr.global_consensus_journal` columns + +| Name | Type | Description | +| ------------- | ----- | --------------------------------------- | +| log_index | int8 | ID of the journal entry | +| term | int8 | Raft term | +| origin | oid | ID of node where the request originated | +| req_id | int8 | ID for the request | +| req_payload | bytea | Payload for the request | +| trace_context | bytea | Trace context for the request | + +### `bdr.global_consensus_journal_details` + +This view presents Raft messages that were sent and the corresponding +responses, using the `bdr.decode_message_payload()` function to decode +their payloads. 
+ +#### `bdr.global_consensus_journal_details` columns + +| Name | Type | Description | +| ------------------------ | ----- | --------------------------------------------- | +| node_group_name | name | Name of the node group | +| log_index | int8 | ID of the journal entry | +| term | int8 | Raft term | +| request_id | int8 | ID of the request | +| origin_id | oid | ID of the node where the request originated | +| req_payload | bytea | Payload of the request | +| origin_node_name | name | Name of the node where the request originated | +| message_type_no | oid | ID of the BDR message type for the request | +| message_type | text | Name of the BDR message type for the request | +| message_payload | text | BDR message payload for the request | +| response_message_type_no | oid | ID of the BDR message type for the response | +| response_message_type | text | Name of the BDR message type for the response | +| response_payload | text | BDR message payload for the response | +| response_errcode_no | text | SQLSTATE for the response | +| response_errcode | text | Error code for the response | +| response_message | text | Error message for the response | + +### `bdr.global_consensus_response_journal` + +This catalog table collects all the responses to the Raft messages +that were received while managing global consensus. + +As for the `bdr.global_consensus_journal` catalog, the payload is +stored in a binary-encoded format, which can be decoded with the +`bdr.decode_message_payload()` function. See the +[`bdr.global_consensus_journal_details`](#bdrglobal_consensus_journal_details) view for more details. + +#### `bdr.global_consensus_response_journal` columns + +| Name | Type | Description | +| ------------- | ----- | ------------------------------ | +| log_index | int8 | ID of the journal entry | +| res_status | oid | Status code for the response | +| res_payload | bytea | Payload for the response | +| trace_context | bytea | Trace context for the response | + +### `bdr.global_lock` + +This catalog table stores the information needed for recovering the +global lock state on server restart. + +For monitoring usage, the +[`bdr.global_locks`](#bdrglobal_locks) view is preferable because the visible rows +in `bdr.global_lock` don't necessarily reflect all global locking activity. + +Don't modify the contents of this table. It is an important BDR catalog. + +#### `bdr.global_lock` columns + +| Name | Type | Description | +| -------------- | ------- | ---------------------------------------------------------------------------- | +| ddl_epoch | int8 | DDL epoch for the lock | +| origin_node_id | oid | OID of the node where the global lock has originated | +| lock_type | oid | Type of the lock (DDL or DML) | +| nspname | name | Schema name for the locked relation | +| relname | name | Relation name for the locked relation | +| groupid | oid | OID of the top level group (for Advisory locks) | +| key1 | integer | First 32-bit key or lower order 32-bits of 64-bit key (for advisory locks) | +| key2 | integer | Second 32-bit key or higher order 32-bits of 64-bit key (for advisory locks) | +| key_is_bigint | boolean | True if 64-bit integer key is used (for advisory locks) | + +### `bdr.global_locks` + +A view containing active global locks on this node. The [`bdr.global_locks`](#bdrglobal_locks) view +exposes BDR's shared-memory lock state tracking, giving administrators greater +insight into BDR's global locking activity and progress. 
+ +See [Monitoring global locks](monitoring#monitoring-global-locks) +for more information about global locking. + +#### `bdr.global_locks` columns + +| Name | Type | Description | +| -------------------------- | ----------- | ----------------------------------------------------------------- | +| `origin_node_id` | oid | The OID of the node where the global lock has originated | +| `origin_node_name` | name | Name of the node where the global lock has originated | +| `lock_type` | text | Type of the lock (DDL or DML) | +| `relation` | text | Locked relation name (for DML locks) or keys (for advisory locks) | +| `pid` | int4 | PID of the process holding the lock | +| `acquire_stage` | text | Internal state of the lock acquisition process | +| `waiters` | int4 | List of backends waiting for the same global lock | +| `global_lock_request_time` | timestamptz | Time this global lock acquire was initiated by origin node | +| `local_lock_request_time` | timestamptz | Time the local node started trying to acquire the local lock | +| `last_state_change_time` | timestamptz | Time `acquire_stage` last changed | + +Column details: + +- `relation`: For DML locks, `relation` shows the relation on which the DML + lock is acquired. For global advisory locks, `relation` column actually shows + the two 32-bit integers or one 64-bit integer on which the lock is acquired. + +- `origin_node_id` and `origin_node_name`: If these are the same as the local + node's ID and name, then the local node is the initiator of the global DDL + lock, i.e., it is the node running the acquiring transaction. If these fields + specify a different node, then the local node is instead trying to acquire its + local DDL lock to satisfy a global DDL lock request from a remote node. + +- `pid`: The process ID of the process that requested the global DDL lock, + if the local node is the requesting node. Null on other nodes. Query the + origin node to determine the locker pid. + +- `global_lock_request_time`: The timestamp at which the global-lock request + initiator started the process of acquiring a global lock. Can be null if + unknown on the current node. This time is stamped at the beginning + of the DDL lock request and includes the time taken for DDL epoch management + and any required flushes of pending-replication queues. Currently only + known on origin node. + +- `local_lock_request_time`: The timestamp at which the local node started + trying to acquire the local lock for this global lock. This includes the + time taken for the heavyweight session lock acquire but doesn't include + any time taken on DDL epochs or queue flushing. If the lock is reacquired + after local node restart, it becomes the node restart time. + +- `last_state_change_time`: The timestamp at which the + `bdr.global_locks.acquire_stage` field last changed for this global lock + entry. + +### `bdr.local_consensus_snapshot` + +This catalog table contains consensus snapshots created or received by +the local node. + +#### `bdr.local_consensus_snapshot` columns + +| Name | Type | Description | +| --------- | ----- | ----------------------- | +| log_index | int8 | ID of the journal entry | +| log_term | int8 | Raft term | +| snapshot | bytea | Raft snapshot data | + +### `bdr.local_consensus_state` + +This catalog table stores the current state of Raft on the local node. 
+ +#### `bdr.local_consensus_state` columns + +| Name | Type | Description | +| ----------------- | ---- | ----------------------------------- | +| node_id | oid | ID of the node | +| current_term | int8 | Raft term | +| apply_index | int8 | Raft apply index | +| voted_for | oid | Vote cast by this node in this term | +| last_known_leader | oid | node_id of last known Raft leader | + +### `bdr.local_node` + +This table identifies the local node in the current database of the current Postgres instance. + +#### `bdr.local_node` columns + +| Name | Type | Description | +| ----------- | ------- | --------------------------- | +| node_id | oid | ID of the node | +| pub_repsets | text\[] | Published replication sets | +| sub_repsets | text\[] | Subscribed replication sets | + +### `bdr.local_node_summary` + +A view containing the same information as [`bdr.node_summary`](#bdrnode_summary) but only for the +local node. + +### `bdr.local_sync_status` + +Information about status of either subscription or table synchronization process. + +#### `bdr.local_sync_status` columns + +| Name | Type | Description | +| ----------------- | ------ | -------------------------------------------------------- | +| sync_kind | char | The kind of synchronization done | +| sync_subid | oid | ID of subscription doing the synchronization | +| sync_nspname | name | Schema name of the synchronized table (if any) | +| sync_relname | name | Name of the synchronized table (if any) | +| sync_status | char | Current state of the synchronization | +| sync_remote_relid | oid | ID of the synchronized table (if any) on the upstream | +| sync_end_lsn | pg_lsn | Position at which the synchronization state last changed | + +### `bdr.network_path_info` + +A catalog view that stores user-defined information on network costs between node locations. + +#### `bdr.network_path_info` columns + +| Name | Type | Description | +| --------------- | ------- | ------------------------------------------ | +| node_group_name | name | Name of the BDR group | +| node_region1 | text | Node region name, from bdr.node_location | +| node_region2 | text | Node region name, from bdr.node_location | +| node_location1 | text | Node location name, from bdr.node_location | +| node_location2 | text | Node location name, from bdr.node_location | +| network_cost | numeric | Node location name, from bdr.node_location | + +### `bdr.node` + +This table lists all the BDR nodes in the cluster. + +#### `bdr.node` columns + +| Name | Type | Description | +| --------------------- | ------- | --------------------------------------------------------------------------- | +| node_id | oid | ID of the node | +| node_name | name | Name of the node | +| node_group_id | oid | ID of the node group | +| source_node_id | oid | ID of the source node | +| synchronize_structure | "char" | Schema synchronization done during the join | +| node_state | oid | Consistent state of the node | +| target_state | oid | State that the node is trying to reach (during join or promotion) | +| seq_id | int4 | Sequence identifier of the node used for generating unique sequence numbers | +| dbname | name | Database name of the node | +| node_dsn | char | Connection string for the node | +| proto_version_ranges | int\[] | Supported protocol version ranges by the node | +| node_join_finished | boolean | Check if the join is finished | + +### `bdr.node_catchup_info` + +This catalog table records relevant catchup information on each node, either +if it is related to the join or part procedure. 
+ +#### `bdr.node_catchup_info` columns + +| Name | Type | Description | +| -------------- | ------ | -------------------------------------------------------------------------- | +| node_id | oid | ID of the node | +| node_source_id | oid | ID of the node used as source for the data | +| slot_name | name | Slot used for this source | +| min_node_lsn | pg_lsn | Minimum LSN at which the node can switch to direct replay from a peer node | +| catchup_state | oid | Status code of the catchup state | +| origin_node_id | oid | ID of the node from which we want transactions | + +If a node(node_id) needs missing data from a parting node(origin_node_id), +it can get it from a node that already has it(node_source_id) by forwarding. +The records in this table persists until the node(node_id) is a member of +the EDB Postgres Distributed cluster. + +### `bdr.node_conflict_resolvers` + +Currently configured conflict resolution for all known conflict types. + +#### `bdr.node_conflict_resolvers` columns + +| Name | Type | Description | +| ----------------- | ---- | ------------------------------------ | +| conflict_type | text | Type of the conflict | +| conflict_resolver | text | Resolver used for this conflict type | + +### `bdr.node_group` + +This catalog table lists all the BDR node groups. + +#### `bdr.node_group` columns + +| Name | Type | Description | +| ------------------------------- | -------- | --------------------------------------------------------------------------------------------- | +| node_group_id | oid | ID of the node group | +| node_group_name | name | Name of the node group | +| node_group_default_repset | oid | Default replication set for this node group | +| node_group_default_repset_ext | oid | Default replication set for this node group | +| node_group_parent_id | oid | ID of parent group (0 if this is a root group) | +| node_group_flags | int | The group flags | +| node_group_uuid | uuid | The uuid of the group | +| node_group_apply_delay | interval | How long a subscriber waits before applying changes from the provider | +| node_group_check_constraints | bool | Whether the apply process checks constraints when applying data | +| node_group_num_writers | int | Number of writers to use for subscriptions backing this node group | +| node_group_enable_wal_decoder | bool | Whether the group has enable_wal_decoder set | +| node_group_streaming_mode | char | Transaction streaming setting: 'O' - off, 'F' - file, 'W' - writer, 'A' - auto, 'D' - default | +| node_group_location | char | Transaction streaming setting: 'O' - off, 'F' - file, 'W' - writer, 'A' - auto, 'D' - default | +| node_group_enable_proxy_routing | char | Transaction streaming setting: 'O' - off, 'F' - file, 'W' - writer, 'A' - auto, 'D' - default | + +### `bdr.node_group_replication_sets` + +A view showing default replication sets create for BDR groups. See also +`bdr.replication_sets`. 
+ +#### `bdr.node_group_replication_sets` columns + +| Name | Type | Description | +| ------------------ | ------- | ------------------------------------------------------------------------------------ | +| node_group_name | name | Name of the BDR group | +| def_repset | name | Name of the default repset | +| def_repset_ops | text\[] | Actions replicated by the default repset | +| def_repset_ext | name | Name of the default "external" repset (usually same as def_repset) | +| def_repset_ext_ops | text\[] | Actions replicated by the default "external" repset (usually same as def_repset_ops) | + +### `bdr.node_local_info` + +A catalog table used to store per-node configuration that's specific to the +local node (as opposed to global view of per-node configuration). + +#### `bdr.node_local_info` columns + +| Name | Type | Description | +| ------------- | ---- | ----------------------------------------------------------------------- | +| node_id | oid | The OID of the node (including the local node) | +| applied_state | oid | Internal ID of the node state | +| ddl_epoch | int8 | Last epoch number processed by the node | +| slot_name | name | Name of the slot used to connect to that node (NULL for the local node) | + +### `bdr.node_location` + +A catalog view that stores user-defined information on node locations. + +#### `bdr.node_location` Columns + +| Name | Type | Description | +| --------------- | ---- | --------------------------- | +| node_group_name | name | Name of the BDR group | +| node_id | oid | ID of the node | +| node_region | text | User-supplied region name | +| node_location | text | User-supplied location name | + +### `bdr.node_log_config` + +A catalog view that stores information on the conflict logging configurations. + +#### `bdr.node_log_config` columns + +| Name | Description | +| ----------------- | --------------------------------------------------------- | +| log_name | Name of the logging configuration | +| log_to_file | Whether it logs to the server log file | +| log_to_table | Whether it logs to a table, and which table is the target | +| log_conflict_type | Which conflict types it logs, if NULL means all | +| log_conflict_res | Which conflict resolutions it logs, if NULL means all | + +### `bdr.node_peer_progress` + +Catalog used to keep track of every node's progress in the replication stream. +Every node in the cluster regularly broadcasts its progress every +`bdr.replay_progress_frequency` milliseconds to all other nodes (default +is 60000 ms, i.e., 1 minute). Expect N \* (N-1) rows in this relation. + +You might be more interested in the `bdr.node_slots` view for monitoring +purposes. See also [Monitoring](monitoring). 
+ +#### `bdr.node_peer_progress` columns + +| Name | Type | Description | +| ----------------------- | ----------- | ------------------------------------------------------------------------------------ | +| node_id | oid | The OID of the originating node that reported this position info | +| peer_node_id | oid | The OID of the node's peer (remote node) for which this position info was reported | +| last_update_sent_time | timestamptz | The time at which the report was sent by the originating node | +| last_update_recv_time | timestamptz | The time at which the report was received by the local server | +| last_update_node_lsn | pg_lsn | LSN on the originating node at the time of the report | +| peer_position | pg_lsn | Latest LSN of the node's peer seen by the originating node | +| peer_replay_time | timestamptz | Latest replay time of peer seen by the reporting node | +| last_update_horizon_xid | oid | Internal resolution horizon: all lower xids are known resolved on the reporting node | +| last_update_horizon_lsn | pg_lsn | Internal resolution horizon: same in terms of an LSN of the reporting node | + +### `bdr.node_pre_commit` + +Used internally on a node configured as a Commit At Most Once (CAMO) +partner. Shows the decisions a CAMO partner took on transactions in +the last 15 minutes. + +#### `bdr.node_pre_commit` columns + +| Name | Type | Description | +| -------------- | ----------- | ---------------------------------------------- | +| origin_node_id | oid | OID of the node where the transaction executed | +| origin_xid | oid | Transaction ID on the remote origin node | +| decision | char | 'c' for commit, 'a' for abort | +| local_xid | xid | Transaction ID on the local node | +| commit_ts | timestamptz | Commit timestamp of the transaction | +| decision_ts | timestamptz | Decision time | + +### `bdr.node_replication_rates` + +This view contains information about outgoing replication activity from a +given node. + +#### `bdr.node_replication_rates` columns + +| Column | Type | Description | +| ---------------- | -------- | ---------------------------------------------------------------------------------------------------- | +| peer_node_id | oid | The OID of node's peer (remote node) for which this info was reported | +| target_name | name | Name of the target peer node | +| sent_lsn | pg_lsn | Latest sent position | +| replay_lsn | pg_lsn | Latest position reported as replayed (visible) | +| replay_lag | interval | Approximate lag time for reported replay | +| replay_lag_bytes | int8 | Bytes difference between replay_lsn and current WAL write position on origin | +| replay_lag_size | text | Human-readable bytes difference between replay_lsn and current WAL write position | +| apply_rate | bigint | LSNs being applied per second at the peer node | +| catchup_interval | interval | Approximate time required for the peer node to catch up to all the changes that are yet to be applied | + +!!! Note + The `replay_lag` is set immediately to zero after reconnect. + As a workaround, use `replay_lag_bytes`, `replay_lag_size`, or + `catchup_interval`. + +### `bdr.node_slots` + +This view contains information about replication slots used in the current +database by BDR. + +See [Monitoring outgoing replication](monitoring#monitoring-outgoing-replication) +for guidance on the use and interpretation of this view's fields. 
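+
+As an illustrative query only (using a subset of the columns documented in the
+table that follows), you might check the state and lag of outgoing replication
+with something like:
+
+```sql
+-- Hypothetical monitoring query; adjust the column list to your needs.
+SELECT origin_name, target_name, state, replay_lag, replay_lag_size
+FROM bdr.node_slots;
+```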
+ +#### `bdr.node_slots` columns + +| Name | Type | Description | +| ------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------- | +| target_dbname | name | Database name on the target node | +| node_group_name | name | Name of the BDR group | +| node_group_id | oid | The OID of the BDR group | +| origin_name | name | Name of the origin node | +| target_name | name | Name of the target node | +| origin_id | oid | The OID of the origin node | +| target_id | oid | The OID of the target node | +| local_slot_name | name | Name of the replication slot according to BDR | +| slot_name | name | Name of the slot according to Postgres (same as above) | +| is_group_slot | boolean | True if the slot is the node-group crash recovery slot for this node (see ["Group Replication Slot"]\(nodes#Group Replication Slot)) | +| is_decoder_slot | boolean | Is this slot used by Decoding Worker | +| plugin | name | Logical decoding plugin using this slot (should be pglogical_output or bdr) | +| slot_type | text | Type of the slot (should be logical) | +| datoid | oid | The OID of the current database | +| database | name | Name of the current database | +| temporary | bool | Is the slot temporary | +| active | bool | Is the slot active (does it have a connection attached to it) | +| active_pid | int4 | The PID of the process attached to the slot | +| xmin | xid | The XID needed by the slot | +| catalog_xmin | xid | The catalog XID needed by the slot | +| restart_lsn | pg_lsn | LSN at which the slot can restart decoding | +| confirmed_flush_lsn | pg_lsn | Latest confirmed replicated position | +| usesysid | oid | sysid of the user the replication session is running as | +| usename | name | username of the user the replication session is running as | +| application_name | text | Application name of the client connection (used by `synchronous_standby_names`) | +| client_addr | inet | IP address of the client connection | +| client_hostname | text | Hostname of the client connection | +| client_port | int4 | Port of the client connection | +| backend_start | timestamptz | When the connection started | +| state | text | State of the replication (catchup, streaming, ...) 
or 'disconnected' if offline | +| sent_lsn | pg_lsn | Latest sent position | +| write_lsn | pg_lsn | Latest position reported as written | +| flush_lsn | pg_lsn | Latest position reported as flushed to disk | +| replay_lsn | pg_lsn | Latest position reported as replayed (visible) | +| write_lag | interval | Approximate lag time for reported write | +| flush_lag | interval | Approximate lag time for reported flush | +| replay_lag | interval | Approximate lag time for reported replay | +| sent_lag_bytes | int8 | Bytes difference between sent_lsn and current WAL write position | +| write_lag_bytes | int8 | Bytes difference between write_lsn and current WAL write position | +| flush_lag_bytes | int8 | Bytes difference between flush_lsn and current WAL write position | +| replay_lag_bytes | int8 | Bytes difference between replay_lsn and current WAL write position | +| sent_lag_size | text | Human-readable bytes difference between sent_lsn and current WAL write position | +| write_lag_size | text | Human-readable bytes difference between write_lsn and current WAL write position | +| flush_lag_size | text | Human-readable bytes difference between flush_lsn and current WAL write position | +| replay_lag_size | text | Human-readable bytes difference between replay_lsn and current WAL write position | + +!!! Note + The `replay_lag` is set immediately to zero after reconnect. + As a workaround, use `replay_lag_bytes` or `replay_lag_size`. + +### `bdr.node_summary` + +This view contains summary information about all BDR nodes known to the local +node. + +#### `bdr.node_summary` columns + +| Name | Type | Description | +| ---------------------- | ---- | --------------------------------------------------------------------------- | +| node_name | name | Name of the node | +| node_group_name | name | Name of the BDR group the node is part of | +| interface_connstr | text | Connection string to the node | +| peer_state_name | text | Consistent state of the node in human readable form | +| peer_target_state_name | text | State that the node is trying to reach (during join or promotion) | +| node_seq_id | int4 | Sequence identifier of the node used for generating unique sequence numbers | +| node_local_dbname | name | Database name of the node | +| node_id | oid | The OID of the node | +| node_group_id | oid | The OID of the BDR node group | +| node_kind_name | oid | Node kind name | + +### `bdr.queue` + +This table stores the historical record of replicated DDL statements. + +#### `bdr.queue` columns + +| Name | Type | Description | +| ---------------- | ----------- | -------------------------------------------------------------- | +| queued_at | timestamptz | When was the statement queued | +| role | name | Which role has executed the statement | +| replication_sets | text\[] | Which replication sets was the statement published to | +| message_type | char | Type of a message. Possible values:
A - Table sync
D - DDL
S - Sequence
T - Truncate
Q - SQL statement | +| message | json | Payload of the message needed for replication of the statement | + +### `bdr.replication_set` + +A table that stores replication set configuration. For user queries, we recommend instead checking the +`bdr.replication_sets` view. + +#### `bdr.replication_set` columns + +| Name | Type | Description | +| ------------------ | ------- | ------------------------------------------------------------------------------ | +| set_id | oid | The OID of the replication set | +| set_nodeid | oid | OID of the node (always local node oid currently) | +| set_name | name | Name of the replication set | +| replicate_insert | boolean | Indicates if the replication set replicates INSERTs | +| replicate_update | boolean | Indicates if the replication set replicates UPDATEs | +| replicate_delete | boolean | Indicates if the replication set replicates DELETEs | +| replicate_truncate | boolean | Indicates if the replication set replicates TRUNCATEs | +| set_isinternal | boolean | Reserved | +| set_autoadd_tables | boolean | Indicates if new tables are automatically added to this replication set | +| set_autoadd_seqs | boolean | Indicates if new sequences are automatically added to this replication set | + +### `bdr.replication_set_table` + +A table that stores replication set table membership. For user queries, we recommend instead checking +the `bdr.tables` view. + +#### `bdr.replication_set_table` columns + +| Name | Type | Description | +| -------------- | ------------ | --------------------------------- | +| set_id | oid | The OID of the replication set | +| set_reloid | regclass | Local ID of the table | +| set_att_list | text\[] | Reserved | +| set_row_filter | pg_node_tree | Compiled row filtering expression | + +### `bdr.replication_set_ddl` + +A table that stores replication set ddl replication filters. For user queries, we recommend +instead checking the `bdr.ddl_replication` view. + +#### `bdr.replication_set_ddl` Columns + +| Name | Type | Description | +| ------------ | ---- | ------------------------------ | +| set_id | oid | The OID of the replication set | +| set_ddl_name | name | Name of the DDL filter | +| set_ddl_tag | text | Command tag for the DDL filter | +| set_ddl_role | text | Role executing the DDL | + +### `bdr.replication_sets` + +A view showing replication sets defined in the BDR group, even if they aren't +currently used by any node. + +#### `bdr.replication_sets` columns + +| Name | Type | Description | +| ------------------ | ------- | ------------------------------------------------------------------------------ | +| set_id | oid | The OID of the replication set | +| set_name | name | Name of the replication set | +| replicate_insert | boolean | Indicates if the replication set replicates INSERTs | +| replicate_update | boolean | Indicates if the replication set replicates UPDATEs | +| replicate_delete | boolean | Indicates if the replication set replicates DELETEs | +| replicate_truncate | boolean | Indicates if the replication set replicates TRUNCATEs | +| set_autoadd_tables | boolean | Indicates if new tables are automatically added to this replication set | +| set_autoadd_seqs | boolean | Indicates if new sequences are automatically added to this replication set | + +### `bdr.schema_changes` + +A simple view to show all the changes to schemas win BDR. 
+
+#### `bdr.schema_changes` columns
+
+| Name | Type | Description |
+| ------------------------ | ----------- | ------------------------------ |
+| schema_changes_ts | timestamptz | Timestamp of the schema change |
+| schema_changes_change | char | A flag of change type |
+| schema_changes_classid | oid | Class ID |
+| schema_changes_objectid | oid | Object ID |
+| schema_changes_subid | smallint | The subscription |
+| schema_changes_descr | text | The object changed |
+| schema_changes_addrnames | text\[] | Location of schema change |
+
+### `bdr.sequence_alloc`
+
+A view to see the allocation details for galloc sequences.
+
+#### `bdr.sequence_alloc` columns
+
+| Name | Type | Description |
+| ------------------- | ----------- | ------------------------------------------------ |
+| seqid | regclass | The ID of the sequence |
+| seq_chunk_size | bigint | A sequence number for the chunk within its value |
+| seq_allocated_up_to | bigint | |
+| seq_nallocs | bigint | |
+| seq_last_alloc | timestamptz | Last sequence allocated |
+
+### `bdr.sequences`
+
+This view lists all sequences with their kind, excluding sequences
+for internal BDR bookkeeping.
+
+#### `bdr.sequences` columns
+
+| Name | Type | Description |
+| ------- | ---- | ----------------------------------------------------- |
+| nspname | name | Namespace containing the sequence |
+| relname | name | Name of the sequence |
+| seqkind | text | Type of the sequence ('local', 'timeshard', 'galloc') |
+
+### `bdr.stat_activity`
+
+Dynamic activity for each backend or worker process.
+
+This contains the same information as `pg_stat_activity`, except `wait_event`
+is set correctly when the wait relates to BDR.
+
+### `bdr.stat_relation`
+
+Apply statistics for each relation. Contains data only if the tracking
+is enabled and something was replicated for a given relation.
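+
+For example, the following query lists the relations with the most apply time.
+It's a minimal sketch and assumes relation-level apply statistics tracking is
+enabled on the node:
+
+```sql
+-- Relations that consume the most replication apply time
+SELECT nspname, relname, ninsert, nupdate, ndelete, total_time
+FROM bdr.stat_relation
+ORDER BY total_time DESC
+LIMIT 10;
+```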
+
+#### `bdr.stat_relation` columns
+
+| Column | Type | Description |
+| ------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------- |
+| nspname | name | Name of the relation's schema |
+| relname | name | Name of the relation |
+| relid | oid | OID of the relation |
+| total_time | double precision | Total time spent processing replication for the relation |
+| ninsert | bigint | Number of inserts replicated for the relation |
+| nupdate | bigint | Number of updates replicated for the relation |
+| ndelete | bigint | Number of deletes replicated for the relation |
+| ntruncate | bigint | Number of truncates replicated for the relation |
+| shared_blks_hit | bigint | Total number of shared block cache hits for the relation |
+| shared_blks_read | bigint | Total number of shared blocks read for the relation |
+| shared_blks_dirtied | bigint | Total number of shared blocks dirtied for the relation |
+| shared_blks_written | bigint | Total number of shared blocks written for the relation |
+| blk_read_time | double precision | Total time spent reading blocks for the relation, in milliseconds (if `track_io_timing` is enabled, otherwise zero) |
+| blk_write_time | double precision | Total time spent writing blocks for the relation, in milliseconds (if `track_io_timing` is enabled, otherwise zero) |
+| lock_acquire_time | double precision | Total time spent acquiring locks on the relation, in milliseconds (if `bdr.track_apply_lock_timing` is enabled, otherwise zero) |
+
+### `bdr.stat_subscription`
+
+Apply statistics for each subscription. Contains data only if the tracking
+is enabled.
+
+#### `bdr.stat_subscription` columns
+
+| Column | Type | Description |
+| -------------------- | ------------------------ | --------------------------------------------------------------------------------------------------------------------- |
+| sub_name | name | Name of the subscription |
+| subid | oid | OID of the subscription |
+| nconnect | bigint | Number of times this subscription has connected upstream |
+| ncommit | bigint | Number of commits this subscription did |
+| nabort | bigint | Number of aborts the writer did for this subscription |
+| nerror | bigint | Number of errors the writer has hit for this subscription |
+| nskippedtx | bigint | Number of transactions skipped by the writer for this subscription (due to `skip_transaction` conflict resolver) |
+| ninsert | bigint | Number of inserts this subscription did |
+| nupdate | bigint | Number of updates this subscription did |
+| ndelete | bigint | Number of deletes this subscription did |
+| ntruncate | bigint | Number of truncates this subscription did |
+| nddl | bigint | Number of DDL operations this subscription has executed |
+| ndeadlocks | bigint | Number of errors that were caused by deadlocks |
+| nretries | bigint | Number of retries the writer did (without going for full restart/reconnect) |
+| nstream_writer | bigint | Number of transactions streamed to writer |
+| nstream_file | bigint | Number of transactions streamed to file |
+| nstream_commit | bigint | Number of streaming transactions committed |
+| nstream_abort | bigint | Number of streaming transactions aborted |
+| nstream_start | bigint | Number of STREAM START messages processed |
+| nstream_stop | bigint | Number of STREAM STOP messages processed |
+| shared_blks_hit | bigint | Total number of shared block cache hits by the subscription |
+| shared_blks_read | bigint | 
Total number of shared blocks read by the subscription | +| shared_blks_dirtied | bigint | Total number of shared blocks dirtied by the subscription | +| shared_blks_written | bigint | Total number of shared blocks written by the subscription | +| blk_read_time | double precision | Total time the subscription spent reading blocks, in milliseconds (if `track_io_timing` is enabled, otherwise zero) | +| blk_write_time | double precision | Total time the subscription spent writing blocks, in milliseconds (if `track_io_timing` is enabled, otherwise zero) | +| connect_time | timestamp with time zone | Time when the current upstream connection was established, NULL if not connected | +| last_disconnect_time | timestamp with time zone | Time when the last upstream connection was dropped | +| start_lsn | pg_lsn | LSN from which this subscription requested to start replication from the upstream | +| retries_at_same_lsn | bigint | Number of attempts the subscription was restarted from the same LSN value | +| curr_ncommit | bigint | Number of commits this subscription did after the current connection was established | + +### `bdr.subscription` + +This catalog table lists all the subscriptions owned by the local BDR +node and their modes. + +#### `bdr.subscription` columns + +| Name | Type | Description | +| ----------------- | -------- | -------------------------------------------------------------------------------- | +| sub_id | oid | ID of the subscription | +| sub_name | name | Name of the subscription | +| nodegroup_id | oid | ID of nodegroup | +| origin_node_id | oid | ID of origin node | +| source_node_id | oid | ID of source node | +| target_node_id | oid | ID of target node | +| subscription_mode | char | Mode of subscription | +| sub_enabled | bool | Whether the subscription is enabled (should be replication) | +| apply_delay | interval | How much behind should the apply of changes on this subscription be (normally 0) | +| slot_name | name | Slot on upstream used by this subscription | +| origin_name | name | Local origin used by this subscription | +| num_writers | int | Number of writer processes this subscription uses | +| streaming_mode | char | Streaming configuration for the subscription | +| replication_sets | text\[] | Replication sets replicated by this subscription (NULL = all) | +| forward_origin | text\[] | Origins forwarded by this subscription (NULL = all) | + +### `bdr.subscription_summary` + +This view contains summary information about all BDR subscriptions that the +local node has to other nodes. 
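+
+A typical use is a quick health check of incoming replication. This example
+query is a minimal sketch that uses only the columns documented in the table
+that follows:
+
+```sql
+-- Check subscription status and replay progress on the local node
+SELECT sub_name,
+       origin_name,
+       target_name,
+       subscription_status,
+       last_xact_replay_timestamp
+FROM bdr.subscription_summary;
+```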
+ +#### `bdr.subscription_summary` columns + +| Name | Type | Description | +| -------------------------- | ----------- | ---------------------------------------------------------------------------------------- | +| node_group_name | name | Name of the BDR group the node is part of | +| sub_name | name | Name of the subscription | +| origin_name | name | Name of the origin node | +| target_name | name | Name of the target node (normally local node) | +| sub_enabled | bool | Is the subscription enabled | +| sub_slot_name | name | Slot name on the origin node used by this subscription | +| sub_replication_sets | text\[] | Replication sets subscribed | +| sub_forward_origins | text\[] | Does the subscription accept changes forwarded from other nodes besides the origin | +| sub_apply_delay | interval | Delay transactions by this much compared to the origin | +| sub_origin_name | name | Replication origin name used by this subscription | +| bdr_subscription_mode | char | Subscription mode | +| subscription_status | text | Status of the subscription worker | +| node_group_id | oid | The OID of the BDR group the node is part of | +| sub_id | oid | The OID of the subscription | +| origin_id | oid | The OID of the origin node | +| target_id | oid | The OID of the target node | +| receive_lsn | pg_lsn | Latest LSN of any change or message received (this can go backwards in case of restarts) | +| receive_commit_lsn | pg_lsn | Latest LSN of last COMMIT received (this can go backwards in case of restarts) | +| last_xact_replay_lsn | pg_lsn | LSN of last transaction replayed on this subscription | +| last_xact_flush_lsn | timestamptz | LSN of last transaction replayed on this subscription that's flushed durably to disk | +| last_xact_replay_timestamp | timestamptz | Timestamp of last transaction replayed on this subscription | + +### `bdr.replication_status` + +This view shows incoming replication status between the local node and +all other nodes in the EDB Postgres Distributed cluster. We consider replication to be +blocked when the subscription restarted from the same LSN at least +twice and not a single transaction is yet applied after the current +upstream connection was established. If the first transaction after +restart is very big and still being applied, the `replication_blocked` +result might be wrong. + +If this is a logical standby node, then only the status for its upstream +node is shown. Similarly, replication status isn't shown for +subscriber-only nodes since they never send replication changes to other +nodes. + +#### `bdr.replication_status` columns + +| Column | Type | Description | +| ------------------- | ------------------------ | --------------------------------------------------------------- | +| node_id | oid | OID of the local node | +| node_name | name | Name of the local node | +| origin_node_id | oid | OID of the origin node | +| origin_node_name | name | Name of the origin node | +| sub_id | oid | OID of the subscription for this origin node | +| sub_name | name | Name of the subscription for this origin node | +| connected | boolean | Is this node connected to the origin node? | +| replication_blocked | boolean | Is the replication currently blocked for this origin? 
| +| connect_time | timestamp with time zone | Time when the current connection was established | +| disconnect_time | timestamp with time zone | Time when the last connection was dropped | +| uptime | interval | Duration since the current connection is active for this origin | + +### `bdr.tables` + +This view lists information about table membership in replication sets. +If a table exists in multiple replication sets, it appears multiple times +in this table. + +#### `bdr.tables` columns + +| Name | Type | Description | +| ------------------ | ------- | --------------------------------------------------------------------------------- | +| relid | oid | The OID of the relation | +| nspname | name | Name of the schema relation is in | +| relname | name | Name of the relation | +| set_name | name | Name of the replication set | +| set_ops | text\[] | List of replicated operations | +| rel_columns | text\[] | List of replicated columns (NULL = all columns) (\*) | +| row_filter | text | Row filtering expression | +| conflict_detection | text | Conflict detection method used: row_origin (default), row_version or column_level | + +(\*) These columns are reserved for future use and should currently be NULL + +### `bdr.trigger` + +In this view, you can see all the stream triggers created. +Often triggers here are created from `bdr.create_conflict_trigger`. + +#### `bdr.trigger` columns + +| Name | Type | Description | +| -------------- | -------- | ----------------------------- | +| trigger_id | oid | The ID of the trigger | +| trigger_reloid | regclass | Name of the relating function | +| trigger_pgtgid | oid | Postgres trigger ID | +| trigger_type | char | Type of trigger call | +| trigger_name | name | Name of the trigger | + +### `bdr.triggers` + +An expanded view of `bdr.trigger` with columns that are easier to read. + +| Name | Type | Description | +| ------------------ | ------------------ | ----------------------- | +| trigger_name | name | The name of the trigger | +| event_manipulation | text | The operation(s) | +| trigger_type | bdr.trigger_type | Type of trigger | +| trigger_table | bdr.trigger_reloid | The table that calls it | +| trigger_function | name | The function used | + +### `bdr.workers` + +Information about running BDR worker processes. + +This can be joined with `bdr.stat_activity` using pid to get even more insight +into the state of BDR workers. + +#### `bdr.workers` Columns + +| Name | Type | Description | +| ----------------------- | ----------- | --------------------------------------------------------- | +| worker_pid | int | Process ID of the worker process | +| worker_role | int | Numeric representation of worker role | +| worker_role_name | text | Name of the worker role | +| worker_subid | oid | Subscription ID if the worker is associated with one | + +### `bdr.writers` + +Specific information about BDR writer processes. 
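+
+For example, to see how busy each writer is per subscription, you can run a
+query like the following (a minimal sketch using the columns documented
+below):
+
+```sql
+-- Per-subscription writer activity
+SELECT sub_name, pid, is_streaming, nxacts, ncommits, naborts
+FROM bdr.writers;
+```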
+ +#### `bdr.writers` columns + +| Name | Type | Description | +| --------------------- | ----------- | ------------------------------------------------------------------ | +| sub_name | name | Name of the subscription | +| pid | int | Process ID of the worker process | +| syncing_rel | int | OID of the relation being synchronized (if any) | +| streaming_allowed | text | Can this writer be target of direct to writer streaming | +| is_streaming | bool | Is there transaction being streamed to this writer | +| remote_xid | xid | Remote transaction id of the transaction being processed (if any) | +| remote_commit_lsn | pg_lsn | LSN of last commit processed | +| commit_queue_position | int | Position in the internal commit queue | +| nxacts | bigint | Number of transactions processed by this writer | +| ncommits | bigint | Number of transactions committed by this writer | +| naborts | bigint | Number of transactions aborted by this writer | +| nstream_file | bigint | Number of streamed-to-file transactions processed by this writer | +| nstream_writer | bigint | Number of streamed-to-writer transactions processed by this writer | + +### `bdr.worker_tasks` + +The `bdr.worker_tasks` view shows BDR's current worker launch rate +limiting state as well as some basic statistics on background worker launch +and registration activity. + +Unlike the other views listed here, it isn't specific to the current database +and BDR node. State for all BDR nodes on the current PostgreSQL +instance is shown. Join on the current database to filter it. + +`bdr.worker_tasks` doesn't track walsenders and output plugins. + +#### `bdr.worker_tasks` columns + +| Column | Type | Description | +| --------------------------------- | ------------------------ | ------------------------------------------------------- | +| task_key_worker_role | integer | Worker role identifier | +| task_key_worker_role_name | text | Worker role name | +| task_key_dboid | oid | Database identifier, if available | +| datname | name | Name of the database, if available | +| task_key_subid | oid | Subscription identifier, if available | +| sub_name | name | Name of the subscription, if available | +| task_key_ext_libname | name | Name of the library (most likely bdr) | +| task_key_ext_funcname | name | Name of the function entry point | +| task_key_ext_workername | name | Name assigned to the worker | +| task_key_remoterelid | oid | Identifier of the remote syncing relation, if available | +| task_pid | integer | Process ID of the worker | +| task_registered | timestamp with time zone | Worker registration timestamp | +| since_registered | interval | Interval since the worker registered | +| task_attached | timestamp with time zone | Worker attach timestamp | +| since_attached | interval | Interval since the worker attached | +| task_exited | timestamp with time zone | Worker exit timestamp | +| since_exited | interval | Interval since the worker exited | +| task_success | boolean | Is worker still running? | +| task_next_launch_not_before | timestamp with time zone | Timestamp when the worker will be restarted again | +| until_launch_allowed | interval | Time remaining for next launch | +| task_last_launch_requestor_pid | integer | Process ID that requested launch | +| task_last_launch_request_time | timestamp with time zone | Timestamp when the request was made | +| since_last_request | interval | Interval since the last request | +| task_last_launch_request_approved | boolean | Did the last request succeed? 
| +| task_nrequests | integer | Number of requests | +| task_nregistrations | integer | Number of registrations | +| task_prev_pid | integer | Process ID of the previous generation | +| task_prev_registered | timestamp with time zone | Timestamp of the previous registered task | +| since_prev_registered | interval | Interval since the previous registration | +| task_prev_launched | timestamp with time zone | Timestamp of the previous launch | +| since_prev_launched | interval | Interval since the previous launch | +| task_prev_exited | timestamp with time zone | Timestamp when the previous task exited | +| since_prev_exited | interval | Interval since the previous task exited | +| task_first_registered | timestamp with time zone | Timestamp when the first registration happened | +| since_first_registered | interval | Interval since the first registration | + +### `bdr.taskmgr_work_queue` + +Contains work items created and processed by task manager. The work items are created on +only one node and processed on different nodes. + +#### `bdr.taskmgr_work_queue` columns + +| Column | Type | Description | +| ------------------ | ------ | --------------------------------------------------------------------------------------------------------------------------- | +| ap_wq_workid | bigint | The unique ID of the work item | +| ap_wq_ruleid | int | ID of the rule listed in autopartition_rules. Rules are specified using bdr.autopartition command | +| ap_wq_relname | name | Name of the relation the task belongs to | +| ap_wq_relnamespace | name | Name of the tablespace specified in rule for this work item | +| ap_wq_partname | name | Name of the partition created by the workitem | +| ap_wq_work_category| char | The work category can be either 'c' (Create Partition), 'm' (Migrate Partition), 'd' (Drop Partition), 'a' (Alter Partition)| +| ap_wq_work_sql | text | SQL query for the work item | +| ap_wq_work_depends | Oid\[] | OIDs of the nodes on which the work item depends | + +### `bdr.taskmgr_workitem_status` + +The status of the work items that is updated locally on each node. + +#### `bdr.taskmgr_workitem_status` columns + +| Column | Type | Description | +| ----------------- | ----------- | ---------------------------------------------------------------------------------- | +| ap_wi_workid | bigint | The ID of the work item | +| ap_wi_nodeid | Oid | OID of the node on which the work item is being processed | +| ap_wi_status | char | The status can be either 'q' (Queued), 'c' (Complete), 'f' (Failed), 'u' (Unknown) | +| ap_wi_started_at | timestamptz | The start timestamptz of work item | +| ap_wi_finished_at | timestamptz | The end timestamptz of work item | + +### `bdr.taskmgr_local_work_queue` + +Contains work items created and processed by the task manager. This is +similar to `bdr.taskmgr_work_queue`, except that these work items are for +locally managed tables. Each node creates and processes its own local work +items, independent of other nodes in the cluster. + +#### `bdr.taskmgr_local_work_queue` columns + +| Column | Type | Description | +| ------------------ | ------ | ------------------------------------------------------------------------------------------------------------------------ | +| ap_wq_workid | bigint | The unique ID of the work item | +| ap_wq_ruleid | int | ID of the rule listed in autopartition_rules. 
Rules are specified using bdr.autopartition command | +| ap_wq_relname | name | Name of the relation the task belongs to | +| ap_wq_relnamespace | name | Name of the tablespace specified in rule for this work item. | +| ap_wq_partname | name | Name of the partition created by the workitem | +| ap_wq_work_category| char | The categoty can be either 'c' (Create Partition), 'm' (Migrate Partition), 'd' (Drop Partition), 'a' (Alter Partition) | +| ap_wq_work_sql | text | SQL query for the work item | +| ap_wq_work_depends | Oid\[] | Always NULL | + +### `bdr.taskmgr_local_workitem_status` + +The status of the work items for locally managed tables. + +#### `bdr.taskmgr_local_workitem_status` columns + +| Column | Type | Description | +| ----------------- | ----------- | ---------------------------------------------------------------------------------- | +| ap_wi_workid | bigint | The ID of the work item | +| ap_wi_nodeid | Oid | OID of the node on which the work item is being processed | +| ap_wi_status | char | The status can be either 'q' (Queued), 'c' (Complete), 'f' (Failed), 'u' (Unknown) | +| ap_wi_started_at | timestamptz | The start timestamptz of work item | +| ap_wi_finished_at | timestamptz | The end timestamptz of work item | + +### `bdr.group_camo_details` + +Uses `bdr.run_on_all_nodes` to gather CAMO-related information from all nodes. + +#### `bdr.group_camo_details` columns + +| Name | Type | Description | +| -------------------------- | ---- | ----------------------------------------------------------------------------------- | +| node_id | text | Internal node ID | +| node_name | text | Name of the node | +| camo_partner | text | Node name of the camo partner | +| is_camo_partner_connected | text | Connection status | +| is_camo_partner_ready | text | Readiness status | +| camo_transactions_resolved | text | Are there any pending and unresolved CAMO transactions | +| apply_lsn | text | Latest position reported as replayed (visible) | +| receive_lsn | text | Latest LSN of any change or message received (can go backwards in case of restarts) | +| apply_queue_size | text | Bytes difference between apply_lsn and receive_lsn | + +### `bdr.commit_scopes` + +Catalog storing all possible commit scopes that you can use for +`bdr.commit_scope` to enable group commit. + +#### `bdr.commit_scopes` columns + +| Name | Type | Description | +|--------------------------------|------|----------------------------------------- | +| commit_scope_id | oid | ID of the scope to be referenced | +| commit_scope_name | name | Name of the scope to be referenced | +| commit_scope_origin_node_group | oid | Node group for which the rule applies, referenced by ID | +| sync_scope_rule | text | Definition of the scope | + +### `bdr.group_raft_details` + +Uses `bdr.run_on_all_nodes` to gather Raft Consensus status from all nodes. 
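+
+For example, to confirm that all nodes report the same Raft leader and term,
+you can run a query like this (a minimal sketch using the columns documented
+below):
+
+```sql
+-- All nodes should agree on leader_id and current_term
+SELECT node_name, state, leader_id, current_term, voting_nodes
+FROM bdr.group_raft_details;
+```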
+ +#### `bdr.group_raft_details` columns + +| Name | Type | Description | +| ---------------- | ---- | ------------------------------ | +| node_id | oid | Internal node ID | +| node_name | name | Name of the node | +| node_group_name | name | Name of the group is part of | +| state | text | Raft worker state on the node | +| leader_id | oid | Node id of the RAFT_LEADER | +| current_term | int | Raft election internal ID | +| commit_index | int | Raft snapshot internal ID | +| nodes | int | Number of nodes accessible | +| voting_nodes | int | Number of nodes voting | +| protocol_version | int | Protocol version for this node | + +### `bdr.group_replslots_details` + +Uses `bdr.run_on_all_nodes` to gather BDR slot information from all nodes. + +#### `bdr.group_replslots_details` columns + +| Name | Type | Description | +| --------------- | -------- | ------------------------------------------------------------------------------- | +| node_group_name | text | Name of the BDR group | +| origin_name | text | Name of the origin node | +| target_name | text | Name of the target node | +| slot_name | text | Slot name on the origin node used by this subscription | +| active | text | Is the slot active (does it have a connection attached to it) | +| state | text | State of the replication (catchup, streaming, ...) or 'disconnected' if offline | +| write_lag | interval | Approximate lag time for reported write | +| flush_lag | interval | Approximate lag time for reported flush | +| replay_lag | interval | Approximate lag time for reported replay | +| sent_lag_bytes | int8 | Bytes difference between sent_lsn and current WAL write position | +| write_lag_bytes | int8 | Bytes difference between write_lsn and current WAL write position | +| flush_lag_bytes | int8 | Bytes difference between flush_lsn and current WAL write position | +| replay_lag_byte | int8 | Bytes difference between replay_lsn and current WAL write position | + +### `bdr.group_subscription_summary` + +Uses `bdr.run_on_all_nodes` to gather subscription status from all nodes. + +#### `bdr.group_subscription_summary` columns + +| Name | Type | Description | +| -------------------------- | ---- | ---------------------------------------------- | +| origin_node_name | text | Name of the origin of the subscription | +| target_node_name | text | Name of the target of the subscription | +| last_xact_replay_timestamp | text | Timestamp of the last replayed transaction | +| sub_lag_seconds | text | Lag between now and last_xact_replay_timestamp | + +### `bdr.group_versions_details` + +Uses `bdr.run_on_all_nodes` to gather BDR information from all nodes. + +#### `bdr.group_versions_details` columns + +| Name | Type | Description | +| ---------------- | ---- | ------------------------------ | +| node_id | oid | Internal node ID | +| node_name | name | Name of the node | +| postgres_version | text | PostgreSQL version on the node | +| bdr_version | text | BDR version on the node | + +## Internal catalogs and views + +### `bdr.ddl_epoch` + +An internal catalog table holding state per DDL epoch. 
+ +#### `bdr.ddl_epoch` columns + +| Name | Type | Description | +| --------------------- | ----------- | ------------------------------------------------------------------------ | +| ddl_epoch | int8 | Monotonically increasing epoch number | +| origin_node_id | oid | Internal node ID of the node that requested creation of this epoch | +| epoch_consume_timeout | timestamptz | Timeout of this epoch | +| epoch_consumed | boolean | Switches to true as soon as the local node has fully processed the epoch | +| epoch_consumed_lsn | boolean | LSN at which the local node has processed the epoch | + + +### `bdr.internal_node_pre_commit` + +Internal catalog table. Use the `bdr.node_pre_commit` view. + +### `bdr.sequence_kind` + +An internal state table storing the type of each nonlocal sequence. We recommend the view +`bdr.sequences` for diagnostic purposes. + +#### `bdr.sequence_kind` columns + +| Name | Type | Description | +| ------- | ---- | ----------------------------------------------------------- | +| seqid | oid | Internal OID of the sequence | +| seqkind | char | Internal sequence kind ('l'=local,'t'=timeshard,'s'=snowflakeid,'g'=galloc) | + +### `bdr.event_history` + +Internal catalog table that tracks cluster membership events for a given BDR +node. Specifically, it tracks: + +* Node joins (to the cluster) +* Raft state changes (i.e. whenever the node changes its role in the consensus +protocol - leader, follower or candidate to leader) - see [Monitoring Raft Consensus](monitoring/sql#monitoring-raft-consensus) +* Whenever a worker has errored out (see [bdr.workers](#bdr.workers) +and [Monitoring BDR Replication Workers](monitoring/sql#monitoring-bdr-replication-workers)) + +| Name | Type | Description | +| -------------- | ----------- | ----------------------------------------------------------------------------------- | +| event_node_id | oid | The ID of the node to which the event refers to | +| event_type | int | The type of the event (a node, raft or worker related event) | +| event_sub_type | int | The sub-type of the event, i.e. if it's a join, a state change or an error | +| event_source | text | The name of the worker process where the event was sourced | +| event_time | timestamptz | The timestamp at which the event occurred | +| event_text | text | A textual representation of the event (e.g. the error of the worker) | +| event_detail | text | A more detailed description of the event (for now, only relevant for worker errors) | + +### `bdr.event_summary` + +A view of the `bdr.event_history` catalog that display the information in a more +human-friendly format. Specifically, it displays the event types and sub-types +as textual representations, rather than integers. + +### `bdr.node_catchup_info_details` + +A view of `bdr.node_catchup_info` catalog which shows info in more friendly way + + +### `bdr.node_group_config` + +An internal catalog table with per node group configuration options. + +#### `bdr.node_group_config` columns + +| Name | Type | Description | +| ----------------------- | -------- | ---------------------------------------- | +| node_group_id | oid | The node group id | +| route_writer_max_lag | bigint | Maximum write lag accepted | +| route_reader_max_lag | bigint | Maximum read lag accepted | +| route_writer_wait_flush | boolean | Switch if we need to wait for the flush | + +### `bdr.node_group_routing_config_summary` + +Per node group routing configuration options. + +### `bdr.node_config` + +An internal catalog table with per node configuration options. 
+
+#### `bdr.node_config` columns
+
+| Name | Type | Description |
+| ----------------------- | -------- | ---------------------------------------- |
+| node_id | oid | The node id |
+| node_route_priority | int | Priority assigned to this node |
+| node_route_fence | boolean | Switch to fence this node |
+| node_route_writes | boolean | Switch to allow writes |
+| node_route_reads | boolean | Switch to allow reads |
+| node_route_dsn | text | The interface of this node |
+
+### `bdr.node_routing_config_summary`
+
+Per node routing configuration options.
+
+### `bdr.proxy_config`
+
+An internal catalog table holding proxy specific configurations.
+
+#### `bdr.proxy_config` columns
+
+| Name | Type | Description |
+| ---------------------- | -------- | ------------------------------------------------------------------------ |
+| proxy_name | name | The name of the proxy |
+| node_group_id | oid | The node group id that this proxy uses |
+| listen_port | int | Port that the proxy uses |
+| max_client_conn | int | Maximum number of client connections that the proxy accepts |
+| max_server_conn | int | Maximum number of connections that the server accepts |
+| server_conn_timeout | interval | Timeout for the server connections |
+| server_conn_keepalive | interval | Interval between server connection keepalives |
+| fallback_group_timeout | interval | Timeout needed for the fallback |
+| fallback_group_ids | oid[] | List of group IDs to be used for the fallback |
+| listen_addrs | text[] | Listen address |
+
+
+### `bdr.proxy_config_summary`
+
+Per proxy instance specific configuration options.
+
+### `bdr.node_group_routing_info`
+
+An internal catalog table holding current routing information for a proxy.
+
+#### `bdr.node_group_routing_info` columns
+
+| Name | Type | Description |
+| -------------------- | -------- | -------------------------------------- |
+| node_group_id | oid | The node group id that this proxy uses |
+| write_node_id | oid | Current write node |
+| prev_write_node_id | oid | Previous write node |
+| read_node_ids | oid[] | List of read nodes IDs |
+
+### `bdr.node_group_routing_summary`
+
+A view of the `bdr.node_group_routing_info` catalog that shows the information in a more readable format.
diff --git a/product_docs/docs/pgd/5/choosing_server.mdx b/product_docs/docs/pgd/5/choosing_server.mdx
new file mode 100644
index 00000000000..2d3282c07b7
--- /dev/null
+++ b/product_docs/docs/pgd/5/choosing_server.mdx
@@ -0,0 +1,38 @@
+---
+title: "Choosing a Postgres distribution"
+---
+
+EDB Postgres Distributed can be deployed with three different Postgres distributions: PostgreSQL, EDB Postgres Extended Server, or EDB Postgres Advanced Server. The availability of particular EDB Postgres Distributed features depends on which Postgres distribution is used. Therefore, it is essential to adopt the Postgres distribution best suited to your business needs. For example, if having the feature "Commit At Most Once (CAMO)" is mission critical to your use case, you should not adopt open source PostgreSQL because it does not have the core capabilities required to handle CAMO.
+
+The following table lists features of EDB Postgres Distributed that are dependent on the Postgres distribution and version.
+
+| Feature | PostgreSQL | EDB Postgres Extended | EDB Postgres Advanced |
+|-------------------------------------------------|------------|-----------------------|-----------------------|
+| Rolling application and database upgrades | Y | Y | Y |
+| Asynchronous replication | Y | Y | Y |
+| Row-level last-update wins conflict resolution | Y | Y | Y |
+| DDL replication | Y | Y | Y |
+| Granular DDL Locking | Y | Y | Y |
+| Streaming of large transactions | v14+ | v13+ | v14+ |
+| Distributed sequences | Y | Y | Y |
+| Subscriber-only nodes | Y | Y | Y |
+| Monitoring | Y | Y | Y |
+| OpenTelemetry support | Y | Y | Y |
+| Parallel apply | Y | Y | Y |
+| Conflict-free replicated data types (CRDTs) | Y | Y | Y |
+| Column-level conflict resolution | Y | Y | Y |
+| Transform triggers | Y | Y | Y |
+| Conflict triggers | Y | Y | Y |
+| Transparent Data Encryption | N | 15+ | 15+ |
+| Legacy synchronous replication | Y | Y | Y |
+| Group Commit | N | Y | 14+ |
+| Commit At Most Once (CAMO) | N | Y | 14+ |
+| Eager Conflict Resolution | N | Y | 14+ |
+| Lag Control | N | Y | 14+ |
+| Decoding Worker | N | 13+ | 14+ |
+| Transaction Streaming | 14+ | 13+ | 14+ |
+| Lag Tracker | N | Y | 14+ |
+| Assessment Tooling | N | Y | 14+ |
+| Missing Partition Conflict | N | Y | 14+ |
+| No need for UPDATE Trigger on tables with TOAST | N | Y | 14+ |
+| Automatically hold back FREEZE | N | Y | 14+ |
diff --git a/product_docs/docs/pgd/5/cli/command_ref/index.mdx b/product_docs/docs/pgd/5/cli/command_ref/index.mdx
new file mode 100644
index 00000000000..9510ab9c334
--- /dev/null
+++ b/product_docs/docs/pgd/5/cli/command_ref/index.mdx
@@ -0,0 +1,47 @@
+---
+title: Command reference
+redirects:
+  - /pgd/latest/cli/command_ref/pgd_show-camo/
+---
+
+pgd is the command name for the PGD command line interface.
+
+## Synopsis
+
+The EDB Postgres Distributed Command Line Interface (PGD CLI) is a tool to
+manage your EDB Postgres Distributed cluster. It allows you to run commands
+against EDB Postgres Distributed clusters. You can use it to inspect and manage
+cluster resources.
+ +## Options + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -h, --help help for pgd + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` + +## See also + +- [check-health](pgd_check-health) +- [create-proxy](pgd_create-proxy) +- [delete-proxy](pgd_delete-proxy) +- [set-group-options](pgd_set-group-options) +- [set-node-options](pgd_set-node-options) +- [set-proxy-options](pgd_set-proxy-options) +- [show-clockscrew](pgd_show-clockskew) +- [show-events](pgd_show-events) +- [show-groups](pgd_show-groups) +- [show-nodes](pgd_show-nodes) +- [show-proxies](pgd_show-proxies) +- [show-raft](pgd_show-raft) +- [show-replslots](pgd_show-replslots) +- [show-subscriptions](pgd_show-subscriptions) +- [show-version](pgd_show-version) +- [switchover](pgd_switchover) +- [verify-cluster](pgd_verify-cluster) +- [verify-settings](pgd_verify-settings) diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_check-health.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_check-health.mdx new file mode 100644 index 00000000000..c90de1c4e7e --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_check-health.mdx @@ -0,0 +1,79 @@ +--- +title: check-health +--- + +Checks the health of the EDB Postgres Distributed cluster. + +### Synopsis + +Performs various checks such as if all nodes are accessible and all +replication slots are working. + +Please note that the current implementation of clock skew may return an +inaccurate skew value if the cluster is under high load while running this +command or has large number of nodes in it. 
+ +```sh +pgd check-health [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd check-health + + Check Status Message + ----- ------ ------- + ClockSkew Critical Clockskew cannot be determined for at least 1 BDR node pair + Connection Critical The node bdr-b1 is not accessible + Raft Warning There is at least 1 node that is not accessible + Replslots Critical There is at least 1 BDR replication slot which is inactive + Version Warning There is at least 1 node that is not accessible + + + Example 2 (3 node cluster, all nodes are up but system clocks are not in sync) + + $ pgd check-health + + Check Status Message + ----- ------ ------- + ClockSkew Warning At least 1 BDR node pair has clockskew greater than 2 seconds + Connection Ok All BDR nodes are accessible + Raft Ok Raft Consensus is working correctly + Replslots Ok All BDR replication slots are working correctly + Version Ok All nodes are running same BDR versions + + + Example 3 (3 node cluster, all nodes are up and all checks are Ok) + + $ pgd check-health + + Check Status Message + ----- ------ ------- + ClockSkew Ok All BDR node pairs have clockskew within permissible limit + Connection Ok All BDR nodes are accessible + Raft Ok Raft Consensus is working correctly + Replslots Ok All BDR replication slots are working correctly + Version Ok All nodes are running same BDR versions + +``` + +### Options + +```text + -h, --help help for check-health +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_create-proxy.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_create-proxy.mdx new file mode 100644 index 00000000000..9a05b8f82df --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_create-proxy.mdx @@ -0,0 +1,46 @@ +--- +title: create-proxy +--- + +Creates proxy in the EDB Postgres Distributed cluster. + +### Synopsis + +Creates proxy in the EDB Postgres Distributed cluster and attaches it to the +given group. The proxy name must be unique across the cluster and match with +the name given in the corresponding proxy config file. 
+ + +```sh +pgd create-proxy [flags] +``` + +### Examples + +```text + + Example 1 (attach new proxy called proxy-a1 to group bdrgroup) + + $ pgd create-proxy --proxy-name proxy-a1 --group-name bdrgroup + proxy created successfully + +``` + +### Options + +```text + --group-name string group name + -h, --help help for create-proxy + --proxy-name string proxy name +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_delete-proxy.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_delete-proxy.mdx new file mode 100644 index 00000000000..137239a33ec --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_delete-proxy.mdx @@ -0,0 +1,43 @@ +--- +title: delete-proxy +--- + +Deletes proxy from the EDB Postgres Distributed cluster. + +### Synopsis + +Deletes proxy from the EDB Postgres Distributed cluster. + + +```sh +pgd delete-proxy [flags] +``` + +### Examples + +```text + + Example 1 (delete proxy proxy-a1) + + $ pgd delete-proxy --proxy-name proxy-a1 + proxy deleted successfully + +``` + +### Options + +```text + -h, --help help for delete-proxy + --proxy-name string proxy name +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_set-group-options.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_set-group-options.mdx new file mode 100644 index 00000000000..5f5444bf3e0 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_set-group-options.mdx @@ -0,0 +1,66 @@ +--- +title: set-group-options +--- + +Sets group options such as 'enable_raft', 'enable_proxy_routing', and 'location'. + +### Synopsis + +You can set the following group options with this command: + +- 'enable_raft' +- 'enable_proxy_routing' +- 'location' +- 'route_writer_max_lag' + +Both 'enable_raft' and 'enable_proxy_routing' must be true if proxy is +attached to the group. + +Use 'pgd show-groups -o json' to view option values for each group. 
+ + +```sh +pgd set-group-options [flags] +``` + +### Examples + +```text + + Example 1 (comma separated multiple options, spaces are not allowed) + + $ pgd set-group-options --group-name bdrgroup --option enable_proxy_routing=true,route_writer_max_lag=1000000 + group options updated successfully + + + Example 2 (multiple --option flags are allowed) + + $ pgd set-group-options --group-name bdrgroup --option enable_proxy_routing=true --option route_writer_max_lag=1000000 + group options updated successfully + + + Example 3 (use double quote if option value has spaces or special characters) + + $ pgd set-group-options --group-name bdrgroup --option "location = mumbai" --option "route_writer_max_lag = 1000000" + group options updated successfully + +``` + +### Options + +```text + --group-name string group name + -h, --help help for set-group-options + --option strings option in name=value format +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_set-node-options.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_set-node-options.mdx new file mode 100644 index 00000000000..d7c03d390d1 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_set-node-options.mdx @@ -0,0 +1,63 @@ +--- +title: set-node-options +--- + +Sets node options such as 'route_fence', 'route_priority', and 'route_writes'. + +### Synopsis + +You can set the following node options with this command: + +- 'route_dsn' +- 'route_fence' +- 'route_priority' +- 'route_writes' + +Use 'pgd show-nodes -o json' to view option values for each node. 
+ + +```sh +pgd set-node-options [flags] +``` + +### Examples + +```text + + Example 1 (comma separated multiple options, spaces are not allowed) + + $ pgd set-node-options --node-name bdr-a1 --option route_priority=100,route_fence=true + node options updated successfully + + + Example 2 (multiple --option flags are allowed) + + $ pgd set-node-options --node-name bdr-a1 --option route_priority=100 --option route_fence=true + node options updated successfully + + + Example 3 (use double quote if option value has spaces or special characters) + + $ pgd set-node-options --node-name bdr-a1 --option "route_priority = 100" --option "route_fence = true" + node options updated successfully + +``` + +### Options + +```text + -h, --help help for set-node-options + --node-name string node name + --option strings option in name=value format +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_set-proxy-options.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_set-proxy-options.mdx new file mode 100644 index 00000000000..ab6a9df556c --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_set-proxy-options.mdx @@ -0,0 +1,67 @@ +--- +title: set-proxy-options +--- + +Sets proxy options such as 'listen_address', 'listen_port', and 'max_client_conn'. + +### Synopsis + +You can set the following proxy options with this command: + +- 'listen_address' +- 'listen_port' +- 'max_client_conn' +- 'max_server_conn' +- 'server_conn_keepalive' +- 'server_conn_timeout' + +After updating any of these options, restart proxy. + +Use 'pgd show-proxies -o json' to view option values for each proxy. 
+ + +```sh +pgd set-proxy-options [flags] +``` + +### Examples + +```text + + Example 1 (comma separated multiple options, spaces are not allowed) + + $ pgd set-proxy-options --proxy-name proxy-a1 --option listen_address=0.0.0.0,listen_port=6432 + proxy options updated successfully, please restart proxy service + + + Example 2 (multiple --option flags are allowed) + + $ pgd set-proxy-options --proxy-name proxy-a1 --option listen_address=0.0.0.0 --option listen_port=6432 + proxy options updated successfully, please restart proxy service + + + Example 3 (use double quote if option value has spaces or special characters) + + $ pgd set-proxy-options --proxy-name proxy-a1 --option "listen_address = 0.0.0.0" --option "listen_port = 6432" + proxy options updated successfully, please restart proxy service + +``` + +### Options + +```text + -h, --help help for set-proxy-options + --option strings option in name=value format + --proxy-name string proxy name +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-clockskew.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-clockskew.mdx new file mode 100644 index 00000000000..eaaf1b73462 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-clockskew.mdx @@ -0,0 +1,70 @@ +--- +title: show-clockskew +--- + +Shows the status of clock skew between each BDR node pair. + +### Synopsis + +Shows the status of clock skew between each BDR node pair in the cluster. + +Please note that the current implementation of clock skew may return an +inaccurate skew value if the cluster is under high load while running this +command or has large number of nodes in it. + + Symbol Meaning + ------- -------- + * ok + ~ warning (skew > 2 seconds) + ! critical (skew > 5 seconds) + x down / unreachable + ? unknown + - n/a + +```sh +pgd show-clockskew [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-clockskew + + Node bdr-a1 bdr-b1 bdr-c1 Current Time + ---- ------ ------ ------ ------------ + bdr-a1 * ? * 2022-03-30 07:02:21.334472 + bdr-b1 x * x x + bdr-c1 * ? 
* 2022-03-30 07:02:21.186809 + + + Example 2 (3 node cluster, all nodes are up) + + $ pgd show-clockskew + + Node bdr-a1 bdr-b1 bdr-c1 Current Time + ---- ------ ------ ------ ------------ + bdr-a1 * * * 2022-03-30 07:04:54.147017 + bdr-b1 * * * 2022-03-30 07:04:54.340543 + bdr-c1 * * * 2022-03-30 07:04:53.90451 + +``` + +### Options + +```text + -h, --help help for show-clockskew +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-events.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-events.mdx new file mode 100644 index 00000000000..b5dff840db0 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-events.mdx @@ -0,0 +1,60 @@ +--- +title: show-events +--- + +Shows events such as background worker errors and node membership changes. + +### Synopsis + +Shows events such as background worker errors and node membership changes. +Output is sorted by Time column in descending order. Message column is +truncated after a few lines. To view complete message use json output format +('-o json'). + +For more details on each node state, see show-nodes command help +('pgd show-nodes -h'). + +```sh +pgd show-events [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster) + + $ pgd show-events --lines 10 + + Time Observer Node Subject Node Type Message + ---- ------------- ------------ ---- ------- + 2022-04-19 19:45:43.077712+00 bdr-b1 bdr-c1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.066804+00 bdr-c1 bdr-a1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.057598+00 bdr-b1 bdr-a1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.046515+00 bdr-c1 bdr-b1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.033369+00 bdr-a1 bdr-c1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.013203+00 bdr-a1 bdr-b1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:40.024662+00 bdr-c1 bdr-c1 node state change ACTIVE + 2022-04-19 19:45:40.024575+00 bdr-b1 bdr-c1 node state change ACTIVE + 2022-04-19 19:45:40.022788+00 bdr-a1 bdr-c1 node state change ACTIVE + 2022-04-19 19:45:38.961424+00 bdr-c1 bdr-c1 node state change PROMOTING + +``` + +### Options + +```text + -h, --help help for show-events + -n, --lines int show top n lines +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-groups.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-groups.mdx new file mode 100644 index 00000000000..a36359a89a5 --- /dev/null +++ 
b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-groups.mdx @@ -0,0 +1,58 @@ +--- +title: show-groups +--- + +Shows all groups in the EDB Postgres Distributed cluster and their summary. + +### Synopsis + +Shows all groups in the EDB Postgres Distributed cluster and their summary. + +In some cases when the raft isn't working properly or the group raft leader +isn't present, this command might show stale or incorrect write leader for +that group. + + +```sh +pgd show-groups [flags] +``` + +### Examples + +```text + + Example 1 (multi-group cluster, with special witness-only data group + group_c and subscriber-only group group_so) + Note: + 1. For group write leader election both enable_raft and + enable_proxy_routing options should be true for that group + 2. enable_raft is always set to true for global group + + $ pgd show-groups + + Group Group ID Type Parent Group Location Raft Routing Write Leader + ----- -------- ---- ------------ -------- ---- ------- ------------ + bdrgroup 1360502012 global true false + group_a 3618712053 data bdrgroup a true true bdr-a1 + group_b 402614658 data bdrgroup b true true bdr-b1 + group_c 2808307099 data bdrgroup c false false + group_so 2123208041 subscriber-only bdrgroup c false false + +``` + +### Options + +```text + -h, --help help for show-groups +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-nodes.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-nodes.mdx new file mode 100644 index 00000000000..7281c5d271a --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-nodes.mdx @@ -0,0 +1,104 @@ +--- +title: show-nodes +--- + +Shows all nodes in the EDB Postgres Distributed cluster and their summary. + +### Synopsis + +Shows all nodes in the EDB Postgres Distributed cluster and their summary, +including name, node id, group, and current/target state. + +Node States + +* NONE: Node state is unset when the worker starts, expected to be set +quickly to the current known state. +* CREATED: bdr.create_node() has been executed, but the node is not a +member of any EDB Postgres Distributed cluster yet. +* JOIN_START: bdr.join_node_group() begins to join the local node to an +existing EDB Postgres Distributed cluster. +* JOINING: The node join has started and is currently at the initial sync +phase, creating the schema and data on the node. +* CATCHUP: Initial sync phase is completed; now the join is at the last step +of retrieving and applying transactions that were performed on the upstream +peer node since the join started. +* STANDBY: Node join has finished, but not yet started to broadcast changes. +All joins spend some time in this state, but if defined as a Logical +Standby, the node will continue in this state. +* PROMOTE: Node was a logical standby and we just called bdr.promote_node to +move the node state to ACTIVE. These two PROMOTE states have to be +coherent to the fact, that only one node can be with a state higher than +STANDBY but lower than ACTIVE. +* PROMOTING: Promotion from logical standby to full BDR node is in progress. +* ACTIVE: The node is a full BDR node and is currently ACTIVE. 
This is the +most common node status. +* PART_START: Node was ACTIVE or STANDBY and we just called bdr.part_node +to remove the node from the EDB Postgres Distributed cluster. +* PARTING: Node disconnects from other nodes and plays no further part in +consensus or replication. +* PART_CATCHUP: Non-parting nodes synchronize any missing data from the +recently parted node. +* PARTED: Node parting operation is now complete on all nodes. + +Only one node at a time can be in either of the states PROMOTE or PROMOTING. + +Note that the read-only state of a node, as shown in the Current State or in +the Target State columns, is indicated as STANDBY. + +```sh +pgd show-nodes [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and witness-a are up, bdr-a2 is down) + + $ pgd show-nodes + + Node Node ID Group Type Current State Target State Status Seq ID + ---- ------- ----- ---- ------------- ------------ ------ ------ + bdr-a1 3136956818 group_a data ACTIVE ACTIVE Up 1 + bdr-a2 2133699692 group_a data ACTIVE ACTIVE Unreachable 2 + witness-a 3889635963 group_a witness ACTIVE ACTIVE Up 3 + + + Example 2 (multi-group cluster with witness, logical standby and + subscriber-only nodes) + Note: In contrast to logical standby, the subscriber-only nodes are fully + joined node to the cluster + + $ pgd show-nodes + + Node Node ID Group Type Current State Target State Status Seq ID + ---- ------- ----- ---- ------------- ------------ ------ ------ + bdr-a1 3136956818 group_a data ACTIVE ACTIVE Up 6 + bdr-a2 2133699692 group_a data ACTIVE ACTIVE Up 3 + logical-standby-a1 1140256918 group_a standby STANDBY STANDBY Up 9 + witness-a 3889635963 group_a witness ACTIVE ACTIVE Up 7 + bdr-b1 2380210996 group_b data ACTIVE ACTIVE Up 1 + bdr-b2 2244996162 group_b data ACTIVE ACTIVE Up 2 + logical-standby-b1 3541792022 group_b standby STANDBY STANDBY Up 10 + witness-b 661050297 group_b witness ACTIVE ACTIVE Up 5 + witness-c 1954444188 group_c witness ACTIVE ACTIVE Up 4 + subscriber-only-c1 2448841809 group_so subscriber-only ACTIVE ACTIVE Up 8 + +``` + +### Options + +```text + -h, --help help for show-nodes +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-proxies.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-proxies.mdx new file mode 100644 index 00000000000..685cc1b7d10 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-proxies.mdx @@ -0,0 +1,51 @@ +--- +title: show-proxies +--- + +Shows all proxies in the EDB Postgres Distributed cluster and their summary. + +### Synopsis + +Shows all proxies in the EDB Postgres Distributed cluster and their summary. + +We recommend giving all the proxies attached to the same group the same proxy +option values. 
+ + +```sh +pgd show-proxies [flags] +``` + +### Examples + +```text + + Example 1 (multi-group cluster, with 2 proxies attached to each data group) + + $ pgd show-proxies + + Proxy Group Listen Addresses Listen Port + ----- ----- ---------------- ----------- + proxy-a1 group_a [0.0.0.0] 6432 + proxy-a2 group_a [0.0.0.0] 6432 + proxy-b1 group_b [0.0.0.0] 6432 + proxy-b2 group_b [0.0.0.0] 6432 + +``` + +### Options + +```text + -h, --help help for show-proxies +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-raft.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-raft.mdx new file mode 100644 index 00000000000..394bbec42d9 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-raft.mdx @@ -0,0 +1,70 @@ +--- +title: show-raft +--- + +Shows BDR Raft (consensus protocol) details. + +### Synopsis + +Shows BDR Raft (consensus protocol) details such as Raft instance id, Raft +state (leader, follower), and Raft term. If Raft is enabled at subgroup +level, then that subgroup's Raft instance is also shown. + +In some cases, such as network partition, output might vary based on the node +to which the CLI is connected. + + +```sh +pgd show-raft [flags] +``` + +### Examples + +```text + + Example 1 (multi-group cluster with subgroup Raft and with witness, + logical standby, subscriber-only nodes) + Note: Unlike data or witness node, logical standby and subscriber-only + nodes don't have Raft voting rights (see Voting Nodes column). 
+ + $ pgd show-raft + + Instance Group Node Raft State Raft Term Commit Index Nodes Voting Nodes Protocol Version + -------- ----- ---- ---------- --------- ------------ ----- ------------ ---------------- + 1 bdrgroup bdr-b1 RAFT_LEADER 0 383 10 7 5000 + 1 bdrgroup bdr-a1 RAFT_FOLLOWER 0 383 10 7 5000 + 1 bdrgroup bdr-a2 RAFT_FOLLOWER 0 383 10 7 5000 + 1 bdrgroup bdr-b2 RAFT_FOLLOWER 0 383 10 7 5000 + 1 bdrgroup logical-standby-a1 RAFT_FOLLOWER 0 383 10 7 5000 + 1 bdrgroup logical-standby-b1 RAFT_FOLLOWER 0 383 10 7 5000 + 1 bdrgroup subscriber-only-c1 RAFT_FOLLOWER 0 383 10 7 5000 + 1 bdrgroup witness-a RAFT_FOLLOWER 0 383 10 7 5000 + 1 bdrgroup witness-b RAFT_FOLLOWER 0 383 10 7 5000 + 1 bdrgroup witness-c RAFT_FOLLOWER 0 383 10 7 5000 + 2 group_a witness-a RAFT_LEADER 1 2 4 3 0 + 2 group_a bdr-a1 RAFT_FOLLOWER 1 2 4 3 0 + 2 group_a bdr-a2 RAFT_FOLLOWER 1 2 4 3 0 + 2 group_a logical-standby-a1 RAFT_FOLLOWER 1 2 4 3 0 + 3 group_b witness-b RAFT_LEADER 1 2 4 3 0 + 3 group_b bdr-b1 RAFT_FOLLOWER 1 2 4 3 0 + 3 group_b bdr-b2 RAFT_FOLLOWER 1 2 4 3 0 + 3 group_b logical-standby-b1 RAFT_FOLLOWER 1 2 4 3 0 + +``` + +### Options + +```text + -h, --help help for show-raft +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-replslots.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-replslots.mdx new file mode 100644 index 00000000000..e36225ac569 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-replslots.mdx @@ -0,0 +1,136 @@ +--- +title: show-replslots +--- + +Shows the status of BDR replication slots. + +### Synopsis + +Shows the status of BDR replication slots. Output with the verbose flag gives +details such as is slot active, replication state (disconnected, streaming, +catchup), and approximate lag. + + Symbol Meaning + ------- -------- + * ok + ~ warning (lag > 10M) + ! critical (lag > 100M OR slot is 'inactive' OR 'disconnected') + x down / unreachable + - n/a + + In matrix view, sometimes byte lag is shown in parentheses. It is a + maxOf(WriteLag, FlushLag, ReplayLag, SentLag). 
+ +```sh +pgd show-replslots [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-replslots + + Node bdr-a1 bdr-b1 bdr-c1 + ---- ------ ------ ------ + bdr-a1 * !(6.6G) * + bdr-b1 x * x + bdr-c1 * !(6.9G) * + + + $ pgd show-replslots --verbose + + Origin Node Target Node Status (active/state) Write Lag (bytes/duration) Flush Lag (bytes/duration) Replay Lag (bytes/duration) Sent Lag (bytes) + ----------- ----------- --------------------- -------------------------- -------------------------- --------------------------- ---------------- + bdr-a1 bdr-b1 f / disconnected 6.6G / 8 days 02:58:36.243723 6.6G / 8 days 02:58:36.243723 6.6G / 8 days 02:58:36.243723 6.6G + bdr-a1 bdr-c1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-c1 bdr-a1 t / streaming 0B / 00:00:00.000812 0B / 00:00:00.000812 0B / 00:00:00.000812 0B + bdr-c1 bdr-b1 f / disconnected 6.9G / 8 days 02:58:36.004415 6.9G / 8 days 02:58:36.004415 6.9G / 8 days 02:58:36.004415 6.9G + + + Example 2 (3 node cluster, bdr-b1 was down and it has just been restarted) + + $ pgd show-replslots + + Node bdr-a1 bdr-b1 bdr-c1 + ---- ------ ------ ------ + bdr-a1 * !(6.9G) * + bdr-b1 * * * + bdr-c1 * !(5.8G) * + + + $ pgd show-replslots --verbose + + Origin Node Target Node Status (active/state) Write Lag (bytes/duration) Flush Lag (bytes/duration) Replay Lag (bytes/duration) Sent Lag (bytes) + ----------- ----------- --------------------- -------------------------- -------------------------- --------------------------- ---------------- + bdr-a1 bdr-b1 t / catchup 6.9G / 00:00:00.000778 6.9G / 00:00:00.000778 6.9G / 00:00:00.000778 6.9G + bdr-a1 bdr-c1 t / streaming 0B / 00:00:00.104121 0B / 00:00:00.104133 0B / 00:00:00.104133 0B + bdr-b1 bdr-a1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-b1 bdr-c1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-c1 bdr-a1 t / streaming 6.8K / 00:00:00 6.8K / 00:00:00 6.8K / 00:00:00 6.8K + bdr-c1 bdr-b1 t / catchup 5.5G / 00:00:00.008257 5.5G / 00:00:00.008257 5.5G / 00:00:00.008257 5.5G + + + Example 3 (3 node cluster, all nodes are up and in 'streaming' state) + + $ pgd show-replslots + + Node bdr-a1 bdr-b1 bdr-c1 + ---- ------ ------ ------ + bdr-a1 * * * + bdr-b1 * * * + bdr-c1 * * * + + + $ pgd show-replslots --verbose + + Origin Node Target Node Status (active/state) Write Lag (bytes/duration) Flush Lag (bytes/duration) Replay Lag (bytes/duration) Sent Lag (bytes) + ----------- ----------- --------------------- -------------------------- -------------------------- --------------------------- ---------------- + bdr-a1 bdr-b1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-a1 bdr-c1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-b1 bdr-a1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-b1 bdr-c1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-c1 bdr-a1 t / streaming 0B / 00:00:00 528B / 00:00:00 528B / 00:00:00 0B + bdr-c1 bdr-b1 t / streaming 528B / 00:00:00 528B / 00:00:00 528B / 00:00:00 0B + + + Example 4 (cluster with witness, logical standby and subscriber-only nodes; + upstream for logical-standby-a1 is bdr-a1 and for logical-standby-b1 it is bdr-b1) + Note: + 1. A logical standby is sent data only by one source node, but no other + nodes receive replication changes from it + 2. 
Subscriber-only node subscribes to replication changes from other nodes + in the cluster, but no other nodes receive replication changes from it + + $ pgd show-replslots + + Node bdr-a1 bdr-b1 logical-standby-a1 logical-standby-b1 subscriber-only-c1 witness-c1 + ---- ------ ------ ------------------ ------------------ ------------------ ---------- + bdr-a1 * * * - * * + bdr-b1 * * - * * * + logical-standby-a1 - - * - - - + logical-standby-b1 - - - * - - + subscriber-only-c1 - - - - * - + witness-c1 * * - - * * + +``` + +### Options + +```text + -h, --help help for show-replslots + -v, --verbose verbose output (default true) +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-subscriptions.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-subscriptions.mdx new file mode 100644 index 00000000000..839755e7669 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-subscriptions.mdx @@ -0,0 +1,103 @@ +--- +title: show-subscriptions +--- + +Shows BDR subscription (incoming replication) details. + +### Synopsis + +Shows BDR subscription (incoming replication) details such as origin/target +node, timestamp of the last replayed transaction, and lag between now and the +timestamp of the last replayed transaction. + +```sh +pgd show-subscriptions [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-subscriptions + + Origin Node Target Node Last Transaction Replayed At Lag Duration (seconds) + ----------- ----------- ---------------------------- ---------------------- + bdr-a1 bdr-c1 2022-04-23 13:13:40.854433+00 0.514275 + bdr-b1 bdr-a1 + bdr-b1 bdr-c1 + bdr-c1 bdr-a1 2022-04-23 13:13:40.852233+00 0.335464 + + + Example 2 (3 node cluster, bdr-b1 was down and it has just been restarted) + + $ pgd show-subscriptions + + Origin Node Target Node Last Transaction Replayed At Lag Duration (seconds) + ----------- ----------- ---------------------------- ---------------------- + bdr-a1 bdr-b1 2022-04-23 13:14:45.669254+00 0.001686 + bdr-a1 bdr-c1 2022-04-23 13:14:46.157913+00 -0.002009 + bdr-b1 bdr-a1 + bdr-b1 bdr-c1 + bdr-c1 bdr-a1 2022-04-23 13:14:45.698472+00 0.259521 + bdr-c1 bdr-b1 2022-04-23 13:14:45.667979+00 0.002961 + + + Example 3 (3 node cluster, all nodes are up and in 'streaming' state) + + $ pgd show-subscriptions + + Origin Node Target Node Last Transaction Replayed At Lag Duration (seconds) + ----------- ----------- ---------------------------- ---------------------- + bdr-a1 bdr-b1 2022-04-23 13:15:39.732375+00 0.034462 + bdr-a1 bdr-c1 2022-04-23 13:15:40.179618+00 0.002647 + bdr-b1 bdr-a1 2022-04-23 13:15:39.719994+00 0.305814 + bdr-b1 bdr-c1 2022-04-23 13:15:40.180886+00 0.001379 + bdr-c1 bdr-a1 2022-04-23 13:15:39.714397+00 0.311411 + bdr-c1 bdr-b1 2022-04-23 13:15:39.714397+00 0.052440 + + + Example 4 (cluster with witness, logical standby and subscriber-only nodes; + upstream for logical-standby-a1 is bdr-a1 and for logical-standby-b1 it is bdr-b1) + Note: Logical standby and subscriber-only nodes receive changes but do not + send changes made locally to other nodes + + $ 
pgd show-subscriptions + + Origin Node Target Node Last Transaction Replayed At Lag Duration (seconds) + ----------- ----------- ---------------------------- ---------------------- + bdr-a1 bdr-b1 2022-04-23 13:40:49.106411+00 0.853665 + bdr-a1 logical-standby-a1 2022-04-23 13:40:50.72036+00 0.138430 + bdr-a1 logical-standby-b1 + bdr-a1 subscriber-only-c1 2022-04-23 13:40:50.72036+00 0.016226 + bdr-a1 witness-c1 2022-04-23 13:40:50.470142+00 0.001514 + bdr-b1 bdr-a1 2022-04-23 13:40:49.10174+00 1.095422 + bdr-b1 logical-standby-a1 + bdr-b1 logical-standby-b1 2022-04-23 13:40:50.713666+00 0.271213 + bdr-b1 subscriber-only-c1 2022-04-23 13:40:50.713666+00 0.022920 + bdr-b1 witness-c1 2022-04-23 13:40:50.471789+00 -0.000133 + witness-c1 bdr-a1 2022-04-23 13:40:49.107706+00 1.089456 + witness-c1 bdr-b1 2022-04-23 13:40:49.107706+00 0.852370 + witness-c1 logical-standby-a1 + witness-c1 logical-standby-b1 + witness-c1 subscriber-only-c1 2022-04-23 13:40:50.719844+00 0.016742 + +``` + +### Options + +```text + -h, --help help for show-subscriptions +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_show-version.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-version.mdx new file mode 100644 index 00000000000..f420b48f6ff --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_show-version.mdx @@ -0,0 +1,57 @@ +--- +title: show-version +--- + +Shows the version of BDR and Postgres installed on each node. + +### Synopsis + +Shows the version of BDR and Postgres installed on each node in the cluster. 
+ +```sh +pgd show-version [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-version + + Node BDR Version Postgres Version + ---- ----------- ---------------- + bdr-c1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + bdr-a1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + bdr-b1 + + + Example 2 (3 node cluster, all nodes are up) + + $ pgd show-version + + Node BDR Version Postgres Version + ---- ----------- ---------------- + bdr-c1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + bdr-a1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + bdr-b1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + +``` + +### Options + +```text + -h, --help help for show-version +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_switchover.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_switchover.mdx new file mode 100644 index 00000000000..7044a0fada6 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_switchover.mdx @@ -0,0 +1,64 @@ +--- +title: switchover +--- + +Switches over to new write leader. + +### Synopsis + +Switches over to new write leader. Use switchover method 'fast' for immediate +switchover. Use 'strict' to wait until lag is less than 'route_writer_max_lag' +on the given target node. If switchover fails due to timeout or any other +issue, BDR might elect a write leader that's different from the given target +node. 
+ +```sh +pgd switchover [flags] +``` + +### Examples + +```text + + Example 1 (with required arguments, default method is 'strict' and default + timeout is '10s') + + $ pgd switchover --group-name group_a --node-name bdr-a1 + switchover is complete + + + Example 2 (with optional arguments) + + $ pgd switchover --group-name group_a --node-name bdr-a1 --method strict --timeout 15s + switchover is complete + + + Example 3 (immediate switchover) + + $ pgd switchover --group-name group_a --node-name bdr-a1 --method fast + switchover is complete + +``` + +### Options + +```text + --group-name string group name + -h, --help help for switchover + --method string switchover method (strict, fast) + strict - waits until lag on given node is less than route_writer_max_lag + fast - immediate switchover, route_writer_max_lag is ignored (default "strict") + --node-name string node name + --timeout duration timeout period when switchover method is strict (default 10s) +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_verify-cluster.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_verify-cluster.mdx new file mode 100644 index 00000000000..0764b8c4648 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_verify-cluster.mdx @@ -0,0 +1,64 @@ +--- +title: verify-cluster +--- + +Verifies whether the cluster follows the rules as per the AlwaysOn architecture. + +### Synopsis + +Verifies whether the cluster follows the rules as per the AlwaysOn architecture. 
+ + +```sh +pgd verify-cluster [flags] +``` + +### Examples + +```text + + Example 1 (cluster with not recommended architecture) + + $ pgd verify-cluster + + Check Status Groups + ----- ------ ------ + There is always at least 1 Global Group and 1 Data Group Ok + There are at least 2 data nodes in a Data Group (except for the witness-only group) Critical group_b + There is at most 1 witness node in a Data Group Warning group_a + Witness-only group does not have any child groups Ok + There is at max 1 witness-only group iff there is even number of local Data Groups Warning bdrgroup + There are at least 2 proxies configured per Data Group if routing is enabled Warning group_a, group_b + + + Example 2 (cluster with recommended architecture) + + $ pgd verify-cluster + + Check Status Groups + ----- ------ ------ + There is always at least 1 Global Group and 1 Data Group Ok + There are at least 2 data nodes in a Data Group (except for the witness-only group) Ok + There is at most 1 witness node in a Data Group Ok + Witness-only group does not have any child groups Ok + There is at max 1 witness-only group iff there is even number of local Data Groups Ok + There are at least 2 proxies configured per Data Group if routing is enabled Ok + +``` + +### Options + +```text + -h, --help help for verify-cluster +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/command_ref/pgd_verify-settings.mdx b/product_docs/docs/pgd/5/cli/command_ref/pgd_verify-settings.mdx new file mode 100644 index 00000000000..bf6c33b20ee --- /dev/null +++ b/product_docs/docs/pgd/5/cli/command_ref/pgd_verify-settings.mdx @@ -0,0 +1,114 @@ +--- +title: verify-settings +--- + +Verifies the EDB Postgres Distributed cluster settings. + +### Synopsis + +Verifies the EDB Postgres Distributed cluster settings. 
+ + +```sh +pgd verify-settings [flags] +``` + +### Examples + +```text + + Example 1 + + $ pgd verify-settings + + # bdr.ddl_locking + Ok: all node values are ok + + + # bdr.max_writers_per_subscription + Ok: all node values are ok + + + # bdr.standby_slots_min_confirmed + Node Status Pending Restart Value Message + ---- ------ --------------- ----- ------- + bdr-a1 Warning false -1 must be >= 1 + bdr-a2 Warning false -1 must be >= 1 + bdr-b1 Warning false -1 must be >= 1 + bdr-b2 Warning false -1 must be >= 1 + logical-standby-a1 Warning false -1 must be >= 1 + logical-standby-b1 Warning false -1 must be >= 1 + subscriber-only-c1 Warning false -1 must be >= 1 + witness-a Warning false -1 must be >= 1 + witness-b Warning false -1 must be >= 1 + witness-c Warning false -1 must be >= 1 + + + # bdr.xact_replication + Ok: all node values are ok + + + # max_replication_slots + Node Status Pending Restart Value Message + ---- ------ --------------- ----- ------- + bdr-a1 Critical false 8 must be >= 10 + bdr-a2 Ok false 12 + bdr-b1 Ok false 12 + bdr-b2 Ok false 12 + logical-standby-a1 Ok false 12 + logical-standby-b1 Ok false 12 + subscriber-only-c1 Ok false 12 + witness-a Ok false 12 + witness-b Ok false 12 + witness-c Ok false 12 + Warning: value must be same on all primary nodes + + + # max_wal_senders + Ok: all node values are ok + + + # max_worker_processes + Ok: all node values are ok + + + # shared_preload_libraries + Node Status Pending Restart Value Message + ---- ------ --------------- ----- ------- + bdr-a1 Warning false pg_stat_statements, bdr must contain bdr as first entry + bdr-a2 Warning false pg_stat_statements, bdr must contain bdr as first entry + bdr-b1 Warning false pg_stat_statements, bdr must contain bdr as first entry + bdr-b2 Warning false pg_stat_statements, bdr must contain bdr as first entry + logical-standby-a1 Warning false pg_stat_statements, bdr must contain bdr as first entry + logical-standby-b1 Warning false pg_stat_statements, bdr must contain bdr as first entry + subscriber-only-c1 Warning false pg_stat_statements, bdr must contain bdr as first entry + witness-a Warning false pg_stat_statements, bdr must contain bdr as first entry + witness-b Warning false pg_stat_statements, bdr must contain bdr as first entry + witness-c Warning false pg_stat_statements, bdr must contain bdr as first entry + + + # track_commit_timestamp + Ok: all node values are ok + + + # wal_level + Ok: all node values are ok + +``` + +### Options + +```text + -h, --help help for verify-settings +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-cli/pgd-cli-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd/5/cli/index.mdx b/product_docs/docs/pgd/5/cli/index.mdx new file mode 100644 index 00000000000..7f2a2f9ad8e --- /dev/null +++ b/product_docs/docs/pgd/5/cli/index.mdx @@ -0,0 +1,69 @@ +--- +title: "EDB Postgres Distributed Command Line Interface" +navTitle: "Command line interface" +indexCards: none +navigation: +- installing_cli +- command_ref + +directoryDefaults: + description: "The PGD Command Line Interface (CLI) is a tool to manage your EDB Postgres Distributed cluster" +--- + +The EDB Postgres Distributed Command Line Interface (PGD CLI) is a 
tool to manage your EDB Postgres Distributed cluster. It allows you to run commands against EDB Postgres Distributed clusters. + +See the [Command reference](command_ref) for the available commands to inspect, manage, and get information on cluster resources. + +See [Installing PGD CLI](installing_cli) for information about how Trusted Platform Architect deploys PGD CLI, how to install PGD CLI on a standalone server manually, and specifying connection strings. + +## Requirements + +The PGD CLI requires postgres superuser privileges to run. + +## Using the PGD CLI +`pgd` is the command name for the PGD command line interface. See [pgd](command_ref) in the Command reference for a description of the command options. See the following sections for sample use cases. + + +## Specifying a configuration file + +If you rename the file or move it to another location, specify the new name and location using the optional `-f` or `--config-file` flag. For example: + +```sh + pgd show-nodes -f /opt/my-config.yml +``` + +## Passing a database connection string + +Use the `--dsn` flag to pass a database connection string directly to a command. You don't need a configuration file if you pass the connection string with this flag. The flag takes precedence if a configuration file is present. For example: + +```sh +pgd show-nodes --dsn "host=bdr-a1 port=5432 dbname=bdrdb user=postgres " +``` +## Specifying the output format + +The PGD CLI supports the following output formats: + +| Format | Considerations | +| ------- | -------------- | +| tabular | Default format. Presents the data in tabular form.| +| json | Presents the raw data with no formatting. For some commands, the json output may show more data than shown in the tabular output such as extra fields and more detailed messages. | +| yaml | Same as json except field order is alphabetical. Experimental and may not be fully supported in future versions. | + +Use the `-o` or `--output` flag to change the default output format to json or yaml. For example: + +```sh +pgd show-nodes -o json +``` +## Accessing the command line help + +To list the supported commands, enter: + +```sh +pgd help +``` + +For help for a specific command and its parameters, enter `pgd help `. For example: + +```sh +pgd help show-nodes +``` diff --git a/product_docs/docs/pgd/5/cli/installing_cli.mdx b/product_docs/docs/pgd/5/cli/installing_cli.mdx new file mode 100644 index 00000000000..a60971ef227 --- /dev/null +++ b/product_docs/docs/pgd/5/cli/installing_cli.mdx @@ -0,0 +1,45 @@ +--- +title: "Installing PGD CLI" +navTitle: "Installing PGD CLI" +--- + + +Trusted Platform Architect installs and configures PGD CLI on each PGD node, by default. If you wish to install PGD CLI on any non-PGD instance in the cluster, you simply attach the pgdcli role to that instance in TPA's configuration file before deploying. See [Trusted Postgres Architect](/tpa/latest/) for more information. + +## Installing manually + +You can manually install the PGD CLI on any Linux machine using `.deb` and `.rpm` packages available from the EDB repository. The package name is `edb-pgd5-cli`. For example: + +```sh +# for Debian +sudo apt-get install edb-pgd5-cli +``` + +When the PGD CLI is configured by TPA, it connects automatically, but with a manual installation to a standalone EDB Postgres Distributed cluster you need to provide a connection string. + + +### Specifying database connection strings + +You can use a configuration file to specify the database connection strings for your cluster. 
Alternatively, you can pass the connection string directly to a command. For details, see the [sample use case](./#passing-a-database-connection-string). + +#### Using a configuration file + +Use the `pgd-cli-config.yml` configuration file to specify the database connection string for your cluster. The configuration file must contain the database connection string for at least one BDR node in the cluster. The cluster name is optional and isn't validated. + +For example: + +```yaml +cluster: + name: cluster-name + endpoints: + - "host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + - "host=bdr-b1 port=5432 dbname=bdrdb user=postgres " + - "host=bdr-c1 port=5432 dbname=bdrdb user=postgres " +``` + +By default, `pgd-cli-config.yml` is located in the `/etc/edb/pgd-cli` directory. In v1, the file was named `pgd-config.yml` and default location was `/etc/edb`. The PGD CLI searches for `pgd-cli-config.yml` in the following locations. Precedence order is high to low. + + 1. `/etc/edb/pgd-cli` (default) + 2. `$HOME/.edb/pgd-cli` + +If you rename the file or move it to another location, specify the new name and location using the optional `-f` or `--config-file` flag when entering a command. See the [sample use case](./#passing-a-database-connection-string). diff --git a/product_docs/docs/pgd/5/configuration.mdx b/product_docs/docs/pgd/5/configuration.mdx new file mode 100644 index 00000000000..69c2472f5b3 --- /dev/null +++ b/product_docs/docs/pgd/5/configuration.mdx @@ -0,0 +1,517 @@ +--- +navTitle: Postgres configuration +title: Postgres configuration +redirects: + - bdr/configuration + +--- + +Several Postgres configuration parameters affect PGD +nodes. You can set these parameters differently on each node, +although that isn't generally recommended. + +## Postgres settings + +PGD requires these Postgres settings to run correctly: + +- `wal_level` — Must be set to `logical`, since PGD relies on logical decoding. +- `shared_preload_libraries` — Must contain `bdr`, although it can contain + other entries before or after, as needed. However, don't include `pglogical`. +- `track_commit_timestamp` — Must be set to `on` for conflict resolution to + retrieve the timestamp for each conflicting row. + +PGD requires these PostgreSQL settings to be set to appropriate values, +which vary according to the size and scale of the cluster. + +- `logical_decoding_work_mem` — Memory buffer size used by logical decoding. + Transactions larger than this overflow the buffer and are stored + temporarily on local disk. Default is 64 MB, but you can set it much higher. +- `max_worker_processes` — PGD uses background workers for replication + and maintenance tasks, so you need enough worker slots for it to + work correctly. The formula for the correct minimal number of workers, for each database, is: + - One per PostgreSQL instance plus + - One per database on that instance plus + - Four per PGD-enabled database plus + - One per peer node in the PGD group plus + - One for each writer-enabled per peer node in the PGD group + You might need more worker processes temporarily when a node is being + removed from a PGD group. +- `max_wal_senders` — Two needed per every peer node. +- `max_replication_slots` — Same as `max_wal_senders`. +- `wal_sender_timeout` and `wal_receiver_timeout` — Determines how + quickly a node considers its CAMO partner as disconnected or + reconnected. See [CAMO failure scenarios](durability/camo/#failure-scenarios) for + details. 
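+
+As a rough, worked illustration of the `max_worker_processes` formula above (a sketch only; the cluster shape and the headroom value are assumptions, not recommendations), consider one PostgreSQL instance with a single PGD-enabled database, two peer nodes in the group, and one writer per peer node:
+
+```sql
+-- Hypothetical sizing: 1 (instance) + 1 (database) + 4 (PGD-enabled database)
+--                      + 2 (peer nodes) + 2 (one writer per peer node) = 10
+-- Round up to leave headroom for the temporary workers used while a node
+-- is being removed from the group.
+ALTER SYSTEM SET max_worker_processes = 16;
+-- Changing max_worker_processes takes effect only after a server restart.
+```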
+ +In normal running for a group with N peer nodes, PGD requires +N slots and WAL senders. During synchronization, PGD temporarily uses another +N - 1 slots and WAL senders, so be careful to set the parameters high enough +for this occasional peak demand. + +With parallel apply turned on, the number of slots must be increased to +N slots from the formula multiplied by the number of writers. This is because `max_replication_slots` +also sets the maximum number of replication origins, and some of the functionality +of parallel apply uses an extra origin per writer. + +When the [decoding worker](nodes#decoding-worker) is enabled, this +process requires one extra replication slot per PGD group. + +Changing these parameters requires restarting the local node: +`max_worker_processes`, `max_wal_senders`, `max_replication_slots`. + +A legacy synchronous replication mode is supported by using the following +parameters. See [Durability and performance options](durability) for details and +limitations. + +- `synchronous_commit` — Affects the durability and performance of PGD replication + in a similar way to [physical replication](https://www.postgresql.org/docs/11/runtime-config-wal.html#GUC-SYNCHRONOUS-COMMIT). +- `synchronous_standby_names` — Same as above. + +## BDR-specific settings + +You can also set BDR-specific configuration settings. +Unless noted otherwise, you can set the values at any time. + +### Conflict handling + +- `bdr.default_conflict_detection` — Sets the default conflict detection method + for newly created tables. Accepts the same values as + [bdr.alter_table_conflict_detection()](consistency/conflicts#bdralter_table_conflict_detection). + +### Global sequence parameters + +- `bdr.default_sequence_kind` — Sets the default [sequence kind](sequences). + The default value is `distributed`, which means `snowflakeid` is used + for `int8` sequences (i.e., `bigserial`) and `galloc` sequences for `int4` + (i.e., `serial`) and `int2` sequences. + +### DDL handling + +- `bdr.default_replica_identity` — Sets the default value for `REPLICA IDENTITY` + on newly created tables. The `REPLICA IDENTITY` defines the information + written to the write-ahead log to identify rows that are updated or deleted. + + The accepted values are: + + - `DEFAULT` — Records the old values of the columns of the primary key, + if any (this is the default PostgreSQL behavior). + - `FULL` — Records the old values of all columns in the row. + - `NOTHING` — Records no information about the old row. + + See the [PostgreSQL documentation](https://www.postgresql.org/docs/current/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY) + for more details. + + BDR can't replicate `UPDATE` and `DELETE` operations on tables without a `PRIMARY KEY` + or `UNIQUE` constraint. The exception is when the replica identity for the table is `FULL`, + either by table-specific configuration or by `bdr.default_replica_identity`. + + If `bdr.default_replica_identity` is `DEFAULT` and there is a `UNIQUE` + constraint on the table, it isn't automatically picked up as + `REPLICA IDENTITY`. You need to set it explicitly when creating the table + or afterward, as described above. + + Setting the replica identity of tables to `FULL` increases the volume of + WAL written and the amount of data replicated on the wire for the table. + +- `bdr.ddl_replication` — Automatically replicate DDL across nodes (default is + `on`). + + This parameter can be set only by bdr_superuser or superuser roles. 
+ + Running DDL or calling BDR administration functions with + `bdr.ddl_replication = off` can create situations where replication stops + until an administrator can intervene. See [DDL replication](ddl) + for details. + + A `LOG`-level log message is emitted to the PostgreSQL server logs whenever + `bdr.ddl_replication` is set to `off`. Additionally, a `WARNING`-level + message is written whenever replication of captured DDL commands or BDR + replication functions is skipped due to this setting. + +- `bdr.role_replication` — Automatically replicate ROLE commands across nodes + (default is `on`). Only a superuser can set this parameter. This setting + works only if `bdr.ddl_replication` is turned on as well. + + Turning this off without using external methods to ensure roles are in sync + across all nodes might cause replicated DDL to interrupt replication until + the administrator intervenes. + + See [Role manipulation statements](ddl/#role-manipulation-statements) + for details. + +- `bdr.ddl_locking` — Configures the operation mode of global locking for DDL. + + This parameter can be set only by bdr_superuser or superuser roles. + + Possible options are: + + - off — Don't use global locking for DDL operations. + - on — Use global locking for all DDL operations. + - dml — Use global locking only for DDL operations that need to prevent + writes by taking the global DML lock for a relation. + + A `LOG`-level log message is emitted to the PostgreSQL server logs + whenever `bdr.ddl_replication` is set to `off`. Additionally, a `WARNING` + message is written whenever any global locking steps are skipped due to + this setting. It's normal for some statements to result in two `WARNING` messages: + one for skipping the DML lock and one for skipping the DDL lock. + +- `bdr.truncate_locking` — False by default, this configuration option sets the + TRUNCATE command's locking behavior. When set to true, TRUNCATE + obeys the `bdr.ddl_locking` setting. + +### Global locking + +- `bdr.ddl_locking` — Described above. +- `bdr.global_lock_max_locks` — Maximum number of global locks that can be + held on a node (default 1000). Can be set only at Postgres server start. +- `bdr.global_lock_timeout` — Sets the maximum allowed duration of any wait + for a global lock (default 10 minutes). A value of zero disables this timeout. +- `bdr.global_lock_statement_timeout` — Sets the maximum allowed duration of + any statement holding a global lock (default 60 minutes). + A value of zero disables this timeout. +- `bdr.global_lock_idle_timeout` — Sets the maximum allowed duration of + idle time in a transaction holding a global lock (default 10 minutes). + A value of zero disables this timeout. +- `bdr.predictive_checks` — Log level for predictive checks (currently used only + by global locks). Can be `DEBUG`, `LOG`, `WARNING` (default), or `ERROR`. Predictive checks + are early validations of the expected cluster state when doing certain operations. You + can use them to make those operations fail early rather than wait for + timeouts. In global lock terms, BDR checks that there are enough nodes + connected and within a reasonable lag limit to get the quorum needed by the + global lock. + +### Node management + +- `bdr.replay_progress_frequency` — Interval for sending replication position + info to the rest of the cluster (default 1 minute). + +- `bdr.standby_slot_names` — Require these slots to receive and confirm + replication changes before any other ones. 
This setting is useful primarily when + using physical standbys for failover or when using subscriber-only nodes. + +### Generic replication + +- `bdr.writers_per_subscription` — Default number of writers per + subscription (in BDR, you can also change this with + `bdr.alter_node_group_config` for a group). + +- `bdr.max_writers_per_subscription` — Maximum number of writers + per subscription (sets the upper limit for the setting above). + +- `bdr.xact_replication` — Replicate the current transaction (default is `on`). + + Turning this off makes the whole transaction local only, which + means the transaction isn't visible to logical decoding by + BDR and all other downstream targets of logical decoding. Data isn't + transferred to any other node, including logical standby nodes. + + This parameter can be set only by the bdr_superuser or superuser roles. + + This parameter can be set only inside the current transaction using the + `SET LOCAL` command unless `bdr.permit_unsafe_commands = on`. A sketch of this + pattern appears at the end of this section. + +!!! Note + Even with transaction replication disabled, WAL is generated, + but those changes are filtered away on the origin. + +!!! Warning + Turning off `bdr.xact_replication` leads to data + inconsistency between nodes. Use it only to recover from + data divergence between nodes or in + replication situations where changes on single nodes are required for + replication to continue. Use at your own risk. + +- `bdr.permit_unsafe_commands` — Option to override safety checks on commands + that are deemed unsafe for general use. + + Requires `bdr_superuser` or PostgreSQL superuser. + +!!! Warning + The commands that are normally not considered safe can either + produce inconsistent results or break replication altogether. Use at your + own risk. + +- `bdr.batch_inserts` — The number of consecutive inserts to one table in + a single transaction that turns on batch processing of inserts for that table. + + This option allows replication of large data loads as COPY internally, + rather than as a set of inserts. It is also how the initial data during node join + is copied. + +- `bdr.maximum_clock_skew` + + This option specifies the maximum difference between + the incoming transaction commit timestamp and the current time on the + subscriber before triggering `bdr.maximum_clock_skew_action`. + + It checks if the timestamp of the currently replayed transaction is in the + future compared to the current time on the subscriber. If it is, and the + difference is larger than `bdr.maximum_clock_skew`, it performs the action + specified by the `bdr.maximum_clock_skew_action` setting. + + The default is `-1`, which means ignore clock skew (the check is turned + off). It's valid to set it to 0 when the clocks on all servers are synchronized. + The fact that we are replaying the transaction means it has been committed in + the past. + +- `bdr.maximum_clock_skew_action` + + This specifies the action to take if a clock skew higher than + `bdr.maximum_clock_skew` is detected. + + There are two possible values for this option: + + - `WARN` — Log a warning about this fact. The warnings are logged once per + minute (the default) at the maximum to prevent flooding the server log. + - `WAIT` — Wait until the current local timestamp is no longer older than the + remote commit timestamp minus `bdr.maximum_clock_skew`. + +- `bdr.accept_connections` — Option to enable or disable connections to BDR. + Defaults to `on`. + + Requires `bdr_superuser` or PostgreSQL superuser. 
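+
+As referenced in the `bdr.xact_replication` entry above, here's a minimal sketch of scoping that setting to a single transaction with `SET LOCAL`. The table and values are hypothetical, and the same warnings about data divergence apply:
+
+```sql
+-- Hypothetical local-only change; requires bdr_superuser or superuser.
+-- Nothing in this transaction is replicated to other nodes.
+BEGIN;
+SET LOCAL bdr.xact_replication = off;
+UPDATE my_app.node_local_settings SET value = 'maintenance' WHERE key = 'mode';
+COMMIT;
+```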
+ +### `bdr.standby_slot_names` + +This option is typically used in failover configurations to ensure that the +failover-candidate streaming physical replicas for this BDR node +have received and flushed all changes before they ever become visible to +subscribers. That guarantees that a commit can't vanish on failover to a +standby for the provider. + +Replication slots whose names are listed in the comma-separated +`bdr.standby_slot_names` list are treated specially by the walsender +on a BDR node. + +BDR's logical replication walsenders ensures that all local changes +are sent and flushed to the replication slots in `bdr.standby_slot_names` +before the node sends those changes to any other BDR replication +clients. Effectively, it provides a synchronous replication barrier between the +named list of slots and all other replication clients. + +Any replication slot can be listed in `bdr.standby_slot_names`. Both +logical and physical slots work, but it's generally used for physical slots. + +Without this safeguard, two anomalies are possible where a commit can be +received by a subscriber and then vanish from the provider on failover because +the failover candidate hadn't received it yet: + +- For 1+ subscribers, the subscriber might have applied the change but the new + provider might execute new transactions that conflict with the received change, + as it never happened as far as the provider is concerned. + +- For 2+ subscribers, at the time of failover, not all subscribers have applied + the change. The subscribers now have inconsistent and irreconcilable states + because the subscribers that didn't receive the commit have no way to get it. + +Setting `bdr.standby_slot_names` by design causes other subscribers +not listed in there to lag behind the provider if the required number +of listed nodes are not keeping up. Monitoring is thus essential. + +Another use case where `bdr.standby_slot_names` is useful is when +using a subscriber-only node, to ensure that it does not move ahead of +any of the regular BDR nodes. This can best be achieved by listing the +logical slots of all regular BDR peer nodes in combination with +setting `bdr.standby_slots_min_confirmed` to at least one. + +### `bdr.standby_slots_min_confirmed` + +Controls how many of the `bdr.standby_slot_names` have to confirm before +we send data to BDR subscribers. + +### `bdr.writer_input_queue_size` + +This option specifies the size of the shared memory queue used +by the receiver to send data to the writer process. If the writer process is +stalled or making slow progress, then the queue might get filled up, stalling +the receiver process too. So it's important to provide enough shared memory for +this queue. The default is 1 MB, and the maximum allowed size is 1 GB. While any +storage size specifier can be used to set the GUC, the default is KB. + +### `bdr.writer_output_queue_size` + +This option specifies the size of the shared memory queue used +by the receiver to receive data from the writer process. Since the writer +isn't expected to send a large amount of data, a relatively smaller sized queue +is enough. The default is 32 KB, and the maximum allowed size is 1 MB. +While any storage size specifier can be used to set the GUC, the default is +KB. + +### `bdr.min_worker_backoff_delay` + +Rate limit BDR background worker launches by preventing a given worker +from being relaunched more often than every +`bdr.min_worker_backoff_delay` milliseconds. 
On repeated errors, the backoff +increases exponentially with added jitter up to a maximum of +`bdr.max_worker_backoff_delay`. + +Time-unit suffixes are supported. + +!!! Note + This setting currently affects only the receiver worker, which means it + primarily affects how fast a subscription tries to reconnect on error + or connection failure. + +The default for `bdr.min_worker_backoff_delay` is 1 second. For +`bdr.max_worker_backoff_delay`, it is 1 minute. + +If the backoff delay setting is changed and the PostgreSQL configuration is +reloaded, then all current backoff waits are reset. Additionally, the +`bdr.worker_task_reset_backoff_all()` function is provided to allow the +administrator to force all backoff intervals to expire immediately. + +A tracking table in shared memory is maintained to remember the last launch +time of each type of worker. This tracking table isn't persistent. It is +cleared by PostgreSQL restarts, including soft restarts during crash recovery +after an unclean backend exit. + +You can use the view [`bdr.worker_tasks`](catalogs#bdrworker_tasks) to inspect this state so the administrator can see any backoff +rate limiting currently in effect. + +For rate limiting purposes, workers are classified by task. This key consists +of the worker role, database OID, subscription ID, subscription writer ID, +extension library name and function name, extension-supplied worker name, and +the remote relation ID for sync writers. `NULL` is used where a given +classifier doesn't apply. For example, manager workers don't have a subscription ID, +and receivers don't have a writer ID. + +### CRDTs + +- `bdr.crdt_raw_value` — Sets the output format of [CRDT data types](consistency/crdt). + The default output (when this setting is `off`) is to return only the current + value of the base CRDT type (for example, a bigint for `crdt_pncounter`). + When set to `on`, the returned value is the full representation of + the CRDT value, which can, for example, include the state from multiple nodes. + +### Max prepared transactions + +- `max_prepared_transactions` — Needs to be set high enough to cope + with the maximum number of concurrent prepared transactions across + the cluster due to explicit two-phase commits, CAMO, or Eager + transactions. Exceeding the limit prevents a node from running a + local two-phase commit or CAMO transaction and prevents all + Eager transactions on the cluster. + You can set this only at Postgres server start. + +### Eager Replication + +- `bdr.commit_scope` — Sets the current (or default) + [Commit Scope](durability/commit-scopes) (default `local`). + +### Commit At Most Once + +- `bdr.camo_local_mode_delay` — The commit delay that applies in + CAMO's asynchronous mode to emulate the overhead that normally occurs with + the CAMO partner having to confirm transactions. Defaults to 5 ms. + Set to `0` to disable this feature. +- `bdr.camo_enable_client_warnings` — Emit warnings if an activity is + carried out in the database for which CAMO properties can't be + guaranteed. This is enabled by default. Well-informed users can choose + to disable this to reduce the number of warnings going into their logs. + +### Transaction streaming + +- `bdr.default_streaming_mode` — Used to control transaction streaming by + the subscriber node. Permissible values are: `off`, `writer`, `file`, and `auto`. + Defaults to `auto`. If set to `off`, the subscriber doesn't request + transaction streaming. 
If set to one of the other values, the + subscriber requests transaction streaming and the publisher provides + it if it supports streaming and if it's configured at the group level. For + more details, see [Transaction streaming](transaction-streaming). + +### Lag control + +- `bdr.lag_control_max_commit_delay` — Maximum acceptable post commit delay that + can be tolerated, in fractional milliseconds. +- `bdr.lag_control_max_lag_size` — Maximum acceptable lag size that can be tolerated, + in kilobytes. +- `bdr.lag_control_max_lag_time` — Maximum acceptable lag time that can be tolerated, + in milliseconds. +- `bdr.lag_control_min_conforming_nodes` — Minimum number of nodes required to stay + below acceptable lag measures. +- `bdr.lag_control_commit_delay_adjust` — Commit delay micro adjustment measured as a + fraction of the maximum commit delay time. At a default value of 0.01%, it takes + 100 net increments to reach the maximum commit delay. +- `bdr.lag_control_sample_interval` — Minimum time between lag samples and + commit delay micro adjustments, in milliseconds. +- `bdr.lag_control_commit_delay_start` — The lag threshold at which commit delay + increments start to be applied, expressed as a fraction of acceptable lag measures. + At a default value of 1.0%, commit delay increments don't begin until acceptable lag + measures are breached. + + By setting a smaller fraction, it might be possible to prevent a breach + by "bending the lag curve" earlier so that it's asymptotic with the + acceptable lag measure. + +### Timestamp-based snapshots + +- `snapshot_timestamp` — Turns on the use of + [timestamp-based snapshots](tssnapshots) and sets the timestamp to use. +- `bdr.timestamp_snapshot_keep` — Time to keep valid snapshots for the + timestamp-based snapshot use (default is `0`, meaning don't keep past snapshots). + +### Monitoring and logging + +- `bdr.debug_level` — Defines the log level that BDR uses to write + its debug messages. The default value is `debug2`. If you want to see + detailed BDR debug output, set `bdr.debug_level = 'log'`. + +- `bdr.trace_level` — Similar to the above, this defines the log level + to use for BDR trace messages. Enabling tracing on all nodes of an + EDB Postgres Distributed cluster might help EDB Support to diagnose issues. + You can set this only at Postgres server start. + +!!! Warning + Setting `bdr.debug_level` or `bdr.trace_level` to a value >= + `log_min_messages` can produce a very large volume of log output, so don't + enable it long term in production unless plans are in place for log filtering, + archival, and rotation to prevent disk space exhaustion. + +- `bdr.track_subscription_apply` — Track apply statistics for + each subscription. +- `bdr.track_relation_apply` — Track apply statistics for each + relation. +- `bdr.track_apply_lock_timing` — Track lock timing when tracking + statistics for relations. + +### Internals + +- `bdr.raft_keep_min_entries` — The minimum number of entries to keep in the + Raft log when doing log compaction (default 100). A value of 0 disables + log compaction. You can set this only at Postgres server start. + !!! Warning + If log compaction is disabled, the log grows in size forever. +- `bdr.raft_response_timeout` — To account for network failures, the + Raft consensus protocol implementation times out requests after a + certain amount of time. This timeout defaults to 30 seconds. +- `bdr.raft_log_min_apply_duration` — To move the state machine + forward, Raft appends entries to its internal log. 
During normal + operation, appending takes only a few milliseconds. This poses an + upper threshold on the duration of that append action, above which + an `INFO` message is logged. This can indicate a + problem. Default value of this parameter is 3000 ms. +- `bdr.raft_log_min_message_duration` — When to log a consensus request. + Measure roundtrip time of a bdr consensus request and log an + `INFO` message if the time exceeds this parameter. Default value + of this parameter is 5000 ms. +- `bdr.raft_group_max_connections` — The maximum number of connections + across all BDR groups for a Postgres server. These connections carry + bdr consensus requests between the groups' nodes. Default value of this + parameter is 100 connections. You can set it only at Postgres server start. +- `bdr.backwards_compatibility` — Specifies the version to be + backward compatible to, in the same numerical format as used by + `bdr.bdr_version_num`, e.g., `30618`. Enables exact behavior of a + former BDR version, even if this has generally unwanted effects. + Defaults to the current BDR version. Since this changes from release + to release, we advise against explicit use in the configuration + file unless the value is different from the current version. +- `bdr.track_replication_estimates` — Track replication estimates in terms + of apply rates and catchup intervals for peer nodes. Protocols like CAMO can use this information + to estimate the readiness of a + peer node. This parameter is enabled by default. +- `bdr.lag_tracker_apply_rate_weight` — We monitor how far behind peer nodes + are in terms of applying WAL from the local node and calculate a moving + average of the apply rates for the lag tracking. This parameter specifies + how much contribution newer calculated values have in this moving average + calculation. Default value is 0.1. diff --git a/product_docs/docs/pgd/5/consistency/column-level-conflicts.mdx b/product_docs/docs/pgd/5/consistency/column-level-conflicts.mdx new file mode 100644 index 00000000000..5d2736bdc5c --- /dev/null +++ b/product_docs/docs/pgd/5/consistency/column-level-conflicts.mdx @@ -0,0 +1,337 @@ +--- +navTitle: Column-level conflict resolution +title: Column-level conflict detection +redirects: + - /pgd/latest/bdr/column-level-conflicts/ +--- + +By default, conflicts are resolved at row level. That is, when changes +from two nodes conflict, we pick either the local or remote tuple and +discard the other one. For example, we might compare commit timestamps for +the two conflicting changes and keep the newer one. This ensures that all +nodes converge to the same result and establishes commit-order-like +semantics on the whole cluster. + +However, in some cases it might be appropriate to resolve conflicts at +the column level rather than the row level. + +Consider a simple example, where we have a table t with two integer +columns a and b and a single row `(1,1)`. Assume that on one node +we execute: + +```sql +UPDATE t SET a = 100 +``` + +On another node we concurrently (before receiving the preceding +`UPDATE`) execute: + +```sql +UPDATE t SET b = 100 +``` + +This results in an `UPDATE-UPDATE` conflict. With the `update_if_newer` +conflict resolution, we compare the commit timestamps and keep the new +row version. Assuming the second node committed last, we end up with +`(1,100)`, effectively discarding the change to column a. + +For many use cases, this is the desired and expected behavior, but for +some this might be an issue. 
Consider, for example, a multi-node cluster +where each part of the application is connected to a different node, +updating a dedicated subset of columns in a shared table. In that case, +the different components might step on each other's toes, overwriting +their changes. + +For such use cases, it might be more appropriate to resolve conflicts on +a given table at the column level. To achieve that, BDR tracks +the timestamp of the last change for each column separately and uses that +to pick the most recent value (essentially `update_if_newer`). + +Applied to the previous example, we'll end up with `(100,100)` on both +nodes, despite neither of the nodes ever seeing such a row. + +When thinking about column-level conflict resolution, it can be useful +to see tables as vertically partitioned, so that each update affects +data in only one slice. This approach eliminates conflicts between changes to +different subsets of columns. In fact, vertical partitioning can even +be a practical alternative to column-level conflict resolution. + +Column-level conflict resolution requires the table to have +`REPLICA IDENTITY FULL`. The `bdr.alter_table_conflict_detection` function does check +that and fails with an error otherwise. + +## Enabling and disabling column-level conflict resolution + +The column-level conflict resolution is managed by the +[bdr.alter_table_conflict_detection()](conflicts#bdralter_table_conflict_detection) +function. + +### Example + +To see how the `bdr.alter_table_conflict_detection()` is used, consider +this example that creates a trivial table `test_table` and then enables +column-level conflict resolution on it: + +```sql +db=# CREATE TABLE my_app.test_table (id SERIAL PRIMARY KEY, val INT); +CREATE TABLE + +db=# ALTER TABLE my_app.test_table REPLICA IDENTITY FULL; +ALTER TABLE + +db=# SELECT bdr.alter_table_conflict_detection( +db(# 'my_app.test_table'::regclass, 'column_modify_timestamp', 'cts'); + alter_table_conflict_detection +-------------------------------- + t + +db=# \d my_app.test_table +``` + +The function adds a new `cts` column (as specified in +the function call), but it also created two triggers (`BEFORE INSERT` +and `BEFORE UPDATE`) that are responsible for maintaining timestamps +in the new column before each change. + +Also, the new column specifies `NOT NULL` +with a default value, which means that `ALTER TABLE ... ADD COLUMN` +doesn't perform a table rewrite. + +!!! Note + We discourage using columns with the `bdr.column_timestamps` data type + for other purposes as it can have negative effects. + For example, it switches the table to column-level conflict resolution, which doesn't + work correctly without the triggers. + +### Listing table with column-level conflict resolution + +You can list tables having column-level conflict resolution enabled +with the following query. This query detects the presence of a column of +type `bdr.column_timestamp`. + +```sql +SELECT nc.nspname, c.relname +FROM pg_attribute a +JOIN (pg_class c JOIN pg_namespace nc ON c.relnamespace = nc.oid) + ON a.attrelid = c.oid +JOIN (pg_type t JOIN pg_namespace nt ON t.typnamespace = nt.oid) + ON a.atttypid = t.oid +WHERE NOT pg_is_other_temp_schema(nc.oid) + AND nt.nspname = 'bdr' + AND t.typname = 'column_timestamps' + AND NOT a.attisdropped + AND c.relkind IN ('r', 'v', 'f', 'p'); +``` + +### bdr.column_timestamps_create + +This function creates column-level conflict resolution. It's called within +`column_timestamp_enable`. 
+ +#### Synopsis + +```sql +bdr.column_timestamps_create(p_source cstring, p_timestamp timestampstz) +``` + +#### Parameters + +- `p_source` — The two options are `current` or `commit`. +- `p_timestamp` — Timestamp depends on the source chosen. If `commit`, + then `TIMESTAMP_SOURCE_COMMIT`. If `current`, then `TIMESTAMP_SOURCE_CURRENT`. + +## DDL locking + +When enabling or disabling column timestamps on a table, the code uses +DDL locking to ensure that there are no pending changes from before the +switch. This approach ensures we see only conflicts with timestamps in both +tuples or in neither of them. Otherwise, the code might unexpectedly see +timestamps in the local tuple and NULL in the remote one. It also +ensures that the changes are resolved the same way (column-level or +row-level) on all nodes. + +## Current versus commit timestamp + +An important decision is the timestamp to assign to modified columns. + +By default, the timestamp assigned to modified columns is the current +timestamp, as if obtained from `clock_timestamp`. This is simple, and +for many cases it is perfectly correct (for example, when the conflicting rows +modify non-overlapping subsets of columns). + +It can, however, have various unexpected effects: + +- The timestamp changes during statement execution, so if an `UPDATE` + affects multiple rows, each gets a slightly different timestamp. + This means that the effects of concurrent changes might get "mixed" in various + ways (depending on how exactly the changes performed on different + nodes interleave). + +- The timestamp is unrelated to the commit timestamp, and using it to + resolve conflicts means that the result isn't equivalent to the commit order, + which means it likely can't be serialized. + +!!! Note + We might add statement and transaction timestamps in the future, + which would address issues with mixing effects of concurrent statements or + transactions. Still, neither of these options can ever produce results + equivalent to commit order. + +It's possible to also use the actual commit timestamp, although this +feature is currently considered experimental. To use the commit timestamp, +set the last parameter to `true` when enabling column-level conflict +resolution: + +```sql +SELECT bdr.column_timestamps_enable('test_table'::regclass, 'cts', true); +``` + +You can disable it using `bdr.column_timestamps_disable`. + +Commit timestamps currently have restrictions that are +explained in [Notes](#notes). + +## Inspecting column timestamps + +The column storing timestamps for modified columns is maintained +automatically by triggers. Don't modify it directly. It can +be useful to inspect the current timestamps value, for example, while +investigating how a conflict was resolved. 
+ +Three functions are useful for this purpose: + +- `bdr.column_timestamps_to_text(bdr.column_timestamps)` + + This function returns a human-readable representation of the timestamp mapping and + is used when casting the value to `text`: + +```sql +db=# select cts::text from test_table; + cts +----------------------------------------------------------------------------------------------------- + {source: current, default: 2018-09-23 19:24:52.118583+02, map: [2 : 2018-09-23 19:25:02.590677+02]} +(1 row) + +``` + +- `bdr.column_timestamps_to_jsonb(bdr.column_timestamps)` + + This function turns a JSONB representation of the timestamps mapping and is used + when casting the value to `jsonb`: + +```sql +db=# select jsonb_pretty(cts::jsonb) from test_table; + jsonb_pretty +--------------------------------------------------- + { + + "map": { + + "2": "2018-09-23T19:24:52.118583+02:00" + + }, + + "source": "current", + + "default": "2018-09-23T19:24:52.118583+02:00"+ + } +(1 row) +``` + +- `bdr.column_timestamps_resolve(bdr.column_timestamps, xid)` + + This function updates the mapping with the commit timestamp for the attributes modified + by the most recent transaction (if it already committed). This + matters only when using the commit timestamp. For example, in this case, the last + transaction updated the second attribute (with `attnum = 2`): + +```sql +test=# select cts::jsonb from test_table; + cts +---------------------------------------------------------------------------------------------------------------------------------------- + {"map": {"2": "2018-09-23T19:29:55.581823+02:00"}, "source": "commit", "default": "2018-09-23T19:29:55.581823+02:00", "modified": [2]} +(1 row) + +db=# select bdr.column_timestamps_resolve(cts, xmin)::jsonb from test_table; + column_timestamps_resolve +----------------------------------------------------------------------------------------------------------------------- + {"map": {"2": "2018-09-23T19:29:55.581823+02:00"}, "source": "commit", "default": "2018-09-23T19:29:55.581823+02:00"} +(1 row) +``` + +## Handling column conflicts using CRDT data types + +By default, column-level conflict resolution picks the value with +a higher timestamp and discards the other one. You can, however, +reconcile the conflict in different, more elaborate ways. For example, you can use +CRDT types that allow merging the conflicting values without +discarding any information. + +## Notes + +- The attributes modified by an `UPDATE` are determined by comparing the + old and new row in a trigger. This means that if the attribute doesn't + change a value, it isn't detected as modified even if it's + explicitly set. For example, `UPDATE t SET a = a` doesn't mark `a` as + modified for any row. Similarly, `UPDATE t SET a = 1` doesn't mark + `a` as modified for rows that are already set to `1`. + +- For `INSERT` statements, we don't have any old row to compare the new + one to, so we consider all attributes to be modified and assign them + a new timestamp. This applies even for columns that weren't included + in the `INSERT` statement and received default values. We can detect + which attributes have a default value but can't know if + it was included automatically or specified explicitly. + + This effectively means column-level conflict resolution doesn't work + for `INSERT-INSERT` conflicts even if the `INSERT` statements specify + different subsets of columns. The newer row has + timestamps that are all newer than the older row. 
+ +- By treating the columns independently, it's easy to violate constraints + in a way that isn't possible when all changes happen on the same + node. Consider, for example, a table like this: + +```sql +CREATE TABLE t (id INT PRIMARY KEY, a INT, b INT, CHECK (a > b)); +INSERT INTO t VALUES (1, 1000, 1); +``` + +Assume one node does: + +```sql +UPDATE t SET a = 100; +``` + +Another node concurrently does: + +```sql +UPDATE t SET b = 500; +``` + + Each of those updates is valid when executed on the initial row and + so passes on each node. But when replicating to the other node, + the resulting row violates the `CHECK (A > b)` constraint, and the + replication stops until the issue is resolved manually. + +- The column storing timestamp mapping is managed automatically. Don't + specify or override the value in your queries, as it can result in + unpredictable effects. (We do ignore the value where possible anyway.) + +- The timestamp mapping is maintained by triggers, but the order in which + triggers execute matters. So if you have custom triggers that modify + tuples and are executed after the `pgl_clcd_` triggers, the modified + columns aren't detected correctly. + +- When using regular timestamps to order changes/commits, it's possible + that the conflicting changes have exactly the same timestamp (because + two or more nodes happened to generate the same timestamp). This risk + isn't unique to column-level conflict resolution, as it can happen + even for regular row-level conflict resolution. We use node id as a + tie-breaker in this situation (the higher node id wins), which ensures that the + same changes are applied on all nodes. + +- It is possible that there is a clock skew between different nodes. While it + can induce somewhat unexpected behavior (discarding seemingly newer + changes because the timestamps are inverted), you can manage clock skew between nodes + using the parameters `bdr.maximum_clock_skew` and + `bdr.maximum_clock_skew_action`. + +```sql +SELECT bdr.alter_node_group_config('group', ignore_redundant_updates := false); +``` diff --git a/product_docs/docs/pgd/5/consistency/conflicts.mdx b/product_docs/docs/pgd/5/consistency/conflicts.mdx new file mode 100644 index 00000000000..18c3d01259f --- /dev/null +++ b/product_docs/docs/pgd/5/consistency/conflicts.mdx @@ -0,0 +1,1237 @@ +--- +title: Conflicts +redirects: + - /pgd/latest/bdr/conflicts/ +--- + +EDB Postgres Distributed is an active/active or multi-master DBMS. If used +asynchronously, writes to the same or related rows from multiple different +nodes can result in data conflicts when using standard data types. + +Conflicts aren't errors. In most cases, they are events that PGD can detect +and resolve as they occur. Resolution depends on the +nature of the application and the meaning of the data, so it's important that +PGD provides the application a range of choices as to how to resolve conflicts. + +By default, conflicts are resolved at the row level. When changes from two +nodes conflict, either the local or remote tuple is picked and the other +is discarded. For example, the commit timestamps might be compared for the two conflicting +changes and the newer one kept. This approach ensures that all nodes converge to the +same result and establishes commit-order-like semantics on the whole cluster. + +Conflict handling is configurable, as described in [Conflict resolution](#conflict-resolution). 
+Conflicts can be detected and handled differently for each table using +conflict triggers, described in [Stream triggers](../striggers). + +Column-level conflict detection and resolution is available with BDR, +described in [CLCD](column-level-conflicts). + +By default, all conflicts are logged to `bdr.conflict_history`. If conflicts +are possible, then table owners must monitor for them and analyze how to +avoid them or make plans to handle them regularly as an application task. +The [LiveCompare](/livecompare/latest) tool is also available to scan regularly for divergence. + +Some clustering systems use distributed lock mechanisms to prevent +concurrent access to data. These can perform reasonably when servers are +very close to each other but can't support geographically distributed applications where +very low latency is critical for acceptable performance. + +Distributed locking is essentially a pessimistic approach. PGD +advocates an optimistic approach, which is to avoid conflicts where possible but allow +some types of conflicts to occur and resolve them when they arise. + +## How conflicts happen + +Inter-node conflicts arise as a result of sequences of events that can't +happen if all the involved transactions happen concurrently on the same +node. Because the nodes exchange changes only after the transactions commit, +each transaction is individually valid on the node it committed on. It isn't +valid if applied on another node that did other conflicting work +at the same time. + +Since BDR replication essentially replays the transaction on the other nodes, +the replay operation can fail if there's a conflict between a transaction +being applied and a transaction that was committed on the receiving node. + +Most conflicts can't happen when all transactions run on a single +node because Postgres has inter-transaction communication mechanisms +to prevent it such as `UNIQUE` indexes, `SEQUENCE` operations, row and relation locking, and +`SERIALIZABLE` dependency tracking. All of these mechanisms are ways +to communicate between ongoing transactions to prevent undesirable concurrency +issues. + +BDR doesn't have a distributed transaction manager or lock manager. +That's part of why it performs well with latency and network partitions. As +a result, transactions on different nodes execute entirely independently +from each other when using the default, lazy replication. Less independence +between nodes can avoid conflicts altogether, which is why BDR also offers +Eager Replication for when this is important. + +## Types of conflict + +### PRIMARY KEY or UNIQUE conflicts + +The most common conflicts are row conflicts, where two operations affect a +row with the same key in ways they can't on a single node. BDR can +detect most of those and applies the `update_if_newer` conflict resolver. + +Row conflicts include: + +- `INSERT` versus `INSERT` +- `UPDATE` versus `UPDATE` +- `UPDATE` versus `DELETE` +- `INSERT` versus `UPDATE` +- `INSERT` versus `DELETE` +- `DELETE` versus `DELETE` + +The view `bdr.node_conflict_resolvers` provides information on how +conflict resolution is currently configured for all known conflict types. + +#### INSERT/INSERT conflicts + +The most common conflict, `INSERT`/`INSERT`, arises where `INSERT` operations on two +different nodes create a tuple with the same `PRIMARY KEY` values (or if no +`PRIMARY KEY` exists, the same values for a single `UNIQUE` constraint). 
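+
+For example, a minimal sketch of how this conflict can arise, using a hypothetical
+table `fruit` with an integer primary key `id`:
+
+```sql
+-- Both nodes insert a row with the same primary key before either
+-- change has replicated to the other, producing an insert_exists
+-- conflict on replay.
+
+-- On node A:
+INSERT INTO fruit (id, name) VALUES (42, 'apple');
+
+-- On node B, concurrently:
+INSERT INTO fruit (id, name) VALUES (42, 'banana');
+```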
+ +BDR handles this situation by retaining the most recently inserted tuple of the two +according to the originating node's timestamps, unless this behavior is overridden by a +user-defined conflict handler. + +This conflict generates the `insert_exists` conflict type, which is by +default resolved by choosing the newer (based on commit time) row and keeping +only that one (`update_if_newer` resolver). You can configure other resolvers. +See [Conflict resolution](#conflict-resolution) for details. + +To resolve this conflict type, you can also use column-level conflict +resolution and user-defined conflict triggers. + +You can effectively eliminate this type of conflict by using +[global sequences](../sequences). + +#### INSERT operations that violate multiple UNIQUE constraints + +An `INSERT`/`INSERT` conflict can violate more than one `UNIQUE` constraint +(of which one might be the `PRIMARY KEY`). If a new row violates more than +one `UNIQUE` constraint and that results in a conflict against more than one +other row, then the apply of the replication change produces a +`multiple_unique_conflicts` conflict. + +In case of such a conflict, you must remove some rows for replication +to continue. Depending on the resolver setting for `multiple_unique_conflicts`, +the apply process either exits with error, skips the incoming row, or deletes +some of the rows. The deletion tries to +preserve the row with the correct `PRIMARY KEY` and delete the others. + +!!! Warning + In case of multiple rows conflicting this way, if the result of conflict + resolution is to proceed with the insert operation, some of the data + is always deleted. + +It's also possible to define a different behavior using a conflict trigger. + +#### UPDATE/UPDATE conflicts + +Where two concurrent `UPDATE` operations on different nodes change the same tuple +(but not its `PRIMARY KEY`), an `UPDATE`/`UPDATE` conflict can occur on replay. + +These can generate different conflict kinds based on the configuration and +situation. If the table is configured with [row version conflict detection](#row-version-conflict-detection), +then the original (key) row is compared with the local row. +If they're different, the `update_differing` conflict is generated. +When using [Origin conflict detection](#origin-conflict-detection), +the origin of the row is checked (the origin is the node that the current +local row came from). If that changed, the `update_origin_change` conflict +is generated. In all other cases, the `UPDATE` is normally applied without +generating a conflict. + +Both of these conflicts are resolved the same way as `insert_exists`, described in [INSERT/INSERT conflicts](#insertinsert-conflicts). + +#### UPDATE conflicts on the PRIMARY KEY + +BDR can't currently perform conflict resolution where the `PRIMARY KEY` +is changed by an `UPDATE` operation. You can update the primary +key, but you must ensure that no conflict with existing values is possible. + +Conflicts on the update of the primary key are [Divergent conflicts](#divergent-conflicts) and +require manual intervention. + +Updating a primary key is possible in Postgres, but there are +issues in both Postgres and BDR. 
+ +A simple schema provides an example that explains: + +```sql +CREATE TABLE pktest (pk integer primary key, val integer); +INSERT INTO pktest VALUES (1,1); +``` + +Updating the Primary Key column is possible, so this SQL succeeds: + +```sql +UPDATE pktest SET pk=2 WHERE pk=1; +``` + +However, suppose there are multiple rows in the table: + +```sql +INSERT INTO pktest VALUES (3,3); +``` + +Some UPDATEs succeed: + +```sql +UPDATE pktest SET pk=4 WHERE pk=3; + +SELECT * FROM pktest; + pk | val +----+----- + 2 | 1 + 4 | 3 +(2 rows) +``` + +Other UPDATEs fail with constraint errors: + +```sql +UPDATE pktest SET pk=4 WHERE pk=2; +ERROR: duplicate key value violates unique constraint "pktest_pkey" +DETAIL: Key (pk)=(4) already exists +``` + +So for Postgres applications that update primary keys, be +careful to avoid runtime errors, even without BDR. + +With BDR, the situation becomes more complex if UPDATEs are +allowed from multiple locations at same time. + +Executing these two changes concurrently works: + +```sql +node1: UPDATE pktest SET pk=pk+1 WHERE pk = 2; +node2: UPDATE pktest SET pk=pk+1 WHERE pk = 4; + +SELECT * FROM pktest; + pk | val +----+----- + 3 | 1 + 5 | 3 +(2 rows) +``` + +Executing these next two changes concurrently causes +a divergent error, since both changes are accepted. But applying +the changes on the other node results in +`update_missing` conflicts. + +```sql +node1: UPDATE pktest SET pk=1 WHERE pk = 3; +node2: UPDATE pktest SET pk=2 WHERE pk = 3; +``` + +This scenario leaves the data different on each node: + +```sql +node1: +SELECT * FROM pktest; + pk | val +----+----- + 1 | 1 + 5 | 3 +(2 rows) + +node2: +SELECT * FROM pktest; + pk | val +----+----- + 2 | 1 + 5 | 3 +(2 rows) +``` + +You can identify and resolve this situation using [LiveCompare](/livecompare/latest). + +Concurrent conflicts present problems. Executing these two changes +concurrently isn't easy to resolve: + +```sql +node1: UPDATE pktest SET pk=6, val=8 WHERE pk = 5; +node2: UPDATE pktest SET pk=6, val=9 WHERE pk = 5; +``` + +Both changes are applied locally, causing a divergence between +the nodes. But then apply on the target fails on both nodes with +a duplicate key-value violation error, which causes the replication +to halt and requires manual resolution. + +This duplicate key violation error can now be avoided, +and replication doesn't break if you set the conflict_type +`update_pkey_exists` to `skip`, `update`, or `update_if_newer`. This +can still lead to divergence depending on the nature of the update. + +You can avoid divergence in cases where the same +old key is being updated by the same new key concurrently by setting +`update_pkey_exists` to `update_if_newer`. However, in certain situations, +divergence occurs even with `update_if_newer`, namely when two different +rows both are updated concurrently to the same new primary key. + +As a result, we recommend strongly against allowing primary key UPDATE operations +in your applications, especially with BDR. If parts +of your application change primary keys, then to avoid concurrent +changes, make those changes using Eager Replication. + +!!! Warning + In case the conflict resolution of `update_pkey_exists` conflict results + in update, one of the rows is always deleted. 
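+
+As a sketch of the mitigation described above, you can set the resolver for this
+conflict type on a node with `bdr.alter_node_set_conflict_resolver`, which is
+described later on this page. The node name here is illustrative:
+
+```sql
+SELECT bdr.alter_node_set_conflict_resolver(
+    'node1',                 -- illustrative node name
+    'update_pkey_exists',    -- conflict type
+    'update_if_newer'        -- resolver; 'skip' and 'update' are also accepted
+);
+```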
+ +#### UPDATE operations that violate multiple UNIQUE constraints + +Like [INSERT operations that violate multiple UNIQUE constraints](#insert-operations-that-violate-multiple-unique-constraints), where an incoming +`UPDATE` violates more than one `UNIQUE` index (or the `PRIMARY KEY`), BDR +raises a `multiple_unique_conflicts` conflict. + +BDR supports deferred unique constraints. If a transaction can commit on the +source, then it applies cleanly on target, unless it sees conflicts. +However, a deferred primary key can't be used as a REPLICA IDENTITY, so +the use cases are already limited by that and the warning about using +multiple unique constraints. + +#### UPDATE/DELETE conflicts + +It's possible for one node to update a row that another node simultaneously +deletes. In this case an `UPDATE`/`DELETE` conflict can occur on replay. + +If the deleted row is still detectable (the deleted row wasn't removed by `VACUUM`), +the `update_recently_deleted` conflict is generated. By default the +`UPDATE` is skipped, but you can configure the resolution for this. +See [Conflict resolution](#conflict-resolution) for details. + +The deleted row can be cleaned up from the database by the time the `UPDATE` +is received in case the local node is lagging behind in replication. In this +case, BDR can't differentiate between `UPDATE`/`DELETE` +conflicts and [INSERT/UPDATE conflicts](#insertupdate-conflicts) and generates the +`update_missing` conflict. + +Another type of conflicting `DELETE` and `UPDATE` is a `DELETE` +that comes after the row was updated locally. In this situation, the +outcome depends on the type of conflict detection used. When using the +default, [origin conflict detection](#origin-conflict-detection), no conflict is detected at all, +leading to the `DELETE` being applied and the row removed. If you enable +[row version conflict detection](#row-version-conflict-detection), a `delete_recently_updated` conflict is +generated. The default resolution for this conflict type is to apply the +`DELETE` and remove the row, but you can configure this or this can be handled by +a conflict trigger. + +#### INSERT/UPDATE conflicts + +When using the default asynchronous mode of operation, a node might receive an +`UPDATE` of a row before the original `INSERT` was received. This can +happen only with three or more nodes being active (see [Conflicts with three or more nodes](#conflicts-with-three-or-more-nodes)). + +When this happens, the `update_missing` conflict is generated. The default +conflict resolver is `insert_or_skip`, though you can use `insert_or_error` or `skip` +instead. Resolvers that do insert-or-action first +try to `INSERT` a new row based on data +from the `UPDATE` when possible (when the whole row was received). For the +reconstruction of the row to be possible, the table either needs to have +`REPLICA IDENTITY FULL` or the row must not contain any toasted data. + +See [TOAST support details](#toast-support-details) for more info about toasted data. + +#### INSERT/DELETE conflicts + +Similar to the `INSERT`/`UPDATE` conflict, the node might also receive a +`DELETE` operation on a row for which it didn't yet receive an `INSERT`. This +is again possible only with three or more nodes set up (see [Conflicts with three or more nodes](#conflicts-with-three-or-more-nodes)). + +BDR can't currently detect this conflict type. The `INSERT` operation +doesn't generate any conflict type and the `INSERT` is applied. 
+
+The `DELETE` operation always generates a `delete_missing` conflict, which
+is by default resolved by skipping the operation.
+
+#### DELETE/DELETE conflicts
+
+A `DELETE`/`DELETE` conflict arises when two different nodes concurrently
+delete the same tuple.
+
+This always generates a `delete_missing` conflict, which is by default
+resolved by skipping the operation.
+
+This conflict is harmless since both `DELETE` operations have the same effect. One
+of them can be safely ignored.
+
+#### Conflicts with three or more nodes
+
+If one node inserts a row that is then replayed to a second node and updated
+there, a third node can receive the `UPDATE` from the second node before it
+receives the `INSERT` from the first node. This scenario is an `INSERT`/`UPDATE` conflict.
+
+These conflicts are handled by discarding the `UPDATE`. This can lead to
+different data on different nodes. These are [divergent conflicts](#divergent-conflicts).
+
+This conflict type can happen only with three or more masters, of which at
+least two must be actively writing.
+
+Also, the replication lag from node 1 to node 3 must be high enough to
+allow the following sequence of actions:
+
+1. node 2 receives INSERT from node 1
+2. node 2 performs UPDATE
+3. node 3 receives UPDATE from node 2
+4. node 3 receives INSERT from node 1
+
+Using `insert_or_error` (or in some cases the `insert_or_skip` conflict resolver
+for the `update_missing` conflict type) is a viable mitigation strategy for
+these conflicts. However, enabling this option opens the door for
+`INSERT`/`DELETE` conflicts:
+
+1. node 1 performs UPDATE
+2. node 2 performs DELETE
+3. node 3 receives DELETE from node 2
+4. node 3 receives UPDATE from node 1, turning it into an INSERT
+
+If these are problems, we recommend tuning freezing settings for a table
+or database so that they are correctly detected as `update_recently_deleted`.
+
+Another alternative is to use [Eager Replication](eager) to prevent these conflicts.
+
+`INSERT`/`DELETE` conflicts can also occur with three or more nodes.
+Such a conflict is identical to `INSERT`/`UPDATE` except with the
+`UPDATE` replaced by a `DELETE`. This can result in a `delete_missing`
+conflict.
+
+BDR could choose to turn each INSERT into a check for recently
+deleted rows, as occurs with an `update_missing` conflict. However, the
+cost of doing this penalizes the majority of users, so at this time
+it simply logs `delete_missing`.
+
+Later releases will automatically resolve `INSERT`/`DELETE` anomalies
+via rechecks using [LiveCompare](/livecompare/latest) when `delete_missing` conflicts occur.
+These can be performed manually by applications by checking
+the `bdr.conflict_history_summary` view.
+
+These conflicts can occur in two main problem use cases:
+
+- `INSERT` followed rapidly by a `DELETE`, as can be used in queuing applications
+- Any case where the primary key identifier of a table is reused
+
+Neither of these cases is common. We recommend not replicating
+the affected tables if these problem use cases occur.
+
+BDR has problems with the latter case because BDR relies on the
+uniqueness of identifiers to make replication work correctly.
+
+Applications that insert, delete, and
+then later reuse the same unique identifiers can cause difficulties.
+This is known as the [ABA problem](https://en.wikipedia.org/wiki/ABA_problem). BDR has no way of knowing whether
+the rows are the current row, the last row, or much older rows.
+
+Unique identifier reuse is also a business problem, since it
+prevents unique identification over time, which prevents auditing,
+traceability, and sensible data quality. Applications don't need
+to reuse unique identifiers.
+
+Any identifier reuse that occurs in the time interval it takes for
+changes to pass across the system causes difficulties. Although that
+time might be short in normal operation, down nodes can extend that
+interval to hours or days.
+
+We recommend that applications don't reuse unique identifiers, but if they
+do, take steps to avoid reuse within a period of less than a year.
+
+This problem doesn't occur in applications that use sequences or UUIDs.
+
+### Foreign key constraint conflicts
+
+Conflicts between a remote transaction being applied and existing local data
+can also occur for `FOREIGN KEY` (FK) constraints.
+
+BDR applies changes with `session_replication_role = 'replica'`, so foreign
+keys aren't rechecked when applying changes.
+In an active/active environment, this can result in FK violations if deletes
+occur on the referenced table at the same time as inserts into the referencing
+table. This is similar to an `INSERT`/`DELETE` conflict.
+
+In single-master Postgres, any `INSERT`/`UPDATE` that refers to a value in the
+referenced table must wait for `DELETE` operations to finish before they can gain
+a row-level lock. If a `DELETE` removes a referenced value, then the `INSERT`/`UPDATE` fails the FK check.
+
+In multi-master BDR, there are no inter-node row-level locks. An `INSERT` on
+the referencing table doesn't wait behind a `DELETE` on the referenced table,
+so both actions can occur concurrently. Thus an `INSERT`/`UPDATE` on one node
+on the referencing table can use a value at the same time as a `DELETE`
+on the referenced table on another node. This then results in a value
+in the referencing table that's no longer present in the referenced
+table.
+
+In practice, this occurs if `DELETE` operations occur on referenced tables
+in separate transactions from `DELETE` operations on referencing tables. This isn't
+a common operation.
+
+In a parent-child relationship such as Orders -> OrderItems, it isn't typical to
+do this. It's more likely to mark an OrderItem as canceled than to remove it
+completely. For reference/lookup data, it's unusual to completely
+remove entries at the same time as using those same values for new fact data.
+
+While there's a possibility of dangling FKs, the risk of this in general
+is very low and so BDR doesn't impose a generic solution to cover this case.
+Once you understand the situation in which this occurs, two solutions are
+possible.
+
+The first solution is to restrict the use of FKs to closely
+related entities that are generally modified from only one node at a time, are
+infrequently modified, or where the modification's concurrency is
+application-mediated. This avoids any FK violations at the application
+level.
+
+The second solution is to add triggers to protect against this case using
+the BDR-provided functions `bdr.ri_fkey_trigger()` and
+`bdr.ri_fkey_on_del_trigger()`. When called as `BEFORE` triggers, these
+functions use `FOREIGN KEY` information to avoid FK anomalies by
+setting referencing columns to NULL, much as if you had a SET NULL constraint.
+This rechecks all FKs in one trigger, so you need to add only one
+trigger per table to prevent FK violations.
+
+As an example, suppose you have two tables: Fact and RefData. Fact has an FK that
+references RefData.
Fact is the referencing table and RefData is the referenced +table. You need to add one trigger to each table. + +Add a trigger that sets columns to NULL in Fact if the referenced row +in RefData was already deleted. + +```sql +CREATE TRIGGER bdr_replica_fk_iu_trg + BEFORE INSERT OR UPDATE ON fact + FOR EACH ROW + EXECUTE PROCEDURE bdr.ri_fkey_trigger(); + +ALTER TABLE fact + ENABLE REPLICA TRIGGER bdr_replica_fk_iu_trg; +``` + +Add a trigger that sets columns to NULL in Fact at the time a DELETE occurs +on the RefData table. + +```sql +CREATE TRIGGER bdr_replica_fk_d_trg + BEFORE DELETE ON refdata + FOR EACH ROW + EXECUTE PROCEDURE bdr.ri_fkey_on_del_trigger(); + +ALTER TABLE refdata + ENABLE REPLICA TRIGGER bdr_replica_fk_d_trg; +``` + +Adding both triggers avoids dangling foreign keys. + +### TRUNCATE conflicts + +`TRUNCATE` behaves similarly to a `DELETE` of all rows but performs this +action by physically removing the table data rather than row-by-row +deletion. As a result, row-level conflict handling isn't available, so +`TRUNCATE` commands don't generate conflicts with other DML actions, +even when there's a clear conflict. + +As a result, the ordering of replay can cause divergent changes if +another DML is executed concurrently on other nodes to the `TRUNCATE`. + +You can take one of the following actions: + +- Ensure `TRUNCATE` isn't executed alongside other concurrent DML. + Rely on [LiveCompare](/livecompare/latest) to highlight any such inconsistency. + +- Replace `TRUNCATE` with a `DELETE` statement with no `WHERE` clause. + This approach is likely to have very poor performance on + larger tables. + +- Set `bdr.truncate_locking = 'on'` to set the `TRUNCATE` command’s + locking behavior. This setting determines whether `TRUNCATE` obeys the `bdr.ddl_locking` + setting. This + isn't the default behavior for `TRUNCATE` since it requires all nodes + to be up. This configuration might not be possible or wanted in all cases. + +### Exclusion constraint conflicts + +BDR doesn't support exclusion constraints and prevents their creation. + +If an existing standalone database is converted to a BDR database, then +drop all exclusion constraints manually. + +In a distributed asynchronous system, you can't ensure that no +set of rows that violate the constraint exists, because all transactions +on different nodes are fully isolated. Exclusion constraints lead to +replay deadlocks where replay can't progress from any node to any +other node because of exclusion constraint violations. + +If you force BDR to create an exclusion constraint, or you don't drop +existing ones when converting a standalone database to BDR, +expect replication to break. To get it to progress again, remove or alter the +local tuples that an incoming remote tuple conflicts with so that the remote +transaction can be applied. + +### Data conflicts for roles and tablespace differences + +Conflicts can also arise where nodes have global (Postgres-system-wide) +data, like roles, that differ. This can result in operations—mainly +`DDL`—that can run successfully and commit on one node but then +fail to apply to other nodes. + +For example, node1 might have a user named fred, and that user wasn't +created on node2. If fred on node1 creates a table, the table is +replicated with its owner set to fred. When the DDL command is applied to +node2, the DDL fails because there's no user named fred. This failure +emits an error in the Postgres logs. 
+ +Administrator intervention is required to resolve this conflict +by creating the user fred in the database where BDR is running. +You can set `bdr.role_replication = on` to resolve this in future. + +### Lock conflicts and deadlock aborts + +Because BDR writer processes operate much like normal user sessions, they're +subject to the usual rules around row and table locking. This can sometimes +lead to BDR writer processes waiting on locks held by user transactions or +even by each other. + +Relevant locking includes: + +- Explicit table-level locking (`LOCK TABLE ...`) by user sessions +- Explicit row-level locking (`SELECT ... FOR UPDATE/FOR SHARE`) by user sessions +- Implicit locking because of row `UPDATE`, `INSERT`, or `DELETE` operations, either + from local activity or from replication from other nodes + +A BDR writer process can deadlock with a user +transaction, where the user transaction is waiting on a lock held +by the writer process and vice versa. Two writer processes can also +deadlock with each other. Postgres's deadlock detector steps in and +terminates one of the problem transactions. If the BDR writer process is +terminated, it retries and generally succeeds. + +All these issues are transient and generally require no administrator +action. If a writer process is stuck for a long time behind a lock +on an idle user session, the administrator can terminate +the user session to get replication flowing again, but this is +no different from a user holding a long lock that impacts another +user session. + +Use of the [log_lock_waits](https://www.postgresql.org/docs/current/runtime-config-logging.html#GUC-LOG-LOCK-WAITS) +facility in Postgres can help identify locking related replay stalls. + +### Divergent conflicts + +Divergent conflicts arise when data that should be the same on different +nodes differs unexpectedly. Divergent conflicts should not occur, but not +all such conflicts can be reliably prevented at the time of writing. + +Changing the `PRIMARY KEY` of a row can lead to a divergent conflict if +another node changes the key of the same row before all nodes have replayed +the change. Avoid changing primary keys, or change them only on one designated +node. + +Divergent conflicts involving row data generally require administrator +action to manually adjust the data on one of the nodes to be consistent +with the other one. Such conflicts don't arise so long as you use BDR as documented +and avoid settings or functions marked as unsafe. + +The administrator must manually resolve such conflicts. You might need to use the +advanced options such as `bdr.ddl_replication` and `bdr.ddl_locking` depending on the +nature of the conflict. However, careless use of +these options can make things much worse and create a conflict that generic instructions can't address. + +### TOAST support details + +Postgres uses out-of-line storage for larger columns called +[TOAST](https://www.postgresql.org/docs/current/storage-toast.html). + +The TOAST values handling in logical decoding (which BDR is built on top of) +and logical replication is different from inline data stored as part of the +main row in the table. + +The TOAST value is logged into the transaction log (WAL) only if the value +has changed. This can cause problems, especially when handling UPDATE conflicts +because an `UPDATE` statement that didn't change a value of a toasted column +produces a row without that column. 
As mentioned in
+[INSERT/UPDATE conflicts](#insertupdate-conflicts), BDR reports an error if an `update_missing`
+conflict is resolved using `insert_or_error` and there are missing TOAST columns.
+
+However, there are more subtle issues than this one in case of concurrent
+workloads with asynchronous replication (Eager transactions aren't affected).
+Imagine, for example, the following workload on an EDB Postgres Distributed cluster with three nodes called
+A, B, and C:
+
+1. On node A: txn A1 does an UPDATE SET col1 = 'toast data...' and commits first.
+2. On node B: txn B1 does UPDATE SET other_column = 'anything else' and commits after A1.
+3. On node C: the connection to node A lags behind.
+4. On node C: txn B1 is applied first. It misses the TOASTed column in col1
+   but gets applied without conflict.
+5. On node C: txn A1 conflicts (on update_origin_change) and is skipped.
+6. Node C misses the toasted data from A1 forever.
+
+This scenario isn't usually a problem when using BDR. (It is when using
+either built-in logical replication or plain pglogical for multi-master.)
+BDR adds its own logging of TOAST columns when it detects a local UPDATE
+to a row that recently replicated a TOAST column modification and the local
+UPDATE isn't modifying the TOAST. Thus BDR prevents any inconsistency for
+toasted data across different nodes. This situation causes increased WAL logging
+when updates occur on multiple nodes (that is, when origin changes for a tuple).
+Additional WAL overhead is zero if all updates are made from a single node,
+as is normally the case with the BDR AlwaysOn architecture.
+
+!!! Note
+    Running `VACUUM FULL` or `CLUSTER` on just the TOAST table without
+    also doing the same on the main table removes metadata needed for the
+    extra logging to work. This means that, for a short period of time after
+    such a statement, the protection against these concurrency issues isn't
+    present.
+
+!!! Warning
+    The additional WAL logging of TOAST is done using the `BEFORE UPDATE`
+    trigger on standard Postgres. This trigger must be sorted alphabetically
+    last (based on trigger name) among all `BEFORE UPDATE` triggers on the
+    table. It's prefixed with `zzzz_bdr_` to make this easier, but make sure
+    you don't create any trigger with a name that sorts after it. Otherwise
+    you won't have the protection against the concurrency issues.
+
+For the `insert_or_error` conflict resolution, the use of
+`REPLICA IDENTITY FULL` is, however, still required.
+
+None of these problems associated with toasted columns affect tables with
+`REPLICA IDENTITY FULL`. This setting always logs a toasted value as
+part of the key since the whole row is considered to be part of the key. BDR
+can reconstruct the new row, filling the
+missing data from the key row. As a result, using
+`REPLICA IDENTITY FULL` can increase WAL size significantly.
+
+## Avoiding or tolerating conflicts
+
+In most cases, you can design the application to avoid or tolerate conflicts.
+
+Conflicts can happen only if things are happening at the same time on
+multiple nodes. The simplest way to avoid conflicts is to only ever write
+to one node or to only ever write to a specific row in a specific way from
+one specific node at a time.
+
+This happens naturally in many applications. For example, many
+consumer applications allow data to be changed only by the owning user, such as
+changing the default billing address on your account. Such data changes seldom
+have update conflicts.
+ +You might make a change just before a node goes down, so the +change seems to be lost. You might then make the same change again, +leading to two updates on different nodes. When the down node comes back up, +it tries to send the older change to other nodes, but it's rejected +because the last update of the data is kept. + +For `INSERT`/`INSERT` conflicts, use [global sequences](../sequences) +to prevent this type of conflict. + +For applications that assign relationships between objects, such as a room +booking application, applying `update_if_newer` might not give an acceptable +business outcome. That is, it isn't useful to confirm to two people separately +that they have booked the same room. The simplest resolution is to use Eager +Replication to ensure that only one booking succeeds. More complex ways +might be possible depending on the application. For example, you can assign 100 seats +to each node and allow those to be booked by a writer on that node. But if +none are available locally, use a distributed locking scheme or Eager +Replication once most seats are reserved. + +Another technique for ensuring certain types of updates occur only from one +specific node is to route different types of transactions through +different nodes. For example: + +- Receiving parcels on one node but delivering parcels using another node +- A service application where orders are input on one node, work is + prepared on a second node, and then served back to customers on another + +Frequently, the best course is to allow conflicts to occur and +design the application to work with BDR's conflict resolution +mechanisms to cope with the conflict. + +## Conflict detection + +BDR provides these mechanisms for conflict detection: + +- [Origin conflict detection](#origin-conflict-detection) (default) +- [Row version conflict detection](#row-version-conflict-detection) +- [Column-level conflict detection](column-level-conflicts) + +### Origin conflict detection + +Origin conflict detection uses and relies on commit timestamps as +recorded on the node the transaction originates from. This +requires clocks to be in sync to work correctly or to be within a +tolerance of the fastest message between two nodes. If this +isn't the case, conflict resolution tends to favor the node that's +further ahead. You can manage clock skew between nodes using the +parameters `bdr.maximum_clock_skew` and `bdr.maximum_clock_skew_action`. + +Row origins are available only if `track_commit_timestamp = on`. + +Conflicts are initially detected based on whether the replication +origin changed, so conflict triggers are called in +situations that might turn out not to be conflicts. Hence, this +mechanism isn't precise, since it can generate false-positive conflicts. + +Origin info is available only up to the point where a row is frozen. +Updates arriving for a row after it was frozen don't raise +a conflict so are applied in all cases. This is the normal case +when adding a new node by `bdr_init_physical`, so raising conflicts +causes many false-positive results in that case. + +When a node that was offline reconnects and +begins sending data changes, this can cause divergent +errors if the newly arrived updates are older than the +frozen rows that they update. Inserts and deletes aren't affected by this situation. + +We suggest that you don't leave down nodes for extended outages, +as discussed in [Node restart and down node recovery](../nodes). 
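+
+As a quick reference for the prerequisites mentioned above, this sketch shows the
+relevant settings in `postgresql.conf`. The values are placeholders only; check the
+PGD settings reference for the accepted units and actions of the clock-skew parameters:
+
+```sql
+# Illustrative values only
+track_commit_timestamp = on             # required for origin conflict detection
+bdr.maximum_clock_skew = '10s'          # placeholder: tolerated skew between nodes
+bdr.maximum_clock_skew_action = 'WARN'  # placeholder: action when skew is exceeded
+```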
+ +On EDB Postgres Extended Server and EDB Postgres Advanced Server, BDR +holds back the freezing of rows while a node is down. This mechanism handles +this situation gracefully so you don't need to change parameter settings. + +On other variants of Postgres, you might need to manage this situation with +some care. + +Freezing normally occurs when a row being vacuumed is older than +`vacuum_freeze_min_age` xids from the current xid, which means that you +need to configure suitably high values for these parameters: + +- `vacuum_freeze_min_age` +- `vacuum_freeze_table_age` +- `autovacuum_freeze_max_age` + +Choose values based on the transaction rate, giving +a grace period of downtime before removing any conflict data +from the database node. For example, when `vacuum_freeze_min_age` is set to 500 million, a node performing +1000 TPS can be down for just over 5.5 days before conflict +data is removed. +The CommitTS data structure takes on-disk space of 5 GB with +that setting, so lower transaction rate systems can benefit from +lower settings. + +Initially recommended settings are: + +```sql +# 1 billion = 10GB +autovacuum_freeze_max_age = 1000000000 + +vacuum_freeze_min_age = 500000000 + +# 90% of autovacuum_freeze_max_age +vacuum_freeze_table_age = 900000000 +``` + +Note that: + +- You can set `autovacuum_freeze_max_age` only at node start. +- You can set `vacuum_freeze_min_age`, so using a + low value freezes rows early and can result in conflicts being + ignored. You can also set `autovacuum_freeze_min_age` and `toast.autovacuum_freeze_min_age` + for individual tables. +- Running the CLUSTER or VACUUM FREEZE commands also + freezes rows early and can result in conflicts being ignored. + +### Row version conflict detection + +Alternatively, BDR provides the option to use row versioning and make +conflict detection independent of the nodes' system clock. + +Row version conflict detection requires that you enable three things. If any of these +steps aren't performed correctly then [origin conflict detection](#origin-conflict-detection) is used. + +1. `check_full_tuple` must be enabled for the BDR node group. + +2. `REPLICA IDENTITY FULL` must be enabled on all tables that use + row version conflict detection. + +3. Row Version Tracking must be enabled on the table by using + `bdr.alter_table_conflict_detection`. This function adds a column + (with a name you specify) and an `UPDATE` trigger that manages the new + column value. The column is created as `INTEGER` type. + +Although the counter is incremented only on `UPDATE`, this technique allows +conflict detection for both `UPDATE` and `DELETE`. + +This approach resembles Lamport timestamps and fully prevents +the ABA problem for conflict detection. + +!!! Note + The row-level conflict resolution is still handled based on the + [conflict resolution](#conflict-resolution) configuration even with row versioning. The way + the row version is generated is useful only for detecting conflicts. + Don't rely on it as authoritative information about which + version of row is newer. + +To determine the current conflict resolution strategy used for a specific +table, refer to the column `conflict_detection` of the view `bdr.tables`. + +### bdr.alter_table_conflict_detection + +Allows the table owner to change how conflict detection works for a given table. 
+ +#### Synopsis + +```sql +bdr.alter_table_conflict_detection(relation regclass, + method text, + column_name name DEFAULT NULL) +``` + +#### Parameters + +- `relation` — Name of the relation for which to set the new conflict detection method. +- `method` — The conflict detection method to use. +- `column_name` — The column to use for storing the column detection data. + This can be skipped, in which case the column name is chosen based + on the conflict detection method. The `row_origin` method doesn't require an + extra column for metadata storage. + +The recognized methods for conflict detection are: + +- `row_origin` — Origin of the previous change made on the tuple (see + [Origin conflict detection](#origin-conflict-detection)). This is the only method supported that + doesn't require an extra column in the table. +- `row_version` — Row version column (see [Row version conflict detection](#row-version-conflict-detection)). +- `column_commit_timestamp` — Per-column commit timestamps (described in + [CLCD](column-level-conflicts)). +- `column_modify_timestamp` — Per-column modification timestamp (described in + [CLCD](column-level-conflicts)). + +#### Notes + +For more information about the difference between `column_commit_timestamp` +and `column_modify_timestamp` conflict detection methods, see +[Current versus commit timestamp](column-level-conflicts#current-versus-commit-timestamp). + +This function uses the same replication mechanism as `DDL` statements. This +means the replication is affected by the [ddl filters](../repsets#ddl-replication-filtering) +configuration. + +The function takes a `DML` global lock on the relation for which +column-level conflict resolution is being enabled. + +This function is transactional. You can roll back the effects back with the +`ROLLBACK` of the transaction, and the changes are visible to the current +transaction. + +The `bdr.alter_table_conflict_detection` function can be executed only by +the owner of the `relation`, unless `bdr.backwards_compatibility` is +set to 30618 or below. + +!!! Warning + When changing the conflict detection method from one that + uses an extra column to store metadata, that column is dropped. + +!!! Warning + This function disables CAMO (together with a warning, as + long as these aren't disabled with `bdr.camo_enable_client_warnings`). + +### List of conflict types + +BDR recognizes the following conflict types, which can be used as the +`conflict_type` parameter: + +- `insert_exists` — An incoming insert conflicts with an existing row via a + primary key or a unique key/index. +- `update_differing` — An incoming update's key row differs from a local + row. This can happen only when using [row version conflict detection](#row-version-conflict-detection). +- `update_origin_change` — An incoming update is modifying a row that was + last changed by a different node. +- `update_missing` — An incoming update is trying to modify a row that doesn't + exist. +- `update_recently_deleted` — An incoming update is trying to modify a row + that was recently deleted. +- `update_pkey_exists` — An incoming update has modified the `PRIMARY KEY` to + a value that already exists on the node that's applying the change. +- `multiple_unique_conflicts` — The incoming row conflicts with multiple + UNIQUE constraints/indexes in the target table. +- `delete_recently_updated` — An incoming delete with an older commit timestamp + than the most recent update of the row on the current node, or when + using [Row version conflict detection]. 
+- `delete_missing` — An incoming delete is trying to remove a row that doesn't + exist. +- `target_column_missing` — The target table is missing one or more columns + present in the incoming row. +- `source_column_missing` — The incoming row is missing one or more columns + that are present in the target table. +- `target_table_missing` — The target table is missing. +- `apply_error_ddl` — An error was thrown by Postgres when applying a + replicated DDL command. + +## Conflict resolution + +Most conflicts can be resolved automatically. BDR defaults to a +last-update-wins mechanism or, more accurately, the `update_if_newer` +conflict resolver. This mechanism retains the most recently +inserted or changed row of the two conflicting ones based on the same +commit timestamps used for conflict detection. The behavior in certain corner-case +scenarios depends on the settings used for bdr.create_node_group and +alternatively for bdr.alter_node_group. + +BDR lets you override the default behavior of conflict resolution by using the +following function: + +### bdr.alter_node_set_conflict_resolver + +This function sets the behavior of conflict resolution on a given node. + +#### Synopsis + +```sql +bdr.alter_node_set_conflict_resolver(node_name text, + conflict_type text, + conflict_resolver text) +``` + +#### Parameters + +- `node_name` — Name of the node that's being changed. +- `conflict_type` — Conflict type for which to apply the setting + (see [List of conflict types](#list-of-conflict-types)). +- `conflict_resolver` — Resolver to use for the given conflict type + (see [List of conflict resolvers](#list-of-conflict-resolvers)). + +#### Notes + +Currently you can change only the local node. The function call isn't +replicated. If you want to change settings on multiple nodes, you must run the function +on each of them. + +The configuration change made by this function overrides any +default behavior of conflict resolutions specified by bdr.create_node_group +or `bdr.alter_node_group`. + +This function is transactional. You can roll back the changes, and +they are visible to the current transaction. + +### List of conflict resolvers + +Several conflict resolvers are available in BDR, with differing coverages +of the conflict types they can handle: + +- `error` — Throws error and stops replication. + Can be used for any conflict type. +- `skip` — Skips processing the remote change and continues replication + with the next change. + Can be used for `insert_exists`, `update_differing`, `update_origin_change`, + `update_missing`, `update_recently_deleted`, `update_pkey_exists`, + `delete_recently_updated`, `delete_missing`, `target_table_missing`, + `target_column_missing`, and `source_column_missing` conflict types. +- `skip_if_recently_dropped` — Skip the remote change if it's for a table that + doesn't exist downstream because it was recently (within + one day) dropped on the downstream; throw an error otherwise. Can be used for + the `target_table_missing` conflict type. `skip_if_recently_dropped` conflict + resolver can pose challenges if a table with the same name is re-created shortly + after it's dropped. In that case, one of the nodes might see the DMLs on the + re-created table before it sees the DDL to re-create the table. It then + incorrectly skips the remote data, assuming that the table is recently dropped, + and causes data loss. We hence recommend that you don't reuse the object namesq + immediately after they are dropped along with this conflict resolver. 
+- `skip_transaction` — Skips the whole transaction that generated the
+  conflict. Can be used for the `apply_error_ddl` conflict type.
+- `update_if_newer` — Update if the remote row was committed later (as
+  determined by the wall clock of the originating node) than the conflicting
+  local row. If the timestamps are the same, the node id is used as a tie-breaker
+  to ensure that the same row is picked on all nodes (the higher node id wins).
+  Can be used for `insert_exists`, `update_differing`, `update_origin_change`,
+  and `update_pkey_exists` conflict types.
+- `update` — Always perform the replicated action.
+  Can be used for `insert_exists` (turns the `INSERT` into `UPDATE`),
+  `update_differing`, `update_origin_change`, `update_pkey_exists`,
+  and `delete_recently_updated` (performs the delete).
+- `insert_or_skip` — Try to build a new row from the available information sent
+  by the origin and INSERT it. If there isn't enough information available to
+  build a full row, skip the change.
+  Can be used for `update_missing` and `update_recently_deleted` conflict types.
+- `insert_or_error` — Try to build a new row from the available information sent
+  by the origin and insert it. If there isn't enough information available to
+  build a full row, throw an error and stop the replication.
+  Can be used for `update_missing` and `update_recently_deleted` conflict types.
+- `ignore` — Ignore any missing target column and continue processing.
+  Can be used for the `target_column_missing` conflict type.
+- `ignore_if_null` — Ignore a missing target column if the extra column in the
+  remote row contains a NULL value. Otherwise, throw an error and stop replication.
+  Can be used for the `target_column_missing` conflict type.
+- `use_default_value` — Fill the missing column value with the default (including
+  NULL if that's the column default) and continue processing. Any error while
+  processing the default or violation of constraints (for example, a NULL default
+  on a NOT NULL column) stops replication.
+  Can be used for the `source_column_missing` conflict type.
+
+The `insert_exists`, `update_differing`, `update_origin_change`,
+`update_missing`, `multiple_unique_conflicts`, `update_recently_deleted`,
+`update_pkey_exists`, `delete_recently_updated`, and `delete_missing` conflict
+types can also be resolved by user-defined logic using
+[Conflict triggers](../striggers).
+
+The following matrix shows which conflict types each conflict resolver
+can handle.
+ +| | insert_exists | update_differing | update_origin_change | update_missing | update_recently_deleted | update_pkey_exists | delete_recently_updated | delete_missing | target_column_missing | source_column_missing | target_table_missing | multiple_unique_conflicts | +| :----------------------- | ------------- | ---------------- | -------------------- | -------------- | ----------------------- | ------------------ | ----------------------- | -------------- | --------------------- | --------------------- | -------------------- | ------------------------- | +| error | X | X | X | X | X | X | X | X | X | X | X | X | +| skip | X | X | X | X | X | X | X | X | X | X | X | X | +| skip_if_recently_dropped | | | | | | | | | | | X | | +| update_if_newer | X | X | X | | | X | | | | | | | +| update | X | X | X | | | X | X | | | | | X | +| insert_or_skip | | | | X | X | | | | | | | | +| insert_or_error | | | | X | X | | | | | | | | +| ignore | | | | | | | | | X | | | | +| ignore_if_null | | | | | | | | | X | | | | +| use_default_value | | | | | | | | | | X | | | +| conflict_trigger | X | X | X | X | X | X | X | X | | | | X | + +### Default conflict resolvers + +| Conflict type | Resolver | +| ------------------------- | ------------------------ | +| insert_exists | update_if_newer | +| update_differing | update_if_newer | +| update_origin_change | update_if_newer | +| update_missing | insert_or_skip | +| update_recently_deleted | skip | +| update_pkey_exists | update_if_newer | +| multiple_unique_conflicts | error | +| delete_recently_updated | skip | +| delete_missing | skip | +| target_column_missing | ignore_if_null | +| source_column_missing | use_default_value | +| target_table_missing | skip_if_recently_dropped | +| apply_error_ddl | error | + +### List of conflict resolutions + +The conflict resolution represents the kind of resolution chosen by the +conflict resolver and corresponds to the specific action that was +taken to resolve the conflict. + +The following conflict resolutions are currently supported for the +`conflict_resolution` parameter: + +- `apply_remote` — The remote (incoming) row was applied. +- `skip` — Processing of the row was skipped (no change was made + locally). +- `merge` — A new row was created, merging information from remote and local row. +- `user` — User code (a conflict trigger) produced the row that was written + to the target table. + +## Conflict logging + +To ease the diagnosis and handling of multi-master conflicts, BDR, by default, logs every conflict +into the `bdr.conflict_history` table. +You can change this behavior with more granularity with the following functions. + +### bdr.alter_node_set_log_config + +Set the conflict logging configuration for a node. + +#### Synopsis + +```sql +bdr.alter_node_set_log_config(node_name text, + log_to_file bool DEFAULT true, + log_to_table bool DEFAULT true, + conflict_type text[] DEFAULT NULL, + conflict_resolution text[] DEFAULT NULL) +``` + +#### Parameters + +- `node_name` — Name of the node that's being changed. +- `log_to_file` — Whether to log to the node log file. +- `log_to_table` — Whether to log to the `bdr.conflict_history` table. +- `conflict_type` — Conflict types to log. NULL (the default) means all. +- `conflict_resolution` — Conflict resolutions to log. NULL + (the default) means all. + +#### Notes + +Only the local node can be changed. The function call isn't +replicated. If you want to change settings on multiple nodes, you must run the function +on each of them. + +This function is transactional. 
You can roll back the changes, and +they are visible to the current transaction. + +#### Listing conflict logging configurations + +The view `bdr.node_log_config` shows all the logging configurations. +It lists the name of the logging configuration, where it logs, and the +conflict type and resolution it logs. + +#### Logging conflicts to a table + +Conflicts are logged to a table if `log_to_table` is set to true. +The target table for conflict logging is `bdr.conflict_history`. + +This table is range partitioned on the column `local_time`. The table is +managed by Autopartition. By default, a new partition is created for every day, and +conflicts of the last one month are maintained. After that, the old partitions +are dropped automatically. Autopartition creates between 7 and 14 +partitions in advance. bdr_superuser can change these defaults. + +Since conflicts generated for all tables managed by BDR are logged to this +table, it's important to ensure that only legitimate users can read the +conflicted data. BDR does this by defining ROW LEVEL SECURITY policies on the +`bdr.conflict_history` table. Only owners of the tables are allowed to read conflicts +on the respective tables. If the underlying tables have RLS policies +defined, enabled, and enforced, then even owners can't read the conflicts. RLS +policies created with the FORCE option also apply to owners of the table. In that +case, some or all rows in the underlying table might not be readable even to the +owner. So BDR also enforces a stricter policy on the conflict log table. + +The default role `bdr_read_all_conflicts` can be granted to users who +need to see all conflict details logged to the `bdr.conflict_history` table +without also granting them `bdr_superuser` role. + +The default role `bdr_read_all_stats` has access to a catalog view called +`bdr.conflict_history_summary`, which doesn't contain user data, allowing +monitoring of any conflicts logged. + +### Conflict reporting + +Conflicts logged to tables can be summarized in reports. Reports allow +application owners to identify, understand, and resolve conflicts +and introduce application changes to prevent them. + +```sql +SELECT nspname, relname +, date_trunc('day', local_time) :: date AS date +, count(*) +FROM bdr.conflict_history +WHERE local_time > date_trunc('day', current_timestamp) +GROUP BY 1,2,3 +ORDER BY 1,2; + + nspname | relname | date | count +---------+---------+------------+------- + my_app | test | 2019-04-05 | 1 +(1 row) +``` + +## Data verification with LiveCompare + +LiveCompare is a utility program designed +to compare any two databases to verify that they are identical. + +LiveCompare is included as part of the BDR stack and can be +aimed at any pair of BDR nodes. By default, it compares all replicated tables and reports differences. +LiveCompare also works with non-BDR data sources such as Postgres +and Oracle. + +You can also use LiveCompare to continuously monitor incoming rows. +You can stop and start it without losing context information, +so you can run it at convenient times. + +LiveCompare allows concurrent checking of multiple tables. You can +configure it to allow checking of a few tables or just +a section of rows within a table. +Checks are performed by first comparing whole +row hashes. If different, LiveCompare then compares whole rows. +LiveCompare avoids overheads by comparing rows in useful-sized batches. + +If differences are found, they can be rechecked over a period, +allowing for the delays of eventual consistency. 
+ +Refer to the [LiveCompare](/livecompare/latest/) documentation for further details. diff --git a/product_docs/docs/pgd/5/consistency/crdt.mdx b/product_docs/docs/pgd/5/consistency/crdt.mdx new file mode 100644 index 00000000000..0d41f1bf369 --- /dev/null +++ b/product_docs/docs/pgd/5/consistency/crdt.mdx @@ -0,0 +1,678 @@ +--- +navTitle: CRDT data types +title: Conflict-free replicated data types +redirects: + - /pgd/latest/bdr/crdt/ +--- + +Conflict-free replicated data types (CRDT) support merging values +from concurrently modified rows instead of discarding one of the rows as traditional resolution does. + +Each CRDT type is implemented as a separate PostgreSQL data type with +an extra callback added to the `bdr.crdt_handlers` catalog. The merge +process happens inside the BDR writer on the apply side without any user +action needed. + +CRDTs require the table to have column-level conflict resolution enabled, +as documented in [CLCD](column-level-conflicts). + +The only action you need to take is to use a particular data type +in CREATE/ALTER TABLE rather than standard built-in data types such as +integer. For example, consider the following table with one regular integer +counter and a single row: + +``` +CREATE TABLE non_crdt_example ( + id integer PRIMARY KEY, + counter integer NOT NULL DEFAULT 0 +); + +INSERT INTO non_crdt_example (id) VALUES (1); +``` + +Suppose you issue the following SQL on two nodes at same time: + +``` +UPDATE non_crdt_example + SET counter = counter + 1 -- "reflexive" update + WHERE id = 1; +``` + +After both updates are applied, you can see the resulting values using this query: + +``` +SELECT * FROM non_crdt_example WHERE id = 1; + id | counter + -----+----------- + 1 | 1 +(1 row) +``` + +This code shows that you lost one of the increments due to the `update_if_newer` +conflict resolver. If you use the CRDT counter data type instead, +the result looks like this: + +``` +CREATE TABLE crdt_example ( + id integer PRIMARY KEY, + counter bdr.crdt_gcounter NOT NULL DEFAULT 0 +); + +ALTER TABLE crdt_example REPLICA IDENTITY FULL; + +SELECT bdr.alter_table_conflict_detection('crdt_example', + 'column_modify_timestamp', 'cts'); + +INSERT INTO crdt_example (id) VALUES (1); +``` + +Again issue the following SQL on two nodes at same time, +and then wait for the changes to be applied: + +``` +UPDATE crdt_example + SET counter = counter + 1 -- "reflexive" update + WHERE id = 1; + +SELECT id, counter FROM crdt_example WHERE id = 1; + id | counter + -----+----------- + 1 | 2 +(1 row) +``` + +This example shows that CRDTs correctly allow accumulator columns to work, even +in the face of asynchronous concurrent updates that otherwise conflict. + +The `crdt_gcounter` type is an example of state-based CRDT types that +work only with reflexive UPDATE SQL, such as `x = x + 1`, as the example shows. + +The `bdr.crdt_raw_value` configuration option determines whether queries +return the current value or the full internal state of the CRDT type. By +default, only the current numeric value is returned. When set to `true`, +queries return representation of the full state. You can use the special hash operator +(`#`) to request only the current numeric value without using the +special operator (the default behavior). If the full state is +dumped using `bdr.crdt_raw_value = on`, then the value can +reload only with `bdr.crdt_raw_value = on`. + +!!! 
Note + The `bdr.crdt_raw_value` applies formatting only of data returned + to clients, that is, simple column references in the select list. Any column + references in other parts of the query (such as `WHERE` clause or even + expressions in the select list) might still require use of the `#` operator. + +Another class of CRDT data types is referred to *delta CRDT* +types. These are a special subclass of operation-based CRDTs. + +With delta CRDTs, any update to a value is compared to the +previous value on the same node. Then a change is applied as a delta +on all other nodes. + +``` +CREATE TABLE crdt_delta_example ( + id integer PRIMARY KEY, + counter bdr.crdt_delta_counter NOT NULL DEFAULT 0 +); + +ALTER TABLE crdt_delta_example REPLICA IDENTITY FULL; + +SELECT bdr.alter_table_conflict_detection('crdt_delta_example', + 'column_modify_timestamp', 'cts'); + +INSERT INTO crdt_delta_example (id) VALUES (1); +``` + +Suppose you issue the following SQL on two nodes at same time: + +``` +UPDATE crdt_delta_example + SET counter = 2 -- notice NOT counter = counter + 2 + WHERE id = 1; +``` + +After both updates are applied, you can see the resulting values using this query: + +``` +SELECT id, counter FROM crdt_delta_example WHERE id = 1; + id | counter + -----+--------- + 1 | 4 +(1 row) +``` + +With a regular `integer` column, the result is `2`. But +when you update the row with a delta CRDT counter, you start with the OLD +row version, make a NEW row version, and send both to the remote node. +There, compare them with the version found there (e.g., +the LOCAL version). Standard CRDTs merge the NEW and the LOCAL version, +while delta CRDTs compare the OLD and NEW versions and apply the delta +to the LOCAL version. + +The CRDT types are installed as part of `bdr` into the `bdr` schema. +For convenience, the basic operators (`+`, `#` and `!`) and a number +of common aggregate functions (`min`, `max`, `sum`, and `avg`) are +created in `pg_catalog`. This makes them available without having to tweak +`search_path`. + +An important question is how query planning and optimization works with these +new data types. CRDT types are handled transparently. Both `ANALYZE` and +the optimizer work, so estimation and query planning works fine without +having to do anything else. + +## State-based and operation-based CRDTs + +Following the notation from [1], both operation-based +and state-based CRDTs are implemented. + +### Operation-based CRDT types (CmCRDT) + +The implementation of operation-based types is trivial because +the operation isn't transferred explicitly but computed from the old +and new row received from the remote node. + +Currently, these operation-based CRDTs are implemented: + +- `crdt_delta_counter` — `bigint` counter (increments/decrements) +- `crdt_delta_sum` — `numeric` sum (increments/decrements) + +These types leverage existing data types (for example, `crdt_delta_counter` is +a domain on a `bigint`) with a little bit of code to compute the delta. + +This approach is possible only for types for which the method for computing +the delta is known, but the result is simple and cheap (both in terms of +space and CPU) and has a couple of additional benefits. For example, it can +leverage operators/syntax for the underlying data type. + +The main disadvantage is that you can't reset this value reliably +in an asynchronous and concurrent environment. + +!!! 
Note + Implementing more complicated operation-based types by + creating custom data types is possible, storing the state and the last operation. + (Every change is decoded and transferred, so multiple + operations aren't needed). But at that point, the main benefits (simplicity, + reuse of existing data types) are lost without gaining any advantage compared to + state-based types (for example, still no capability to reset) except for the + space requirements. (A per-node state isn't needed.) + +### State-based CRDT types (CvCRDT) + +State-based types require a more complex internal state and so can't +use the regular data types directly the way operation-based types do. + +Currently, four state-based CRDTs are implemented: + +- `crdt_gcounter` — `bigint` counter (increment-only) +- `crdt_gsum` — `numeric` sum/counter (increment-only) +- `crdt_pncounter` — `bigint` counter (increments/decrements) +- `crdt_pnsum` — `numeric` sum/counter (increments/decrements) + +The internal state typically includes per-node information, increasing +the on-disk size but allowing added benefits. The need to implement +custom data types implies more code (in/out functions and operators). + +The advantage is the ability to reliably reset the values, a somewhat +self-healing nature in the presence of lost changes (which doesn't +happen in a cluster that operates properly), and the ability to receive changes +from other than source nodes. + +Consider, for example, that a value is modified on node A, and the change +gets replicated to B but not C due to network issue between A and C. +If B modifies the value and this change gets replicated to C, it +includes even the original change from A. With operation-based CRDTs, +node C doesn't receive the change until the A-C network connection +starts working again. + +The main disadvantages of CvCRDTs are higher costs in terms of +disk space and CPU usage. A bit of information for each node is needed, including nodes +that were already removed from the cluster. The complex nature of +the state (serialized into varlena types) means increased CPU use. + +## Disk-space requirements + +An important consideration is the overhead associated with CRDT types, +particularly the on-disk size. + +For operation-based types, this is trivial, because the types +are merely domains on top of other types and so have the same disk +space requirements no matter how many nodes are there. + +- `crdt_delta_counter` — Same as `bigint` (8 bytes) +- `crdt_delta_sum` — Same as `numeric` (variable, depending on precision + and scale) + +There's no dependency on the number of nodes because operation-based +CRDT types don't store any per-node information. + +For state-based types, the situation is more complicated. All the types +are variable-length (stored essentially as a `bytea` column) and consist +of a header and a certain amount of per-node information for each node +that modified the value. + +For the `bigint` variants, formulas computing approximate size are (`N` +denotes the number of nodes that modified this value): + +- `crdt_gcounter` — `32B (header) + N * 12B (per-node)` +- `crdt_pncounter` -—`48B (header) + N * 20B (per-node)` + +For the `numeric` variants, there's no exact formula because both the +header and per-node parts include `numeric` variable-length values. 
To +give you an idea of how many such values you need to keep: + +- `crdt_gsum` + - fixed: `20B (header) + N * 4B (per-node)` + - variable: `(2 + N)` `numeric` values +- `crdt_pnsum` + - fixed: `20B (header) + N * 4B (per-node)` + - variable: `(4 + 2 * N)` `numeric` values + +!!! Note + It doesn't matter how many nodes are in the cluster if the + values are never updated on multiple nodes. It also doesn't matter whether + the updates were concurrent (causing a conflict). + + In addition, it doesn't matter how many of those nodes were already + removed from the cluster. There's no way to compact the state yet. + +## CRDT types versus conflicts handling + +As tables can contain both CRDT and non-CRDT columns (most +columns are expected to be non-CRDT), you need to do both the regular +conflict resolution and CRDT merge. + +The conflict resolution happens first and is responsible for deciding +the tuple to keep (applytuple) and the one to discard. The merge +phase happens next, merging data for CRDT columns from the discarded +tuple into the applytuple. + +!!! Note + This handling makes CRDT types somewhat more expensive compared to plain + conflict resolution because the merge needs to happen every time. This is the case even + when the conflict resolution can use one of the fast-paths (such as those modified + in the current transaction). + +## CRDT types versus conflict reporting + +By default, detected conflicts are individually reported. Without +CRDT types, this makes sense because the conflict resolution +essentially throws away one half of the available information (local or +remote row, depending on configuration). This presents a data loss. + +CRDT types allow both parts of the information to be combined +without throwing anything away, eliminating the data loss issue. This makes +the conflict reporting unnecessary. + +For this reason, conflict reporting is skipped when the conflict can be +fully resolved by CRDT merge, that is, if each column meets at least one +of these two conditions: + +- The values in local and remote tuple are the same (NULL or equal). +- It uses a CRDT data type and so can be merged. + +!!! Note + This means that the conflict reporting is also skipped when there are no + CRDT columns but all values in local/remote tuples are equal. + +## Resetting CRDT values + +Resetting CRDT values is possible but requires special handling. +The asynchronous nature of the +cluster means that different nodes might see the reset operation at different +places in the change stream no matter how it's implemented. +Different nodes might also initiate a reset concurrently, that is, before +observing the reset from the other node. + +In other words, to make the reset operation behave correctly, it needs to +be commutative with respect to the regular operations. Many naive ways +to reset a value that might work well on a single-node fail +for this reason. + +For example, the simplest approach to resetting a value might be: + +``` +UPDATE crdt_table SET cnt = 0 WHERE id = 1; +``` + +With state-based CRDTs this doesn't work. It throws away the state for the +other nodes but only locally. It's added back by merge functions +on remote nodes, causing diverging values and eventually receiving it +back due to changes on the other nodes. + +With operation-based CRDTs, this might seem to work because the +update is interpreted as a subtraction of `-cnt`. But it works only in the +absence of concurrent resets. 
Once two nodes attempt to do a reset at +the same time, the delta is applied twice, getting a negative +value (which isn't expected from a reset). + +It might also seem that you can use `DELETE + INSERT` as a reset, but this approach +has a couple of weaknesses, too. If the row is reinserted with the same +key, it's not guaranteed that all nodes see it at the same position in +the stream of operations with respect to changes from other nodes. +BDR specifically discourages reusing the same primary key value since +it can lead to data anomalies in concurrent cases. + +State-based CRDT types can reliably handle resets +using a special `!` operator like this: + +``` +UPDATE tab SET counter = !counter WHERE ...; +``` + +"Reliably" means the values don't have the two issues of multiple concurrent resets and divergence. + +Operation-based CRDT types can be reset reliably only using +[Eager Replication](eager), since this avoids multiple concurrent resets. +You can also use Eager Replication to set either kind of CRDT to a specific +value. + +## Implemented CRDT data types + +Currently, there are six CRDT data types implemented: + +- Grow-only counter and sum +- Positive-negative counter and sum +- Delta counter and sum + +The counters and sums behave mostly the same, except that the counter types +are integer-based (`bigint`), while the sum types are decimal-based +(`numeric`). + +Additional CRDT types, described at [1], might be implemented later. + +You can list the currently implemented CRDT data types with the +following query: + +```sql +SELECT n.nspname, t.typname +FROM bdr.crdt_handlers c +JOIN (pg_type t JOIN pg_namespace n ON t.typnamespace = n.oid) + ON t.oid = c.crdt_type_id; +``` + +### grow-only counter (`crdt_gcounter`) + +- Supports only increments with nonnegative values (`value + int` + and `counter + bigint` operators). + +- You can obtain the current value of the counter either using `#` operator + or by casting it to `bigint`. + +- Isn't compatible with simple assignments like `counter = value` + (which is common pattern when the new value is computed somewhere in + the application). + +- Allows simple reset of the counter using the `!` operator + ( `counter = !counter` ). + +- You can inspect the internal state using `crdt_gcounter_to_text`. + +``` +CREATE TABLE crdt_test ( + id INT PRIMARY KEY, + cnt bdr.crdt_gcounter NOT NULL DEFAULT 0 +); + +INSERT INTO crdt_test VALUES (1, 0); -- initialized to 0 +INSERT INTO crdt_test VALUES (2, 129824); -- initialized to 129824 +INSERT INTO crdt_test VALUES (3, -4531); -- error: negative value + +-- enable CLCD on the table +ALTER TABLE crdt_test REPLICA IDENTITY FULL; +SELECT bdr.alter_table_conflict_detection('crdt_test', 'column_modify_timestamp', 'cts'); + +-- increment counters +UPDATE crdt_test SET cnt = cnt + 1 WHERE id = 1; +UPDATE crdt_test SET cnt = cnt + 120 WHERE id = 2; + +-- error: minus operator not defined +UPDATE crdt_test SET cnt = cnt - 1 WHERE id = 1; + +-- error: increment has to be non-negative +UPDATE crdt_test SET cnt = cnt + (-1) WHERE id = 1; + +-- reset counter +UPDATE crdt_test SET cnt = !cnt WHERE id = 1; + +-- get current counter value +SELECT id, cnt::bigint, cnt FROM crdt_test; + +-- show internal structure of counters +SELECT id, bdr.crdt_gcounter_to_text(cnt) FROM crdt_test; +``` + +### grow-only sum (`crdt_gsum`) + +- Supports only increments with nonnegative values (`sum + numeric`). + +- You can obtain the current value of the sum either by using the `#` operator + or by casting it to `numeric`. 
+ +- Isn't compatible with simple assignments like `sum = value` + (which is the common pattern when the new value is computed somewhere in + the application). + +- Allows simple reset of the sum using the `!` operator (`sum = !sum`). + +- Can inspect internal state using `crdt_gsum_to_text`. + +``` +CREATE TABLE crdt_test ( + id INT PRIMARY KEY, + gsum bdr.crdt_gsum NOT NULL DEFAULT 0.0 +); + +INSERT INTO crdt_test VALUES (1, 0.0); -- initialized to 0 +INSERT INTO crdt_test VALUES (2, 1298.24); -- initialized to 1298.24 +INSERT INTO crdt_test VALUES (3, -45.31); -- error: negative value + +-- enable CLCD on the table +ALTER TABLE crdt_test REPLICA IDENTITY FULL; +SELECT bdr.alter_table_conflict_detection('crdt_test', 'column_modify_timestamp', 'cts'); + +-- increment sum +UPDATE crdt_test SET gsum = gsum + 11.5 WHERE id = 1; +UPDATE crdt_test SET gsum = gsum + 120.33 WHERE id = 2; + +-- error: minus operator not defined +UPDATE crdt_test SET gsum = gsum - 15.2 WHERE id = 1; + +-- error: increment has to be non-negative +UPDATE crdt_test SET gsum = gsum + (-1.56) WHERE id = 1; + +-- reset sum +UPDATE crdt_test SET gsum = !gsum WHERE id = 1; + +-- get current sum value +SELECT id, gsum::numeric, gsum FROM crdt_test; + +-- show internal structure of sums +SELECT id, bdr.crdt_gsum_to_text(gsum) FROM crdt_test; +``` + +### positive-negative counter (`crdt_pncounter`) + +- Supports increments with both positive and negative values (through + `counter + int` and `counter + bigint` operators). + +- You can obtain the current value of the counter either by using the `#` operator + or by casting to `bigint`. + +- Isn't compatible with simple assignments like `counter = value` + (which is the common pattern when the new value is computed somewhere in + the application). + +- Allows simple reset of the counter using the `!` operator + (`counter = !counter`). + +- You can inspect the internal state using `crdt_pncounter_to_text`. + +``` +CREATE TABLE crdt_test ( + id INT PRIMARY KEY, + cnt bdr.crdt_pncounter NOT NULL DEFAULT 0 +); + +INSERT INTO crdt_test VALUES (1, 0); -- initialized to 0 +INSERT INTO crdt_test VALUES (2, 129824); -- initialized to 129824 +INSERT INTO crdt_test VALUES (3, -4531); -- initialized to -4531 + +-- enable CLCD on the table +ALTER TABLE crdt_test REPLICA IDENTITY FULL; +SELECT bdr.alter_table_conflict_detection('crdt_test', 'column_modify_timestamp', 'cts'); + +-- increment counters +UPDATE crdt_test SET cnt = cnt + 1 WHERE id = 1; +UPDATE crdt_test SET cnt = cnt + 120 WHERE id = 2; +UPDATE crdt_test SET cnt = cnt + (-244) WHERE id = 3; + +-- decrement counters +UPDATE crdt_test SET cnt = cnt - 73 WHERE id = 1; +UPDATE crdt_test SET cnt = cnt - 19283 WHERE id = 2; +UPDATE crdt_test SET cnt = cnt - (-12) WHERE id = 3; + +-- get current counter value +SELECT id, cnt::bigint, cnt FROM crdt_test; + +-- show internal structure of counters +SELECT id, bdr.crdt_pncounter_to_text(cnt) FROM crdt_test; + +-- reset counter +UPDATE crdt_test SET cnt = !cnt WHERE id = 1; + +-- get current counter value after the reset +SELECT id, cnt::bigint, cnt FROM crdt_test; +``` + +### positive-negative sum (`crdt_pnsum`) + +- Supports increments with both positive and negative values (through + `sum + numeric`). + +- You can obtain the current value of the sum either by using then `#` operator + or by casting to `numeric`. + +- Isn't compatible with simple assignments like `sum = value` + (which is the common pattern when the new value is computed somewhere in + the application). 
+ +- Allows simple reset of the sum using the `!` operator (`sum = !sum`). + +- You can inspect the internal state using `crdt_pnsum_to_text`. + +``` +CREATE TABLE crdt_test ( + id INT PRIMARY KEY, + pnsum bdr.crdt_pnsum NOT NULL DEFAULT 0 +); + +INSERT INTO crdt_test VALUES (1, 0); -- initialized to 0 +INSERT INTO crdt_test VALUES (2, 1298.24); -- initialized to 1298.24 +INSERT INTO crdt_test VALUES (3, -45.31); -- initialized to -45.31 + +-- enable CLCD on the table +ALTER TABLE crdt_test REPLICA IDENTITY FULL; +SELECT bdr.alter_table_conflict_detection('crdt_test', 'column_modify_timestamp', 'cts'); + +-- increment sums +UPDATE crdt_test SET pnsum = pnsum + 1.44 WHERE id = 1; +UPDATE crdt_test SET pnsum = pnsum + 12.20 WHERE id = 2; +UPDATE crdt_test SET pnsum = pnsum + (-24.34) WHERE id = 3; + +-- decrement sums +UPDATE crdt_test SET pnsum = pnsum - 7.3 WHERE id = 1; +UPDATE crdt_test SET pnsum = pnsum - 192.83 WHERE id = 2; +UPDATE crdt_test SET pnsum = pnsum - (-12.22) WHERE id = 3; + +-- get current sum value +SELECT id, pnsum::numeric, pnsum FROM crdt_test; + +-- show internal structure of sum +SELECT id, bdr.crdt_pnsum_to_text(pnsum) FROM crdt_test; + +-- reset sum +UPDATE crdt_test SET pnsum = !pnsum WHERE id = 1; + +-- get current sum value after the reset +SELECT id, pnsum::numeric, pnsum FROM crdt_test; +``` + +### delta counter (`crdt_delta_counter`) + +- Is defined a `bigint` domain, so works exactly like a `bigint` column. + +- Supports increments with both positive and negative values. + +- Is compatible with simple assignments like `counter = value` + (common when the new value is computed somewhere in the application). + +- There's no simple way to reset the value reliably. + +``` +CREATE TABLE crdt_test ( + id INT PRIMARY KEY, + cnt bdr.crdt_delta_counter NOT NULL DEFAULT 0 +); + +INSERT INTO crdt_test VALUES (1, 0); -- initialized to 0 +INSERT INTO crdt_test VALUES (2, 129824); -- initialized to 129824 +INSERT INTO crdt_test VALUES (3, -4531); -- initialized to -4531 + +-- enable CLCD on the table +ALTER TABLE crdt_test REPLICA IDENTITY FULL; +SELECT bdr.alter_table_conflict_detection('crdt_test', 'column_modify_timestamp', 'cts'); + +-- increment counters +UPDATE crdt_test SET cnt = cnt + 1 WHERE id = 1; +UPDATE crdt_test SET cnt = cnt + 120 WHERE id = 2; +UPDATE crdt_test SET cnt = cnt + (-244) WHERE id = 3; + +-- decrement counters +UPDATE crdt_test SET cnt = cnt - 73 WHERE id = 1; +UPDATE crdt_test SET cnt = cnt - 19283 WHERE id = 2; +UPDATE crdt_test SET cnt = cnt - (-12) WHERE id = 3; + +-- get current counter value +SELECT id, cnt FROM crdt_test; +``` + +### delta sum (`crdt_delta_sum`) + +- Is defined as a `numeric` domain so works exactly like a `numeric` column. + +- Supports increments with both positive and negative values. + +- Is compatible with simple assignments like `sum = value` + (common when the new value is computed somewhere in the application). + +- There's no simple way to reset the value reliably. 
+
+```
+CREATE TABLE crdt_test (
+    id INT PRIMARY KEY,
+    dsum bdr.crdt_delta_sum NOT NULL DEFAULT 0
+);
+
+INSERT INTO crdt_test VALUES (1, 0); -- initialized to 0
+INSERT INTO crdt_test VALUES (2, 129.824); -- initialized to 129.824
+INSERT INTO crdt_test VALUES (3, -4.531); -- initialized to -4.531
+
+-- enable CLCD on the table
+ALTER TABLE crdt_test REPLICA IDENTITY FULL;
+SELECT bdr.alter_table_conflict_detection('crdt_test', 'column_modify_timestamp', 'cts');
+
+-- increment sums
+UPDATE crdt_test SET dsum = dsum + 1.32 WHERE id = 1;
+UPDATE crdt_test SET dsum = dsum + 12.01 WHERE id = 2;
+UPDATE crdt_test SET dsum = dsum + (-2.4) WHERE id = 3;
+
+-- decrement sums
+UPDATE crdt_test SET dsum = dsum - 7.33 WHERE id = 1;
+UPDATE crdt_test SET dsum = dsum - 19.83 WHERE id = 2;
+UPDATE crdt_test SET dsum = dsum - (-1.2) WHERE id = 3;
+
+-- get current sum value
+SELECT id, dsum FROM crdt_test;
+```
+
+[1] 
diff --git a/product_docs/docs/pgd/5/consistency/eager.mdx b/product_docs/docs/pgd/5/consistency/eager.mdx
new file mode 100644
index 00000000000..dfd7169cded
--- /dev/null
+++ b/product_docs/docs/pgd/5/consistency/eager.mdx
@@ -0,0 +1,92 @@
+---
+title: Eager conflict resolution
+redirects:
+  - /pgd/latest/bdr/eager/
+---
+
+Eager conflict resolution (also known as Eager Replication) prevents conflicts
+by aborting transactions that conflict with each other with a serializable error
+during the COMMIT decision process.
+
+It's configured using [Commit Scopes](../durability/commit-scopes)
+as one of the conflict resolution options for
+[Group Commit](../durability/group-commit).
+
+## Usage
+
+To enable Eager conflict resolution, the client needs to switch to
+a commit scope that uses it, either at session level or for individual
+transactions, as shown here:
+
+```sql
+BEGIN;
+
+SET LOCAL bdr.commit_scope = 'eager_scope';
+
+... other commands possible...
+```
+
+The client can continue to issue a `COMMIT` at the end of the
+transaction and let BDR manage the two phases:
+
+```sql
+COMMIT;
+```
+
+In this case, the `eager_scope` commit scope is defined something like this:
+
+```sql
+SELECT bdr.add_commit_scope(
+    commit_scope_name := 'eager_scope',
+    origin_node_group := 'top_group',
+    rule := 'ALL (top_group) GROUP COMMIT (conflict_resolution = eager, commit_decision = raft) ABORT ON (timeout = 60s)',
+    wait_for_ready := true
+);
+```
+
+!!! Upgrade note
+    The old `global` commit scope no longer exists. The above command creates a
+    scope that is the same as the old `global` scope with
+    `bdr.global_commit_timeout` set to `60s`.
+
+## Error handling
+
+Given that BDR manages the transaction, the client needs to check only the
+result of the `COMMIT`. (This is advisable in any case, including single-node
+Postgres.)
+
+In case of an origin node failure, the remaining nodes eventually
+(after at least `ABORT ON timeout`) decide to roll back the
+globally prepared transaction. Raft prevents inconsistent commit versus
+rollback decisions. However, this requires a majority of connected
+nodes. Disconnected nodes keep the transactions prepared
+to eventually commit them (or roll back) as needed to reconcile with the
+majority of nodes that might have decided and made further progress.
+
+## Effects of Eager Replication in general
+
+#### Increased commit latency
+
+Adding a synchronization step means additional communication between
+the nodes, resulting in additional latency at commit time.
Eager All-Node +Replication adds roughly two network roundtrips (to the furthest +peer node in the worst case). Logical standby nodes and nodes still +in the process of joining or catching up aren't included but +eventually receive changes. + +Before a peer node can confirm its local preparation of the +transaction, it also needs to apply it locally. This further adds to +the commit latency, depending on the size of the transaction. +This is independent of the `synchronous_commit` setting. + +#### Increased abort rate + +With single-node Postgres, or even with BDR in its default asynchronous +replication mode, errors at `COMMIT` time are rare. The additional +synchronization step adds a source of errors, so applications need to +be prepared to properly handle such errors (usually by applying a +retry loop). + +The rate of aborts depends solely on the workload. Large transactions +changing many rows are much more likely to conflict with other +concurrent transactions. diff --git a/product_docs/docs/pgd/5/consistency/index.mdx b/product_docs/docs/pgd/5/consistency/index.mdx new file mode 100644 index 00000000000..81d2b3b713f --- /dev/null +++ b/product_docs/docs/pgd/5/consistency/index.mdx @@ -0,0 +1,34 @@ +--- +title: Consistency + +navigation: + - conflicts + - column-level-conflicts + - crdt + - eager + +--- + +EDB Postgres Distributed is an active/active or multi-master DBMS. If used +asynchronously, writes to the same or related rows from multiple different +nodes can result in data [conflicts](conflicts) when using standard data types. + +Conflicts aren't errors. In most cases, they are events that PGD can detect +and resolve as they occur. Resolution depends on the +nature of the application and the meaning of the data, so it's important that +PGD provides the application a range of choices as to how to resolve +[conflicts](conflicts). + +By default, conflicts are resolved at the row level. When changes from two +nodes conflict, either the local or remote tuple is picked and the other +is discarded. For example, the commit timestamps might be compared for the two conflicting +changes and the newer one kept. This approach ensures that all nodes converge to the +same result and establishes commit-order-like semantics on the whole cluster. + +Column-level conflict detection and resolution is available with PGD, +described in [CLCD](column-level-conflicts). + +If you want to avoid conflicts, you can use +[Group Commit](/pgd/latest/durability/group-commit) with +[Eager Conflict Resolution](eager) or Conflict-free data types (CRDTs), +described in [CRDT](crdt) diff --git a/product_docs/docs/pgd/5/ddl.mdx b/product_docs/docs/pgd/5/ddl.mdx new file mode 100644 index 00000000000..c15eae6fba4 --- /dev/null +++ b/product_docs/docs/pgd/5/ddl.mdx @@ -0,0 +1,1129 @@ +--- +title: DDL replication +redirects: + - bdr/ddl + +--- + +DDL stands for data definition language, the subset of the SQL +language that creates, alters, and drops database objects. 
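+
+For example, the following statements are all DDL. This is only an illustrative
+sketch, and the table, column, and index names are hypothetical:
+
+```sql
+CREATE TABLE orders (id bigint PRIMARY KEY, total numeric);
+ALTER TABLE orders ADD COLUMN note text;
+CREATE INDEX orders_total_idx ON orders (total);
+DROP TABLE orders;
+```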
+ +For operational convenience and correctness, BDR replicates most DDL +actions, with these exceptions: + +- Temporary or unlogged relations +- Certain DDL statements (mostly long running) +- Locking commands (`LOCK`) +- Table maintenance commands (`VACUUM`, `ANALYZE`, `CLUSTER`, `REINDEX`) +- Actions of autovacuum +- Operational commands (`CHECKPOINT`, `ALTER SYSTEM`) +- Actions related to databases or tablespaces + +Automatic DDL replication makes +certain DDL changes easier without having to manually distribute +the DDL change to all nodes and ensure that they are consistent. + +In the default replication set, DDL is replicated to all nodes by default. +To replicate DDL, you must add a DDL replication filter to the +replication set. See [DDL replication filtering](repsets/#ddl-replication-filtering). + +BDR is significantly different from standalone PostgreSQL when it +comes to DDL replication. Treating it the same is the most +common issue with BDR. + +The main difference from table replication is that DDL replication doesn't +replicate the result of the DDL but the statement itself. This works +very well in most cases, although it introduces the requirement that the DDL +must execute similarly on all nodes. A more subtle point is that the DDL +must be immutable with respect to all datatype-specific parameter settings, +including any datatypes introduced by extensions (not built-in). +For example, the DDL statement must execute correctly in the default +encoding used on each node. + +## DDL replication options + +The `bdr.ddl_replication` parameter specifies replication behavior. + +`bdr.ddl_replication = on` is the default and replicates DDL to the +default replication set, which by default means all nodes. Nondefault +replication sets don't replicate DDL unless they have a +[DDL filter](repsets) +defined for them. + +You can also replicate DDL to specific replication sets using the +function `bdr.replicate_ddl_command()`. This can be helpful if you +want to run DDL commands when a node is down or if you want to have +indexes or partitions that exist on a subset of nodes or rep sets, +for example, all nodes at site1. + +``` +SELECT bdr.replicate_ddl_command( + 'CREATE INDEX CONCURRENTLY ON foo (col7);', + ARRAY['site1'], -- the replication sets + 'on'); -- ddl_locking to apply +``` + +While we don't recommend it, you can skip automatic DDL replication and +execute it manually on each node using `bdr.ddl_replication` configuration +parameters. + +``` +SET bdr.ddl_replication = off; +``` + +When set, it makes BDR skip both the global locking and the replication +of executed DDL commands. You must then run the DDL manually on all nodes. + +!!! Warning + Executing DDL manually on each node without global locking can + cause the whole BDR group to stop replicating if conflicting DDL or DML + executes concurrently. + +The `bdr.ddl_replication` parameter can be set only by the bdr_superuser, +by superuser, or in the `config` file. + +## Executing DDL on BDR systems + +A BDR group isn't the same as a standalone PostgreSQL server. It's +based on asynchronous multi-master replication without central +locking and without a transaction coordinator. This has important implications +when executing DDL. + +DDL that executes in parallel continues to do so with BDR. DDL execution +respects the parameters that affect parallel operation on each node as it +executes, so you might notice differences in the settings between nodes. 
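+
+For example, to compare the parallel-operation settings in effect on each node,
+you can run standard Postgres commands such as the following on every node. This
+is only a sketch; other parameters can also affect parallel execution:
+
+```sql
+SHOW max_parallel_workers;
+SHOW max_parallel_maintenance_workers;
+```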
+ +Prevent the execution of conflicting DDL, otherwise DDL replication +causes errors and the replication stops. + +BDR offers three levels of protection against those problems: + +`ddl_locking = 'dml'` is the best option for operations, usable when you execute +DDL from only one node at a time. This isn't the default, but we recommend +that you use this setting if you can control where DDL is executed from. Doing so +ensures that there are no inter-node conflicts. Intra-node conflicts are already +handled by PostgreSQL. + +`ddl_locking = on` is the strictest option and is best when DDL might execute +from any node concurrently and you want to ensure correctness. + +`ddl_locking = off` is the least strict option and is dangerous in general use. +This option skips locks altogether, avoiding any performance overhead, which makes +it a useful option when creating a new and empty database schema. + +These options can be set only by the bdr_superuser, by the superuser, or in the `config file`. + +When using the `bdr.replicate_ddl_command`, you can set this +parameter directly with the third argument, using the specified +`bdr.ddl_locking` setting only for the DDL commands passed to that +function. + +## DDL locking details + +Two kinds of locks enforce correctness of replicated DDL with BDR. + +The first kind is known as a global DDL lock and is used only when `ddl_locking = on`. +A global DDL lock prevents any other DDL from executing on the cluster while +each DDL statement runs. This ensures full correctness in the general case but +is too strict for many simple cases. BDR acquires a global lock on +DDL operations the first time in a transaction where schema changes are made. +This effectively serializes the DDL-executing transactions in the cluster. In +other words, while DDL is running, no other connection on any node can run +another DDL command, even if it affects different tables. + +To acquire a lock on DDL operations, the BDR node executing DDL contacts the +other nodes in a BDR group and asks them to grant it the exclusive right to +execute DDL. The lock request is sent by the regular replication stream, and the +nodes respond by the replication stream as well. So it's important that nodes (or +at least a majority of the nodes) run without much replication +delay. Otherwise it might take a long time for the node to acquire the DDL +lock. Once the majority of nodes agrees, the DDL execution is carried out. + +The ordering of DDL locking is decided using the Raft protocol. DDL statements +executed on one node are executed in the same sequence on all other nodes. + +To ensure that the node running a DDL has seen effects of all prior +DDLs run in the cluster, it waits until it has caught up with the node that +ran the previous DDL. If the node running the current DDL is lagging behind in +replication with respect to the node that ran the previous DDL, then it might +take a long time to acquire the lock. Hence it's preferable to run DDLs from a +single node or the nodes that have nearly caught up with replication changes +originating at other nodes. + +The second kind is known as a relation DML lock. This kind of lock is used when +either `ddl_locking = on` or `ddl_locking = dml`, and the DDL statement might cause +in-flight DML statements to fail. These failures can occur when you add or modify a constraint +such as a unique constraint, check constraint, or NOT NULL constraint. +Relation DML locks affect only one relation at a time. 
Relation DML
+locks ensure that no DDL executes while there are changes in the queue that
+might cause replication to halt with an error.
+
+To acquire the global DML lock on a table, the BDR node executing the DDL
+contacts all other nodes in a BDR group, asking them to lock the table
+against writes and waiting while all pending changes to that table are drained.
+Once all nodes are fully caught up, the originator of the DML lock is free
+to perform schema changes to the table and replicate them to the other nodes.
+
+The global DML lock holds an EXCLUSIVE LOCK on the table on each node,
+so it blocks DML, other DDL, VACUUM, and index commands against that table while
+it runs. This is true even if the global DML lock is held for a command that
+normally doesn't take an EXCLUSIVE LOCK or higher.
+
+Waiting for pending DML operations to drain can take a long time and even longer
+if replication is currently lagging.
+This means that schema changes affecting row representation and constraints,
+unlike with data changes, can be performed only while all configured nodes
+can be reached and are keeping up reasonably well with the current write rate.
+If such DDL commands must be performed while a node is down, first remove the
+down node from the configuration.
+
+If a DDL statement isn't replicated, no global locks are acquired.
+
+Locking behavior is specified by the `bdr.ddl_locking` parameter, as
+explained in [Executing DDL on BDR systems](#executing-ddl-on-bdr-systems):
+
+- `ddl_locking = on` takes the global DDL lock and, if needed, takes the relation DML lock.
+- `ddl_locking = dml` skips the global DDL lock and, if needed, takes the relation DML lock.
+- `ddl_locking = off` skips both the global DDL lock and the relation DML lock.
+
+Some BDR functions make DDL changes. For those functions,
+DDL locking behavior applies. This is noted in the docs for each function.
+
+Thus, `ddl_locking = dml` is safe only when you can guarantee that
+no conflicting DDL is executed from other nodes. With this setting,
+the statements that require only the global DDL lock don't use the global
+locking at all.
+
+`ddl_locking = off` is safe only when you can guarantee that there are no
+conflicting DDL and no conflicting DML operations on the database objects
+DDL executes on. If you turn locking off and then experience difficulties,
+you might lose in-flight changes to data. The user application team needs to
+resolve any issues caused.
+
+In some cases, concurrently executing DDL can be properly serialized. If
+serialization failures occur, the DDL might be reexecuted.
+
+DDL replication isn't active on logical standby nodes until they are promoted.
+
+Some BDR management functions act like DDL, meaning that they
+attempt to take global locks, and their actions are replicated if DDL
+replication is active. The full list of replicated functions is listed in
+[BDR functions that behave like DDL](#bdr-functions-that-behave-like-ddl).
+
+DDL executed on temporary tables never needs global locks.
+
+ALTER or DROP of an object created in the current transaction doesn't require a
+global DML lock.
+
+Monitoring of global DDL locks and global DML locks is shown in
+[Monitoring](monitoring).
+
+## Minimizing the impact of DDL
+
+These points are good operational advice for any database, and they become even
+more important with BDR:
+
+- To minimize the impact of DDL, make transactions performing DDL short,
+  don't combine them with lots of row changes, and avoid long
+  running foreign key or other constraint rechecks.
+ +- For `ALTER TABLE`, use `ADD CONSTRAINT NOT VALID` followed by another + transaction with `VALIDATE CONSTRAINT` rather than using `ADD CONSTRAINT` alone. + `VALIDATE CONSTRAINT` waits until replayed on all nodes, which + gives a noticeable delay to receive confirmations. + +- When indexing, use the `CONCURRENTLY` option whenever possible. + +An alternate way of executing long-running DDL is to disable DDL replication +and then to execute the DDL statement separately on each node. You can +still do this using a single SQL statement, as shown in the following example. +Global locking rules still apply, so be careful not to lock +yourself out with this type of usage, which is more of a +workaround. + +```sql +SELECT bdr.run_on_all_nodes($ddl$ + CREATE INDEX CONCURRENTLY index_a ON table_a(i); +$ddl$); +``` + +We recommend using the `bdr.run_on_all_nodes()` technique with `CREATE +INDEX CONCURRENTLY`, noting that DDL replication must be disabled for the whole +session because `CREATE INDEX CONCURRENTLY` is a multi-transaction command. +Avoid `CREATE INDEX` on production systems +since it prevents writes while it executes. +`REINDEX` is replicated in versions up to 3.6 but not with BDR 3.7 or later. +Avoid using `REINDEX` because of the AccessExclusiveLocks it holds. + +Instead, use `REINDEX CONCURRENTLY` (or `reindexdb --concurrently`), +which is available in PG12+ or 2QPG11+. + +You can disable DDL replication when using command-line utilities like this: + +```sql +$ export PGOPTIONS="-c bdr.ddl_replication=off" +$ pg_restore --section=post-data +``` + +Multiple DDL statements might benefit from bunching into a single transaction +rather than fired as individual statements, so take the DDL lock +only once. This might not be desirable if the table-level locks interfere with +normal operations. + +If DDL is holding up the system for too long, you can safely +cancel the DDL on the originating node with `Control-C` in psql or with `pg_cancel_backend()`. +You can't cancel a DDL lock from any other node. + +You can control how long the global lock takes with optional +global locking timeout settings. +`bdr.global_lock_timeout` limits how long the wait +for acquiring the global lock can take before it's canceled. +`bdr.global_lock_statement_timeout` limits the runtime length of any statement +in transaction that holds global locks, and `bdr.global_lock_idle_timeout` +sets the maximum allowed idle time (time between statements) for a transaction +holding any global locks. You can disable all of these timeouts by setting +their values to zero. + +Once the DDL operation has committed on the originating node, you can't cancel or abort it. +The BDR group must wait for it to apply successfully on +other nodes that confirmed the global lock and for them to acknowledge replay. +For this reason, keep DDL transactions short and fast. + +## Handling DDL with down nodes + +If the node initiating the global DDL lock goes down after it acquired +the global lock (either DDL or DML), the lock stays active. +The global locks don't time out, even if timeouts were set. +In case the node comes back up, it releases all the global +locks that it holds. + +If it stays down for a long time (or indefinitely), +remove the node from the BDR group to release the global locks. This +is one reason for executing emergency DDL using the `SET` command as +the bdr_superuser to update the `bdr.ddl_locking` value. 
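+
+For example, a bdr_superuser might run such emergency DDL as follows. This is
+only a sketch: the table and column names are hypothetical, and turning
+locking off carries the risks described earlier in this section:
+
+```sql
+SET bdr.ddl_locking = 'off';
+ALTER TABLE my_table ALTER COLUMN note SET DEFAULT '';
+RESET bdr.ddl_locking;
+```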
+ +If one of the other nodes goes down after it confirmed the global lock +but before the command acquiring it executed, the execution of +that command requesting the lock continues as if the node were up. + +As mentioned earlier, the global DDL lock requires only a majority of +the nodes to respond, and so it works if part of the cluster is down, as long as a +majority is running and reachable. But the DML lock can't be acquired +unless the whole cluster is available. + +With global DDL or global DML lock, if another node goes down, the +command continues normally, and the lock is released. + +## Statement-specific DDL replication concerns + +Not all commands can be replicated automatically. Such commands +are generally disallowed, unless DDL replication is turned off +by turning `bdr.ddl_replication` off. + +BDR prevents some DDL statements from running when it's active on a +database. This protects the consistency of the system by disallowing +statements that can't be replicated correctly or for which replication +isn't yet supported. + +If a statement isn't permitted under BDR, you can often find +another way to do the same thing. For example, you can't do an `ALTER TABLE`, +which adds a column with a volatile default value. But generally you can +rephrase that as a series of independent `ALTER TABLE` and `UPDATE` statements +that work. + +Generally unsupported statements are prevented from being +executed, raising a `feature_not_supported` (SQLSTATE `0A000`) error. + +Any DDL that references or relies on a temporary object can't +be replicated by BDR and throws an error if executed with DDL replication +enabled. + +## BDR DDL command handling matrix + +The following table describes the utility or DDL commands that are allowed, the ones that +are replicated, and the type of global lock they take when they're replicated. + +For some more complex statements like `ALTER TABLE`, these can differ depending +on the subcommands executed. Every such command has detailed explanation +under the following table. 
+ +| Command | Allowed | Replicated | Lock | +| -------------------------------- | --------------------------------------------- | ------------------------------------------ | ------------------------------------------------ | +| ALTER AGGREGATE | Y | Y | DDL | +| ALTER CAST | Y | Y | DDL | +| ALTER COLLATION | Y | Y | DDL | +| ALTER CONVERSION | Y | Y | DDL | +| ALTER DATABASE | Y | N | N | +| ALTER DATABASE LINK | Y | Y | DDL | +| ALTER DEFAULT PRIVILEGES | Y | Y | DDL | +| ALTER DIRECTORY | Y | Y | DDL | +| ALTER DOMAIN | Y | Y | DDL | +| ALTER EVENT TRIGGER | Y | Y | DDL | +| ALTER EXTENSION | Y | Y | DDL | +| ALTER FOREIGN DATA WRAPPER | Y | Y | DDL | +| ALTER FOREIGN TABLE | Y | Y | DDL | +| ALTER FUNCTION | Y | Y | DDL | +| ALTER INDEX | Y | Y | DDL | +| ALTER LANGUAGE | Y | Y | DDL | +| ALTER LARGE OBJECT | N | N | N | +| ALTER MATERIALIZED VIEW | Y | N | N | +| ALTER OPERATOR | Y | Y | DDL | +| ALTER OPERATOR CLASS | Y | Y | DDL | +| ALTER OPERATOR FAMILY | Y | Y | DDL | +| ALTER PACKAGE | Y | Y | DDL | +| ALTER POLICY | Y | Y | DDL | +| ALTER PROCEDURE | Y | Y | DDL | +| ALTER PROFILE | Y | Y | [Details](#bdr_ddl_allowed_CreateAlterProfile) | +| ALTER PUBLICATION | Y | Y | DDL | +| ALTER QUEUE | Y | Y | DDL | +| ALTER QUEUE TABLE | Y | Y | DDL | +| ALTER REDACTION POLICY | Y | Y | DDL | +| ALTER RESOURCE GROUP | Y | N | N | +| ALTER ROLE | Y | Y | DDL | +| ALTER ROUTINE | Y | Y | DDL | +| ALTER RULE | Y | Y | DDL | +| ALTER SCHEMA | Y | Y | DDL | +| ALTER SEQUENCE | [Details](#bdr_ddl_allowed_AlterSeqStmt) | Y | DML | +| ALTER SERVER | Y | Y | DDL | +| ALTER SESSION | Y | N | N | +| ALTER STATISTICS | Y | Y | DDL | +| ALTER SUBSCRIPTION | Y | Y | DDL | +| ALTER SYNONYM | Y | Y | DDL | +| ALTER SYSTEM | Y | N | N | +| ALTER TABLE | [Details](#bdr_ddl_allowed_AlterTableStmt) | Y | [Details](#bdr_ddl_lock_relation_AlterTableStmt) | +| ALTER TABLESPACE | Y | N | N | +| ALTER TEXT SEARCH CONFIGURATION | Y | Y | DDL | +| ALTER TEXT SEARCH DICTIONARY | Y | Y | DDL | +| ALTER TEXT SEARCH PARSER | Y | Y | DDL | +| ALTER TEXT SEARCH TEMPLATE | Y | Y | DDL | +| ALTER TRIGGER | Y | Y | DDL | +| ALTER TYPE | Y | Y | DDL | +| ALTER USER MAPPING | Y | Y | DDL | +| ALTER VIEW | Y | Y | DDL | +| ANALYZE | Y | N | N | +| BEGIN | Y | N | N | +| CHECKPOINT | Y | N | N | +| CLOSE | Y | N | N | +| CLOSE CURSOR | Y | N | N | +| CLOSE CURSOR ALL | Y | N | N | +| CLUSTER | Y | N | N | +| COMMENT | Y | [Details](#bdr_ddl_can_replicate_comment) | DDL | +| COMMIT | Y | N | N | +| COMMIT PREPARED | Y | N | N | +| COPY | Y | N | N | +| COPY FROM | Y | N | N | +| CREATE ACCESS METHOD | Y | Y | DDL | +| CREATE AGGREGATE | Y | Y | DDL | +| CREATE CAST | Y | Y | DDL | +| CREATE COLLATION | Y | Y | DDL | +| CREATE CONSTRAINT | Y | Y | DDL | +| CREATE CONVERSION | Y | Y | DDL | +| CREATE DATABASE | Y | N | N | +| CREATE DATABASE LINK | Y | Y | DDL | +| CREATE DIRECTORY | Y | Y | DDL | +| CREATE DOMAIN | Y | Y | DDL | +| CREATE EVENT TRIGGER | Y | Y | DDL | +| CREATE EXTENSION | Y | Y | DDL | +| CREATE FOREIGN DATA WRAPPER | Y | Y | DDL | +| CREATE FOREIGN TABLE | Y | Y | DDL | +| CREATE FUNCTION | Y | Y | DDL | +| CREATE INDEX | Y | Y | DML | +| CREATE LANGUAGE | Y | Y | DDL | +| CREATE MATERIALIZED VIEW | Y | N | N | +| CREATE OPERATOR | Y | Y | DDL | +| CREATE OPERATOR CLASS | Y | Y | DDL | +| CREATE OPERATOR FAMILY | Y | Y | DDL | +| CREATE PACKAGE | Y | Y | DDL | +| CREATE PACKAGE BODY | Y | Y | DDL | +| CREATE POLICY | Y | Y | DML | +| CREATE PROCEDURE | Y | Y | DDL | +| CREATE PROFILE | Y | Y | 
[Details](#bdr_ddl_allowed_CreateAlterProfile) | +| CREATE PUBLICATION | Y | Y | DDL | +| CREATE QUEUE | Y | Y | DDL | +| CREATE QUEUE TABLE | Y | Y | DDL | +| CREATE REDACTION POLICY | Y | Y | DDL | +| CREATE RESOURCE GROUP | Y | N | N | +| CREATE ROLE | Y | Y | DDL | +| CREATE ROUTINE | Y | Y | DDL | +| CREATE RULE | Y | Y | DDL | +| CREATE SCHEMA | Y | Y | DDL | +| CREATE SEQUENCE | [Details](#bdr_ddl_allowed_CreateSeqStmt) | Y | DDL | +| CREATE SERVER | Y | Y | DDL | +| CREATE STATISTICS | Y | Y | DDL | +| CREATE SUBSCRIPTION | Y | Y | DDL | +| CREATE SYNONYM | Y | Y | DDL | +| CREATE TABLE | [Details](#bdr_ddl_allowed_CreateStmt) | Y | DDL | +| CREATE TABLE AS | [Details](#bdr_ddl_allowed_CreateTableAsStmt) | Y | DDL | +| CREATE TABLESPACE | Y | N | N | +| CREATE TEXT SEARCH CONFIGURATION | Y | Y | DDL | +| CREATE TEXT SEARCH DICTIONARY | Y | Y | DDL | +| CREATE TEXT SEARCH PARSER | Y | Y | DDL | +| CREATE TEXT SEARCH TEMPLATE | Y | Y | DDL | +| CREATE TRANSFORM | Y | Y | DDL | +| CREATE TRIGGER | Y | Y | DDL | +| CREATE TYPE | Y | Y | DDL | +| CREATE TYPE BODY | Y | Y | DDL | +| CREATE USER MAPPING | Y | Y | DDL | +| CREATE VIEW | Y | Y | DDL | +| DEALLOCATE | Y | N | N | +| DEALLOCATE ALL | Y | N | N | +| DECLARE CURSOR | Y | N | N | +| DISCARD | Y | N | N | +| DISCARD ALL | Y | N | N | +| DISCARD PLANS | Y | N | N | +| DISCARD SEQUENCES | Y | N | N | +| DISCARD TEMP | Y | N | N | +| DO | Y | N | N | +| DROP ACCESS METHOD | Y | Y | DDL | +| DROP AGGREGATE | Y | Y | DDL | +| DROP CAST | Y | Y | DDL | +| DROP COLLATION | Y | Y | DDL | +| DROP CONSTRAINT | Y | Y | DDL | +| DROP CONVERSION | Y | Y | DDL | +| DROP DATABASE | Y | N | N | +| DROP DATABASE LINK | Y | Y | DDL | +| DROP DIRECTORY | Y | Y | DDL | +| DROP DOMAIN | Y | Y | DDL | +| DROP EVENT TRIGGER | Y | Y | DDL | +| DROP EXTENSION | Y | Y | DDL | +| DROP FOREIGN DATA WRAPPER | Y | Y | DDL | +| DROP FOREIGN TABLE | Y | Y | DDL | +| DROP FUNCTION | Y | Y | DDL | +| DROP INDEX | Y | Y | DDL | +| DROP LANGUAGE | Y | Y | DDL | +| DROP MATERIALIZED VIEW | Y | N | N | +| DROP OPERATOR | Y | Y | DDL | +| DROP OPERATOR CLASS | Y | Y | DDL | +| DROP OPERATOR FAMILY | Y | Y | DDL | +| DROP OWNED | Y | Y | DDL | +| DROP PACKAGE | Y | Y | DDL | +| DROP PACKAGE BODY | Y | Y | DDL | +| DROP POLICY | Y | Y | DDL | +| DROP PROCEDURE | Y | Y | DDL | +| DROP PROFILE | Y | Y | DDL | +| DROP PUBLICATION | Y | Y | DDL | +| DROP QUEUE | Y | Y | DDL | +| DROP QUEUE TABLE | Y | Y | DDL | +| DROP REDACTION POLICY | Y | Y | DDL | +| DROP RESOURCE GROUP | Y | N | N | +| DROP ROLE | Y | Y | DDL | +| DROP ROUTINE | Y | Y | DDL | +| DROP RULE | Y | Y | DDL | +| DROP SCHEMA | Y | Y | DDL | +| DROP SEQUENCE | Y | Y | DDL | +| DROP SERVER | Y | Y | DDL | +| DROP STATISTICS | Y | Y | DDL | +| DROP SUBSCRIPTION | Y | Y | DDL | +| DROP SYNONYM | Y | Y | DDL | +| DROP TABLE | Y | Y | DML | +| DROP TABLESPACE | Y | N | N | +| DROP TEXT SEARCH CONFIGURATION | Y | Y | DDL | +| DROP TEXT SEARCH DICTIONARY | Y | Y | DDL | +| DROP TEXT SEARCH PARSER | Y | Y | DDL | +| DROP TEXT SEARCH TEMPLATE | Y | Y | DDL | +| DROP TRANSFORM | Y | Y | DDL | +| DROP TRIGGER | Y | Y | DDL | +| DROP TYPE | Y | Y | DDL | +| DROP TYPE BODY | Y | Y | DDL | +| DROP USER MAPPING | Y | Y | DDL | +| DROP VIEW | Y | Y | DDL | +| EXECUTE | Y | N | N | +| EXPLAIN | Y | [Details](#bdr_ddl_can_replicate_explain) | [Details](#bdr_ddl_lock_explain_stmt) | +| FETCH | Y | N | N | +| GRANT | Y | [Details](#bdr_ddl_can_replicate_grant) | DDL | +| GRANT ROLE | Y | Y | DDL | +| IMPORT FOREIGN SCHEMA | Y | 
Y | DDL | +| LISTEN | Y | N | N | +| LOAD | Y | N | N | +| LOAD ROW DATA | Y | Y | DDL | +| LOCK TABLE | Y | N | [Details](#bdr_ddl_lock_lock_stmt) | +| MOVE | Y | N | N | +| NOTIFY | Y | N | N | +| PREPARE | Y | N | N | +| PREPARE TRANSACTION | Y | N | N | +| REASSIGN OWNED | Y | Y | DDL | +| REFRESH MATERIALIZED VIEW | Y | N | N | +| REINDEX | Y | N | N | +| RELEASE | Y | N | N | +| RESET | Y | N | N | +| REVOKE | Y | [Details](#bdr_ddl_can_replicate_grant) | DDL | +| REVOKE ROLE | Y | Y | DDL | +| ROLLBACK | Y | N | N | +| ROLLBACK PREPARED | Y | N | N | +| SAVEPOINT | Y | N | N | +| SECURITY LABEL | Y | [Details](#bdr_ddl_can_replicate_seclabel) | DDL | +| SELECT INTO | [Details](#bdr_ddl_allowed_CreateTableAsStmt) | Y | DDL | +| SET | Y | N | N | +| SET CONSTRAINTS | Y | N | N | +| SHOW | Y | N | N | +| START TRANSACTION | Y | N | N | +| TRUNCATE TABLE | Y | [Details](#bdr_ddl_can_replicate_truncate) | [Details](#bdr_ddl_lock_truncate_stmt) | +| UNLISTEN | Y | N | N | +| VACUUM | Y | N | N | + +
+
+### ALTER SEQUENCE
+
+Generally `ALTER SEQUENCE` is supported, but when using global
+sequences, some options have no effect.
+
+`ALTER SEQUENCE ... RENAME` isn't supported on galloc sequences (this restriction applies to galloc sequences only).
+`ALTER SEQUENCE ... SET SCHEMA` isn't supported on galloc sequences (this restriction applies to galloc sequences only).
+
+### ALTER TABLE
+
+Generally, `ALTER TABLE` commands are allowed. However, several
+subcommands aren't supported.
+
+
+
+#### ALTER TABLE disallowed commands
+
+Some variants of `ALTER TABLE` currently aren't allowed on a BDR node:
+
+- `ADD COLUMN ... DEFAULT (non-immutable expression)` — This isn't allowed because
+  it currently results in different data on different nodes. See
+  [Adding a column](#adding-a-column) for a suggested workaround.
+- `ADD CONSTRAINT ... EXCLUDE` — Exclusion constraints aren't supported for now.
+  Exclusion constraints don't make much sense in an asynchronous system and
+  lead to changes that can't be replayed.
+- `ALTER TABLE ... SET WITH[OUT] OIDS` — Isn't supported for the same reasons
+  as in `CREATE TABLE`.
+- `ALTER COLUMN ... SET STORAGE external` — Is rejected if the column is
+  one of the columns of the replica identity for the table.
+- `RENAME` — Can't rename an Autopartitioned table.
+- `SET SCHEMA` — Can't set the schema of an Autopartitioned table.
+- `ALTER COLUMN ... TYPE` — Changing a column's type isn't supported if the
+  command causes the whole table to be rewritten, which occurs when the change
+  isn't binary coercible.
+  Binary coercible changes might be allowed only one way. For example,
+  the change from `VARCHAR(128)` to `VARCHAR(256)` is binary coercible and therefore
+  allowed, whereas the change from `VARCHAR(256)` to `VARCHAR(128)` isn't binary coercible
+  and is therefore normally disallowed. A nonreplicated `ALTER COLUMN ... TYPE`
+  can be allowed if the column is automatically castable to the new type
+  (that is, the command doesn't include a `USING` clause). An example follows.
+  Table rewrites hold an
+  AccessExclusiveLock for extended periods on larger tables, so such commands
+  are likely to be infeasible on highly available databases in any case.
+  See [Changing a column's type](#changing-a-columns-type) for a suggested workaround.
+- `ALTER TABLE ... ADD FOREIGN KEY` — Isn't supported if the current user doesn't have
+  permission to read the referenced table or if the referenced table
+  has RLS restrictions enabled that the current user can't bypass.
+
+The following example fails because it tries to add a constant value of type `timestamp`
+onto a column of type `timestamptz`. The cast between `timestamp` and `timestamptz`
+relies on the time zone of the session and so isn't immutable.
+
+```sql
+ALTER TABLE foo
+    ADD expiry_date timestamptz DEFAULT timestamp '2100-01-01 00:00:00' NOT NULL;
+```
+
+Starting in BDR 3.7.4, you can add certain types of constraints, such as `CHECK` and
+`FOREIGN KEY` constraints, without taking a DML lock. This
+requires a two-step process of first creating a `NOT VALID` constraint
+and then validating the constraint in a separate transaction with the `ALTER TABLE ... VALIDATE CONSTRAINT`
+command. See [Adding a CONSTRAINT](#adding-a-constraint)
+for more details.
+
+
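+As an illustration, a minimal sketch of this two-step approach (the table and
+constraint names here are hypothetical) might look like the following, with each
+statement run in its own transaction:
+
+```sql
+-- Step 1: add the constraint without validating existing rows
+ALTER TABLE mytable
+    ADD CONSTRAINT mytable_value_check CHECK (value > 0) NOT VALID;
+
+-- Step 2: validate the existing rows in a separate transaction
+ALTER TABLE mytable VALIDATE CONSTRAINT mytable_value_check;
+```
+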
+ +#### ALTER TABLE locking + +The following variants of `ALTER TABLE` take only DDL lock and not a +DML lock: + +- `ALTER TABLE ... ADD COLUMN ... (immutable) DEFAULT` +- `ALTER TABLE ... ALTER COLUMN ... SET DEFAULT expression` +- `ALTER TABLE ... ALTER COLUMN ... DROP DEFAULT` +- `ALTER TABLE ... ALTER COLUMN ... TYPE` if it doesn't require rewrite +- `ALTER TABLE ... ALTER COLUMN ... SET STATISTICS` +- `ALTER TABLE ... VALIDATE CONSTRAINT` +- `ALTER TABLE ... ATTACH PARTITION` +- `ALTER TABLE ... DETACH PARTITION` +- `ALTER TABLE ... ENABLE TRIGGER` (`ENABLE REPLICA TRIGGER` still takes a DML lock) +- `ALTER TABLE ... CLUSTER ON` +- `ALTER TABLE ... SET WITHOUT CLUSTER` +- `ALTER TABLE ... SET ( storage_parameter = value [, ... ] )` +- `ALTER TABLE ... RESET ( storage_parameter = [, ... ] )` +- `ALTER TABLE ... OWNER TO` + +All other variants of `ALTER TABLE` take a DML lock on the table being modified. +Some variants of `ALTER TABLE` have restrictions, noted below. + +#### ALTER TABLE examples + +This next example works because the type change is binary coercible and so doesn't +cause a table rewrite. It executes as a catalog-only change. + +```sql +CREATE TABLE foo (id BIGINT PRIMARY KEY, description VARCHAR(20)); +ALTER TABLE foo ALTER COLUMN description TYPE VARCHAR(128); +``` + +However, making this change to reverse the command isn't possible because +the change from `VARCHAR(128)` to `VARCHAR(20)` isn't binary coercible. + +```sql +ALTER TABLE foo ALTER COLUMN description TYPE VARCHAR(20); +``` + +For workarounds, see [Restricted DDL workarounds](#restricted-ddl-workarounds). + +It's useful to provide context for different types of `ALTER TABLE ... +ALTER COLUMN TYPE` (ATCT) operations that are possible in general and in +nonreplicated environments. + +Some ATCT operations update only the metadata of the underlying column +type and don't require a rewrite of the underlying table data. This is +typically the case when the existing column type and the target type are +binary coercible. For example: + +```sql +CREATE TABLE sample (col1 BIGINT PRIMARY KEY, col2 VARCHAR(128), col3 INT); +ALTER TABLE sample ALTER COLUMN col2 TYPE VARCHAR(256); +``` + +You can also change the column type to `VARCHAR` or `TEXT` +data types because of binary coercibility. Again, this is just a metadata +update of the underlying column type. + +```sql +ALTER TABLE sample ALTER COLUMN col2 TYPE VARCHAR; +ALTER TABLE sample ALTER COLUMN col2 TYPE TEXT; +``` + +However, if you want to reduce the size of col2, then that leads to +a rewrite of the underlying table data. Rewrite of a table is normally +restricted. + +```sql +ALTER TABLE sample ALTER COLUMN col2 TYPE VARCHAR(64); +ERROR: ALTER TABLE ... ALTER COLUMN TYPE that rewrites table data may not affect replicated tables on a BDR node +``` + +To give an example with nontext types, consider col3 above with type +INTEGER. An ATCT operation that tries to convert to SMALLINT or BIGINT +fails in a similar manner as above. + +```sql +ALTER TABLE sample ALTER COLUMN col3 TYPE bigint; +ERROR: ALTER TABLE ... ALTER COLUMN TYPE that rewrites table data may not affect replicated tables on a BDR node +``` + +In both of these failing cases, there's an automatic assignment +cast from the current types to the target types. However, there's no +binary coercibility, which ends up causing a rewrite of the underlying +table data. 
+ +In such cases, in controlled DBA environments, you can change +the type of a column to an automatically castable one by adopting +a rolling upgrade for the type of this column in a nonreplicated +environment on all the nodes, one by one. Suppose the DDL isn't replicated +and the change of the column type is to an automatically castable one. +You can then allow the rewrite locally on the node +performing the alter, along with concurrent activity on other nodes on +this same table. You can then repeat this nonreplicated ATCT operation +on all the nodes one by one to bring about the desired change of the +column type across the entire EDB Postgres Distributed cluster. Because this +involves a rewrite, the activity still takes the DML lock for a +brief period and thus requires that the whole cluster is available. With +these specifics in place, you can carry out the rolling upgrade of the nonreplicated +alter activity like this: + +```sql +-- foreach node in EDB Postgres Distributed cluster do: +SET bdr.ddl_replication TO FALSE; +ALTER TABLE sample ALTER COLUMN col2 TYPE VARCHAR(64); +ALTER TABLE sample ALTER COLUMN col3 TYPE BIGINT; +RESET bdr.ddl_replication; +-- done +``` + +Due to automatic assignment casts being available for many data types, +this local nonreplicated ATCT operation supports a wide variety of +conversions. Also, ATCT operations that use a `USING` clause +are likely to fail because of the lack of automatic assignment casts. +This example shows a few common conversions with automatic assignment casts: + +```sql +-- foreach node in EDB Postgres Distributed cluster do: +SET bdr.ddl_replication TO FALSE; +ATCT operations to-from {INTEGER, SMALLINT, BIGINT} +ATCT operations to-from {CHAR(n), VARCHAR(n), VARCHAR, TEXT} +ATCT operations from numeric types to text types +RESET bdr.ddl_replication; +-- done +``` + +This example isn't an exhaustive list of possibly allowable ATCT +operations in a nonreplicated environment. Not all ATCT +operations work. The cases where no automatic assignment is +possible fail even if you disable DDL replication. So, while +conversion from numeric types to text types works in a nonreplicated +environment, conversion back from text type to numeric types fails. + +```sql +SET bdr.ddl_replication TO FALSE; +-- conversion from BIGINT to TEXT works +ALTER TABLE sample ALTER COLUMN col3 TYPE TEXT; +-- conversion from TEXT back to BIGINT fails +ALTER TABLE sample ALTER COLUMN col3 TYPE BIGINT; +ERROR: ALTER TABLE ... ALTER COLUMN TYPE which cannot be automatically cast to new type may not affect replicated tables on a BDR node +RESET bdr.ddl_replication; +``` + +While the ATCT operations in nonreplicated environments support a +variety of type conversions, the rewrite +can still fail if the underlying table data contains values that you can't +assign to the new data type. For example, suppose the current type for +a column is `VARCHAR(256)` and you try a nonreplicated ATCT +operation to convert it into `VARCHAR(128)`. If there's any existing data +in the table that's wider than 128 bytes, then the rewrite operation +fails locally. + +```sql +INSERT INTO sample VALUES (1, repeat('a', 200), 10); +SET bdr.ddl_replication TO FALSE; +ALTER TABLE sample ALTER COLUMN col2 TYPE VARCHAR(128); +INFO: in rewrite +ERROR: value too long for type character varying(128) +``` + +If underlying table data meets the characteristics of the new type, +then the rewrite succeeds. 
However,
+replication might fail if other nodes that haven't yet performed the
+nonreplicated rolling data type upgrade introduce new data that
+is wider than 128 bytes concurrently with this local ATCT operation. This
+brings replication to a halt in the cluster. So
+be aware of the data type restrictions and characteristics at the
+database and application levels while performing these
+nonreplicated rolling data type upgrade operations. We strongly
+recommend that you perform and test such ATCT operations in
+controlled DBA environments where the implications are fully understood. These
+ATCT operations are asymmetric, and backing out certain changes that fail
+can lead to table rewrites that take a long time.
+
+Also, you can't perform this implicitly castable `ALTER` activity in transaction blocks.
+
+### ALTER TYPE
+
+`ALTER TYPE` is replicated, but a global DML lock isn't
+applied to all tables that use that data type, since PostgreSQL doesn't
+record those dependencies. See [Restricted DDL workarounds](#restricted-ddl-workarounds).
+
+
+ +### COMMENT ON + +All variants of `COMMENT ON` are allowed, but +`COMMENT ON TABLESPACE/DATABASE/LARGE OBJECT` isn't replicated. + +
+
+### CREATE PROFILE or ALTER PROFILE
+
+The `PASSWORD_VERIFY_FUNCTION` associated with the profile should be `IMMUTABLE` if the function is `SECURITY DEFINER`.
+If it isn't, the `CREATE PROFILE` or `ALTER PROFILE` command is still replicated, but subsequent `CREATE USER` or `ALTER USER` commands using this profile
+break replication because the `writer` worker throws the error `cannot change current role within security-restricted operation`.
+
+
+ +### CREATE SEQUENCE + +Generally `CREATE SEQUENCE` is supported, but when using global +sequences, some options have no effect. + +
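+If you use global sequences, the related `bdr.alter_sequence_set_kind` function
+(listed later under BDR functions that behave like DDL) changes the kind of an
+existing sequence. As a hedged sketch, assuming the two-argument
+(sequence, kind) form and an illustrative sequence name:
+
+```sql
+CREATE SEQUENCE my_seq;
+-- Convert the local sequence to a galloc sequence (illustrative call)
+SELECT bdr.alter_sequence_set_kind('my_seq'::regclass, 'galloc');
+```
+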
+ +### CREATE TABLE + +Generally `CREATE TABLE` is supported, but `CREATE TABLE WITH OIDS` isn't +allowed on a BDR node. + +
+ +### CREATE TABLE AS and SELECT INTO + +`CREATE TABLE AS` and `SELECT INTO` are allowed only if all subcommands are +also allowed. + +### EXPLAIN + +Generally `EXPLAIN` is allowed, but because `EXPLAIN ANALYZE` can have side +effects on the database, there are some restrictions on it. + +
+ +#### EXPLAIN ANALYZE Replication + +`EXPLAIN ANALYZE` follows replication rules of the analyzed statement. + +
+ +#### EXPLAIN ANALYZE Locking + +`EXPLAIN ANALYZE` follows locking rules of the analyzed statement. + +
+
+### GRANT and REVOKE
+
+Generally `GRANT` and `REVOKE` statements are supported. However,
+`GRANT/REVOKE ON TABLESPACE/LARGE OBJECT` aren't replicated.
+
+
+
+### LOCK TABLE
+
+`LOCK TABLE` isn't replicated, but it might acquire the global DML lock when
+`bdr.lock_table_locking` is set to `on`.
+
+You can also use the `bdr.global_lock_table()` function to explicitly request a global DML
+lock.
+
+
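+For example, a minimal sketch of requesting the global DML lock explicitly
+(the table name is illustrative) looks like this:
+
+```sql
+BEGIN;
+-- Request a global DML lock on the table across the BDR group
+SELECT bdr.global_lock_table('mytable');
+-- ... perform the work that needs the global DML lock ...
+COMMIT;
+```
+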
+ +### SECURITY LABEL + +All variants of `SECURITY LABEL` are allowed, but +`SECURITY LABEL ON TABLESPACE/DATABASE/LARGE OBJECT` isn't replicated. + +
+
+### TRUNCATE Replication
+
+The `TRUNCATE` command is replicated as DML, not as a DDL statement. Whether
+a `TRUNCATE` on a table is replicated depends on the replication settings for
+each affected table.
+
+
+
+### TRUNCATE Locking
+
+Even though `TRUNCATE` isn't replicated the same way as other DDL, it can acquire
+the global DML lock when `bdr.truncate_locking` is set to `on`.
+
+### Role manipulation statements
+
+Users are global objects in a PostgreSQL instance, which means they span
+multiple databases while BDR operates on an individual database level. This means
+that role manipulation statement handling needs extra thought.
+
+BDR requires that any roles that are referenced by any replicated DDL must
+exist on all nodes. The roles don't have to have the same grants,
+password, and so on, but they must exist.
+
+BDR replicates role manipulation statements if `bdr.role_replication` is
+enabled (default) and role manipulation statements are run in a BDR-enabled
+database.
+
+The role manipulation statements include the following:
+
+- `CREATE ROLE`
+- `ALTER ROLE`
+- `DROP ROLE`
+- `GRANT ROLE`
+- `CREATE USER`
+- `ALTER USER`
+- `DROP USER`
+- `CREATE GROUP`
+- `ALTER GROUP`
+- `DROP GROUP`
+
+In general, either:
+
+- Configure the system with `bdr.role_replication = off` and
+  deploy all role changes (user and group) by external orchestration
+  tools like Ansible, Puppet, and Chef, or replicate them explicitly with
+  `bdr.replicate_ddl_command(...)`.
+
+- Configure the system so that exactly one BDR-enabled database
+  on the PostgreSQL instance has `bdr.role_replication = on` and run all
+  role management DDL on that database.
+
+We recommend that you run all role management commands in one
+database.
+
+If role replication is turned off, then the administrator must ensure that
+any roles used by DDL on one node also exist on the other nodes. Otherwise, BDR apply
+stalls with an error until the role is created on the other nodes.
+
+!!! Note
+    BDR doesn't capture and replicate role management statements when they
+    run on a non-BDR-enabled database in a BDR-enabled PostgreSQL instance.
+    For example, if you have DBs 'bdrdb' (bdr group member) and 'postgres' (bare db),
+    and `bdr.role_replication = on`, then a `CREATE USER` run in `bdrdb` is
+    replicated, but a `CREATE USER` run in `postgres` isn't.
+
+### Restricted DDL workarounds
+
+Some of the limitations of BDR DDL operation handling can be worked around.
+Often, splitting up the operation into smaller changes can produce the desired
+result that either isn't allowed as a single statement or requires excessive
+locking.
+
+#### Adding a CONSTRAINT
+
+You can add `CHECK` and `FOREIGN KEY` constraints without requiring a DML lock.
+This involves a two-step process.
+
+- `ALTER TABLE ... ADD CONSTRAINT ... NOT VALID`
+- `ALTER TABLE ... VALIDATE CONSTRAINT`
+
+Execute these steps in two different transactions. Both these
+steps take only a DDL lock on the table and hence can be run even when one
+or more nodes are down. But to validate a constraint, BDR must
+ensure that:
+- All nodes in the cluster see the `ADD CONSTRAINT` command.
+- The node validating the constraint has applied replication changes from all other nodes up to
+  the creation of the `NOT VALID` constraint on those nodes.
+
+So even though the new mechanism doesn't need all nodes
+to be up while validating the constraint, it still requires that all
+nodes have applied the `ALTER TABLE ... ADD CONSTRAINT ... NOT VALID`
+command and made enough progress. BDR waits for a consistent
+state to be reached before validating the constraint.
+
+The new facility requires the cluster to run with Raft protocol
+version 24 or later. 
If the Raft protocol isn't yet upgraded, the old +mechanism is used, resulting in a DML lock request. + +#### Adding a column + +To add a column with a volatile default, run these commands in +separate transactions: + +```sql + ALTER TABLE mytable ADD COLUMN newcolumn coltype; -- Note the lack of DEFAULT or NOT NULL + + ALTER TABLE mytable ALTER COLUMN newcolumn DEFAULT volatile-expression; + + BEGIN; + SELECT bdr.global_lock_table('mytable'); + UPDATE mytable SET newcolumn = default-expression; + COMMIT; +``` + +This approach splits schema changes and row changes into separate transactions that +BDR can execute and results in consistent data across all nodes in a +BDR group. + +For best results, batch the update into chunks so that you don't update more than +a few tens or hundreds of thousands of rows at once. You can do this using +a `PROCEDURE` with embedded transactions. + +The last batch of changes must run in a transaction that +takes a global DML lock on the table. Otherwise you can miss rows +that are inserted concurrently into the table on other nodes. + +If required, you can run `ALTER TABLE mytable ALTER COLUMN newcolumn NOT NULL;` after the `UPDATE` has finished. + +#### Changing a column's type + +PostgreSQL causes a table rewrite in some cases where it could be avoided, +for example: + +```sql +CREATE TABLE foo (id BIGINT PRIMARY KEY, description VARCHAR(128)); +ALTER TABLE foo ALTER COLUMN description TYPE VARCHAR(20); +``` + +You can rewrite this statement to avoid a table rewrite by making the +restriction a table constraint rather than a datatype change. The constraint can +then be validated in a subsequent command to avoid long locks, if you want. + +```sql +CREATE TABLE foo (id BIGINT PRIMARY KEY, description VARCHAR(128)); +ALTER TABLE foo + ALTER COLUMN description TYPE varchar, + ADD CONSTRAINT description_length_limit CHECK (length(description) <= 20) NOT VALID; +ALTER TABLE foo VALIDATE CONSTRAINT description_length_limit; +``` + +If the validation fails, then you can `UPDATE` just the failing rows. +You can use this technique for `TEXT` and `VARCHAR` using `length()` or with +`NUMERIC` datatype using `scale()`. + +In the general case for changing column type, first add a column of the desired type: + +``` +ALTER TABLE mytable ADD COLUMN newcolumn newtype; +``` + +Create a trigger defined as `BEFORE INSERT OR UPDATE ON mytable FOR EACH ROW ..`, +which assigns `NEW.newcolumn` to `NEW.oldcolumn` so that new writes to the +table update the new column automatically. + +`UPDATE` the table in batches to copy the value of `oldcolumn` to +`newcolumn` using a `PROCEDURE` with embedded transactions. Batching the work +helps reduce replication lag if it's a big table. Updating by range of +IDs or whatever method you prefer is fine. Alternatively, you can update the whole table in one pass for +smaller tables. + +`CREATE INDEX ...` any required indexes on the new column. It's safe to +use `CREATE INDEX ... CONCURRENTLY` run individually without DDL replication +on each node to reduce lock durations. + +`ALTER` the column to add a `NOT NULL` and `CHECK` constraints, if required. + +1. `BEGIN` a transaction. +1. `DROP` the trigger you added. +1. `ALTER TABLE` to add any `DEFAULT` required on the column. +1. `DROP` the old column. +1. `ALTER TABLE mytable RENAME COLUMN newcolumn TO oldcolumn`. +1. `COMMIT`. + +!!! Note + Because you're dropping a column, you might have to re-create views, procedures, + and so on that depend on the table. 
Be careful if you use `CASCADE` to drop the column,
+    as you must be sure you re-create everything that referred to it.
+
+#### Changing other types
+
+The `ALTER TYPE` statement is replicated, but affected tables aren't locked.
+
+When this DDL is used, ensure that the statement has successfully
+executed on all nodes before using the new type. You can achieve this using
+the `bdr.wait_slot_confirm_lsn()` function.
+
+This example ensures that the DDL is written to all nodes before using the new value
+in DML statements:
+
+```sql
+ALTER TYPE contact_method ADD VALUE 'email';
+SELECT bdr.wait_slot_confirm_lsn(NULL, NULL);
+```
+
+### BDR functions that behave like DDL
+
+The following BDR management functions act like DDL. This means that, if DDL
+replication is active and DDL filter settings allow it, they
+attempt to take global locks and their actions are replicated. For detailed
+information, see the documentation for the individual functions.
+
+Replication set management
+
+- `bdr.create_replication_set`
+- `bdr.alter_replication_set`
+- `bdr.drop_replication_set`
+- `bdr.replication_set_add_table`
+- `bdr.replication_set_remove_table`
+- `bdr.replication_set_add_ddl_filter`
+- `bdr.replication_set_remove_ddl_filter`
+
+Conflict management
+
+- `bdr.alter_table_conflict_detection`
+- `bdr.column_timestamps_enable`
+- `bdr.column_timestamps_disable`
+
+Sequence management
+
+- `bdr.alter_sequence_set_kind`
+
+Stream triggers
+
+- `bdr.create_conflict_trigger`
+- `bdr.create_transform_trigger`
+- `bdr.drop_trigger`
diff --git a/product_docs/docs/pgd/5/deployments.mdx b/product_docs/docs/pgd/5/deployments.mdx
new file mode 100644
index 00000000000..b2c0f0c0cec
--- /dev/null
+++ b/product_docs/docs/pgd/5/deployments.mdx
@@ -0,0 +1,16 @@
+---
+title: "Choosing your deployment method"
+indexCards: simple
+
+---
+
+You can deploy and install EDB Postgres Distributed products using the following methods:
+
+- [Trusted Postgres Architect](/tpa/latest) is an orchestration tool that uses Ansible to build Postgres clusters using a set of reference architectures that document how to set up and operate Postgres in various scenarios. Trusted Postgres Architect (TPA) represents the best practices followed by EDB, and its recommendations are as applicable to quick testbed setups as to production environments. See [Deploying with TPA](tpa) for more information.
+
+- BigAnimal is a fully managed database-as-a-service with built-in Oracle compatibility, running in your cloud account and operated by the Postgres experts. BigAnimal makes it easy to set up, manage, and scale your databases. The addition of extreme high availability support through EDB Postgres Distributed allows single-region Always On Gold clusters: two BDR groups in different availability zones in a single cloud region, with a witness node in a third availability zone. See the [Extreme high availability](/biganimal/latest/overview/02_high_availability/#extreme-high-availability-beta) topic in the [BigAnimal documentation](/biganimal/latest) for more information.
+
+Coming soon:
+
+- EDB Postgres Distributed for Kubernetes will be a Kubernetes operator designed, developed, and supported by EDB that covers the full lifecycle of highly available Postgres database clusters with a multi-master architecture, using BDR replication. It's based on the open source CloudNativePG operator and provides additional value, such as compatibility with Oracle using EDB Postgres Advanced Server and additional supported platforms such as IBM Power and OpenShift. 
+ diff --git a/product_docs/docs/pgd/5/durability/camo.mdx b/product_docs/docs/pgd/5/durability/camo.mdx new file mode 100644 index 00000000000..b0ae9247fe7 --- /dev/null +++ b/product_docs/docs/pgd/5/durability/camo.mdx @@ -0,0 +1,545 @@ +--- +navTitle: Commit At Most Once +title: Commit At Most Once +redirects: + - /pgd/latest/bdr/camo/ +--- + +The objective of the Commit At Most Once (CAMO) feature is to prevent +the application from committing more than once. + +Without CAMO, when a client loses connection after a COMMIT is +submitted, the application might not receive a reply from the server +and is therefore unsure whether the transaction committed. + +The application can't easily decide between the two options of: + +* Retrying the transaction with the same data, since this can in some cases + cause the data to be entered twice + +* Not retrying the transaction and risk that the data doesn't get + processed at all + +Either of those is a critical error with high-value data. + +One way to avoid this situation is to make sure that the transaction +includes at least one `INSERT` into a table with a unique index, but +that depends on the application design and requires application- +specific error-handling logic, so it isn't effective in all cases. + +The CAMO feature in PGD offers a more general solution and doesn't require +an `INSERT`. When activated by `bdr.commit_scope`, the application +receives a message containing the transaction identifier, if already +assigned. Otherwise, the first write statement in a transaction +sends that information to the client. + +If the application sends an explicit COMMIT, the protocol ensures that the +application receives the notification of the transaction identifier before the +COMMIT is sent. If the server doesn't reply to the COMMIT, the application can +handle this error by using the transaction identifier to request +the final status of the transaction from another PGD node. +If the prior transaction status is known, then the application can safely +decide whether to retry the transaction. + +CAMO works by creating a pair of partner nodes that are two PGD nodes +from the same PGD group. In this operation mode, each node in the pair knows +the outcome of any recent transaction executed on the other peer and especially +(for our need) knows the outcome of any transaction disconnected during COMMIT. +The node that receives the transactions from the application might be referred +to as "origin" and the node that confirms these transactions as "partner." +However, there's no difference in the CAMO configuration for the nodes in the +CAMO pair. The pair is symmetric. + +!!! Warning + CAMO requires changes to the user's application to take advantage of the + advanced error handling. Enabling a parameter isn't enough to gain + protection. Reference client implementations are provided to customers + on request. + +## Requirements + +To use CAMO, an application must issue an explicit COMMIT message +as a separate request (not as part of a multi-statement request). +CAMO can't provide status for transactions issued from procedures +or from single-statement transactions that use implicit commits. + +## Configuration + +Configuration of CAMO happens through [Commit Scopes](commit-scopes). + +## Failure scenarios + +Different failure scenarios occur in different +configurations. + +### Data persistence at receiver side + +By default, a PGL writer operates in +`bdr.synchronous_commit = off` mode when applying transactions +from remote nodes. 
This holds true for CAMO as well, meaning that
+transactions are confirmed to the origin node possibly before reaching
+the disk of the CAMO partner. In case of a crash or hardware failure,
+it's possible for a confirmed transaction to be unrecoverable on the
+CAMO partner by itself. This isn't an issue as long as the CAMO
+origin node remains operational, as it redistributes the
+transaction once the CAMO partner node recovers.
+
+This in turn means CAMO can protect against a single-node failure,
+which is correct for local mode as well as in combination
+with remote write.
+
+To cover an outage of both nodes of a CAMO pair, you can use
+`bdr.synchronous_commit = local` to enforce a flush prior to the
+pre-commit confirmation. This doesn't work with
+either remote write or local mode and has a performance
+impact due to I/O requirements on the CAMO partner in the
+latency-sensitive commit path.
+
+### Asynchronous mode
+
+When the `DEGRADE ON ... TO ASYNC` clause is used in the commit scope,
+a node detects whether its CAMO partner is ready. If not, it temporarily
+switches to asynchronous (local) mode. When in this mode, a node commits
+transactions locally until switching back to CAMO mode.
+
+This doesn't allow COMMIT status to be retrieved, but it does
+let you choose availability over consistency. This mode
+can tolerate a single-node failure. In case both nodes of a CAMO pair
+fail, they might choose incongruent commit decisions to maintain
+availability, leading to data inconsistencies.
+
+For a CAMO partner to switch to ready, it needs to be connected, and
+the estimated catchup interval needs to drop below
+the `timeout` value of `TO ASYNC`. The current readiness status of a CAMO
+partner can be checked with `bdr.is_camo_partner_ready`, while
+`bdr.node_replication_rates` provides the current estimate of the catchup
+time.
+
+The switch from CAMO-protected to asynchronous mode is only ever triggered by
+an actual CAMO transaction, either because the commit exceeds the
+`timeout` value of `TO ASYNC` or because the CAMO partner is already
+known to be disconnected at the time of commit. This switch is independent
+of the estimated catchup interval. If the CAMO pair is configured to
+require the current node to be the write lead of a group (as configured
+through the `enable_proxy_routing` node group option; see
+[Commit Scopes](commit-scopes) for the syntax),
+this can prevent a split-brain situation in which an isolated node
+switches to asynchronous mode. If `enable_proxy_routing` isn't set for the
+CAMO group, the origin node switches to asynchronous mode immediately.
+
+The switch from asynchronous mode to CAMO mode depends on the CAMO partner
+node, which initiates the connection. The CAMO partner tries to
+reconnect at least every 30 seconds. After connectivity is
+reestablished, it might therefore take up to 30 seconds until the CAMO
+partner connects back to its origin node. Any lag that accumulated on
+the CAMO partner further delays the switch back to CAMO-protected
+mode.
+
+Unlike during normal CAMO operation, in asynchronous mode there's no
+additional commit overhead. This can be problematic, as it allows the
+node to continuously process more transactions than the CAMO
+pair can normally process. Even if the CAMO partner eventually
+reconnects and applies transactions, its lag only ever increases
+in such a situation, preventing the CAMO protection from being reestablished. 
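+
+As a quick way to observe this state, you can check the partner's readiness and
+the current replication rates (a minimal sketch using the function and view
+mentioned above):
+
+```sql
+-- Is the CAMO partner expected to confirm transactions in time?
+SELECT bdr.is_camo_partner_ready();
+
+-- Current replication rates and estimated catchup interval for peer nodes
+SELECT * FROM bdr.node_replication_rates;
+```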
+To artificially throttle transactional throughput, PGD provides the +`bdr.camo_local_mode_delay` setting, which allows you to delay a COMMIT in +local mode by an arbitrary amount of time. We recommend measuring +commit times in normal CAMO mode during expected workloads and +configuring this delay accordingly. The default is 5 ms, which reflects +a asynchronous network and a relatively quick CAMO partner response. + +Consider the choice of whether to allow asynchronous mode in view of +the architecture and the availability requirements. The following examples +provide some detail. + +### Example + +This example considers a setup with two PGD nodes that are the +CAMO partner of each other. + +#### With asynchronous mode + +If asynchronous mode is allowed, there's no single point of failure. When one +node fails: + +* The other node can determine the status of all transactions that + were disconnected during COMMIT on the failed node. +* New write transactions are allowed: + * If the second node also fails, then the outcome of those + transactions that were being committed at that time is + unknown. + +#### Without asynchronous mode + +If asynchronous mode isn't allowed, then each node requires the other node +for committing transactions, that is, each node is a single point of +failure. When one node fails: + +* The other node can determine the status of all transactions that + were disconnected during COMMIT on the failed node. +* New write transactions are prevented until the node recovers. + +## Application use + +### Overview and requirements + +CAMO relies on a retry loop and specific error handling +on the client side. There are three aspects to it: + +* The result of a transaction's COMMIT needs to be checked and, in + case of a temporary error, the client must retry the transaction. +* Prior to COMMIT, the client must retrieve a global + identifier for the transaction, consisting of a node id and a + transaction id (both 32-bit integers). +* If the current server fails while attempting a COMMIT of a transaction, + the application must connect to its CAMO partner, retrieve the status + of that transaction, and retry depending on the response. + +The application must store the global transaction +identifier only for the purpose of verifying the transaction status in +case of disconnection during COMMIT. In particular, the application +doesn't need an additional persistence layer. If the application +fails, it needs only the information in the database to restart. + +### CAMO partner connection status + +The function `bdr.is_camo_partner_connected` allows checking the +connection status of a CAMO partner node configured in pair mode. +There currently is no equivalent for CAMO used with +Eager Replication. + +#### Synopsis + +```sql +bdr.is_camo_partner_connected() +``` + +#### Return value + +A Boolean value indicating whether the CAMO partner is currently +connected to a WAL sender process on the local node and therefore can +receive transactional data and send back confirmations. + +### CAMO partner readiness + +The function `bdr.is_camo_partner_ready` allows checking the readiness +status of a CAMO partner node configured in pair mode. Underneath, +this triggers the switch to and from local mode. + +#### Synopsis + +```sql +bdr.is_camo_partner_ready() +``` + +#### Return value + +A Boolean value indicating whether the CAMO partner can reasonably be +expected to confirm transactions originating from the local node in a +timely manner (before `timeout` for `TO ASYNC` expires). + +!!! 
Note + This function queries the past or current state. A + positive return value doesn't indicate whether the CAMO partner can + confirm future transactions. + +### Fetch the CAMO partner + +This function shows the local node's CAMO partner (configured by pair +mode). + +```sql +bdr.get_configured_camo_partner() +``` + +### Wait for consumption of the apply queue from the CAMO partner + +The function `bdr.wait_for_camo_partner_queue` is a wrapper of +`bdr.wait_for_apply_queue` defaulting to query the CAMO partner node. +It yields an error if the local node isn't part of a CAMO pair. + +#### Synopsis + +```sql +bdr.wait_for_camo_partner_queue() +``` + +### Transaction status between CAMO nodes + +This function enables a wait for CAMO transactions to be fully resolved. + +```sql +bdr.camo_transactions_resolved() +``` + +### Transaction status query function + +To check the status of a transaction that was being committed when the node +failed, the application must use this function: + +```sql +bdr.logical_transaction_status(node_id OID, xid OID, + require_camo_partner boolean) +``` + +With CAMO used in pair mode, use this function only on +a node that's part of a CAMO pair. Along with Eager +replication, you can use it on all nodes. + +In both cases, you must call the function within 15 minutes after +the commit was issued. The CAMO partner must regularly purge +such meta-information and therefore can't provide correct answers for +older transactions. + +Before querying the status of a transaction, this function waits for +the receive queue to be consumed and fully applied. This prevents +early negative answers for transactions that were +received but not yet applied. + +Despite its name, it's not always a read-only operation. +If the status is unknown, the CAMO partner decides whether to +commit or abort the transaction, storing that decision locally to +ensure consistency going forward. + +The client must not call this function before +attempting to commit on the origin. Otherwise the transaction might be +forced to roll back. + +#### Synopsis + +```sql +bdr.logical_transaction_status(node_id OID, xid OID, + require_camo_partner boolean DEFAULT true) +``` + +#### Parameters + +- `node_id` — The node id of the PGD node the transaction originates + from, usually retrieved by the client before COMMIT from the PQ + parameter `bdr.local_node_id`. +- `xid` — The transaction id on the origin node, usually retrieved by + the client before COMMIT from the PQ parameter `transaction_id` +- `require_camo_partner` — Defaults to true and enables configuration + checks. Set to false to disable these checks and query the + status of a transaction that was not a CAMO transaction. + +#### Return value + +The function returns one of these results: + +- `'committed'::TEXT` — The transaction was committed, is visible + on both nodes of the CAMO pair, and will eventually be replicated to + all other PGD nodes. No need for the client to retry it. + +- `'aborted'::TEXT` — The transaction was aborted and will not be + replicated to any other PGD node. The client needs to either + retry it or escalate the failure to commit the transaction. + +- `'in progress'::TEXT` — The transaction is still in progress on this + local node and wasn't committed or aborted yet. The transaction might be in the COMMIT phase, waiting for + the CAMO partner to confirm or deny the commit. The recommended + client reaction is to disconnect from the origin node and reconnect + to the CAMO partner to query that instead. 
With a load balancer or proxy + in between, where the client lacks control over which node gets + queried, the client can only poll repeatedly until the status + switches to either `'committed'` or `'aborted'`. + + For Eager All-Node Replication, peer nodes yield this result for + transactions that aren't yet committed or aborted. This means that + even transactions not yet replicated (or not even started on the + origin node) might yield an `in progress` result on a peer PGD node in + this case. However, the client must not query the transaction + status prior to attempting to commit on the origin. + +- `'unknown'::TEXT` — The transaction specified is unknown, either + because it's in the future, not replicated to that specific node + yet, or too far in the past. The status of such a transaction is + not yet or no longer known. This return value is a sign of improper + use by the client. + +The client must be prepared to retry the function call on error. + +### Connection pools and proxies + +The effect of connection pools and proxies needs to be considered when +designing a CAMO cluster. A proxy may freely distribute transactions +to all nodes in the commit group (i.e. to both nodes of a CAMO pair or +to all PGD nodes in case of Eager All Node Replication). + +Care needs to be taken to ensure that the application fetches +the proper node id: when using session pooling, the client remains +connected to the same node, so the node id remains constant for the +lifetime of the client session. However, with finer-grained transaction +pooling, the client needs to fetch the node id for every transaction (as +in the example given below). + +A client that is not directly connected to the PGD nodes might not even +notice a failover or switchover, but can always use the +`bdr.local_node_id` parameter to determine which node it is currently +connected to. In the crucial situation of a disconnect during COMMIT, +the proxy must properly forward that disconnect as an error to the +client applying the CAMO protocol. + +For CAMO in `received` mode, a proxy that potentially switches +between the CAMO pairs must use the `bdr.wait_for_camo_partner_queue` +function to prevent stale reads. + +### Example + +The following example demonstrates what a retry loop of a CAMO aware +client application should look like in C-like pseudo-code. It expects +two DSNs `origin_dsn` and `partner_dsn` providing connection information. +These usually are the same DSNs as used for the initial call to +`bdr.create_node`, and can be looked up in `bdr.node_summary`, column +`interface_connstr`. + +```shell +PGconn *conn = PQconnectdb(origin_dsn); + +loop { + // start a transaction + PQexec(conn, "BEGIN"); + + // apply transactional changes + PQexec(conn, "INSERT INTO ..."); + ... 
+ + // store a globally unique transaction identifier + node_id = PQparameterStatus(conn, "bdr.local_node_id"); + xid = PQparameterStatus(conn, "transaction_id"); + + // attempt to commit + PQexec(conn, "COMMIT"); + if (PQresultStatus(res) == PGRES_COMMAND_OK) + return SUCCESS; + else if (PQstatus(res) == CONNECTION_BAD) + { + // Re-connect to the partner + conn = PQconnectdb(partner_dsn); + // Check if successfully reconnected + if (!connectionEstablished()) + panic(); + + // Check the attempted transaction's status + sql = "SELECT bdr.logical_transaction_status($node_id, $xid)"; + txn_status = PQexec(conn, sql); + if (txn_status == "committed") + return SUCCESS; + else + continue; // to retry the transaction on the partner + } + else + { + // The connection is intact, but the transaction failed for some + // other reason. Differentiate between permanent and temporary + // errors. + if (isPermanentError()) + return FAILURE; + else + { + // Determine an appropriate delay to back-off to account for + // temporary failures due to congestion, so as to decrease + // the overall load put on the servers. + sleep(increasing_retry_delay); + + continue; + } + } +} +``` + +This example needs to be extended with proper logic for connecting, including +retries and error handling. If using a load balancer +(e.g. PgBouncer), re-connecting can be implemented by simply using +`PQreset`. Ensure that the load balancer only +ever redirects a client to a CAMO partner and not any other PGD node. + +In practice, an upper limit of retries is recommended. Depending on the +actions performed in the transaction, other temporary errors may be +possible and need to be handled by retrying the transaction depending +on the error code, similarly to the best practices on deadlocks or on +serialization failures while in `SERIALIZABLE` isolation mode. + +## Interaction with DDL and global locks + +Transactions protected by CAMO can contain DDL operations. However, DDL uses global locks, which already provide some +synchronization among nodes. See +[DDL locking details](../ddl#ddl-locking-details) for more +information. + +Combining CAMO with DDL imposes a higher latency and also +increases the chance of global deadlocks. We therefore recommend using a +relatively low `bdr.global_lock_timeout`, which aborts the DDL and +therefore resolves a deadlock in a reasonable amount of time. + +### Nontransactional DDL + +The following DDL operations aren't allowed in a transaction +block and therefore can't benefit from CAMO protection. For +these, CAMO is automatically disabled internally: + +* all concurrent index operations (`CREATE`, `DROP`, and `REINDEX`) +* `REINDEX DATABASE`, `REINDEX SCHEMA`, and `REINDEX SYSTEM` +* `VACUUM` +* `CLUSTER` without any parameter +* `ALTER TABLE DETACH PARTITION CONCURRENTLY` +* `ALTER TYPE [enum] ADD VALUE` +* `ALTER SYSTEM` +* `CREATE` and `DROP DATABASE` +* `CREATE` and `DROP TABLESPACE` +* `ALTER DATABASE [db] TABLESPACE` + +## CAMO limitations + +- CAMO is designed to query the results of a recently failed COMMIT on +the origin node, so in case of disconnection, code the application +to immediately request the transaction status from the CAMO partner. +Have as little delay as possible after the failure before +requesting the status. Applications must not rely on CAMO decisions +being stored for longer than 15 minutes. + +- If the application forgets the global identifier assigned, for example +as a result of a restart, there's no easy way to recover +it. 
Therefore, we recommend that applications wait for outstanding +transactions to end before shutting down. + +- For the client to apply proper checks, a transaction protected by CAMO +can't be a single statement with implicit transaction control. You also can't +use CAMO with a transaction-controlling procedure or +in a `DO` block that tries to start or end transactions. + +- CAMO resolves commit status but doesn't yet resolve pending +notifications on commit. CAMO and Eager replication options don't +allow the `NOTIFY` SQL command or the `pg_notify()` function. +They also don't allow `LISTEN` or `UNLISTEN`. + +- When replaying changes, CAMO transactions may detect conflicts just +the same as other transactions. If timestamp conflict detection is used, +the CAMO transaction uses the timestamp of the prepare on the origin +node, which is before the transaction becomes visible on the origin +node itself. + +- CAMO is not currently compatible with transaction streaming. Please +ensure to disable transaction streaming when planning to use +CAMO. This can be configured globally or in the PGD node group, see +[Transaction Streaming Configuration](../transaction-streaming#configuration). + +## Performance implications + +CAMO extends the Postgres replication protocol by adding a +message roundtrip at commit. Applications have a higher +commit latency than with asynchronous replication, mostly determined +by the roundtrip time between involved nodes. Increasing the number +of concurrent sessions can help to increase parallelism to +obtain reasonable transaction throughput. + +The CAMO partner confirming transactions must store transaction +states. Compared to non-CAMO operation, this might require an +additional seek for each transaction applied from the origin. + +## Client application testing + +Proper use of CAMO on the client side isn't trivial. We strongly +recommend testing the application behavior with the PGD +cluster against failure scenarios such as node crashes or network +outages. diff --git a/product_docs/docs/pgd/5/durability/commit-scopes.mdx b/product_docs/docs/pgd/5/durability/commit-scopes.mdx new file mode 100644 index 00000000000..a65414a95c5 --- /dev/null +++ b/product_docs/docs/pgd/5/durability/commit-scopes.mdx @@ -0,0 +1,266 @@ +--- +title: Commit Scopes +--- + +Commit Scopes give applications granular control about durability and +consistency of EDB Postgres Distributed. + +A Commit Scope is a named rule that describes behavior of COMMIT replication. +The actual behavior depends on whether a Commit Scope uses +[Group Commit](group-commit), [Commit At Most Once](camo), +[Lag Control](lag-control) or combination of these. + +## Configuration + +To use Group Commit, first define a commit scope. This +determines the BDR nodes involved in the commit of a transaction. +Once a scope is established, you can configure a transaction to use +Group Commit as follows: + +```sql +BEGIN; +SET LOCAL bdr.commit_scope = 'example_scope'; +... +COMMIT; +``` + +The commit scope must be set before the transaction has written any data. + +For this example, you might previously have defined the commit scope as: + +```sql +SELECT bdr.add_commit_scope( + commit_scope_name := 'example_scope', + origin_node_group := 'example_bdr_group', + rule := 'ANY 2 (example_bdr_group)', + wait_for_ready := true +); +``` + +This assumes a *node group* named `example_bdr_group` exists and +includes at least two BDR nodes as members, either directly or in +subgroups. 
Any transaction committed in the `example_scope`
+requires one extra confirmation from a BDR node in the group.
+Together with the origin node, this accounts for "ANY 2" nodes out of
+the group, on which the transaction is guaranteed to be visible and
+durable after the commit.
+
+### Origin groups
+
+Rules for commit scopes can depend on the node the transaction is
+committed on, that is, the node that acts as the origin for the transaction.
+To make this transparent for the application, BDR allows a commit
+scope to define different rules depending on where the transaction
+originates from.
+
+For example, consider an EDB Postgres Distributed cluster with nodes
+spread across two data centers: a left and a right one.
+Assume the top-level BDR node group
+is called `top_group`. You can use the following commands to set up
+subgroups and create a commit scope requiring all nodes in the local
+data center to confirm the transaction but only one node from the
+remote one.
+
+```sql
+-- create sub-groups
+SELECT bdr.create_node_group(
+    node_group_name := 'left_dc',
+    parent_group_name := 'top_group',
+    join_node_group := false
+);
+SELECT bdr.create_node_group(
+    node_group_name := 'right_dc',
+    parent_group_name := 'top_group',
+    join_node_group := false
+);
+
+-- create a commit scope with individual rules
+-- for each sub-group
+SELECT bdr.add_commit_scope(
+    commit_scope_name := 'example_scope',
+    origin_node_group := 'left_dc',
+    rule := 'ALL (left_dc) AND ANY 1 (right_dc)',
+    wait_for_ready := true
+);
+SELECT bdr.add_commit_scope(
+    commit_scope_name := 'example_scope',
+    origin_node_group := 'right_dc',
+    rule := 'ANY 1 (left_dc) AND ALL (right_dc)',
+    wait_for_ready := true
+);
+```
+
+Now, using `example_scope` on any node that's part of `left_dc` uses the
+first rule, while using the same scope on a node that's part of `right_dc`
+uses the second rule. This is an effective way of creating an inverted scope
+without having to juggle scope names in the application.
+
+In addition to this, each group can also have a default commit scope specified using
+the `bdr.alter_node_group_option` admin interface.
+
+Making the scopes above the default ones for all transactions originating on
+nodes within those groups looks like this:
+
+```sql
+SELECT bdr.alter_node_group_option(
+    node_group_name := 'left_dc',
+    config_key := 'default_commit_scope',
+    config_value := 'example_scope'
+);
+SELECT bdr.alter_node_group_option(
+    node_group_name := 'right_dc',
+    config_key := 'default_commit_scope',
+    config_value := 'example_scope'
+);
+```
+
+### Confirmation levels
+
+BDR nodes can send confirmations for a transaction at different points
+in time. In increasing levels of protection, from the perspective of the
+confirming node, these are:
+
+* `received` — A remote BDR node confirms the transaction immediately
+  after receiving it, prior to starting the local application.
+* `replicated` — Confirm after applying changes of the transaction
+  but before flushing them to disk.
+* `durable` — Confirm the transaction after all of its changes are
+  flushed to disk.
+* `visible` (default) — Confirm the transaction after all of its
+  changes are flushed to disk and it's visible to
+  concurrent transactions. 
+
+In rules for commit scopes, you can append these confirmation levels
+to the node group definition in parentheses with `ON` as follows:
+
+* `ANY 2 (right_dc) ON replicated`
+* `ALL (left_dc) ON visible` (the default, which can be omitted)
+* `ALL (left_dc) ON received AND ANY 1 (right_dc) ON durable`
+
+## Reference
+
+### Commit scope grammar
+
+For reference, the grammar for commit scopes is composed as follows:
+
+```
+commit_scope:
+    commit_scope_operation [AND ...]
+
+commit_scope_operation:
+    commit_scope_group [ ON { received|replicated|durable|visible } ] commit_scope_kind
+
+commit_scope_group:
+{ ANY num (node_group [, ...])
+  | MAJORITY (node_group [, ...])
+  | ALL (node_group [, ...]) }
+
+commit_scope_kind:
+{ GROUP_COMMIT [ ( group_commit_parameter = value ) ] [ ABORT ON ( abort_on_parameter = value ) ]
+  | CAMO [ DEGRADE ON ( degrade_on_parameter = value ) TO ASYNC ]
+  | LAG_CONTROL [ ( lag_control_parameter = value ) ] }
+```
+
+#### Parameters
+
+* `node_group` - the name of a node group
+* `( group_commit_parameter = value )` - options for Group Commit
+  * `transaction_tracking` (boolean) - specifies whether the status of
+    the transaction should be tracked
+  * `conflict_resolution` (enum) - how to handle conflicts. The value `async`
+    means conflicts are resolved asynchronously during replication using the
+    conflict resolution policy, while `eager` means conflicts are resolved
+    eagerly during COMMIT by aborting one of the conflicting transactions.
+  * `commit_decision` (enum) - how the COMMIT decision is made. The value
+    `group` means the `commit_scope_group` specification also affects the
+    COMMIT decision, not just durability. The value `partner` means the
+    partner node decides whether the transaction can be committed (this is
+    allowed only on groups with 2 data nodes). The value `raft` means the
+    COMMIT decision is made using Raft consensus independently of the
+    `commit_scope_group` consensus.
+* `ABORT ON ( abort_on_parameter = value )` - allows automatic transaction
+  abort on timeout
+  * `timeout` (interval) - timeout in milliseconds (accepts other units)
+* `DEGRADE ON ( degrade_on_parameter = value )` - allows degrading to asynchronous
+  operation on timeout
+  * `timeout` (interval) - timeout in milliseconds (accepts other units) after
+    which operation becomes asynchronous
+  * `require_write_lead` (boolean) - whether the node has to be a write lead
+    to be able to switch to asynchronous mode
+* `( lag_control_parameter = value )` - options for Lag Control
+  * `max_lag_size` (int) - maximum allowed lag based on WAL bytes
+  * `max_lag_time` (interval) - maximum allowed lag based on wall clock sampling
+  * `max_commit_delay` (interval) - maximum delay that can be injected into a
+    commit to try to keep within the lag limits
+
+!!! Note
+    The `CAMO` commit scope kind is mostly syntactic sugar for
+    `GROUP_COMMIT (transaction_tracking = true, commit_decision = partner)` with
+    an additional `DEGRADE ON` clause. It's expected that `GROUP_COMMIT` will
+    eventually gain a `DEGRADE ON` clause as well, making the `CAMO` syntax deprecated.
+
+!!! Note
+    While the grammar for `synchronous_standby_names` and Commit
+    Scopes can look very similar, it's important to note that the former
+    doesn't account for the origin node, but the latter does.
+    Therefore, for example `synchronous_standby_names = 'ANY 1 (..)'`
+    is equivalent to a Commit Scope of `ANY 2 (...)`. 
+    makes reasoning about majority easier and reflects that the origin
+    node also contributes to the durability and visibility of the
+    transaction.
+
+### Adding a commit scope rule
+
+The function `bdr.add_commit_scope` creates a rule for the given
+commit scope name and origin node group. If the rule is the same for
+all nodes in the EDB Postgres Distributed cluster, invoking this function once for the
+top-level node group is enough to fully define the commit scope.
+
+Alternatively, you can invoke it multiple times with the same
+`commit_scope_name` but different origin node groups and rules for
+commit scopes that vary depending on the origin of the transaction.
+
+#### Synopsis
+
+```sql
+bdr.add_commit_scope(
+    commit_scope_name NAME,
+    origin_node_group NAME,
+    rule TEXT,
+    wait_for_ready boolean DEFAULT true)
+```
+
+### Changing a commit scope rule
+
+To change a specific rule for a single origin node group in a
+commit scope, you can use the function `bdr.alter_commit_scope`.
+
+#### Synopsis
+
+```sql
+bdr.alter_commit_scope(
+    commit_scope_name NAME,
+    origin_node_group NAME,
+    rule TEXT)
+```
+
+### Removing a commit scope rule
+
+You can use `bdr.remove_commit_scope` to drop a single rule in
+a commit scope. If you define multiple rules for the commit scope, you must invoke
+this function once per rule to fully remove the entire
+commit scope.
+
+#### Synopsis
+
+```sql
+bdr.remove_commit_scope(
+    commit_scope_name NAME,
+    origin_node_group NAME)
+```
+
+!!! Note
+    Removing a commit scope that is still used as the default by a node
+    group is not allowed.
+
diff --git a/product_docs/docs/pgd/5/durability/group-commit.mdx b/product_docs/docs/pgd/5/durability/group-commit.mdx
new file mode 100644
index 00000000000..feaaa72e989
--- /dev/null
+++ b/product_docs/docs/pgd/5/durability/group-commit.mdx
@@ -0,0 +1,108 @@
+---
+title: Group Commit
+redirects:
+  - /pgd/latest/bdr/group-commit/
+---
+
+The goal of Group Commit is to protect against data loss
+in case of single node failures or temporary outages. You achieve this
+by requiring more than one BDR node to successfully receive and
+confirm a transaction at COMMIT time.
+
+## Requirements
+
+During normal operation, Group Commit is completely transparent to the
+application. Transactions that were in progress during failover need the
+reconciliation phase triggered or consolidated by either the application or a
+proxy in between. This currently happens only when either the origin node
+recovers or when it's parted from the cluster. This is the same as with
+Postgres's legacy built-in synchronous replication.
+
+Transactions committed with Group Commit use two-phase commit underneath.
+Therefore, configure `max_prepared_transactions` high enough to handle all such
+transactions originating per node.
+
+## Configuration
+
+To use Group Commit, first define a [Commit Scope](commit-scopes). This
+determines the BDR nodes involved in the commit of a transaction.
+
+## Behavior
+
+The behavior of Group Commit depends on the configuration applied by the Commit
+Scope.
+
+### Commit decisions
+
+Group Commit can be configured to decide commits in three different ways:
+`group`, `partner`, and `raft`.
+
+The `group` commit decision is made through the consensus specified using the
+same commit scope group settings used for durability purposes. The difference
+is that the commit decision is made based on PREPARE replication, while the
+durability checks COMMIT (PREPARED) replication.
+
+The `partner` decision is what [Commit At Most Once](camo) uses. This approach
+works only when there are two data nodes in the node group. These two nodes are
+partners of each other, and the replica, rather than the origin, decides whether
+to commit. This approach requires application changes to use
+the CAMO transaction protocol to work correctly, as the application is in some
+way part of the consensus. For more on this approach, see the [CAMO](camo) chapter.
+
+The last option is `raft`, which uses the built-in Raft consensus to decide
+whether the commit can happen. Currently the global Raft is used, so for this to
+work a majority of nodes across the whole cluster must be available.
+
+### Conflict resolution
+
+Conflict resolution can be either `async` or `eager`.
+
+`async` means that PGD does optimistic conflict resolution during replication
+(regardless of whether the origin transaction committed or is still in progress),
+using the row-level resolution as configured for the given node. See the
+[Conflicts](../consistency/conflicts) chapter for a detailed description
+of how asynchronous conflict resolution works.
+
+`eager` means that conflicts are resolved eagerly (as part of the agreement on
+COMMIT), and conflicting transactions get aborted with a serialization error.
+This approach provides greater isolation than asynchronous resolution at the
+price of performance. For the details of how Eager conflict resolution works,
+see [Eager conflict resolution](../consistency/eager).
+
+### Aborts
+
+To prevent a transaction that can't get consensus on the COMMIT from hanging
+forever, the `ABORT ON` clause allows specifying a timeout after which the
+transaction abort is requested. Note that if the transaction is already
+decided to be committed at the time the abort request is sent, the transaction
+eventually COMMITs even though the client might receive an abort message.
+
+## Limitations
+
+Group Commit transactions can't yet execute DDL,
+nor do they support explicit two-phase commit. These might be allowed in
+later releases. However, the `TRUNCATE` command is allowed.
+
+Currently, only CAMO transactions can be combined with the
+`DEGRADE ON ... TO ASYNC` clause for switching to asynchronous operation in
+case of lowered availability.
+
+Neither Eager nor CAMO transactions are currently supported in combination
+with the Decoding Worker feature or with transaction streaming.
+Installations using Eager must keep `enable_wal_decoder` and `streaming_mode`
+disabled for the BDR node group.
+
+Synchronous replication uses a mechanism for transaction confirmation
+different from Group Commit. The two aren't compatible, and you must not use
+them together. Therefore, whenever you use Group Commit transactions,
+make sure none of the BDR nodes are configured in
+`synchronous_standby_names`.
+
+Currently, Raft commit decisions are extremely slow, producing very low TPS.
+They're recommended only in combination with the `eager` conflict resolution
+setting, to get the Eager All-Node Replication behavior of PGD 4 and older.
+
+Combining different commit decision options in the same transaction is not
+supported.
+
+Combining different conflict resolution options in the same transaction is not
+supported.
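+
+## Example
+
+The following is a minimal sketch of how a Group Commit commit scope might be
+defined and used, based on the grammar and functions described in
+[Commit Scopes](commit-scopes). The group name `example_group`, the scope name,
+and the 10-second abort timeout are illustrative values, not recommendations.
+
+```sql
+-- Require confirmation from any two nodes in the group (the origin counts
+-- as one of them) and request an abort if consensus isn't reached in time.
+SELECT bdr.add_commit_scope(
+    commit_scope_name := 'example_group_commit',
+    origin_node_group := 'example_group',
+    rule := 'ANY 2 (example_group) GROUP_COMMIT ABORT ON (timeout = 10s)',
+    wait_for_ready := true
+);
+
+-- Use the scope for a single transaction.
+BEGIN;
+SET LOCAL bdr.commit_scope = 'example_group_commit';
+-- ... application statements ...
+COMMIT;
+```
+
+Alternatively, you can make the scope the group default with
+`bdr.alter_node_group_option`, as described in [Commit Scopes](commit-scopes).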
diff --git a/product_docs/docs/pgd/5/durability/index.mdx b/product_docs/docs/pgd/5/durability/index.mdx
new file mode 100644
index 00000000000..2a10d76a082
--- /dev/null
+++ b/product_docs/docs/pgd/5/durability/index.mdx
@@ -0,0 +1,300 @@
+---
+title: Durability and performance options
+
+navigation:
+  - commit-scopes
+  - group-commit
+  - camo
+  - lag-control
+
+redirects:
+  - /pgd/latest/bdr/durability/
+  - /pgd/latest/choosing_durability/
+---
+
+## Overview
+
+EDB Postgres Distributed allows you to choose from several replication
+configurations based on your durability, consistency, availability, and
+performance needs using *Commit Scopes*.
+
+In its basic configuration, EDB Postgres Distributed uses asynchronous
+replication. However, commit scopes can change both the default and the
+per-transaction behavior. It's also possible to configure legacy Postgres
+synchronous replication using the standard `synchronous_standby_names` in the
+same way as for the built-in physical or logical replication. Commit scopes,
+however, provide much more flexibility and control over replication behavior.
+
+The different synchronization settings affect three properties of interest
+to applications that are related but can all be implemented individually:
+
+- *Durability*: Writing to multiple nodes increases crash resilience
+  and allows you to recover the data after a crash and restart.
+- *Visibility*: With the commit confirmation to the client, the database
+  guarantees immediate visibility of the committed transaction on some
+  sets of nodes.
+- *Conflict handling*: Conflicts can be handled either optimistically
+  post-commit, with conflicts being resolved when the transaction is replicated
+  based on commit timestamps, or pessimistically
+  pre-commit, where the client can rely on the transaction to eventually be
+  applied on all nodes without further conflicts or to get an abort, directly
+  informing the client of an error.
+
+Commit Scopes provide the following ways of controlling the durability of a
+transaction:
+
+- [Group Commit](group-commit). This option controls which and how many nodes
+  have to reach a consensus before the transaction is considered committable
+  and at what stage of replication it can be considered committed. This also
+  allows controlling the visibility ordering of the transaction.
+- [CAMO](camo). A variant of Group Commit in which the client is part of the
+  consensus.
+- [Lag Control](lag-control). This option controls how far behind nodes can
+  be in terms of replication before allowing a commit to proceed.
+
+Postgres provides [Physical Streaming Replication](https://www.postgresql.org/docs/current/warm-standby.html#STREAMING-REPLICATION)
+(PSR), which is unidirectional but offers a [synchronous variant](https://www.postgresql.org/docs/current/warm-standby.html#SYNCHRONOUS-REPLICATION).
+For backward compatibility, BDR still supports configuring synchronous
+replication with `synchronous_commit` and `synchronous_standby_names`. See
+[Legacy synchronous replication](#legacy-synchronous-replication-using-pgd),
+but consider using [Group Commit](group-commit) instead.
+
+## Terms and definitions
+
+BDR nodes take different roles during the replication of a transaction.
+These are implicitly assigned per transaction and are unrelated even for
+concurrent transactions.
+
+* The *origin* is the node that receives the transaction from the
+  client or application.
It's the node processing the transaction + first, initiating replication to other BDR nodes, and responding back + to the client with a confirmation or an error. + +* A *partner* node is a BDR node expected to confirm transactions + either according to Group Commit requirements. + +* A *commit group* is the group of all BDR nodes involved in the + commit, that is, the origin and all of its partner nodes, which can be + just a few or all peer nodes. + +## Comparison + +Most options for synchronous replication available to +BDR allow for different levels of synchronization, offering different +tradeoffs between performance and protection against node or network +outages. + +The following table summarizes what a client can expect from a peer +node replicated to after having received a COMMIT confirmation from +the origin node the transaction was issued to. The Mode column takes +on different meaning depending on the variant. For PSR and legacy +synchronous replication with BDR, it refers to the +`synchronous_commit` setting. And for Commit Scopes, it refers to the +confirmation requirements of the +[commit scope configuration](commit-scopes#configuration). + +| Variant | Mode | Received | Visible | Durable | +|---------------------|-----------------------|----------|---------|---------| +| PSR Async | off (default) | no | no | no | +| PGD Async | off (default) | no | no | no | +| PGD Lag Control | 'ON received' nodes | no | no | no | +| PGD Lag Control | 'ON replicated' nodes | no | no | no | +| PGD Lag Control | 'ON durable' nodes | no | no | no | +| PGD Lag Control | 'ON visible' nodes | no | no | no | +| PSR Sync | remote_write (2) | yes | no | no (1) | +| PSR Sync | on (2) | yes | no | yes | +| PSR Sync | remote_apply (2) | yes | yes | yes | +| PGD Group Commit | 'ON received' nodes | yes | no | no | +| PGD Group Commit | 'ON replicated' nodes | yes | no | no | +| PGD Group Commit | 'ON durable' nodes | yes | no | yes | +| PGD Group Commit | 'ON visible' nodes | yes | yes | yes | +| PGD CAMO | 'ON received' nodes | yes | no | no | +| PGD CAMO | 'ON replicated' nodes | yes | no | no | +| PGD CAMO | 'ON durable' nodes | yes | no | yes | +| PGD CAMO | 'ON visible' nodes | yes | yes | yes | +| PGD Legacy Sync (3) | remote_write (2) | yes | no | no | +| PGD Legacy Sync (3) | on (2) | yes | yes | yes | +| PGD Legacy Sync (3) | remote_apply (2) | yes | yes | yes | + +*(1) Written to the OS, durable if the OS remains running and only +Postgres crashes.* + +*(2) Unless switched to local mode (if allowed) by setting +`synchronous_replication_availability` to `async'`, otherwise the +values for the asynchronous BDR default apply.* + +*(3) Consider using Group Commit instead.* + +Reception ensures the peer operating normally can +eventually apply the transaction without requiring any further +communication, even in the face of a full or partial network +outage. A crash of a peer node might still require retransmission of +the transaction, as this confirmation doesn't involve persistent +storage. All modes considered synchronous provide this protection. + +Visibility implies the transaction was applied remotely. All other +clients see the results of the transaction on all nodes, providing +this guarantee immediately after the commit is confirmed by the origin +node. Without visibility, other clients connected might not see the +results of the transaction and experience stale reads. 
+
+Durability relates to the peer node's storage and provides protection
+against loss of data after a crash and recovery of the peer node.
+This can either relate to the reception of the data (as with physical
+streaming replication) or to visibility (as with Group Commit).
+The former eliminates the need for retransmissions after
+a crash, while the latter ensures visibility is maintained across
+restarts.
+
+## Internal timing of operations
+
+For a better understanding of how the different modes work, it's
+helpful to realize PSR and BDR apply transactions
+differently.
+
+With physical streaming replication, the order of operations is:
+
+- Origin flushes a commit record to WAL, making the transaction
+  visible locally.
+- Peer node receives changes and issues a write.
+- Peer flushes the received changes to disk.
+- Peer applies changes, making the transaction visible locally.
+
+With PGD, the order of operations is different:
+
+- Origin flushes a commit record to WAL, making the transaction
+  visible locally.
+- Peer node receives changes into its apply queue in memory.
+- Peer applies changes, making the transaction visible locally.
+- Peer persists the transaction by flushing to disk.
+
+For Group Commit, CAMO, and Eager, the origin node waits for
+a certain number of confirmations prior to making the transaction
+visible locally. The order of operations is:
+
+- Origin flushes a prepare or precommit record to WAL.
+- Peer node receives changes into its apply queue in memory.
+- Peer applies changes, making the transaction visible locally.
+- Peer persists the transaction by flushing to disk.
+- Origin commits and makes the transaction visible locally.
+
+The following table summarizes the differences.
+
+| Variant          | Order of apply vs persist | Replication before or after commit |
+|:-----------------|:-------------------------:|:----------------------------------:|
+| PSR              | persist first             | after WAL flush of commit record   |
+| PGD Async        | apply first               | after WAL flush of commit record   |
+| PGD Lag Control  | apply first               | after WAL flush of commit record   |
+| PGD Group Commit | apply first               | before COMMIT on origin            |
+| PGD CAMO         | apply first               | before COMMIT on origin            |
+
+## Configuration
+
+Configuring Commit Scopes is done through SQL functions, just like other
+administration operations in PGD.
+
+For example, you might define a basic Commit Scope that does Group Commit on a
+majority of nodes in the `example_group` BDR group:
+
+```sql
+SELECT bdr.add_commit_scope(
+    commit_scope_name := 'example_scope',
+    origin_node_group := 'example_group',
+    rule := 'MAJORITY (example_group)',
+    wait_for_ready := true
+);
+```
+
+You can then use the commit scope by setting the configuration variable (GUC)
+`bdr.commit_scope` to that commit scope, either per transaction or globally.
+
+```sql
+BEGIN;
+SET LOCAL bdr.commit_scope = 'example_scope';
+...
+COMMIT;
+```
+
+You can also set the default commit scope for a given BDR group.
+
+```sql
+SELECT bdr.alter_node_group_option(
+    node_group_name := 'example_group',
+    config_key := 'default_commit_scope',
+    config_value := 'example_scope'
+);
+```
+
+Note that `default_commit_scope` is checked in the group tree that the given
+origin node belongs to, from the bottom up. The `default_commit_scope` can't be
+set to the special value `local`, which means no commit scope. To select no
+commit scope for a transaction, use the `bdr.commit_scope` configuration
+parameter instead.
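+
+For example, to opt a single transaction out of the group default and fall back
+to the default asynchronous behavior, a session can set the scope to `local` for
+just that transaction. This is a minimal sketch using the `bdr.commit_scope`
+parameter described above:
+
+```sql
+BEGIN;
+-- 'local' means no commit scope applies to this transaction
+SET LOCAL bdr.commit_scope = 'local';
+...
+COMMIT;
+```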
+
+Full details of the Commit Scope language with all the options are described
+in the [Commit Scopes](commit-scopes) chapter.
+
+### Postgres configuration parameters
+
+The following table provides an overview of the configuration
+settings that must be set to a non-default value (req) or are
+optional (opt) but affect a specific variant.
+
+| Setting (GUC)                        | Group Commit | Lag Control | PSR (1) | Legacy Sync |
+|--------------------------------------|:------------:|:-----------:|:-------:|:-----------:|
+| synchronous_standby_names            | n/a          | n/a         | req     | req         |
+| synchronous_commit                   | n/a          | n/a         | opt     | opt         |
+| synchronous_replication_availability | n/a          | n/a         | opt     | opt         |
+| bdr.commit_scope                     | opt          | opt         | n/a     | n/a         |
+
+## Planned shutdown and restarts
+
+When using Group Commit with receive confirmations, take care
+with planned shutdown or restart. By default, the apply queue is consumed
+prior to shutting down. However, in the `immediate` shutdown mode, the queue
+is discarded at shutdown, leading to the stopped node "forgetting"
+transactions in the queue. A concurrent failure of the origin node can
+lead to loss of data, as if both nodes failed.
+
+To ensure the apply queue gets flushed to disk, use either
+`smart` or `fast` shutdown for maintenance tasks. This approach maintains the
+required synchronization level and prevents loss of data.
+
+## Legacy synchronous replication using PGD
+
+!!! Note
+    Consider using [Group Commit](group-commit) instead.
+
+### Usage
+
+To enable synchronous replication using BDR, you need to add the application
+name of the relevant BDR peer nodes to
+`synchronous_standby_names`. The use of `FIRST x` or `ANY x` offers
+some flexibility if this doesn't conflict with the requirements of
+non-BDR standby nodes.
+
+Once you've added it, you can configure the level of synchronization per
+transaction using `synchronous_commit`, which defaults to `on`. This setting means that
+adding to `synchronous_standby_names` already enables synchronous
+replication. Setting `synchronous_commit` to `local` or `off` turns
+off synchronous replication.
+
+Due to BDR applying the transaction before persisting it, the
+values `on` and `remote_apply` are equivalent (for logical
+replication).
+
+### Migration to Commit Scopes
+
+The Group Commit feature of BDR is configured independently of
+`synchronous_commit` and `synchronous_standby_names`. Instead, the
+`bdr.commit_scope` GUC allows you to select the scope per transaction. And
+instead of `synchronous_standby_names` configured on each node
+individually, Group Commit uses globally synchronized Commit Scopes.
+
+!!! Note
+    While the grammar for `synchronous_standby_names` and Commit
+    Scopes looks similar, the former
+    doesn't account for the origin node, but the latter does.
+    Therefore, for example, `synchronous_standby_names = 'ANY 1 (..)'`
+    is equivalent to a Commit Scope of `ANY 2 (...)`. This choice
+    makes reasoning about majority easier and reflects that the origin
+    node also contributes to the durability and visibility of the
+    transaction.
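+
+As a migration sketch (the node names, group name, and scope name are
+illustrative, and the rule style follows the examples in
+[Commit Scopes](commit-scopes)), a legacy setting that waits for one of two
+standbys roughly corresponds to a commit scope that counts the origin as one of
+the required nodes:
+
+```sql
+-- Legacy synchronous replication: wait for any one of two standbys.
+-- ALTER SYSTEM SET synchronous_standby_names = 'ANY 1 (node2, node3)';
+
+-- Commit Scope equivalent: the origin also counts, so use ANY 2.
+SELECT bdr.add_commit_scope(
+    commit_scope_name := 'migrated_scope',
+    origin_node_group := 'example_group',
+    rule := 'ANY 2 (example_group)',
+    wait_for_ready := true
+);
+```
+
+You can then select `migrated_scope` per transaction or make it the group
+default, as shown earlier in this chapter.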
diff --git a/product_docs/docs/pgd/5/durability/lag-control.mdx b/product_docs/docs/pgd/5/durability/lag-control.mdx new file mode 100644 index 00000000000..a5e95400a8d --- /dev/null +++ b/product_docs/docs/pgd/5/durability/lag-control.mdx @@ -0,0 +1,193 @@ +--- +title: Lag control +redirects: + - /pgd/latest/bdr/lag-control/ +--- + +Data throughput of database applications on a BDR origin node can +exceed the rate at which committed data can be safely replicated to +downstream peer nodes. If this disparity persists beyond a period of +time or chronically in high availability applications, then +organizational objectives related to disaster recovery or business +continuity plans might not be satisfied. + +The replication lag control (RLC) feature is designed to regulate this +imbalance using a dynamic rate-limiting device so that data flow between +BDR group nodes complies with these organizational objectives. It does so +by controlling the extent of replication lag between BDR nodes. + +Some of these objectives include the following: + +- Recovery point objective (RPO) specifies the maximum tolerated + amount of data that can be lost due to unplanned events, usually + expressed as an amount of time. In non-replicated systems, RPO + is used to set backup intervals to limit the risk of lost committed + data due to failure. For replicated systems, RPO determines the + acceptable amount of committed data that hasn't been safely applied + to one or more peer nodes. + +- Resource constraint objective (RCO) acknowledges that there are finite + storage constraints. This storage includes database files, WAL, and + temporary or intermediate files needed for continued operation. + For replicated systems, as lag increases the demands on these storage + resources also increase. + +- Group elasticity objective (GEO) ensures that any node isn't + originating new data at a clip that can't be acceptably saved to + its peer nodes. When that is the case then the detection of that + condition can be used as one metric in the decision to expand the + number of database nodes. Similarly, when that condition abates then + it might influence the decision to contract the number of database nodes. + +Lag control manages replication lag by controlling the rate at which client +connections can commit READ WRITE transactions. Replication lag is +measured either as lag time or lag size, depending on the objectives +to meet. Transaction commit rate is regulated using a configured +BDR commit-delay time. + +## Requirements + +To get started using lag control: + +- Determine the maximum acceptable commit delay time `max_commit_delay` that can be tolerated for all database applications. + +- Decide on the lag measure to use. Choose either lag size `max_lag_size` or lag time `max_lag_time`. + +- Decide on the groups or subgroups involved and the minimum number of nodes in each collection required to satisfy confirmation. This will form the basis for the definition of a commit scope rule. + +## Configuration + +Lag control is specified within a commit scope, which allows consistent and coordinated parameter settings across the nodes spanned by the commmit scope rule. A Lag control specification can be included in the default commit scope of a top group or part of an Origin group commit scope. + +Using the sample node groups from the [Commit Scope](commit-scopes) chapter, this example shows lag control rules for two datacenters. 
+ +```sql +-- create a Lag control commit scope with individual rules +-- for each sub-group +SELECT bdr.add_commit_scope( + commit_scope_name := 'example_scope', + origin_node_group := 'left_dc', + rule := 'ALL (left_dc) AND ANY 1 (right_dc) LAG CONTROL (max_commit_delay=500ms, max_lag_time=30s)', + wait_for_ready := true +); +SELECT bdr.add_commit_scope( + commit_scope_name := 'example_scope', + origin_node_group := 'right_dc', + rule := 'ANY 1 (left_dc) AND ALL (right_dc) LAG CONTROL (max_commit_delay=0.250ms, max_lag_size=100MB)', + wait_for_ready := true +); +``` + +Note the parameter values admit unit specification that is compatible with GUC parameter conventions. + +A Lag control commit scope rule can be added to existings commit scope rules that also include Group Commit and CAMO rule specifications. + +`max_commit_delay` parameter permits and encourages a specification of milliseconds with a fractional part, including a sub-millisecond setting if appropriate. + +## Overview + +Lag control is a dynamic TPS rate-limiting mechanism that operates at the client +connection level. It's designed to be as unobtrusive as possible while +satisfying configured lag-control constraints. This means that if enough +BDR nodes can replicate changes fast enough to remain below configured +lag measure thresholds, then the BDR runtime commit delay stays fixed +at 0 milliseconds. + +If this isn't the case, minimally +adjust the BDR runtime commit delay as high as needed, but no higher, until the number of +conforming nodes returns to the minimum threshold. + +Even after the minimum node threshold is reached, lag control continues to attempt +to drive the BDR runtime commit delay back to zero. The BDR commit delay +might rise and fall around an equilibrium level most of the time, but if +data throughput or lag-apply rates improve then the commit delay +decreases over time. + +The BDR commit delay is a post-commit delay. It occurs after the transaction +has committed and after all Postgres resources locked or acquired by the +transaction are released. Therefore, the delay doesn't prevent +concurrent active transactions from observing or modifying its values or +acquiring its resources. The same guarantee can't be made for external +resources managed by Postgres extensions. Regardless of extension +dependencies, the same guarantee can be made if the BDR extension is listed +before extension-based resource managers in `postgresql.conf`. + +Strictly speaking, the BDR commit delay is not a per-transaction delay. +It is the mean value of commit delays over a stream of transactions for a +particular client connection. This technique allows the commit delay and +fine-grained adjustments of the value to escape the coarse granularity of +OS schedulers, clock interrupts, and variation due to system load. It also +allows the BDR runtime commit delay to settle within microseconds of the +lowest duration possible to maintain a lag measure threshold. + +!!! Note + Don't conflate the BDR commit delay with the Postgres + commit delay. They are unrelated and perform different functions. Don't + substitute one for the other. + +## Transaction application + +The BDR commit delay is applied to all READ WRITE transactions that +modify data for user applications. This implies that any transaction +that doesn't modify data, including declared READ WRITE transactions, +is exempt from the commit delay. + +Asynchronous transaction commit also executes a BDR commit delay. 
This +might appear counterintuitive, but asynchronous commit, by virtue of its +performance, can be one of the greatest sources of replication lag. + +Postgres and BDR auxillary processes don't delay at transaction commit. +Most notably, BDR writers don't execute a commit delay when applying +remote transactions on the local node. This is by design as BDR writers +contribute nothing to outgoing replication lag and can reduce incoming +replication lag the most by not having their transaction commits throttled +by a delay. + +## Limitations + +The maximum commit delay is a ceiling value representing a hard limit. +This means that a commit delay never exceeds the configured value. +Conversely, the maximum lag measures both by size and time and +are soft limits that can be exceeded. When the maximum commit delay is reached, +there's no additional back pressure on the lag measures to prevent their +continued increase. + +There's no way to exempt origin transactions that don't modify BDR +replication sets from the commit delay. For these transactions, it +can be useful to SET LOCAL the maximum transaction delay to 0. + +## Caveats + +Application TPS is one of many factors that can affect replication lag. +Other factors include the average size of transactions for which BDR commit +delay can be less effective. In particular, bulk load operations can +cause replication lag to rise, which can trigger a concomitant rise in +the BDR runtime commit delay beyond the level reasonably expected by normal +applications, although still under the maximum allowed delay. + +Similarly, an application with a very high OLTP requirement and modest +data changes can be unduly restrained by the acceptable BDR commit delay +setting. + +In these cases, it can be useful to use the `SET [SESSION|LOCAL]` command to +custom configure lag control settings for those applications or modify +those applications. For example, bulk load operations are sometimes split +into multiple, smaller transactions to limit transaction snapshot duration +and WAL retention size or establish a restart point if the bulk load fails. +In deference to lag control, those transaction commits can also schedule very +long BDR commit delays to allow digestion of the lag contributed by the +prior partial bulk load. + +## Meeting organizational objectives + +In the example objectives list earlier: + +- RPO can be met by setting an appropriate maximum lag time. +- RCO can be met by setting an appropriate maximum lag size. +- GEO can be met by monitoring the BDR runtime commit delay + and the BDR runtime lag measures, + +As mentioned, when the maximum BDR runtime commit delay is +pegged at the BDR configured commit-delay limit and the lag +measures consistently exceed their BDR-configured maximum +levels, this scenario can be a marker for BDR group expansion. diff --git a/product_docs/docs/pgd/5/functions.mdx b/product_docs/docs/pgd/5/functions.mdx new file mode 100644 index 00000000000..5d246a4a302 --- /dev/null +++ b/product_docs/docs/pgd/5/functions.mdx @@ -0,0 +1,1452 @@ +--- +navTitle: System functions +title: System functions +redirects: + - bdr/functions +--- + +Perform BDR management primarily by using functions you call from SQL. +All functions in BDR are exposed in the `bdr` schema. Schema qualify any calls to these +functions instead of putting `bdr` in the +`search_path`. + +## Version information functions + +### bdr.bdr_version + +This function retrieves the textual representation of the BDR version +currently in use. 
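+
+#### Example
+
+A minimal usage sketch. The exact version string returned depends on the
+installed BDR release:
+
+```sql
+SELECT bdr.bdr_version();
+```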
+ +### bdr.bdr_version_num + +This function retrieves the BDR version number that is +currently in use. Version numbers are monotonically increasing, allowing this +value to be used for less-than and greater-than comparisons. + +The following formula returns the version number consisting of +major version, minor version, and patch release into a single numerical +value: + +``` +MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_RELEASE +``` + +## System information functions + +### bdr.get_relation_stats + +Returns the relation information. + +### bdr.get_subscription_stats + +Returns the current subscription statistics. + +## System and progress information parameters + +BDR exposes some parameters that you can query using `SHOW` in `psql` +or using `PQparameterStatus` (or equivalent) from a client +application. + +### bdr.local_node_id + +When you initialize a session, this is set to the node id the client is +connected to. This allows an application to figure out the node it's +connected to, even behind a transparent proxy. + +It's also used with [Connection pools and proxies](durability/camo#connection-pools-and-proxies). + +### bdr.last_committed_lsn + +After every `COMMIT` of an asynchronous transaction, this parameter is updated to +point to the end of the commit record on the origin node. Combining it with `bdr.wait_for_apply_queue`, +allows applications +to perform causal reads across multiple nodes, that is, to wait until a transaction +becomes remotely visible. + +### transaction_id + +As soon as Postgres assigns a transaction id, if CAMO is enabled, this parameter is +updated to show the transaction id just assigned. + +### bdr.is_node_connected + +#### Synopsis + +```sql +bdr.is_node_connected(node_name name) +``` + +Returns boolean by checking if the walsender for a given peer is active +on this node. + +### bdr.is_node_ready + +#### Synopsis + +```sql +bdr.is_node_ready(node_name name, span interval DEFAULT NULL) +``` + +Returns boolean by checking if the lag is lower than the given span or +lower than the `timeout` for `TO ASYNC` otherwise. + +## Consensus function + +### bdr.consensus_disable + +Disables the consensus worker on the local node until server restart or until +it's reenabled using `bdr.consensus_enable` (whichever happens first). + +!!! Warning + Disabling consensus disables some features of BDR and + affects availability of the EDB Postgres Distributed cluster if left disabled for a + long time. Use this function only when working with + Technical Support. + +### bdr.consensus_enable + +Reenabled disabled consensus worker on local node. + +### bdr.consensus_proto_version + +Returns currently used consensus protocol version by the local node. + +Needed by the BDR group reconfiguration internal mechanisms. + +### bdr.consensus_snapshot_export + +#### Synopsis + +```sql +bdr.consensus_snapshot_export(version integer DEFAULT NULL) +``` + +Generate a new BDR consensus snapshot from the currently committed-and-applied +state of the local node and return it as bytea. + +By default, a snapshot for the highest supported Raft version is +exported. But you can override that by passing an explicit `version` +number. + +The exporting node doesn't have to be the current Raft leader, and it doesn't +need to be completely up to date with the latest state on the leader. However, `bdr.consensus_snapshot_import()` +might not accept such a snapshot. + +The new snapshot isn't automatically stored to the local node's +`bdr.local_consensus_snapshot` table. 
It's only returned to the caller. + +The generated snapshot might be passed to `bdr.consensus_snapshot_import()` on +any other nodes in the same BDR node group that's behind the exporting node's +Raft log position. + +The local BDR consensus worker must be disabled for this function to work. +Typical usage is: + +``` + SELECT bdr.bdr_consensus_disable(); + \copy (SELECT * FROM bdr.consensus_snapshot_export()) TO 'my_node_consensus_snapshot.data' + SELECT bdr.bdr_consensus_enable(); +``` + +While the BDR consensus worker is disabled: +- DDL locking attempts on the node fail or time out. +- galloc sequences don't get new values. +- Eager and CAMO transactions pause or error. +- Other functionality that needs the distributed consensus system is disrupted. + The required downtime is generally very brief. + +Depending on the use case, it might be practical to extract a snapshot that +already exists from the `snapshot` field of the `bdr.local_consensus_snapshot` +table and use that instead. Doing so doesn't require you to stop the consensus worker. + +### bdr.consensus_snapshot_import + +#### Synopsis + +```sql +bdr.consensus_snapshot_import(IN snapshot bytea) +``` + +Import a consensus snapshot that was exported by +`bdr.consensus_snapshot_export()`, usually from another node in the same BDR +node group. + +It's also possible to use a snapshot extracted directly from the `snapshot` +field of the `bdr.local_consensus_snapshot` table on another node. + +This function is useful for resetting a BDR node's catalog state to a known good +state in case of corruption or user error. + +You can import the snapshot if the importing node's `apply_index` is less than +or equal to the snapshot-exporting node's `commit_index` when the +snapshot was generated. (See `bdr.get_raft_status()`.) A node that can't accept +the snapshot because its log is already too far ahead raises an error +and makes no changes. The imported snapshot doesn't have to be completely +up to date, as once the snapshot is imported the node fetches the remaining +changes from the current leader. + +The BDR consensus worker must be disabled on the importing node for this +function to work. See notes on `bdr.consensus_snapshot_export()` for details. + +It's possible to use this function to force the local node to generate a new Raft +snapshot by running: + +``` +SELECT bdr.consensus_snapshot_import(bdr.consensus_snapshot_export()); +``` + +This approach might also truncate the Raft logs up to the current +applied log position. + +### bdr.consensus_snapshot_verify + +#### Synopsis + +```sql +bdr.consensus_snapshot_verify(IN snapshot bytea) +``` + +Verify the given consensus snapshot that was exported by +`bdr.consensus_snapshot_export()`. The snapshot header contains the +version with which it was generated and the node tries to verify it +against the same version. + +The snapshot might have been exported on the same node or any other node +in the cluster. If the node verifying the snapshot doesn't support the +version of the exported snapshot, then an error is raised. + +### bdr.get_consensus_status + +Returns status information about the current consensus (Raft) worker. + +### bdr.get_raft_status + +Returns status information about the current consensus (Raft) worker. +Alias for `bdr.get_consensus_status`. + +### bdr.raft_leadership_transfer + +#### Synopsis + +```sql +bdr.raft_leadership_transfer(IN node_name text, IN wait_for_completion boolean) +``` + +Request the node identified by `node_name` to be the Raft leader. 
The +request can be initiated from any of the BDR nodes and is +internally forwarded to the current leader to transfer the leadership to +the designated node. The designated node must be an ACTIVE BDR node +with full voting rights. + +If `wait_for_completion` is false, the request is served on +a best-effort basis. If the node can't become a leader in the +`bdr.raft_election_timeout` period, then some other capable node +becomes the leader again. Also, the leadership can change over the +period of time per Raft protocol. A `true` return result indicates +only that the request was submitted successfully. + +If `wait_for_completion` is `true`, then the function waits until +the given node becomes the new leader and possibly waits infinitely if +the requested node fails to become Raft leader (for example, due to network +issues). We therefore recommend that you always set a `statement_timeout` +with `wait_for_completion` to prevent an infinite loop. + +## Utility functions + +### bdr.wait_slot_confirm_lsn + +Allows you to wait until the last write on this session was replayed +to one or all nodes. + +Waits until a slot passes a certain LSN. If no position is supplied, the +current write position is used on the local node. + +If no slot name is passed, it waits until all BDR slots pass the LSN. + +The function polls every 1000 ms for changes from other nodes. + +If a slot is dropped concurrently, the wait ends for that slot. +If a node is currently down and isn't updating its slot, then the wait continues. +You might want to set `statement_timeout` to complete earlier in that case. + +#### Synopsis + +```sql +bdr.wait_slot_confirm_lsn(slot_name text DEFAULT NULL, target_lsn pg_lsn DEFAULT NULL) +``` + +#### Parameters + +- `slot_name` — Name of replication slot or, if NULL, all BDR slots (only). +- `target_lsn` — LSN to wait for or, if NULL, use the current write LSN on the + local node. + +### bdr.wait_for_apply_queue + +The function `bdr.wait_for_apply_queue` allows a BDR node to wait for +the local application of certain transactions originating from a given +BDR node. It returns only after all transactions from that peer +node are applied locally. An application or a proxy can use this +function to prevent stale reads. + +For convenience, BDR provides a variant of this function for +CAMO and the CAMO partner node. See +[bdr.wait_for_camo_partner_queue](durability/camo#wait-for-consumption-of-the-apply-queue-from-the-camo-partner). + +In case a specific LSN is given, that's the point in the recovery +stream from which the peer waits. You can use this +with `bdr.last_committed_lsn` retrieved from that peer node on a +previous or concurrent connection. + +If the given `target_lsn` is NULL, this function checks the local +receive buffer and uses the LSN of the last transaction received from +the given peer node, effectively waiting for all transactions already +received to be applied. This is especially useful in case the peer +node has failed and it's not known which transactions were sent. +In this case, transactions that are still in transit or +buffered on the sender side aren't waited for. + +#### Synopsis + +```sql +bdr.wait_for_apply_queue(peer_node_name TEXT, target_lsn pg_lsn) +``` + +#### Parameters + +- `peer_node_name` — The name of the peer node from which incoming + transactions are expected to be queued and to wait + for. If NULL, waits for all peer node's apply queue to be consumed. 
+- `target_lsn` — The LSN in the replication stream from the peer node + to wait for, usually learned by way of `bdr.last_committed_lsn` from the + peer node. + +### bdr.get_node_sub_receive_lsn + +You can use this function on a subscriber to get the last LSN that was +received from the given origin. It can be either unfiltered or filtered to take into +account only relevant LSN increments for transactions to be applied. + +The difference between the output of this function and the output of +`bdr.get_node_sub_apply_lsn()` measures the size of the corresponding +apply queue. + +#### Synopsis + +```sql +bdr.get_node_sub_receive_lsn(node_name name, committed bool default true) +``` + +#### Parameters + +- `node_name` — The name of the node that's the source of the + replication stream whose LSN is being retrieved. +- `committed` —; The default (true) makes this function take into + account only commits of transactions received rather than the last + LSN overall. This includes actions that have no effect on the subscriber + node. + +### bdr.get_node_sub_apply_lsn + +You can use this function on a subscriber to get the last LSN that was +received and applied from the given origin. + +#### Synopsis + +```sql +bdr.get_node_sub_apply_lsn(node_name name) +``` + +#### Parameters + +- `node_name` — the name of the node that's the source of the + replication stream whose LSN is being retrieved. + +### bdr.run_on_all_nodes + +Function to run a query on all nodes. + +!!! Warning + This function runs an arbitrary query on a remote node with the + privileges of the user used for the internode connections as specified in the + node's DSN. Use caution when granting privileges to this function. + +#### Synopsis + +```sql +bdr.run_on_all_nodes(query text) +``` + +#### Parameters + +- `query` — Arbitrary query to execute. + +#### Notes + +This function connects to other nodes and executes the query, returning +a result from each of them in JSON format. Multiple rows might be returned from +each node, encoded as a JSON array. Any errors, such as being unable to +connect because a node is down, are shown in the response field. +No explicit statement_timeout or other runtime parameters are set, so +defaults are used. + +This function doesn't go through normal replication. It uses direct client +connection to all known nodes. By default, the connection is created +with `bdr.ddl_replication = off`, since the commands are already being sent +to all of the nodes in the cluster. + +Be careful when using this function since you risk breaking replication +and causing inconsistencies between nodes. Use either transparent DDL +replication or `bdr.replicate_ddl_command()` to replicate DDL. +DDL might be blocked in a future release. 
+ +#### Example + +It's useful to use this function in monitoring, for example, as in the following +query: + +```sql +SELECT bdr.run_on_all_nodes($$ + SELECT local_slot_name, origin_name, target_name, replay_lag_size + FROM bdr.node_slots + WHERE origin_name IS NOT NULL +$$); +``` + +This query returns something like this on a two-node cluster: + +``` +[ + { + "dsn": "host=node1 port=5432 dbname=bdrdb user=postgres ", + "node_id": "2232128708", + "response": { + "command_status": "SELECT 1", + "command_tuples": [ + { + "origin_name": "node1", + "target_name": "node2", + "local_slot_name": "bdr_bdrdb_bdrgroup_node2", + "replay_lag_size": "0 bytes" + } + ] + }, + "node_name": "node1" + }, + { + "dsn": "host=node2 port=5432 dbname=bdrdb user=postgres ", + "node_id": "2058684375", + "response": { + "command_status": "SELECT 1", + "command_tuples": [ + { + "origin_name": "node2", + "target_name": "node1", + "local_slot_name": "bdr_bdrdb_bdrgroup_node1", + "replay_lag_size": "0 bytes" + } + ] + }, + "node_name": "node2" + } +] +``` + +### bdr.run_on_nodes + +Function to run a query on a specified list of nodes. + +!!! Warning + This function runs an arbitrary query on remote nodes with the + privileges of the user used for the internode connections as specified in the + node's DSN. Use caution when granting privileges to this function. + +#### Synopsis + +```postgresql +bdr.run_on_nodes(node_names text[], query text) +``` + +#### Parameters + +- `node_names` — Text ARRAY of node names where query is executed. +- `query` — Arbitrary query to execute. + +#### Notes + +This function connects to other nodes and executes the query, returning +a result from each of them in JSON format. Multiple rows can be returned from +each node, encoded as a JSON array. Any errors, such as being unable to +connect because a node is down, are shown in the response field. +No explicit statement_timeout or other runtime parameters are set, so +defaults are used. + +This function doesn't go through normal replication. It uses direct client +connection to all known nodes. By default, the connection is created +with `bdr.ddl_replication = off`, since the commands are already being sent +to all of the nodes in the cluster. + +Be careful when using this function since you risk breaking replication +and causing inconsistencies between nodes. Use either transparent DDL +replication or `bdr.replicate_ddl_command()` to replicate DDL. +DDL might be blocked in a future release. + +### bdr.run_on_group + +Function to run a query on a group of nodes. + +!!! Warning + This function runs an arbitrary query on remote nodes with the + privileges of the user used for the internode connections as specified in the + node's DSN. Use caution when granting privileges to this function. + +#### Synopsis + +```postgresql +bdr.run_on_group(node_group_name text, query text) +``` + +#### Parameters + +- `node_group_name` — Name of node group where query is executed. +- `query` — Arbitrary query to execute. + +#### Notes + +This function connects to other nodes and executes the query, returning +a result from each of them in JSON format. Multiple rows can be returned from +each node, encoded as a JSON array. Any errors, such as being unable to +connect because a node is down, are shown in the response field. +No explicit statement_timeout or other runtime parameters are set, so +defaults are used. + +This function doesn't go through normal replication. It uses direct client +connection to all known nodes. 
By default, the connection is created +with `bdr.ddl_replication = off`, since the commands are already being sent +to all of the nodes in the cluster. + +Be careful when using this function since you risk breaking replication +and causing inconsistencies between nodes. Use either transparent DDL +replication or `bdr.replicate_ddl_command()` to replicate DDL. +DDL might be blocked in a future release. + +### bdr.global_lock_table + +This function acquires a global DML locks on a given table. +See [DDL locking details](ddl#ddl-locking-details) for information +about global DML lock. + +#### Synopsis + +```sql +bdr.global_lock_table(relation regclass) +``` + +#### Parameters + +- `relation` — Name or oid of the relation to lock. + +#### Notes + +This function acquires the global DML lock independently of the +`ddl_locking` setting. + +The `bdr.global_lock_table` function requires `UPDATE`, `DELETE`, or `TRUNCATE` +privilege on the locked `relation` unless `bdr.backwards_compatibility` is +set to 30618 or lower. + +### bdr.wait_for_xid_progress + +You can use this function to wait for the given transaction (identified +by its XID) originated at the given node (identified by its node id) +to make enough progress on the cluster. The progress is defined as the +transaction being applied on a node and this node having seen all +other replication changes done before the transaction is applied. + +#### Synopsis + +```sql +bdr.wait_for_xid_progress(origin_node_id oid, origin_topxid int4, allnodes boolean DEFAULT true) +``` + +#### Parameters + +- `origin_node_id` — Node id of the node where the transaction + originated. + +- `origin_topxid` — XID of the transaction. + +- `allnodes` — If `true` then wait for the transaction to progress on + all nodes. Otherwise wait only for the current node. + +#### Notes + +You can use the function only for those transactions that +replicated a DDL command because only those transactions are tracked +currently. If a wrong `origin_node_id` or `origin_topxid` is supplied, +the function might wait forever or until `statement_timeout` occurs. + +### bdr.local_group_slot_name + +Returns the name of the group slot on the local node. + +#### Example + +```sql +bdrdb=# SELECT bdr.local_group_slot_name(); + local_group_slot_name +----------------------- + bdr_bdrdb_bdrgroup +``` + +### bdr.node_group_type + +Returns the type of the given node group. Returned value is the same as what +was passed to `bdr.create_node_group()` when the node group was created, +except `normal` is returned if the `node_group_type` was passed as NULL +when the group was created. + +#### Example + +```sql +bdrdb=# SELECT bdr.node_group_type('bdrgroup'); + node_group_type +----------------- + normal +``` + +## Global advisory locks + +BDR supports global advisory locks. These locks are similar to +the advisory locks available in PostgreSQL except that the +advisory locks supported by BDR are global. They follow semantics +similar to DDL locks. So an advisory lock is obtained by majority consensus and +can be used even if one or more nodes are down or lagging behind, as long +as a majority of all nodes can work together. + +Currently only EXCLUSIVE locks are supported. So if another node or another +backend on the same node has already acquired the advisory lock on the object, +then other nodes or backends must wait for the lock to be released. + +Advisory lock is transactional in nature. 
So the lock is automatically released +when the transaction ends unless it's explicitly released before the end of +the transaction. In this case, it becomes available as soon as it's released. +Session-level advisory locks aren't currently supported. + +Global advisory locks are reentrant. So if the same resource is locked three +times, you must then unlock it three times for it to be released for use in other sessions. + +### bdr.global_advisory_lock + +This function acquires an EXCLUSIVE lock on the provided object. If the lock isn't +available, then it waits until the lock becomes available or the +`bdr.global_lock_timeout` is reached. + +#### Synopsis + +```sql +bdr.global_advisory_lock(key bigint) +``` + +#### parameters + +- `key` — The object on which an advisory lock is acquired. + +#### Synopsis + +```sql +bdr.global_advisory_lock(key1 integer, key2 integer) +``` + +#### parameters + +- `key1` — First part of the composite key. +- `key2` — second part of the composite key. + +### bdr.global_advisory_unlock + +This function releases a previously acquired lock on the application-defined +source. The lock must have been obtained in the same transaction by +the application. Otherwise, an error is raised. + +#### Synopsis + +```sql +bdr.global_advisory_unlock(key bigint) +``` + +#### Parameters + +- `key` — The object on which an advisory lock is acquired. + +#### Synopsis + +```sql +bdr.global_advisory_unlock(key1 integer, key2 integer) +``` + +#### Parameters + +- `key1` — First part of the composite key. +- `key2` — Second part of the composite key. + +## Monitoring functions + +### bdr.monitor_group_versions + +To provide a cluster-wide version check, this function uses +BDR version information returned from the view +`bdr.group_version_details`. + +#### Synopsis + +```sql +bdr.monitor_group_versions() +``` + +#### Notes + +This function returns a record with fields `status` and `message`, +as explained in [Monitoring](../monitoring/#monitoring-bdr-versions). + +This function calls `bdr.run_on_all_nodes()`. + +### bdr.monitor_group_raft + +To provide a cluster-wide Raft check, this function uses +BDR Raft information returned from the view +`bdr.group_raft_details`. + +#### Synopsis + +```sql +bdr.monitor_group_raft() +``` + +#### Parameters + +- `node_group_name` — the node group name that we want to check. + + +#### Notes + +This function returns a record with fields `status` and `message`, +as explained in [Monitoring](../monitoring/#monitoring-raft-consensus). + +This function calls `bdr.run_on_all_nodes()`. + +### bdr.monitor_local_replslots + +This function uses replication slot status information returned from the +view `pg_replication_slots` (slot active or inactive) to provide a +local check considering all replication slots except the BDR group +slots. + +#### Synopsis + +```sql +bdr.monitor_local_replslots() +``` + +#### Notes + +This function returns a record with fields `status` and `message`, +as explained in [Monitoring replication slots](monitoring/#monitoring-replication-slots). + +### bdr.wal_sender_stats + +If the [decoding worker](nodes#decoding-worker) is enabled, this +function shows information about the decoder slot and current LCR +(logical change record) segment file being read by each WAL sender. + +#### Synopsis + +```sql +bdr.wal_sender_stats() +``` + +#### Output columns + +- `pid` — PID of the WAL sender (corresponds to `pg_stat_replication`'s `pid` column). + +- `is_using_lcr` — Whether the WAL sender is sending LCR files. 
The next columns are `NULL` if `is_using_lcr` is `FALSE`. + +- `decoder_slot_name` — The name of the decoder replication slot. + +- `lcr_file_name` — The name of the current LCR file. + + +### bdr.get_decoding_worker_stat + +If the [decoding worker](nodes#decoding-worker) is enabled, this function +shows information about the state of the decoding worker associated with the +current database. This also provides more granular information about decoding +worker progress than is available via `pg_replication_slots`. + +#### Synopsis + +```sql +bdr.get_decoding_worker_stat() +``` + +#### Output columns + +- `pid` — The PID of the decoding worker (corresponds to the column `active_pid` in `pg_replication_slots`). + +- `decoded_upto_lsn` — LSN up to which the decoding worker read transactional logs. + +- `waiting` — Whether the decoding worker is waiting for new WAL. + +- `waiting_for_lsn` — The LSN of the next expected WAL. + +#### Notes + +For further details, see [Monitoring WAL senders using LCR](monitoring/#monitoring-wal-senders-using-lcr). + +### bdr.lag_control + +If [lag control](durability/lag-control#configuration) is enabled, this function +shows information about the commit delay and number of nodes conforming +to their configured lag measure for the local node and current database. + +#### Synopsis + +```sql +bdr.lag_control() +``` + +#### Output columns + +- `commit_delay` — Current runtime commit delay, in fractional milliseconds. + +- `commit_delay_maximum` — Configured maximum commit delay, in fractional milliseconds. + +- `commit_delay_adjustment` — Change to runtime commit delay possible during + a sample interval, in fractional milliseconds. + +- `conforming_nodes` — Current runtime number of nodes conforming to lag measures. + +- `conforming_nodes_minimum` — Configured minimum number of nodes required to + conform to lag measures, below which a commit delay adjustment is applied. + +- `lag_bytes_threshold` — Lag size at which a commit delay is applied, in kilobytes. + +- `lag_bytes_maximum` — Configured maximum lag size, in kilobytes. + +- `lag_time_threshold` — Lag time at which a commit delay is applied, in milliseconds. + +- `lag_time_maximum` — Configured maximum lag time, in milliseconds. + +- `sample_interval` — Configured minimum time between lag samples and possible + commit delay adjustments, in milliseconds. + + +## Internal functions + +### BDR message payload functions + +`bdr.decode_message_response_payload` and `bdr.decode_message_payload` + +These functions decode the consensus payloads to a more human-readable output. + +Used primarily by the `bdr.global_consensus_journal_details` debug view. + +### bdr.get_global_locks + +This function shows information about global locks held on the local node. + +Used to implement the `bdr.global_locks` view to provide a more detailed +overview of the locks. + +### bdr.get_slot_flush_timestamp + +Retrieves the timestamp of the last flush position confirmation for a given +replication slot. + +Used internally to implement the `bdr.node_slots` view. + +### BDR internal function replication functions + +`bdr.internal_alter_sequence_set_kind`, `internal_replication_set_add_table`, `internal_replication_set_remove_table` + +Functions used internally for replication of the various function calls. + +No longer used by the current version of BDR. Exists only for backward +compatibility during rolling upgrades. + +### bdr.internal_submit_join_request + +Submits a consensus request for joining a new node. 
+ +Needed by the BDR group reconfiguration internal mechanisms. + +### bdr.isolation_test_session_is_blocked + +A helper function, extending (and actually invoking) the original +`pg_isolation_test_session_is_blocked` with an added check for blocks +on global locks. + +Used for isolation/concurrency tests. + +### bdr.local_node_info + +This function displays information for the local node, needed by the BDR group +reconfiguration internal mechanisms. + +The view `bdr.local_node_summary` provides similar information useful for +user consumption. + +### bdr.msgb_connect + +Function for connecting to the connection pooler of another node, +used by the consensus protocol. + +### bdr.msgb_deliver_message + +Function for sending messages to another node's connection pooler, +used by the consensus protocol. + +### bdr.peer_state_name + +This function transforms the node state (`node_state`) into a textual +representation and is used mainly to implement the `bdr.node_summary` view. + +### bdr.request_replay_progress_update + +Requests the immediate writing of a 'replay progress update' Raft message. +It's used mainly for test purposes but can be also used to test if the +consensus mechanism is working. + +### bdr.seq_nextval + +Internal implementation of sequence increments. + +Use this function instead of standard `nextval` in queries that +interact with [BDR global sequences](sequences/#bdr-global-sequences). + +#### Notes + +The following are also internal BDR sequence manipulation functions. +`bdr.seq_currval` and `bdr.sql_lastval` are used automatically. + +### bdr.show_subscription_status + +Retrieves information about the subscription status and is used mainly to +implement the `bdr.subscription_summary` view. + +### bdr.get_node_conflict_resolvers + +Displays a text string of all the conflict resolvers on the local node. + +### bdr.reset_subscription_stats + +Returns a Boolean result after resetting the statistics created by subscriptions, +as viewed by `bdr.stat_subscription`. + +### bdr.reset_relation_stats + +Returns a Boolean result after resetting the relation stats, +as viewed by `bdr.stat_relation`. + +### bdr.pg_xact_origin + +Returns origin id of a given transaction. + +#### Synopsis + +```sql +bdr.pg_xact_origin(xmin xid) +``` + +#### Parameters + +- `xid` — Transaction id whose origin is returned, + +### bdr.difference_fix_origin_create + +Creates a replication origin with a given name passed as an argument but adding a `bdr_` prefix. +It returns the internal id of the origin. This performs the same functionality +as `pg_replication_origin_create()`, except this requires `bdr_superuser` +rather than postgres superuser permissions. + +#### Synopsis + +### bdr.difference_fix_session_setup + +Marks the current session as replaying from the current origin. +The function uses the pre-created `bdr_local_only_origin` local +replication origin implicitly for the session. It allows replay +progress to be reported and returns void. This function performs the +same functionality as `pg_replication_origin_session_setup()` +except that this function requires bdr_superuser rather than postgres +superuser permissions. The earlier form of the function, +`bdr.difference_fix_session_setup(text)`, was deprecated and will be +removed in upcoming releases. + +#### Synopsis + +```sql +bdr.difference_fix_session_setup() +``` + +### bdr.difference_fix_session_reset + +Marks the current session as not replaying from any origin, essentially +resetting the effect of `bdr.difference_fix_session_setup()`. 
+It returns void. This function has the same functionality as +`pg_replication_origin_session_reset()` except this function requires +bdr_superuser rather than postgres superuser permissions. + +#### Synopsis + +```sql +bdr.difference_fix_session_reset() +``` + +### bdr.difference_fix_xact_set_avoid_conflict + +Marks the current transaction as replaying a transaction that +committed at LSN '0/0' and timestamp '2000-01-01'. This function has +the same functionality as +`pg_replication_origin_xact_setup('0/0', '2000-01-01')` +except this requires bdr_superuser rather than postgres superuser +permissions. + +#### Synopsis + +```sql +bdr.difference_fix_xact_set_avoid_conflict() +``` + +### bdr.resynchronize_table_from_node(node_name name, relation regclass) + +Resynchronizes the relation from a remote node. + +#### Synopsis + +```sql +bdr.resynchronize_table_from_node(node_name name, relation regclass) +``` + +#### Parameters + +- `node_name` — The node from which to copy or resync the relation data. +- `relation` — The relation to copy from the remote node. + +#### Notes + +This function acquires a global DML lock on the relation, truncates the relation +locally, and copies data into it from the remote node. + +The relation must exist on both nodes with the same name and definition. + +The following are supported: +- Resynchronizing partitioned tables with identical partition definitions +- Resynchronizing partitioned table to nonpartitioned table and vice versa +- Resynchronizing referenced tables by temporarily dropping and recreating +foreign key constraints + +After running the function on a referenced table, if the referenced column +data no longer matches the referencing column values, it throws an error. +After resynchronizing the referencing table data, rerun the function. + +Furthermore, it supports resynchronization of tables with generated columns by +computing the generated column values locally after copying the data from +remote node. + +Currently, row_filters are ignored by this function. + +The `bdr.resynchronize_table_from_node` function can be executed only by +the owner of the table, provided the owner has bdr_superuser privileges. + +### bdr.consensus_kv_store + +Stores value in the consistent KV Store. + +Returns timestamp of the value expiration time. This depends on `ttl`. If `ttl` +is `NULL`, then this returns `infinity`. If the value was deleted, it +returns `-infinity`. + +#### Synopsis + +```sql +bdr.consensus_kv_store(key text, value jsonb, + prev_value jsonb DEFAULT NULL, ttl int DEFAULT NULL) +``` + +#### Parameters + +- `key` — An arbitrary unique key to insert, update, or delete. +- `value` — JSON value to store. If NULL, any existing record is deleted. +- `prev_value` — If set, the write operation is done only if the current value + is equal to `prev_value`. +- `ttl` — Time to live of the new value, in milliseconds. + +#### Notes + +This is an internal function, mainly used by HARP. + +!!! Warning + Don't use this function in user applications. + +### bdr.consensus_kv_fetch + +Fetch value from the consistent KV Store in JSON format. + +#### Synopsis + +```sql +bdr.consensus_kv_fetch(IN key text) RETURNS jsonb +``` + +#### Parameters + +- `key` — An arbitrary key to fetch. + +#### Notes + +This is an internal function, mainly used by HARP. + +!!! Warning + Don't use this function in user applications. 
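+
+For example, putting the earlier `bdr.resynchronize_table_from_node` description into practice, a minimal call might look like the sketch below. The node and table names are hypothetical, and as noted above the caller must own the table and hold `bdr_superuser` privileges.
+
+```sql
+-- Illustrative only: copy public.mytable afresh from the peer node 'node1'.
+-- This takes a global DML lock, truncates the local copy, and recopies the data.
+SELECT bdr.resynchronize_table_from_node('node1', 'public.mytable');
+```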
+ + +### bdr.alter_subscription_skip_changes_upto + +Because logical replication can replicate across versions, doesn't replicate +global changes like roles, and can replicate selectively, sometimes the logical +replication apply process can encounter an error and stop applying changes. + +Wherever possible, fix such problems by making changes to the +target side. `CREATE` any missing table that's blocking replication, +`CREATE` a needed role, `GRANT` a necessary permission, and so on. But occasionally a +problem can't be fixed that way and it might be necessary to skip entirely over a +transaction. +Changes are skipped as entire transactions—all or nothing. To decide where to +skip to, use log output to find the commit LSN, per the example that follows, or peek +the change stream with the logical decoding functions. + +Unless a transaction made only one change, you often need to manually +apply the transaction's effects on the target side, so it's important to +save the problem transaction whenever possible, as shown in the examples that follow. + +It's possible to skip over changes without +`bdr.alter_subscription_skip_changes_upto` by using +`pg_catalog.pg_logical_slot_get_binary_changes` to skip to the LSN of interest, +so this is a convenience function. It does do a faster skip, although it +might bypass some kinds of errors in logical decoding. + +This function works only on disabled subscriptions. + +The usual sequence of steps is: + +1. Identify the problem subscription and LSN of the problem commit. +1. Disable the subscription. +1. Save a copy of the transaction using `pg_catalog.pg_logical_slot_peek_changes` on the source node, if possible. +1. `bdr.alter_subscription_skip_changes_upto` on the target node. +1. Apply repaired or equivalent changes on the target manually, if necessary. +1. Reenable the subscription. + +!!! Warning + It's easy to make problems worse when using this function. Don't + do anything unless you're certain it's the only option. + +#### Synopsis + +```sql + bdr.alter_subscription_skip_changes_upto( + subname text, + skip_upto_and_including pg_lsn + ); +``` + +#### Example + +Apply of a transaction is failing with an error, and you've determined that +lower-impact fixes such as changes on the target side can't resolve this +issue. You determine that you must skip the transaction. + +In the error logs, find the commit record LSN to skip to, as in this +example: + +``` +ERROR: XX000: CONFLICT: target_table_missing; resolver skip_if_recently_dropped returned an error: table does not exist +CONTEXT: during apply of INSERT from remote relation public.break_me in xact with commit-end lsn 0/300AC18 xid 131315 +committs 2021-02-02 15:11:03.913792+01 (action #2) (effective sess origin id=2 lsn=0/300AC18) +while consuming 'I' message from receiver for subscription bdr_regression_bdrgroup_node1_node2 (id=2667578509) +on node node2 (id=3367056606) from upstream node node1 (id=1148549230, reporiginid=2) +``` + +In this portion of log, you have the information you need: +the_target_lsn: **0/300AC18** +the_subscription: **bdr_regression_bdrgroup_node1_node2** + +Next, disable the subscription so the apply worker doesn't try to connect to the replication slot: + +```sql + SELECT bdr.alter_subscription_disable('the_subscription'); +``` + +You can't skip only parts of the transaction: it's all or nothing. So +we strongly recommend that you save a record of it by copying it out on the +provider side first, using the subscription's slot name. 
+ +```sql + \\copy (SELECT * FROM pg_catalog.pg_logical_slot_peek_changes('the_slot_name', + 'the_target_lsn', NULL, 'min_proto_version', '1', 'max_proto_version', '1', + 'startup_params_format', '1', 'proto_format', 'json')) + TO 'transaction_to_drop.csv' WITH (FORMAT csv); +``` + +This example is broken into multiple lines for readability, +but issue it in a single line. `\copy` doesn't +support multi-line commands. + +You can skip the change by changing `peek` to `get`, but +`bdr....skip_changes_upto` does a faster skip that avoids decoding +and outputting all the data: + +```sql + SELECT bdr.alter_subscription_skip_changes_upto('subscription_name', + 'the_target_lsn'); +``` + +You can apply the same changes (or repaired versions of them) +manually to the target node, using the dumped transaction contents as a guide. + +Finally, reenable the subscription: + +```sql + SELECT bdr.alter_subscription_enable('the_subscription'); +``` + +## Task Manager Functions + +#### Synopsis + +```sql +bdr.taskmgr_set_leader(node name, wait_for_completion boolean DEFAULT true); +``` + +Request the given `node` to be the task manager leader node. The leader +node is responsible for creating new tasks (currently only autopartition +makes use of this facility). A witness node, a logical standby or a +subscriber-only node can't become a leader. Such requests will fail with +an ERROR. + +#### Synopsis + +```sql +bdr.taskmgr_get_last_completed_workitem(); +``` + +Return the `id` of the last workitem successfully completed on all nodes in the +cluster. + +### Check Taskmgr Status + +Using the `bdr.taskmgr_work_queue_check_status` function, you can +see the status of the background workers that are doing their job to +generate and finish the tasks. + +The status can be seen through these views: +`taskmgr_work_queue_local_status` +`taskmgr_work_queue_global_status` + +#### Synopsis + +```sql +bdr.taskmgr_work_queue_check_status(workid bigint + local boolean DEFAULT false); +``` + +#### Parameters + +- `workid` — The key of the task. +- `local` — Check the local status only. + +#### Notes + +Taskmgr workers are always running in the background, even before the +`bdr.autopartition` function is called for the first time. If an invalid +`workid` is used, the function returns `unknown`. `In-progress` is the typical status. + + + +### Alter node kind + +BDR5 introduced a concept of Task Manager Leader node. The node is selected +automatically by PGD, but for upgraded clusters, its important to set the +`node_kind` properly for all nodes in the cluster. The user is expected to +do this manually after upgrading to the latest PGD version by calling +`bdr.alter_node_kind()` SQL function for each node. + +#### Synopsis + +```sql +bdr.alter_node_kind(node_name text, + node_kind text); +``` + +#### Parameters + +- `node_name` — Name of the node to change kind. +- `node_kind` — Kind of the node. + + +### Convert catchup state code in name + +#### Synopsis + +```sql +bdr.node_catchup_state_name(catchup_state oid); +``` + +#### Parameters + +- `catchup_state` — Oid code of the catchup state. + + +### Modify the BDR node group routing configuration + +#### Synopsis + +```sql +bdr.alter_node_group_option(node_group_name text, + config_key text, + config_value text); +``` + +#### Parameters + +- `node_group_name` — Name of the group to be changed. +- `config_key` — Key of the option in the node group to be changed. +- `config_value` — New value to be set for the given key. 
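+
+For example, a hedged sketch of the call shape, changing one option for a hypothetical group. The group name and option key are illustrative, and values are always passed as text:
+
+```sql
+-- Illustrative: enable proxy routing for the group named 'mygroup'
+SELECT bdr.alter_node_group_option('mygroup', 'enable_proxy_routing', 'true');
+```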
+ + +### Modify the BDR node routing configuration + +#### Synopsis + +```sql +bdr.alter_node_option(node_name text, + config_key text, + config_value text); +``` + +#### Parameters + +- `node_name` — Name of the node to be changed. +- `config_key` — Key of the option in the node to be changed. +- `config_value` — New value to be set for the given key. + +### Create a proxy + +#### Synopsis + +```sql +bdr.create_proxy(proxy_name text, node_group text); +``` + +#### Parameters + +- `proxy_name` — Name of the new proxy. +- `node_group` — Name of the group to be used by the proxy. + +### Change a proxy + +#### Synopsis + +```sql +bdr.alter_proxy_option(proxy_name text, config_key text, config_value text); +``` + +#### Parameters + +- `proxy_name` — Name of the proxy to be changed. +- `config_key` — Key of the option in the proxy to be changed. +- `config_value` — New value to be set for the given key. + +### Drop a proxy + +#### Synopsis + +```sql +bdr.drop_proxy(proxy_name text); +``` + +#### Parameters + +- `proxy_name` — Name of the proxy to be dropped. + +### Change routing leader + +Transfer the leadership of the node group to another node + +#### Synopsis + +```sql +bdr.routing_leadership_transfer(node_group_name text, + leader_name text, + transfer_method text DEFAULT 'strict', + transfer_timeout interval DEFAULT '10s'); +``` + +#### Parameters + +- `node_group_name` — Name of group where the leadership transfer is requested. +- `leader_name` — Name of node that will become write leader. +- `transfer_method` — Type of the transfer, it can be "fast" or the default "strict" that checks the maximum lag. +- `transfer_timeout` — Timeout of the leadership transfer, default is 10 seconds. + +### `bdr.bdr_track_commit_decision` + +Save the transaction commit status in the shared memory hash table. +This is used by the upgrade scripts to transfer commit decisions +saved in bdr.node_pre_commit catalog to the shared memory hash table. +This will also be logged to the WAL and hence can be reloaded from +WAL. + +#### Synopsis + +```sql +bdr.bdr_track_commit_decision(OID, xid, xid, "char", timestamptz, boolean); +``` + +### `bdr.bdr_get_commit_decisions` + +Convenience routine to inspect shared memory state + +#### Synopsis + +```sql +bdr.bdr_get_commit_decisions(dbid OID, + origin_node_id OID, + origin_xid xid, + local_xid xid, + decision "char", + decision_ts timestamptz, + is_camo boolean) +``` + +### `bdr.show_workers` + +Information related to the bdr workers. + +#### Synopsis + +```sql +bdr.show_workers( + worker_pid int, + worker_role int, + worker_role_name text, + worker_subid oid +``` + +### `bdr.show_writers` + +Function used in the `bdr.writers` view. 
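+
+In practice, rather than calling `bdr.show_workers` or `bdr.show_writers` directly, it's usually more convenient to query the views built on top of them. The following is a small illustrative sketch, with column names taken from the `bdr.show_workers` signature above and the `bdr.writers` view documentation:
+
+```sql
+-- Worker processes and their roles
+SELECT worker_pid, worker_role_name, worker_subid
+FROM bdr.workers;
+
+-- Writer processes and their position in the commit queue
+SELECT sub_name, pid, is_streaming, commit_queue_position
+FROM bdr.writers;
+```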
+ +### `bdr.node_kind_name` + +Return human friendly name of the node kind (standalone|data|standby|witness|subscriber-only) diff --git a/product_docs/docs/pgd/5/images/always-on-1x3.png b/product_docs/docs/pgd/5/images/always-on-1x3.png new file mode 100644 index 00000000000..53aa3168ab0 --- /dev/null +++ b/product_docs/docs/pgd/5/images/always-on-1x3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f923264a4bec5f5904f61dd9a2cdbcc789ff06828613599898ab72f2d4caa25 +size 47626 diff --git a/product_docs/docs/pgd/5/images/always-on-2x3-aa.png b/product_docs/docs/pgd/5/images/always-on-2x3-aa.png new file mode 100644 index 00000000000..7b9e874d7ca --- /dev/null +++ b/product_docs/docs/pgd/5/images/always-on-2x3-aa.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea946cd66df287102317888226aa591458d8047c9eca19084f9fb7ac0f9b0ae2 +size 64746 diff --git a/product_docs/docs/pgd/5/index.mdx b/product_docs/docs/pgd/5/index.mdx new file mode 100644 index 00000000000..607a2aca3ed --- /dev/null +++ b/product_docs/docs/pgd/5/index.mdx @@ -0,0 +1,64 @@ +--- +title: "EDB Postgres Distributed" +indexCards: none +redirects: + - /pgd/5/compatibility_matrix + - /pgd/latest/bdr +navigation: + - rel_notes + - known_issues + - "#Concepts" + - terminology + - overview + - "#Planning" + - architectures + - choosing_server + - deployments + - other_considerations + - "#Installing" + - tpa + - upgrades + - "#Using" + - appusage + - configuration + - nodes + - ddl + - security + - sequences + - durability + - consistency + - repsets + - routing + - backup + - monitoring + - cli + - transaction-streaming + - striggers + - scaling + - twophase + - tssnapshots + - catalogs + - functions + +--- + + +EDB Postgres Distributed provides multi-master replication and data distribution with advanced conflict management, data-loss protection, and throughput up to 5X faster than native logical replication, and enables distributed PostgreSQL clusters with high availability up to five 9s. + +By default EDB Postgres Distributed uses asynchronous replication, applying changes on +the peer nodes only after the local commit. Additional levels of synchronicity can +be configured between different nodes, groups of nodes or all nodes by configuring +[Group Commit](durability/group-commit), [CAMO](durability/camo), or +[Eager](consistency/eager) replication. + +## Compatibility matrix + +| EDB Postgres
Distributed | Community
PostgreSQL | EDB Postgres
Extended Server | EDB Postgres
Advanced Server | PGD
CLI | PGD-Proxy | BDR
Extension | pgLogical3
Extension | HARP | +| ------------------------------ | --------------------------- | ---------------------------------- | ---------------------------------- | ------------- | --------- | ------------------- | -------------------------- | ---- | +| 5 | 12, 13, 14, 15 | 12, 13, 14, 15 | 12, 13, 14, 15 | 5 | 5 | 5 | n/a | n/a | +| 4 | 12, 13, 14 | 12, 13, 14 | 12, 13, 14 | 1 | n/a | 4 | n/a | 2 | +| 3.7 | 11, 12, 13 | 11r2, 12, 13 | 11, 12, 13 | n/a | n/a | 3.7 | 3.7 | 2 | + +- PGD CLI 1 is supported with BDR 4.1 and later. +- BDR DCS in HARP 2 is supported with BDR 3.7.15 and later and 4.0.1 and later. + diff --git a/product_docs/docs/pgd/5/known_issues.mdx b/product_docs/docs/pgd/5/known_issues.mdx new file mode 100644 index 00000000000..bf801110808 --- /dev/null +++ b/product_docs/docs/pgd/5/known_issues.mdx @@ -0,0 +1,101 @@ +--- +title: 'Known issues' +--- + +This section discusses currently known issues in EDB Postgres Distributed 4. + +## Data Consistency + +Read about [Conflicts](consistency/conflicts/) to understand +the implications of the asynchronous operation mode in terms of data +consistency. + +## List of issues + +These known issues are tracked in BDR's +ticketing system and are expected to be resolved in a future +release. + +- If the resolver for the `update_origin_change` conflict + is set to `skip`, `synchronous_commit=remote_apply` is used, and + concurrent updates of the same row are repeatedly applied on two + different nodes, then one of the update statements might hang due + to a deadlock with the BDR writer. As mentioned in the + [Conflicts](consistency/conflicts/) chapter, `skip` is not the default + resolver for the `update_origin_change` conflict, and this + combination isn't intended to be used in production. It discards + one of the two conflicting updates based on the order of arrival + on that node, which is likely to cause a divergent cluster. + In the rare situation that you do choose to use the `skip` + conflict resolver, note the issue with the use of the + `remote_apply` mode. + +- The Decoding Worker feature doesn't work with CAMO/EAGER/Group Commit. + Installations using CAMO/Eager/Group Commit must keep `enable_wal_decoder` + disabled. + +- Lag control doesn't adjust commit delay in any way on a fully + isolated node, that is, in case all other nodes are unreachable or not + operational. As soon as at least one node is connected, replication + lag control picks up its work and adjusts the BDR commit delay + again. + +- For time-based lag control, BDR currently uses the lag time (measured + by commit timestamps) rather than the estimated catchup time that's + based on historic apply rate. + +- Changing the CAMO partners in a CAMO pair isn't currently possible. + It's possible only to add or remove a pair. + Adding or removing a pair doesn't need a restart of Postgres or even a + reload of the configuration. + +- Group Commit cannot be combined with [CAMO](durability/camo/) or [Eager All Node + replication](consistency/eager/). Eager Replication currently only works by using the + "global" BDR commit scope. + +- Transactions using Eager Replication can't yet execute DDL, + nor do they support explicit two-phase commit. + The TRUNCATE command is allowed. + +- Not all DDL can be run when either CAMO or Group Commit is used. 
+ +- Parallel apply is not currently supported in combination with Group + Commit, please make sure to disable it when using Group Commit by + either setting `num_writers` to 1 for the node group (using + [`bdr.alter_node_group_config`](nodes#bdralter_node_group_config)) or + via the GUC `bdr.writers_per_subscription` (see + [Configuration of Generic Replication](configuration#generic-replication)). + +- There currently is no protection against altering or removing a commit + scope. Running transactions in a commit scope that is concurrently + being altered or removed can lead to the transaction blocking or + replication stalling completely due to an error on the downstream node + attempting to apply the transaction. Ensure that any transactions + using a specific commit scope have finished before altering or removing it. + +## List of limitations + +This is a (non-comprehensive) list of limitations that are +expected and are by design. They are not expected to be resolved in the +future. + +- Replacing a node with its physical standby doesn't work for nodes that + use CAMO/Eager/Group Commit. Combining physical standbys and BDR in + general isn't recommended, even if otherwise possible. + +- A `galloc` sequence might skip some chunks if the + sequence is created in a rolled back transaction and then created + again with the same name. This can also occur if it is created and dropped when DDL + replication isn't active and then it is created again when DDL + replication is active. + The impact of the problem is mild, because the sequence + guarantees aren't violated. The sequence skips only some + initial chunks. Also, as a workaround you can specify the + starting value for the sequence as an argument to the + `bdr.alter_sequence_set_kind()` function. + +- Legacy BDR synchronous replication uses a mechanism for transaction + confirmation different from the one used by CAMO, Eager, and Group Commit. + The two are not compatible and must not be used together. Using synchronous + replication to other non-BDR nodes, including both logical and physical + standby is possible. diff --git a/product_docs/docs/pgd/5/monitoring/index.mdx b/product_docs/docs/pgd/5/monitoring/index.mdx new file mode 100644 index 00000000000..b9dbe5cab66 --- /dev/null +++ b/product_docs/docs/pgd/5/monitoring/index.mdx @@ -0,0 +1,22 @@ +--- +title: Monitoring +originalFilePath: monitoring.md + +--- + +Monitoring replication setups is important to ensure that your system performs optimally +and does not run out of disk space or encounter other faults that may halt operations. + +It is important to have automated monitoring in place to ensure that if, for example, +replication slots start falling badly behind, the administrator is alerted and can +take proactive action. + +EDB provides Postgres Enterprise Manager (PEM), which supports BDR from version 8.1. See [Monitoring EDB Postgres Distributed](/pem/latest/monitoring_BDR_nodes/) for more information. + +Alternatively, tools or users can make their own calls into information views +and functions provided by the BDR extension. See [Monitoring through SQL](sql) for +detailed description. + +EDB Postgres Distributed also integrates with OpenTelemetry, allowing you to +use existing reporting setup to follow the state of the EDB Postgres Distributed +cluster. See the [OpenTelemetry integration](otel) chapter for details. 
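+
+For example, a basic SQL-level health probe might combine the monitoring functions described in [Monitoring through SQL](sql). This is only a sketch; the exact set of checks depends on your alerting setup:
+
+```sql
+-- Cluster-wide version and Raft checks, plus a local replication slot check
+SELECT * FROM bdr.monitor_group_versions();
+SELECT * FROM bdr.monitor_group_raft();
+SELECT * FROM bdr.monitor_local_replslots();
+```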
diff --git a/product_docs/docs/pgd/5/monitoring/otel.mdx b/product_docs/docs/pgd/5/monitoring/otel.mdx new file mode 100644 index 00000000000..d234c2be093 --- /dev/null +++ b/product_docs/docs/pgd/5/monitoring/otel.mdx @@ -0,0 +1,99 @@ +--- +title: OpenTelemetry Integration +--- + +EDB Postgres Distributed can be configured to report monitoring information +as well as traces to the OpenTelemetry collector. + +Several resource attributes are filled by EDB Postgres Distributed OTEL collector. +These are attached to all metrics and traces: + + - The `service.name` is configurable via `bdr.otel_service_name` configuration setting. + - The `service.namespace` is always set to `edb_postgres_distributed`. + - The `service.instance.id` is always set to system identifier of the Postgres instance. + - The `service.version` is set to current version of the BDR extension loaded in the Postgresql instance. + +## Metrics collection + +The metric collection is enable automatically when configuration option +`bdr.metrics_otel_http_url` is set to non-empty URL. + +Different kinds of metrics are being collected as seen bellow. + +### Generic metrics + +| Metric name | Type | Labels | Description +| ----------- | ---- | ------ | ----------- +| pg_backends_by_state | gauge | conn_state - idle, active, idle in transaction, fastpath functioncall, idle in transaction (aborted), disabled, undefined | Number of backends in a given state +| pg_oldest_xact_start | gauge | | Oldest transaction start time +| pg_oldest_activity_start | gauge | | Oldest query start time +| pg_waiting_backends | gauge | wait_type - LWLock, Lock, BufferPin, Activity, Client, Extension, IPC, Timeout, IO, ??? (for unknown) | Number of currently waiting backends by wait type +| pg_start_time | gauge | | Timestamp at which the server has started +| pg_reload_time | gauge | | Timestamp at which the server has last reloaded configuration + + +### Replication metrics + +| Metric name | Type | Labels | Description +| ----------- | ---- | ------ | ----------- +| bdr_slot_sent_lag | gauge | slot_name - name of a slot | Current sent lag in bytes for each replication slot +| bdr_slot_write_lag | gauge | slot_name - name of a slot | Current write lag in bytes for each replication slot +| bdr_slot_flush_lag | gauge | slot_name - name of a slot | Current flush lag in bytes for each replication slot +| bdr_slot_apply_lag | gauge | slot_name - name of a slot | Current apply lag in bytes for each replication slot +| bdr_subscription_receive_lsn | gauge | sub_name - name of subscription | Current received LSN for each subscription +| bdr_subscription_flush_lsn | gauge | sub_name - name of subscription | Current flushed LSN for each subscription +| bdr_subscription_apply_lsn | gauge | sub_name - name of subscription | Current applied LSN for each subscription +| bdr_subscription_receiver | gauge | sub_name - name of subscription | Whether subscription receiver is currently running (1) or not (0) + +### Consensus metric + +| Metric name | Type | Labels | Description +| ----------- | ---- | ------ | ----------- +| bdr_raft_state | gauge | state_str - RAFT_FOLLOWER, RAFT_CANDIDATE, RAFT_LEADER, RAFT_STOPPED | Raft state of the consensus on this node +| bdr_raft_protocol_version | gauge | | Consensus protocol version used by this node +| bdr_raft_leader_node | gauge | | Id of a node that this node considers to be current leader +| bdr_raft_nodes | gauge | | Total number of nodes that participate in consensus (includes learner/non-voting nodes) +| 
bdr_raft_voting_nodes | gauge | | Number of actual voting nodes in consensus +| bdr_raft_term | gauge | | Current raft term this node is on +| bdr_raft_commit_index | gauge | | Raft commit index committed by this node +| bdr_raft_apply_index | gauge | | Raft commit index applied by this node + +## Tracing + +Tracing collection to OpenTelemetry requires `bdr.trace_otel_http_url` to be +configured and tracing itself to be enabled using `bdr.trace_enable`. + +The tracing is limited to only some subsystems at the moment, primarily to the +cluster management functionality. The following spans can be seen in traces: + +| Span name | Description | +| --------- | ----------- | +| create_node_group | Group creation +| alter_node_group_config | Change of group config option(s) +| alter_node_config | Change of node config option +| join_node_group | Node joining a group +| join_send_remote_request | Join source sending the join request on behalf of the joining node +| add_camo_pair | Add CAMO pair +| alter_camo_pair | Change CAMO pair +| remove_camo_pair | Delete CAMO pair +| alter_commit_scope | Change commit scope definition (either create new or update existing) +| alter_proxy_config | Change config for PGD-Proxy instance (either create new or update existing) +| walmsg_global_lock_send | Send global locking WAL message +| walmsg_global_lock_recv | Received global locking WAL message +| ddl_epoch_apply | Global locking epoch apply (ensure cluster is synchronized enough for new epoch to start) +| walmsg_catchup | Catchup during node removal WAL message +| raft_send_appendentries | Internal Raft book keeping message +| raft_recv_appendentries | Internal Raft book keeping message +| raft_request | Raft request execution +| raft_query | Raft query execution +| msgb_send | Consensus messaging layer message +| msgb_recv_receive | Consensus messaging layer message +| msgb_recv_deliver | Consensus messaging layer message delivery +| preprocess_ddl | DDL command preprocessing + +## TLS support + +The metrics and tracing endpoints can be either HTTP or HTTPS. It's possible +to configure paths to the CA bundle, client key, and client certificate using +`bdr.otel_https_ca_path`, `bdr.otel_https_key_path`, and `bdr.otel_https_cert_path` +configuration options. diff --git a/product_docs/docs/pgd/5/monitoring/sql.mdx b/product_docs/docs/pgd/5/monitoring/sql.mdx new file mode 100644 index 00000000000..f305b1145bc --- /dev/null +++ b/product_docs/docs/pgd/5/monitoring/sql.mdx @@ -0,0 +1,701 @@ +--- +title: Monitoring through SQL +--- + +EDB Postgres Distributed provides several monitoring and statistics views that +are specific to its distributed nature. The standard Postgres monitoring +is also useful for monitoring EDB Postgres Distributed. + +## Monitoring Overview + +A BDR Group consists of multiple servers, often referred to as nodes. All of the +nodes need to be monitored to ensure the health of the whole group. + +The bdr_monitor role may execute the `bdr.monitor` functions to provide an +assessment of BDR health using one of three levels: + +- `OK` - often shown as Green +- `WARNING` - often shown as Yellow +- `CRITICAL` - often shown as Red +- as well as `UNKNOWN` - for unrecognized situations, often shown as Red + +BDR also provides dynamic catalog views that show the instantaneous state of various +internal metrics and also BDR metadata catalogs that store the configuration +defaults and/or configuration changes requested by the user. 
Some of those views +and tables are accessible by bdr_monitor or bdr_read_all_stats, but some contain +user or internal information that has higher security requirements. + +BDR allows you to monitor each of the nodes individually, or to monitor the +whole group by access to a single node. If you wish to monitor each node individually, +simply connect to each node and issue monitoring requests. If you wish to monitor +the group from a single node then use the views starting with `bdr.group` since these +requests make calls to other nodes to assemble a group-level information set. + +If you have been granted access to the `bdr.run_on_all_nodes()` function by +bdr_superuser then you may make your own calls to all nodes. + +## Monitoring Node Join and Removal + +By default, the node management functions wait for the join or part +operation to complete. This can be turned off using the respective +`wait_for_completion` function argument. If waiting is turned off, +then to see when a join or part operation finishes, +check the node state indirectly via `bdr.node_summary` and +`bdr.event_summary`. + +When called, the helper function `bdr.wait_for_join_completion()` will cause +a PostgreSQL session to pause until all outstanding node join operations +complete. + +Here is an example output of a `SELECT` query from `bdr.node_summary` that +indicates that two nodes are active and another one is joining: + +``` +# SELECT node_name, interface_connstr, peer_state_name, +# node_seq_id, node_local_dbname +# FROM bdr.node_summary; +-[ RECORD 1 ]-----+----------------------------------------- +node_name | node1 +interface_connstr | host=localhost dbname=postgres port=7432 +peer_state_name | ACTIVE +node_seq_id | 1 +node_local_dbname | postgres +-[ RECORD 2 ]-----+----------------------------------------- +node_name | node2 +interface_connstr | host=localhost dbname=postgres port=7433 +peer_state_name | ACTIVE +node_seq_id | 2 +node_local_dbname | postgres +-[ RECORD 3 ]-----+----------------------------------------- +node_name | node3 +interface_connstr | host=localhost dbname=postgres port=7434 +peer_state_name | JOINING +node_seq_id | 3 +node_local_dbname | postgres +``` + +Also, the table [`bdr.node_catchup_info`](../catalogs/#bdrnode_catchup_info) will give information +on the catch-up state, which can be relevant to joining nodes or parting nodes. + +When a node is parted, it could be that some nodes in the cluster did not receive +all the data from that parting node. So it will create a temporary slot from +a node that already received that data and can forward it. + +The `catchup_state` can be one of the following: + +``` +10 = setup +20 = start +30 = catchup +40 = done +``` + +## Monitoring Replication Peers + +There are two main views used for monitoring of replication activity: + +- [`bdr.node_slots`](../catalogs/#bdrnode_slots) for monitoring outgoing replication +- [`bdr.subscription_summary`](../catalogs/#bdrsubscription_summary) for monitoring incoming replication + +Most of the information provided by `bdr.node_slots` can be also obtained by querying +the standard PostgreSQL replication monitoring views +[`pg_catalog.pg_stat_replication`](https://www.postgresql.org/docs/current/static/monitoring-stats.html#PG-STAT-REPLICATION-VIEW) +and +[`pg_catalog.pg_replication_slots`](https://www.postgresql.org/docs/current/view-pg-replication-slots.html). + +Each node has one BDR group slot which should never have a connection to it +and will very rarely be marked as active. 
This is normal, and does not imply +something is down or disconnected. See [`Replication Slots created by BDR`](../nodes/#replication-slots-created-by-bdr). + +### Monitoring Outgoing Replication + +There is an additional view used for monitoring of outgoing replication activity: + +- [`bdr.node_replication_rates`](../catalogs/#bdrnode_replication_rates) for monitoring outgoing replication + +The `bdr.node_replication_rates` view gives an overall picture of the outgoing +replication activity along with the catchup estimates for peer nodes, +specifically. + +``` +# SELECT * FROM bdr.node_replication_rates; +-[ RECORD 1 ]----+----------- +peer_node_id | 112898766 +target_name | node1 +sent_lsn | 0/28AF99C8 +replay_lsn | 0/28AF99C8 +replay_lag | 00:00:00 +replay_lag_bytes | 0 +replay_lag_size | 0 bytes +apply_rate | 822 +catchup_interval | 00:00:00 +-[ RECORD 2 ]----+----------- +peer_node_id | 312494765 +target_name | node3 +sent_lsn | 0/28AF99C8 +replay_lsn | 0/28AF99C8 +replay_lag | 00:00:00 +replay_lag_bytes | 0 +replay_lag_size | 0 bytes +apply_rate | 853 +catchup_interval | 00:00:00 +``` + +The `apply_rate` above refers to the rate in bytes per second. It is the rate +at which the peer is consuming data from the local node. The `replay_lag` when +a node reconnects to the cluster is immediately set to zero. We are working on +fixing this information; as a workaround, we suggest you use the `catchup_interval` +column that refers to the time required for the peer node to catch up to the +local node data. The other fields are also available via the `bdr.node_slots` +view, as explained below. + +!!! Note + This catalog is only present when bdr-enteprise extension is installed. + +Administrators may query `bdr.node_slots` for outgoing replication from the +local node. It shows information about replication status of all other nodes +in the group that are known to the current node, as well as any additional +replication slots created by BDR on the current node. + +``` +# SELECT node_group_name, target_dbname, target_name, slot_name, active_pid, +# catalog_xmin, client_addr, sent_lsn, replay_lsn, replay_lag, +# replay_lag_bytes, replay_lag_size +# FROM bdr.node_slots; +-[ RECORD 1 ]---+---------------------------- +node_group_name | bdrgroup +target_dbname | postgres +target_name | node3 +slot_name | bdr_postgres_bdrgroup_node3 +active_pid | 15089 +catalog_xmin | 691 +client_addr | 127.0.0.1 +sent_lsn | 0/23F7B70 +replay_lsn | 0/23F7B70 +replay_lag | [NULL] +replay_lag_bytes| 120 +replay_lag_size | 120 bytes +-[ RECORD 2 ]---+---------------------------- +node_group_name | bdrgroup +target_dbname | postgres +target_name | node2 +slot_name | bdr_postgres_bdrgroup_node2 +active_pid | 15031 +catalog_xmin | 691 +client_addr | 127.0.0.1 +sent_lsn | 0/23F7B70 +replay_lsn | 0/23F7B70 +replay_lag | [NULL] +replay_lag_bytes| 84211 +replay_lag_size | 82 kB +``` + +Note that because BDR is a mesh network, to get full view of lag in the +cluster, this query has to be executed on all nodes participating. + +`replay_lag_bytes` reports the difference in WAL positions between the local +server's current WAL write position and `replay_lsn`, the last position +confirmed replayed by the peer node. `replay_lag_size` is just a human-readable +form of the same. 
It is important to understand that WAL usually contains a lot +of writes that are not replicated but still count in `replay_lag_bytes`, +including `VACUUM` activity, index changes, writes associated with other +databases on the same node, writes for tables that are not part of a +replication set, etc. So the lag in bytes reported here is not the amount of +data that must be replicated on the wire to bring the peer node up to date, +only the amount of server-side WAL that must be processed. + +Similarly, `replay_lag` is not a measure of how long the peer node will take to +catch up, or how long it will take to replay from its current position to the +write position at the time `bdr.node_slots` was queried. It measures the delay +between when the peer confirmed the most recent commit and the current +wall-clock time. We suggest that you monitor `replay_lag_bytes` and `replay_lag_size` +or `catchup_interval` in `bdr.node_replication_rates`, as this column is set to +zero immediately after the node reconnects. + +The lag in both bytes and time does not advance while logical replication is +streaming a transaction. It only changes when a commit is replicated. So the lag +will tend to "sawtooth", rising as a transaction is streamed, then falling again +as the peer node commits it, flushes it, and sends confirmation. The reported +LSN positions will "stair-step" instead of advancing smoothly, for similar +reasons. + +When replication is disconnected (`active` = `'f'`), the `active_pid` column +will be `NULL`, as will `client_addr` and other fields that only make sense +with an active connection. The `state` field will be `'disconnected'`. The +`_lsn` fields will be the same as the `confirmed_flush_lsn`, since that is the +last position that the client is known for certain to have replayed to and saved. +The `_lag` fields will show the elapsed time between the most recent confirmed +flush on the client and the current time, and the `_lag_size` and `_lag_bytes` +fields will report the distance between `confirmed_flush_lsn` and the local +server's current WAL insert position. + +Note: It is normal for `restart_lsn` to be behind the other `lsn` columns; +this does not indicate a problem with replication or a peer node lagging. The +`restart_lsn` is the position that PostgreSQL's internal logical decoding must +be reading WAL at if interrupted, and generally reflects the position of the +oldest transaction that is not yet replicated and flushed. A very old +`restart_lsn` can make replication slow to restart after disconnection and +force retention of more WAL than is desirable, but will otherwise be harmless. +If you are concerned, look for very long running transactions and forgotten +prepared transactions. + +### Monitoring Incoming Replication + +Incoming replication (also called subscription) can be monitored by querying +the `bdr.subscription_summary` view. 
This shows the list of known subscriptions +to other nodes in the EDB Postgres Distributed cluster and the state of the replication worker, e.g.: + +``` +# SELECT node_group_name, origin_name, sub_enabled, sub_slot_name, +# subscription_status +# FROM bdr.subscription_summary; +-[ RECORD 1 ]-------+---------------------------- +node_group_name | bdrgroup +origin_name | node2 +sub_enabled | t +sub_slot_name | bdr_postgres_bdrgroup_node1 +subscription_status | replicating +-[ RECORD 2 ]-------+---------------------------- +node_group_name | bdrgroup +origin_name | node3 +sub_enabled | t +sub_slot_name | bdr_postgres_bdrgroup_node1 +subscription_status | replicating +``` + +### Monitoring WAL senders using LCR + +If the [Decoding Worker](../nodes#decoding-worker) is enabled, information about the +current LCR (`Logical Change Record`) file for each WAL sender can be monitored +via the function [bdr.wal_sender_stats](../functions#bdrwal_sender_stats), +e.g.: + +``` +postgres=# SELECT * FROM bdr.wal_sender_stats(); + pid | is_using_lcr | decoder_slot_name | lcr_file_name +---------+--------------+-------------------------------+------------------------------------------ + 2059904 | f | | + 2059909 | t | bdr_postgres_bdrgroup_decoder | 0000000000000000000000140000000000000000 + 2059916 | t | bdr_postgres_bdrgroup_decoder | 0000000000000000000000140000000000000000 +(3 rows) +``` + +If `is_using_lcr` is `FALSE`, `decoder_slot_name`/`lcr_file_name` will be `NULL`. +This will be the case if the Decoding Worker is not enabled, or the WAL sender is +serving a [logical standby](../nodes#logical-standby-nodes). + +Additionally, information about the Decoding Worker can be monitored via the function +[bdr.get_decoding_worker_stat](../functions#bdrget_decoding_worker_stat), e.g.: + +``` +postgres=# SELECT * FROM bdr.get_decoding_worker_stat(); + pid | decoded_upto_lsn | waiting | waiting_for_lsn +---------+------------------+---------+----------------- + 1153091 | 0/1E5EEE8 | t | 0/1E5EF00 +(1 row) +``` + +## Monitoring BDR Replication Workers + +All BDR workers show up in the system view `bdr.stat_activity`, +which has the same columns and information content as +[pg_stat_activity](https://www.postgresql.org/docs/current/monitoring-stats.html#PG-STAT-ACTIVITY-VIEW). +So this view offers these insights into the state of a BDR system: + +- The wait_event column has enhanced information, if + the reason for waiting is related to BDR. +- The `query` column will be blank in BDR workers, except + when a writer process is executing DDL + +The `bdr.workers` view shows BDR worker specific details, that are not +available from `bdr.stat_activity`. + +The view `bdr.event_summary` shows last error (if any) reported by any worker +which has a problem continuing the work. This is persistent information, so +it's important to note the time of the error not just the existence of one, +because most errors are transient in their nature and BDR workers will retry +the failed operation. + +## Monitoring BDR Writers + +There is another system view `bdr.writers` to monitor writer activities. +This views shows the current status of only writer workers. 
It includes: + +- `sub_name` to identify the subscription which the writer belongs to +- `pid` of the writer process +- `streaming_allowed` to know if the writer supports application of + in-progress streaming transactions +- `is_streaming` to know if the writer is currently applying a + streaming transaction +- `commit_queue_position` to check the position of the writer in the + commit queue. + +BDR honours commit ordering by following the same commit order as +happened on the origin. In case of parallel writers, multiple writers +could be applying different transactions at the same time. The +`commit_queue_position` shows in which order they will commit. Value `0` +means that the writer is the first one to commit. Value `-1` means that +the commit position is not yet known. This can happen for a streaming +transaction or when the writer is not applying any transaction at the +moment. + +## Monitoring Global Locks + +The global lock, which is currently only used for DDL replication, is a heavyweight +lock that exists across the whole BDR group. + +There are currently two types of global locks: + +- DDL lock, used for serializing all DDL operations on permanent + (not temporary) objects (i.e. tables) in the database +- DML relation lock, used for locking out writes to relations during DDL + operations that change the relation definition + +Either or both entry types may be created for the same transaction, depending on +the type of DDL operation and the value of the `bdr.ddl_locking` setting. + +Global locks held on the local node are visible in [the `bdr.global_locks` +view](../catalogs#bdrglobal_locks). This view shows the type of the lock; for +relation locks it shows which relation is being locked, the PID holding the +lock (if local), and whether the lock has been globally granted or not. In case +of global advisory locks, `lock_type` column shows `GLOBAL_LOCK_ADVISORY` and +`relation` column shows the advisory key(s) on which the lock is acquired. + +The following is an example output of `bdr.global_locks` while running an +`ALTER TABLE` statement with `bdr.ddl_locking = on`: + +``` +# SELECT lock_type, relation, pid FROM bdr.global_locks; +-[ RECORD 1 ]-------------- +lock_type | GLOBAL_LOCK_DDL +relation | [NULL] +pid | 15534 +-[ RECORD 2 ]-------------- +lock_type | GLOBAL_LOCK_DML +relation | someschema.sometable +pid | 15534 +``` + +See the catalog documentation for details on all fields including lock +timing information. + +## Monitoring Conflicts + +Replication [conflicts](../consistency/conflicts) can arise when multiple nodes make +changes that affect the same rows in ways that can interact with each other. +The BDR system should be monitored to ensure that conflicts are identified +and, where possible, application changes are made to eliminate them or make +them less frequent. + +By default, all conflicts are logged to `bdr.conflict_history`. Since this +contains full details of conflicting data, the rows are protected by +row-level security to ensure they are visible only by +owners of replicated tables. Owners should expect conflicts and analyze them +to see which, if any, might be considered as problems to be resolved. + +For monitoring purposes use `bdr.conflict_history_summary`, which does +not contain user data. 
An example query to count the number of conflicts +seen within the current day using an efficient query plan is: + +```sql +SELECT count(*) +FROM bdr.conflict_history_summary +WHERE local_time > date_trunc('day', current_timestamp) + AND local_time < date_trunc('day', current_timestamp + '1 day'); +``` + +## Apply Statistics + +BDR collects statistics about replication apply, both for each subscription +and for each table. + +Two monitoring views exist: `bdr.stat_subscription` for subscription statistics +and `bdr.stat_relation` for relation statistics. These views both provide: + +- Number of INSERTs/UPDATEs/DELETEs/TRUNCATEs replicated +- Block accesses and cache hit ratio +- Total I/O time for read/write +- Number of in-progress transactions streamed to file +- Number of in-progress transactions streamed to writers +- Number of in-progress streamed transactions committed/aborted + +and for relations only, these statistics: + +- Total time spent processing replication for the relation +- Total lock wait time to acquire lock (if any) for the relation (only) + +and for subscriptions only, these statistics: + +- Number of COMMITs/DDL replicated for the subscription +- Number of times this subscription has connected upstream + +Tracking of these statistics is controlled by the BDR GUCs +`bdr.track_subscription_apply` and `bdr.track_relation_apply` +respectively. + +The example output from these would look like this: + +```sql +# SELECT sub_name, nconnect, ninsert, ncommit, nupdate, ndelete, ntruncate, nddl +FROM bdr.stat_subscription; +-[ RECORD 1 ]---------------------------------- +sub_name | bdr_regression_bdrgroup_node1_node2 +nconnect | 3 +ninsert | 10 +ncommit | 5 +nupdate | 0 +ndelete | 0 +ntruncate | 0 +nddl | 2 +``` + +In this case the subscription connected 3 times to the upstream, inserted +10 rows and did 2 DDL commands inside 5 transactions. + +Stats counters for these views can be reset to zero using the functions +`bdr.reset_subscription_stats` and `bdr.reset_relation_stats`. + +## Standard PostgreSQL Statistics Views + +Statistics on table and index usage are updated normally by the downstream +master. This is essential for the correct function of +[autovacuum](https://www.postgresql.org/docs/current/static/routine-vacuuming.html). +If there are no local writes on the downstream master and statistics have not been +reset, these two views should show corresponding results between +upstream and downstream: + +- `pg_stat_user_tables` +- `pg_statio_user_tables` + +!!! Note + We don't necessarily expect the upstream table statistics to + be *similar* to the downstream ones; we only expect them to *change* + by the same amounts. Consider the example of a table whose statistics + show 1M inserts and 1M updates; when a new node joins the BDR group, + the statistics for the same table in the new node will show 1M inserts + and zero updates. However, from that moment, the upstream and + downstream table statistics will change by the same amounts, because + all changes on one side will be replicated to the other side. + +Since indexes are used to apply changes, the identifying indexes on the +downstream side may appear more heavily used with workloads that perform +`UPDATE`s and `DELETE`s than non-identifying indexes are. 
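+
+For example, an illustrative query against the standard statistics views listed below can make this effect visible (the table name is hypothetical):
+
+```sql
+-- Compare scan counts across the indexes of one replicated table
+SELECT indexrelname, idx_scan, idx_tup_read, idx_tup_fetch
+FROM pg_stat_user_indexes
+WHERE relname = 'mytable'
+ORDER BY idx_scan DESC;
+```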
+ +The built-in index monitoring views are: + +- `pg_stat_user_indexes` +- `pg_statio_user_indexes` + +All these views are discussed in detail in the +[PostgreSQL documentation on the statistics views](http://www.postgresql.org/docs/current/static/monitoring-stats.html#MONITORING-STATS-VIEWS-TABLE). + +## Monitoring BDR Versions + +BDR allows running different Postgres versions as well as different +BDR versions across the nodes in the same cluster. This is useful for +upgrading. + +The view `bdr.group_versions_details` uses the function +`bdr.run_on_all_nodes()` to retrieve Postgres and BDR versions from all nodes +at the same time. For example: + +```sql +bdrdb=# SELECT node_name, postgres_version, bdr_version + FROM bdr.group_versions_details; + node_name | postgres_version | bdr_version +-----------+------------------+------------- + node1 | 14.1 | 4.0.0 + node2 | 14.1 | 4.0.0 +``` + +The recommended setup is to try to have all nodes running the same +latest versions as soon as possible. It is recommended +that the cluster does not run different BDR versions for too long. + +For monitoring purposes, we recommend the following alert levels: + +- status=UNKNOWN, message=This node is not part of any BDR group +- status=OK, message=All nodes are running same BDR versions +- status=WARNING, message=There is at least 1 node that is not accessible +- status=WARNING, message=There are node(s) running different BDR versions + when compared to other nodes + +The described behavior is implemented in the function +`bdr.monitor_group_versions()`, which uses BDR version +information returned from the view `bdr.group_version_details` +to provide a cluster-wide version check. For example: + +```sql +bdrdb=# SELECT * FROM bdr.monitor_group_versions(); + status | message +--------+----------------------------------------- + OK | All nodes are running same BDR versions +``` + +## Monitoring Raft Consensus + +Raft Consensus should be working cluster-wide at all times. The impact +of running a EDB Postgres Distributed cluster without Raft Consensus working might be as +follows: + +- BDR data changes replication may still be working correctly +- Global DDL/DML locks will not work +- Galloc sequences will eventually run out of chunks +- Eager Replication will not work +- Cluster maintenance operations (join node, part node, promote standby) + are still allowed but they might not finish (simply hang) +- Node statuses might not be correctly synced among the BDR nodes +- BDR group replication slot does not advance LSN, thus keeps WAL files on + disk + +The view `bdr.group_raft_details` uses the functions +`bdr.run_on_all_nodes()` and `bdr.get_raft_status()` to retrieve Raft +Consensus status from all nodes at the same time. For example: + +```sql +bdrdb=# SELECT node_id, node_name, state, leader_id +FROM bdr.group_raft_details; + node_id | node_name | node_group_name | state | leader_id +------------+-----------+-----------------+---------------+------------ + 1148549230 | node1 | top_group | RAFT_LEADER | 1148549230 + 3367056606 | node2 | top_group | RAFT_FOLLOWER | 1148549230 +``` + +We can say that Raft Consensus is working correctly if all below +conditions are met: + +- A valid state (`RAFT_LEADER` or `RAFT_FOLLOWER`) is defined on all + nodes +- Only one of the nodes is the `RAFT_LEADER` +- The `leader_id` is the same on all rows and must match the `node_id` + of the row where `state = RAFT_LEADER` + +From time to time, Raft Consensus will start a new election to define a +new `RAFT_LEADER`. 
During an election, there might be an intermediary +situation where there is no `RAFT_LEADER` and some of the nodes consider +themselves as `RAFT_CANDIDATE`. The whole election should not take longer +than `bdr.raft_election_timeout` (by default it is set to 6 seconds). If +the query above returns an in-election situation, then simply wait for +`bdr.raft_election_timeout` and run the query again. If after +`bdr.raft_election_timeout` has passed and some the conditions above are +still not met, then Raft Consensus is not working. + +Raft Consensus might not be working correctly on a single node only; +for example one of the nodes does not recognize the current leader and +considers itself as a `RAFT_CANDIDATE`. In this case, it is important to +make sure that: + +- All BDR nodes are accessible to each other through both regular and + replication connections (check file `pg_hba.conf`) +- BDR versions are the same on all nodes +- `bdr.raft_election_timeout` is the same on all nodes + +In some cases, especially if nodes are geographically distant from each +other and/or network latency is high, the default value of +`bdr.raft_election_timeout` (6 seconds) might not be enough. If Raft +Consensus is still not working even after making sure everything is +correct, consider increasing `bdr.raft_election_timeout` to, say, 30 +seconds on all nodes. From BDR 3.6.11 onwards, setting +`bdr.raft_election_timeout` requires only a server reload. + +Given how Raft Consensus affects cluster operational tasks, and also as +Raft Consensus is directly responsible for advancing the group slot, +we can define monitoring alert levels as follows: + +- status=UNKNOWN, message=This node is not part of any BDR group +- status=OK, message=Raft Consensus is working correctly +- status=WARNING, message=There is at least 1 node that is not accessible +- status=WARNING, message=There are node(s) as RAFT_CANDIDATE, an + election might be in progress +- status=WARNING, message=There is no RAFT_LEADER, an election might be + in progress +- status=CRITICAL, message=There is a single node in Raft Consensus +- status=CRITICAL, message=There are node(s) as RAFT_CANDIDATE while a + RAFT_LEADER is defined +- status=CRITICAL, message=There are node(s) following a leader different + than the node set as RAFT_LEADER + +The described behavior is implemented in the function +`bdr.monitor_group_raft()`, which uses Raft Consensus status +information returned from the view `bdr.group_raft_details` +to provide a cluster-wide Raft check. For example: + +```sql +bdrdb=# SELECT * FROM bdr.monitor_group_raft(); +node_group_name | status | message +----------------|--------+------------------------------------- +myroup | OK | Raft Consensus is working correctly +``` + +## Monitoring Replication Slots + +Each BDR node keeps: + +- One replication slot per active BDR peer +- One group replication slot + +For example: + +```sql +bdrdb=# SELECT slot_name, database, active, confirmed_flush_lsn +FROM pg_replication_slots ORDER BY slot_name; + slot_name | database | active | confirmed_flush_lsn +--------------------------+----------+--------+--------------------- + bdr_bdrdb_bdrgroup | bdrdb | f | 0/3110A08 + bdr_bdrdb_bdrgroup_node2 | bdrdb | t | 0/31F4670 + bdr_bdrdb_bdrgroup_node3 | bdrdb | t | 0/31F4670 + bdr_bdrdb_bdrgroup_node4 | bdrdb | t | 0/31F4670 +``` + +Peer slot names follow the convention `bdr___`, +while the BDR group slot name follows the convention +`bdr__`, which can be accessed using the function +`bdr.local_group_slot_name()`. 
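+
+For example, the group slot name for the local node can be retrieved directly (the result depends on your database and group names):
+
+```sql
+SELECT bdr.local_group_slot_name();
+```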
+ +Peer replication slots should be active on all nodes at all times. +If a peer replication slot is not active, then it might mean: + +- The corresponding peer is shutdown or not accessible; or +- BDR replication is broken. + +Grep the log file for `ERROR` or `FATAL` and also check `bdr.event_summary` on +all nodes. The root cause might be, for example, an incompatible DDL was +executed with DDL replication disabled on one of the nodes. + +The BDR group replication slot is however inactive most of the time. BDR +maintains this slot and advances its LSN when all other peers have already +consumed the corresponding transactions. Consequently it is not necessary to +monitor the status of the group slot. + +The function `bdr.monitor_local_replslots()` provides a summary of whether all +BDR node replication slots are working as expected, e.g.: + +```sql +bdrdb=# SELECT * FROM bdr.monitor_local_replslots(); + status | message +--------+------------------------------------------------- + OK | All BDR replication slots are working correctly +``` + +One of the following status summaries will be returned: + +- `UNKNOWN`: `This node is not part of any BDR group` +- `OK`: `All BDR replication slots are working correctly` +- `OK`: `This node is part of a subscriber-only group` +- `CRITICAL`: `There is at least 1 BDR replication slot which is inactive` +- `CRITICAL`: `There is at least 1 BDR replication slot which is missing` + +## Monitoring Transaction COMMITs + +By default, BDR transactions commit only on the local node. In that case, +transaction `COMMIT` will be processed quickly. + +BDR can be used with standard PostgreSQL synchronous replication, while +BDR also provides two new transaction commit modes: CAMO and Eager +replication. Each of these modes provides additional robustness +features, though at the expense of additional latency at `COMMIT`. +The additional time at `COMMIT` can be monitored dynamically using the +`bdr.stat_activity` catalog, where processes report different `wait_event` +states. A transaction in `COMMIT` waiting for confirmations from one or +more synchronous standbys reports a `SyncRep` wait event, whereas the +two new modes report `EagerRep`. diff --git a/product_docs/docs/pgd/5/nodes.mdx b/product_docs/docs/pgd/5/nodes.mdx new file mode 100644 index 00000000000..02a1b30e825 --- /dev/null +++ b/product_docs/docs/pgd/5/nodes.mdx @@ -0,0 +1,1445 @@ +--- +title: Node management +redirects: + - bdr/nodes + +--- + +Each database that's member of a BDR group must be represented by its own +node. A node is a unique identifier of a database in a BDR group. + +At present, each node can be a member of just one node group. (This might be +extended in later releases.) Each node can subscribe to one or more +replication sets to give fine-grained control over replication. + +A BDR group might also contain zero or more subgroups, allowing you to create a variety +of different architectures. + +## Creating and joining a BDR group + +For BDR, every node must connect to every other node. To make +configuration easy, when a new node joins, it configures all +existing nodes to connect to it. For this reason, every node, including +the first BDR node created, must know the [PostgreSQL connection string](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING), +sometimes referred to as a data source name (DSN), that other nodes +can use to connect to it. Both formats of connection string are supported. 
+So you can use either key-value format, like `host=myhost port=5432 dbname=mydb`, +or URI format, like `postgresql://myhost:5432/mydb`. + +The SQL function `bdr.create_node_group()` creates the BDR group +from the local node. Doing so activates BDR on that node and allows other +nodes to join the BDR group, which consists of only one node at that point. +At the time of creation, you must specify the connection string for other +nodes to use to connect to this node. + +Once the node group is created, every further node can join the BDR +group using the `bdr.join_node_group()` function. + +Alternatively, use the command line utility `bdr_init_physical` to +create a new node, using `pg_basebackup` (or a physical standby) of an existing +node. If using `pg_basebackup`, the `bdr_init_physical` utility can optionally +specify the base backup of only the target database. The earlier +behavior was to back up the entire database cluster. With this utility, the activity +completes faster and also uses less space because it excludes +unwanted databases. If you specify only the target database, then the excluded +databases get cleaned up and removed on the new node. + +When a new BDR node is joined to an existing BDR group or a node subscribes +to an upstream peer, before replication can begin the system must copy the +existing data from the peer nodes to the local node. This copy must be +carefully coordinated so that the local and remote data starts out +identical. It's not enough to use `pg_dump` yourself. The BDR +extension provides built-in facilities for making this initial copy. + +During the join process, the BDR extension synchronizes existing data +using the provided source node as the basis and creates all metadata +information needed for establishing itself in the mesh topology in the BDR +group. If the connection between the source and the new node disconnects during +this initial copy, restart the join process from the +beginning. + +The node that is joining the cluster must not contain any schema or data +that already exists on databases in the BDR group. We recommend that the +newly joining database be empty except for the BDR extension. However, +it's important that all required database users and roles are created. + +Optionally, you can skip the schema synchronization using the `synchronize_structure` +parameter of the `bdr.join_node_group()` function. In this case, the schema must +already exist on the newly joining node. + +We recommend that you select the source node that has the best connection (the +closest) as the source node for joining. Doing so lowers the time +needed for the join to finish. + +Coordinate the join procedure using the Raft consensus algorithm, which +requires most existing nodes to be online and reachable. + +The logical join procedure (which uses the `bdr.join_node_group()` function) +performs data sync doing `COPY` operations and uses multiple writers +(parallel apply) if those are enabled. + +Node join can execute concurrently with other node joins for the majority of +the time taken to join. However, only one regular node at a time can be in +either of the states PROMOTE or PROMOTING, which are typically fairly short if +all other nodes are up and running. Otherwise the join is serialized at +this stage. The subscriber-only nodes are an exception to this rule, and they +can be concurrently in PROMOTE and PROMOTING states as well, so their join +process is fully concurrent. 
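+
+As a minimal sketch of the sequence described above (node names, DSNs, and the
+group name are illustrative placeholders), the first node creates itself and
+the group, and each additional node then creates itself and joins through an
+existing node:
+
+```sql
+-- On the first node: register the node, then create the group
+SELECT bdr.create_node('node1', 'host=node1 port=5432 dbname=bdrdb');
+SELECT bdr.create_node_group('bdrgroup');
+
+-- On each additional node: register the node, then join via an existing node
+SELECT bdr.create_node('node2', 'host=node2 port=5432 dbname=bdrdb');
+SELECT bdr.join_node_group(
+    join_target_dsn := 'host=node1 port=5432 dbname=bdrdb',
+    node_group_name := 'bdrgroup'
+);
+```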
+ +The join process uses only one node as the source, so it can be +executed when nodes are down if a majority of nodes are available. +This can cause a complexity when running logical join. +During logical join, the commit timestamp of rows copied from the source +node is set to the latest commit timestamp on the source node. +Committed changes on nodes that have a commit timestamp earlier than this +(because nodes are down or have significant lag) can conflict with changes +from other nodes. In this case, the newly joined node can be resolved +differently to other nodes, causing a divergence. As a result, we recommend +not running a node join when significant replication lag exists between nodes. +If this is necessary, run LiveCompare on the newly joined node to +correct any data divergence once all nodes are available and caught up. + +`pg_dump` can fail when there is concurrent DDL activity on the source node +because of cache-lookup failures. Since `bdr.join_node_group()` uses `pg_dump` +internally, it might fail if there's concurrent DDL activity on the source node. +Retrying the join works in that case. + +### Joining a heterogeneous cluster + +BDR 4.0 node can join a EDB Postgres Distributed cluster running 3.7.x at a specific +minimum maintenance release (such as 3.7.6) or a mix of 3.7 and 4.0 nodes. +This procedure is useful when you want to upgrade not just the BDR +major version but also the underlying PostgreSQL major +version. You can achieve this by joining a 3.7 node running on +PostgreSQL 12 or 13 to a EDB Postgres Distributed cluster running 3.6.x on +PostgreSQL 11. The new node can also +run on the same PostgreSQL major release as all of the nodes in the +existing cluster. + +BDR ensures that the replication works correctly in all directions +even when some nodes are running 3.6 on one PostgreSQL major release and +other nodes are running 3.7 on another PostgreSQL major release. But +we recommend that you quickly bring the cluster into a +homogenous state by parting the older nodes once enough new nodes +join the cluster. Don't run any DDLs that might +not be available on the older versions and vice versa. + +A node joining with a different major PostgreSQL release can't use +physical backup taken with `bdr_init_physical`, and the node must join +using the logical join method. This is necessary because the major +PostgreSQL releases aren't on-disk compatible with each other. + +When a 3.7 node joins the cluster using a 3.6 node as a +source, certain configurations, such as conflict resolution, +aren't copied from the source node. The node must be configured +after it joins the cluster. + +### Connection DSNs and SSL (TLS) + +The DSN of a node is simply a `libpq` connection string, since nodes connect +using `libpq`. As such, it can contain any permitted `libpq` connection +parameter, including those for SSL. The DSN must work as the +connection string from the client connecting to the node in which it's +specified. An example of such a set of parameters using a client certificate is: + +```ini +sslmode=verify-full sslcert=bdr_client.crt sslkey=bdr_client.key +sslrootcert=root.crt +``` + +With this setup, the files `bdr_client.crt`, `bdr_client.key`, and +`root.crt` must be present in the data directory on each node, with the +appropriate permissions. 
+For `verify-full` mode, the server's SSL certificate is checked to +ensure that it's directly or indirectly signed with the `root.crt` certificate +authority and that the host name or address used in the connection matches the +contents of the certificate. In the case of a name, this can match a Subject +Alternative Name or, if there are no such names in the certificate, the +Subject's Common Name (CN) field. +Postgres doesn't currently support subject alternative names for IP +addresses, so if the connection is made by address rather than name, it must +match the CN field. + +The CN of the client certificate must be the name of the user making the +BDR connection. +This is usually the user postgres. Each node requires matching +lines permitting the connection in the `pg_hba.conf` file. For example: + +```ini +hostssl all postgres 10.1.2.3/24 cert +hostssl replication postgres 10.1.2.3/24 cert +``` + +Another setup might be to use `SCRAM-SHA-256` passwords instead of client +certificates and not verify the server identity as long as +the certificate is properly signed. Here the DSN parameters might be: + +```ini +sslmode=verify-ca sslrootcert=root.crt +``` + +The corresponding `pg_hba.conf` lines are: + +```ini +hostssl all postgres 10.1.2.3/24 scram-sha-256 +hostssl replication postgres 10.1.2.3/24 scram-sha-256 +``` + +In such a scenario, the postgres user needs a `.pgpass` file +containing the correct password. + +## Witness nodes + +If the cluster has an even number of nodes, it might be useful to create +an extra node to help break ties in the event of a network split (or +network partition, as it is sometimes called). + +Rather than create an additional full-size node, you can create a micro node, +sometimes called a witness node. This is a normal BDR node that +is deliberately set up not to replicate any tables or data to it. + +## Logical standby nodes + +BDR allows you to create a *logical standby node*, also known as an offload +node, a read-only node, receive-only node, or logical-read replicas. +A master node can have zero, one, or more logical standby nodes. + +!!! Note + Logical standby nodes can be used in environments where network traffic + between data centers is a concern; otherwise having more data nodes per + location is always preferred. + +With a physical standby node, the node never comes up fully, forcing it to +stay in continual recovery mode. +BDR allows something similar. `bdr.join_node_group` has the `pause_in_standby` +option to make the node stay in half-way-joined as a logical standby node. +Logical standby nodes receive changes but don't send changes made locally +to other nodes. + +Later, if you want, use `bdr.promote_node()` to move the logical standby into a +full, normal send/receive node. + +A logical standby is sent data by one source node, defined by the DSN in +`bdr.join_node_group`. Changes from all other nodes are received from this one +source node, minimizing bandwidth between multiple sites. + +There are multiple options for high availability: + +- If the source node dies, one physical standby can be promoted to a master. + In this case, the new master can continue to feed any or all logical standby nodes. + +- If the source node + dies, one logical standby can be promoted to a full node and replace the source + in a failover operation similar to single-master operation. If there + are multiple logical standby nodes, the other nodes can't follow the new master, + so the effectiveness of this technique is limited to one logical + standby. 
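+
+As described above, joining as a logical standby is an ordinary join with
+`pause_in_standby` set. A minimal sketch, where the DSN and group name are
+placeholders:
+
+```sql
+SELECT bdr.join_node_group(
+    join_target_dsn  := 'host=node1 port=5432 dbname=bdrdb',
+    node_group_name  := 'bdrgroup',
+    pause_in_standby := true
+);
+```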
+ +In case a new standby is created from an existing BDR node, +the needed replication slots for operation aren't synced to the +new standby until at least 16 MB of LSN has elapsed since the group +slot was last advanced. In extreme cases, this might require a full +16 MB before slots are synced or created on the streaming replica. If +a failover or switchover occurs during this interval, the +streaming standby can't be promoted to replace its BDR node, as the +group slot and other dependent slots don't exist yet. + +The slot sync-up process on the standby solves this by invoking a function +on the upstream. This function moves the group slot in the +entire EDB Postgres Distributed cluster by performing WAL switches and requesting all BDR +peer nodes to replay their progress updates. This causes the +group slot to move ahead in a short time span. This reduces the time +required by the standby for the initial slot's sync-up, allowing for +faster failover to it, if required. + +On PostgreSQL, it's important to ensure that the slot's sync-up completes on +the standby before promoting it. You can run the following query on the +standby in the target database to monitor and ensure that the slots +synced up with the upstream. The promotion can go ahead when this query +returns `true`. + +```sql +SELECT true FROM pg_catalog.pg_replication_slots WHERE + slot_type = 'logical' AND confirmed_flush_lsn IS NOT NULL; +``` + +You can also nudge the slot sync-up process in the entire BDR +cluster by manually performing WAL switches and by requesting all BDR +peer nodes to replay their progress updates. This activity causes +the group slot to move ahead in a short time and also hastens the +slot sync-up activity on the standby. You can run the following queries +on any BDR peer node in the target database for this: + +```sql +SELECT bdr.run_on_all_nodes('SELECT pg_catalog.pg_switch_wal()'); +SELECT bdr.run_on_all_nodes('SELECT bdr.request_replay_progress_update()'); +``` + +Use the monitoring query on the standby to check that these +queries do help in faster slot sync-up on that standby. + +Logical standby nodes can be protected using physical standby nodes, +if desired, so Master->LogicalStandby->PhysicalStandby. You can't +cascade from LogicalStandby to LogicalStandby. + +A logical standby does allow write transactions, so the restrictions +of a physical standby don't apply. You can use this to great benefit, since +it allows the logical standby to have additional indexes, longer retention +periods for data, intermediate work tables, LISTEN/NOTIFY, temp tables, +materialized views, and other differences. + +Any changes made locally to logical standbys that commit before the promotion +aren't sent to other nodes. All transactions that commit after promotion +are sent onwards. If you perform writes to a logical standby, +take care to quiesce the database before promotion. + +You might make DDL changes to logical standby nodes but they aren't +replicated and they don't attempt to take global DDL locks. BDR functions +that act similarly to DDL also aren't replicated. See [DDL replication](ddl). +If you made incompatible DDL changes to a logical standby, +then the database is a *divergent node*. Promotion of a divergent +node currently results in replication failing. +As a result, plan to either ensure that a logical standby node +is kept free of divergent changes if you intend to use it as a standby, or +ensure that divergent nodes are never promoted. 
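+
+When a logical standby is ready to become a full node, promotion is a single
+call on that node (a sketch; see `bdr.promote_node` later in this section):
+
+```sql
+SELECT bdr.promote_node(wait_for_completion := true);
+```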
+
+## Physical standby nodes
+
+BDR also enables you to create traditional physical standby failover
+nodes. These are commonly intended to directly replace a BDR
+node in the cluster after a short promotion procedure. As with
+any standard Postgres cluster, a node can have any number of these
+physical replicas.
+
+There are, however, some minimal prerequisites for this to work properly
+due to the use of replication slots and other functional requirements in
+BDR:
+
+- The connection between BDR primary and standby uses streaming
+  replication through a physical replication slot.
+- The standby has:
+  - `recovery.conf` (for PostgreSQL <12; for PostgreSQL 12+ these settings are in `postgresql.conf`):
+    - `primary_conninfo` pointing to the primary
+    - `primary_slot_name` naming a physical replication slot on the primary to be used only by this standby
+  - `postgresql.conf`:
+    - `shared_preload_libraries = 'bdr'`; there can be other plugins in the list as well, but don't include pglogical
+    - `hot_standby = on`
+    - `hot_standby_feedback = on`
+- The primary has:
+  - `postgresql.conf`:
+    - `bdr.standby_slot_names` specifies the physical
+      replication slot used for the standby's `primary_slot_name`.
+
+While this is enough to produce a working physical standby of a BDR
+node, you need to address some additional concerns.
+
+Once established, the standby requires enough time and WAL traffic
+to trigger an initial copy of the primary's other BDR-related
+replication slots, including the BDR group slot. At minimum, slots on a
+standby are live and can survive a failover only if they report
+a nonzero `confirmed_flush_lsn` as reported by `pg_replication_slots`.
+
+As a consequence, check physical standby nodes in newly initialized BDR
+clusters with low amounts of write activity before
+assuming a failover will work normally. Failing to take this precaution
+can result in the standby having an incomplete subset of required
+replication slots needed to function as a BDR node, and thus an
+aborted failover.
+
+The protection mechanism that ensures physical standby nodes are up to date
+and can be promoted (as configured by `bdr.standby_slot_names`) affects the
+overall replication latency of the BDR group. This is because the group replication
+happens only when the physical standby nodes are up to date.
+
+For these reasons, we generally recommend using either logical standby nodes
+or a subscriber-only group instead of physical standby nodes. They both
+have better operational characteristics in comparison.
+
+You can manually ensure the group slot is advanced on all nodes
+(as much as possible), which helps hasten the creation of BDR-related
+replication slots on a physical standby, using the following SQL syntax:
+
+```sql
+SELECT bdr.move_group_slot_all_nodes();
+```
+
+Upon failover, the standby must perform one of two actions to replace
+the primary:
+
+- Assume control of the same IP address or hostname as the primary.
+- Inform the EDB Postgres Distributed cluster of the change in address by executing the
+  [bdr.alter_node_interface](#bdralter_node_interface) function on all other BDR nodes.
+
+Once this is done, the other BDR nodes reestablish communication
+with the newly promoted standby -> primary node. Since replication
+slots are synchronized only periodically, this new primary might reflect
+a lower LSN than expected by the existing BDR nodes. If this is the
+case, BDR fast forwards each lagging slot to the last location
+used by each BDR node.
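+
+If the address changes, informing the cluster is one call of
+`bdr.alter_node_interface` on each of the other nodes. A sketch, where the
+node name and DSN are placeholders:
+
+```sql
+SELECT bdr.alter_node_interface(
+    node_name     := 'node1',
+    interface_dsn := 'host=node1-new-address port=5432 dbname=bdrdb'
+);
+```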
+ +Take special note of the `bdr.standby_slot_names` parameter as +well. It's important to set it in a EDB Postgres Distributed cluster where there is a +primary -> physical standby relationship or when using subscriber-only groups. + +BDR maintains a group slot that always reflects the state of the +cluster node showing the most lag for any outbound replication. +With the addition of a physical +replica, BDR must be informed that there is a nonparticipating node +member that, regardless, affects the state of the group slot. + +Since the standby doesn't directly communicate with the other BDR +nodes, the `standby_slot_names` parameter informs BDR to consider named +slots as needed constraints on the group slot as well. When set, the +group slot is held if the standby shows lag, even if the group +slot is normally advanced. + +As with any physical replica, this type of standby can also be +configured as a synchronous replica. As a reminder, this requires: + +- On the standby: + - Specifying a unique `application_name` in `primary_conninfo` +- On the primary: + - Enabling `synchronous_commit` + - Including the standby `application_name` in `synchronous_standby_names` + +It's possible to mix physical standby and other BDR nodes in +`synchronous_standby_names`. CAMO and Eager All-Node Replication use +different synchronization mechanisms and don't work with synchronous +replication. Make sure `synchronous_standby_names` doesn't +include any BDR node if either CAMO or Eager All-Node Replication is used. +Instead use only non-BDR nodes, for example, a physical standby. + +## Subgroups + +A group can also contain zero or more subgroups. Each subgroup can be +allocated to a specific purpose in the top-level parent group. The +node_group_type specifies the type when the subgroup is created. + +### Subscriber-only groups + +As the name suggests, this type of node +subscribes only to replication changes from other nodes in the cluster. However, +no other nodes receive replication changes from `subscriber-only` +nodes. This is somewhat similar to logical standby nodes. But in contrast +to logical standby, the `subscriber-only` nodes are fully joined to +the cluster. They can receive replication changes from all other nodes +in the cluster and hence aren't affected by unavailability or +parting of any one node in the cluster. + +A `subscriber-only` node is a fully joined +BDR node and hence it receives all replicated DDLs and acts on those. It +also uses Raft to consistently report its status to all nodes in the +cluster. The `subscriber-only` node doesn't have Raft voting rights and +hence can't become a Raft leader or participate in the leader +election. Also, while it receives replicated DDLs, it doesn't +participate in DDL or DML lock acquisition. In other words, a currently +down `subscriber-only` node doesn't stop a DML lock from being acquired. + +The `subscriber-only` node forms the building block for BDR Tree +topology. In this topology, a small number of fully active +nodes are replicating changes in all directions. A large +number of `subscriber-only` nodes receive only changes but never +send any changes to any other node in the cluster. This topology avoids +connection explosion due to a large number of nodes, yet provides +an extremely large number of `leaf` nodes that you can use to consume the +data. + +To make use of `subscriber-only` nodes, first +create a BDR group of type `subscriber-only`. 
Make it a subgroup of
+the group from which the member nodes receive the replication
+changes. Once you create the subgroup, all nodes that intend to become
+`subscriber-only` nodes must join the subgroup. You can create more than one
+subgroup of `subscriber-only` type, and they can have
+different parent groups.
+
+Once a node successfully joins the `subscriber-only` subgroup, it
+becomes a `subscriber-only` node and starts receiving replication changes
+for the parent group. Any changes made directly on the `subscriber-only`
+node aren't replicated.
+
+See `bdr.create_node_group()` to learn how to create a subgroup of a
+specific type belonging to a specific parent group.
+
+#### Notes
+
+Since a `subscriber-only` node doesn't replicate changes to any node in
+the cluster, it can't act as a source for syncing replication changes
+when a node is parted from the cluster. But if the `subscriber-only`
+node already received and applied replication changes from the
+parted node that no other node in the cluster currently has, then that
+causes inconsistency between the nodes.
+
+For now, you can solve this by setting `bdr.standby_slot_names`
+and `bdr.standby_slots_min_confirmed` so that there
+is always a fully active BDR node that is ahead of the `subscriber-only`
+nodes.
+
+This might be improved in a future release. We might either allow
+`subscriber-only` nodes to be ahead in the replication and then use them
+as a replication source for sync, or simply provide ways to optionally
+remove the inconsistent `subscriber-only` nodes from the cluster when
+another fully joined node is parted.
+
+## Decoding worker
+
+BDR4 provides an option to enable a decoding worker process that performs
+decoding once, no matter how many nodes are sent data. This introduces a
+new process, the WAL decoder, on each BDR node. One WAL sender process still
+exists for each connection, but these processes now just perform the task of
+sending and receiving data. Taken together, these changes reduce the CPU
+overhead of larger BDR groups and also allow higher replication throughput
+since the WAL sender process now spends more time on communication.
+
+`enable_wal_decoder` is an option for each BDR group, which is currently
+disabled by default. You can use `bdr.alter_node_group_config()` to enable or
+disable the decoding worker for a BDR group.
+
+When the decoding worker is enabled, BDR stores logical change record (LCR)
+files to allow buffering of changes between decoding and the point when all
+subscribing nodes have received the data. LCR files are stored under the
+`pg_logical` directory in each local node's data directory. The number and
+size of the LCR files varies as replication lag increases, so this also
+needs monitoring. The LCRs that aren't required by any of the BDR nodes are cleaned
+periodically. The interval between two consecutive cleanups is controlled by
+`bdr.lcr_cleanup_interval`, which defaults to 3 minutes. The cleanup is
+disabled when `bdr.lcr_cleanup_interval` is zero.
+
+When disabled, logical decoding is performed by the WAL sender process for each
+node subscribing to each node. In this case, no LCR files are written.
+
+Even when the decoding worker is enabled for a BDR group, the following
+GUCs control the production and use of LCRs per node. By default
+these are `false`. For production and use of LCRs, enable the
+decoding worker for the BDR group and set these GUCs to `true` on each of the nodes in the BDR group.
+ +- `bdr.enable_wal_decoder` — When turned `false`, all WAL + senders using LCRs restart to use WAL directly. When `true` + along with the BDR group config, a decoding worker process is + started to produce LCR and WAL Senders use LCR. +- `bdr.receive_lcr` — When `true` on the subscribing node, it requests WAL + sender on the publisher node to use LCRs if available. + +### Notes + +As of now, a decoding worker decodes changes corresponding to the node where it's +running. A logical standby is sent changes from all the nodes in the BDR group +through a single source. Hence a WAL sender serving a logical standby can't +use LCRs right now. + +A subscriber-only node receives changes from respective nodes directly. Hence +a WAL sender serving a subscriber-only node can use LCRs. + +Even though LCRs are produced, the corresponding WALs are still retained similar +to the case when a decoding worker isn't enabled. In the future, it might be possible +to remove WAL corresponding the LCRs, if they aren't otherwise required. + +For reference, the first 24 characters of an LCR file name are similar to those +in a WAL file name. The first 8 characters of the name are all '0' right now. +In the future, they are expected to represent the TimeLineId similar to the first 8 +characters of a WAL segment file name. The following sequence of 16 characters +of the name is similar to the WAL segment number, which is used to track LCR +changes against the WAL stream. + +However, logical changes are +reordered according to the commit order of the transactions they belong to. +Hence their placement in the LCR segments doesn't match the placement of +corresponding WAL in the WAL segments. + +The set of last 16 characters represents the +subsegment number in an LCR segment. Each LCR file corresponds to a +subsegment. LCR files are binary and variable sized. The maximum size of an +LCR file can be controlled by `bdr.max_lcr_segment_file_size`, which +defaults to 1 GB. + +## Node restart and down node recovery + +BDR is designed to recover from node restart or node disconnection. +The disconnected node rejoins the group by reconnecting +to each peer node and then replicating any missing data from that node. + +When a node starts up, each connection begins showing +`bdr.node_slots.state = catchup` and begins replicating missing data. +Catching up continues for a period of time that depends on the +amount of missing data from each peer node and will likely increase +over time, depending on the server workload. + +If the amount of write activity on each node isn't uniform, the catchup period +from nodes with more data can take significantly longer than other nodes. +Eventually, the slot state changes to `bdr.node_slots.state = streaming`. + +Nodes that are offline for longer periods, such as hours or days, +can begin to cause resource issues for various reasons. Don't plan +on extended outages without understanding the following issues. + +Each node retains change information (using one +[replication slot](https://www.postgresql.org/docs/current/logicaldecoding-explanation.html) +for each peer node) so it can later replay changes to a temporarily unreachable node. 
+
+If a peer node remains offline indefinitely, this accumulated change information
+eventually causes the node to run out of storage space for PostgreSQL
+transaction logs (*WAL* in `pg_wal`), and likely causes the database server
+to shut down with an error similar to this:
+
+```
+PANIC: could not write to file "pg_wal/xlogtemp.559": No space left on device
+```
+
+Or, it might report other out-of-disk related symptoms.
+
+In addition, slots for offline nodes also hold back the catalog xmin, preventing
+vacuuming of catalog tables.
+
+On EDB Postgres Extended Server and EDB Postgres Advanced Server, offline nodes
+also hold back freezing of data to prevent losing conflict-resolution data
+(see [Origin conflict detection](consistency/conflicts)).
+
+Administrators must monitor for node outages (see [monitoring](monitoring/))
+and make sure nodes have enough free disk space. If the workload is
+predictable, you might be able to calculate how much space is used over time,
+allowing a prediction of the maximum time a node can be down before critical
+issues arise.
+
+Don't manually remove replication slots created by BDR. If you do, the cluster
+becomes damaged and the node that was using the
+slot must be parted from the cluster, as described in [Replication slots created by BDR](#replication-slots-created-by-bdr).
+
+While a node is offline, the other nodes might not yet have received
+the same set of data from the offline node, so this might appear as a slight
+divergence across nodes. The parting process corrects this imbalance across nodes.
+(Later versions might do this earlier.)
+
+### Replication slots created by BDR
+
+On a BDR master node, the following replication slots are
+created by BDR:
+
+- One *group slot*, named `bdr_<dbname>_<groupname>`
+- N-1 *node slots*, named `bdr_<dbname>_<groupname>_<nodename>`, where N is the total number of BDR nodes in the cluster,
+  including direct logical standbys, if any
+
+!!! Warning
+    Don't drop those slots. BDR creates and manages them and drops them when or if necessary.
+
+On the other hand, you can create or drop replication slots required by software like Barman
+or logical replication using the appropriate commands
+for the software without any effect on BDR.
+Don't start slot names used by other software with the
+prefix `bdr_`.
+
+For example, in a cluster composed of the three nodes `alpha`, `beta`, and
+`gamma`, where BDR is used to replicate the `mydb` database and the
+BDR group is called `mygroup`:
+
+- Node `alpha` has three slots:
+  - One group slot named `bdr_mydb_mygroup`
+  - Two node slots named `bdr_mydb_mygroup_beta` and
+    `bdr_mydb_mygroup_gamma`
+- Node `beta` has three slots:
+  - One group slot named `bdr_mydb_mygroup`
+  - Two node slots named `bdr_mydb_mygroup_alpha` and
+    `bdr_mydb_mygroup_gamma`
+- Node `gamma` has three slots:
+  - One group slot named `bdr_mydb_mygroup`
+  - Two node slots named `bdr_mydb_mygroup_alpha` and
+    `bdr_mydb_mygroup_beta`
+
+#### Group replication slot
+
+The group slot is an internal slot used by BDR primarily to track the
+oldest safe position that any node in the BDR group (including all logical
+standbys) has caught up to, for any outbound replication from this node.
+
+The group slot name is given by the function `bdr.local_group_slot_name()`.
+
+The group slot can:
+
+- Join new nodes to the BDR group without having all existing nodes
+  up and running (although the majority of nodes should be up), without
+  incurring data loss in case the node that was down during join starts
+  replicating again.
+- Part nodes from the cluster consistently, even if some nodes haven't
+  caught up fully with the parted node.
+- Hold back the freeze point to avoid missing some conflicts.
+- Keep the historical snapshot for timestamp-based snapshots.
+
+The group slot is usually inactive and is fast forwarded only periodically
+in response to Raft progress messages from other nodes.
+
+!!! Warning
+    Don't drop the group slot. Although usually inactive, it's
+    still vital to the proper operation of the EDB Postgres Distributed cluster. If you drop it,
+    then some or all of the features can stop working or have
+    incorrect outcomes.
+
+### Hashing long identifiers
+
+The name of a replication slot, like any other PostgreSQL
+identifier, can't be longer than 63 bytes. BDR handles this by
+shortening the database name, the BDR group name, and the name of the
+node in case the resulting slot name is too long for that limit.
+Shortening an identifier is carried out by replacing the final section
+of the string with a hash of the string itself.
+
+For example, consider a cluster that replicates a database
+named `db20xxxxxxxxxxxxxxxx` (20 bytes long) using a BDR group named
+`group20xxxxxxxxxxxxx` (20 bytes long). The logical replication slot
+associated to node `a30xxxxxxxxxxxxxxxxxxxxxxxxxxx` (30 bytes long)
+is called:
+
+```
+bdr_db20xxxx3597186_group20xbe9cbd0_a30xxxxxxxxxxxxx7f304a2
+```
+
+since `3597186`, `be9cbd0`, and `7f304a2` are respectively the hashes
+of `db20xxxxxxxxxxxxxxxx`, `group20xxxxxxxxxxxxx`, and
+`a30xxxxxxxxxxxxxxxxxxxxxxxxxxx`.
+
+## Removing a node from a BDR group
+
+Since BDR is designed to recover from extended node outages, you
+must explicitly tell the system if you're removing a node
+permanently. If you permanently shut down a node and don't tell
+the other nodes, then performance suffers and eventually
+the whole system stops working.
+
+Node removal, also called *parting*, is done using the `bdr.part_node()`
+function. You must specify the node name (as passed during node creation)
+to remove a node. You can call the `bdr.part_node()` function from any active
+node in the BDR group, including the node that you're removing.
+
+Just like the join procedure, parting is done using Raft consensus and requires a
+majority of nodes to be online to work.
+
+The parting process affects all nodes. The Raft leader manages a vote
+between nodes to see which node has the most recent data from the parting node.
+Then all remaining nodes make a secondary, temporary connection to the
+most-recent node to allow them to catch up any missing data.
+
+A parted node is still known to BDR but won't consume resources. A
+node might be added again under the same name as a parted node.
+In rare cases, you might want to clear all metadata of a parted
+node by using the function `bdr.drop_node()`.
+
+### Uninstalling BDR
+
+Dropping the BDR extension removes all the BDR objects in a node,
+including metadata tables. You can do this with the following
+command:
+
+```sql
+DROP EXTENSION bdr;
+```
+
+If the database depends on some BDR-specific objects, then you can't drop the BDR
+extension. Examples include:
+
+- Tables using BDR-specific sequences such as `SnowflakeId` or `galloc`
+- Columns using CRDT data types
+- Views that depend on some BDR catalog tables
+
+Remove those dependencies before dropping the BDR extension.
+For example, drop the dependent objects, alter the column
+type to a non-BDR equivalent, or change the sequence type back to
+`local`.
+
+!!! 
Warning + You can drop the BDR extension only if the node was + successfully parted from its BDR node group or if it's the last + node in the group. Dropping BDR metadata breaks replication to and from the other nodes. + +!!! Warning + When dropping a local BDR node or the BDR extension in the local + database, any preexisting session might still try to execute a BDR-specific workflow + and therefore fail. You can solve the problem + by disconnecting the session and then reconnecting the client or + by restarting the instance. + +There's also a `bdr.drop_node()` function. Use this function only in +emergencies, such as if there's a problem with parting. + +## Listing BDR topology + +### Listing BDR groups + +The following simple query lists all the BDR node groups of which +the current node is a member. It currently returns only one row. + +```sql +SELECT node_group_name +FROM bdr.local_node_summary; +``` + +You can display the configuration of each node group using a more +complex query: + +```sql +SELECT g.node_group_name +, ns.pub_repsets +, ns.sub_repsets +, g.node_group_default_repset AS default_repset +, node_group_check_constraints AS check_constraints +FROM bdr.local_node_summary ns +JOIN bdr.node_group g USING (node_group_name); +``` + +### Listing nodes in a BDR group + +You can extract the list of all nodes in a given node group (such as `mygroup`) +from the `bdr.node_summary` view as shown in the following +example: + +```sql +SELECT node_name AS name +, node_seq_id AS ord +, peer_state_name AS current_state +, peer_target_state_name AS target_state +, interface_connstr AS dsn +FROM bdr.node_summary +WHERE node_group_name = 'mygroup'; +``` + +The read-only state of a node, as shown in the +`current_state` or in the `target_state` query columns, is indicated +as `STANDBY`. + +### List of node states + +- `NONE`: Node state is unset when the worker starts, expected to be set quickly + to the current known state. +- `CREATED`: `bdr.create_node()` was executed, but the node isn't a + member of any EDB Postgres Distributed cluster yet. +- `JOIN_START`: `bdr.join_node_group()` begins to join the local node to an + existing EDB Postgres Distributed cluster. +- `JOINING`: The node join has started and is currently at the initial sync phase, + creating the schema and data on the node. +- `CATCHUP`: Initial sync phase is completed. Now the join is at the last step + of retrieving and applying transactions that were performed on the upstream + peer node since the join started. +- `STANDBY`: Node join finished, but hasn't yet started to broadcast changes. + All joins spend some time in this state, but if defined as a logical standby, + the node continues in this state. +- `PROMOTE`: Node was a logical standby and we just called `bdr.promote_node` to + move the node state to `ACTIVE`. These two `PROMOTE`states have to be coherent + to the fact that only one node can be with a state higher than `STANDBY` but + lower than `ACTIVE`. +- `PROMOTING`: Promotion from logical standby to full BDR node is in progress. +- `ACTIVE`: The node is a full BDR node and is currently `ACTIVE`. This is the + most common node status. +- `PART_START`: Node was `ACTIVE` or `STANDBY` and we just called `bdr.part_node` + to remove the node from the EDB Postgres Distributed cluster. +- `PARTING`: Node disconnects from other nodes and plays no further part in + consensus or replication. +- `PART_CATCHUP`: Nonparting nodes synchronize any missing data from the + recently parted node. 
+- `PARTED`: Node parting operation is now complete on all nodes. + +Only one node at a time can be in either of the states PROMOTE or PROMOTING. + +## Node management interfaces + +You can add and remove nodes dynamically using the SQL interfaces. + +### bdr.create_node + +This function creates a node. + +#### Synopsis + +```sql +bdr.create_node(node_name text, local_dsn text) +``` + +#### Parameters + +- `node_name` — Name of the new node. Only one node is allowed per + database. Valid node names consist of lowercase letters, numbers, + hyphens, and underscores. +- `local_dsn` — Connection string to the node. + +#### Notes + +This function creates a record for the local node with the associated +public connection string. There can be only one local record, so once it's +created, the function reports an error if run again. + +This function is a transactional function. You can roll it back and the +changes made by it are visible to the current transaction. + +The function holds lock on the newly created bdr node until the end of +the transaction. + +### bdr.drop_node + +Drops a node. + +!!! Warning + This function isn't intended for regular use. Execute it only + if instructed by Technical Support. + +This function removes the metadata for a given node from the local +database. The node can be either: + +- The local node, in which case all the node metadata is removed, + including information about remote nodes. +- A remote node, in which case only metadata for that specific + node is removed. + +#### Synopsis + +```sql +bdr.drop_node(node_name text, cascade boolean DEFAULT false, force boolean DEFAULT false) +``` + +#### Parameters + +- `node_name` — Name of an existing node. +- `cascade` — Deprecated, will be removed in the future. +- `force` — Circumvents all sanity checks and forces the removal of + all metadata for the given BDR node despite a possible danger of + causing inconsistencies. Only Technical Support uses a forced node drop + in case of emergencies related to + parting. + +#### Notes + +Before you run this, part the node using `bdr.part_node()`. + +This function removes metadata for a given node from the local database. The node +can be the local node, in which case all the node metadata are removed, +including information about remote nodes. Or it can be the remote node, in +which case only metadata for that specific node is removed. + +!!! Note + BDR4 can have a maximum of 1024 node records (both ACTIVE and PARTED) + at one time because each node has a unique sequence number + assigned to it, for use by snowflakeid and timeshard sequences. + PARTED nodes aren't automatically cleaned up. If this + becomes a problem, you can use this function to remove those records. + +### bdr.create_node_group + +This function creates a BDR group with the local node as the only member of the group. + +#### Synopsis + +```sql +bdr.create_node_group(node_group_name text, + parent_group_name text DEFAULT NULL, + join_node_group boolean DEFAULT true, + node_group_type text DEFAULT NULL) +``` + +#### Parameters + +- `node_group_name` — Name of the new BDR group. As with the node + name, valid group names must consist of only lowercase letters, numbers, + and underscores. +- `parent_group_name` — The name of the parent group for the subgroup. +- `join_node_group` — This parameter helps a node to decide whether to join + the group being created by it. The default value is `true`. This is used + when a node is creating a shard group that it doesn't want to join. 
+  This can be `false` only if you specify `parent_group_name`.
+- `node_group_type` — The valid values are `NULL`, `subscriber-only`, `datanode`,
+  `read coordinator`, and `write coordinator`. `subscriber-only` type is
+  used to create a group of nodes that receive changes only from the
+  fully joined nodes in the cluster, but they never send replication
+  changes to other nodes. See [Subscriber-only nodes](#subscriber-only-nodes) for more details.
+  `Datanode` implies that the group represents a shard, whereas the other
+  values imply that the group represents respective coordinators.
+  Except `subscriber-only`, the other values are reserved for future use.
+  `NULL` implies a normal general-purpose node group is created.
+
+#### Notes
+
+This function passes a request to the local consensus worker that's running for
+the local node.
+
+The function isn't transactional. The creation of the group is a background
+process, so once the function finishes, you can't roll back the changes.
+Also, the changes might not be immediately visible to the current transaction.
+You can call `bdr.wait_for_join_completion` to wait until they are.
+
+The group creation doesn't hold any locks.
+
+### bdr.alter_node_group_config
+
+This function changes the configuration parameters of an existing BDR group.
+Options with NULL value (default for all of them) aren't modified.
+
+#### Synopsis
+
+```sql
+bdr.alter_node_group_config(node_group_name text,
+    insert_to_update boolean DEFAULT NULL,
+    update_to_insert boolean DEFAULT NULL,
+    ignore_redundant_updates boolean DEFAULT NULL,
+    check_full_tuple boolean DEFAULT NULL,
+    apply_delay interval DEFAULT NULL,
+    check_constraints boolean DEFAULT NULL,
+    num_writers int DEFAULT NULL,
+    enable_wal_decoder boolean DEFAULT NULL,
+    streaming_mode text DEFAULT NULL,
+    default_commit_scope text DEFAULT NULL)
+```
+
+#### Parameters
+
+- `node_group_name` — Name of an existing BDR group. The local node must be part
+  of the group.
+- `insert_to_update` — Reserved for backward compatibility.
+- `update_to_insert` — Reserved for backward compatibility. This option is
+  deprecated and will be disabled or removed in future
+  versions of BDR. Use `bdr.alter_node_set_conflict_resolver` instead.
+- `ignore_redundant_updates` — Reserved for backward compatibility.
+- `check_full_tuple` — Reserved for backward compatibility.
+- `apply_delay` — Reserved for backward compatibility.
+- `check_constraints` — Whether the apply process checks the constraints
+  when writing replicated data.
+  This option is deprecated and will be disabled or removed in future
+  versions of BDR.
+- `num_writers` — Number of parallel writers for subscription backing
+  this node group. -1 means the default (as specified by the
+  GUC `bdr.writers_per_subscription`) is used. Valid values
+  are either -1 or a positive integer.
+- `enable_wal_decoder` — Enables/disables the decoding worker process.
+  You can't enable the decoding worker process if `streaming_mode` is
+  already enabled.
+- `streaming_mode` — Enables/disables streaming of large transactions.
+  When set to `off`, streaming is disabled. When set to any other value,
+  large transactions are decoded while they're still in progress, and the
+  changes are sent to the downstream. If the value is set to `file`,
+  then the incoming changes of streaming transactions are stored in a file
+  and applied only after the transaction is committed on upstream. If the
+  value is set to `writer`, then the incoming changes are directly sent to
+  one of the writers, if available.
If parallel apply is disabled or no + writer is free to handle streaming transaction, then the changes are + written to a file and applied after the transaction is committed. If the + value is set to `auto`, BDR tries to intelligently pick between + `file` and `writer`, depending on the transaction property and available + resources. You can't enable `streaming_mode` if the WAL + decoder is already enabled. + + For more details, see [Transaction streaming](transaction-streaming). + +- `default_commit_scope` — The commit scope to use by default, + initially the `local` commit scope. This applies only to the + top-level node group. You can use individual rules for different + origin groups of the same commit scope. See + [Origin groups](durability/group-commit/#origin-groups) for more details. + +#### Notes + +This function passes a request to the group consensus mechanism to change +the defaults. The changes made are replicated globally using the consensus +mechanism. + +The function isn't transactional. The request is processed in the background +so you can't roll back the function call. Also, the changes might not be +immediately visible to the current transaction. + +This function doesn't hold any locks. + +!!! Warning + When you use this function to change the `apply_delay` value, the + change doesn't apply to nodes that are already members of the + group. + This restriction has little consequence on production + use because this value normally isn't used outside of testing. + +### bdr.join_node_group + +This function joins the local node to an already existing BDR group. + +#### Synopsis + +```sql +bdr.join_node_group ( + join_target_dsn text, + node_group_name text DEFAULT NULL, + pause_in_standby boolean DEFAULT false, + wait_for_completion boolean DEFAULT true, + synchronize_structure text DEFAULT 'all' +) +``` + +#### Parameters + +- `join_target_dsn` — Specifies the connection string to an existing (source) node + in the BDR group you want to add the local node to. +- `node_group_name` — Optional name of the BDR group. Defaults to NULL, which + tries to detect the group name from information present on the source + node. +- `pause_in_standby` — Optionally tells the join process to join only as a + logical standby node, which can be later promoted to a full member. +- `wait_for_completion` — Wait for the join process to complete before + returning. Defaults to `true`. +- `synchronize_structure` — Set the kind of structure (schema) synchronization + to do during the join. Valid options are `all`, which synchronizes + the complete database structure, and `none`, which doesn't synchronize any + structure. However, it still synchronizes data. + +If `wait_for_completion` is specified as `false`, +this is an asynchronous call that returns as soon as the joining procedure starts. +You can see progress of the join in logs and the +`bdr.event_summary` information view or by calling the +`bdr.wait_for_join_completion()` function after `bdr.join_node_group()` returns. + +#### Notes + +This function passes a request to the group consensus mechanism by way of the node +that the `join_target_dsn` connection string points to. +The changes made are replicated globally by the consensus mechanism. + +The function isn't transactional. The joining process happens in the +background and you can't roll it back. The changes are visible only +to the local transaction if `wait_for_completion` was set to `true` or by calling +`bdr.wait_for_join_completion` later. 
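+
+For example, a sketch of the asynchronous pattern described above, with a
+placeholder DSN:
+
+```sql
+-- Start the join without blocking
+SELECT bdr.join_node_group(
+    join_target_dsn     := 'host=node1 port=5432 dbname=bdrdb',
+    wait_for_completion := false
+);
+
+-- Later, wait for the join to finish, printing per-step progress
+SELECT bdr.wait_for_join_completion(verbose_progress := true);
+```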
+ +Node can be part of only a single group, so you can call this function only once +on each node. + +Node join doesn't hold any locks in the BDR group. + +### bdr.switch_node_group + +This function switches the local node from its current subgroup to another subgroup within the same existing BDR node group. + +#### Synopsis + +```sql +bdr.switch_node_group ( + node_group_name text, + wait_for_completion boolean DEFAULT true +) +``` + +#### Parameters + +- `node_group_name` — Name of the BDR group or subgroup. +- `wait_for_completion` — Wait for the switch process to complete before + returning. Defaults to `true`. + +If `wait_for_completion` is specified as `false`, +this is an asynchronous call that returns as soon as the switching procedure starts. +You can see progress of the switch in logs and the +`bdr.event_summary` information view or by calling the +`bdr.wait_for_join_completion()` function after `bdr.switch_node_group()` returns. + +#### Notes + +This function passes a request to the group consensus mechanism. +The changes made are replicated globally by the consensus mechanism. + +The function isn't transactional. The switching process happens in the +background and you can't roll it back. The changes are visible only +to the local transaction if `wait_for_completion` was set to `true` or by calling +`bdr.wait_for_join_completion` later. + +The local node changes membership from its current subgroup to another subgroup within the same BDR node group without needing to part the cluster. The node's kind must match that of existing nodes within the target subgroup. + +Node switching doesn't hold any locks in the BDR group. + +Restrictions: Currently, the function only allows switching between a subgroup and its BDR node group. To effect a move between subgroups it is necessary to make two separate calls: 1) switch from subgroup to node group and, 2) switch from node group to other subgroup. + + +### bdr.promote_node + +This function promotes a local logical standby node to a full member of the BDR group. + +#### Synopsis + +```sql +bdr.promote_node(wait_for_completion boolean DEFAULT true) +``` + +#### Notes + +This function passes a request to the group consensus mechanism to change +the defaults. The changes made are replicated globally by the consensus +mechanism. + +The function isn't transactional. The promotion process happens in the +background, and you can't roll it back. The changes are visible only +to the local transaction if `wait_for_completion` was set to `true` or by calling +`bdr.wait_for_join_completion` later. + +The promotion process holds lock against other promotions. This lock doesn't +block other `bdr.promote_node` calls but prevents the background process of +promotion from moving forward on more than one node at a time. + +### bdr.wait_for_join_completion + +This function waits for the join procedure of a local node to finish. + +#### Synopsis + +```sql +bdr.wait_for_join_completion(verbose_progress boolean DEFAULT false) +``` + +#### Parameters + +- `verbose_progress` — Optionally prints information about individual steps + taken during the join procedure. + +#### Notes + +This function waits until the checks state of the local node reaches the target +state, which was set by `bdr.create_node_group`, `bdr.join_node_group`, or +`bdr.promote_node`. + +### bdr.part_node + +Removes (parts) the node from the BDR group but doesn't remove data +from the node. 
+ +You can call the function from any active node in the BDR group, including +the node that you're removing. However, once the +node is parted, it can't part other nodes in the cluster. + +!!! Note + If you're parting the local node, you must set `wait_for_completion` + to `false`. Otherwise, it reports an error. + +!!! Warning + This action is permanent. If you want to temporarily halt replication + to a node, see `bdr.alter_subscription_disable()`. + +#### Synopsis + +```sql +bdr.part_node ( + node_name text, + wait_for_completion boolean DEFAULT true, + force boolean DEFAULT false +) +``` + +#### Parameters + +- `node_name` — Name of an existing node to part. +- `wait_for_completion` — If `true`, the function doesn't return until the + node is fully parted from the cluster. Otherwise the function + starts the parting procedure and returns immediately without waiting. + Always set to `false` when executing on the local node or when using `force`. +- `force` — Forces removal of the node on the local node. This sets the + node state locally if consensus can't be reached or if the node parting + process is stuck. + +!!! Warning + Using `force = true` can leave the BDR group in an inconsistent + state. Use it only to recover from failures in which you can't + remove the node any other way. + +#### Notes + +This function passes a request to the group consensus mechanism to part +the given node. The changes made are replicated globally by the consensus +mechanism. The parting process happens in the background, and you can't +roll it back. The changes made by the parting process are visible only to +the local transaction if `wait_for_completion` was set to `true`. + +With `force` set to `true`, on consensus failure, this function sets the +state of the given node only on the local node. In such a case, the function is +transactional (because the function changes the node state) and you can +roll it back. If the function is called on a node that is already in process of +parting with `force` set to `true`, it also marks the given node as +parted locally and exits. This is useful only when the consensus can't be +reached on the cluster (that is, the majority of the nodes are down) or if the +parting process is stuck. But it's important to take into +account that when the parting node that was receiving writes, the parting process +can take a long time without being stuck. The other nodes need to resynchronize +any missing data from the given node. The force parting completely skips this +resynchronization and can leave the other nodes in an inconsistent state. + +The parting process doesn't hold any locks. + +### bdr.alter_node_interface + +This function changes the connection string (`DSN`) of a specified node. + +#### Synopsis + +```sql +bdr.alter_node_interface(node_name text, interface_dsn text) +``` + +#### Parameters + +- `node_name` — Name of an existing node to alter. +- `interface_dsn` — New connection string for a node. + +#### Notes + +Run this function and make the changes only on the local node. This means that you normally execute it on every node in the +BDR group, including the node that is being changed. + +This function is transactional. You can roll it back, and the changes are +visible to the current transaction. + +The function holds lock on the local node. + +### bdr.alter_subscription_enable + +This function enables either the specified subscription or all the subscriptions of the +local BDR node. This is also known as resume subscription. 
+No error is thrown if the subscription is already enabled. +Returns the number of subscriptions affected by this operation. + +#### Synopsis + +```sql +bdr.alter_subscription_enable( + subscription_name name DEFAULT NULL, + immediate boolean DEFAULT false +) +``` + +#### Parameters + +- `subscription_name` — Name of the subscription to enable. If NULL + (the default), all subscriptions on the local node are enabled. +- `immediate` — This currently has no effect. + +#### Notes + +This function isn't replicated and affects only local node subscriptions +(either a specific node or all nodes). + +This function is transactional. You can roll it back, and the current transaction can see any catalog changes. The subscription workers are started +by a background process after the transaction has committed. + +### bdr.alter_subscription_disable + +This function disables either the specified subscription or all the +subscriptions of the local BDR node. Optionally, it can also immediately stop +all the workers associated with the disabled subscriptions. This is also known as pause +subscription. No error is thrown if the subscription is already disabled. +Returns the number of subscriptions affected by this operation. + +#### Synopsis + +```sql +bdr.alter_subscription_disable( + subscription_name name DEFAULT NULL, + immediate boolean DEFAULT false, + fast boolean DEFAULT true +) +``` + +#### Parameters + +- `subscription_name` — Name of the subscription to disable. If NULL + (the default), all subscriptions on the local node are disabled. +- `immediate` — Used to force the action immediately, stopping + all the workers associated with the disabled subscription. When this option is + `true`, you can't run this function inside of the transaction block. +- `fast` — This argument influences the behavior of `immediate`. + If set to `true` (the default) it stops all the workers associated with the + disabled subscription without waiting for them to finish current work. + +#### Notes + +This function isn't replicated and affects only local node subscriptions +(either a specific subscription or all subscriptions). + +This function is transactional. You can roll it back, and the current transaction can see any catalog changes. +However, the timing of the subscription +worker stopping depends on the value of `immediate`. If set to `true`, the +workers receive the stop without waiting for the `COMMIT`. If the `fast` +argument is set to `true`, the interruption of the workers doesn't wait for +current work to finish. + +## Node-management commands + +BDR also provides a command-line utility for adding nodes to the BDR group using +physical copy (`pg_basebackup`) of an existing node and for converting a +physical standby of an existing node to a new node in the BDR group. + +### bdr_init_physical + +This is a regular command that's added to PostgreSQL's bin directory. + +You must specify a data directory. If this data directory is empty, +use the `pg_basebackup -X stream` to fill the directory +using a fast block-level copy operation. + +If the specified data directory isn't empty, this is used as the +base for the new node. If the data directory is already active as a +physical standby node, you need to stop the standby before running +`bdr_init_physical`, which manages Postgres. Initially it +waits for catchup and then promotes to a master node before joining the BDR +group. The `--standby` option, if used, turns the existing +physical standby into a logical standby node. 
It refers to the end state +of the new BDR node, not the starting state of the specified data directory. + +This command drops all PostgreSQL-native logical replication +subscriptions from the database (or disables them when the `-S` option is +used) as well as any replication origins and slots. + +#### Synopsis + +```shell +bdr_init_physical [OPTION] ... +``` + +#### Options + +##### General options + +- `-D, --pgdata=DIRECTORY` — The data directory to use for the new node. It + can be either an empty or nonexistent directory or a directory populated using the + `pg_basebackup -X stream` command (required). +- `-l, --log-file=FILE` — Use FILE for logging. The default is + `bdr_init_physical_postgres.log`. +- `-n, --node-name=NAME` — The name of the newly created node (required). +- `--replication-sets=SETS` — The name of a comma-separated list of replication + set names to use. All replication sets are used if not specified. +- `--standby` — Create a logical standby (receive-only node) rather than full + send/receive node. +- `--node-group-name` — Group to join. Defaults to the same group as source node. +- `-s, --stop` — Stop the server once the initialization is done. +- `-v` — Increase logging verbosity. +- `-L` — Perform selective pg_basebackup when used with an + empty/nonexistent data directory (-D option). This is a feature of + EDB Postgres Extended Server only. +- `-S` — Instead of dropping logical replication subscriptions, disable + them. + +##### Connection options + +- `-d, --remote-dsn=CONNSTR` — Connection string for remote node (required). +- `--local-dsn=CONNSTR` — Connection string for local node (required). + +##### Configuration files override + +- `--hba-conf` — Path to the new `pg_hba.conf`. +- `--postgresql-conf` — Path to the new `postgresql.conf`. +- `--postgresql-auto-conf` — Path to the new `postgresql.auto.conf`. + +#### Notes + +The replication set names specified in the command don't affect the data that +exists in the data directory before the node joins the BDR group. This is true +whether `bdr_init_physical` makes its own base backup or an existing base backup +is being promoted to a new BDR node. Thus the `--replication-sets` option +affects only the data published and subscribed to after the node joins the BDR node +group. This behavior is different from the way replication sets are used in a +logical join, as when using `bdr.join_node_group()`. + +The operator can truncate unwanted tables after the join completes. +Refer to the `bdr.tables` catalog to determine replication set membership and +identify tables that aren't members of any subscribed-to replication set. We +strongly recommend that you truncate the tables rather than drop them, because: + +- DDL replication sets aren't necessarily the same as row (DML) replication + sets, so you might inadvertently drop the table on other nodes. +- If you later want to add the table to a replication set and you dropped + it on some subset of nodes, you need to re-create it only + on those nodes without creating DDL conflicts before you can add it to + any replication sets. + +It's simpler and safer to truncate your nonreplicated tables, leaving them +present but empty. + +A future version of BDR might automatically omit or remove tables that aren't +part of the selected replication sets for a physical join, so your application +should not rely on details of the behavior documented here. 
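+
+For illustration only, here's a minimal sketch of joining a new node by physical copy. The data directory, node name, host names, and database name are hypothetical placeholders; adjust them for your environment and add further options as needed.
+
+```shell
+# Sketch: create node "node3" from a fast physical copy of the node at "node1".
+# All host names, the database name, and the data directory are placeholders.
+bdr_init_physical \
+  --pgdata=/var/lib/pgdata/node3 \
+  --node-name=node3 \
+  --remote-dsn="host=node1 port=5432 dbname=bdrdb" \
+  --local-dsn="host=node3 port=5432 dbname=bdrdb"
+```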
diff --git a/product_docs/docs/pgd/5/other_considerations.mdx b/product_docs/docs/pgd/5/other_considerations.mdx new file mode 100644 index 00000000000..e3ac2c5c7a3 --- /dev/null +++ b/product_docs/docs/pgd/5/other_considerations.mdx @@ -0,0 +1,46 @@ +--- +title: "Other considerations" +--- + +Review these other considerations when planning your deployment. + +## Deployment and sizing considerations + +For production deployments, EDB recommends a minimum of 4 cores for each +Postgres data node. Witness nodes don't participate in the data replication +operation and don't have to meet this requirement. Always size logical standbys +exactly like the data nodes to avoid performance degradations in case of a node +promotion. In production deployments, PGD proxy nodes require minimum of 1 core, +and should increase incrementally in correlation with an increase in the number +of database cores in approximately a 1:10 ratio. EDB recommends detailed +benchmarking of your specific performance requirements to determine appropriate +sizing based on your workload. The EDB Professional Services team is available +to assist if needed. + +For development purposes, don't assign Postgres data nodes fewer than two cores. +The sizing of Barman nodes depends on the database size and the data change +rate. + +You can deploy Postgres data nodes, Barman nodes, and PGD proxy nodes on virtual +machines or in a bare metal deployment mode. However, don't deploy multiple data +nodes on VMs that are on the same physical hardware, as that reduces resiliency. +Also don't deploy multiple PGD proxy nodes on VMs on the same physical hardware, +as that, too, reduces resiliency. + +Single PGD Proxy nodes can be co-located with single PGD data nodes. + +## Clocks and timezones + +EDB Postgres Distributed has been designed to operate with nodes in multiple +timezones, allowing a truly worldwide database cluster. Individual servers do +not need to be configured with matching timezones, though we do recommend using +log_timezone = UTC to ensure the human readable server log is more accessible +and comparable. + +Server clocks should be synchronized using NTP or other solutions. + +Clock synchronization is not critical to performance, as is the case with some +other solutions. Clock skew can impact Origin Conflict Detection, though EDB +Postgres Distributed provides controls to report and manage any skew that +exists. EDB Postgres Distributed also provides Row Version Conflict Detection, +as described in [Conflict Detection](consistency/conflicts). 
diff --git a/product_docs/docs/pgd/5/overview/img/bdr.png b/product_docs/docs/pgd/5/overview/img/bdr.png new file mode 100644 index 00000000000..29635ad1030 --- /dev/null +++ b/product_docs/docs/pgd/5/overview/img/bdr.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:023200b99a4fbf8ba1a9ac375c98daf277eb876d399dff24bc97e173e49eb708 +size 57274 diff --git a/product_docs/docs/pgd/5/overview/img/frontpage.svg b/product_docs/docs/pgd/5/overview/img/frontpage.svg new file mode 100644 index 00000000000..1beb742e72f --- /dev/null +++ b/product_docs/docs/pgd/5/overview/img/frontpage.svg @@ -0,0 +1 @@ +geo-distributed \ No newline at end of file diff --git a/product_docs/docs/pgd/5/overview/img/nodes.png b/product_docs/docs/pgd/5/overview/img/nodes.png new file mode 100644 index 00000000000..7f969ed1e71 --- /dev/null +++ b/product_docs/docs/pgd/5/overview/img/nodes.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:264eccb0911c492ba60dccf3f9df14aa93119336b8845b1c772859bd7a031939 +size 45015 diff --git a/product_docs/docs/pgd/5/overview/img/nodes.svg b/product_docs/docs/pgd/5/overview/img/nodes.svg new file mode 100644 index 00000000000..b5ae1678cb1 --- /dev/null +++ b/product_docs/docs/pgd/5/overview/img/nodes.svg @@ -0,0 +1,13 @@ + + + + + + image/svg+xml + + + + + + + diff --git a/product_docs/docs/pgd/5/overview/index.mdx b/product_docs/docs/pgd/5/overview/index.mdx new file mode 100644 index 00000000000..a949553eb82 --- /dev/null +++ b/product_docs/docs/pgd/5/overview/index.mdx @@ -0,0 +1,252 @@ +--- +title: "Overview" +redirect: bdr +--- + +EDB Postgres Distributed (PGD) provides multi-master replication and data +distribution with advanced conflict management, data-loss protection, and +throughput up to 5X faster than native logical replication, and enables +distributed Postgres clusters with high availability up to five 9s. + +PGD provides loosely coupled, multi-master logical replication +using a mesh topology. This means that you can write to any server and the +changes are sent directly, row-by-row, to all the +other servers that are part of the same PGD group. + +By default, PGD uses asynchronous replication, applying changes on +the peer nodes only after the local commit. Multiple synchronous replication +options are also available. + +## Basic architecture + +### Multiple groups + +A PGD node is a member of at least one *node group*, and in the most +basic architecture there is a single node group for the whole PGD +cluster. + +### Multiple masters + +Each node (database) participating in a PGD group both receives +changes from other members and can be written to directly by the user. + +This is distinct from hot or warm standby, where only one master +server accepts writes, and all the other nodes are standbys that +replicate either from the master or from another standby. + +You don't have to write to all the masters all of the time. +A frequent configuration directs writes mostly to just one master. + +### Asynchronous, by default + +Changes made on one PGD node aren't replicated to other nodes until +they're committed locally. As a result, the data isn't exactly the +same on all nodes at any given time. Some nodes have data that +hasn't yet arrived at other nodes. PostgreSQL's block-based replication +solutions default to asynchronous replication as well. 
In PGD, +because there are multiple masters and, as a result, multiple data streams, +data on different nodes might differ even when +`synchronous_commit` and `synchronous_standby_names` are used. + +### Mesh topology + +PGD is structured around a mesh network where every node connects to every +other node and all nodes exchange data directly with each other. There's no +forwarding of data in PGD except in special circumstances such as adding and removing nodes. +Data can arrive from outside the EDB Postgres Distributed cluster or +be sent onwards using native PostgreSQL logical replication. + +### Logical replication + +Logical replication is a method of replicating data rows and their changes +based on their replication identity (usually a primary key). +We use the term *logical* in contrast to *physical* replication, which uses +exact block addresses and byte-by-byte replication. Index changes aren't +replicated, thereby avoiding write amplification and reducing bandwidth. + +Logical replication starts by copying a snapshot of the data from the +source node. Once that is done, later commits are sent to other nodes as +they occur in real time. Changes are replicated without re-executing SQL, +so the exact data written is replicated quickly and accurately. + +Nodes apply data in the order in which commits were made on the source node, +ensuring transactional consistency is guaranteed for the changes from +any single node. Changes from different nodes are applied independently of +other nodes to ensure the rapid replication of changes. + +Replicated data is sent in binary form, when it's safe to do so. + + +### Connection management + +[Connection management](../routing) leverages consensus-driven quorum to determine +the correct connection end-point in a semi-exclusive manner to prevent unintended +multi-node writes from an application. This reduces the potential for data conflicts. + +[PGD Proxy](../routing/proxy) is the tool for application connection management +provided as part of EDB Postgres Distributed. + +### High availability + +Each master node can be protected by one or more standby nodes, so any node +that goes down can be quickly replaced and continue. Each standby node can +be either a logical or a physical standby node. + +Replication continues between currently connected nodes even if one or more +nodes are currently unavailable. When the node recovers, replication +can restart from where it left off without missing any changes. + +Nodes can run different release levels, negotiating the required protocols +to communicate. As a result, EDB Postgres Distributed clusters can use rolling upgrades, even +for major versions of database software. + +DDL is replicated across nodes by default. DDL execution can +be user controlled to allow rolling application upgrades, if desired. + +## Architectural options and performance + +### Always On architectures + +A number of different architectures can be configured, each of which has +different performance and scalability characteristics. + +The group is the basic building block consisting of 2+ nodes +(servers). In a group, each node is in a different availability zone, with dedicated router +and backup, giving immediate switchover and high availability. Each group has a +dedicated replication set defined on it. If the group loses a node, you can easily +repair or replace it by copying an existing node from the group. 
+ +The Always On architectures are built from either one group in a single location +or two groups in two separate locations. Each group provides high availability. When two +groups are leveraged in remote locations, they together also provide disaster recovery (DR). + +Tables are created across both groups, so any change goes to all nodes, not just to +nodes in the local group. + +One node in each group is the target for the main application. All other nodes are described as +shadow nodes (or "read-write replica"), waiting to take over when needed. If a node +loses contact, we switch immediately to a shadow node to continue processing. If a +group fails, we can switch to the other group. Scalability isn't the goal of this +architecture. + +Since we write mainly to only one node, the possibility of contention between is +reduced to almost zero. As a result, performance impact is much reduced. + +Secondary applications might execute against the shadow nodes, although these are +reduced or interrupted if the main application begins using that node. + +In the future, one node will be elected as the main replicator to other groups, limiting CPU +overhead of replication as the cluster grows and minimizing the bandwidth to other groups. + +### Supported Postgres database servers + +PGD is compatible with [PostgreSQL](https://www.postgresql.org/), [EDB Postgres Extended Server](https://techsupport.enterprisedb.com/customer_portal/sw/2ndqpostgres/) and [EDB Postgres Advanced Server](/epas/latest) +and is deployed as a standard Postgres extension named BDR. See the [Compatibility matrix](../#compatibility-matrix) +for details of supported version combinations. + +Some key PGD features depend on certain core +capabilities being available in the targeted Postgres database server. +Therefore, PGD users must also adopt the Postgres +database server distribution that's best suited to their business needs. For +example, if having the PGD feature Commit At Most Once (CAMO) is mission +critical to your use case, don't adopt the community +PostgreSQL distribution because it doesn't have the core capability required to handle +CAMO. See the full feature matrix compatibility in +[Choosing a Postgres distribution](../choosing_server/). + +PGD offers close to native Postgres compatibility. However, some access +patterns don't necessarily work as well in multi-node setup as they do on a +single instance. There are also some limitations in what can be safely +replicated in multi-node setting. [Application usage](../appusage) +goes into detail on how PGD behaves from an application development perspective. + +### Characteristics affecting performance + +By default, PGD keeps one copy of each table on each node in the group, and any +changes propagate to all nodes in the group. + +Since copies of data are everywhere, SELECTs need only ever access the local node. +On a read-only cluster, performance on any one node isn't affected by the +number of nodes and is immune to replication conflicts on other nodes caused by +long-running SELECT queries. Thus, adding nodes increases linearly the total possible SELECT +throughput. + +If an INSERT, UPDATE, and DELETE (DML) is performed locally, then the changes +propagate to all nodes in the group. The overhead of DML apply is less than the +original execution, so if you run a pure write workload on multiple nodes +concurrently, a multi-node cluster can handle more TPS than a single node. + +Conflict handling has a cost that acts to reduce the throughput. 
The throughput +then depends on how much contention the application displays in practice. +Applications with very low contention perform better than a single node. +Applications with high contention can perform worse than a single node. +These results are consistent with any multi-master technology. They aren't particular to PGD. + +Synchronous replilcation options can send changes concurrently to multiple nodes +so that the replication lag is minimized. Adding more nodes means using more CPU for +replication, so peak TPS reduces slightly as each node is added. + +If the workload tries to use all CPU resources, then this resource constrains +replication, which can then affect the replication lag. + +In summary, adding more master nodes to a PGD group doesn't result in significant write +throughput increase when most tables are replicated because all the writes will +be replayed on all nodes. Because PGD writes are in general more effective +than writes coming from Postgres clients by way of SQL, some performance increase +can be achieved. Read throughput generally scales linearly with the number of +nodes. + +## Deployment + +PGD is intended to be deployed in one of a small number of known-good configurations, +using either [Trusted Postgres Architect](/tpa/latest) or a configuration management approach +and deployment architecture approved by Technical Support. + +Manual deployment isn't recommended and might not be supported. + +Log messages and documentation are currently available only in English. + +## Clocks and timezones + +PGD is designed to operate with nodes in multiple timezones, allowing a +truly worldwide database cluster. Individual servers don't need to be configured +with matching timezones, although we do recommend using `log_timezone = UTC` to +ensure the human-readable server log is more accessible and comparable. + +Synchronize server clocks using NTP or other solutions. + +Clock synchronization isn't critical to performance, as it is with some +other solutions. Clock skew can impact origin conflict detection, although +PGD provides controls to report and manage any skew that exists. PGD also +provides row-version conflict detection, as described in [Conflict detection](../consistency/conflicts). + + +## Limits + +PGD can run hundreds of nodes on good-enough hardware and network. However, +for mesh-based deployments, we generally don't recommend running more than +32 nodes in one cluster. +Each master node can be protected by multiple physical or logical standby nodes. +There's no specific limit on the number of standby nodes, +but typical usage is to have 2–3 standbys per master. Standby nodes don't +add connections to the mesh network, so they aren't included in the +32-node recommendation. + +PGD currently has a hard limit of no more than 1000 active nodes, as this is the +current maximum Raft connections allowed. + +Support for using EDB Postgres Distributed for multiple databases on the same +Postgres instance is deprecated beginning with EDB Postgres Distributed 5 and +will no longer be supported with EDB Postgres Distributed 6. As we extend the +capabilities of the product, the additional complexity introduced operationally +and functionally is no longer viable in a multi-database design. + +The minimum recommended number of nodes in a group is three to provide fault +tolerance for PGD's consensus mechanism. With just two nodes, consensus would +fail if one of the nodes was unresponsive. 
Consensus is required for some PGD +operations such as distributed sequence generation. For more information about +the consensus mechanism used by EDB Postgres Distributed, see +[Architectural details](../architectures/#architecture-details). + + + diff --git a/product_docs/docs/pgd/5/rel_notes/index.mdx b/product_docs/docs/pgd/5/rel_notes/index.mdx new file mode 100644 index 00000000000..405ffec39c3 --- /dev/null +++ b/product_docs/docs/pgd/5/rel_notes/index.mdx @@ -0,0 +1,16 @@ +--- +title: "EDB Postgres Distributed Release notes" +navTitle: "Release notes" +navigation: +- pgd_5.0.0_rel_notes +--- + +The EDB Postgres Distributed documentation describes the latest version of EDB +Postgres Distributed 5, including minor releases and patches. The release notes +provide information on what was new in each release. For new functionality +introduced in a minor or patch release, the content also indicates the release +that introduced the feature. + +| Release Date | EDB Postgres Distributed | BDR | +| ------------ | ---------------------------- | ----- | +| 2023 Feb 21 | [5.0.0](pgd_5.0.0_rel_notes) | 5.0.0 | diff --git a/product_docs/docs/pgd/5/rel_notes/pgd_5.0.0_rel_notes.mdx b/product_docs/docs/pgd/5/rel_notes/pgd_5.0.0_rel_notes.mdx new file mode 100644 index 00000000000..af88f90342c --- /dev/null +++ b/product_docs/docs/pgd/5/rel_notes/pgd_5.0.0_rel_notes.mdx @@ -0,0 +1,41 @@ +--- +title: "Release notes for EDB Postgres Distributed version 5.0.0" +navTitle: "Version 5.0.0" +--- + +EDB Postgres Distributed version 5.0.0 is a is a new major version of EDB Postgres Distributed. +This version brings major new features and compatibility changes. + +The highlights of this release include: + + * Flexible deployment architectures + * Enhanced routing capabilities + * Unified replication durability configuration + * Support for EDB Advanced Storage Pack + * Support for TDE with EDB Postgres Advanced 15 and EDB Postgres Extended 15 + * Integration with OpenTelemetry + * Improved transaction tracking performance (Group Commit, CAMO) + * Postgres 12 to 15 compatiblity + + +| Component | Version | Type | Description | +|-----------|---------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| PGD | 5.0.0 | Feature | Flexible Deployment Architectures

Redefined Always-ON to support a wider variety of deployments.

| +| BDR | 5.0.0 | Feature | Enhanced routing capabilities

The BDR cluster elects a write leader for every group (and associated location) using per-group Raft when routing is enabled for the group. It takes care of write-leader failover and provides SQL commands to change the write leader.

| +| BDR | 5.0.0 | Feature | Support for EDB Advanced Storage Pack

EDB Advanced Storage Pack provides advanced storage options for PostgreSQL databases in the form of table access method (TAM) extensions. These storage options can enhance the performance and reliability of databases without requiring application changes.

| +| BDR | 5.0.0 | Feature | Unified replication durability configuration

Durability options such as Group Commit, CAMO, Eager Replication, and Lag Control are now all configured through the Commit Scope configuration.

| +| BDR | 5.0.0 | Feature | EDB Postgres Advanced and EDB Postgres Extended TDE support

EDB Postgres Distributed 5 fully supports the Transparent Data Encryption feature in EDB Postgres Advanced and EDB Postgres Extended.

| +| BDR | 5.0.0 | Feature | Integration with OpenTelemetry

The BDR extension can now send monitoring metrics as well as traces to the OpenTelemetry collector for better integration with existing monitoring solutions.

| +| BDR | 5.0.0 | Feature | Postgres 15 compatibility

EDB Postgres Distributed 5 is compatible with Postgres 12 to 15.

| +| BDR | 5.0.0 | Feature | Improved Cluster Event Management

The `bdr.worker_errors` and `bdr.state_journal_details` views were replaced by the unified `bdr.event_summary` view, which also includes changes in the Raft role for the local node. Additional events might be added to it in the future.

| +| BDR | 5.0.0 | Change | Improved transaction tracking performance

Transaction tracking now uses shared memory instead of the `bdr.internal_node_pre_commit` catalog, which considerably improves performance because it doesn't incur additional I/O.

| +| BDR | 5.0.0 | Feature | Support non-default replication sets with Decoding Worker

Allows the Decoding Worker feature to be used in clusters using non-default replication sets, such as an asymmetric replication setup.

| +| BDR | 5.0.0 | Feature | Add support for HASH partitioning in Autopartition

Extends autopartition/autoscale to support HASH partitioning. Many of the things that are required for RANGE partitioning aren't needed for HASH partitioning. For example, we expect to create all HASH partitions in one go (at least for the current work; later we might change this). We don't expect HASH partitions to be moved to a different tablespace or dropped, so data retention policies don't apply to HASH partitioning.

| +| BDR | 5.0.0 | Feature | Add a new benchmarking utility `pgd_bench`

The utility supports benchmarking CAMO transactions and, in future releases, will be used for benchmarking PGD-specific workloads.

| +| BDR | 5.0.0 | Change | Separate Task Management from Autopartition

In this release, the autopartition work queue mechanism has been moved to a separate module called Task Manager (taskmgr). The task manager is responsible for creating new tasks and executing the ones created by the local node or the task manager leader node. The autopartition worker is thus renamed the taskmgr worker process.

In older PGD releases, the Raft leader was responsible for creating new work items. But that creates a problem, because a witness node can become a Raft leader while it doesn't have a full view of the cluster objects. This release introduces the concept of a Task Manager Leader node. The node is selected automatically by PGD, but for upgraded clusters, it's important to set the `node_kind` properly for all nodes in the cluster. The user is expected to do this manually after upgrading to the latest PGD version by calling the `bdr.alter_node_kind()` SQL function for each node.

| +| BDR | 5.0.0 | Change | Nodes now have a node kind

This better differentiates the various kinds of nodes, such as data, witness, subscriber-only, and standby.

| +| Proxy | 5.0.0 | Feature | PGD built-in proxy

A TCP layer 4 pass-through proxy for a PGD cluster that uses the routing capabilities of BDR.

| +| CLI | 5.0.0 | Feature | PGD cluster verification

The CLI supports two new commands: `verify-settings` and `verify-cluster`. `verify-settings` verifies the PostgreSQL configuration of each node in a PGD cluster against the recommendations. `verify-cluster` verifies the PGD cluster architectures against the flexible architecture deployment recommendations.

| +| CLI | 5.0.0 | Feature | Proxy management and configuration

`pgd` supports `create-proxy`, `delete-proxy`, `set-group-options`, `set-node-options`, `set-proxy-options`, `show-proxies`, `show-groups`, and `switchover` to configure and manage proxies per group.

| +| CLI | 5.0.0 | Change | Remove `show-camo` command and remove CAMO check from `check-health` command. Support for `commit scopes` in CLI will be added in a future release. | +| CLI | 5.0.0 | Change | Modify output of `show-nodes` and `show-raft` commands to accomodate routing capabilities | diff --git a/product_docs/docs/pgd/5/repsets.mdx b/product_docs/docs/pgd/5/repsets.mdx new file mode 100644 index 00000000000..12a269ee5b3 --- /dev/null +++ b/product_docs/docs/pgd/5/repsets.mdx @@ -0,0 +1,668 @@ +--- +title: Replication sets +redirects: + - bdr/repsets + +--- + +A replication set is a group of tables that a BDR node can subscribe to. +You can use replication sets to create more complex replication topologies +than regular symmetric multi-master where each node is an exact copy of the other +nodes. + +Every BDR group creates a replication set with the same name as +the group. This replication set is the default replication set, which is +used for all user tables and DDL replication. All nodes are subscribed to it. +In other words, by default all user tables are replicated between all nodes. + +## Using replication sets + +You can create replication sets using `create_replication_set()`, +specifying whether to include insert, update, delete, or truncate actions. +One option lets you add existing tables to the set, and +a second option defines whether to add tables when they are +created. + +You can also manually define the tables to add or remove from a +replication set. + +Tables included in the replication set are maintained when the node +joins the cluster and afterwards. + +Once the node is joined, you can still remove tables from the replication +set, but you must add new tables using a resync operation. + +By default, a newly defined replication set doesn't replicate DDL or BDR +administration function calls. Use `replication_set_add_ddl_filter` +to define the commands to replicate. + +BDR creates replication set definitions on all nodes. Each node can then be +defined to publish or subscribe to each replication set using +`alter_node_replication_sets`. + +You can use functions to alter these definitions later or to drop the replication +set. + +!!! Note + Don't use the default replication set for selective replication. + Don't drop or modify the default replication set on any of + the BDR nodes in the cluster as it is also used by default for DDL + replication and administration function calls. + +## Behavior of partitioned tables + +BDR supports partitioned tables transparently, meaning that you can add a partitioned +table to a replication set. +Changes that involve any of the partitions are replicated downstream. + +!!! Note + When partitions are replicated through a partitioned table, the + statements executed directly on a partition are replicated as they + were executed on the parent table. The exception is the `TRUNCATE` command, + which always replicates with the list of affected tables or partitions. + +You can add individual partitions to the replication set, in +which case they are replicated like regular tables (to the table of the +same name as the partition on the downstream). This has some performance +advantages if the partitioning definition is the same on both provider and +subscriber, as the partitioning logic doesn't have to be executed. + +!!! Note + If a root partitioned table is part of any replication set, memberships + of individual partitions are ignored. only the membership of that root + table is taken into account. 
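+
+As an illustrative sketch only (the table and replication set names are hypothetical, and `bdr.replication_set_add_table` is described later on this page), the two approaches look like this:
+
+```sql
+-- Replicate all partitions by adding the root partitioned table:
+SELECT bdr.replication_set_add_table('measurements', 'myrepset');
+
+-- Or replicate a single partition on its own, like a regular table:
+SELECT bdr.replication_set_add_table('measurements_2023_02', 'myrepset');
+```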
+ +## Behavior with foreign keys + +A foreign key constraint ensures that each row in the referencing +table matches a row in the referenced table. Therefore, if the +referencing table is a member of a replication set, the referenced +table must also be a member of the same replication set. + +The current version of BDR doesn't automatically check for or enforce +this condition. When adding a table to a replication set, the database administrator must +make sure +that all the tables referenced by foreign keys are also added. + +You can use the following query to list all the foreign keys and +replication sets that don't satisfy this requirement. +The referencing table is a member of the replication set, while the +referenced table isn't: + +```sql +SELECT t1.relname, + t1.nspname, + fk.conname, + t1.set_name + FROM bdr.tables AS t1 + JOIN pg_catalog.pg_constraint AS fk + ON fk.conrelid = t1.relid + AND fk.contype = 'f' + WHERE NOT EXISTS ( + SELECT * + FROM bdr.tables AS t2 + WHERE t2.relid = fk.confrelid + AND t2.set_name = t1.set_name +); +``` + +The output of this query looks like the following: + +```sql + relname | nspname | conname | set_name +---------+---------+-----------+---------- + t2 | public | t2_x_fkey | s2 +(1 row) +``` + +This means that table `t2` is a member of replication set `s2`, but the +table referenced by the foreign key `t2_x_fkey` isn't. + +The `TRUNCATE CASCADE` command takes into account the +replication set membership before replicating the command. For example: + +```sql +TRUNCATE table1 CASCADE; +``` + +This becomes a `TRUNCATE` without cascade on all the tables that are +part of the replication set only: + +```sql +TRUNCATE table1, referencing_table1, referencing_table2 ... +``` + +## Replication set management + +Management of replication sets. + +With the exception of `bdr.alter_node_replication_sets`, the following +functions are considered to be `DDL`. DDL replication and global locking +apply to them, if that's currently active. See [DDL replication](ddl). + +### bdr.create_replication_set + +This function creates a replication set. + +Replication of this command is affected by DDL replication configuration +including DDL filtering settings. + +#### Synopsis + +```sql +bdr.create_replication_set(set_name name, + replicate_insert boolean DEFAULT true, + replicate_update boolean DEFAULT true, + replicate_delete boolean DEFAULT true, + replicate_truncate boolean DEFAULT true, + autoadd_tables boolean DEFAULT false, + autoadd_existing boolean DEFAULT true) +``` + +#### Parameters + +- `set_name` — Name of the new replication set. Must be unique across the BDR + group. +- `replicate_insert` — Indicates whether to replicate inserts into tables in this + replication set. +- `replicate_update` — Indicates whether to replicate updates of tables in this + replication set. +- `replicate_delete` — Indicates whether to replicate deletes from tables in this + replication set. +- `replicate_truncate` — Indicates whether to replicate truncates of tables in this + replication set. +- `autoadd_tables` — Indicates whether to replicate newly created (future) tables + to this replication set +- `autoadd_existing` — Indicates whether to add all existing user tables + to this replication set. This parameter has an effect only if `autoadd_tables` is + set to `true`. + +#### Notes + +By default, new replication sets don't replicate DDL or BDR administration +function calls. 
See [ddl filters](repsets#ddl-replication-filtering) for how to set +up DDL replication for replication sets. A preexisting DDL filter +is set up for the default group replication set that replicates all DDL and admin +function calls. It's created when the group is created but can be dropped +in case you don't want the BDR group default replication set to replicate +DDL or the BDR administration function calls. + +This function uses the same replication mechanism as `DDL` statements. This means +that the replication is affected by the [ddl filters](repsets#ddl-replication-filtering) +configuration. + +The function takes a `DDL` global lock. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. + +### bdr.alter_replication_set + +This function modifies the options of an existing replication set. + +Replication of this command is affected by DDL replication configuration, +including DDL filtering settings. + +#### Synopsis + +```sql +bdr.alter_replication_set(set_name name, + replicate_insert boolean DEFAULT NULL, + replicate_update boolean DEFAULT NULL, + replicate_delete boolean DEFAULT NULL, + replicate_truncate boolean DEFAULT NULL, + autoadd_tables boolean DEFAULT NULL) +``` + +#### Parameters + +- `set_name` — Name of an existing replication set. +- `replicate_insert` — Indicates whether to replicate inserts into tables in this + replication set. +- `replicate_update` — Indicates whether to replicate updates of tables in this + replication set. +- `replicate_delete` — Indicates whether to replicate deletes from tables in this + replication set. +- `replicate_truncate` — Indicates whether to replicate truncates of tables in this + replication set. +- `autoadd_tables` — Indicates whether to add newly created (future) tables to this replication set. + +Any of the options that are set to NULL (the default) remain the same as +before. + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This means +the replication is affected by the [ddl filters](repsets#ddl-replication-filtering) +configuration. + +The function takes a `DDL` global lock. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. + +### bdr.drop_replication_set + +This function removes an existing replication set. + +Replication of this command is affected by DDL replication configuration, +including DDL filtering settings. + +#### Synopsis + +```sql +bdr.drop_replication_set(set_name name) +``` + +#### Parameters + +- `set_name` — Name of an existing replication set. + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This means +the replication is affected by the [ddl filters](repsets#ddl-replication-filtering) +configuration. + +The function takes a `DDL` global lock. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. + +!!! Warning + Don't drop a replication set that's being used by at least + another node, because doing so stops replication on that + node. If that happens, unsubscribe the affected node + from that replication set. + For the same reason, don't drop a replication set with + a join operation in progress when the node being joined + is a member of that replication set. 
Replication set membership is + checked only at the beginning of the join. + This happens because the information on replication set usage is + local to each node, so that you can configure it on a node before + it joins the group. + +You can manage replication set subscription for a node using `alter_node_replication_sets`. + +### bdr.alter_node_replication_sets + +This function changes the replication sets a node publishes and is subscribed to. + +#### Synopsis + +```sql +bdr.alter_node_replication_sets(node_name name, + set_names text[]) +``` + +#### Parameters + +- `node_name` — The node to modify. Currently has to be local node. +- `set_names` — Array of replication sets to replicate to the specified + node. An empty array results in the use of the group default replication set. + +#### Notes + +This function is executed only on the local node and isn't replicated in any manner. + +The replication sets listed aren't checked for existence, +since this function is designed to execute before the node joins. Be careful +to specify replication set names correctly to avoid errors. + +This allows for calling the function not only on the node that's part of the +BDR group but also on a node that hasn't joined any group yet. This approach limits +the data synchronized during the join. However, the schema is +always fully synchronized without regard to the replication sets setting. +All tables are copied across, not just the ones specified +in the replication set. You can drop unwanted tables by referring to +the `bdr.tables` catalog table. These might be removed automatically in later +versions of BDR. This is currently true even if the [ddl filters](repsets#ddl-replication-filtering) +configuration otherwise prevent replication of DDL. + +The replication sets that the node subscribes to after this call are published +by the other nodes for actually replicating the changes from those nodes to +the node where this function is executed. + +## Replication set membership + +You can add tables to or remove them from one or more replication sets. This +affects replication only of changes (DML) in those tables. Schema changes (DDL) are +handled by DDL replication set filters (see [DDL replication filtering](#ddl-replication-filtering)). + +The replication uses the table membership in replication sets +with the node replication sets configuration to determine the actions to +replicate to which node. The decision is done using the union of all the +memberships and replication set options. Suppose that a table is a member +of replication set A that replicates only INSERT actions and replication set B that +replicates only UPDATE actions. Both INSERT and UPDATE act8ions are replicated if the +target node is also subscribed to both replication set A and B. + +### bdr.replication_set_add_table + +This function adds a table to a replication set. + +This adds a table to a replication set and starts replicating changes +from this moment (or rather transaction commit). Any existing data the table +might have on a node isn't synchronized. + +Replication of this command is affected by DDL replication configuration, +including DDL filtering settings. + +#### Synopsis + +```sql +bdr.replication_set_add_table(relation regclass, + set_name name DEFAULT NULL, + columns text[] DEFAULT NULL, + row_filter text DEFAULT NULL) +``` + +#### Parameters + +- `relation` — Name or Oid of a table. +- `set_name` — Name of the replication set. If NULL (the default), then the BDR + group default replication set is used. 
+- `columns` — Reserved for future use (currently does nothing and must be NULL). +- `row_filter` — SQL expression to be used for filtering the replicated rows. + If this expression isn't defined (that is, set to NULL, the default) then all rows are sent. + +The `row_filter` specifies an expression producing a Boolean result, with NULLs. +Expressions evaluating to True or Unknown replicate the row. A False value +doesn't replicate the row. Expressions can't contain subqueries or refer to +variables other than columns of the current row being replicated. You can't reference system +columns. + +`row_filter` executes on the origin node, not on the target node. This puts an +additional CPU overhead on replication for this specific table but +completely avoids sending data for filtered rows. Hence network +bandwidth is reduced and overhead on the target node is applied. + +`row_filter` never removes `TRUNCATE` commands for a specific table. +You can filter away `TRUNCATE` commands at the replication set level. + +You can replicate just some columns of a table. See +[Replicating between nodes with differences](appusage). + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This means +that the replication is affected by the [ddl filters](repsets#ddl-replication-filtering) +configuration. + +The function takes a `DML` global lock on the relation that's being +added to the replication set if the `row_filter` isn't NULL. Otherwise +it takes just a `DDL` global lock. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. + +### bdr.replication_set_remove_table + +This function removes a table from the replication set. + +Replication of this command is affected by DDL replication configuration, +including DDL filtering settings. + +#### Synopsis + +```sql +bdr.replication_set_remove_table(relation regclass, + set_name name DEFAULT NULL) +``` + +#### Parameters + +- `relation` — Name or Oid of a table. +- `set_name` — Name of the replication set. If NULL (the default), then the BDR + group default replication set is used. + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This means +the replication is affected by the [ddl filters](repsets#ddl-replication-filtering) +configuration. + +The function takes a `DDL` global lock. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. + +## Listing replication sets + +You can list existing replication sets with the following query: + +```sql +SELECT set_name +FROM bdr.replication_sets; +``` + +You can use this query to list all the tables in a given replication +set: + +```sql +SELECT nspname, relname +FROM bdr.tables +WHERE set_name = 'myrepset'; +``` + +In [Behavior with foreign keys](#behavior-with-foreign-keys), we show a +query that lists all the foreign keys whose referenced table isn't +included in the same replication set as the referencing table. 
+ +Use the following SQL to show those replication sets that the +current node publishes and subscribes from: + +```sql + SELECT node_id, + node_name, + pub_repsets, + sub_repsets + FROM bdr.local_node_summary; +``` + +This code produces output like this: + +```sql + node_id | node_name | pub_repsets | sub_repsets +------------+-----------+---------------------------------------- + 1834550102 | s01db01 | {bdrglobal,bdrs01} | {bdrglobal,bdrs01} +(1 row) +``` + +To execute the same query against all nodes in the cluster, you can use the following query. This approach gets +the replication sets associated with all nodes at the same time. + +```sql +WITH node_repsets AS ( + SELECT jsonb_array_elements( + bdr.run_on_all_nodes($$ + SELECT + node_id, + node_name, + pub_repsets, + sub_repsets + FROM bdr.local_node_summary; + $$)::jsonb + ) AS j +) +SELECT j->'response'->'command_tuples'->0->>'node_id' AS node_id, + j->'response'->'command_tuples'->0->>'node_name' AS node_name, + j->'response'->'command_tuples'->0->>'pub_repsets' AS pub_repsets, + j->'response'->'command_tuples'->0->>'sub_repsets' AS sub_repsets +FROM node_repsets; +``` + +This shows, for example: + +```sql + node_id | node_name | pub_repsets | sub_repsets +------------+-----------+---------------------------------------- + 933864801 | s02db01 | {bdrglobal,bdrs02} | {bdrglobal,bdrs02} + 1834550102 | s01db01 | {bdrglobal,bdrs01} | {bdrglobal,bdrs01} + 3898940082 | s01db02 | {bdrglobal,bdrs01} | {bdrglobal,bdrs01} + 1102086297 | s02db02 | {bdrglobal,bdrs02} | {bdrglobal,bdrs02} +(4 rows) +``` + +## DDL replication filtering + +By default, the replication of all supported DDL happens by way of the default BDR +group replication set. This is achieved with a DDL filter with +the same name as the BDR group. This filter is added to the default BDR +group replication set when the BDR group is created. + +You can adjust this by changing the DDL replication filters for +all existing replication sets. These filters are independent of table +membership in the replication sets. Just like data changes, each DDL statement +is replicated only once, even if it's matched by multiple filters on +multiple replication sets. + +You can list existing DDL filters with the following query, which +shows for each filter the regular expression applied to the command +tag and to the role name: + +```sql +SELECT * FROM bdr.ddl_replication; +``` + +You can use the following functions to manipulate DDL filters. +They are considered to be `DDL` and are therefore subject to DDL +replication and global locking. + +### bdr.replication_set_add_ddl_filter + +This function adds a DDL filter to a replication set. + +Any DDL that matches the given filter is replicated to any node that's +subscribed to that set. This also affects replication of BDR admin functions. + +This doesn't prevent execution of DDL on any node. It only +alters whether DDL is replicated to other nodes. Suppose two nodes have +a replication filter between them that excludes all index commands. Index commands can still +be executed freely by directly connecting to +each node and executing the desired DDL on that node. + +The DDL filter can specify a `command_tag` and `role_name` to allow +replication of only some DDL statements. The `command_tag` is the same as those +used by [EVENT TRIGGERs](https://www.postgresql.org/docs/current/static/event-trigger-matrix.html) +for regular PostgreSQL commands. 
A typical example might be to create a +filter that prevents additional index commands on a logical standby from +being replicated to all other nodes. + +You can filter the BDR admin functions used by using a tagname matching the +qualified function name. For example, `bdr.replication_set_add_table` is the +command tag for the function of the same name. In this case, this tag allows all BDR +functions to be filtered using `bdr.*`. + +The `role_name` is used for matching against the current role that is executing +the command. Both `command_tag` and `role_name` are evaluated as regular +expressions, which are case sensitive. + +#### Synopsis + +```sql +bdr.replication_set_add_ddl_filter(set_name name, + ddl_filter_name text, + command_tag text, + role_name text DEFAULT NULL, + base_relation_name text DEFAULT NULL, + query_match text DEFAULT NULL, + exclusive boolean DEFAULT FALSE) +``` + +#### Parameters + +- `set_name` — name of the replication set; if NULL then the BDR + group default replication set is used +- `ddl_filter_name` — name of the DDL filter; this must be unique across the + whole BDR group +- `command_tag` — regular expression for matching command tags; NULL means + match everything +- `role_name` — regular expression for matching role name; NULL means + match all roles +- `base_relation_name` — reserved for future use, must be NULL +- `query_match` — regular expression for matching the query; NULL means + match all queries +- `exclusive` — if true, other matched filters are not taken into + consideration (that is, only the exclusive filter is applied), when multiple + exclusive filters match, we throw an error. This is useful for routing + specific commands to specific replication set, while keeping the default + replication through the main replication set. + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This means +that the replication is affected by the [ddl filters](repsets#ddl-replication-filtering) +configuration. This also means that replication of changes to ddl +filter configuration is affected by the existing ddl filter configuration. + +The function takes a `DDL` global lock. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. + +To view the defined replication filters, use the view `bdr.ddl_replication`. + +#### Examples + +To include only BDR admin functions, define a filter like this: + +```sql +SELECT bdr.replication_set_add_ddl_filter('mygroup', 'mygroup_admin', $$bdr\..*$$); +``` + +To exclude everything apart from index DDL: + +```sql +SELECT bdr.replication_set_add_ddl_filter('mygroup', 'index_filter', + '^(?!(CREATE INDEX|DROP INDEX|ALTER INDEX)).*'); +``` + +To include all operations on tables and indexes but exclude all others, add +two filters: one for tables, one for indexes. This shows that +multiple filters provide the union of all allowed DDL commands: + +```sql +SELECT bdr.replication_set_add_ddl_filter('bdrgroup','index_filter', '^((?!INDEX).)*$'); +SELECT bdr.replication_set_add_ddl_filter('bdrgroup','table_filter', '^((?!TABLE).)*$'); +``` + +### bdr.replication_set_remove_ddl_filter + +This function removes the DDL filter from a replication set. + +Replication of this command is affected by DDL replication configuration, +including DDL filtering settings themselves. 
+ +#### Synopsis + +```sql +bdr.replication_set_remove_ddl_filter(set_name name, + ddl_filter_name text) +``` + +#### Parameters + +- `set_name` — Name of the replication set. If NULL then the BDR + group default replication set is used. +- `ddl_filter_name` — Name of the DDL filter to remove. + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This +means that the replication is affected by the +[ddl filters](repsets#ddl-replication-filtering) configuration. +This also means that replication of changes to the DDL filter configuration is +affected by the existing DDL filter configuration. + +The function takes a `DDL` global lock. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. diff --git a/product_docs/docs/pgd/5/routing/index.mdx b/product_docs/docs/pgd/5/routing/index.mdx new file mode 100644 index 00000000000..c7087093974 --- /dev/null +++ b/product_docs/docs/pgd/5/routing/index.mdx @@ -0,0 +1,83 @@ +--- +title: "Application connection management" +navTitle: "Connection management" +indexCards: none + +navigation: + - proxy + - installing_proxy + +--- + +Managing application connections is important part of high availability. + +Especially with asynchronous replication, having consistent write lead node is +important in order to avoid conflicts and guarantee availability for the +application. + +EDB Postgres Distributed provides a proxy layer called PDG-Proxy which is +normally installed in highly available configuration (at least 2 instances per +region). + +The PGD-Proxy connects to one of the EDB Postgres Distributed nodes and monitors +routing configuration changes as decided by the EDB Postgres Distributed cluster +and ensures that the connections are routed to correct node(s) consistently. + +## Configuration + +The configuration of the routing is done through either SQL interfaces or through +PGD-CLI. + +The SQL interfaces are described in this section. + +You can enable routing decisions by calling `bdr.alter_node_group_option()` function. +For example `SELECT bdr.alter_node_group_option('region1-group', 'enable_proxy_routing', 'true')`. +It can be disabled again by setting the same option to `false`. + +There are additional group-level options that affect the routing decisions: + +- route_writer_max_lag - Maximum lag in bytes of the new write candidate to be + selected as write leader, if no candidate passes this, there will be no writer + selected automatically. +- route_reader_max_lag - Maximum lag in bytes for node to be considered viable + read-only node (currently reserved for future use). + +Per node configuration of routing is set using `bdr.alter_node_option()`. The +available options that affect routing are following: + +- route_priority - Relative routing priority of the node against other nodes in + the same node group. +- route_fence - Whether the node is fenced from routing (can't receive connections + from PGD-Proxy) +- route_writes - Whether writes can be routed to this node, i.e. whether the node + can become write leader. +- route_reads - Whether read only connections can be routed to this node (currnetly + reserved for future use). + +The proxies are also configured using SQL interfaces. You can add proxy configuration +using `bdr.create_proxy`. 
+
+The proxies themselves are also configured using SQL interfaces. You can add a proxy
+configuration using `bdr.create_proxy`. For example,
+`SELECT bdr.create_proxy('region1-proxy1', 'region1-group');`
+adds the default configuration for a proxy named "region1-proxy1" that's a member
+of the BDR group "region1-group". The proxy name given here must be the same
+as the name given in the proxy configuration file. You can remove a proxy
+configuration using `SELECT bdr.drop_proxy('region1-proxy1')`, which also
+deactivates the proxy.
+
+You can configure options for each proxy using the `bdr.alter_proxy_option()` function.
+The available options are:
+
+- listen_address - Address the proxy should listen on.
+- listen_port - Port the proxy should listen on.
+- max_client_conn - Maximum number of connections the proxy will accept.
+- max_server_conn - Maximum number of connections the proxy will make to the
+  Postgres node.
+- server_conn_timeout - Connection timeout for server connections.
+- server_conn_keepalive - Keepalive interval for server connections.
+
+The current configuration of every group is visible in the
+`bdr.node_group_routing_config_summary` view. Similarly, the
+`bdr.node_routing_config_summary` view shows the current per-node routing
+configuration, and `bdr.proxy_config_summary` shows the per-proxy configuration.
+
+It's also possible to perform a switchover operation (explicitly changing which node is
+the write leader) using the `bdr.routing_leadership_transfer()` function.
+
diff --git a/product_docs/docs/pgd/5/routing/installing_proxy.mdx b/product_docs/docs/pgd/5/routing/installing_proxy.mdx
new file mode 100644
index 00000000000..6b89b3bef30
--- /dev/null
+++ b/product_docs/docs/pgd/5/routing/installing_proxy.mdx
@@ -0,0 +1,100 @@
+---
+title: "Installing PGD Proxy"
+navTitle: "Installing Proxy"
+---
+
+## Installing PGD Proxy
+
+You can install and configure PGD Proxy for an EDB Postgres Distributed cluster in two ways. The easiest way is to use the EDB Trusted Postgres Architect (TPA) utility
+for cluster deployment and management.
+
+
+### Installing through TPA
+
+If the PGD cluster is deployed through TPA, TPA installs and configures PGD Proxy automatically as per the recommended architecture. If you want to install PGD Proxy on any other node in a PGD cluster, you need only attach the `pgd-proxy` role to that instance in the TPA configuration file and set the `bdr_child_group` parameter as shown below before deploying. See [Trusted Postgres Architect](/product_docs/docs/pgd/latest/tpa) for more information.
+```yaml
+- Name: proxy-a1
+  location: a
+  node: 4
+  role:
+  - pgd-proxy
+  vars:
+    bdr_child_group: group_a
+  volumes:
+  - device_name: /dev/sdf
+    volume_type: none
+```
+
+
+### Installing manually
+
+You can manually install PGD Proxy on any Linux machine using the `.deb` and `.rpm` packages available from the PGD repository. The package name is `edb-pgd5-proxy`. For example:
+
+```sh
+# for Debian
+sudo apt-get install edb-pgd5-proxy
+```
+
+
+## Configuration
+
+PGD Proxy connects to the BDR database for its internal operations, such as getting proxy configurations and write leader details. Therefore, it needs a list of endpoints/DSNs for connecting to the BDR nodes. PGD Proxy expects these configurations in a local config file, `pgd-proxy-config.yml`.
+A functional example of a `pgd-proxy-config.yml` file follows:
+
+```yaml
+log-level: debug
+cluster:
+  name: my_pgd_cluster
+  endpoints:
+    - "host=bdr-a1 port=5432 dbname=bdrdb user=pgdproxy "
+    - "host=bdr-a3 port=5432 dbname=bdrdb user=pgdproxy "
+    - "host=bdr-a2 port=5432 dbname=bdrdb user=pgdproxy "
+  proxy:
+    name: "proxy-a1"
+```
+
+The `cluster.endpoints` and `cluster.proxy.name` fields are mandatory in the config file. The proxy always tries to connect to the first endpoint in the list. If that fails, it tries the next endpoint, and so on. You can set the log level for the PGD Proxy service using the top-level config parameter `log-level`, as shown in the sample config. The valid values for `log-level` are `debug`, `info`, `warn`, and `error`.
+
+PGD Proxy searches for `pgd-proxy-config.yml` in the following locations, in order of precedence (high to low):
+
+  1. The path given with `-f config-file` (for example, `pgd-proxy -f /opt/my-config.yml`)
+  2. `/etc/edb/pgd-proxy` (default)
+  3. `$HOME/.edb/pgd-proxy`
+
+By default, `pgd-proxy-config.yml` is located in the `/etc/edb/pgd-proxy` directory. If you rename the file or move it to another location, specify the new name and location using the `-f` or `--config-file` flag in the `pgd-proxy.service` file.
+
+### Proxy user
+
+If the PGD cluster is created by TPA, TPA creates a Postgres user named `pgdproxy` for PGD Proxy and sets `route_dsn` at the node level. If you want to use a different user, you need to create the user and set it in the `endpoints` in the config file manually.
+
+
+## PGD Proxy service
+
+PGD Proxy is preferably run as a systemd service. The `pgd-proxy` service unit file is located at `/etc/systemd/system/pgd-proxy.service` by default. The following is a sample service file for EDB Postgres Extended Server and community PostgreSQL.
+
+**Note:** For EDB Postgres Advanced Server, change `User` and `Group` from `postgres` to `enterprisedb`.
+
+```
+[Unit]
+Description=PGD Proxy
+
+[Service]
+Type=simple
+User=postgres
+Group=postgres
+Restart=on-failure
+RestartSec=1s
+ExecStart=/usr/bin/pgd-proxy -f /etc/edb/pgd-proxy/pgd-proxy-config.yml
+StandardOutput=syslog
+StandardError=syslog
+SyslogIdentifier=pgd-proxy
+
+[Install]
+WantedBy=multi-user.target
+```
+
+Use the following commands to manage the `pgd-proxy` service:
+```
+systemctl status pgd-proxy
+systemctl stop pgd-proxy
+systemctl restart pgd-proxy
+```
diff --git a/product_docs/docs/pgd/5/routing/proxy.mdx b/product_docs/docs/pgd/5/routing/proxy.mdx
new file mode 100644
index 00000000000..fd08acac147
--- /dev/null
+++ b/product_docs/docs/pgd/5/routing/proxy.mdx
@@ -0,0 +1,58 @@
+---
+title: "EDB Postgres Distributed Proxy"
+navTitle: "PGD Proxy"
+indexCards: none
+navigation:
+- installing_proxy
+
+directoryDefaults:
+  description: "The PGD Proxy is a service that acts as proxy layer between the client application and Postgres for your EDB Postgres Distributed cluster"
+---
+
+EDB Postgres Distributed Proxy is a daemon that acts as an abstraction layer between the client application and Postgres. It interfaces with the BDR consensus mechanism to obtain the identity of the current write leader node and redirects traffic to that node.
+
+There is always at least one global group and one data group in the PGD cluster. BDR elects the write leader for each data group that has the `enable_proxy_routing` and `enable_raft` options set to true. A proxy can be attached to a global group or a data group, and multiple proxies can be attached to each group.
+
+PGD Proxy is a TCP layer 4 proxy.
+
+
+## How it works
+
+On startup, PGD Proxy connects to one of the endpoints given in the local config file. It fetches the DB connection information for all nodes, the proxy options (such as listen address and listen port), and the routing details (such as the current write leader). The endpoints given in the config file are used only at startup. After that, the actual endpoints are taken from the `route_dsn` field of the BDR catalog `bdr.node_routing_config_summary`.
+
+BDR manages the write leader election itself. PGD Proxy interacts with BDR to receive write leader change events over Postgres notify/listen channels and routes client traffic to the current write leader. The proxy disconnects all client connections when the write leader changes. Write leader election is a Raft-backed activity and is subject to Raft leader availability.
+
+PGD Proxy responds to write leader change events in two modes of operation: failover and switchover.
+
+Failover is the automatic transfer of write leadership from the current write leader node to a new node when, for example, Postgres or the operating system crashes. BDR automatically elects a new write leader when the current write leader goes down or becomes unresponsive. Once the new write leader is elected, the proxy closes the existing client connections (which were to the old write leader) and redirects new client connections to the newly elected write leader.
+
+Switchover is the user-controlled, manual transfer of write leadership from the current write leader to a new target leader. Switchover is triggered through the [PGD CLI switchover](/product_docs/docs/pgd/5/cli/command_ref/pgd_switchover.mdx) command. The command is submitted to BDR, which attempts to elect the given target node as the new write leader. As with failover, the proxy closes existing client connections and redirects new client connections to the newly elected write leader. This is useful during server maintenance, for example, when the current write leader node needs to be stopped for a server update or OS patch.
+
+The switchover might not transfer write leadership to the target leader you provide. Whether it does depends on factors that include:
+- the switchover command arguments, such as the switchover method and timeout,
+- the group routing option values, such as route_writer_max_lag, and
+- the data lag across nodes when the switchover command was executed.
+
+
+## Managing PGD Proxy
+
+PGD CLI provides the `create-proxy`, `show-proxy`, `set-proxy-options`, and `delete-proxy` commands to manage proxies in a PGD cluster. See [PGD CLI](/product_docs/docs/pgd/5/cli/command_ref) for more information.
+
+See [Connection management](/product_docs/docs/pgd/5/routing) for more information on the BDR side of configuring and managing PGD Proxy.
+
+
+## Proxy log location
+### syslog
+- Debian based - `/var/log/syslog`
+- Red Hat based - `/var/log/messages`
+
+Use the `journalctl` command to filter and view logs for troubleshooting PGD Proxy. A few sample commands for quick reference:
+``` +journalctl -u pgd-proxy -n100 -f +journalctl -u pgd-proxy --since today +journalctl -u pgd-proxy --since "10 min ago" +journalctl -u pgd-proxy --since "2022-10-20 16:21:50" --until "2022-10-20 16:21:55" +``` + + + diff --git a/product_docs/docs/pgd/5/scaling.mdx b/product_docs/docs/pgd/5/scaling.mdx new file mode 100644 index 00000000000..bd8cdfeeb4a --- /dev/null +++ b/product_docs/docs/pgd/5/scaling.mdx @@ -0,0 +1,306 @@ +--- +title: AutoPartition +redirects: + - bdr/scaling +--- + +AutoPartition allows tables to grow easily to large sizes by automatic +partitioning management. This capability uses features of BDR +such as low-conflict locking of creating and dropping partitions. + +You can create new partitions regularly and then drop them when the +data retention period expires. + +BDR management is primarily accomplished by functions that can be called by SQL. +All functions in BDR are exposed in the `bdr` schema. Unless you put it into +your search_path, you need to schema-qualify the name of each function. + +## Auto creation of partitions + +`bdr.autopartition()` creates or alters the definition of automatic +range partitioning for a table. If no definition exists, it's created. +Otherwise, later executions will alter the definition. + +`bdr.autopartition()` doesn't lock the actual table. It changes the +definition of when and how new partition maintenance actions take place. + +`bdr.autopartition()` leverages the features that allow a partition to be +attached or detached/dropped without locking the rest of the table +(when the underlying Postgres version supports it). + +An ERROR is raised if the table isn't RANGE partitioned or a multi-column +partition key is used. + +A new partition is added for every `partition_increment` range of values, with +lower and upper bound `partition_increment` apart. For tables with a partition +key of type `timestamp` or `date`, the `partition_increment` must be a valid +constant of type `interval`. For example, specifying `1 Day` causes a new +partition to be added each day, with partition bounds that are one day apart. + +If the partition column is connected to a `snowflakeid`, `timeshard`, or `ksuuid` sequence, +you must specify the `partition_increment` as type `interval`. Otherwise, +if the partition key is integer or numeric, then the `partition_increment` +must be a valid constant of the same datatype. For example, specifying +`1000000` causes new partitions to be added every 1 million values. + +If the table has no existing partition, then the specified +`partition_initial_lowerbound` is used as the lower bound for the first +partition. If you don't specify `partition_initial_lowerbound`, then the system +tries to derive its value from the partition column type and the specified +`partition_increment`. For example, if `partition_increment` is specified as `1 Day`, +then `partition_initial_lowerbound` is set to CURRENT +DATE. If `partition_increment` is specified as `1 Hour`, then +`partition_initial_lowerbound` is set to the current hour of the current +date. The bounds for the subsequent partitions are set using the +`partition_increment` value. + +The system always tries to have a certain minimum number of advance partitions. +To decide whether to create new partitions, it uses the +specified `partition_autocreate_expression`. This can be an expression that can be evaluated by SQL, +which is evaluated every time a check is performed. 
For example, +for a partitioned table on column type `date`, if +`partition_autocreate_expression` is specified as `DATE_TRUNC('day',CURRENT_DATE)`, +`partition_increment` is specified as `1 Day` and +`minimum_advance_partitions` is specified as 2, then new partitions are +created until the upper bound of the last partition is less than +`DATE_TRUNC('day', CURRENT_DATE) + '2 Days'::interval`. + +The expression is evaluated each time the system checks for new partitions. + +For a partitioned table on column type `integer`, you can specify the +`partition_autocreate_expression` as `SELECT max(partcol) FROM +schema.partitioned_table`. The system then regularly checks if the maximum value of +the partitioned column is within the distance of `minimum_advance_partitions * partition_increment` +of the last partition's upper bound. Create an index on the `partcol` so that the query runs efficiently. +If the `partition_autocreate_expression` isn't specified for a partition table +on column type `integer`, `smallint`, or `bigint`, then the system +sets it to `max(partcol)`. + +If the `data_retention_period` is set, partitions are +dropped after this period. Partitions are dropped at the same time as new +partitions are added, to minimize locking. If this value isn't set, you must drop the partitions manually. + +The `data_retention_period` parameter is supported only for timestamp (and +related) based partitions. The period is calculated by considering the upper +bound of the partition. The partition is either migrated to the secondary +tablespace or dropped if either of the given period expires, relative to the +upper bound. + +By default, AutoPartition manages partitions globally. In other words, when a +partition is created on one node, the same partition is also created on all +other nodes in the cluster. So all partitions are consistent and guaranteed to +be available. For this, AutoPartition makes use of Raft. You can change this behavior +by passing `managed_locally` as `true`. In that case, all partitions +are managed locally on each node. This is useful for the case when the +partitioned table isn't a replicated table and hence it might not be necessary +or even desirable to have all partitions on all nodes. For example, the +built-in `bdr.conflict_history` table isn't a replicated table and is +managed by AutoPartition locally. Each node creates partitions for this table +locally and drops them once they are old enough. + +You can't later change tables marked as `managed_locally` to be managed +globally and vice versa. + +Activities are performed only when the entry is marked `enabled = on`. + +You aren't expected to manually create or drop partitions for tables +managed by AutoPartition. Doing so can make the AutoPartition metadata +inconsistent and might cause it to fail. + +### Configure AutoPartition + +The `bdr.autopartition` function configures automatic partitioning of a table. + +#### Synopsis + +```sql +bdr.autopartition(relation regclass, + partition_increment text, + partition_initial_lowerbound text DEFAULT NULL, + partition_autocreate_expression text DEFAULT NULL, + minimum_advance_partitions integer DEFAULT 2, + maximum_advance_partitions integer DEFAULT 5, + data_retention_period interval DEFAULT NULL, + managed_locally boolean DEFAULT false, + enabled boolean DEFAULT on); +``` + +#### Parameters + +- `relation` — Name or Oid of a table. +- `partition_increment` — Interval or increment to next partition creation. 
+- `partition_initial_lowerbound` — If the table has no partition, then the + first partition with this lower bound and `partition_increment` apart upper + bound is created. +- `partition_autocreate_expression` — Used to detect if it's time to create new partitions. +- `minimum_advance_partitions` — The system attempts to always have at + least `minimum_advance_partitions` partitions. +- `maximum_advance_partitions` — Number of partitions to be created in a single + go once the number of advance partitions falls below `minimum_advance_partitions`. +- `data_retention_period` — Interval until older partitions are dropped, if + defined. This value must be greater than `migrate_after_period`. +- `managed_locally` — If true, then the partitions are managed locally. +- `enabled` — Allows activity to be disabled or paused and later resumed or reenabled. + +#### Examples + +Daily partitions, keep data for one month: + +```sql +CREATE TABLE measurement ( +logdate date not null, +peaktemp int, +unitsales int +) PARTITION BY RANGE (logdate); + +bdr.autopartition('measurement', '1 day', data_retention_period := '30 days'); +``` + +Create five advance partitions when there are only two more partitions remaining (each partition can hold 1 billion orders): + +```sql +bdr.autopartition('Orders', '1000000000', + partition_initial_lowerbound := '0', + minimum_advance_partitions := 2, + maximum_advance_partitions := 5 + ); +``` + +### Create one AutoPartition + +Use `bdr.autopartition_create_partition()` to create a standalone AutoPartition +on the parent table. + +#### Synopsis + +```sql +bdr.autopartition_create_partition(relname regclass, + partname name, + lowerb text, + upperb text, + nodes oid[]); +``` + +#### Parameters + +- `relname` — Name or Oid of the parent table to attach to. +- `partname` — Name of the new AutoPartition. +- `lowerb` — The lower bound of the partition. +- `upperb` — The upper bound of the partition. +- `nodes` — List of nodes that the new partition resides on. + +### Stopping automatic creation of partitions + +Use `bdr.drop_autopartition()` to drop the auto-partitioning rule for the +given relation. All pending work items for the relation are deleted and no new +work items are created. + +```sql +bdr.drop_autopartition(relation regclass); +``` + +#### Parameters + +- `relation` — Name or Oid of a table. + +### Drop one AutoPartition + +Use `bdr.autopartition_drop_partition` once a BDR AutoPartition table has been +made, as this function can specify single partitions to drop. If the partitioned +table was successfully dropped, the function returns `true`. + +#### Synopsis + +```sql +bdr.autopartition_drop_partition(relname regclass) +``` + +#### Parameters + +- `relname` — The name of the partitioned table to drop. + +### Notes + +This places a DDL lock on the parent table, before using DROP TABLE on the +chosen partition table. + +### Wait for partition creation + +Use `bdr.autopartition_wait_for_partitions()` to wait for the creation of +partitions on the local node. The function takes the partitioned table name and +a partition key column value and waits until the partition that holds that +value is created. + +The function only waits for the partitions to be created locally. It doesn't guarantee +that the partitions also exists on the remote nodes. + +To wait for the partition to be created on all BDR nodes, use the +`bdr.autopartition_wait_for_partitions_on_all_nodes()` function. 
This function +internally checks local as well as all remote nodes and waits until the +partition is created everywhere. + +#### Synopsis + +```sql +bdr.autopartition_wait_for_partitions(relation regclass, upperbound text); +``` + +#### Parameters + +- `relation` — Name or Oid of a table. +- `upperbound` — Partition key column value. + +#### Synopsis + +```sql +bdr.autopartition_wait_for_partitions_on_all_nodes(relation regclass, upperbound text); +``` + +#### Parameters + +- `relation` — Name or Oid of a table. +- `upperbound` — Partition key column value. + +### Find partition + +Use the `bdr.autopartition_find_partition()` function to find the partition for the +given partition key value. If partition to hold that value doesn't exist, then +the function returns NULL. Otherwise Oid of the partition is returned. + +#### Synopsis + +```sql +bdr.autopartition_find_partition(relname regclass, searchkey text); +``` + +#### Parameters + +- `relname` — Name of the partitioned table. +- `searchkey` — Partition key value to search. + +### Enable or disable AutoPartitioning + +Use `bdr.autopartition_enable()` to enable AutoPartitioning on the given table. +If AutoPartitioning is already enabled, then no action occurs. Similarly, use +`bdr.autopartition_disable()` to disable AutoPartitioning on the given table. + +#### Synopsis + +```sql +bdr.autopartition_enable(relname regclass); +``` + +#### Parameters + +- `relname` — Name of the relation to enable AutoPartitioning. + +#### Synopsis + +```sql +bdr.autopartition_disable(relname regclass); +``` + +#### Parameters + +- `relname` — Name of the relation to disable AutoPartitioning. + diff --git a/product_docs/docs/pgd/5/security.mdx b/product_docs/docs/pgd/5/security.mdx new file mode 100644 index 00000000000..5109f164cbb --- /dev/null +++ b/product_docs/docs/pgd/5/security.mdx @@ -0,0 +1,383 @@ +--- +title: Security and roles +redirects: + - bdr/security + +--- + +Only superusers can create the BDR extension. However, if you want, you can set up the `pgextwlist` extension and configure it to allow a non-superuser to create a BDR extension. + +Configuring and managing BDR doesn't require superuser access, nor is that recommended. +The privileges required by BDR are split across the following default/predefined roles, named +similarly to the PostgreSQL default/predefined roles: + +- bdr_superuser — The highest-privileged role, having access to all BDR tables and functions. +- bdr_read_all_stats — The role having read-only access to the tables, views, and functions, sufficient to understand the state of BDR. +- bdr_monitor — At the moment, the same as `bdr_read_all_stats`. To be extended later. +- bdr_application — The minimal privileges required by applications running BDR. +- bdr_read_all_conflicts — Can view all conflicts in `bdr.conflict_history`. + +These BDR roles are created when the BDR extension is +installed. See [BDR default roles](#bdr-defaultpredefined-roles) for more details. + +Managing BDR doesn't require that administrators have access to user data. + +Arrangements for securing conflicts are discussed in +[Logging conflicts to a table](consistency/conflicts). + +You can monitor conflicts using the `BDR.conflict_history_summary` view. + +## Catalog tables + +System catalog and Information Schema tables are always excluded from replication by BDR. + +In addition, tables owned by extensions are excluded from replication. + +## BDR functions and operators + +All BDR functions are exposed in the `bdr` schema. 
Any calls to these +functions must be schema qualified, rather than putting `bdr` in the +search_path. + +All BDR operators are available by way of the `pg_catalog` schema to allow users +to exclude the `public` schema from the search_path without problems. + +## Granting privileges on catalog objects + +Administrators must not grant explicit privileges on catalog +objects such as tables, views, and functions. Manage access to those objects +by granting one of the roles described in [BDR default roles](#bdr-defaultpredefined-roles). + +This requirement is a consequence of the flexibility that allows +joining a node group even if the nodes on either side of the join don't +have the exact same version of BDR (and therefore of the BDR +catalog). + +More precisely, if privileges on individual catalog objects were +explicitly granted, then the `bdr.join_node_group()` procedure might +fail because the corresponding GRANT statements extracted from the +node being joined might not apply to the node that is joining. + +## Role management + +Users are global objects in a PostgreSQL instance. +`CREATE USER` and `CREATE ROLE` commands are replicated automatically if they +are executed in the database where BDR is running and the +`bdr.role_replication` is turned on. However, if these commands are executed +in other databases in the same PostgreSQL instance, then they aren't replicated, +even if those users have rights on the BDR database. + +When a new BDR node joins the BDR group, existing users aren't automatically +copied unless the node is added using `bdr_init_physical`. This is intentional +and is an important security feature. PostgreSQL allows users to access multiple +databases, with the default being to access any database. BDR doesn't know +which users access which database and so can't safely decide +which users to copy across to the new node. + +PostgreSQL allows you to dump all users with the command: + +```shell +pg_dumpall --roles-only > roles.sql +``` + +The file `roles.sql` can then be edited to remove unwanted users before +reexecuting that on the newly created node. +Other mechanisms are possible, depending on your identity and access +management solution (IAM) but aren't automated at this time. + +## Roles and replication + +DDL changes executed by a user are applied as that same user on each node. + +DML changes to tables are replicated as the table-owning user on the target node. +We recommend but do not enforce that a table be owned by the same user on each node. + +Suppose table A is owned by user X on node1 and owned by user Y on node2. If user Y +has higher privileges than user X, this might be viewed as a privilege escalation. +Since some nodes have different use cases, we allow this but warn against it +to allow the security administrator to plan and audit this situation. + +On tables with row-level security policies enabled, changes +are replicated without reenforcing policies on apply. +This is equivalent to the changes being applied as +`NO FORCE ROW LEVEL SECURITY`, even if +`FORCE ROW LEVEL SECURITY` is specified. +If this isn't what you want, specify a row_filter that avoids +replicating all rows. We recommend but don't enforce +that the row security policies on all nodes be identical or +at least compatible. + +The user bdr_superuser controls replication for BDR and can +add or remove any table from any replication set. bdr_superuser +doesn't need any privileges +over individual tables, nor is this recommended. 
If you need to restrict access +to replication set functions, restricted versions of these +functions can be implemented as `SECURITY DEFINER` functions +and granted to the appropriate users. + +## Connection role + +When allocating a new BDR node, the user supplied in the DSN for the +`local_dsn` argument of `bdr.create_node` and the `join_target_dsn` of +`bdr.join_node_group` are used frequently to refer to, create, and +manage database objects. + +BDR is carefully written to prevent privilege escalation attacks even +when using a role with `SUPERUSER` rights in these DSNs. + +To further reduce the attack surface, you can specify a more restricted user +in the above DSNs. At a minimum, such a user must be +granted permissions on all nodes, such that following stipulations are +satisfied: + +- The user has the `REPLICATION` attribute. +- It is granted the `CREATE` permission on the database. +- It inherits the `bdr_superuser` role. +- It owns all database objects to replicate, either directly or from + permissions from the owner roles. + +Once all nodes are joined, the permissions can be further reduced to +just the following to still allow DML and DDL replication: + +- The user has the `REPLICATION` attribute. +- It inherits the `bdr_superuser` role. + +## Privilege restrictions + +BDR enforces additional restrictions, effectively preventing the +use of DDL that relies solely on TRIGGER or REFERENCES privileges. + +`GRANT ALL` still grants both TRIGGER and REFERENCES privileges, +so we recommend that you state privileges explicitly. For example, use +`GRANT SELECT, INSERT, UPDATE, DELETE, TRUNCATE` instead of `ALL`. + +### Foreign key privileges + +`ALTER TABLE ... ADD FOREIGN KEY` is supported only if the user has +SELECT privilege on the referenced table or if the referenced table +has RLS restrictions enabled that the current user can't bypass. + +Thus, the REFERENCES privilege isn't sufficient to allow creating +a foreign key with BDR. Relying solely on the REFERENCES privilege +isn't typically useful since it makes the validation check execute +using triggers rather than a table scan. It is typically too expensive +to use successfully. + +### Triggers + +In PostgreSQL, both the owner of a table and anyone who +was granted the TRIGGER privilege can create triggers. Triggers granted by the non-table owner +execute as the table owner in BDR, which might cause a security issue. +The TRIGGER privilege is seldom used and PostgreSQL Core Team has said +"The separate TRIGGER permission is something we consider obsolescent." + +BDR mitigates this problem by using stricter rules on who can create a trigger +on a table: + +- superuser +- bdr_superuser +- Owner of the table can create triggers according to same rules as in PostgreSQL + (must have EXECUTE privilege on the function used by the trigger). +- Users who have TRIGGER privilege on the table can create a trigger only if + they create the trigger using a function that is owned by the same owner as the + table and they satisfy standard PostgreSQL rules (again must have EXECUTE + privilege on the function). So if both table and function have the same owner and the + owner decided to give a user both TRIGGER privilege on the table and EXECUTE + privilege on the function, it is assumed that it is okay for that user to create + a trigger on that table using this function. 
+- Users who have TRIGGER privilege on the table can create triggers using + functions that are defined with the SECURITY DEFINER clause if they have EXECUTE + privilege on them. This clause makes the function always execute in the context + of the owner of the function both in standard PostgreSQL and BDR. + +This logic is built on the fact that, in PostgreSQL, the owner of the trigger +isn't the user who created it but the owner of the function used by that trigger. + +The same rules apply to existing tables, and if the existing table has triggers that +aren't owned by the owner of the table and don't use SECURITY DEFINER functions, +you can't add it to a replication set. + +These checks were added with BDR 3.6.19. An application that +relies on the behavior of previous versions can set +`bdr.backwards_compatibility` to 30618 (or lower) to behave like +earlier versions. + +BDR replication apply uses the system-level default search_path only. +Replica triggers, stream triggers, +and index expression functions might assume other search_path settings which then fail when they +execute on apply. To ensure this doesn't occur, resolve object references clearly using either the default +search_path only (always use fully qualified references to objects, e.g., schema.objectname), or set the search +path for a function using `ALTER FUNCTION ... SET search_path = ...` for the functions affected. + +## BDR default/predefined roles + +BDR predefined roles are created when the BDR extension is installed. +After BDR extension is dropped from a database, the roles continue to exist +and need to be dropped manually if required. This allows BDR to be used in multiple +databases on the same PostgreSQL instance without problem. + +The `GRANT ROLE` DDL statement doesn't participate in BDR replication. +Thus, execute this on each node of a cluster. 
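+
+For example, a minimal sketch of granting monitoring access, which must be
+repeated on every node (the `app_monitor` role name is hypothetical; the
+`bdr_monitor` role is described below):
+
+```sql
+-- Run on each node, because GRANT ROLE isn't replicated by BDR.
+GRANT bdr_monitor TO app_monitor;
+```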
+ +### bdr_superuser + +- ALL PRIVILEGES ON ALL TABLES IN SCHEMA BDR +- ALL PRIVILEGES ON ALL ROUTINES IN SCHEMA BDR + +### bdr_read_all_stats + +SELECT privilege on + +- `bdr.conflict_history_summary` +- `bdr.ddl_epoch` +- `bdr.ddl_replication` +- `bdr.global_consensus_journal_details` +- `bdr.global_lock` +- `bdr.global_locks` +- `bdr.local_consensus_state` +- `bdr.local_node_summary` +- `bdr.node` +- `bdr.node_catchup_info` +- `bdr.node_conflict_resolvers` +- `bdr.node_group` +- `bdr.node_local_info` +- `bdr.node_peer_progress` +- `bdr.node_slots` +- `bdr.node_summary` +- `bdr.replication_sets` +- `bdr.sequences` +- `bdr.stat_relation` +- `bdr.stat_subscription` +- `bdr.subscription` +- `bdr.subscription_summary` +- `bdr.tables` + +EXECUTE privilege on + +- `bdr.bdr_version` +- `bdr.bdr_version_num` +- `bdr.decode_message_payload` +- `bdr.get_global_locks` +- `bdr.get_raft_status` +- `bdr.get_relation_stats` +- `bdr.get_slot_flush_timestamp` +- `bdr.get_sub_progress_timestamp` +- `bdr.get_subscription_stats` +- `bdr.peer_state_name` +- `bdr.show_subscription_status` + +### bdr_monitor + +All privileges from `bdr_read_all_stats`, plus + +EXECUTE privilege on + +- `bdr.monitor_group_versions` +- `bdr.monitor_group_raft` +- `bdr.monitor_local_replslots` + +### bdr_application + +EXECUTE privilege on + +- All functions for column_timestamps datatypes +- All functions for CRDT datatypes +- `bdr.alter_sequence_set_kind` +- `bdr.create_conflict_trigger` +- `bdr.create_transform_trigger` +- `bdr.drop_trigger` +- `bdr.get_configured_camo_partner` +- `bdr.global_lock_table` +- `bdr.is_camo_partner_connected` +- `bdr.is_camo_partner_ready` +- `bdr.logical_transaction_status` +- `bdr.ri_fkey_trigger` +- `bdr.seq_nextval` +- `bdr.seq_currval` +- `bdr.seq_lastval` +- `bdr.trigger_get_committs` +- `bdr.trigger_get_conflict_type` +- `bdr.trigger_get_origin_node_id` +- `bdr.trigger_get_row` +- `bdr.trigger_get_type` +- `bdr.trigger_get_xid` +- `bdr.wait_for_camo_partner_queue` +- `bdr.wait_slot_confirm_lsn` + +Many of these functions have additional privileges +required before you can use them. For example, you must be +the table owner to successfully execute `bdr.alter_sequence_set_kind`. +These additional rules are described with each specific function. + +### bdr_read_all_conflicts + +BDR logs conflicts into the `bdr.conflict_history` table. Conflicts are +visible to table owners only, so no extra privileges are required +to read the conflict history. If it's useful to have a user that can +see conflicts for all tables, you can optionally grant the role +bdr_read_all_conflicts to that user. + +## Verification + +BDR was verified using the following tools and approaches. + +### Coverity + +Coverity Scan was used to verify the BDR stack providing coverage +against vulnerabilities using the following rules and coding standards: + +- MISRA C +- ISO 26262 +- ISO/IEC TS 17961 +- OWASP Top 10 +- CERT C +- CWE Top 25 +- AUTOSAR + +### CIS Benchmark + +CIS PostgreSQL Benchmark v1, 19 Dec 2019 was used to verify the BDR stack. 
+Using the `cis_policy.yml` configuration available as an option with +Trusted Postgres Architect gives the following results for the Scored tests: + +| | Result | Description | +| ------ | ---------- | ----------------------------------------------------------------- | +| 1.4 | PASS | Ensure systemd Service Files Are Enabled | +| 1.5 | PASS | Ensure Data Cluster Initialized Successfully | +| 2.1 | PASS | Ensure the file permissions mask is correct | +| 2.2 | PASS | Ensure the PostgreSQL pg_wheel group membership is correct | +| 3.1.2 | PASS | Ensure the log destinations are set correctly | +| 3.1.3 | PASS | Ensure the logging collector is enabled | +| 3.1.4 | PASS | Ensure the log file destination directory is set correctly | +| 3.1.5 | PASS | Ensure the filename pattern for log files is set correctly | +| 3.1.6 | PASS | Ensure the log file permissions are set correctly | +| 3.1.7 | PASS | Ensure 'log_truncate_on_rotation' is enabled | +| 3.1.8 | PASS | Ensure the maximum log file lifetime is set correctly | +| 3.1.9 | PASS | Ensure the maximum log file size is set correctly | +| 3.1.10 | PASS | Ensure the correct syslog facility is selected | +| 3.1.11 | PASS | Ensure the program name for PostgreSQL syslog messages is correct | +| 3.1.14 | PASS | Ensure 'debug_print_parse' is disabled | +| 3.1.15 | PASS | Ensure 'debug_print_rewritten' is disabled | +| 3.1.16 | PASS | Ensure 'debug_print_plan' is disabled | +| 3.1.17 | PASS | Ensure 'debug_pretty_print' is enabled | +| 3.1.18 | PASS | Ensure 'log_connections' is enabled | +| 3.1.19 | PASS | Ensure 'log_disconnections' is enabled | +| 3.1.21 | PASS | Ensure 'log_hostname' is set correctly | +| 3.1.23 | PASS | Ensure 'log_statement' is set correctly | +| 3.1.24 | PASS | Ensure 'log_timezone' is set correctly | +| 3.2 | PASS | Ensure the PostgreSQL Audit Extension (pgAudit) is enabled | +| 4.1 | PASS | Ensure sudo is configured correctly | +| 4.2 | PASS | Ensure excessive administrative privileges are revoked | +| 4.3 | PASS | Ensure excessive function privileges are revoked | +| 4.4 | PASS | Tested Ensure excessive DML privileges are revoked | +| 5.2 | Not Tested | Ensure login via 'host' TCP/IP Socket is configured correctly | +| 6.2 | PASS | Ensure 'backend' runtime parameters are configured correctly | +| 6.7 | Not Tested | Ensure FIPS 140-2 OpenSSL Cryptography Is Used | +| 6.8 | PASS | Ensure SSL is enabled and configured correctly | +| 7.3 | PASS | Ensure WAL archiving is configured and functional | + +Test 5.2 can PASS if audited manually, but it doesn't have an +automated test. + +Test 6.7 succeeds on default deployments using CentOS, but it +requires extra packages on Debian variants. diff --git a/product_docs/docs/pgd/5/sequences.mdx b/product_docs/docs/pgd/5/sequences.mdx new file mode 100644 index 00000000000..1862c197ab5 --- /dev/null +++ b/product_docs/docs/pgd/5/sequences.mdx @@ -0,0 +1,830 @@ +--- +title: Sequences +redirects: + - bdr/sequences + +--- + +Many applications require that unique surrogate ids be assigned to database entries. +Often the database `SEQUENCE` object is used to produce these. In +PostgreSQL, these can be either: +- A manually created sequence using the +`CREATE SEQUENCE` command and retrieved by calling the `nextval()` function +- `serial` and `bigserial` columns or, alternatively, +`GENERATED BY DEFAULT AS IDENTITY` columns + +However, standard sequences in PostgreSQL aren't multi-node aware and +produce values that are unique only on the local node. 
This is important because +unique ids generated by such sequences cause conflict and data loss (by +means of discarded `INSERT` actions) in multi-master replication. + +## BDR global sequences + +For this reason, BDR provides an application-transparent way to generate unique +ids using sequences on bigint or bigserial datatypes across the whole BDR group, +called *global sequences*. + +BDR global sequences provide an easy way for applications to use the +database to generate unique synthetic keys in an asynchronous distributed +system that works for most—but not necessarily all—cases. + +Using BDR global sequences allows you to avoid the problems with insert +conflicts. If you define a `PRIMARY KEY` or `UNIQUE` constraint on a column +that's using a global sequence, no node can ever get +the same value as any other node. When BDR synchronizes inserts between the +nodes, they can never conflict. + +BDR global sequences extend PostgreSQL sequences, so they are crash-safe. To use +them, you must be granted the `bdr_application` role. + +There are various possible algorithms for global sequences: + +- SnowflakeId sequences +- Globally allocated range sequences + +SnowflakeId sequences generate values using an algorithm that doesn't require +inter-node communication at any point. It's faster and more robust and has the +useful property of recording the timestamp at which the values were +created. + +SnowflakeId sequences have the restriction that they work only for 64-bit BIGINT +datatypes and produce values 19 digits long, which might be too long for +use in some host language datatypes such as Javascript Integer types. +Globally allocated sequences allocate a local range of values that can +be replenished as needed by inter-node consensus, making them suitable for +either BIGINT or INTEGER sequences. + +You can create a global sequence using the `bdr.alter_sequence_set_kind()` +function. This function takes a standard PostgreSQL sequence and marks it as +a BDR global sequence. It can also convert the sequence back to the standard +PostgreSQL sequence. + +BDR also provides the configuration variable `bdr.default_sequence_kind`, which +determines the kind of sequence to create when the `CREATE SEQUENCE` +command is executed or when a `serial`, `bigserial`, or +`GENERATED BY DEFAULT AS IDENTITY` column is created. Valid settings are: + +- `local`, meaning that newly created + sequences are the standard PostgreSQL (local) sequences. +- `galloc`, which always creates globally allocated range sequences. +- `snowflakeid`, which creates global sequences for BIGINT sequences that + consist of time, nodeid, and counter components. You can't use it with + INTEGER sequences (so you can use it for `bigserial` but not for `serial`). +- `timeshard`, which is the older version of SnowflakeId sequence and is provided for + backward compatibility only. The SnowflakeId is preferred. +- `distributed` (the default), which is a special value that you can use only for + `bdr.default_sequence_kind`. It selects `snowflakeid` for `int8` + sequences (i.e., `bigserial`) and `galloc` sequence for `int4` + (i.e., `serial`) and `int2` sequences. + +The `bdr.sequences` view shows information about individual sequence kinds. + +`currval()` and `lastval()` work correctly for all types of global sequence. + +### SnowflakeId sequences + +The ids generated by SnowflakeId sequences are loosely time ordered so you can +use them to get the approximate order of data insertion, like standard PostgreSQL +sequences. 
Values generated within the same millisecond might be out of order, +even on one node. The property of loose time ordering means they are suitable +for use as range partition keys. + +SnowflakeId sequences work on one or more nodes and don't require any inter-node +communication after the node join process completes. So you can continue to +use them even if there's the risk of extended network partitions. They aren't +affected by replication lag or inter-node latency. + +SnowflakeId sequences generate unique ids in a different +way from standard sequences. The algorithm uses three components for a +sequence number. The first component of the sequence is a timestamp +at the time of sequence number generation. The second component of +the sequence number is the unique id assigned to each BDR node, +which ensures that the ids from different nodes are always different. +The third component is the number generated by +the local sequence. + +While adding a unique node id to the sequence number is enough +to ensure there are no conflicts, we also want to keep another useful +property of sequences. The ordering of the sequence +numbers roughly corresponds to the order in which data was inserted +into the table. Putting the timestamp first ensures this. + +A few limitations and caveats apply to SnowflakeId sequences. + +SnowflakeId sequences are 64 bits wide and need a `bigint` or `bigserial`. +Values generated are at least 19 digits long. +There's no practical 32-bit `integer` version, so you can't use it with `serial` +sequences. Use globally allocated range sequences instead. + +For SnowflakeId there's a limit of 4096 sequence values generated per +millisecond on any given node (about 4 million sequence values per +second). In case the sequence value generation wraps around within a given +millisecond, the SnowflakeId sequence waits until the next millisecond and gets a +fresh value for that millisecond. + +Since SnowflakeId sequences encode timestamps into the sequence value, you can generate new sequence +values only within the given time frame (depending on the system clock). +The oldest timestamp that you can use is 2016-10-07, which is the epoch time for +the SnowflakeId. The values wrap to negative values in the year 2086 and +completely run out of numbers by 2156. + +Since timestamp is an important part of a SnowflakeId sequence, there's additional +protection from generating sequences with a timestamp older than the latest one +used in the lifetime of a postgres process (but not between postgres restarts). + +The `INCREMENT` option on a sequence used as input for SnowflakeId sequences is +effectively ignored. This might be relevant for applications that do sequence +ID caching, like many object-relational mapper (ORM) tools, notably Hibernate. +Because the sequence is time based, this has little practical effect since the +sequence advances to a new noncolliding value by the time the +application can do anything with the cached values. + +Similarly, you might change the `START`, `MINVALUE`, `MAXVALUE`, and `CACHE` settings +on the underlying sequence, but there's no benefit to doing +so. The sequence's low 14 bits are used and the rest is discarded, so +the value range limits don't affect the function's result. For the same +reason, `setval()` isn't useful for SnowflakeId sequences. + +#### Timeshard sequences + +Timeshard sequences are provided for backward compatibility with existing +installations but aren't recommended for new application use. 
We recommend +using the SnowflakeId sequence instead. + +Timeshard is very similar to SnowflakeId but has different limits and fewer +protections and slower performance. + +The differences between timeshard and SnowflakeId are as following: + + - Timeshard can generate up to 16384 per millisecond (about 16 million per + second), which is more than SnowflakeId. However, there's no protection + against wraparound within a given millisecond. Schemas using the timeshard + sequence must protect the use of the `UNIQUE` constraint when using timeshard values + for given column. + - The timestamp component of timeshard sequence runs out of values in + the year 2050 and, if used in combination with bigint, the values wrap + to negative numbers in the year 2033. This means that sequences generated + after 2033 have negative values. This is a considerably shorter time + span than SnowflakeId and is the main reason why SnowflakeId is preferred. + - Timeshard sequences require occasional disk writes (similar to standard local + sequences). SnowflakeIds are calculated in memory so the SnowflakeId + sequences are in general a little faster than timeshard sequences. + +### Globally allocated range sequences + +The globally allocated range (or `galloc`) sequences allocate ranges (chunks) +of values to each node. When the local range is used up, a new range is +allocated globally by consensus amongst the other nodes. This uses the key +space efficiently but requires that the local node be connected to a majority +of the nodes in the cluster for the sequence generator to progress when the +currently assigned local range is used up. + +Unlike SnowflakeId sequences, `galloc` sequences support all sequence data types +provided by PostgreSQL: `smallint`, `integer`, and `bigint`. This means that +you can use `galloc` sequences in environments where 64-bit sequences are +problematic. Examples include using integers in javascript, since that supports only +53-bit values, or when the sequence is displayed on output with limited space. + +The range assigned by each voting is currently predetermined based on the +datatype the sequence is using: + +- smallint — 1 000 numbers +- integer — 1 000 000 numbers +- bigint — 1 000 000 000 numbers + +Each node allocates two chunks of seq_chunk_size, one for the current use +plus a reserved chunk for future usage, so the values generated from any one +node increase monotonically. However, viewed globally, the values +generated aren't ordered at all. This might cause a loss of performance +due to the effects on b-tree indexes and typically means that generated +values aren't useful as range partition keys. + +The main downside of the `galloc` sequences is that once the assigned range is +used up, the sequence generator has to ask for consensus about the next range +for the local node that requires inter-node communication. This could +lead to delays or operational issues if the majority of the BDR group isn't +accessible. This might be avoided in later releases. + +The `CACHE`, `START`, `MINVALUE`, and `MAXVALUE` options work correctly +with `galloc` sequences. However, you need to set them before transforming +the sequence to the `galloc` kind. The `INCREMENT BY` option also works +correctly. However, you can't assign an increment value that's equal +to or more than the above ranges assigned for each sequence datatype. +`setval()` doesn't reset the global state for `galloc` sequences; don't use it. + +A few limitations apply to `galloc` sequences. 
BDR tracks `galloc` sequences in a +special BDR catalog [bdr.sequence_alloc](catalogs#bdrsequence_alloc). This +catalog is required to track the currently allocated chunks for the `galloc` +sequences. The sequence name and namespace is stored in this catalog. Since the +sequence chunk allocation is managed by Raft, whereas any changes to the +sequence name/namespace is managed by the replication stream, BDR currently doesn't +support renaming `galloc` sequences or moving them to another namespace or +renaming the namespace that contains a `galloc` sequence. Be +mindful of this limitation while designing application schema. + +#### Converting a local sequence to a galloc sequence + +Before transforming a local sequence to galloc, you need to take care of several +prerequisites. + +##### 1. Verify that sequence and column data type match + +Check that the sequence's data type matches the data type of the column with +which it will be used. For example, you can create a `bigint` sequence +and assign an `integer` column's default to the `nextval()` returned by that +sequence. With galloc sequences, which for `bigint` are allocated in blocks of +1 000 000 000, this quickly results in the values returned by `nextval()` +exceeding the `int4` range if more than two nodes are in use. + +The following example shows what can happen: + +```sql +CREATE SEQUENCE int8_seq; + +SELECT sequencename, data_type FROM pg_sequences; + sequencename | data_type +--------------+----------- + int8_seq | bigint +(1 row) + +CREATE TABLE seqtest (id INT NOT NULL PRIMARY KEY); + +ALTER SEQUENCE int8_seq OWNED BY seqtest.id; + +SELECT bdr.alter_sequence_set_kind('public.int8_seq'::regclass, 'galloc', 1); + alter_sequence_set_kind +------------------------- + +(1 row) + +ALTER TABLE seqtest ALTER COLUMN id SET DEFAULT nextval('int8_seq'::regclass); +``` + +After executing `INSERT INTO seqtest VALUES(DEFAULT)` on two nodes, the table +contains the following values: + +```sql +SELECT * FROM seqtest; + id +------------ + 2 + 2000000002 +(2 rows) +``` + +However, attempting the same operation on a third node fails with an +`integer out of range` error, as the sequence generated the value +`4000000002`. + +!!! Tip + You can retrieve the current data type of a sequence from the PostgreSQL + [pg_sequences](https://www.postgresql.org/docs/current/view-pg-sequences.html) + view. You can modify the data type of a sequence with `ALTER SEQUENCE ... AS ...`, + for example, `ALTER SEQUENCE public.sequence AS integer`, as long as its current + value doesn't exceed the maximum value of the new data type. + +##### 2. Set a new start value for the sequence + +When the sequence kind is altered to `galloc`, it's rewritten and restarts from +the defined start value of the local sequence. If this happens on an existing +sequence in a production database, you need to query the current value and +then set the start value appropriately. To assist with this use case, BDR +allows users to pass a starting value with the function `bdr.alter_sequence_set_kind()`. +If you're already using offset and you have writes from multiple nodes, you +need to check what is the greatest used value and restart the sequence at least +to the next value. 
+
+```sql
+-- determine highest sequence value across all nodes
+SELECT max((x->'response'->'command_tuples'->0->>'nextval')::bigint)
+  FROM jsonb_array_elements(
+    bdr.run_on_all_nodes(
+      E'SELECT nextval(\'public.sequence\');'
+    )::jsonb) AS x;
+
+-- turn into a galloc sequence
+SELECT bdr.alter_sequence_set_kind('public.sequence'::regclass, 'galloc', $MAX + $MARGIN);
+```
+
+Since users can't lock a sequence, you must leave a `$MARGIN` value to allow
+operations to continue while the `max()` value is queried.
+
+The `bdr.sequence_alloc` table gives information on the chunk size and the
+ranges allocated around the whole cluster.
+In this example, the sequence started at `333`, and the cluster has two nodes.
+The number of allocations is 4, that is, 2 per node, and the chunk size is
+1000000, which corresponds to an integer sequence.
+
+```sql
+SELECT * FROM bdr.sequence_alloc
+  WHERE seqid = 'public.categories_category_seq'::regclass;
+          seqid          | seq_chunk_size | seq_allocated_up_to | seq_nallocs |        seq_last_alloc
+-------------------------+----------------+---------------------+-------------+-------------------------------
+ categories_category_seq |        1000000 |             4000333 |           4 | 2020-05-21 20:02:15.957835+00
+(1 row)
+```
+
+To see the ranges currently assigned to a given sequence on each node, use
+these queries:
+
+* Node `Node1` is using the range from `334` to `2000333`.
+
+```sql
+SELECT last_value AS range_start, log_cnt AS range_end
+  FROM categories_category_seq WHERE ctid = '(0,2)'; -- first range
+ range_start | range_end
+-------------+-----------
+         334 |   1000333
+(1 row)
+
+SELECT last_value AS range_start, log_cnt AS range_end
+  FROM categories_category_seq WHERE ctid = '(0,3)'; -- second range
+ range_start | range_end
+-------------+-----------
+     1000334 |   2000333
+(1 row)
+```
+
+* Node `Node2` is using the range from `2000334` to `4000333`.
+
+```sql
+SELECT last_value AS range_start, log_cnt AS range_end
+  FROM categories_category_seq WHERE ctid = '(0,2)'; -- first range
+ range_start | range_end
+-------------+-----------
+     2000334 |   3000333
+(1 row)
+
+SELECT last_value AS range_start, log_cnt AS range_end
+  FROM categories_category_seq WHERE ctid = '(0,3)'; -- second range
+ range_start | range_end
+-------------+-----------
+     3000334 |   4000333
+(1 row)
+```
+
+!!! NOTE
+    You can't combine the two queries into one (like `WHERE ctid IN ('(0,2)', '(0,3)')`)
+    because that still shows only the first range.
+
+When a node finishes a chunk, it asks consensus for a new one and gets the
+first available chunk. In this example, that's the range from 4000334 to 5000333.
+This is the new reserved chunk, and the node starts to consume the old reserved chunk.
+
+## UUIDs, KSUUIDs, and other approaches
+
+There are other ways to generate globally unique ids that can be used with BDR
+without using the global sequences. For example:
+
+- UUIDs and their BDR variant, KSUUIDs
+- Local sequences with a different offset per node (i.e., manual)
+- An externally coordinated natural key
+
+BDR applications can't use other methods safely:
+counter-table-based approaches relying on `SELECT ... FOR UPDATE`, `UPDATE ... RETURNING ...`,
+or similar for sequence generation don't work correctly in BDR because BDR
+doesn't take row locks between nodes. The same values are generated on
+more than one node. For the same reason, the usual strategies for "gapless"
+sequence generation don't work with BDR.
In most cases, the application +coordinates generation of sequences that must be gapless from some external +source using two-phase commit. Or it generates them only on one node in +the BDR group. + +### UUIDs and KSUUIDs + +`UUID` keys instead avoid sequences entirely and +use 128-bit universal unique identifiers. These are random +or pseudorandom values that are so large that it's nearly +impossible for the same value to be generated twice. There's +no need for nodes to have continuous communication when using `UUID` keys. + +In the unlikely event of a collision, conflict detection +chooses the newer of the two inserted records to retain. Conflict logging, +if enabled, records such an event. However, it's +exceptionally unlikely to ever occur, since collisions +become practically likely only after about `2^64` keys are generated. + +The main downside of `UUID` keys is that they're somewhat inefficient in terms of space and +the network. They consume more space not only as a primary key but +also where referenced in foreign keys and when transmitted on the wire. +Also, not all applications cope well with `UUID` keys. + +BDR provides functions for working with a K-Sortable variant of `UUID` data, +known as KSUUID, which generates values that can be stored using the PostgreSQL +standard `UUID` data type. A `KSUUID` value is similar to `UUIDv1` in that +it stores both timestamp and random data, following the `UUID` standard. +The difference is that `KSUUID` is K-Sortable, meaning that it's weakly +sortable by timestamp. This makes it more useful as a database key as it +produces more compact `btree` indexes, which improves +the effectiveness of search, and allows natural time-sorting of result data. +Unlike `UUIDv1`, +`KSUUID` values don't include the MAC of the computer on which they were +generated, so there are no security concerns from using them. + +`KSUUID` v2 is now recommended in all cases. You can directly sort values generated +with regular comparison operators. + +There are two versions of `KSUUID` in BDR: v1 and v2. +The legacy `KSUUID` v1 is +deprecated but is kept in order to support existing installations. Don't +use it for new installations. +The internal contents of v1 and v2 aren't compatible. As such, the +functions to manipulate them also aren't compatible. The v2 of `KSUUID` also +no longer stores the `UUID` version number. + +### Step and offset sequences + +In offset-step sequences, a normal PostgreSQL sequence is used on each node. +Each sequence increments by the same amount and starts at differing offsets. +For example, with step 1000, node1's sequence generates 1001, 2001, 3001, and +so on. node2's sequence generates 1002, 2002, 3002, and so on. This scheme works well +even if the nodes can't communicate for extended periods. However, the designer +must specify a maximum number of nodes when establishing the +schema, and it requires per-node configuration. Mistakes can easily lead to +overlapping sequences. 
+ +It's relatively simple to configure this approach with BDR by creating the +desired sequence on one node, like this: + +``` +CREATE TABLE some_table ( + generated_value bigint primary key +); + +CREATE SEQUENCE some_seq INCREMENT 1000 OWNED BY some_table.generated_value; + +ALTER TABLE some_table ALTER COLUMN generated_value SET DEFAULT nextval('some_seq'); +``` + +Then, on each node calling `setval()`, give each node a different offset +starting value, for example: + +``` +-- On node 1 +SELECT setval('some_seq', 1); + +-- On node 2 +SELECT setval('some_seq', 2); + + -- ... etc +``` + +Be sure to allow a large enough `INCREMENT` to leave room for all +the nodes you might ever want to add, since changing it in future is difficult +and disruptive. + +If you use `bigint` values, there's no practical concern about key exhaustion, +even if you use offsets of 10000 or more. It would take hundreds of years, +with hundreds of machines, doing millions of inserts per second, to have any +chance of approaching exhaustion. + +BDR doesn't currently offer any automation for configuration of the +per-node offsets on such step/offset sequences. + +#### Composite keys + +A variant on step/offset sequences is to use a composite key composed of +`PRIMARY KEY (node_number, generated_value)`, where the +node number is usually obtained from a function that returns a different +number on each node. You can create such a function by temporarily +disabling DDL replication and creating a constant SQL function. Alternatively, you can use +a one-row table that isn't part of a replication set to store a different +value in each node. + +## Global sequence management interfaces + +BDR provides an interface for converting between a standard PostgreSQL sequence +and the BDR global sequence. + +The following functions are considered to be `DDL`, so DDL replication +and global locking applies to them. + +### bdr.alter_sequence_set_kind + +Allows the owner of a sequence to set the kind of a sequence. +Once set, `seqkind` is visible only by way of the `bdr.sequences` view. +In all other ways, the sequence appears as a normal sequence. + +BDR treats this function as `DDL`, so DDL replication and global locking applies, +if it's currently active. See [DDL Replication](ddl). + +#### Synopsis + +```sql +bdr.alter_sequence_set_kind(seqoid regclass, seqkind text) +``` + +#### Parameters + +- `seqoid` — Name or Oid of the sequence to alter. +- `seqkind` — `local` for a standard PostgreSQL sequence, `snowflakeid` or + `galloc` for globally unique BDR sequences, or `timeshard` for legacy + globally unique sequence. + +#### Notes + +When changing the sequence kind to `galloc`, the first allocated range for that +sequence uses the sequence start value as the starting point. When there are +existing values that were used by the sequence before it was changed to `galloc`, +we recommend moving the starting point so that the newly generated +values don't conflict with the existing ones using the following command: + +```sql +ALTER SEQUENCE seq_name START starting_value RESTART +``` + +This function uses the same replication mechanism as `DDL` statements. This means +that the replication is affected by the [ddl filters](repsets#ddl-replication-filtering) +configuration. + +The function takes a global `DDL` lock. It also locks the sequence locally. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. 
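+
+As a brief sketch of the notes above, converting an existing local sequence
+might look like the following. The sequence name and starting value are only
+examples.
+
+```sql
+-- Move the start point past any values the sequence already produced,
+-- then convert it to a globally allocated (galloc) sequence.
+ALTER SEQUENCE public.categories_category_seq START 5000000 RESTART;
+
+SELECT bdr.alter_sequence_set_kind('public.categories_category_seq'::regclass,
+                                   'galloc');
+```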
+ +Only the owner of the sequence can execute the `bdr.alter_sequence_set_kind` function +unless `bdr.backwards_compatibility` is +set is set to 30618 or lower. + +### bdr.extract_timestamp_from_snowflakeid + +This function extracts the timestamp component of the `snowflakeid` sequence. +The return value is of type timestamptz. + +#### Synopsis +```sql +bdr.extract_timestamp_from_snowflakeid(snowflakeid bigint) +``` + +#### Parameters + - `snowflakeid` — Value of a snowflakeid sequence. + +#### Notes + +This function executes only on the local node. + +### bdr.extract_nodeid_from_snowflakeid + +This function extracts the nodeid component of the `snowflakeid` sequence. + +#### Synopsis +```sql +bdr.extract_nodeid_from_snowflakeid(snowflakeid bigint) +``` + +#### Parameters + - `snowflakeid` — Value of a snowflakeid sequence. + +#### Notes + +This function executes only on the local node. + +### bdr.extract_localseqid_from_snowflakeid + +This function extracts the local sequence value component of the `snowflakeid` sequence. + +#### Synopsis +```sql +bdr.extract_localseqid_from_snowflakeid(snowflakeid bigint) +``` + +#### Parameters + - `snowflakeid` — Value of a snowflakeid sequence. + +#### Notes + +This function executes only on the local node. + +### bdr.timestamp_to_snowflakeid + +This function converts a timestamp value to a dummy snowflakeid sequence value. + +This is useful for doing indexed searches or comparisons of values in the +snowflakeid column and for a specific timestamp. + +For example, given a table `foo` with a column `id` that's using a `snowflakeid` +sequence, we can get the number of changes since yesterday midnight like this: + +``` +SELECT count(1) FROM foo WHERE id > bdr.timestamp_to_snowflakeid('yesterday') +``` + +A query formulated this way uses an index scan on the column `id`. + +#### Synopsis +```sql +bdr.timestamp_to_snowflakeid(ts timestamptz) +``` + +#### Parameters + - `ts` — Timestamp to use for the snowflakeid sequence generation. + +#### Notes + +This function executes only on the local node. + +### bdr.extract_timestamp_from_timeshard + +This function extracts the timestamp component of the `timeshard` sequence. +The return value is of type timestamptz. + +#### Synopsis + +```sql +bdr.extract_timestamp_from_timeshard(timeshard_seq bigint) +``` + +#### Parameters + +- `timeshard_seq` — Value of a timeshard sequence. + +#### Notes + +This function executes only on the local node. + +### bdr.extract_nodeid_from_timeshard + +This function extracts the nodeid component of the `timeshard` sequence. + +#### Synopsis + +```sql +bdr.extract_nodeid_from_timeshard(timeshard_seq bigint) +``` + +#### Parameters + +- `timeshard_seq` — Value of a timeshard sequence. + +#### Notes + +This function executes only on the local node. + +### bdr.extract_localseqid_from_timeshard + +This function extracts the local sequence value component of the `timeshard` sequence. + +#### Synopsis + +```sql +bdr.extract_localseqid_from_timeshard(timeshard_seq bigint) +``` + +#### Parameters + +- `timeshard_seq` — Value of a timeshard sequence. + +#### Notes + +This function executes only on the local node. + +### bdr.timestamp_to_timeshard + +This function converts a timestamp value to a dummy timeshard sequence value. + +This is useful for doing indexed searches or comparisons of values in the +timeshard column and for a specific timestamp. 
+ +For example, given a table `foo` with a column `id` that's using a `timeshard` +sequence, we can get the number of changes since yesterday midnight like this: + +``` +SELECT count(1) FROM foo WHERE id > bdr.timestamp_to_timeshard('yesterday') +``` + +A query formulated this way uses an index scan on the column `id`. + +#### Synopsis + +```sql +bdr.timestamp_to_timeshard(ts timestamptz) +``` + +#### Parameters + +- `ts` — Timestamp to use for the timeshard sequence generation. + +#### Notes + +This function executes only on the local node. + +## KSUUID v2 Functions + +Functions for working with `KSUUID` v2 data, K-Sortable UUID data. + +### bdr.gen_ksuuid_v2 + +This function generates a new `KSUUID` v2 value using the value of timestamp passed as an +argument or current system time if NULL is passed. +If you want to generate KSUUID automatically using the system time, pass a NULL argument. + +The return value is of type UUID. + +#### Synopsis + +```sql +bdr.gen_ksuuid_v2(timestamptz) +``` + +#### Notes + +This function executes only on the local node. + +### bdr.ksuuid_v2_cmp + +This function compares the `KSUUID` v2 values. + +It returns 1 if the first value is newer, -1 if the second value is lower, or zero if they +are equal. + +#### Synopsis + +```sql +bdr.ksuuid_v2_cmp(uuid, uuid) +``` + +#### Parameters + +- `UUID` — `KSUUID` v2 to compare. + +#### Notes + +This function executes only on the local node. + +### bdr.extract_timestamp_from_ksuuid_v2 + +This function extracts the timestamp component of `KSUUID` v2. +The return value is of type timestamptz. + +#### Synopsis + +```sql +bdr.extract_timestamp_from_ksuuid_v2(uuid) +``` + +#### Parameters + +- `UUID` — `KSUUID` v2 value to extract timestamp from. + +#### Notes + +This function executes only on the local node. + +## KSUUID v1 functions + +Functions for working with `KSUUID` v1 data, K-Sortable UUID data(v1). + +### bdr.gen_ksuuid + +This function generates a new `KSUUID` v1 value, using the current system time. +The return value is of type UUID. + +#### Synopsis + +```sql +bdr.gen_ksuuid() +``` + +#### Notes + +This function executes only on the local node. + +### bdr.uuid_v1_cmp + +This function compares the `KSUUID` v1 values. + +It returns 1 if the first value is newer, -1 if the second value is lower, or zero if they +are equal. + +#### Synopsis + +```sql +bdr.uuid_v1_cmp(uuid, uuid) +``` + +#### Notes + +This function executes only on the local node. + +#### Parameters + +- `UUID` — `KSUUID` v1 to compare. + +### bdr.extract_timestamp_from_ksuuid + +This function extracts the timestamp component of `KSUUID` v1 or `UUIDv1` values. +The return value is of type timestamptz. + +#### Synopsis + +```sql +bdr.extract_timestamp_from_ksuuid(uuid) +``` + +#### Parameters + +- `UUID` — `KSUUID` v1 value to extract timestamp from. + +#### Notes + +This function executes on the local node. diff --git a/product_docs/docs/pgd/5/striggers.mdx b/product_docs/docs/pgd/5/striggers.mdx new file mode 100644 index 00000000000..1b9971c985b --- /dev/null +++ b/product_docs/docs/pgd/5/striggers.mdx @@ -0,0 +1,692 @@ +--- +title: Stream triggers +redirects: + - bdr/striggers + +--- + +BDR introduces new types of triggers that you can use for additional +data processing on the downstream/target node. + +- Conflict triggers +- Transform triggers + +Together, these types of triggers are known as *stream triggers*. + +Stream triggers are designed to be trigger-like in syntax. 
They leverage the +PostgreSQL BEFORE trigger architecture and are likely to have similar +performance characteristics as PostgreSQL BEFORE Triggers. + +Multiple trigger definitions can use one trigger function, just as with +normal PostgreSQL triggers. +A trigger function is a program defined in this form: +`CREATE FUNCTION ... RETURNS TRIGGER`. Creating the trigger doesn't +require use of the `CREATE TRIGGER` command. Instead, create stream triggers +using the special BDR functions +`bdr.create_conflict_trigger()` and `bdr.create_transform_trigger()`. + +Once created, the trigger is visible in the catalog table `pg_trigger`. +The stream triggers are marked as `tgisinternal = true` and +`tgenabled = 'D'` and have the name suffix '\_bdrc' or '\_bdrt'. The view +`bdr.triggers` provides information on the triggers in relation to the table, +the name of the procedure that is being executed, the event that triggers it, +and the trigger type. + +Stream triggers aren't enabled for normal SQL processing. +Because of this, the `ALTER TABLE ... ENABLE TRIGGER` is blocked for stream +triggers in both its specific name variant and the ALL variant. This mechanism prevents +the trigger from executing as a normal SQL trigger. + +These triggers execute on the downstream or target node. There's no +option for them to execute on the origin node. However, you might want to consider +the use of `row_filter` expressions on the origin. + +Also, any DML that is applied while executing a stream +trigger isn't replicated to other BDR nodes and doesn't +trigger the execution of standard local triggers. This is intentional. You can use it, for example, +to log changes or conflicts captured by a +stream trigger into a table that is crash-safe and specific of that +node. See [Stream triggers examples](#stream-triggers-examples) for a working example. + +## Trigger execution during Apply + +Transform triggers execute first—once for each incoming change in the +triggering table. These triggers fire before we attempt to locate a +matching target row, allowing a very wide range of transforms to be applied +efficiently and consistently. + +Next, for UPDATE and DELETE changes, we locate the target row. If there's no +target row, then no further processing occurs for those change types. + +We then execute any normal triggers that previously were explicitly enabled +as replica triggers at table-level: + +```sql +ALTER TABLE tablename +ENABLE REPLICA TRIGGER trigger_name; +``` + +We then decide whether a potential conflict exists. If so, we then call any +conflict trigger that exists for that table. + +### Missing column conflict resolution + +Before transform triggers are executed, PostgreSQL tries to match the +incoming tuple against the row-type of the target table. + +Any column that exists on the input row but not on the target table +triggers a conflict of type `target_column_missing`. Conversely, a +column existing on the target table but not in the incoming row +triggers a `source_column_missing` conflict. The default resolutions +for those two conflict types are respectively `ignore_if_null` and +`use_default_value`. + +This is relevant in the context of rolling schema upgrades, for +example, if the new version of the schema introduces a new +column. When replicating from an old version of the schema to a new +one, the source column is missing, and the `use_default_value` +strategy is appropriate, as it populates the newly introduced column +with the default value. 
+
+However, when replicating from a node having the new schema version to
+a node having the old one, the column is missing from the target
+table. The `ignore_if_null` resolver isn't appropriate for a
+rolling upgrade because it breaks replication as soon as the user
+inserts a tuple with a non-NULL value
+in the new column in any of the upgraded nodes.
+
+In view of this example, the appropriate setting for rolling schema
+upgrades is to configure each node to apply the `ignore` resolver in
+case of a `target_column_missing` conflict.
+
+You can do this with the following query, which you must execute
+separately on each node. Replace `node1` with the actual
+node name.
+
+```sql
+SELECT bdr.alter_node_set_conflict_resolver('node1',
+        'target_column_missing', 'ignore');
+```
+
+#### Data loss and divergence risk
+
+Setting the conflict resolver to `ignore`
+can lead to data loss and cluster divergence.
+
+Consider the following example: table `t` exists on nodes 1 and 2, but
+its column `col` exists only on node 1.
+
+If the conflict resolver is set to `ignore`, then there can be rows on
+node 1 where `col` isn't null, for example, `(pk=1, col=100)`. That row is
+replicated to node 2, and the value in column `col` is discarded,
+for example, `(pk=1)`.
+
+If column `col` is then added to the table on node 2, it is at first
+set to NULL on all existing rows, and the row considered above
+becomes `(pk=1, col=NULL)`. The row having `pk=1` is no longer
+identical on all nodes, and the cluster is therefore divergent.
+
+The default `ignore_if_null` resolver isn't affected by
+this risk because any row replicated to node 2 has
+`col=NULL`.
+
+Based on this example, we recommend running LiveCompare against the
+whole cluster at the end of a rolling schema upgrade where the
+`ignore` resolver was used. This practice helps to ensure that you detect and fix any divergence.
+
+## Terminology of row-types
+
+We use these row-types:
+
+- `SOURCE_OLD` is the row before update, that is, the key.
+- `SOURCE_NEW` is the new row coming from another node.
+- `TARGET` is the row that exists on the node already, that is, the conflicting row.
+
+## Conflict triggers
+
+Conflict triggers execute when a conflict is detected by BDR.
+They decide what happens when the conflict has occurred.
+
+- If the trigger function returns a row, the action is applied to the target.
+- If the trigger function returns a NULL row, the action is skipped.
+
+For example, if the trigger is called for a `DELETE`, the trigger
+returns NULL if it wants to skip the `DELETE`. If you want the `DELETE` to proceed,
+then return a row value: either `SOURCE_OLD` or `TARGET` works.
+When the conflicting operation is either `INSERT` or `UPDATE`, and the
+chosen resolution is the deletion of the conflicting row, the trigger
+must explicitly perform the deletion and return NULL.
+The trigger function can perform other SQL actions as it chooses, but
+those actions are only applied locally, not replicated.
+
+When a real data conflict occurs between two or more nodes,
+two or more concurrent changes are occurring. When we apply those changes, the
+conflict resolution occurs independently on each node. This means the conflict
+resolution occurs once on each node and can occur with a
+significant time difference between them. As a result, communication between the multiple executions of the conflict
+trigger isn't possible. 
It is the responsibility of the author of the conflict trigger to +ensure that the trigger gives exactly the same result for all related events. +Otherwise, data divergence occurs. Technical Support recommends that you formally test all conflict +triggers using the isolationtester tool supplied with +BDR. + +!!! Warning + - You can specify multiple conflict triggers on a single table, but + they must match a distinct event. That is, each conflict must + match only a single conflict trigger. + - We don't recommend multiple triggers matching the same event on the same table. + They might result in inconsistent behavior and + will not be allowed in a future release. + +If the same conflict trigger matches more than one event, you can use the `TG_OP` +variable in the trigger to identify the operation that +produced the conflict. + +By default, BDR detects conflicts by observing a change of replication origin +for a row. Hence, you can call a conflict trigger even when +only one change is occurring. Since, in this case, there's no +real conflict, this conflict detection mechanism can generate +false-positive conflicts. The conflict trigger must handle all of those +identically. + +In some cases, timestamp conflict detection doesn't detect a +conflict at all. For example, in a concurrent `UPDATE`/`DELETE` where the +`DELETE` occurs just after the `UPDATE`, any nodes that see first the `UPDATE` +and then the `DELETE` don't see any conflict. If no conflict is seen, +the conflict trigger are never called. In the same situation but using +row version conflict detection, a conflict is seen, which a conflict trigger can then +handle. + +The trigger function has access to additional state information as well as +the data row involved in the conflict, depending on the operation type: + +- On `INSERT`, conflict triggers can access the `SOURCE_NEW` row from + the source and `TARGET` row. +- On `UPDATE`, conflict triggers can access the `SOURCE_OLD` and + `SOURCE_NEW` row from the source and `TARGET` row. +- On `DELETE`, conflict triggers can access the `SOURCE_OLD` row from + the source and `TARGET` row. + +You can use the function `bdr.trigger_get_row()` to retrieve `SOURCE_OLD`, `SOURCE_NEW`, +or `TARGET` rows, if a value exists for that operation. + +Changes to conflict triggers happen transactionally and are protected by +global DML locks during replication of the configuration change, similarly +to how some variants of `ALTER TABLE` are handled. + +If primary keys are updated inside a conflict trigger, it can +sometimes lead to unique constraint violations errors due to a difference +in timing of execution. +Hence, avoid updating primary keys in conflict triggers. + +## Transform triggers + +These triggers are similar to conflict triggers, except they are executed +for every row on the data stream against the specific table. The behavior of +return values and the exposed variables is similar, but transform triggers +execute before a target row is identified, so there is no `TARGET` row. + +You can specify multiple transform triggers on each table in BDR. +Transform triggers execute in alphabetical order. + +A transform trigger can filter away rows, and it can do additional operations +as needed. It can alter the values of any column or set them to `NULL`. The +return value decides the further action taken: + +- If the trigger function returns a row, it's applied to the target. +- If the trigger function returns a `NULL` row, there's no further action to + perform. Unexecuted triggers never execute. 
+- The trigger function can perform other actions as it chooses. + +The trigger function has access to additional state information as well as +rows involved in the conflict: + +- On `INSERT`, transform triggers can access the `SOURCE_NEW` row from the source. +- On `UPDATE`, transform triggers can access the `SOURCE_OLD` and `SOURCE_NEW` row from the source. +- On `DELETE`, transform triggers can access the `SOURCE_OLD` row from the source. + +You can use the function `bdr.trigger_get_row()` to retrieve `SOURCE_OLD` or `SOURCE_NEW` +rows. `TARGET` row isn't available, since this type of trigger executes before such +a target row is identified, if any. + +Transform triggers look very similar to normal BEFORE row triggers but have these +important differences: + +- A transform trigger gets called for every incoming change. + BEFORE triggers aren't called at all for `UPDATE` and `DELETE` changes + if a matching row in a table isn't found. + +- Transform triggers are called before partition table routing occurs. + +- Transform triggers have access to the lookup key via `SOURCE_OLD`, + which isn't available to normal SQL triggers. + +## Stream triggers variables + +Both conflict triggers and transform triggers have access to information about +rows and metadata by way of the predefined variables provided by the trigger API and +additional information functions provided by BDR. + +In PL/pgSQL, you can use the predefined variables that follow. + +### TG_NAME + +Data type name. This variable contains the name of the trigger actually fired. +The actual trigger name has a '\_bdrt' or '\_bdrc' suffix +(depending on trigger type) compared to the name provided during trigger creation. + +### TG_WHEN + +Data type text. This variable says `BEFORE` for both conflict and transform triggers. +You can get the stream trigger type by calling the `bdr.trigger_get_type()` +information function. See [bdr.trigger_get_type](#bdrtrigger_get_type). + +### TG_LEVEL + +Data type text: a string of `ROW`. + +### TG_OP + +Data type text: a string of `INSERT`, `UPDATE`, or `DELETE` identifying the operation for which the trigger was fired. + +### TG_RELID + +Data type oid: the object ID of the table that caused the trigger invocation. + +### TG_TABLE_NAME + +Data type name: the name of the table that caused the trigger invocation. + +### TG_TABLE_SCHEMA + +Data type name: the name of the schema of the table that caused the trigger +invocation. For partitioned tables, this is the name of the root table. + +### TG_NARGS + +Data type integer: the number of arguments given to the trigger function in +the `bdr.create_conflict_trigger()` or `bdr.create_transform_trigger()` +statement. + +### TG_ARGV\[] + +Data type array of text: the arguments from the `bdr.create_conflict_trigger()` +or `bdr.create_transform_trigger()` statement. The index counts from 0. +Invalid indexes (less than 0 or greater than or equal to `TG_NARGS`) result in +a `NULL` value. + +## Information functions + +### bdr.trigger_get_row + +This function returns the contents of a trigger row specified by an identifier +as a `RECORD`. This function returns `NULL` if called inappropriately, that is, +called with `SOURCE_NEW` when the operation type (TG_OP) is `DELETE`. + +#### Synopsis + +```sql +bdr.trigger_get_row(row_id text) +``` + +#### Parameters + +- `row_id` — identifier of the row. Can be any of `SOURCE_NEW`, `SOURCE_OLD`, and + `TARGET`, depending on the trigger type and operation (see description of + individual trigger types). 
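+
+As a minimal sketch of how you might call this function in a conflict trigger
+for `INSERT`/`UPDATE` conflicts (the trigger function name and the `counter`
+column are only illustrations), the trigger can fetch both rows and return the
+one to keep:
+
+```sql
+CREATE OR REPLACE FUNCTION keep_larger_counter_trig_func()
+RETURNS TRIGGER
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    SOURCE_NEW record;
+    TARGET record;
+BEGIN
+    SOURCE_NEW := bdr.trigger_get_row('SOURCE_NEW');
+    TARGET := bdr.trigger_get_row('TARGET');
+
+    -- TARGET is NULL when no matching row was found on this node.
+    IF TARGET IS NULL OR SOURCE_NEW.counter >= TARGET.counter THEN
+        RETURN SOURCE_NEW;
+    END IF;
+
+    RETURN TARGET;
+END;
+$$;
+```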
+ +### bdr.trigger_get_committs + +This function returns the commit timestamp of a trigger row specified by an +identifier. If not available because a row is frozen or isn't available, +returns `NULL`. Always returns `NULL` for row identifier `SOURCE_OLD`. + +#### Synopsis + +```sql +bdr.trigger_get_committs(row_id text) +``` + +#### Parameters + +- `row_id` — Identifier of the row. Can be any of `SOURCE_NEW`, `SOURCE_OLD`, and + `TARGET`, depending on trigger type and operation (see description of + individual trigger types). + +### bdr.trigger_get_xid + +This function returns the local transaction id of a TARGET row specified by an +identifier. If not available because a row is frozen or isn't available, +returns `NULL`. Always returns `NULL` for `SOURCE_OLD` and `SOURCE_NEW` row +identifiers. + +Available only for conflict triggers. + +#### Synopsis + +```sql +bdr.trigger_get_xid(row_id text) +``` + +#### Parameters + +- `row_id` — Identifier of the row. Can be any of `SOURCE_NEW`, `SOURCE_OLD`, and + `TARGET`, depending on trigger type and operation (see description of + individual trigger types). + +### bdr.trigger_get_type + +This function returns the current trigger type, which can be `CONFLICT` +or `TRANSFORM`. Returns null if called outside a stream trigger. + +#### Synopsis + +```sql +bdr.trigger_get_type() +``` + +### bdr.trigger_get_conflict_type + +This function returns the current conflict type if called inside a conflict +trigger. Otherwise, returns `NULL`. + +See [Conflict types](consistency/conflicts#list-of-conflict-types) +for possible return values of this function. + +#### Synopsis + +```sql +bdr.trigger_get_conflict_type() +``` + +### bdr.trigger_get_origin_node_id + +This function returns the node id corresponding to the origin for the trigger +row_id passed in as argument. If the origin isn't valid (which means the row +originated locally), returns the node id of the source or target node, +depending on the trigger row argument. Always returns `NULL` for row identifier +`SOURCE_OLD`. You can use this function to define conflict triggers to always favor a +trusted source node. + +#### Synopsis + +```sql +bdr.trigger_get_origin_node_id(row_id text) +``` + +#### Parameters + +- `row_id` — Identifier of the row. Can be any of `SOURCE_NEW`, `SOURCE_OLD`, and + `TARGET`, depending on trigger type and operation (see description of + individual trigger types). + +### bdr.ri_fkey_on_del_trigger + +When called as a BEFORE trigger, this function uses FOREIGN KEY information +to avoid FK anomalies. + +#### Synopsis + +```sql +bdr.ri_fkey_on_del_trigger() +``` + +## Row contents + +The `SOURCE_NEW`, `SOURCE_OLD`, and `TARGET` contents depend on the operation, REPLICA +IDENTITY setting of a table, and the contents of the target table. + +The TARGET row is available only in conflict triggers. The TARGET row +contains data only if a row was found when applying `UPDATE` or `DELETE` in the target +table. If the row isn't found, the TARGET is `NULL`. + +## Triggers notes + +Execution order for triggers: + +- Transform triggers — Execute once for each incoming row on the target. +- Normal triggers — Execute once per row. +- Conflict triggers — Execute once per row where a conflict exists. + +## Stream triggers manipulation interfaces + +You can create stream triggers only on tables with `REPLICA IDENTITY FULL` +or tables without any columns to which `TOAST` applies. + +### bdr.create_conflict_trigger + +This function creates a new conflict trigger. 
+ +#### Synopsis + +```sql +bdr.create_conflict_trigger(trigger_name text, + events text[], + relation regclass, + function regprocedure, + args text[] DEFAULT '{}') +``` + +#### Parameters + +- `trigger_name` — Name of the new trigger. +- `events` — Array of events on which to fire this trigger. Valid values are + '`INSERT`', '`UPDATE`', and '`DELETE`'. +- `relation` — Relation to fire this trigger for. +- `function` — The function to execute. +- `args` — Optional. Specifies the array of parameters the trigger function + receives on execution (contents of `TG_ARGV` variable). + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This +means that the replication is affected by the +[ddl filters](repsets#ddl-replication-filtering) configuration. + +The function takes a global DML lock on the relation on which the trigger +is being created. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. + +Similar to normal PostgreSQL triggers, the `bdr.create_conflict_trigger` +function requires `TRIGGER` privilege on the `relation` and `EXECUTE` +privilege on the function. This applies with a +`bdr.backwards_compatibility` of 30619 or above. Additional +security rules apply in BDR to all triggers including conflict +triggers. See [Security and roles](security#triggers). + +### bdr.create_transform_trigger + +This function creates a transform trigger. + +#### Synopsis + +```sql +bdr.create_transform_trigger(trigger_name text, + events text[], + relation regclass, + function regprocedure, + args text[] DEFAULT '{}') +``` + +#### Parameters + +- `trigger_name` — Name of the new trigger. +- `events` — Array of events on which to fire this trigger. Valid values are + '`INSERT`', '`UPDATE`', and '`DELETE`'. +- `relation` — Relation to fire this trigger for. +- `function` — The function to execute. +- `args` — Optional. Specify array of parameters the trigger function + receives on execution (contents of `TG_ARGV` variable). + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This +means that the replication is affected by the +[ddl filters](repsets#ddl-replication-filtering) configuration. + +The function takes a global DML lock on the relation on which the trigger +is being created. + +This function is transactional. You can roll back the effects with the +`ROLLBACK` of the transaction. The changes are visible to the current +transaction. + +Similarly to normal PostgreSQL triggers, the `bdr.create_transform_trigger` +function requires the `TRIGGER` privilege on the `relation` and `EXECUTE` +privilege on the function. Additional security rules apply in BDR to all +triggers including transform triggers. See +[Security and roles](security#triggers). + +### bdr.drop_trigger + +This function removes an existing stream trigger (both conflict and transform). + +#### Synopsis + +```sql +bdr.drop_trigger(trigger_name text, + relation regclass, + ifexists boolean DEFAULT false) +``` + +#### Parameters + +- `trigger_name` — Name of an existing trigger. +- `relation` — The relation the trigger is defined for. +- `ifexists` — When set to `true`, this function ignores missing + triggers. + +#### Notes + +This function uses the same replication mechanism as `DDL` statements. This +means that the replication is affected by the +[ddl filters](repsets#ddl-replication-filtering) configuration. 
+
+The function takes a global DML lock on the relation on which the trigger
+is defined.
+
+This function is transactional. You can roll back the effects with the
+`ROLLBACK` of the transaction. The changes are visible to the current
+transaction.
+
+Only the owner of the `relation` can execute the `bdr.drop_trigger` function.
+
+## Stream triggers examples
+
+A conflict trigger that provides similar behavior to the `update_if_newer`
+conflict resolver:
+
+```sql
+CREATE OR REPLACE FUNCTION update_if_newer_trig_func()
+RETURNS TRIGGER
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    SOURCE_NEW record;
+    TARGET record;
+BEGIN
+    SOURCE_NEW := bdr.trigger_get_row('SOURCE_NEW');
+    TARGET := bdr.trigger_get_row('TARGET');
+
+    IF (bdr.trigger_get_committs('TARGET') >
+        bdr.trigger_get_committs('SOURCE_NEW')) THEN
+        RETURN TARGET;
+    ELSE
+        RETURN SOURCE_NEW;
+    END IF;
+END;
+$$;
+```
+
+A conflict trigger that applies a delta change on a counter column and uses
+SOURCE_NEW for all other columns:
+
+```sql
+CREATE OR REPLACE FUNCTION delta_count_trg_func()
+RETURNS TRIGGER
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    DELTA bigint;
+    SOURCE_OLD record;
+    SOURCE_NEW record;
+    TARGET record;
+BEGIN
+    SOURCE_OLD := bdr.trigger_get_row('SOURCE_OLD');
+    SOURCE_NEW := bdr.trigger_get_row('SOURCE_NEW');
+    TARGET := bdr.trigger_get_row('TARGET');
+
+    DELTA := SOURCE_NEW.counter - SOURCE_OLD.counter;
+    SOURCE_NEW.counter := TARGET.counter + DELTA;
+
+    RETURN SOURCE_NEW;
+END;
+$$;
+```
+
+A transform trigger that logs all changes to a log table instead of applying them:
+
+```sql
+CREATE OR REPLACE FUNCTION log_change()
+RETURNS TRIGGER
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    SOURCE_NEW record;
+    SOURCE_OLD record;
+    COMMITTS timestamptz;
+BEGIN
+    SOURCE_NEW := bdr.trigger_get_row('SOURCE_NEW');
+    SOURCE_OLD := bdr.trigger_get_row('SOURCE_OLD');
+    COMMITTS := bdr.trigger_get_committs('SOURCE_NEW');
+
+    IF (TG_OP = 'INSERT') THEN
+        INSERT INTO log SELECT 'I', COMMITTS, row_to_json(SOURCE_NEW);
+    ELSIF (TG_OP = 'UPDATE') THEN
+        INSERT INTO log SELECT 'U', COMMITTS, row_to_json(SOURCE_NEW);
+    ELSIF (TG_OP = 'DELETE') THEN
+        INSERT INTO log SELECT 'D', COMMITTS, row_to_json(SOURCE_OLD);
+    END IF;
+
+    RETURN NULL; -- do not apply the change
+END;
+$$;
+```
+
+This example shows a conflict trigger that implements trusted source
+conflict detection, also known as trusted site, preferred node, or Always Wins
+resolution. This uses the `bdr.trigger_get_origin_node_id()` function to provide
+a solution that works with three or more nodes. 
+
+```sql
+CREATE OR REPLACE FUNCTION test_conflict_trigger()
+RETURNS TRIGGER
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    SOURCE record;
+    TARGET record;
+
+    TRUSTED_NODE bigint;
+    SOURCE_NODE bigint;
+    TARGET_NODE bigint;
+BEGIN
+    TARGET := bdr.trigger_get_row('TARGET');
+    IF (TG_OP = 'DELETE') THEN
+        SOURCE := bdr.trigger_get_row('SOURCE_OLD');
+    ELSE
+        SOURCE := bdr.trigger_get_row('SOURCE_NEW');
+    END IF;
+
+    TRUSTED_NODE := current_setting('customer.trusted_node_id');
+
+    SOURCE_NODE := bdr.trigger_get_origin_node_id('SOURCE_NEW');
+    TARGET_NODE := bdr.trigger_get_origin_node_id('TARGET');
+
+    IF (TRUSTED_NODE = SOURCE_NODE) THEN
+        RETURN SOURCE;
+    ELSIF (TRUSTED_NODE = TARGET_NODE) THEN
+        RETURN TARGET;
+    ELSE
+        RETURN NULL; -- do not apply the change
+    END IF;
+END;
+$$;
+```
diff --git a/product_docs/docs/pgd/5/terminology.mdx b/product_docs/docs/pgd/5/terminology.mdx
new file mode 100644
index 00000000000..c568e92c73f
--- /dev/null
+++ b/product_docs/docs/pgd/5/terminology.mdx
@@ -0,0 +1,93 @@
+---
+title: Terminology
+---
+
+The terminology that follows is important for understanding EDB Postgres Distributed functionality and the requirements that it addresses in the realms of high availability, replication, and clustering.
+
+#### Asynchronous replication
+
+Copies data to cluster members after the transaction completes on the origin node. Asynchronous replication can provide higher performance and lower latency than synchronous replication. However, it introduces the potential for conflicts because of multiple concurrent changes. You must manage any conflicts that arise.
+
+#### Availability
+
+The probability that a system will operate satisfactorily at a given time when used in a stated environment. For many people, this is the overall amount of uptime versus downtime for an application. (See also **Nines**.)
+
+#### CAMO or commit-at-most-once
+
+Wraps Eager Replication with additional transaction management at the application level to guard against a transaction being executed more than once. This is critical for high-value transactions found in payments solutions. It is roughly equivalent to the Oracle feature Transaction Guard.
+
+#### Clustering
+
+An approach for high availability in which multiple redundant systems are managed to avoid single points of failure. It appears to the end user as one system.
+
+#### Data sharding
+
+Enables scaling out a database by breaking up data into chunks called *shards* and distributing them across separate nodes.
+
+#### Eager Replication for BDR
+
+Conflict-free replication with all cluster members; technically, this is synchronous logical replication using two-phase commit (2PC).
+
+#### Eventual consistency
+
+A distributed computing consistency model stating that changes to the same item in different cluster members will converge to the same value. With BDR this is achieved through asynchronous logical replication with conflict resolution and conflict-free replicated data types.
+
+#### Failover
+
+The automated process that recognizes a failure in a highly available database cluster and takes action to connect the application to another active database. The goal is to minimize downtime and data loss.
+
+#### Horizontal scaling or scale out
+
+A modern distributed computing approach that manages workloads across multiple nodes, such as scaling out a web server to handle increased traffic.
+
+#### Logical replication
+
+Provides more flexibility than physical replication in terms of selecting the data replicated between databases in a cluster. 
Also important is that cluster members can be on different versions of the database software.
+
+#### Nines
+
+A measure of availability expressed as a percentage of uptime in a given year. Three nines (99.9%) allows for 43.83 minutes of downtime per month. Four nines (99.99%) allows for 4.38 minutes of downtime per month. Five nines (99.999%) allows for 26.3 seconds of downtime per month.
+
+#### Node
+
+One database server in a cluster. The term "node" differs from the term "database server" because there is more than one node in a cluster. A node includes the database server, the OS, and the physical hardware, which is always separate from other nodes in a high-availability context.
+
+#### Physical replication
+
+Copies all changes from a database to one or more standby cluster members by transferring an exact copy of the database disk blocks. While fast, this method has downsides. For example, only one master node can run write transactions. Also, you can use this method only where all cluster members are on the same major version of the database software, in addition to several other more complex restrictions.
+
+#### Read scalability
+
+Can be achieved by introducing one or more read replica nodes to a cluster and having the application direct writes to the primary node and reads to the replica nodes. As the read workload grows, you can increase the number of read replica nodes to maintain performance.
+
+#### Recovery point objective (RPO)
+
+The maximum targeted period in which data might be lost due to a disruption in delivery of an application. A very low or minimal RPO is a driver for very high availability.
+
+#### Recovery time objective (RTO)
+
+The targeted length of time for restoring the disrupted application. A very low or minimal RTO is a driver for very high availability.
+
+#### Single point of failure (SPOF)
+
+The identification of a component in a deployed architecture that has no redundancy and therefore prevents you from achieving higher levels of availability.
+
+#### Switchover
+
+A planned change in connection between the application and the active database node in a cluster, typically done for maintenance.
+
+#### Synchronous replication
+
+When changes are updated at all participating nodes at the same time, typically leveraging two-phase commit. While this approach delivers immediate consistency and avoids conflicts, a performance cost in latency occurs due to the coordination required across nodes.
+
+#### Two-phase commit (2PC)
+
+A multi-step process for achieving consistency across multiple database nodes.
+
+#### Vertical scaling or scale up
+
+A traditional computing approach of increasing a resource (CPU, memory, storage, network) to support a given workload until the physical limits of that architecture are reached, for example, Oracle Exadata.
+
+#### Write scalability
+
+Occurs when replicating the writes from the original node to other cluster members becomes less expensive. In vertically scaled architectures, write scalability is possible due to shared resources. However, in horizontally scaled (or shared-nothing) architectures, this is possible only in very limited scenarios. 
diff --git a/product_docs/docs/pgd/5/tpa/index.mdx b/product_docs/docs/pgd/5/tpa/index.mdx new file mode 100644 index 00000000000..07a23c8e916 --- /dev/null +++ b/product_docs/docs/pgd/5/tpa/index.mdx @@ -0,0 +1,22 @@ +--- +title: Deploying with TPA +navTitle: Trusted Postgres Architect +indexCards: simple +navigation: +- using_tpa +- quick_start +redirects: + - ../deployments/tpaexec + - ../deployments/tpaexec/installing_tpaexec + - ../deployments/using_tpa/ +--- + +The standard way of deploying EDB Postgres Distributed in a self managed setting, +including physical and virtual machines, both self-hosted and in the cloud +(EC2), is to use EDB's deployment tool: [Trusted Postgres Architect](/tpa/latest/) (TPA). + +## Getting started + +For details on installing TPA, see: [TPA installation](/tpa/latest/INSTALL/). + + diff --git a/product_docs/docs/pgd/5/tpa/quick_start.mdx b/product_docs/docs/pgd/5/tpa/quick_start.mdx new file mode 100644 index 00000000000..654f9f19781 --- /dev/null +++ b/product_docs/docs/pgd/5/tpa/quick_start.mdx @@ -0,0 +1,46 @@ +--- +title: "Example: Deploying EDB Postgres Distributed" +navTitle: "Example: Deploying EDB Postgres Distributed" +description: > + A quick demonstration of deploying a PGD architecture using TPA on Amazon EC2 +redirects: + - /pgd/latest/deployments/tpaexec/quick_start/ +--- + + +The following steps setup EDB Postgres Distributed with an Always On Single Location +architecture using Amazon EC2. + +1. Generate a configuration file: + + ```shell + tpaexec configure myedbdpcluster --architecture PGD-Always-ON --platform aws --location-names eu-west-1 --data-nodes-per-location 3 + ``` + + This creates a subdirectory directory in current working directory called `myedbdpcluster` containing the `config.yml` configuration file TPA uses to create the cluster. Edit the `config.yml` as needed, for example to change the IP address range used for servers or adjust locations of nodes. + + We included options to specify using AWS, a single location, and three data nodes. By default, PGD will also configure two [PGD Proxy](../routing/proxy/) nodes and a Barman node for backup. + +1. Provision the cluster: + + ```shell + tpaexec provision myedbdpcluster + ``` + Since we specified AWS as the platform (the default platform), TPA provisions EC2 instances, VPCs, subnets, routing tables, internet gateways, security groups, EBS volumes, elastic IPs, and so on. + +1. Deploy the cluster: + ```shell + tpaexec deploy myedbdpcluster + ``` + TPA installs the needed packages, applies the configuration and sets up the actual EDB Postgres Distributed cluster + +1. Test the cluster: + + After the successful run of the `deploy` command the cluster is ready to use. You can connect to it via `psql` or any other database client. + + It's also possible to run a test that ensures the cluster is running as expected: + ```shell + tpaexec test myedbdpcluster + ``` + + diff --git a/product_docs/docs/pgd/5/tpa/using_tpa.mdx b/product_docs/docs/pgd/5/tpa/using_tpa.mdx new file mode 100644 index 00000000000..8cfea836bdb --- /dev/null +++ b/product_docs/docs/pgd/5/tpa/using_tpa.mdx @@ -0,0 +1,151 @@ +--- +title: Using TPA +description: > + Detailed reference and examples for using TPA to configure and deploy PGD +redirects: + - /pgd/latest/deployments/tpaexec/using_tpaexec/ +--- + +With TPA you configure, provision, and deploy your EDB Postgres Distributed clusters. 
+
+## Configure
+The `tpaexec configure` command generates a simple YAML configuration file to describe a cluster, based on the options you select. The configuration is ready for immediate use and you can modify it to better suit your needs. Editing the configuration file is the usual way to make any configuration changes to your cluster both before and after it's created.
+
+The syntax is:
+
+```
+tpaexec configure <cluster-dir> --architecture <architecture-name> [options]
+```
+
+The available configuration options include:
+
+| Flags | Description |
+| ------------------ | ----------- |
+| `--architecture` | Required. Set to `PGD-Always-ON` for EDB Postgres Distributed deployments. |
+| `--location-names l1 l2 l3` | Required. Specifies the number and name of the locations PGD will be deployed to. |
+| `--data-nodes-per-location N` | Specifies the number of data nodes per location. The default is 3. |
+| `--add-witness-node-per-location` | For an even number of data nodes per location, adds a witness node to allow for local consensus. This is enabled by default for locations with two data nodes. |
+| `--cohost-proxies` | Whether to colocate PGD Proxy instances on the data nodes. By default, proxies are installed on separate hosts. |
+| `--active-locations l2 l3` | Which locations should have local connection routing configured. By default, global routing is configured. |
+| `--add-witness-only-location loc` | Designates one of the cluster locations as witness only (no data nodes are present in that location). |
+| `--enable-camo` | Sets up a CAMO pair in each location. This works only with two data nodes per location. |
+
+More configuration options are listed in the TPA documentation for [PGD-Always-ON](/tpa/latest/architecture-PGD-Always-ON/).
+
+For example:
+
+```
+[tpa]$ tpaexec configure ~/clusters/speedy \
+         --architecture PGD-Always-ON \
+         --platform aws \
+         --location-names eu-west-1 eu-north-1 eu-central-1 \
+         --data-nodes-per-location 3
+```
+
+The first argument must be the cluster directory, for example, `speedy` or `~/clusters/speedy` (the cluster is named `speedy` in both cases). We recommend that you keep all your clusters in a common directory, for example, `~/clusters`. The next argument must be `--architecture` to select an architecture, followed by options.
+
+The command creates a directory named `~/clusters/speedy` and generates a configuration file named `config.yml` that follows the layout of the PGD-Always-ON architecture and gold layout. You can use the `tpaexec info` command to see what values are supported for the configuration options based on what you specified when running the configure command.
+
+### Common configuration options
+
+Other configuration options include:
+
+#### Owner
+Every cluster must be directly traceable to a person responsible for the provisioned resources.
+
+By default, a cluster is tagged as being owned by the login name of the user running `tpaexec provision`. If this name doesn't identify a person (for example, `postgres`, `ec2-user`), you must specify `--owner SomeId` to set an identifiable owner.
+
+You may use your initials, or "Firstname Lastname", or anything else that identifies you uniquely.
+
+#### Platform options
+The default value for `--platform` is `aws`. It is the platform supported by the PGD-Always-ON architecture.
+
+Specify `--region` to use any existing AWS region that you have access to (and that permits the required number of instances to be created). The default region is eu-west-1.
+
+Specify `--instance-type` with any valid instance type for AWS. The default is t3.micro. 
+ +### Subnet selection +By default, each cluster is assigned a random /28 subnet under 10.33/16, but depending on the architecture, there may be one or more subnets, and each subnet may be anywhere between a /24 and a /29. + +Specify `--subnet` to use a particular subnet. For example, `--subnet 192.0.2.128/27`. + +Alternatively, specify `--subnet-pattern` to generate random subnets (as many as required by the architecture) matching the given pattern. For example, `--subnet-pattern 192.0.x.x`. + +### Disk space +Specify `--root-volume-size` to set the size of the root volume in GB. For example, `--root-volume-size 64`. The default is 16GB. (Depending on the image used to create instances, there may be a minimum size for the root volume.) + +For architectures that support separate postgres and barman volumes: + +Specify `--postgres-volume-size` to set the size of the Postgres volume in GB. The default is 16GB. + +Specify `--barman-volume-size` to set the size of the Barman volume in GB. The default is 32GB. + +### Distribution +Specify `--os` or `--distribution` to specify the OS to be used on the cluster's instances. The value is case-sensitive. + +The selected platform determines which distributions are available and which one is used by default. For more details, see `tpaexec info platforms/`. + +In general, you can use "Debian", "RedHat", and "Ubuntu" to select TPA images that have Postgres and other software preinstalled (to reduce deployment times). To use stock distribution images instead, append "-minimal" to the value, for example, `--distribution Debian-minimal`. + +### 2ndQuadrant repositories +By default, TPA installs the 2ndQuadrant public repository and adds on any product repositories that the architecture requires. + +Specify `--2Q-repositories source/name/release … ` to specify the complete list of 2ndQuadrant repositories to install on each instance in addition to the 2ndQuadrant public repository. + +If you do this, you must first export TPA_2Q_SUBSCRIPTION_TOKEN=xxx before you run tpaexec. You can get a subscription token from the [EnterpriseDB customer portal](https://techsupport.enterprisedb.com/customer_portal/) (**Support > Software subscriptions > Add**). + +### Software versions +By default TPA uses the latest major version of Postgres. Specify `--postgres-version` to install an earlier supported major version. + +By default, TPA always installs the latest version of every package. This is usually the desired behavior, but in some testing scenarios, it may be necessary to select specific package versions. For example, + +``` +--postgres-package-version 10.4-2.pgdg90+1 +--repmgr-package-version 4.0.5-1.pgdg90+1 +--barman-package-version 2.4-1.pgdg90+1 +--pglogical-package-version '2.2.0*' +--bdr-package-version '3.0.2*' +--pgbouncer-package-version '1.8*' +``` + +Specify `--extra-packages` or `--extra-postgres-packages` to install additional packages. The former lists packages to install along with system packages, while the latter lists packages to install later along with postgres packages. (If you mention packages that depend on Postgres in the former list, the installation fails because Postgres is not yet installed.) The arguments are passed on to the package manager for installation without any modifications. + +The `--extra-optional-packages` option behaves like `--extra-packages`, but it is not an error if the named packages cannot be installed. 
+ +### Hostnames +By default, `tpaexec configure` randomly selects as many hostnames as it needs from a pre-approved list of several dozen names. This should be enough for most clusters. + +Specify `--hostnames-from` to select names from a different list (for example, if you need more names than are available in the canned list). The file must contain one hostname per line. + +Specify `--hostnames-pattern` to restrict hostnames to those matching the egrep-syntax pattern. If you choose to do this, you must ensure that the pattern matches only valid hostnames ([a-zA-Z0-9-]) and finds a sufficient number thereof. + +### Locations +By default, `tpaexec configure` uses the names first, second, and so on for any locations used by the selected architecture. + +Specify `--location-names` to provide more meaningful names for each location. + + +## Provision +The `tpaexec provision` command creates instances and other resources required by the cluster. The details of the process depend on the architecture (for example, PGD-Always-ON) and platform (for example, AWS) that you selected while configuring the cluster. + +For example, given AWS access with the necessary privileges, TPA provisions EC2 instances, VPCs, subnets, routing tables, internet gateways, security groups, EBS volumes, elastic IPs, and so on. + +You can also "provision" existing servers by selecting the "bare" platform and providing connection details. Whether these are bare metal servers or those provisioned separately on a cloud platform, they can be used just as if they had been created by TPA. + +You are not restricted to a single platform—you can spread your cluster out across some AWS instances (in multiple regions) and some on-premise servers, or servers in other data centres, as needed. + +At the end of the provisioning stage, you will have the required number of instances with the basic operating system installed, which TPA can access via SSH (with sudo to root). + +## Deploy +The `tpaexec deploy` command installs and configures Postgres and other software on the provisioned servers (which may or may not have been created by TPA; but it doesn't matter who created them so long as SSH and sudo access is available). This includes setting up replication, backups, and so on. + +At the end of the deployment stage, EDB Postgres Distributed is up and running. + +## Test +The `tpaexec test` command executes various architecture and platform-specific tests against the deployed cluster to ensure that it is working as expected. + +At the end of the testing stage, you will have a fully-functioning cluster. + +For more information, see [Trusted Postgres Architect](/tpa/latest/). + + diff --git a/product_docs/docs/pgd/5/transaction-streaming.mdx b/product_docs/docs/pgd/5/transaction-streaming.mdx new file mode 100644 index 00000000000..bfeb72c60e1 --- /dev/null +++ b/product_docs/docs/pgd/5/transaction-streaming.mdx @@ -0,0 +1,161 @@ +--- +navTitle: Transaction streaming +title: Transaction streaming +redirects: + - bdr/transaction_streaming + +--- + +With logical replication, transactions are decoded concurrently on the publisher +but aren't sent to subscribers until the transaction is committed. If the +changes exceed `logical_decoding_work_mem` (PostgreSQL 13 and later), they're +spilled to disk. This means that, particularly with large transactions, there's +some delay before they reach subscribers and might entail additional I/O +on the publisher. 
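+
+As a small, hedged illustration (the value is only an example), you can raise
+that threshold so fewer large transactions spill to disk on the publisher:
+
+```sql
+ALTER SYSTEM SET logical_decoding_work_mem = '256MB';
+SELECT pg_reload_conf();
+```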
+ +Beginning with PostgreSQL 14, transactions can optionally be decoded and sent to +subscribers before they're committed on the publisher. The subscribers save +the incoming changes to a staging file (or set of files) and apply them when +the transaction commits (or discard them if the transaction aborts). This makes +it possible to apply transactions on subscribers as soon as the transaction +commits. + +## BDR enhancements + +PostgreSQL's built-in transaction streaming has the following +limitations: + +- While you no longer need to spill changes to disk on the publisher, you must write changes + to disk on each subscriber. +- If the transaction aborts, the work (changes received by each subscriber + and the associated storage I/O) is wasted. + +However, starting with version 3.7, BDR supports parallel apply, enabling multiple writer +processes on each subscriber. This capability is leveraged to provide the following enhancements: + +- Decoded transactions can be streamed directly to a writer on the subscriber. +- Decoded transactions don't need to be stored on disk on subscribers. +- You don't need to wait for the transaction to commit before starting to apply the + transaction on the subscriber. + +### Caveats + +- You must enable parallel apply. +- Workloads consisting of many small and conflicting transactions can lead to + frequent deadlocks between writers. + +!!! Note + Direct streaming to writer is still an experimental feature. Use it + with caution. Specifically, it might not work well with + conflict resolutions since the commit timestamp of the streaming might not + be available. (The transaction might not yet have committed on the + origin.) + +## Configuration + +Configure transaction streaming in two locations: + +- At node level, using the GUC [bdr.default_streaming_mode](configuration#transaction-streaming) +- At group level, using the function [bdr.alter_node_group_config()](nodes#bdralter_node_group_config) + +### Node configuration using bdr.default_streaming_mode + +Permitted values are: + +- `off` +- `writer` +- `file` +- `auto` + +Default value is `auto`. + +Changing this setting requires a restart of the +pglogical receiver process for each subscription for the setting to take effect. You can achieve this with a server +restart. + +If `bdr.default_streaming_mode` is set any value other than `off`, the +subscriber requests transaction streaming from the publisher. How this is +provided can also depend on the group configuration setting. See +[Node configuration using bdr.default_streaming_mode](#node-configuration-using-bdrdefault_streaming_mode) for details. + +### Group configuration using bdr.alter_node_group_config() + +You can use the parameter `streaming_mode` in the function [bdr.alter_node_group_config()](nodes#bdralter_node_group_config) +to set the group transaction streaming configuration. + +Permitted values are: + +- `off` +- `writer` +- `file` +- `auto` +- `default` + +The default value is `default`. + +The value of the current setting is contained in the column `node_group_streaming_mode` +from the view [bdr.node_group](catalogs#bdrnode_group). The value returned is +a single char type, and the possible values are `D` (`default`), `W` (`writer`), +`F` (`file`), `A` (`auto`), and `O` (`off`). + +### Configuration setting effects + +Transaction streaming is controlled at the subscriber level by the GUC +`bdr.default_streaming_mode`. Unless set to `off` (which disables transaction +streaming), the subscriber requests transaction streaming. 
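+
+As a hedged example of the two configuration levels described earlier (the
+group name `mygroup` is only an illustration):
+
+```sql
+-- Group level: set the streaming mode for an existing node group.
+SELECT bdr.alter_node_group_config('mygroup', streaming_mode := 'auto');
+
+-- Check the current group-level value (returned as a single character).
+SELECT node_group_streaming_mode FROM bdr.node_group;
+```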
+ +If the publisher can provide transaction streaming, it +streams transactions whenever the transaction size exceeds the threshold set in +`logical_decoding_work_mem`. The publisher usually has no control over whether +the transactions is streamed to a file or to a writer. Except for some +situations (such as COPY), it might hint for the subscriber to stream the +transaction to a writer (if possible). + +The subscriber can stream transactions received from the publisher to +either a writer or a file. The decision is based on several factors: + +- If parallel apply is off (`num_writers = 1`), then it's streamed to a file. + (writer 0 is always reserved for non-streamed transactions.) +- If parallel apply is on but all writers are already busy handling streamed + transactions, then the new transaction is streamed to a file. See + [bdr.writers](../monitoring#monitoring-bdr-writers) to check BDR + writer status. + +If streaming to a writer is possible (that is, a free writer is available), then the +decision whether to stream the transaction to a writer or a file is based on +the combination of group and node settings as per the following table: + +| Group | Node | Streamed to | +| ------- | ------ | ----------- | +| off | (any) | (none) | +| (any) | off | (none) | +| writer | file | file | +| file | writer | file | +| default | writer | writer | +| default | file | file | +| default | auto | writer | +| auto | (any) | writer | + +If the group configuration is set to `auto`, or the group +configuration is `default` and the node configuration is `auto`, +then the transaction is streamed to a writer only if the +publisher hinted to do this. + +Currently the publisher hints for the subscriber to stream to the writer +for the following transaction types. These are known to be conflict free +and can be safely handled by the writer. + +- `COPY` +- `CREATE INDEX CONCURRENTLY` + +## Monitoring + +You can monitor the use of transaction streaming using the [bdr.stat_subscription](catalogs#bdrstat_subscription) +function on the subscriber node. + +- `nstream_writer` — Number of transactions streamed to a writer. +- `nstream_file` — Number of transactions streamed to file. +- `nstream_commit` — Number of committed streamed transactions. +- `nstream_abort` — Number of aborted streamed transactions. +- `nstream_start` — Number of streamed transactions that were started. +- `nstream_stop` — Number of streamed transactions that were fully received. diff --git a/product_docs/docs/pgd/5/tssnapshots.mdx b/product_docs/docs/pgd/5/tssnapshots.mdx new file mode 100644 index 00000000000..d1e9a7dd4ac --- /dev/null +++ b/product_docs/docs/pgd/5/tssnapshots.mdx @@ -0,0 +1,61 @@ +--- +title: Timestamp-based snapshots +redirects: + - bdr/tssnapshots + +--- + +The timestamp-based snapshots allow reading data in a consistent manner by using +a user-specified timestamp rather than the usual MVCC snapshot. You can use this +to access data on different BDR nodes at a common point in time. For +example, you can use this as a way to compare data on multiple nodes for data-quality checking. + +This feature doesn't currently work with write transactions. + +Enable the use of timestamp-based snapshots using the `snapshot_timestamp` +parameter. This parameter accepts either a timestamp value or +a special value, `'current'`, which represents the current timestamp (now). If +`snapshot_timestamp` is set, queries use that timestamp to determine +visibility of rows rather than the usual MVCC semantics. 
+ +For example, the following query returns the state of the `customers` table at +2018-12-08 02:28:30 GMT: + +```sql +SET snapshot_timestamp = '2018-12-08 02:28:30 GMT'; +SELECT count(*) FROM customers; +``` + +Without BDR, this works only with future timestamps or the +special 'current' value, so you can't use it for historical queries. + +BDR works with and improves on that feature in a multi-node environment. First, +BDR makes sure that all connections to other nodes replicate any +outstanding data that was added to the database before the specified +timestamp. This ensures that the timestamp-based snapshot is consistent across the whole +multi-master group. Second, BDR adds a parameter called +`bdr.timestamp_snapshot_keep`. This specifies a window of time when you can execute +queries against the recent history on that node. + +You can specify any interval, but be aware that VACUUM (including autovacuum) +doesn't clean dead rows that are newer than up to twice the specified +interval. This also means that transaction ids aren't freed for the same +amount of time. As a result, using this can leave more bloat in user tables. +Initially, we recommend 10 seconds as a typical setting, although you can change that as needed. + +Once the query is accepted for execution, the query might run +for longer than `bdr.timestamp_snapshot_keep` without problem, just as normal. + +Also, information about how far the snapshots were kept doesn't +survive server restart. The oldest usable timestamp for the timestamp-based +snapshot is the time of last restart of the PostgreSQL instance. + +You can combine the use of `bdr.timestamp_snapshot_keep` with the +`postgres_fdw` extension to get a consistent read across multiple nodes in a +BDR group. You can use this to run parallel queries across nodes, when used with foreign tables. + +There are no limits on the number of nodes in a multi-node query when using this +feature. + +Use of timestamp-based snapshots doesn't increase inter-node traffic or +bandwidth. Only the timestamp value is passed in addition to query data. diff --git a/product_docs/docs/pgd/5/twophase.mdx b/product_docs/docs/pgd/5/twophase.mdx new file mode 100644 index 00000000000..b1061b4bae1 --- /dev/null +++ b/product_docs/docs/pgd/5/twophase.mdx @@ -0,0 +1,67 @@ +--- +navTitle: Two-phase commit +title: Explicit two-phase commit (2PC) +redirects: + - bdr/twophase + +--- + +An application can explicitly opt to use two-phase commit with BDR. See +[Distributed Transaction Processing: The XA Specification](http://pubs.opengroup.org/onlinepubs/009680699/toc.pdf). + +The X/Open Distributed Transaction Processing (DTP) model envisions three +software components: + +- An application program (AP) that defines transaction boundaries and specifies + actions that constitute a transaction +- Resource managers (RMs, such as databases or file-access systems) that provide + access to shared resources +- A separate component called a transaction manager (TM) that assigns identifiers + to transactions, monitors their progress, and takes responsibility for + transaction completion and for failure recovery + +BDR supports explicit external 2PC using the `PREPARE TRANSACTION` and +`COMMIT PREPARED`/`ROLLBACK PREPARED` commands. Externally, a EDB Postgres Distributed cluster +appears to be a single resource manager to the transaction manager for a +single session. + +When `bdr.commit_scope` is `local`, the transaction is prepared only +on the local node. 
Once committed, changes are replicated, and +BDR then applies post-commit conflict resolution. + +Using `bdr.commit_scope` set to `local` might not seem to make sense with +explicit two-phase commit, but the option is offered to allow you +to control the tradeoff between transaction latency and robustness. + +Explicit two-phase commit doesn't work with either CAMO +or the global commit scope. Future releases might enable this combination. + +## Use + +Two-phase commits with a local commit scope work exactly like standard +PostgreSQL. Use the local commit scope and disable CAMO. + +```sql +BEGIN; + +SET LOCAL bdr.enable_camo = 'off'; +SET LOCAL bdr.commit_scope = 'local'; + +... other commands possible... +``` + +To start the first phase of the commit, the client must assign a +global transaction id, which can be any unique string identifying the +transaction: + +```sql +PREPARE TRANSACTION 'some-global-id'; +``` + +After a successful first phase, all nodes have applied the changes and +are prepared for committing the transaction. The client must then invoke +the second phase from the same node: + +```sql +COMMIT PREPARED 'some-global-id'; +``` diff --git a/product_docs/docs/pgd/5/upgrades/app-upgrades.mdx b/product_docs/docs/pgd/5/upgrades/app-upgrades.mdx new file mode 100644 index 00000000000..f964fdbd73f --- /dev/null +++ b/product_docs/docs/pgd/5/upgrades/app-upgrades.mdx @@ -0,0 +1,92 @@ +--- +title: "Application Schema Upgrades" +--- + +Similar to the upgrade of EDB Postgres Distributed itself, there are two +approaches to upgrading the application schema. The simpler option is to +stop all applications affected, preform the schema upgrade, and restart the +application upgraded to use the new schema variant. Again, this +imposes some downtime. + +To eliminate this downtime, EDB Postgres Distributed offers useful tools to +perform a rolling application schema upgrade. + +This section describes some of the recommendations and tips that make the +application schema upgrade less impactful for the cluster. + +### Rolling Application Schema Upgrades + +By default, DDL will automatically be sent to all nodes. This can be +controlled manually, as described in +[DDL Replication](../ddl/), which +could be used to create differences between database schemas across nodes. +BDR is designed to allow replication to continue even while minor +differences exist between nodes. These features are designed to allow +application schema migration without downtime, or to allow logical +standby nodes for reporting or testing. + +!!! Warning + Rolling Application Schema Upgrades have to be managed outside of BDR. + Careful scripting is required to make this work correctly + on production clusters. Extensive testing is advised. + +See [Replicating between nodes with differences](../appusage/) for details. + +When one node runs DDL that adds a new table, nodes that have not +yet received the latest DDL need to handle the extra table. +In view of this, the appropriate setting for rolling schema upgrades +is to configure all nodes to apply the `skip` resolver in case of a +`target_table_missing` conflict. This must be performed before any +node has additional tables added and is intended to be a permanent +setting. 
+ +This is done with the following query, that must be **executed +separately on each node**, after replacing `node1` with the actual +node name: + +```sql +SELECT bdr.alter_node_set_conflict_resolver('node1', + 'target_table_missing', 'skip'); +``` + +When one node runs DDL that adds a column to a table, nodes that have not +yet received the latest DDL need to handle the extra columns. +In view of this, the appropriate setting for rolling schema +upgrades is to configure all nodes to apply the `ignore` resolver in +case of a `target_column_missing` conflict. This must be performed +before one node has additional columns added and is intended to be a +permanent setting. + +This is done with the following query, that must be **executed +separately on each node**, after replacing `node1` with the actual +node name: + +```sql +SELECT bdr.alter_node_set_conflict_resolver('node1', + 'target_column_missing', 'ignore'); +``` + +When one node runs DDL that removes a column from a table, nodes that +have not yet received the latest DDL need to handle the missing column. +This situation will cause a `source_column_missing` conflict, which uses +the `use_default_value` resolver. Thus, columns that neither +accept NULLs nor have a DEFAULT value require a two step process: + +1. Remove NOT NULL constraint or add a DEFAULT value for a column + on all nodes. +2. Remove the column. + +Constraints can be removed in a rolling manner. +There is currently no supported way for handling adding table +constraints in a rolling manner, one node at a time. + +When one node runs a DDL that changes the type of an existing column, +depending on the existence of binary coercibility between the current +type and the target type, the operation may not rewrite the underlying +table data. In that case, it will be only a metadata update of the +underlying column type. Rewrite of a table is normally restricted. +However, in controlled DBA environments, it is possible to change +the type of a column to an automatically castable one by adopting +a rolling upgrade for the type of this column in a non-replicated +environment on all the nodes, one by one. See [ALTER TABLE](../ddl/#alter-table) for more details. + section. diff --git a/product_docs/docs/pgd/5/upgrades/bdr_pg_upgrade.mdx b/product_docs/docs/pgd/5/upgrades/bdr_pg_upgrade.mdx new file mode 100644 index 00000000000..b75915c667f --- /dev/null +++ b/product_docs/docs/pgd/5/upgrades/bdr_pg_upgrade.mdx @@ -0,0 +1,161 @@ +--- +title: In-place Postgres Major Version Upgrades +--- + +Upgrading a BDR Node to a newer major version of Postgres is possible using the +command-line utility `bdr_pg_upgrade`. + +`bdr_pg_upgrade` internally uses the standard [`pg_upgrade`](https://www.postgresql.org/docs/current/pgupgrade.html) +with BDR specific logic to ensure a smooth upgrade. + +## Terminology + +Various terminology is used in this documentation to describe the upgrade process and components involved. + +*old cluster* - The existing Postgres cluster node to be upgraded, which data will be migrated from. + +*new cluster* - The new Postgres cluster, which data will be migrated to. This cluster node must be one (1) major version ahead of the old cluster. + +## Precautions + +Standard Postgres major version upgrade precautions apply, including the fact +that all the requirements for [`pg_upgrade`](https://www.postgresql.org/docs/current/pgupgrade.html#id-1.9.5.12.7) +must be met by both clusters. 
+ +Additionaly, `bdr_pg_upgrade` should not be used if there are other tools using +replication slots and replication origins, only BDR slots and origins will be +restored after the upgrade. + +There are several prerequisites for `bdr_pg_upgrade` that have to be met: + +- Applications using the old cluster have been disconnected, it can for example, + be redirected to another node in the cluster +- Peer authentication is configured for both clusters, `bdr_pg_upgrade` + requires peer authentication +- BDR versions on both clusters must be exactly the same and must be version + 4.1.0 or above +- The new cluster must be in a shutdown state +- BDR packages must be installed in the new cluster +- The new cluster must be already initialized and configured as needed to + match the old cluster configuration +- Databases, tables, and other objects must not exist in the new cluster + +It is also recommended to have the old cluster up prior to running `bdr_pg_upgrade` +as the CLI will start the old cluster if it is shutdown. + +## Usage + +To upgrade to a newer major version of Postgres, the new version must first +be installed. + +### bdr_pg_upgrade command-line + +`bdr_pg_upgrade` passes all parameters to `pg_upgrade`. Therefore, you can +specify any parameters supported by [`pg_upgrade`](https://www.postgresql.org/docs/current/pgupgrade.html#id-1.9.5.12.6). + +#### Synopsis + +```shell +bdr_pg_upgrade [OPTION] ... +``` + +#### Options + +In addition to the options for `pg_upgrade`, the following parameters are +can be passed to `bdr_pg_upgrade`: + +- `-b, --old-bindir` - old cluster bin directory (required) +- `-B, --new-bindir`- new cluster bin directory (required) +- `-d, --old-datadir` - old cluster data directory (required) +- `-D, --new-datadir` - `REQUIRED` new cluster data directory (required) +- `--database` - BDR database name (required) +- `-p, --old-port` - old cluster port number +- `-s, --socketdir` - directory to use for postmaster sockets during upgrade +- `--check`- only perform checks, do not modify clusters + + +#### Environment Variables + +Environment variables can be used in place of command line parameters. + +- `PGBINOLD` - old cluster bin directory +- `PGBINNEW` - new cluster bin directory +- `PGDATAOLD` - old cluster data directory +- `PGDATANEW` - new cluster data directory +- `PGPORTOLD` - old cluster port number +- `PGSOCKETDIR` - directory to use for postmaster sockets during upgrade + + +### Example + +Given a scenario where: + +- Old cluster bin directory is `/usr/lib/postgresql/13/bin` +- New cluster bin directory is `/usr/lib/postgresql/14/bin` +- Old cluster data directory is `/var/lib/postgresql/13/main` +- New cluster data directory is `/var/lib/postgresql/14/main` +- Database name is `bdrdb` + + +The following command could be used to upgrade the cluster: + +``` +bdr_pg_upgrade \ +--old-bindir /usr/lib/postgresql/13/bin \ +--new-bindir /usr/lib/postgresql/14/bin \ +--old-datadir /var/lib/postgresql/13/main \ +--new-datadir /var/lib/postgresql/14/main \ +--database bdrdb +``` + +### Steps Performed + +Steps performed when running `bdr_pg_upgrade`. + +!!! Note + When `--check` is supplied as an argument to `bdr_pg_upgrade`, the CLI + will `skip` steps that modify the database. 
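
For example, the checks can be exercised without modifying either cluster by re-running the earlier command with `--check`:

```shell
bdr_pg_upgrade --check \
--old-bindir /usr/lib/postgresql/13/bin \
--new-bindir /usr/lib/postgresql/14/bin \
--old-datadir /var/lib/postgresql/13/main \
--new-datadir /var/lib/postgresql/14/main \
--database bdrdb
```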
+ +#### BDR Postgres Checks + +| Steps | `--check` supplied | +| :-----------------------------------------------|:------------------:| +| Collecting pre-upgrade new cluster control data | `run` | +| Checking new cluster state is shutdown | `run` | +| Checking BDR versions | `run` | +| Starting old cluster (if shutdown) | `skip` | +| Connecting to old cluster | `skip` | +| Checking if bdr schema exists | `skip` | +| Turning DDL replication off | `skip` | +| Terminating connections to database. | `skip` | +| Disabling connections to database | `skip` | +| Waiting for all slots to be flushed | `skip` | +| Disconnecting from old cluster | `skip` | +| Stopping old cluster | `skip` | +| Starting old cluster with BDR disabled | `skip` | +| Connecting to old cluster | `skip` | +| Collecting replication origins | `skip` | +| Collecting replication slots | `skip` | +| Disconnecting from old cluster | `skip` | +| Stopping old cluster | `skip` | + +#### `pg_upgrade` Steps + +Standard `pg_upgrade` steps are performed + +!!! Note + `--check` is passed to pg_upgrade if supplied + +#### BDR Post-Upgrade Steps + +| Steps | `--check` supplied | +| :-----------------------------------------------|:------------------:| +| Collecting old cluster control data | `skip` | +| Collecting new cluster control data | `skip` | +| Advancing LSN of new cluster | `skip` | +| Starting new cluster with BDR disabled | `skip` | +| Connecting to new cluster | `skip` | +| Creating replication origin Repeated for each origin | `skip` | +| Advancing replication origin Repeated for each origin | `skip` | +| Creating replication slot Repeated for each slot | `skip` | +| Stopping new cluster | `skip` | diff --git a/product_docs/docs/pgd/5/upgrades/compatibility.mdx b/product_docs/docs/pgd/5/upgrades/compatibility.mdx new file mode 100644 index 00000000000..62d8e315cc4 --- /dev/null +++ b/product_docs/docs/pgd/5/upgrades/compatibility.mdx @@ -0,0 +1,71 @@ +--- +title: Compatibility changes +--- + +There are numerous changes in PGD 5 that are not backwards compatible with +PGD 4 or PGD 3.7. + +## Connection routing + +HARP Manager does not exist anymore, it's been replaced by new +[Connection management](../routing) configuration. + +HARP Proxy is replaced by similarly functioning PGD-Proxy which removes any +deprecated features and is configured through above mentioned connection +management configuration. + +## Commit At Most Once + +CAMO configuration is now done through [Commit Scopes](../durability/commit-scopes). The +`bdr.camo_pairs` catalog and any related manipulation functions don't exist +anymore. The `bdr.enable_camo` GUC was removed. +The `synchronous_replication_availability` GUC does not affect CAMO anymore. +Use the `DEGRADE ON ... TO ASYNC` clause of a commit scope. + + +## Eager All Node Replication + +There is no more `global` scope, however it's possible to create scope with same +behavior using [Group Commit](../durability/group-commit). + +```sql +SELECT bdr.add_commit_scope( + commit_scope_name := 'eager_scope', + origin_node_group := 'top_group', + rule := 'ALL (top_group) GROUP COMMIT (conflict_resolution = eager, commit_decision = raft) ABORT ON (timeout = 60s)', + wait_for_ready := true +); +``` + +The `bdr.global_commit_timeout` GUC was removed, use `ABORT ON` clause for the +commit scope. + +## Lag Control + +Similarly to CAMO and Eager, Lag Control configuration was also moved to +[Commit Scopes](../durability/commit-scopes) for more flexible durability configuration. 
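
A minimal sketch of such a commit scope, using the same `bdr.add_commit_scope()` call shown above; the group name is a placeholder, and the rule syntax and parameter names (for example `max_commit_delay` and `max_lag_time`) should be verified against the Commit Scopes documentation for your version:

```sql
SELECT bdr.add_commit_scope(
    commit_scope_name := 'lag_scope',
    origin_node_group := 'top_group',
    rule              := 'MAJORITY (top_group) LAG CONTROL (max_commit_delay = 500ms, max_lag_time = 30s)',
    wait_for_ready    := true
);
```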
+ +## Catalogs + +- `bdr.workers` does not show worker specific info like worker_commit_timestamp anymore +- `bdr.worker_errors` is deprecated and lost most of the info +- `bdr.state_journal_details` is deprecated and lost most of the info +- `bdr.event_summary` replaces the `bdr.worker_errors` and + `bdr.state_journal_details` with additional info like Raft role changes +- the table `bdr.node_catchup_info` now has user consumable view + `bdr.node_catchup_info_details` which shows info in more friendly way +- witness node is no longer distinguished by which replication sets + it replicates, but using `node_kind` value in `bdr.node_summary` +- all the Raft (consensus) related tables and functions were adjusted to support + multiple Raft instances (sub-group Raft) +- `bdr.node_pre_commit` view and the underlying table was removed as the + information is no longer stored in a table +- `bdr.commit_decisions` view was added and replaces the `bdr.node_pre_commit` one +- multiple internal autopatition tables were replaced by taskmgr ones as the + mechanism behind was generalized +- `bdr.network_monitoring` view was removed along with underlying tables and + functions +- many catalogs were added and some have new columns, as documented in the + [Catalogs](../catalogs) section of the documentation, these + are not breaking changes strictly speaking but we recommend to review them + when upgrading diff --git a/product_docs/docs/pgd/5/upgrades/index.mdx b/product_docs/docs/pgd/5/upgrades/index.mdx new file mode 100644 index 00000000000..e943c883e70 --- /dev/null +++ b/product_docs/docs/pgd/5/upgrades/index.mdx @@ -0,0 +1,202 @@ +--- +title: "Upgrading" +--- + +Because EDB Postgres Distributed consists in multiple software components, +the upgrade strategy depends partially on which components are being upgraded. + +In general it's possible to upgrade the cluster with almost zero downtime, by +using an approach called Rolling Upgrade where nodes are upgraded one by one, and +the application connections are switched over to already upgraded nodes. + +Ii's also possible to stop all nodes, perform the upgrade on all nodes and +only then restart the entire cluster, just like with a standard PostgreSQL setup. +This strategy of upgrading all nodes at the same time avoids running with +mixed versions of software and therefore is the simplest, but obviously incurs +some downtime and is not recommended unless the Rolling Upgrade is not possible +for some reason. + +To upgrade an EDB Postgres Distributed cluster, perform the following steps: + +1. Plan the upgrade. +2. Prepare for the upgrade. +3. Upgrade the server software. +4. Check and validate the upgrade. + +## Upgrade Planning + +There are broadly two ways to upgrade each node. + +* Upgrading nodes in-place to the newer software version, see [Rolling Server Software Upgrades](#rolling-server-software-upgrades). +* Replacing nodes with ones that have the newer version installed, see [Rolling Upgrade Using Node Join](#rolling-upgrade-using-node-join). + +Both of these approaches can be done in a rolling manner. + +### Rolling Upgrade considerations + +While the cluster is going through a rolling upgrade, mixed versions of software +are running in the cluster. For example, nodeA has BDR 3.7.16, while +nodeB and nodeC has 4.1.0. 
In this state, the replication and group +management uses the protocol and features from the oldest version (3.7.16 +in case of this example), so any new features provided by the newer version +which require changes in the protocol are disabled. Once all nodes are +upgraded to the same version, the new features are automatically enabled. + +Similarly, when a cluster with WAL decoder enabled nodes is going through a +rolling upgrade, WAL decoder on a higher version of BDR node produces LCRs +with a higher pglogical version and WAL decoder on a lower version of BDR node +produces LCRs with lower pglogical version. As a result, WAL senders on a higher +version of BDR nodes are not expected to use LCRs due to a mismatch in protocol +versions while on a lower version of BDR nodes, WAL senders may continue to use +LCRs. Once all the BDR nodes are on the same BDR version, WAL senders use +LCRs. + +A rolling upgrade starts with a cluster with all nodes at a prior release, +then proceeds by upgrading one node at a time to the newer release, until +all nodes are at the newer release. There should never be more than two versions +of any component running at the same time, which means the new upgrade must not +be initiated until the previous upgrade process has fully finished on all nodes. + +An upgrade process may take an extended period of time when the user decides +caution is required to reduce business risk, though it's not recommended +to run the mixed versions of the software indefinitely. + +While Rolling Upgrade can be used for upgrading major version of the software +it is not supported to mix PostgreSQL, EDB Postgres Extended and +EDB Postgres Advanced Server in one cluster, so this approach cannot +be used to change the Postgres variant. + +!!! Warning + Downgrades of the EDB Postgres Distributed are *not* supported and require + manual rebuild of the cluster. + +### Rolling Server Software Upgrades + +A rolling upgrade is the process where the [Server +Software Upgrade](#server-software-upgrade) process is performed on each node in the +cluster one after another, while keeping the remainder of the cluster +operational. + +The actual procedure depends on whether the Postgres component is being +upgraded to a new major version or not. + +During the upgrade process, the application can be switched over to a node +which is currently not being upgraded to provide continuous availability of +the database for applications. + +### Rolling Upgrade Using Node Join + +The other method of upgrade of the server software, is to join a new node +to the cluster and later drop one of the existing nodes running +the older version of the software. + +For this approach, the procedure is always the same, however because it +includes node join, the potentially large data transfer is required. + +Care must be taken to not use features that are available only in +the newer Postgres versions, until all nodes are upgraded to the +newer and same release of Postgres. This is especially true for any +new DDL syntax that may have been added to a newer release of Postgres. + +!!! Note + `bdr_init_physical` makes a byte-by-byte of the source node + so it cannot be used while upgrading from one major Postgres version + to another. In fact, currently `bdr_init_physical` requires that even the + BDR version of the source and the joining node is exactly the same. + It cannot be used for rolling upgrades via joining a new node method. Instead, a logical join must be used. 
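
For reference, a logical join of a replacement node follows the usual node-addition flow. The node names, group name, and connection strings below are placeholders; see the node management documentation for the complete procedure:

```sql
-- Run on the new node once the newer software is installed
SELECT bdr.create_node(
    node_name := 'node-d',
    local_dsn := 'host=node-d dbname=bdrdb'
);

-- Join an existing group through one of the current members
SELECT bdr.join_node_group(
    join_target_dsn := 'host=node-a dbname=bdrdb',
    node_group_name := 'top_group'
);
```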
+ +### Upgrading a CAMO-Enabled Cluster + +Upgrading CAMO-Enabled Cluster requires upgrading CAMO groups one by one, while +disabling the CAMO protection for the group being upgraded and reconfiguring it +using the new [Commit Scope](../durability/commit-scopes) based settings. + +## Upgrade Preparation + +Each major release of the software contains several changes that may affect +compatibility with previous releases. These may affect the Postgres +configuration, deployment scripts, as well as applications using BDR. We +recommend to consider and possibly adjust in advance of the upgrade. + +Please see individual changes mentioned in [release notes](../rel_notes/) and any version +specific upgrade notes in this topic. + +## Server Software Upgrade + +The upgrade of EDB Postgres Distributed on individual nodes happens in-place. +There is no need for backup and restore when upgrading the BDR extension. + +### BDR Extension Upgrade + +BDR extension upgrade process consists of few simple steps. + +#### Stop Postgres + +During the upgrade of binary packages, it's usually best to stop the running +Postgres server first to ensure that mixed versions don't get loaded in case +of unexpected restart during the upgrade. + +#### Upgrade Packages + +The first step in the upgrade is to install the new version of the BDR packages, which +installs both the new binary and the extension SQL script. This step is operating system-specific. + +#### Start Postgres + +Once packages are upgraded the Postgres instance can be started, the BDR +extension is automatically upgraded upon start when the new binaries +detect older version of the extension. + +### Postgres Upgrade + +The process of in-place upgrade of Postgres highly depends on whether you are +upgrading to new minor version of Postgres of to new major version of Postgres. + +#### Minor Version Postgres Upgrade + +Upgrading to a new minor version of Postgres is similar to [upgrading +the BDR extension](#bdr-extension-upgrade). Stopping Postgres, upgrading packages, +and starting Postgres again is typically all that's needed. + +However, sometimes additional steps like re-indexing may be recommended for +specific minor version upgrades. Refer to the Release Notes of the +specific version of Postgres you are upgrading to. + +#### Major Version Postgres Upgrade + +Upgrading to a new major version of Postgres is a more complicated process. + +EDB Postgres Distributed provides a `bdr_pg_upgrade` command line utility, +which can be used to do a [In-place Postgres Major Version Upgrades](bdr_pg_upgrade). + +!!! Note + When upgrading to new major version of any software, including Postgres, the + BDR extension and others, it's always important to ensure the compatibility + of your application with the target version of a given software. + +## Upgrade Check and Validation + +After this procedure, your BDR node is upgraded. You can verify the current +version of BDR4 binary like this: + +```sql +SELECT bdr.bdr_version(); +``` + +Always check the [monitoring](../monitoring) after upgrade of a node to confirm +that the upgraded node is working as expected. + +## Moving from HARP to PGD-Proxy + +HARP can for a time coexist with the new +[Connection management](/product_docs/docs/pgd/5/routing) configuration. + +This means you can upgrade whole pre-5 cluster to PGD 5 cluster, setup +the connection routing, replace the HARP Proxy with PGD-Proxy, move application +connections to PGD-Proxy instances and removed the HARP Manager from all servers. 
+ +It's highly recommended to do this as soon as possible after upgrading nodes to +PGD 5 as HARP is not certified for long term use with PGD 5. + +TPA provides some useful tools for this and will eventually provide single +command upgrade path between PGD 4 and PGD 5. diff --git a/product_docs/docs/pgd/5/upgrades/upgrade_paths.mdx b/product_docs/docs/pgd/5/upgrades/upgrade_paths.mdx new file mode 100644 index 00000000000..2ad66b0418d --- /dev/null +++ b/product_docs/docs/pgd/5/upgrades/upgrade_paths.mdx @@ -0,0 +1,25 @@ +--- +title: Supported BDR upgrade paths +--- + +## Upgrading from version 4 to version 5 + +Upgrades from PGD 4 to PGD 5 are supported from version 4.3.0, for older versions, please upgrade to 4.3.0 before upgrading to 5. See [Upgrading within 4](/pgd/4/upgrades/supported_paths/#upgrading-within-version-4) in the 4 documentation for more information. After upgrading to 4.3.0 or later the following combinations are allowed. + +| 4.3.0 | Target BDR version | +|-------|--------------------| +| X | 5.0.0 | + + +## Upgrading from version 3.7 to version 5 + +Upgrades from PGD 3.7 to PGD 5 are supported from version 3.7.20, please upgrade to 3.7.20 before upgrading to 5. See [Upgrading within from 3.7](/pgd/3.7/bdr/upgrades/supported_paths/#upgrading-within-version-37) in the 3.7 documentation for more information. After upgrading to 3.7.20 or later the following combinations are allowed + + +| 3.7.20 | Target BDR version | +|--------|--------------------| +| X | 5.0.0 | + +## Upgrading within version 5 + +Currently the only version of PGD 5 available is 5.0.0. diff --git a/product_docs/docs/tpa/23/INSTALL.mdx b/product_docs/docs/tpa/23/INSTALL.mdx new file mode 100644 index 00000000000..aa18a5ab1e8 --- /dev/null +++ b/product_docs/docs/tpa/23/INSTALL.mdx @@ -0,0 +1,177 @@ +--- +navTitle: Installation +title: TPA installation +originalFilePath: INSTALL.md + +--- + +To use TPA, you need to install tpaexec and run the `tpaexec setup` +command. This document explains how to install TPA packages. + +TPA packages are available to prospects (for a 60 day trial), EDB +customers with a valid Extreme HA subscription, or by prior arrangement. +Please contact your account manager to request access. + +We publish TPA packages for Debian 10 (buster), Ubuntu 22.04 (jammy), Ubuntu 20.04 +(focal), Ubuntu 18.04 (bionic), RHEL/CentOS 7.x and 8.x, Rocky 8.x and AlmaLinux 8.x. These +distributions provide a usable Python 3.6+ environment out of the box, +which TPA requires. However, TPA supports a wider range of +[distributions on target instances](reference/distributions). + +## Quickstart + +Login to [EDB Repos 2.0](https://www.enterprisedb.com/repos-downloads) +to obtain your token. Then execute the following command, substituting +your token for ``. + +```bash +# Add repository (Debian, Ubuntu) +$ curl -1sLf 'https://downloads.enterprisedb.com//postgres_distributed/setup.deb.sh' | sudo -E bash + +# Add repository (RedHat, Rocky or AlmaLinux) +$ curl -1sLf 'https://downloads.enterprisedb.com//postgres_distributed/setup.rpm.sh' | sudo -E bash +``` + +Then run the following commands: + +```bash +# Install packages (Debian, Ubuntu) +$ sudo apt-get install tpaexec + +# Install packages (RedHat, Rocky or AlmaLinux) +$ sudo yum install tpaexec + +# Install additional dependencies +$ sudo /opt/EDB/TPA/bin/tpaexec setup + +# Verify installation (run as a normal user) +$ /opt/EDB/TPA/bin/tpaexec selftest +``` + +More detailed explanations are given below. + +## What time is it? 
+ +Please make absolutely sure that your system has the correct date and +time set, because various things will fail otherwise. For example: + +```bash +$ sudo ntpdate pool.ntp.org +``` + +## Packages + +To install TPA, you must first subscribe to an EDB repository that +provides it. The preferred source for repositories is EDB Repos 2.0. + +Login to [EDB Repos 2.0](https://www.enterprisedb.com/repos-downloads) +to obtain your token. Then execute the following command, substituting +your token for ``. + +```bash +# Debian or Ubuntu +$ curl -1sLf 'https://downloads.enterprisedb.com//postgres_distributed/setup.deb.sh' | sudo -E bash + +# RedHat, Rocky or AlmaLinux +$ curl -1sLf 'https://downloads.enterprisedb.com//postgres_distributed/setup.rpm.sh' | sudo -E bash +``` + +Alternatively, you may obtain TPA from the legacy 2ndQuadrant +repository. To do so, login to the EDB Customer Support Portal and +subscribe to the ["products/tpa/release" repository](https://techsupport.enterprisedb.com/software_subscriptions/add/products/tpa/) +by adding a subscription under Support/Software/Subscriptions, +and following the instructions to enable the repository on your system. + +Once you have enabled one of these repositories, you may install TPA +as follows: + +```bash +# Debian or Ubuntu +$ sudo apt-get install tpaexec + +# RedHat, Rocky or AlmaLinux +$ sudo yum install tpaexec +``` + +This will install TPA into `/opt/EDB/TPA`. It will also +ensure that other required packages (e.g., Python 3.6 or later) are +installed. + +We mention `sudo` here only to indicate which commands need root +privileges. You may use any other means to run the commands as root. + +## Python environment + +Next, run `tpaexec setup` to create an isolated Python environment and +install the correct versions of all required modules. + +```bash +$ sudo /opt/EDB/TPA/bin/tpaexec setup +``` + +You must run this as root because it writes to `/opt/EDB/TPA`, +but the process will not affect any system-wide Python modules you may +have installed (including Ansible). + +Add `/opt/EDB/TPA/bin` to the `PATH` of the user who will +normally run `tpaexec` commands. For example, you could add this to +your .bashrc or equivalent shell configuration file: + +```bash +$ export PATH=$PATH:/opt/EDB/TPA/bin +``` + +### Installing without network access + +When you run `tpaexec setup`, it will ordinarily download the Python +packages from the network. The `tpaexec-deps` package (available from +the same repository as tpaexec) bundles everything that would have been +downloaded, so that they can be installed without network access. Just +install the package before you run `tpaexec setup` and the bundled +copies will be used automatically. + +## Verification + +Once you're done with all of the above steps, run the following command +to verify your local installation: + +```bash +$ tpaexec selftest +``` + +If that command completes without any errors, your TPA installation +is ready for use. + +## Upgrading + +To upgrade to a later release of TPA, you must: + +1. Install the latest `tpaexec` package +2. Install the latest `tpaexec-deps` package (if required; see above) +3. Run `tpaexec setup` again + +If you have subscribed to the TPA package repository as described +above, running `apt-get update && apt-get upgrade` or `yum update` +should install the latest available versions of these packages. If not, +you can install the packages by any means available. 
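
For example, on a Debian or Ubuntu machine that is already subscribed to the repository, the whole sequence might look like this (use `yum` on RedHat-family systems):

```bash
# Upgrade the packages
$ sudo apt-get update && sudo apt-get install tpaexec tpaexec-deps

# Re-run setup and verify the installation
$ sudo /opt/EDB/TPA/bin/tpaexec setup
$ tpaexec selftest
```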
+ +We recommend that you run `tpaexec setup` again whenever a new version +of `tpaexec` is installed. Some new releases may not strictly require +this, but others will not work without it. + +## Ansible community support + +TPA now supports ansible community, you may choose to use it by +using `--use-community-ansible` option during `tpaexec setup`, default +will be to use the legacy 2ndQuadrant/ansible fork. This will change in +a future release, support for 2ndQuadrant/ansible will be dropped and +community ansible will become the new default. + +notable difference: + +- change the `--skip-flags` options to community behavior where a + task will be skipped if part of the list given to the `--skip-tags` + option even if it is also tagged with special tag `always`. + TPA expects all tasks tagged with `always` to be run to ensure + a complete deployment, therefor `--skip-tags` should not be used when + using community ansible. diff --git a/product_docs/docs/tpa/23/ansible-and-sudo.mdx b/product_docs/docs/tpa/23/ansible-and-sudo.mdx new file mode 100644 index 00000000000..97c059cad81 --- /dev/null +++ b/product_docs/docs/tpa/23/ansible-and-sudo.mdx @@ -0,0 +1,145 @@ +--- +title: TPA, Ansible, and sudo +originalFilePath: ansible-and-sudo.md + +--- + +TPA uses Ansible with sudo to execute tasks with elevated privileges +on target instances. This page explains how Ansible uses sudo (which is +in no way TPA-specific), and the consequences to systems managed +with TPA. + +TPA needs root privileges; + +- to install packages (required packages using the operating system's + native package manager, and optional packages using pip) +- to stop, reload and restart services (i.e Postgres, repmgr, efm, etcd, + haproxy, pgbouncer etc.) +- to perform a variety of other tasks (e.g., gathering cluster facts, + performing switchover, setting up cluster nodes) + +TPA also needs to be able to use sudo. You can make it ssh in as root +directly by setting `ansible_user: root`, but it will still use sudo to +execute tasks as other users (e.g., postgres). + +## Ansible sudo invocations + +When Ansible runs a task using sudo, you will see a process on the +target instance that looks something like this: + +``` +/bin/bash -c 'sudo -H -S -n -u root /bin/bash -c \ + '"'"'echo BECOME-SUCCESS-kfoodiiprztsyerriqbjuqhhbemejgpc ; \ + /usr/bin/python2'"'"' && sleep 0' +``` + +People who were expecting something like `sudo yum install -y xyzpkg` +are often surprised by this. By and large, most tasks in Ansible will +invoke a Python interpreter to execute Python code, rather than +executing recognisable shell commands. (Playbooks may execute `raw` +shell commands, but TPA uses such tasks only to bootstrap a Python +interpreter.) + +Ansible modules contain Python code of varying complexity, and an +Ansible playbook is not just a shell script written in YAML format. +There is no way to “extract” shell commands that would do the same thing +as executing an arbitrary Ansible playbook. + +There is one significant consequence of how Ansible uses sudo: [privilege +escalation must be general](https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_privilege_escalation.html#privilege-escalation-must-be-general). That, it is not possible +to limit sudo invocations to specific commands in sudoers.conf, +as some administrators are used to doing. Most tasks will just invoke python. 
+You could have restricted sudo access to python if it were not +for the random string in every command—but once Python is running as root, +there's no effective limit on what it can do anyway. + +Executing Python modules on target hosts is just the way Ansible works. +None of this is specific to TPA in any way, and these considerations +would apply equally to any other Ansible playbook. + +## Recommendations + +- Use SSH public key-based authentication to access target instances. + +- Allow the SSH user to execute sudo commands without a password. + +- Restrict access by time, rather than by command. + +TPA needs access only when you are first setting up your cluster or +running `tpaexec deploy` again to make configuration changes, e.g., +during a maintenance window. Until then, you can disable its access +entirely (a one-line change for both ssh and sudo). + +During deployment, everything Ansible does is generally predictable +based on what the playbooks are doing and what parameters you provide, +and each action is visible in the system logs on the target instances, +as well as the Ansible log on the machine where tpaexec itself runs. + +Ansible's focus is less to impose fine-grained restrictions on what +actions may be executed and more to provide visibility into what it does +as it executes, so elevated privileges are better assigned and managed +by time rather than by scope. + +## SSH and sudo passwords + +We *strongly* recommend setting up password-less SSH key authentication +and password-less sudo access, but it is possible to use passwords too. + +If you set `ANSIBLE_ASK_PASS=yes` and `ANSIBLE_BECOME_ASK_PASS=yes` +in your environment before running tpaexec, Ansible will prompt you to +enter a login password and a sudo password for the remote servers. It +will then negotiate the login/sudo password prompt on the remote server +and send the password you specify (which will make your playbooks take +noticeably longer to run). + +We do not recommend this mode of operation because we feel it is a more +effective security control to completely disable access through a +particular account when not needed than to use a combination of +passwords to restrict access. Using public key authentication for ssh +provides an effective control over who can access the server, and it's +easier to protect a single private key per authorised user than it is to +protect a shared password or multiple shared passwords. Also, if you +limit access at the ssh/sudo level to when it is required, the passwords +do not add any extra security during your maintenance window. + +## sudo options + +To use Ansible with sudo, you must not set `requiretty` in sudoers.conf. + +If needed, you can change the sudo options that Ansible uses +(`-H -S -n`) by setting `become_flags` in the +`[privilege_escalation]` section of ansible.cfg, or +`ANSIBLE_BECOME_FLAGS` in the environment, or `ansible_become_flags` +in the inventory. All three methods are equivalent, but please change +the sudo options only if there is a specific need to do so. The defaults +were chosen for good reasons. For example, removing `-S -n` will cause +tasks to timeout if password-less sudo is incorrectly configured. + +## Logging + +For playbook executions, the sudo logs will show mostly invocations of +Python (just as it will show only an invocation of bash when someone +uses `sudo -i`). + +For more detail, the syslog will show the exact arguments to each module +invocation on the target instance. 
For a higher-level view of why that +module was invoked, the ansible.log on the controller shows what that +task was trying to do, and the result. + +If you want even more detail, or an independent source of audit data, +you can run auditd on the server and use the SELinux log files. You can +get still more fine-grained syscall-level information from bpftrace/bcc +(e.g., opensnoop shows every file opened on the system, and execsnoop +shows every process executed on the system). You can do any or all of +these things, depending on your needs, with the obvious caveat of +increasing overhead with increased logging. + +## Local privileges + +The +[installation instructions for TPA](INSTALL) +mention sudo only as shorthand for “run these commands as root somehow”. +Once TPA is installed and you have run `tpaexec setup`, TPA +itself does not require elevated privileges on the local machine. (But +if you use Docker, you must run tpaexec as a user that belongs to a +group that is permitted to connect to the Docker daemon.) diff --git a/product_docs/docs/tpa/23/architecture-BDR-Always-ON.mdx b/product_docs/docs/tpa/23/architecture-BDR-Always-ON.mdx new file mode 100644 index 00000000000..e6d17ca0c7f --- /dev/null +++ b/product_docs/docs/tpa/23/architecture-BDR-Always-ON.mdx @@ -0,0 +1,62 @@ +--- +title: BDR-Always-ON +originalFilePath: architecture-BDR-Always-ON.md + +--- + +BDR in an Always-ON configuration, intended for use in production. + +In BDR-Always-ON architecture we have four variants, which can be +selected with the `--layout` configure option: + +1. bronze: 2×bdr+primary, bdr+witness, barman, 2×harp-proxy + +2. silver: bronze, with bdr+witness promoted to bdr+primary, and barman + moved to separate location + +3. gold: two symmetric locations with 2×bdr+primary, 2×harp-proxy, + and barman each; plus a bdr+witness in a third location + +4. platinum: gold, but with one bdr+readonly (logical standby) added to + each of the main locations + +You can check EDB's Postgres-BDR Always On Architectures +[whitepaper](https://www.enterprisedb.com/promote/bdr-always-on-architectures) +for the detailed layout diagrams. + +This architecture is meant for use with BDR versions 3.6, 3.7, and 4. + +## Cluster configuration + +``` +[tpa]$ tpaexec configure ~/clusters/bdr \ + --architecture BDR-Always-ON \ + --layout gold \ + --harp-consensus-protocol bdr \ + --platform aws --region eu-west-1 --instance-type t3.micro \ + --distribution Debian-minimal +``` + +You must specify `--architecture BDR-Always-ON`. (In the example +above, it is the only option required to produce a working +configuration.) + +You also must specify `--layout layoutname` to set one of the supported BDR +use-case variations. The current options are bronze, silver, gold, and +platinum. The bronze, gold and platinum layouts have a BDR witness node +to ensure odd number of nodes for Raft consensus majority. Witness nodes do +not participate in the data replication. + +You must specify `--harp-consensus-protocol protocolname`. The supported +protocols are bdr and etcd; see [`Configuring HARP`](reference/harp) for more details. + +You may optionally specify `--bdr-database dbname` to set the name of +the database with BDR enabled (default: bdrdb). + +You may optionally specify `--enable-camo` to set the pair of BDR +primary instances in each region to be each other's CAMO partners. + +Please note we enable HARP2 by default in BDR-Always-ON architecture. 
+ +You may also specify any of the options described by +[`tpaexec help configure-options`](tpaexec-configure). diff --git a/product_docs/docs/tpa/23/architecture-M1.mdx b/product_docs/docs/tpa/23/architecture-M1.mdx new file mode 100644 index 00000000000..4ce8ebcf2fa --- /dev/null +++ b/product_docs/docs/tpa/23/architecture-M1.mdx @@ -0,0 +1,39 @@ +--- +title: M1 +originalFilePath: architecture-M1.md + +--- + +A Postgres cluster with a primary and a streaming replica, one Barman +server, and any number of additional replicas cascaded from the first +one. + +By default, the primary has one read-only replica attached in the same +location; the replica, in turn, has one cascaded replica attached in a +different location, where the Barman server is also configured to take +backups from the primary. + +![Cluster with cascading replication](images/m1.png) + +If there is an even number of PostgreSQL nodes, the Barman node is +additionally configured as a repmgr witness. This ensures that the +number of repmgr nodes is always odd, which is convenient when +enabling automatic failover. + +## Cluster configuration + +``` +[tpa]$ tpaexec configure ~/clusters/m1 \ + --architecture M1 \ + --platform aws --region eu-west-1 --instance-type t3.micro \ + --distribution Debian-minimal +``` + +You must specify `--architecture M1`. (In the example above, this is +the only option required to produce a working configuration.) + +You may optionally specify `--num-cascaded-replicas N` to request N +cascaded replicas (including 0 for none; default: 1). + +You may also specify any of the options described by +[`tpaexec help configure-options`](tpaexec-configure). diff --git a/product_docs/docs/tpa/23/architecture-PGD-Always-ON.mdx b/product_docs/docs/tpa/23/architecture-PGD-Always-ON.mdx new file mode 100644 index 00000000000..9d1afba5b00 --- /dev/null +++ b/product_docs/docs/tpa/23/architecture-PGD-Always-ON.mdx @@ -0,0 +1,79 @@ +--- +title: PGD-Always-ON +originalFilePath: architecture-PGD-Always-ON.md + +--- + +EDB Postgres Distributed in an Always-ON configuration, intended for use in production. + +This architecture is meant for use with PGD (BDR) version 5 only. + +## Cluster configuration + +``` +[tpa]$ tpaexec configure ~/clusters/pgd-ao \ + --architecture PGD-Always-ON \ + --location-names eu-west-1 eu-north-1 eu-central-1 \ + --data-nodes-per-location 2 \ + --add-witness-node-per-location \ + --active-locations eu-west-1 eu-central-1 \ + --add-witness-only-location eu-north-1 \ + --platform aws --instance-type t3.micro \ + --distribution Debian-minimal +``` + +You must specify `--architecture PGD-Always-ON`. + +You must specify a list of location names for the cluster with +`--location-names dc1 dc2 dc3`. + +A location represents an independent data centre that provides a level +of redundancy, in whatever way this definition makes sense to your use +case. For example, AWS regions, or availability zones within a region, +or any other designation to identify where your servers are hosted. + +A PGD-Always-ON cluster comprises one or more locations with the same +number of data nodes (if required to establish consensus, there may be +an additional witness node in each location, as well as a single extra +witness-only location). These locations, as many as required, must be +named in the `--location-names` list. + +(If you are using TPA to provision an AWS cluster, the locations will be +mapped to separate availability zones within the `--region` you specify. 
+You may specify multiple `--regions`, but TPA does not currently set up +VPC peering to allow instances in different regions to communicate with +each other. For a multi-region cluster, you will need to set up VPC +peering yourself.) + +By default, each location will have three data nodes. You may specify +a different number with `--data-nodes-per-location N`. The minimum +number is 2. + +If you have two data nodes per location, each location must also have an +extra witness node, and TPA will add one by default. For any even number +of data nodes >2, you may specify `--add-witness-node-per-location` to +add the extra witness node. + +By default, each location will also have separate PGD-Proxy instances. +You may specify `--cohost-proxies` to run PGD-Proxy on the data nodes. + +By default, TPA will configure PGD-Proxy to use global connection +routing, i.e., to elect a write lead from all available data nodes +across all locations. You may specify `--active-locations l2 l3` to +limit connection routing to nodes in the specified locations. This will +enable subgroup RAFT and proxy routing for those locations only. + +You may optionally specify `--add-witness-only-location loc` to +designate one of the cluster's locations as a special witness-only +location that contains no data nodes and only a single witness node, +used to improve the availability of consensus. This location cannot be +among the active locations list. + +You may optionally specify `--database-name dbname` to set the name of +the database with BDR enabled (default: bdrdb). + +You may optionally specify `--enable-camo` to set the pair of BDR +primary instances in each region to be each other's CAMO partners. + +You may also specify any of the options described by +[`tpaexec help configure-options`](tpaexec-configure). diff --git a/product_docs/docs/tpa/23/configure-cluster.mdx b/product_docs/docs/tpa/23/configure-cluster.mdx new file mode 100644 index 00000000000..eabf9765742 --- /dev/null +++ b/product_docs/docs/tpa/23/configure-cluster.mdx @@ -0,0 +1,253 @@ +--- +title: Cluster configuration +originalFilePath: configure-cluster.md + +--- + +With TPA, the way to make any configuration change to a cluster is +to edit config.yml and run the provision/deploy/test cycle. The process +is carefully designed to be idempotent, and to make changes only in +response to a change in the configuration or a change on the instances. + +The [`tpaexec configure`](tpaexec-configure) command will generate +a sensible config.yml file for you, but it covers only the most common +topology and configuration options. If you need something beyond the +defaults, or you need to make changes after provisioning the cluster, +you will need to edit config.yml anyway. + +This page is an overview of the configuration mechanisms available. +There's a separate page with more details about the specific +[variables you can set to customise the deployment process](configure-instance). + +## config.yml + +Your `config.yml` file is a +[YAML format](https://yaml.org) text file that represents all aspects of +your desired cluster configuration. 
Here's a minimal example of a +cluster with two instances: + +```yaml +cluster_name: speedy + +cluster_vars: + postgres_version: 9.6 + +instances: +- node: 1 + Name: one + role: primary + platform: docker + vars: + ansible_user: root + x: 42 + +- node: 2 + Name: two + role: replica + platform: docker + upstream: one + vars: + ansible_user: root + x: 53 +``` + +These three definitions are central to your cluster configuration. The +file may contain many other definitions (including platform-specific +details), but the list of `instances` with `vars` set either for one +instance or for the whole cluster are the basic building blocks of +every TPA configuration. + +All +[`tpaexec configure`](tpaexec-configure) +options translate to config.yml variables in +some way. A single option may affect several variables (e.g., +`--bdr-version` could set `postgres_version`, +`tpa_2q_repositories`, `edb_repositories`, `extra_postgres_extensions`, and so on), but +you can always accomplish with an editor what you could by running the +command. + +In terms of YAML syntax, config.yml as a whole represents a hash with +keys such as `cluster_vars` and `instances`. **You must ensure that +each key is defined only once.** If you were to inadvertently repeat the +`cluster_vars`, say, the second definition would completely override +the former, and your next deployment could make unintended changes +because of missing (shadowed) variables. + +TPA checks the consistency of the overall cluster topology (for +example, if you declare an instance with the role "replica", you must +also declare the name of its upstream instance, and that instance must +exist), but it will not prevent you from setting any variable you like +on the instances. You must exercise due caution, and try out changes in +a test environment before rolling them out into production. + +## Variables + +In Ansible terminology, most configuration settings are “inventory +variables”—TPA will translate `cluster_vars` into `group_vars` +(that apply to the cluster as a whole) and each instance's `vars` into +`host_vars` in the inventory during provisioning, and deployment will +use the inventory values. After you change config.yml, **you must +remember to run** `tpaexec provision` **before** `tpaexec deploy`. + +Any variable can be set for the entire cluster, or an individual host, +or both; host variables override group variables. In practice, setting +`x: 42` in `cluster_vars` is no different from setting it in every +host's `vars`. A host that needs `x` during deployment will see the +value 42 either way. A host will always see the most specific value, so +it is convenient to set some default value for the group and override it +for specific instances as required. + +Whenever possible, defining variables in `cluster_vars` and overriding +them for specific instances results in a concise configuration that is +easier to review and change (less repetition). Beyond that, it's up to +you to decide whether any given setting makes more sense as a group or +host variable. 
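
For example, extending the two-instance cluster above (platform details omitted), you could set a cluster-wide default and override it on a single host; instance `two` then sees `x: 53`, and every other instance sees `x: 42`:

```yaml
cluster_vars:
  x: 42

instances:
- node: 1
  Name: one
- node: 2
  Name: two
  vars:
    x: 53
```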
+ +## Cluster variables + +The keys under `cluster_vars` may map to any valid YAML type, and will +be translated directly into group variables in the Ansible inventory: + +```yaml +cluster_vars: + postgres_version: 11 + tpa_2q_repositories: + - products/bdr3/release + - products/pglogical3/release + postgres_conf_settings: + bdr.trace_replay: true +``` + +In this case, `tpaexec provision` will write three variables (a +string, a list, and a hash) to the inventory in +`group_vars/tag_Cluster_name/01-cluster_name.yml`. + +## Instance variables + +This documentation uses the term “instance variables” to refer to any +variables that are defined for a specific instance in config.yml. For +example, here's a typical instance definition: + +```yaml +instances: +- Name: unwind + node: 1 + backup: unkempt + location: a + role: + - primary + - bdr + volumes: + - device_name: root + encrypted: true + volume_size: 16 + volume_type: gp2 + - device_name: /dev/xvdf + encrypted: true + vars: + volume_for: postgres_data + volume_size: 64 + volume_type: gp2 + platform: aws + type: t3.micro + vars: + ansible_user: ec2-user + postgres_conf_directory: /opt/postgres/conf +``` + +The variables defined in this instance's `vars` will all become host +variables in the inventory, but all host vars in the inventory do not +come from `vars` alone. Some other instance settings, including +`platform`, `location`, `volumes`, and `role` are also copied to the +inventory as host vars (but you cannot define these settings under +`vars` or `cluster_vars` instead). + +The settings outside `vars` may describe the properties of the instance +(e.g., `Name` and `node`) or its place in the topology of the cluster +(e.g., `role`, `backup`) or they may be platform-specific attributes +(e.g., instance `type` and `volumes`). Other than knowing that they +cannot be defined under `vars`, it is rarely necessary to distinguish +between these instance “settings” and instance “variables”. + +In this case, `tpaexec provision` will write a number of host +variables to the inventory in `host_vars/unwind/01-instance_vars.yml`. + +## instance_defaults + +This is a mechanism to further reduce repetition in +config.yml. It is most useful for instance settings that cannot be +defined as `cluster_vars`. For example, you could write the following: + +```yaml +instance_defaults: + platform: aws + type: t3.micro + tags: + AWS_ENVIRONMENT_SPECIFIC_TAG_KEY: some_mandated_value + +instances: +- node: 1 + Name: one +- node: 2 + Name: two +- … +``` + +Whatever you specify under `instance_defaults` serves as the default for +every entry in `instances`. In this example, it saves spelling out the +`platform` and `type` of each instance, and makes it easier to change +all your instances to a different type. If any instance specifies a +different value, it will of course take precedence over the default. + +It may help to think of `instance_defaults` as being a macro facility to +use in defining `instances`. What is ultimately written to the inventory +comes from the (expanded) definition of `instances` alone. If you're +trying to decide whether to put something in `cluster_vars` or +`instance_defaults`, it probably belongs in the former unless it +*cannot* be defined as a variable (e.g., `platform` or `type`), which is +true for many platform-specific properties (such as AWS resource tags) +that are used only in provisioning, and not during deployment. 
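
As a rough illustration of that guidance (the specific values here are only examples), a deployment-time setting such as `postgres_version` belongs in `cluster_vars`, while attributes that cannot be expressed as variables stay in `instance_defaults`:

```yaml
cluster_vars:
  postgres_version: 11          # a deployment-time variable: belongs here

instance_defaults:
  platform: aws                 # cannot be set as a variable: belongs here
  type: t3.micro
  tags:
    Owner: some_team            # hypothetical AWS resource tag, used only during provisioning
```
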
+ +The `instance_defaults` mechanism does nothing to stop you from using it +to fill in the `vars` for an instance (default hash values are merged +with any hash specified in the `instances` entry). However, there is no +particular advantage to doing this rather than setting the same default +in `cluster_vars` and overriding it for an instance if necessary. When +in doubt, use `cluster_vars`. + +## Locations + +You can also specify a list of `locations` in config.yml: + +```yaml +locations: +- Name: first + az: eu-west-1a + region: eu-west-1 + subnet: 10.33.110.128/28 + +- Name: second + az: us-east-1b + region: us-east-1 + subnet: 10.33.75.0/24 + +instances: +- node: 1 + Name: one + location: first +… +``` + +If an instance specifies `location: first` (or `location: 0`), the +settings under that location serve as defaults for that instance. Again, +just like `instance_defaults`, an instance may override the defaults +that it inherits from its location. And again, you can use this feature +to fill in `vars` for an instance. This can be useful if you have some +defaults that apply to only half your instances, and different values +for the other half (as with the platform-specific settings in the +example above). + +Locations represent a collection of settings that instances can “opt in” +to. You can use them to stand for different data centres, AWS regions, +Docker hosts, or something else entirely. TPA does not expect or +enforce any particular interpretation. diff --git a/product_docs/docs/tpa/23/configure-instance.mdx b/product_docs/docs/tpa/23/configure-instance.mdx new file mode 100644 index 00000000000..a7c17ae0372 --- /dev/null +++ b/product_docs/docs/tpa/23/configure-instance.mdx @@ -0,0 +1,173 @@ +--- +title: Instance configuration +originalFilePath: configure-instance.md + +--- + +This page presents an overview of the various controls that TPA +offers to customise the deployment process on cluster instances, with +links to more detailed documentation. + +Before you dive into the details of deployment, it may be helpful to +read [an overview of configuring a cluster](configure-cluster) to +understand how cluster and instance variables and the other mechanisms +in config.yml work together to allow you to write a concise, +easy-to-review configuration. + +## System-level configuration + +The first thing TPA does is to ensure that Python is bootstrapped +and ready to execute Ansible modules (a distribution-specific process). +Then it completes various system-level configuration tasks before moving +on to [Postgres configuration](#postgres) below. + +- [Distribution support](reference/distributions) +- [Python environment](reference/python) (`preferred_python_version`) +- [Environment variables](reference/target_environment) (e.g., `https_proxy`) + +### Package repositories + +You can use the +[pre-deploy hook](tpaexec-hooks#pre-deploy) +to execute tasks before any package repositories are configured. 
+ +- [Configure YUM repositories](reference/yum_repositories) + (for RHEL, Rocky and AlmaLinux) + +- [Configure APT repositories](reference/apt_repositories) + (for Debian and Ubuntu) + +- [Configure 2ndQuadrant and EDB repositories](reference/2q_and_edb_repositories) + (on any system) + +- [Configure a local package repository](reference/local-repo) + (to ship packages to target instances) + +You can use the +[post-repo hook](tpaexec-hooks#post-repo) +to execute tasks after package repositories have been configured (e.g., +to correct a problem with the repository configuration before installing +any packages). + +### Package installation + +Once the repositories are configured, packages are installed at various +stages throughout the deployment, beginning with a batch of system +packages: + +- [Install non-Postgres packages](reference/packages) + (e.g., acl, openssl, sysstat) + +Postgres and other components (e.g., Barman, repmgr, pgbouncer) will be +installed separately according to the cluster configuration; these are +documented in their own sections below. + +### Other system-level tasks + +- [Create and mount filesystems](reference/volumes) (including RAID, + LUKS setup) +- [Upload artifacts](reference/artifacts) (files, directories, + tar archives) +- [Set sysctl values](reference/sysctl_values) +- [Configure /etc/hosts](reference/hosts) +- [Manage ssh_known_hosts entries](reference/manage_ssh_hostkeys) + +## Postgres + +Postgres configuration is an extended process that goes hand-in-hand +with setting up other components like repmgr and pgbouncer. It begins +with installing Postgres itself. + +### Version selection + +Use the +[`--postgres-version`](tpaexec-configure#software-versions) +configure option or set `postgres_version` in config.yml to specify +which Postgres major version you want to install. The default version is +currently 11, but you can select 9.4, 9.5, 9.6, 10, 12, or 13 instead. + +That's all you really need to do to set up a working cluster. Everything +else on this page is optional. You can control every aspect of the +deployment if you want to, but the defaults are carefully tuned to give +you a sensible cluster as a starting point. + +### Installation + +The default `postgres_installation_method` is to install packages for +the version of Postgres you selected, along with various extensions, +according to the architecture's needs: + +- [Install Postgres and Postgres-related packages](reference/postgres_installation_method_pkg) + (e.g., pglogical, BDR, etc.) + +- [Build and install Postgres and extensions from source](reference/postgres_installation_method_src) + (for development and testing) + +Whichever installation method you choose, TPA can give you the same +cluster configuration with a minimum of effort. + +### Configuration + +- [Configure the postgres Unix user](reference/postgres_user) + +- [Run initdb to create the PGDATA directory](reference/initdb) + +- [Configure pg_hba.conf](reference/pg_hba.conf) + +- [Configure pg_ident.conf](reference/pg_ident.conf) + +- [Configure postgresql.conf](reference/postgresql.conf) + +You can use the +[postgres-config hook](tpaexec-hooks#postgres-config) +to execute tasks after the Postgres configuration files have been +installed (e.g., to install additional configuration files). + +Once the Postgres configuration is in place, TPA will go on to +install and configure other components such as Barman, repmgr, +pgbouncer, and haproxy, according to the details of the architecture. 
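
Returning to the postgres-config hook mentioned above, here's a minimal, hedged sketch of what such a hook might contain. It assumes the hook file is `hooks/postgres-config.yml` in your cluster directory (following the naming convention described under [hook scripts](tpaexec-hooks)), and that `files/extra.conf` is a hypothetical file of your own:

```yaml
# hooks/postgres-config.yml (assumed path): these tasks run after TPA has
# installed the Postgres configuration files on each instance.
- name: Install an additional configuration file
  copy:
    src: files/extra.conf                                # hypothetical local file
    dest: "{{ postgres_conf_directory }}/extra.conf"     # postgres_conf_directory as set in config.yml
    owner: postgres
    group: postgres
    mode: "0644"
```
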
+ +## Other components + +- [Configure Barman](reference/barman) +- [Configure pgbouncer](reference/pgbouncer) +- [Configure haproxy](reference/haproxy) +- [Configure HARP](reference/harp) +- [Configure EFM](reference/efm) + +### Configuring and starting services + +TPA will now install systemd service unit files for each service. +The service for Postgres is named `postgres.service`, and can be started +or stopped with `systemctl start postgres`. + +In the first deployment, the Postgres service will now be started. If +you are running `tpaexec deploy` again, the service may be reloaded or +restarted depending on what configuration changes you may have made. Of +course, if the service is already running and there are no changes, then +it's left alone. + +In any case, Postgres will be running at the end of this step. + +## After starting Postgres + +- [Create Postgres users](reference/postgres_users) + +- [Create Postgres tablespaces](reference/postgres_tablespaces) + +- [Create Postgres databases](reference/postgres_databases) + +- [Configure pglogical replication](reference/pglogical) + +- [Configure .pgpass](reference/pgpass) + +You can use the +[postgres-config-final hook](tpaexec-hooks#postgres-config-final) +to execute tasks after the post-startup Postgres configuration has been +completed (e.g., to perform SQL queries to create objects or load data). + +- [Configure BDR](reference/bdr) + +You can use the +[post-deploy hook](tpaexec-hooks#post-deploy) +to execute tasks after the deployment process has completed. diff --git a/product_docs/docs/tpa/23/configure-source.mdx b/product_docs/docs/tpa/23/configure-source.mdx new file mode 100644 index 00000000000..4d972d2128b --- /dev/null +++ b/product_docs/docs/tpa/23/configure-source.mdx @@ -0,0 +1,165 @@ +--- +title: Building from source +originalFilePath: configure-source.md + +--- + +TPA can build Postgres and other required components from source and +deploy a cluster with exactly the same configuration as with the default +packaged installation. This makes it possible to deploy repeatedly from +source to quickly test changes in a realistic, fully-configured cluster +that reproduces every aspect of a particular setup, regardless of +architecture or platform. + +You can even combine packaged installations of certain components with +source builds of others. For example, you can install Postgres from +packages and compile pglogical and BDR from source, but package +dependencies would prevent installing pglogical from source and BDR from +packages. + +Source builds are meant for use in development, testing, and for support +operations. + +## Quickstart + +Spin up a cluster with 2ndQPostgres, pglogical3, and bdr all built from +stable branches: + +```bash +$ tpaexec configure ~/clusters/speedy -a BDR-Always-ON \ + --layout bronze \ + --harp-consensus-protocol etcd \ + --install-from-source \ + 2ndqpostgres:2QREL_13_STABLE_dev \ + pglogical3:REL3_7_STABLE \ + bdr3:REL3_7_STABLE +``` + +As above, but set up a cluster that builds 2ndQPostgres source code from +the official git repository and uses the given local work trees to build +pglogical and BDR. 
This feature is specific to Docker: + +```bash +$ tpaexec configure ~/clusters/speedy \ + --architecture BDR-Always-ON --layout bronze \ + --harp-consensus-protocol etcd \ + --platform docker \ + --install-from-source 2ndqpostgres:2QREL_13_STABLE_dev \ + pglogical3 bdr3 \ + --local-source-directories \ + pglogical3:~/src/pglogical bdr3:~/src/bdr +``` + +After deploying your cluster, you can use +`tpaexec deploy … --skip-tags build-clean` on subsequent runs to +reuse build directories. (Otherwise the build directory is emptied +before starting the build.) + +Read on for a detailed explanation of how to build Postgres, pglogical, +BDR, and other components with custom locations and build parameters. + +## Configuration + +There are two aspects to configuring source builds. + +If you just want a cluster running a particular combination of sources, +run `tpaexec configure` to generate a configuration with sensible +defaults to download, compile, and install the components you select. +You can build Postgres or Postgres Extended, pglogical, and BDR, and specify +branch names to build from, as shown in the examples above. + +The underlying mechanism is capable of much more than the command-line +options allow. By editing config.yml, you can clone different source +repositories, change the build location, specify different configure or +build parameters, redefine the build commands entirely, and so on. You +can, therefore, build things other than Postgres, pglogical, and BDR. + +The available options are documented here: + +- [Building Postgres from source](reference/postgres_installation_method_src) + +- [Building extensions with `install_from_source`](reference/install_from_source) + +## Local source directories + +You can use TPA to provision Docker containers that build Postgres +and/or extensions from your local source directories instead of from a +Git repository. + +Suppose you're using `--install-from-source` to declare what you want +to build: + +```bash +$ tpaexec configure ~/clusters/speedy \ + --architecture BDR-Always-ON --layout bronze \ + --harp-consensus-protocol etcd \ + --platform docker \ + --install-from-source 2ndqpostgres:2QREL_13_STABLE_dev \ + pglogical3:REL3_7_STABLE bdr3:REL3_7_STABLE \ + … +``` + +By default, this will clone the known repositories for Postgres Extended, +pglogical3, and bdr3, check out the given branches, and build them. But +you can add `--local-source-directories` to specify that you want the +sources to be taken directly from your host machine instead: + +```bash +$ tpaexec configure ~/clusters/speedy \ + --architecture BDR-Always-ON --layout bronze \ + --harp-consensus-protocol etcd \ + --platform docker \ + --install-from-source 2ndqpostgres:2QREL_13_STABLE_dev \ + pglogical3 bdr3 \ + --local-source-directories \ + pglogical3:~/src/pglogical bdr3:~/src/bdr \ + … +``` + +This configuration will still install Postgres Extended from the repository, +but it obtains pglogical3 and bdr3 sources from the given directories on +the host. These directories are bind-mounted read-only into the Docker +containers at the same locations where the git repository would have +been cloned to, and the default (out-of-tree) build proceeds as usual. + +If you specify a local source directory for a component, you cannot +specify a branch to build (cf. `pglogical3:REL3_7_STABLE` vs. +`pglogical3` for `--install-from-source` in the examples above). 
The +source directory is mounted read-only in the containers, so TPA +cannot do anything to change it—neither `git pull`, nor +`git checkout`. You get whichever branch you have checked out locally, +uncommitted changes and all. + +Using `--local-source-directories` includes a list of Docker volume +definitions in config.yml: + +```yaml +local_source_directories: + - /home/ams/src/pglogical:/opt/postgres/src/pglogical:ro + - /home/ams/src/bdr:/opt/postgres/src/bdr:ro + - ccache-bdr_src_36-20200828200021:/root/.ccache:rw +``` + +### ccache + +TPA installs ccache by default for source builds of all kinds. When +you are using a Docker cluster with local source directories, by default +a new Docker volume is attached to the cluster's containers to serve as +a shared ccache directory. This volume is completely isolated from the +host, and is removed when the cluster is deprovisioned. + +Use the `--shared-ccache /path/to/host/ccache` configure option to +specify a longer-lived shared ccache directory. This directory will be +bind-mounted r/w into the containers, and its contents will be shared +between the host and the containers. + +(By design, there is no way to install binaries compiled on the host +directly into the containers.) + +## Rebuilding + +After deploying a cluster with components built from source, you can +rebuild those components quickly without having to rerun `tpaexec +deploy` by using the `tpaexec rebuild-sources` command. This will run +`git pull` for any components built from git repositories on the +containers, and rebuild all components. diff --git a/product_docs/docs/tpa/23/images/Cluster1.jpg b/product_docs/docs/tpa/23/images/Cluster1.jpg new file mode 100644 index 00000000000..215efb764e9 --- /dev/null +++ b/product_docs/docs/tpa/23/images/Cluster1.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0689660fd7b7877bb859b073bcf8006f10c9f4325fa3917fecc758c8c1e26342 +size 67204 diff --git a/product_docs/docs/tpa/23/images/bdr-always-on.png b/product_docs/docs/tpa/23/images/bdr-always-on.png new file mode 100644 index 00000000000..28942db994d --- /dev/null +++ b/product_docs/docs/tpa/23/images/bdr-always-on.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c1b7e1208e566ad2264330ca420a79b4c21caf62ce6c8209d4eeac6825c9094 +size 41211 diff --git a/product_docs/docs/tpa/23/images/m1.dot b/product_docs/docs/tpa/23/images/m1.dot new file mode 100644 index 00000000000..5776b84b88d --- /dev/null +++ b/product_docs/docs/tpa/23/images/m1.dot @@ -0,0 +1,10 @@ +# © Copyright EnterpriseDB UK Limited 2015-2023 - All rights reserved. 
+ +digraph M1 { + backup [shape=box]; + primary -> replica; + primary -> backup; + replica -> replica_2; + replica -> replica_…; + replica -> replica_N; +} diff --git a/product_docs/docs/tpa/23/images/m1.png b/product_docs/docs/tpa/23/images/m1.png new file mode 100644 index 00000000000..d367a8dad50 --- /dev/null +++ b/product_docs/docs/tpa/23/images/m1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba330be52c91df1353f44b56079cf4ebdffbbc35a16515d1435d6635f8d9fcc8 +size 25170 diff --git a/product_docs/docs/tpa/23/index.mdx b/product_docs/docs/tpa/23/index.mdx new file mode 100644 index 00000000000..f77aa2b1732 --- /dev/null +++ b/product_docs/docs/tpa/23/index.mdx @@ -0,0 +1,193 @@ +--- +navigation: + - index + - rel_notes + - INSTALL + - '#Creating a cluster' + - tpaexec-configure + - tpaexec-provision + - tpaexec-deploy + - tpaexec-test + - '#Architectures' + - architecture-PGD-Always-ON + - architecture-BDR-Always-ON + - architecture-M1 + - '#Platforms' + - platform-aws + - platform-bare + - platform-docker + - '#Customizations' + - configure-cluster + - configure-instance + - configure-source + - tpaexec-hooks + - '#Cluster management' + - tpaexec-update-postgres + - tpaexec-switchover + - tpaexec-server-pool + - tpaexec-rehydrate + - '#Miscellaneous' + - tower + - ansible-and-sudo + - misc-configure-putty + - misc-troubleshooting + - reference +title: TPA +originalFilePath: index.md + +--- + +## Introduction + +TPA is an orchestration tool that uses Ansible to build Postgres +clusters according to EDB's recommendations. + +TPA embodies the best practices followed by EDB, and is informed by many +years of experience with deploying Postgres and associated components in +various scenarios. These recommendations are as applicable to quick +testbed setups as to production environments. + +(You can skip straight to the [TPA installation +instructions](INSTALL) if you want to get started.) + +## What can TPA do? + +TPA operates in four distinct stages to bring up a Postgres cluster: + +- Generate a cluster [configuration](#configuration) +- [Provision](#provisioning) servers (VMs, containers) to host the cluster +- [Deploy](#deployment) software to the provisioned instances +- [Test](#testing) the deployed cluster + +```bash +# 1. Configuration: decide what kind of cluster you want +[tpa]$ tpaexec configure clustername --architecture M1 --platform aws + +# 2. Provisioning: create the servers needed to host the cluster +[tpa]$ tpaexec provision clustername + +# 3. Deployment: install and configure the necessary software +[tpa]$ tpaexec deploy clustername + +# 4. Testing: make sure everything is working as expected +[tpa]$ tpaexec test clustername +``` + +You can run TPA from your laptop, an EC2 instance, or any machine +that can reach the cluster's servers over the network. + +Here's a [list of capabilities and supported software](reference/tpaexec-support). + +### Configuration + +The [`tpaexec configure`](tpaexec-configure) +command generates a simple YAML configuration file to describe a +cluster, based on the options you select. The configuration is ready for +immediate use, and you can modify it to better suit your needs. Editing +the configuration file is the usual way to [make any configuration +changes to your cluster](configure-cluster), both before and after +it's created. + +At this stage, you must select an architecture and a platform for the +cluster. 
An **architecture** is a recommended layout of servers and +software to set up Postgres for a specific purpose. Examples include +"M1" (Postgres with a primary and streaming replicas) and +"BDR-Always-ON" (Postgres with BDR in an HA configuration). A +**platform** is a means to host the servers to deploy any architecture, +e.g., AWS, Docker, or bare-metal servers. + +### Provisioning + +The [`tpaexec provision`](tpaexec-provision) +command creates instances and other resources required by the cluster. +The details of the process depend on the architecture (e.g., M1) and +platform (e.g., AWS) that you selected while configuring the cluster. + +For example, given AWS access with the necessary privileges, TPA +will provision EC2 instances, VPCs, subnets, routing tables, internet +gateways, security groups, EBS volumes, elastic IPs, etc. + +You can also "provision" existing servers by selecting the "bare" +platform and providing connection details. Whether these are bare metal +servers or those provisioned separately on a cloud platform, they can be +used just as if they had been created by TPA. + +You are not restricted to a single platform—you can spread your cluster +out across some AWS instances (in multiple regions) and some on-premise +servers, or servers in other data centres, as needed. + +At the end of the provisioning stage, you will have the required number +of instances with the basic operating system installed, which TPA +can access via SSH (with sudo to root). + +### Deployment + +The [`tpaexec deploy`](tpaexec-deploy) +command installs and configures Postgres and other software on the +provisioned servers (which may or may not have been created by TPA; +but it doesn't matter who created them so long as SSH and sudo access is +available). This includes setting up replication, backups, and so on. + +At the end of the deployment stage, Postgres will be up and running. + +### Testing + +The [`tpaexec test`](tpaexec-test) command executes various +architecture and platform-specific tests against the deployed cluster to +ensure that it is working as expected. + +At the end of the testing stage, you will have a fully-functioning +cluster. + +### Incremental changes + +TPA is carefully designed so that provisioning, deployment, and +testing are idempotent. You can run through them, make a change to +config.yml, and run through the process again to deploy the change. If +nothing has changed in the configuration or on the instances, then +rerunning the entire process will not change anything either. + +### Cluster management + +Once your cluster is up and running, TPA provides convenient cluster +management functions, including configuration changes, switchover, and +zero-downtime minor-version upgrades. These features make it easier and +safer to manage your cluster than making the changes by hand. + +### Extensible through Ansible + +TPA supports a [variety of configuration +options](configure-instance), so you can do a lot just by editing +config.yml and re-running provision/deploy/test. If you do need to go +beyond what TPA already supports, you can write + +- [Custom commands](reference/tpaexec-commands), which make it simple to write + playbooks to run on the cluster. Just create + `commands/xyz.yml` in your cluster directory, and invoke it + using `tpaexec xyz /path/to/cluster`. Ideal for any management tasks + or processes that you need to automate. 
+ +- [Custom tests](reference/tpaexec-tests), which augment the builtin tests with + in-depth verifications specific to your environment and application. + Using `tpaexec test` to run all tests in a uniform, repeatable way + ensures that you will not miss out on anything important, either when + dealing with a crisis, or just during routine cluster management. + +- [Hook scripts](tpaexec-hooks), which are invoked during various + stages of the deployment. For example, tasks in `hooks/pre-deploy.yml` + will be run before the main deployment; there are many other hooks, + including `post-deploy`. This places the full range of Ansible + functionality at your disposal. + +## It's just Postgres + +TPA can create complex clusters with many features configured, but +the result is just Postgres. The installation follows some conventions +designed to make life simpler, but there is no hidden magic or anything +standing in the way between you and the database. You can do everything +on a TPA cluster that you could do on any other Postgres installation. + +## Getting started + +Follow the [TPA installation instructions](INSTALL) for your +system, then [configure your first cluster](tpaexec-configure). diff --git a/product_docs/docs/tpa/23/misc-configure-putty.mdx b/product_docs/docs/tpa/23/misc-configure-putty.mdx new file mode 100644 index 00000000000..97a2f7ea518 --- /dev/null +++ b/product_docs/docs/tpa/23/misc-configure-putty.mdx @@ -0,0 +1,81 @@ +--- +navTitle: PuTTY configuration +title: TPA - PuTTY Configuration guide +originalFilePath: misc-configure-putty.md + +--- + +In order to use PuTTY under Windows to connect via ssh to the AWS instances +that were created by the TPA utility ***tpaexec provision***, the keys will +need to be converted from the private key format (.pem) generated by Amazon EC2 +to the PuTTY format (.ppk). + +``` +# Provision the cluster +[tpa]$ tpaexec provision +``` + +**PuTTY** has a tool named **PuTTYgen**, which can convert keys to the required +format. + +* * * + +## Key conversion + +**Locate private key** + +Locate the private key in the cluster directory `` - it will be +named according to the **cluster_name** variable set in **config.yml** prefixed +by **id\_** - e.g. if the cluster_name is set to **testenv1**, then the private +key will be called **id_testenv1**. + +**Save key as .pem** + +Copy this file into your Windows filesystem & save it as a .pem file - in this +example **id_testenv1.pem** - cut and pasting into a text file will work fine +for this. + +**Key conversion** + +Start **PuTTYgen** and under Parameters, select appropriate Type of key to +generate: + +For older versions of **PuTTYgen**, select **SSH-2 RSA**; for recent versions +select**RSA** + +*Do not select SSH-1 (RSA)* + +Now choose **Load** - in the box that says **PuTTY Private Key Files (\*.ppk)** +you will need to select **All Files (\*.\*)** + +Select your **.pem** file and choose **Open**, then click **OK**. + +Select **Save private key** and click **Yes** to ignore the warning about saving +the key without a passphrase. Make sure that the file suffix is **.ppk** and +choose the same name as for the **.pem** file; in this example the filename +might be **id_testenv1.ppk** + +* * * + +## **Configure PuTTY** + +Start **PuTTY** and select **Session** from the **Category** window. In the +**Host Name** panel, enter `@` and in the Port Panel, enter +**22** + +The `` and `` can be found in the `/ssh_config` +file which gets created by the **`tpaexec provision`** utility. 
+

In the Putty **Category** window, select **Connection**, expand **SSH** and
select **Auth**

For the panel marked ***Private key file for authentication***, click **Browse**
and select the .ppk file that was saved above, then select **Open**

In the Putty **Category** window, select **Session** again, enter a session name
in **Saved Sessions**, and **Save**

You should now be able to connect to the AWS host via PuTTY by selecting this
saved session.

* * * diff --git a/product_docs/docs/tpa/23/misc-troubleshooting.mdx b/product_docs/docs/tpa/23/misc-troubleshooting.mdx new file mode 100644 index 00000000000..5d854fec4af --- /dev/null +++ b/product_docs/docs/tpa/23/misc-troubleshooting.mdx @@ -0,0 +1,115 @@ +---
title: Troubleshooting
originalFilePath: misc-troubleshooting.md

---

### Recreate python virtual environment

Occasionally the Python venv can get into an inconsistent state, in which case the easiest solution is to delete and recreate it. Symptoms of a broken venv can include errors during provisioning like:

```
TASK [Write Vagrantfile and firstboot.sh] ******************************************************************************************************************************
failed: [localhost] (item=Vagrantfile) => {"changed": false, "checksum": "bf1403a17d897b68fa8137784d298d4da36fb7f9", "item": "Vagrantfile", "msg": "Aborting, target uses selinux but python bindings (libselinux-python) aren't installed!"}
```

To create a new virtual environment (assuming tpaexec was installed into the default location):

```
[tpa]$ sudo rm -rf /opt/EDB/TPA/tpa-venv
[tpa]$ sudo /opt/EDB/TPA/bin/tpaexec setup
```

### Strange AWS errors regarding credentials

If the time and date of the TPA server aren't correct, you can get AWS errors similar to this during provisioning:

```
TASK [Register key tpa_cluster in each region] **********************************************
An exception occurred during task execution. To see the full traceback, use -vvv. The error was: ClientError: An error occurred (AuthFailure) when calling the DescribeKeyPairs operation: AWS was not able to validate the provided access credentials
failed: [localhost] (item=eu-central-1) => {"boto3_version": "1.8.8", "botocore_version": "1.11.8", "changed": false, "error": {"code": "AuthFailure", "message": "AWS was not able to validate the provided access credentials"}, "item": "eu-central-1", "msg": "error finding keypair: An error occurred (AuthFailure) when calling the DescribeKeyPairs operation: AWS was not able to validate the provided access credentials", "response_metadata": {"http_headers": {"date": "Thu, 27 Sep 2018 12:49:41 GMT", "server": "AmazonEC2", "transfer-encoding": "chunked"}, "http_status_code": 401, "request_id": "a0d905ba-188f-48fe-8e5a-c8d8799e3232", "retry_attempts": 0}}

```

Solution: set the time and date correctly.

```
[tpa]$ sudo ntpdate pool.ntp.org
```

### Logging

By default, all tpaexec logging will be saved in the logfile `/ansible.log`

To change the logfile location, set the environment variable `ANSIBLE_LOG_PATH` to the desired location, e.g.
+

```
export ANSIBLE_LOG_PATH=~/ansible.log
```

To increase the verbosity of logging, just add `-v`/`-vv`/`-vvv`/`-vvvv`/`-vvvvv` to the tpaexec command line:

```
[tpa]$ tpaexec deploy -v

-v shows the results of modules
-vv shows the files from which tasks come
-vvv shows what commands are being executed on the target machines
-vvvv enables connection debugging, what callbacks have been loaded
-vvvvv shows some additional ssh configuration, filepath information
```

### Cluster test

An easy way to smoke-test an existing cluster is to run:

```
[tpa]$ tpaexec test
```

This will do a functional test of the cluster components, followed by a performance test of the cluster, using pgbench. As pgbench can take a while to complete, benchmarking can be omitted by running:

```
[tpa]$ tpaexec test --skip-tags pgbench
```

Tags in the test role are `repmgr,postgres,barman,pgbench`.

Note that when specifying multiple tags, they should be comma delimited, with
no spaces; for example:

```
[tpa]$ tpaexec test --skip-tags barman,pgbench
```

### TPA server test

To check the installation of the TPA server itself, run:

```
[tpa]$ tpaexec selftest
```

### Skipping or including specific tags

When re-running a tpaexec provision or deploy after a failure, in the interests
of time, it can sometimes be useful to leave out tasks by skipping specific tags.
For example, to skip the repmgr tasks:

```
[tpa]$ tpaexec deploy --skip-tags repmgr
```

To jump straight to a particular set of tasks by specifying a tag, for example
to immediately run the BDR tasks:

```
[tpa]$ tpaexec deploy --tags bdr
```

*Note that this assumes that the previous tasks all completed successfully.*

To find all the tags for the relevant architecture that might be useful, run:

```
[tpa]$ grep -rs "tags:" /opt/EDB/TPA/architectures
``` diff --git a/product_docs/docs/tpa/23/platform-aws.mdx b/product_docs/docs/tpa/23/platform-aws.mdx new file mode 100644 index 00000000000..b1b3b05ea10 --- /dev/null +++ b/product_docs/docs/tpa/23/platform-aws.mdx @@ -0,0 +1,388 @@ +---
navTitle: AWS
title: aws
originalFilePath: platform-aws.md

---

TPA fully supports provisioning production clusters on AWS EC2.

## API access setup

To use the AWS API, you must:

- [Obtain an access keypair](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html)
- [Add it to your configuration](https://boto.readthedocs.org/en/latest/boto_config_tut.html)

For example,

```bash
[tpa]$ cat > ~/.aws/credentials
[default]
aws_access_key_id = AKIAIOSFODNN7EXAMPLE
aws_secret_access_key = wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
```

The IAM user should have at least the following set of permissions, so that tpaexec
can use it to provision EC2 resources.
+ +``` +ec2:AssociateRouteTable +ec2:AttachInternetGateway +ec2:AuthorizeSecurityGroupIngress +ec2:CreateInternetGateway +ec2:CreateRoute +ec2:CreateRouteTable +ec2:CreateSecurityGroup +ec2:CreateSubnet +ec2:CreateTags +ec2:CreateVpc +ec2:DeleteKeyPair +ec2:DeleteRouteTable +ec2:DeleteSecurityGroup +ec2:DeleteSubnet +ec2:DeleteVpc +ec2:DescribeImages +ec2:DescribeInstanceStatus +ec2:DescribeInstances +ec2:DescribeInternetGateways +ec2:DescribeKeyPairs +ec2:DescribeRouteTables +ec2:DescribeSecurityGroups +ec2:DescribeSubnets +ec2:DescribeTags +ec2:DescribeVolumes +ec2:DescribeVpcAttribute +ec2:DescribeVpcClassicLink +ec2:DescribeVpcClassicLinkDnsSupport +ec2:DescribeVpcs +ec2:DisassociateRouteTable +ec2:ImportKeyPair +ec2:ModifyVpcAttribute +ec2:RunInstances +ec2:TerminateInstances +iam:AddRoleToInstanceProfile +iam:CreateInstanceProfile +iam:CreateRole +iam:DeleteInstanceProfile +iam:DeleteRole +iam:DeleteRolePolicy +iam:GetInstanceProfile +iam:GetRole +iam:GetRolePolicy: +iam:ListGroups +iam:ListInstanceProfiles +iam:ListInstanceProfilesForRole +iam:ListRolePolicies +iam:ListRoles +iam:ListUsers +iam:PassRole +iam:PutRolePolicy +iam:RemoveRoleFromInstanceProfile +kms:CreateGrant +kms:GenerateDataKeyWithoutPlaintext +s3:GetObject +s3:ListAllMyBuckets +s3:ListBucket +s3:PutObject +s3:PutObjectAcl +``` + +## Introduction + +The service is physically subdivided into +[regions and availability zones](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html). +An availability zone is represented by a region code followed by a +single letter, e.g., eu-west-1a (but that name may refer to different +locations for different AWS accounts, and there is no way to coordinate +the interpretation between accounts). + +AWS regions are completely isolated from each other and share no +resources. Availability zones within a region are physically separated, +and logically mostly isolated, but are connected by low-latency links +and are able to share certain networking resources. + +### Networking + +All networking configuration in AWS happens in the context of a +[Virtual Private Cloud](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-vpc.html) +within a region. Within a VPC, you can create +[subnets](https://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/VPC_Subnets.html) +that is tied to a specific availability zone, along with internet +gateways, routing tables, and so on. + +You can create any number of +[Security Groups](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html#vpc-security-groups) +to configure rules for what inbound and outbound traffic is permitted to +instances (in terms of protocol, a destination port range, and a source +or destination IP address range). + +### Instances + +AWS EC2 offers a variety of +[instance types](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html) +with different hardware configurations at different +price/performance points. Within a subnet in a particular availability +zone, you can create +[EC2 instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Instances.html) +based on a distribution image known as an +[AMI](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html), +and attach one or more +[EBS volumes](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AmazonEBS.html) +to provide persistent storage to the instance. You can SSH to the +instances by registering an +[SSH public key](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html). 
+

Instances are always assigned a private IP address within their subnet.
Depending on the subnet configuration, they may also be assigned an
[ephemeral public IP address](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-instance-addressing.html#concepts-public-addresses)
(which is lost when the instance is shut down, and a different ephemeral
IP is assigned when it is started again). You can instead assign a
static region-specific routable IP address known as an
[Elastic IP](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/elastic-ip-addresses-eip.html)
to any instance.

For an instance to be reachable from the outside world, it must not only
have a routable IP address, but the VPC's networking configuration
(internet gateway, routing tables, security groups) must also align to
permit access.

## Configuration

Here's a brief description of the AWS-specific settings that you can
specify via `tpaexec configure` or define directly in config.yml.

### Regions

You can specify one or more regions for the cluster to use with `--region` or
`--regions`. TPA will generate the required VPC entries associated with each of
them and distribute locations evenly across these regions, using different
availability zones where possible.

`regions` are different from `locations`: each location belongs to a region
(and an availability zone inside that region). `regions` are AWS-specific
objects, while `locations` are cluster objects.

Note: When specifying multiple regions, you need to manually edit network
configurations:

- `ec2_vpc` entries must have non-overlapping CIDR networks to allow the use of
  AWS VPC peering. By default, TPA will set all CIDRs to `10.33.0.0/16`.
  See [VPC](#vpc-required) for more information.
- Each `location` must be updated with a `subnet` that matches the `ec2_vpc`
  `cidr` it belongs to. See [Subnets](#subnets-optional) for more information.
- TPA creates security groups with basic rules under `cluster_rules`, and
  these need to be updated to match the `ec2_vpc` CIDR for each `subnet` CIDR.
  See [Security groups](#security-groups-optional) for more information.
- VPC peering must be set up manually before `tpaexec deploy`. We recommend
  creating the VPCs and required VPC peerings before running `tpaexec configure`,
  and using `vpc-id` in config.yml. See [VPC](#vpc-required) for more information.

### VPC (required)

You must specify a VPC to use:

```
ec2_vpc:
  Name: Test
  cidr: 10.33.0.0/16
```

This is the default configuration, which creates a VPC named Test with the
given CIDR if it does not exist, or uses the existing VPC otherwise.

To create a VPC, you must specify both the Name and the cidr. If you specify
only a VPC Name, TPA will fail if a matching VPC does not exist.

If TPA creates a VPC,
`tpaexec deprovision` will attempt to remove it, but will leave any
pre-existing VPC alone. (Think twice before creating new VPCs, because
AWS has a single-digit default limit on the number of VPCs per account.)
+ +If you need more fine-grained matching, or to specify different VPCs in +different regions, you can use the expanded form: + +``` +ec2_vpc: + eu-west-1: + Name: Test + cidr: 172.16.0.0/16 + us-east-1: + filters: + vpc-id: vpc-nnn + us-east-2: + Name: Example + filters: + [filter expressions] +``` + +### AMI (required) + +You must specify an AMI to use: + +``` +ec2_ami: + Name: xxx + Owner: self +``` + +You can add filter specifications for more precise matching: + +``` +ec2_ami: + Name: xxx + Owner: self + filters: + architecture: x86_64 + [more key/value filters] +``` + +(By default, `tpaexec configure` will select a suitable `ec2_ami` +for you based on the `--distribution` argument.) + +This platform supports Debian 9 (stretch), RedHat Enterprise Linux 7, +Rocky 8, and Ubuntu 16.04 (Xenial). + +### Subnets (optional) + +Every instance must specify its subnet (in CIDR form, or as a subnet-xxx +id). You may optionally specify the name and availability zone for each +subnet that we create: + +``` +ec2_vpc_subnets: + us-east-1: + 192.0.2.0/27: + az: us-east-1b + Name: example1 + 192.0.2.100/27: + az: us-east-1b + Name: example2 +``` + +### Security groups (optional) + +By default, we create a security group for the cluster. To use one or +more existing security groups, set: + +``` +ec2_groups: + us-east-1: + group-name: + - foo +``` + +If you want to customise the rules in the default security group, set +`cluster_rules`: + +``` +cluster_rules: +- cidr_ip: 0.0.0.0/0 + from_port: 22 + proto: tcp + to_port: 22 +- cidr_ip: 192.0.2.0/27 + from_port: 0 + proto: tcp + to_port: 65535 +- cidr_ip: 192.0.2.100/27 + from_port: 0 + proto: tcp + to_port: 65535 +``` + +This example permits ssh (port 22) from any address, and TCP connections on any +port from specific IP ranges. (Note: from_port and to_port define a numeric +range of ports, not a source and destination.) + +If you set up custom rules or use existing security groups, you must ensure +that instances in the cluster are allowed to communicate with each other as +required (e.g., allow tcp/5432 for Postgres). + +### Internet gateways (optional) + +By default, we create internet gateways for every VPC, unless you set: + +``` +ec2_instance_reachability: private +``` + +For more fine-grained control, you can set: + +``` +ec2_vpc_igw: + eu-west-1: yes + eu-central-1: yes + us-east-1: no +``` + +### SSH keys (optional) + +``` +# Set this to change the name under which we register our SSH key. +# ec2_key_name: tpa_cluster_name +# +# Set this to use an already-registered key. +# ec2_instance_key: xxx +``` + +### S3 bucket (optional) + +TPA requires access to an S3 bucket to provision an AWS cluster. This bucket +is used to temporarily store files such as SSH host keys, but may also be used for +other cluster data (such as backups). + +By default, TPA will use an S3 bucket named `edb-tpa-` +for any clusters you provision. (If the bucket does not exist, you will be asked to +confirm that you want TPA to create it for you.) + +To use an existing S3 bucket instead, set + +``` +cluster_bucket: name-of-bucket +``` + +(You can also set `cluster_bucket: auto` to accept the default bucket name without +the confirmation prompt.) + +TPA will never remove any S3 buckets when you deprovision the cluster. To remove +the bucket yourself, run: + +``` +aws s3 rb s3:// --force +``` + +The IAM user you are using to provision the instances must have read and +write access to this bucket. 
During provisioning, tpaexec will provide +instances with read-only access to the cluster_bucket through the +instance profile. + +### Instance profile (optional) + +``` +# Set this to change the name of the instance profile role we create. +# cluster_profile: cluster_name_profile +# +# Set this to use an existing instance profile (which must have all the +# required permissions assigned to it). +# instance_profile_name: xxx +``` + +### Instance Locale + +For some ec2 images and environments it might be desirable to change the +region and language settings. +The default is `en_US.UTF-8`. To find supported locales consult the +output of the following command on RedHat or Rocky Linux: + +```shell +localectl list-locales +``` + +Or the contents of the file /etc/locales.defs on a Debian or Ubuntu. + +Set the desired locale in your config.yml: + +```yaml +user_locale: en_GB.UTF-8 +``` diff --git a/product_docs/docs/tpa/23/platform-bare.mdx b/product_docs/docs/tpa/23/platform-bare.mdx new file mode 100644 index 00000000000..6555847c434 --- /dev/null +++ b/product_docs/docs/tpa/23/platform-bare.mdx @@ -0,0 +1,101 @@ +--- +navTitle: Bare metal +title: bare(-metal servers) +originalFilePath: platform-bare.md + +--- + +Set `platform: bare` in config.yml + +This platform is meant to support any server that is accessible via SSH, +including bare-metal servers as well as already-provisioned servers on +any cloud platform (including AWS). + +You must define the IP address(es) and username for each target server: + +```yaml +instances: + - node: 1 + Name: igor + platform: bare + public_ip: 192.0.2.1 + private_ip: 192.0.2.222 + vars: + ansible_user: xyzzy +``` + +You must ensure that + +1. TPA can ssh to the instance as `ansible_user` +2. The `ansible_user` has sudo access on the instance + +## SSH access + +In the example above, TPA will ssh to `xyzzy@192.0.2.1` to access +the instance. + +By default, TPA will run `ssh-keygen` to generate a new SSH keypair +in your cluster directory. The private key is named `id_cluster_name` +and the public key is stored in `id_cluster_name.pub`. + +You must either set `ssh_key_file: /path/to/id_keyname` to use a +different key that the instance will accept, or configure the instance +to allow access from the generated key (e.g., use `ssh-copy-id`, which +will append the contents of `id_cluster_name.pub` to +`~xyzzy/.ssh/authorized_keys`). + +You must also ensure that ssh can verify the host key(s) of the +instance. You can either add entries to the `known_hosts` file in your +cluster directory, or install the TPA-generated host keys from +`hostkeys/ssh_host_*_key*` in your cluster directory into `/etc/ssh` on +the instance (the generated `tpa_known_hosts` file contains entries for +these keys). + +For example, to ssh in with the generated user key, but keep the +existing host keys, you can do: + +```bash +$ cd ~/clusters/speedy +$ ssh-copy-id -i id_speedy xyzzy@192.0.2.1 +$ ssh-keyscan -H 192.0.2.1 >> tpa_known_hosts +``` + +Run `tpaexec ping ~/clusters/speedy` to check if it's working. If not, +append `-vvv` to the command to look at the complete ssh command-line. +(Note: Ansible will invoke ssh to execute a command like +`bash -c 'python3 && sleep 0'` on the instance. If you run ssh commands +by hand while debugging, replace this with a command that produces some +output and then exits instead, e.g., `'id'`.) 
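
Putting this together, here's a hedged sketch of a bare instance that reuses an existing key instead of the generated one. The key path and addresses are placeholders, and the sketch assumes `ssh_key_file` is accepted as a cluster-wide setting in `cluster_vars`; see the reference pages listed below for the authoritative details:

```yaml
cluster_vars:
  ssh_key_file: /home/deploy/.ssh/id_existing   # hypothetical path to a key the instance already accepts

instances:
- node: 1
  Name: igor
  platform: bare
  public_ip: 192.0.2.1
  private_ip: 192.0.2.222
  vars:
    ansible_user: xyzzy
```
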
+ +For more details: + +- [Use a different ssh key](reference/ssh_key_file) +- [Manage ssh host keys for bare instances](reference/manage_ssh_hostkeys) + +## Distribution support + +TPA will try to detect the distribution running on target instances, +and fail if it is not supported. TPA currently supports Debian +(8/9/10; or jessie/stretch/buster), Ubuntu (16.04/18.04/20.04; or +xenial/bionic/focal), and RHEL/CentOS/Rocky/AlmaLinux (7.x/8.x) on `bare` instances. + +## IP addresses + +You can specify the `public_ip`, `private_ip`, or both for any instance. + +TPA uses these IP addresses in two ways: first, to ssh to the +instance to execute commands during deployment; and second, to set up +communications within the cluster, e.g., for `/etc/hosts` or to set +`primary_conninfo` for Postgres. + +If you specify a `public_ip`, it will be used to ssh to the instances +during deployment. If you specify a `private_ip`, it will be used to set +up communications within the cluster. If you specify both, the +`public_ip` will be used during deployment, and the `private_ip` for +cluster communications. + +If you specify only one or the other, the address will be used for both +purposes. For example, you could set only `public_ip` for servers on +different networks, or only `private_ip` if you're running TPA +inside a closed network. (Instead of using public/private, you can set +`ip_address` if you need to specify only one IP address.) diff --git a/product_docs/docs/tpa/23/platform-docker.mdx b/product_docs/docs/tpa/23/platform-docker.mdx new file mode 100644 index 00000000000..46bbb925f35 --- /dev/null +++ b/product_docs/docs/tpa/23/platform-docker.mdx @@ -0,0 +1,200 @@ +--- +title: Docker +originalFilePath: platform-docker.md + +--- + +TPA can create Docker containers and deploy a cluster to them. At +present, it sets up containers to run systemd and other services as if +they were ordinary VMs. + +Deploying to docker containers is an easy way to test different cluster +configurations. It is not meant for production use. + +## Synopsis + +Just select the platform at configure-time: + +```bash +[tpa]$ tpaexec configure clustername --platform docker […] +[tpa]$ tpaexec provision clustername +[tpa]$ tpaexec deploy clustername +``` + +## Operating system selection + +Use the standard `--os Debian/Ubuntu/RedHat` configure option to +select which distribution to use for the containers. TPA will build +its own systemd-enabled images for this distribution. These images will +be named with a `tpa/` prefix, e.g., `tpa/redhat:8`. + +Use `--os-image some/image:name` to specify an existing +systemd-enabled image instead. For example, the +[centos/systemd](https://hub.docker.com/r/centos/systemd/) +image (based on CentOS 7) can be used in this way. + +TPA does not support Debian 8 (jessie) or Ubuntu 16.04 (xenial) for +Docker containers, because of bugs in the old version of systemd shipped +on those distributions. + +## Installing Docker + +We test TPA with the latest stable Docker-CE packages. + +This documentation assumes that you have a working Docker installation, +and are familiar with basic operations such as pulling images and +creating containers. + +Please consult the +[Docker documentation](https://docs.docker.com) if you need help to +[install Docker](https://docs.docker.com/engine/install/) and +[get started](https://docs.docker.com/get-started/) with it. 
+ +On MacOS X, you can [install "Docker Desktop for +Mac"](https://hub.docker.com/editions/community/docker-ce-desktop-mac/) +and launch Docker from the application menu. + +### CgroupVersion + +Support for CgroupVersion 2 is not fully baked yet for docker sdk in +ansible and related tooling. So while we recommend using a recent +version of docker; we rely on CgroupVersion 1 until version 2 is +fully supported. Instructions below suggest the changes to switch to +CgroupVersion 1 if your platform uses CgroupVersion 2 by default. + +On Linux: + +``` +$ echo 'GRUB_CMDLINE_LINUX=systemd.unified_cgroup_hierarchy=false' > \ + /etc/default/grub.d/cgroup.cfg +$ update-grub +$ reboot +``` + +On MacOS: + +1. Edit ~/Library/Group\\ Containers/group.com.docker/settings.json + and make the following replacement + `"deprecatedCgroupv1": false` → `"deprecatedCgroupv1": true` +2. Restart Docker Desktop app + +### Permissions + +TPA expects the user running it to have permission to access to the +Docker daemon (typically by being a member of the `docker` group that +owns `/var/run/docker.sock`). Run a command like this to check if you +have access: + +```bash +[tpa]$ docker version --format '{{.Server.Version}}' +19.03.12 +``` + +**WARNING**: Giving a user the ability to speak to the Docker daemon +lets them trivially gain root on the Docker host. Only trusted users +should have access to the Docker daemon. + +### Docker container privileges + +#### Privileged containers + +By default TPA provisions Docker containers in unprivileged mode, with no +added Linux capabilities flags. Such containers cannot manage host firewall +rules, file systems, block devices, or most other tasks that require true root +privileges on the host. + +If you require your containers to run in privileged mode, set the `privileged` +boolean variable for the instance(s) that need it, or globally in +`instance_defaults`, e.g.: + +``` +instance_defaults: + privileged: true +``` + +**WARNING**: Running containers in privileged mode allows the root user or any +process that can gain root to load kernel modules, modify host firewall rules, +escape the container namespace, or otherwise act much as the real host "root" +user would. Do not run containers in priviliged mode unless you really need to. + +See `man capabilities` for details on Linux capabilities flags. + +#### `security_opts` and the `no-new-privileges` flag + +tpaexec can start docker containers in a restricted mode where processes cannot +increase their privileges. setuid binaries are restricted, etc. Enable this in +tpaexec with the `instance_defaults` or per-container variable +`docker_security_opts`: + +``` +instance_defaults: + docker_security_opts: + - no-new-privileges +``` + +Other arguments to `docker run`'s `--security-opts` are also accepted, e.g. +SELinux user and role. + +#### Linux capabilities flags + +tpaexec exposes Docker's control over Linux capabilities flags with the +`docker_cap_add` list variable, which may be set per-container or in +`instance_defaults`. See `man capabilities`, the `docker run` documentation and +the documentation for the Ansible `docker_containers` module for details on +capabilities flags. + +Docker's `--cap-drop` is also supported via the `docker_cap_drop` list. 
+ +For example, to run a container as unprivileged, but give it the ability to +modify the system clock, you might write: + +``` +instance_defaults: + privileged: false + docker_cap_add: + - sys_time + docker_cap_drop: + - all +``` + +### Docker storage configuration + +**Caution**: The default Docker configuration on many hosts uses +`lvm-loop` block storage and is not suitable for production +deployments. Run `docker info` to check which storage driver you are +using. If you are using the loopback scheme, you will see something +like this: + +``` + Storage Driver: devicemapper + … + Data file: /dev/loop0 +``` + +Consult the Docker documentation for more information on storage +configuration: + +- [Storage Drivers](https://docs.docker.com/storage/storagedriver/) +- [Configuring lvm-direct for production](https://docs.docker.com/storage/storagedriver/device-mapper-driver/#configure-direct-lvm-mode-for-production) + +## Docker container management + +All of the docker containers in a cluster can be started and stopped +together using the `start-containers` and `stop-containers` commands: + +```bash +[tpa]$ tpaexec start-containers clustername +[tpa]$ tpaexec stop-containers clustername +``` + +These commands don't provision or deprovision containers, or even +connect to them; they are intended to save resources when you're +temporarily not using a docker cluster that you need to keep +available for future use. + +For a summary of the provisioned docker containers in a cluster, +whether started or stopped, use the `list-containers` command: + +```bash +[tpa]$ tpaexec list-containers clustername +``` diff --git a/product_docs/docs/tpa/23/reference/2q_and_edb_repositories.mdx b/product_docs/docs/tpa/23/reference/2q_and_edb_repositories.mdx new file mode 100644 index 00000000000..fe97c4635be --- /dev/null +++ b/product_docs/docs/tpa/23/reference/2q_and_edb_repositories.mdx @@ -0,0 +1,115 @@ +--- +title: How TPA uses 2ndQuadrant and EDB repositories +originalFilePath: 2q_and_edb_repositories.md + +--- + +This page explains the package sources from which TPA can download EDB +(including 2ndQuadrant) software, how the source varies depending on the +selected software, and how to configure access to each source. + +Note that this page only describes the special configuration options and +logic for EDB and 2ndQuadrant sources. Arbitrary +[yum](yum_repositories) or [apt](apt_repositories) repositories +can be added independently of the logic described here. Likewise, +packages can be [downloaded in advance](tpaexec-download-packages) +and added to a [local repository](local-repo) if preferred. + +## Package sources used by TPA + +TPA downloads software from three package sources. Each of these +sources provides multiple repositories. In some cases, the same software +is available from more than one source. + +- [EDB Repos 2.0](https://www.enterprisedb.com/repos/) +- [EDB Repos 1.0](https://www.enterprisedb.com/repos/legacy) +- [2ndQuadrant Repos](https://techsupport.enterprisedb.com/customer_portal/sw/) + +By default, TPA will [select sources and repositories automatically](#how-sources-are-selected-by-default) +based on the architecture and other options you have specified, so it is +not generally necessary to change these. However, you will need to +ensure that you have a valid subscription for all the sources used and +that you have [exported the token](#authenticating-with-package-sources) +before running `tpaexec deploy` or the operation will fail. 
+
+## Authenticating with package sources
+
+To use [EDB Repos 2.0](https://www.enterprisedb.com/repos/) you must
+`export EDB_SUBSCRIPTION_TOKEN=xxx` before you run tpaexec. You can get
+your subscription token from [the web
+interface](https://www.enterprisedb.com/repos-downloads).
+
+To use
+[2ndQuadrant repositories](https://techsupport.enterprisedb.com/customer_portal/sw/),
+you must `export TPA_2Q_SUBSCRIPTION_TOKEN=xxx` before you run
+tpaexec. You can get your subscription token from the 2ndQuadrant
+Portal, under "Company info" in the left menu, then "Company". Some
+repositories are available only by prior arrangement.
+
+To use [EDB Repos 1.0](https://www.enterprisedb.com/repos/legacy) you
+must create a text file that contains your access credentials in the
+`username:password` format and run `export
+EDB_REPO_CREDENTIALS_FILE=/path/to/credentials/file` before you run
+tpaexec.
+
+If you do not have an account for any of the sites listed, you can
+register for access at
+
+
+## How sources are selected by default
+
+For M1 and BDR-Always-ON architectures, the default source is
+2ndQuadrant and the necessary repositories will be added from this
+source. For software that is not available from this source (e.g.,
+EDB Advanced Server), repositories will be selected from EDB Repos 1.0.
+
+If the PGD-Always-ON architecture is selected, repositories will be
+selected from EDB Repos 2.0 and all software will be sourced
+from these repositories.
+
+## Specifying EDB 2.0 repositories
+
+To specify the complete list of repositories from EDB Repos 2.0 to
+install on each instance, set `edb_repositories` to a list of EDB
+repository names:
+
+```yaml
+cluster_vars:
+  edb_repositories:
+    - enterprise
+    - postgres_distributed
+```
+
+This example will install the `enterprise` subscription repository as
+well as `postgres_distributed`, giving access to EPAS and BDR4+ products.
+On Debian and Ubuntu systems, it will use the APT repository, and on
+RedHat systems, it will use the YUM repository.
+
+If any EDB repositories are specified, any 2ndQuadrant repositories
+specified will be ignored and no EDB Repos 1.0 repositories will be
+installed.
+
+## Specifying 2ndQuadrant repositories
+
+To specify the complete list of 2ndQuadrant repositories to install on
+each instance in addition to the 2ndQuadrant public repository, set
+`tpa_2q_repositories` to a list of 2ndQuadrant repository names:
+
+```yaml
+cluster_vars:
+  tpa_2q_repositories:
+    - products/pglogical3/release
+    - products/bdr3/release
+```
+
+This example will install the pglogical3 and bdr3 release repositories.
+On Debian and Ubuntu systems, it will use the APT repository, and on
+RedHat systems, it will use the YUM repository.
+
+The `dl/default/release` repository is always installed by default,
+unless you
+
+- explicitly set `tpa_2q_repositories: []`, or
+- have at least one entry in `edb_repositories`.
+
+Either of the above will result in no 2ndQuadrant repositories being
+installed.
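+
+For example, a config.yml fragment that relies only on EDB Repos 2.0
+might combine the settings described above like this:
+
+```yaml
+cluster_vars:
+  # Setting edb_repositories (or an empty tpa_2q_repositories list)
+  # means that no 2ndQuadrant repositories are installed.
+  edb_repositories:
+    - enterprise
+    - postgres_distributed
+  tpa_2q_repositories: []
+```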
diff --git a/product_docs/docs/tpa/23/reference/INSTALL-docker.mdx b/product_docs/docs/tpa/23/reference/INSTALL-docker.mdx new file mode 100644 index 00000000000..d7269280073 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/INSTALL-docker.mdx @@ -0,0 +1,77 @@ +--- +title: Running TPA in a Docker container +originalFilePath: INSTALL-docker.md + +--- + +If you are using a system for which there are no [TPA +packages](../INSTALL) available, and it's difficult to run TPA after +[installing from source](INSTALL-repo) (for example, because it's not +easy to obtain a working Python 3.6+ interpreter), your last resort may +be to build a Docker image and run TPA inside a Docker container. + +Please note that you do not need to run TPA in a Docker container in +order to [deploy to Docker containers](../platform-docker). It's always +preferable to run TPA directly if you can (even on MacOS X). + +## Quickstart + +You must have Docker installed and working on your system already. + +Run the following commands to clone the tpaexec source repository from Github +and build a new Docker image named `tpa/tpaexec`: + +```bash +$ git clone ssh://git@github.com/EnterpriseDB/tpaexec.git +$ cd tpaexec +$ docker build -t tpa/tpaexec . +``` + +Double-check the created image: + +```bash +$ docker image ls tpa/tpaexec +REPOSITORY TAG IMAGE ID CREATED SIZE +tpa/tpaexec latest e145cf8276fb 8 seconds ago 1.73GB +$ docker run --rm tpa/tpaexec tpaexec info +# TPAexec v20.11-59-g85a62fe3 (branch: master) +tpaexec=/usr/local/bin/tpaexec +TPA_DIR=/opt/EDB/TPA +PYTHON=/opt/EDB/TPA/tpa-venv/bin/python3 (v3.7.3, venv) +TPA_VENV=/opt/EDB/TPA/tpa-venv +ANSIBLE=/opt/EDB/TPA/tpa-venv/bin/ansible (v2.8.15) +``` + +Create a TPA container and make your cluster configuration directories +available inside the container: + +```bash +$ docker run --rm -v ~/clusters:/clusters \ + -it tpa/tpaexec:latest +``` + +You can now run commands like `tpaexec provision /clusters/speedy` at the +container prompt. (When you exit the shell, the container will be removed.) + +If you want to provision Docker containers using TPA, you must also allow +the container to access the Docker control socket on the host: + +``` +$ docker run --rm -v ~/clusters:/clusters \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -it tpa/tpaexec:latest +``` + +Run `docker ps` within the container to make sure that your connection to the +host Docker daemon is working. + +## Installing Docker + +Please consult the +[Docker documentation](https://docs.docker.com) if you need help to +[install Docker](https://docs.docker.com/install) and +[get started](https://docs.docker.com/get-started/) with it. + +On MacOS X, you can [install "Docker Desktop for +Mac"](https://hub.docker.com/editions/community/docker-ce-desktop-mac/) +and launch Docker from the application menu. diff --git a/product_docs/docs/tpa/23/reference/INSTALL-repo.mdx b/product_docs/docs/tpa/23/reference/INSTALL-repo.mdx new file mode 100644 index 00000000000..60932bb5e87 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/INSTALL-repo.mdx @@ -0,0 +1,131 @@ +--- +title: Installing TPA from source +originalFilePath: INSTALL-repo.md + +--- + +This document explains how to use TPA from a copy of the source code +repository. + +Please [install TPA from packages](../INSTALL) if you can; install +from source only if no packages are available for your system (e.g., on +MacOS X), or if you are collaborating with the TPA developers to +test unreleased code. 
+ +To run TPA from source, you must install all of the dependencies +(e.g., Python 3.6+) that the packages would handle for you, or download +the source and [run TPA in a Docker container](INSTALL-docker). +(Either way will work fine on Linux and MacOS X.) + +## Quickstart + +First, you must install the various dependencies that would have been +installed automatically along with the TPA packages. (You can use +something other than `sudo` to run these commands as root, if you +prefer.) + +```bash +# Debian (python3.7) or Ubuntu (python3.6) +$ sudo apt-get install python3.7 python3-pip python3-venv \ + git openvpn patch + +# RedHat, Rocky or AlmaLinux (python3 for RHEL7, python36 for RHEL8) +$ sudo yum install python36 python3-pip \ + epel-release git openvpn patch + +# MacOS X +$ brew tap discoteq/discoteq +$ brew install python@3 openvpn flock coreutils gpatch git +``` + +Next, install TPA itself: + +```bash +$ git clone ssh://git@github.com/EnterpriseDB/tpaexec.git +$ ./tpaexec/bin/tpaexec setup +$ ./tpaexec/bin/tpaexec selftest +``` + +## Step-by-step + +Install the various dependencies as described above. + +If your system does not have Python 3.6+ packages, you can use `pyenv` +to install a more recent Python in your home directory (see below), or +you can [run TPA in a Docker container](INSTALL-docker). + +Next, clone the TPA repository into, say, `~/tpaexec`. (It doesn't +matter where you put it, but don't use `/opt/EDB/TPA` or +`/opt/2ndQuadrant/TPA`, to avoid conflicts if you install the TPA +packages in future.) + +```bash +$ git clone ssh://git@github.com/EnterpriseDB/tpaexec.git ~/tpaexec +``` + +(If you're installing from source, please clone the repository instead +of downloading an archive of the source.) + +The remaining steps are the same as if you had installed the package. + +```bash +# Add tpaexec to your PATH for convenience +# (Put this in your ~/.bashrc too) +$ export PATH=$PATH:$HOME/tpaexec/bin + +$ tpaexec setup +$ tpaexec selftest +``` + +If the self-test completes without any errors, your TPA installation +is ready for use. + +## Python 3.6+ + +TPA requires Python 3.6 or later, available on most +modern distributions. If you don't have it, you can use +[pyenv](https://github.com/pyenv/pyenv) to install any version of Python +you like without affecting the system packages. + +```bash +# First, install pyenv and activate it in ~/.bashrc +# See https://github.com/pyenv/pyenv#installation +# (e.g., `brew install pyenv` on MacOS X) + +$ pyenv install 3.9.0 +Downloading Python-3.9.0.tar.xz... +-> https://www.python.org/ftp/python/3.9.0/Python-3.9.0.tar.xz +Installing Python-3.9.0... +Installed Python-3.9.0 to /home/ams/.pyenv/versions/3.9.0 + +$ pyenv local 3.9.0 +$ pyenv version +3.9.0 (set by /home/ams/pyenv/.python-version) + +$ pyenv which python3 +/home/ams/.pyenv/versions/3.9.0/bin/python3 +$ python3 --version +3.9.0 +``` + +If you were not already using pyenv, please remember to add `pyenv` to +your PATH in .bashrc and call `eval "$(pyenv init -)"` as described in +the [pyenv documentation](https://github.com/pyenv/pyenv#installation). + +## Virtual environment options + +By default, `tpaexec setup` will use the builtin Python 3 `-m venv` +to create a venv under `$TPA_DIR/tpa-venv`, and activate it +automatically whenever `tpaexec` is invoked. + +You can run `tpaexec setup --venv /other/location` to specify a +different location for the new venv. + +We strongly suggest sticking to the default venv location. 
If you use a +different location, you must also set the environment variable TPA_VENV +to its location, for example by adding the following line to your +.bashrc (or other shell startup scripts): + +```bash +export TPA_VENV="/other/location" +``` diff --git a/product_docs/docs/tpa/23/reference/air-gapped.mdx b/product_docs/docs/tpa/23/reference/air-gapped.mdx new file mode 100644 index 00000000000..cda51c26664 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/air-gapped.mdx @@ -0,0 +1,74 @@ +--- +title: Managing clusters in a disconnected or Air-Gapped environment +originalFilePath: air-gapped.md + +--- + +In a security controlled environment where no direct connection to the +Internet is allowed, it is necessary to provide all packages needed by +TPA to complete the deployment. This can be done via a local-repo on +each node in the cluster. TPA supports the addition of custom +repositories on each node via a +[local-repo](local-repo) and the required packages can be downloaded +using the [download-packages](tpaexec-download-packages) command. + +## Preparation + +Choose an internet connected machine where you can install TPA, +follow instructions below to either copy an existing cluster +configuration or create a new cluster. + +If you have an existing cluster in a disconnected environment, all you +need on the internet connected host is the config.yml. Create a +directory and copy that file into it. + +For an environment where the target instances will not have network +access, configure a new cluster with this option: + +``` +tpaexec configure --use-local-repo-only … +``` + +This will do everything that `--enable-local-repo` does, and disable the +configuration for all other package repositories. On RedHat instances, +this also includes disabling access to subscription-based services. + +In an existing cluster, you can set `use_local_repo_only: yes` in +`config.yml`: + +```yaml +cluster_vars: + use_local_repo_only: yes +``` + +Note: that you do not need separate cluster configurations for internet +connected and disconnected environments, the options below work in both. + +More info on [using local-repo for distributing packages](local-repo) + +## Downloading packages + +On the internet connected machine, ensure that you +have [docker installed](../platform-docker) and run: + +```shell +tpaexec download-packages cluster-dir --os --os-version +``` + +See detailed description for +the [package downloader](tpaexec-download-packages). + +## Copying packages to the target environment + +The resulting repository will be contained in the +`cluster-dir/local-repo` directory. This is a complete package repo for +the target OS. Copy this directory, from the connected controller to the +disconnected controller that will be used to deploy the cluster. Place +the directory in the same place, beneath the cluster directory. TPA +will then copy packages to the instances automatically. + +## Deploying in a disconnected environment + +Ensure that the cluster config.yml has been configured as above in +[Preparation](#preparation). Run `tpaexec provision` and `deploy` as you +would normally. 
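+
+For example, an end-to-end workflow might look like this (the cluster
+directory, OS name, and OS version are illustrative; `…` stands for the
+rest of your usual `tpaexec configure` options):
+
+```shell
+# On the internet-connected machine
+tpaexec configure ~/clusters/mycluster --use-local-repo-only …
+tpaexec download-packages ~/clusters/mycluster --os RedHat --os-version 8
+
+# Copy ~/clusters/mycluster (including local-repo/) to the disconnected
+# controller, then run on that machine:
+tpaexec provision ~/clusters/mycluster
+tpaexec deploy ~/clusters/mycluster
+```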
diff --git a/product_docs/docs/tpa/23/reference/apt_repositories.mdx b/product_docs/docs/tpa/23/reference/apt_repositories.mdx new file mode 100644 index 00000000000..b238058de12 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/apt_repositories.mdx @@ -0,0 +1,36 @@ +--- +title: Configuring APT repositories +originalFilePath: apt_repositories.md + +--- + +This page explains how to configure APT package repositories on Debian +and Ubuntu systems. + +You can define named repositories in `apt_repositories`, and decide +which ones to use by listing the names in `apt_repository_list`: + +```yaml +cluster_vars: + apt_repositories: + Example: + key_id: XXXXXXXX + key_url: https://repo.example.com/path/to/XXXXXXXX.asc + repo: >- + deb https://repo.example.com/repos/Example/ xxx-Example main + + apt_repository_list: + - PGDG + - Example +``` + +This configuration would install the GPG key (with id `key_id`, +obtained from `key_url`) and a new entry under +`/etc/apt/sources.list.d` with the given `repo` line (or lines) +for the PGDG repository (which is already defined by default) and the +new Example repository. + +When you configure additional repositories, remember to include PGDG in +`apt_repository_list` if you still want to install PGDG packages. + +You can set `apt_repository_list: []` to not install any repositories. diff --git a/product_docs/docs/tpa/23/reference/artifacts.mdx b/product_docs/docs/tpa/23/reference/artifacts.mdx new file mode 100644 index 00000000000..bc84667ec05 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/artifacts.mdx @@ -0,0 +1,53 @@ +--- +title: Uploading artifacts +originalFilePath: artifacts.md + +--- + +You can define `artifacts` to create or copy files to target instances: + +```yaml +cluster_vars: + artifacts: + - type: path + path: /some/target/path + state: directory + owner: root + group: root + mode: 0755 + - type: file + src: /host/path/to/file + dest: /target/path/to/file + owner: root + group: root + mode: 0644 + - type: archive + src: example.tar.gz + dest: /some/target/path + - type: directory + src: /host/path/a/ + dest: /target/path/b/ +``` + +The following types are supported: + +- Use `path` to create or remove and change the ownership or mode of + files and directories (takes the same parameters as Ansible's `file` + module, which it uses internally) + +- Use `file` to copy a file from the controller and set the ownership + and mode (uses `copy`) + +- Use `archive` to extract files from an archive to a specified location + (uses `unarchive`) + +- Use `directory` to rsync a directory from the controller to target + instances (uses `synchronize`) + +The example shows one entry for each of the above artifact types, but +you can use these or any other parameters that the corresponding Ansible +module accepts. + +Copying files and directories to target instances is a common-enough +need that this feature provides a convenient shortcut you can use +instead of writing a [custom hook](../tpaexec-hooks). diff --git a/product_docs/docs/tpa/23/reference/barman.mdx b/product_docs/docs/tpa/23/reference/barman.mdx new file mode 100644 index 00000000000..eefe626f539 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/barman.mdx @@ -0,0 +1,88 @@ +--- +title: '' +originalFilePath: barman.md + +--- + +When an instance is given the `barman` role in config.yml, TPA will +configure it as a [Barman](https://pgbarman.org/) server to take backups +of any other instances that name it in their `backup` setting. 
+ +```yaml +instances: +- Name: one + backup: two + … + +- Name: two + role: + - barman + … +``` + +Multiple `postgres` instances can have the same Barman server named as +their `backup`; equally, one `postgres` instance can have a list of +Barman servers named as its `backup` and backups will be taken to all +of the named servers. + +The default Barman configuration will connect to PostgreSQL using +pg_receivewal to take continuous backups of WAL, and will take a full +backup of the instance using rsync over ssh twice weekly. Full backups +and WAL are retained for long enough to enable recovery to any point in +the last 4 weeks. + +## Barman configuration + +On each barman server, a global configuration file will be created +as `/etc/barman.conf`; this contains default values for many barman +configuration variables. For each postgres server being backed up, +an additional Barman configuration file is created: to back up the +server `one`, the file will be `/etc/barman.d/one.conf`, and the backups +will be stored in `/var/lib/barman/one`. The file and directory names +are taken from the backed-up instance's `backup_name` setting, defaulting +to the instance's name. + +The following variables can be set on the backed-up instance and are +passed through into Barman's configuration with the prefix `barman_` +removed: + +| variable | default | +| ------------------------------- | -------------------------- | +| barman_archiver | false | +| barman_log_file | /var/log/barman.log | +| barman_backup_method | rsync | +| barman_compression | pigz | +| barman_reuse_backup | link | +| barman_parallel_jobs | 1 | +| barman_backup_options | concurrent_backup | +| barman_immediate_checkpoint | false | +| barman_network_compression | false | +| barman_basebackup_retry_times | 3 | +| barman_basebackup_retry_sleep | 30 | +| barman_minimum_redundancy | 3 | +| barman_retention_policy | RECOVERY WINDOW OF 4 WEEKS | +| barman_last_backup_maximum_age | 1 WEEK | +| barman_pre_archive_retry_script | | +| barman_post_backup_retry_script | | +| barman_post_backup_script | | +| barman_streaming_wals_directory | | + +## Backup scheduling + +TPA installs a cron job in `/etc/cron.d/barman` which will run every +minute and invoke `barman cron` to perform maintenance tasks. + +For each instance being backed up, it installs another cron job in +`/etc/cron.d/` which takes the backups of that instance. +This job runs as determined by the `barman_backup_interval` variable for +the instance, with the default being to take backups at 04:00 every +Wednesday and Saturday. + +## SSH keys + +TPA will generate ssh key pairs for the `postgres` and `barman` +users and install them into the respective ~/.ssh directories, and add +them to each other's authorized_keys file. The postgres user must be +able to ssh to the barman server in order to archive WAL segments (if +configured), and the barman user must be able to ssh to the Postgres +instance to take or restore backups. diff --git a/product_docs/docs/tpa/23/reference/bdr.mdx b/product_docs/docs/tpa/23/reference/bdr.mdx new file mode 100644 index 00000000000..0506a4a5e43 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/bdr.mdx @@ -0,0 +1,188 @@ +--- +title: BDR configuration +originalFilePath: bdr.md + +--- + +TPAexec can install and configure PGD (Postgres Distributed), formerly +known as BDR (Bi-directional replication) versions 3.6, 3.7, 4.x, and +5.x. + +Access to PGD/BDR packages is through EDB's package repositories only. 
+You must have a valid EDB subscription token to download the packages. + +This documentation touches on several aspects of BDR configuration, but +we refer you to the [PGD documentation](/pgd/latest) for an authoritative +description of the details. + +## Introduction + +TPAexec will install BDR and any dependencies on all BDR instances along +with Postgres itself. + +After completing the basic Postgres setup and starting Postgres, TPAexec +will then create the `bdr_database` and proceed to set up a BDR cluster +through the various steps described below. + +## Installation + +TPAexec will install the correct BDR packages, depending on the version +and flavour of Postgres in use (e.g., Postgres, Postgres Extended, or +EPAS). + +Set `bdr_version` to determine which major version of BDR to install +(i.e., 3, 4, 5). Set `bdr_package_version` to determine which exact +package to install (e.g., '3.6\*' to install the latest 3.6.x). + +## Overview of cluster setup + +After installing the required packages, configuring Postgres to load +BDR, and starting the server, TPAexec will go on to set up BDR nodes, +groups, replication sets, and other resources. + +Here's a summary of the steps TPAexec performs: + +- Create a BDR node (using bdr.create_node()) for each participating + instance + +- Create one or more BDR node groups (using bdr.create_node_group()) + depending on `bdr_node_groups` + +- Create replication sets, if required, to control exactly which changes + are replicated (depending on node group type and memberships, e.g., + subscriber-only and witness nodes may need special handling) + +- Join the relevant node groups on the individual instances + +- Perform additional configuration, such as enabling subgroup RAFT or + proxy routing. + +(This process involves executing a complex sequence of queries, some on +each instance in turn, and others in parallel. To make the steps easier +to follow, TPAexec designates an arbitrary BDR primary instance as the +"first_bdr_primary" for the cluster, and uses this instace to execute +most of these queries. The instance is otherwise not special, and its +identity is not significant to the BDR configuration itself.) + +## Instance roles + +Every instance with `bdr` in its `role` is a BDR instance, and +implicitly also a `postgres` server instance. + +A BDR instance with `readonly` in its role is a logical standby node +(which joins the BDR node group with `pause_in_standby` set), eligible +for promotion. + +A BDR instance with `subscriber-only` in its role is a subscriber-only +node, which receives replicated changes but does not publish them. + +A BDR instance with `witness` in its role is a witness node. + +Every BDR instance described above is implicitly also a `primary` +instance. The exception is an instance with `replica` in its role; that +indicates a physical streaming replica of an upstream BDR instance. Such +instances are not included in any recommended BDR architecture, and not +currently supported by TPAexec. + +## Configuration settings + +The settings mentioned below should ordinarily be set in `cluster_vars`, +so that they are set uniformly for all the BDR instances in the cluster. +You can set different values on different instances in some cases (e.g., +`bdr_database`), but in other cases, the result is undefined (e.g., all +instances must have exactly the same value of `bdr_node_groups`). + +We strongly recommend defining your BDR configuration by setting uniform +values for the whole cluster under `cluster_vars`. 
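+
+For example, a minimal sketch of uniform settings under `cluster_vars`
+(the values shown are illustrative; `tpaexec configure` normally
+generates suitable values for you) might look like this:
+
+```yaml
+cluster_vars:
+  bdr_version: 4
+  bdr_database: bdrdb
+  bdr_node_group: example_group
+```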
+ +### bdr_database + +The `bdr_database` (default: bdrdb) will be initialised with BDR. + +### bdr_node_group + +The setting of `bdr_node_group` (default: based on the cluster name) +identifies which BDR cluster an instance should be a part of. It is also +used to identify a particular cluster for external components (e.g., +pgd-proxy or harp-proxy). + +### bdr_node_groups + +This is a list of BDR node groups that must be created before the group +join stage (if the cluster requires additional subgroups). + +In general, `tpaexec configure` will generate an appropriate value based +on the selected architecture. + +```yaml +cluster_vars: + bdr_node_groups: + - name: topgroup + - name: abc_subgroup + node_group_type: data + parent_group_name: topgroup + options: + location: abc + … +``` + +The first entry must be for the cluster's `bdr_node_group`. + +Each subsequent entry in the list must specify a `parent_group_name`, +and may specify the `node_group_type` (optional). + +Each entry may also have an optional key/value mapping of group options. +The available options vary by BDR version. + +### bdr_child_group + +If `bdr_child_group` is set for an instance (to the name of a group that +is mentioned in `bdr_node_groups`), it will join that group instead of +`bdr_node_group`. + +### bdr_commit_scopes + +This is an optional list of +[commit scopes](https://www.enterprisedb.com/docs/pgd/latest/bdr/group-commit/) +that must exist in the BDR database (available for BDR 4.1 and above). + +```yaml +cluster_vars: + bdr_commit_scopes: + - name: somescope + origin: somegroup + rule: 'ALL (somegroup) ON received …` + - name: otherscope + origin: othergroup + rule: '…' + … +``` + +Each entry must specify the `name` of the commit scope, the name of the +`origin` group, and the commit scope `rule`. The groups must correspond +to entries in `bdr_node_groups`. + +If you set `bdr_commit_scopes` explicitly, TPA will create, alter, or +drop commit scopes as needed to ensure that the database matches the +configuration. If you do not set it, it will leave existing commit +scopes alone. + +## Miscellaneous notes + +### Hooks + +TPAexec invokes the bdr-node-pre-creation, bdr-post-group-creation, and +bdr-pre-group-join [hooks](../tpaexec-hooks) during the BDR cluster +setup process. + +### Database collations + +TPAexec checks that the BDR database on every instance in a cluster has +the same collation (LC_COLLATE) setting. Having different collations in +databases in the same BDR cluster is a data loss risk. + +## Older versions of BDR + +TPAexec no longer actively supports or tests the deployment of BDR v1 +(with a patched version of Postgres 9.4), v2 (with Postgres 9.6), and +any versions below v3.6. diff --git a/product_docs/docs/tpa/23/reference/distributions.mdx b/product_docs/docs/tpa/23/reference/distributions.mdx new file mode 100644 index 00000000000..6e59bffcd52 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/distributions.mdx @@ -0,0 +1,44 @@ +--- +title: Distribution support +originalFilePath: distributions.md + +--- + +TPA detects and adapts to the distribution running on each target +instance. + +(This page is about distribution support on target instances that you +are deploying *to*, not about the system you are running TPA *from*. +See the [installation instructions](../INSTALL#distribution-support) for +more on the latter.) 
+ +## Debian + +- Debian 10/buster is fully supported +- Debian 9/stretch is supported as a legacy platform +- Debian 8/jessie is supported as a legacy platform + +## Ubuntu + +- Ubuntu 20.04/focal is fully supported +- Ubuntu 18.04/bionic is fully supported +- Ubuntu 16.04/xenial is supported as a legacy platform + +## RedHat + +- RHEL/CentOS/Rocky/AlmaLinux 8.x is fully supported (python3 only) +- RHEL/CentOS 7.x is fully supported (python2 only) + +## Package availability + +All combinations of packages for Postgres and other components may not +be available for all of the supported distributions. For example, you +will need to use an older distribution to be able to install Postgres +9.4 with BDRv1 from packages; and not all projects publish Ubuntu 20.04 +packages yet. + +## Platform-specific considerations + +Some platforms may not support all of the distributions mentioned here. +For example, Debian 8 and Ubuntu 16.04 are not supported in [Docker +containers](../platform-docker). diff --git a/product_docs/docs/tpa/23/reference/edb_repositories.mdx b/product_docs/docs/tpa/23/reference/edb_repositories.mdx new file mode 100644 index 00000000000..e2d8cd76bf4 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/edb_repositories.mdx @@ -0,0 +1,35 @@ +--- +title: Configuring EDB Repos 2.0 repositories +originalFilePath: edb_repositories.md + +--- + +This page explains how to configure EDB Repos 2.0 package repositories +on any system. + +For more details on the EDB and 2ndQuadrant package sources used by +TPA see [this page](2q_and_edb_repositories). + +To specify the complete list of repositories from EDB Repos 2.0 to +install on each instance, set `edb_repositories` to a list of EDB +repository names: + +```yaml +cluster_vars: + edb_repositories: + - enterprise + - postgres_distributed +``` + +This example will install enterprise subscription repository as well as +postgres_distributed giving access to EPAS and BDR4+ products. +On Debian and Ubuntu systems, it will use the APT repository, and on +RedHat systems, it will use the YUM repository. + +If any EDB repositories are specified, any 2ndQuadrant repositories +specified will be ignored and no EDB Repos 1.0 will be installed. + +To use [EDB Repos 2.0](https://www.enterprisedb.com/repos/) you must +`export EDB_SUBSCRIPTION_TOKEN=xxx` before you run tpaexec. You can get +your subscription token from [the web +interface](https://www.enterprisedb.com/repos-downloads). diff --git a/product_docs/docs/tpa/23/reference/efm.mdx b/product_docs/docs/tpa/23/reference/efm.mdx new file mode 100644 index 00000000000..d470a75784c --- /dev/null +++ b/product_docs/docs/tpa/23/reference/efm.mdx @@ -0,0 +1,53 @@ +--- +title: Configuring EFM +originalFilePath: efm.md + +--- + +TPA will install and configure EFM when `failover_manager` is set to +`efm`. + +Note that EFM is only available via EDB's package repositories +and requires a valid subscription. + +## EFM configuration + +TPA will generate `efm.nodes` and `efm.properties` with the appropriate +instance-specific settings, with remaining settings set to the respective +default values. TPA will also place an `efm.notification.sh` script which +basically contains nothing by default and leaves it up to the user to fill it +in however they want. + +See the [EFM documentation](https://www.enterprisedb.com/docs/efm/latest/) +for more details on EFM configuration. + +## efm_conf_settings + +You can use `efm_conf_settings` to set any parameters, whether recognised +by TPA or not. 
Where needed, quote the value exactly as it
+would appear in `efm.properties`:

+```yaml
+cluster_vars:
+  efm_conf_settings:
+    standby.restart.delay: 1
+    application.name: quarry
+    reconfigure.num.sync: true
+    reconfigure.num.sync.max: 1
+    reconfigure.sync.primary: true
+```
+
+If you make changes to values under `efm_conf_settings`, TPA will always
+restart EFM to activate the changes.
+
+### EFM witness
+
+TPA will install and configure EFM as witness on instances whose `role`
+contains `efm-witness`.
+
+### Repmgr
+
+EFM works only as a failover manager, so TPA will still install repmgr
+to set up PostgreSQL replicas. In this case `repmgrd` (the repmgr
+daemon) remains disabled, and repmgr's only job is to provide
+replication setup functionality.
diff --git a/product_docs/docs/tpa/23/reference/git-credentials.mdx b/product_docs/docs/tpa/23/reference/git-credentials.mdx
new file mode 100644
index 00000000000..b40b83a4616
--- /dev/null
+++ b/product_docs/docs/tpa/23/reference/git-credentials.mdx
@@ -0,0 +1,62 @@
+---
+title: Git credentials
+originalFilePath: git-credentials.md
+
+---
+
+This page explains how to clone Git repositories that require
+authentication.
+
+This may be required when you change `postgres_git_url`
+to [install Postgres from source](postgres_installation_method_src) or
+[use `install_from_source`](install_from_source) to compile and
+install extensions.
+
+You have two options to authenticate without writing the credentials to
+disk on the target instance:
+
+- For an `ssh://` repository, you can add an SSH key to your local
+  ssh-agent. Agent forwarding is enabled by default if you use
+  `--install-from-source` (`forward_ssh_agent: yes` in config.yml).
+
+- For an `https://` repository, you can
+  `export TPA_GIT_CREDENTIALS=username:token` in your environment
+  before running `tpaexec deploy`.
+
+## SSH key authentication
+
+If you are cloning an SSH repository and have an SSH keypair
+(`id_example` and `id_example.pub`), use SSH agent forwarding to
+authenticate on the target instances:
+
+- **You need to run `ssh-agent` locally**. If your desktop environment
+  does not already set this up for you (as most do—`pgrep ssh-agent`
+  to check if it's running), run `ssh-agent bash` to temporarily start
+  a new shell with the agent enabled, and run `tpaexec deploy` from
+  that shell.
+
+- **Add the required key(s) to the agent** with
+  `ssh-add /path/to/id_example` (the private key file).
+
+- **Enable SSH agent forwarding** by setting `forward_ssh_agent: yes`
+  at the top level in config.yml before `tpaexec provision`. (This is
+  done by default if you use `--install-from-source`.)
+
+During deployment, any keys you add to your agent will be made available
+for authentication to remote servers through the forwarded agent
+connection.
+
+Use SSH agent forwarding with caution, preferably with a disposable
+keypair generated specifically for this purpose. Users with the
+privileges to access the agent's Unix domain socket on the target server
+can co-opt the agent into impersonating you while authenticating to
+other servers.
+
+## HTTPS username/password authentication
+
+If you are cloning an HTTPS repository with a username and
+authentication token or password, just
+`export TPA_GIT_CREDENTIALS=username:token` in your environment before
+`tpaexec deploy`. During deployment, these credentials will be made
+available to any `git clone` or `git pull` tasks (only). They will
+not be written to disk on the target instances.
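+
+For example (with placeholder credentials and cluster directory):
+
+```bash
+$ export TPA_GIT_CREDENTIALS=exampleuser:exampletoken
+$ tpaexec deploy ~/clusters/speedy
+```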
diff --git a/product_docs/docs/tpa/23/reference/haproxy.mdx b/product_docs/docs/tpa/23/reference/haproxy.mdx new file mode 100644 index 00000000000..9dadd37463c --- /dev/null +++ b/product_docs/docs/tpa/23/reference/haproxy.mdx @@ -0,0 +1,62 @@ +--- +title: Configuring haproxy +originalFilePath: haproxy.md + +--- + +TPA will install and configure haproxy on instances whose `role` +contains `haproxy`. + +By default, haproxy listens on `127.0.0.1:5432` for requests forwarded +by [`pgbouncer`](pgbouncer) running on the same instance. You must +specify a list of `haproxy_backend_servers` to forward requests to. + +TPA will install the latest available version of haproxy by default. +You can install a specific version instead by setting +`haproxy_package_version: 1.9.15*` (for example). + +Note: see limitations of using wildcards in package_version in +[tpaexec-configure](../tpaexec-configure#known-issue-with-wildcard-use). + +You can set the following variables on any `haproxy` instance. + +| Variable | Default value | Description | +| ------------------------- | --------------------- | --------------------------------------------------------------------------------------------------------------------------------- | +| `haproxy_bind_address` | 127.0.0.1 | The address haproxy should bind to | +| `haproxy_port` | 5432 | The TCP port haproxy should listen on | +| `haproxy_backend_servers` | None | A list of Postgres instance names | +| `haproxy_maxconn` | `max_connections`×0.9 | The maximum number of connections allowed per backend server; the default is derived from the backend's `max_connections` setting | + +## Server options + +TPA will generate `/etc/haproxy/haproxy.cfg` with a backend that has +a `default-server` line and one line per backend server. All but the +first one will be marked as "backup" servers. + +Set `haproxy_default_server_extra_options` to a list of options on the +haproxy instance to add options to the `default-server` line; and set +`haproxy_server_options` to a list of options on the backend server to +add options (which will override the defaults) to the individual server +lines for each backend. + +## Example + +```yaml +instances: +- Name: one + vars: + haproxy_server_options: + - maxconn 33 +- Name: two +… +- Name: proxy + role: + - haproxy + vars: + haproxy_backend_servers: + - one + - two + haproxy_default_server_extra_options: + - on-error mark-down + - on-marked-down shutdown-sessions +``` diff --git a/product_docs/docs/tpa/23/reference/harp.mdx b/product_docs/docs/tpa/23/reference/harp.mdx new file mode 100644 index 00000000000..171fd4421c8 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/harp.mdx @@ -0,0 +1,99 @@ +--- +title: Configuring HARP +originalFilePath: harp.md + +--- + +TPA will install and configure HARP when `failover_manager` is set +to `harp`, which is the default for BDR-Always-ON clusters. + +## Installing HARP + +You must provide the `harp-manager` and `harp-proxy` packages. Please +contact EDB to obtain access to these packages. + +## Configuring HARP + +See the [HARP documentation](https://documentation.enterprisedb.com/harp/release/latest/configuration/) +for more details on HARP configuration. + +| Variable | Default value | Description | +| --------------------------------- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `cluster_name` | \`\` | The name of the cluster. 
| +| `harp_consensus_protocol` | \`\` | The consensus layer to use (`etcd` or `bdr`) | +| `harp_location` | `location` | The location of this instance (defaults to the `location` parameter) | +| `harp_ready_status_duration` | `10` | Amount of time in seconds the node's readiness status will persist if not refreshed. | +| `harp_leader_lease_duration` | `6` | Amount of time in seconds the Lead Master lease will persist if not refreshed. | +| `harp_lease_refresh_interval` | `2000` | Amount of time in milliseconds between refreshes of the Lead Master lease. | +| `harp_dcs_reconnect_interval` | `1000` | The interval, measured in ms, between attempts that a disconnected node tries to reconnect to the DCS. | +| `harp_dcs_priority` | `500` | In the case two nodes have an equal amount of lag and other qualified criteria to take the Lead Master lease, this acts as an additional ranking value to prioritize one node over another. | +| `harp_stop_database_when_fenced` | `false` | Rather than simply removing a node from all possible routing, stop the database on a node when it is fenced. | +| `harp_fenced_node_on_dcs_failure` | `false` | If HARP is unable to reach the DCS then fence the node. | +| `harp_maximum_lag` | `1048576` | Highest allowable variance (in bytes) between last recorded LSN of previous Lead Master and this node before being allowed to take the Lead Master lock. | +| `harp_maximum_camo_lag` | `1048576` | Highest allowable variance (in bytes) between last received LSN and applied LSN between this node and its CAMO partner(s). | +| `harp_camo_enforcement` | `lag_only` | Whether CAMO queue state should be strictly enforced. | +| `harp_use_unix_sock` | `false` | Use unix domain socket for manager database access. | +| `harp_request_timeout` | `250` | Time in milliseconds to allow a query to the DCS to succeed. | +| `harp_watch_poll_interval` | `500` | Milliseconds to sleep between polling DCS. Only applies when `harp_consensus_protocol` is `bdr`. | +| `harp_proxy_timeout` | `1` | Builtin proxy connection timeout, in seconds, to Lead Master. | +| `harp_proxy_keepalive` | `5` | Amount of time builtin proxy will wait on an idle connection to the Lead Master before sending a keepalive ping. | +| `harp_proxy_max_client_conn` | `75` | Maximum number of client connections accepted by harp-proxy (`max_client_conn`) | +| `harp_ssl_password_command` | None | a custom command that should receive the obfuscated sslpassword in the stdin and provide the handled sslpassword via stdout. | +| `harp_db_request_timeout` | `10s` | similar to dcs -> request_timeout, but for connection to the database itself. | + +You can use the +[harp-config hook](../tpaexec-hooks#harp-config) +to execute tasks after the HARP configuration files have been +installed (e.g., to install additional configuration files). + +## Consensus layer + +The `--harp-consensus-protocol` argument to `tpaexec configure` is +mandatory for the BDR-Always-ON architecture. + +### etcd + +If the `--harp-consensus-protocol etcd` option is given to `tpaexec +configure`, then TPA will set `harp_consensus_protocol` to `etcd` +in config.yml and give the `etcd` role to a suitable subset of the +instances, depending on your chosen layout. + +HARP v2 requires etcd v3.5.0 or above, which is available in the +products/harp/release package repositories provided by EDB. 
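+
+If you are editing config.yml by hand rather than rerunning `tpaexec
+configure`, a minimal sketch of the relevant cluster-wide settings might
+look like this (the `etcd` role assignments on individual instances are
+omitted here):
+
+```yaml
+cluster_vars:
+  failover_manager: harp
+  harp_consensus_protocol: etcd
+```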
+ +You can configure the following parameters for etcd: + +| Variable | Default value | Description | +| ---------------- | ------------- | -------------------------------------------- | +| etcd_peer_port | 2380 | The port used by etcd for peer communication | +| etcd_client_port | 2379 | The port used by clients to connect to etcd | + +### bdr + +If the `--harp-consensus-protocol bdr` option is given to `tpaexec +configure`, then TPA will set `harp_consensus_protocol` to `bdr` +in config.yml. In this case the existing BDR instances will be used +for consensus, and no further configuration is required. + +## Configuring a separate user for harp proxy + +If you want harp proxy to use a separate readonly user, you can specify that +by setting `harp_dcs_user: username` under cluster_vars. TPA will use +`harp_dcs_user` setting to create a readonly user and set it up in the DCS +configuration. + +## Custom SSL password command + +The command provided by `harp_ssl_password_command` will be used by HARP +to de-obfuscate the `sslpassword` given in connection string. If +`sslpassword` is not present then `harp_ssl_password_command` is +ignored. If `sslpassword` is not obfuscated then +`harp_ssl_password_command` is not required and should not be specified. + +## Configuring the harp service + +You can configure the following parameters for the harp service: + +| Variable | Default value | Description | +| --------------------------------- | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `harp_manager_restart_on_failure` | `false` | If `true`, the `harp-manager` service is overridden so it's restarted on failure. The default is `false` to comply with the service installed by the `harp-manager` package. | diff --git a/product_docs/docs/tpa/23/reference/hosts.mdx b/product_docs/docs/tpa/23/reference/hosts.mdx new file mode 100644 index 00000000000..51e5b4060a0 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/hosts.mdx @@ -0,0 +1,41 @@ +--- +title: Configuring /etc/hosts +originalFilePath: hosts.md + +--- + +By default, TPA will add lines to /etc/hosts on the target instances +with the IP address and hostname(s) of every instance in the cluster, so +that they can use each other's names for communication within the +cluster (e.g., in `primary_conninfo` for Postgres). + +You can specify a list of `extra_etc_hosts_lines` too: + +```yaml +instances: +- Name: one + … + vars: + extra_etc_hosts_lines: + - 192.0.2.1 acid.example.com + - 192.0.2.2 water.example.com +``` + +If you don't want the default entries at all, you can specify the +complete list of `etc_hosts_lines` for an instance instead, and only +those lines will be added to /etc/hosts: + +```yaml +instances: +- Name: one + … + vars: + etc_hosts_lines: + - 192.0.2.1 acid.example.com + - 192.0.2.2 water.example.com + - 192.0.2.3 base.example.com +``` + +If your /etc/hosts doesn't contain the default entries for instances in +the cluster, you'll need to ensure the names can be resolved in some +other way. 
diff --git a/product_docs/docs/tpa/23/reference/index.mdx b/product_docs/docs/tpa/23/reference/index.mdx new file mode 100644 index 00000000000..be23517c561 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/index.mdx @@ -0,0 +1,63 @@ +--- +title: Reference +indexCards: simple +navigation: + - '#Installation' + - INSTALL-docker + - INSTALL-repo + - air-gapped + - Requirements + - distributions + - tpaexec-support + - '#Services' + - bdr + - barman + - efm + - haproxy + - harp + - pem + - pgbouncer + - pgd-proxy + - pglogical + - repmgr + - '#Configuration' + - 2q_and_edb_repositories + - edb_repositories + - tpa_2q_repositories + - apt_repositories + - yum_repositories + - local-repo + - install_from_source + - git-credentials + - target_environment + - python + - hosts + - volumes + - artifacts + - ssh_key_file + - manage_ssh_hostkeys + - postgres_installation_method_src + - packages + - initdb + - postgres_installation_method_pkg + - pg-backup-api + - sysctl_values + - '#Postgres configuration' + - postgres_databases + - postgres_tablespaces + - postgresql.conf + - pg_hba.conf + - pg_ident.conf + - pgpass + - postgres_user + - postgres_users + - '#Commands' + - tpaexec-archive-logs + - tpaexec-download-packages + - tpaexec-commands + - tpaexec-tests + - '!legal-notice.mdx' + - platform-shared +originalFilePath: ../../../../dev/null + +--- diff --git a/product_docs/docs/tpa/23/reference/initdb.mdx b/product_docs/docs/tpa/23/reference/initdb.mdx new file mode 100644 index 00000000000..1aed44ee508 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/initdb.mdx @@ -0,0 +1,41 @@ +--- +title: Running initdb +originalFilePath: initdb.md + +--- + +TPA will first create `postgres_data_dir` if it does not exist, and +ensure it has the correct ownership, permissions, and SELinux context. +Then, unless the directory already contains a `VERSION` file, it will +run `initdb` to initialise `postgres_data_dir`. + +You can use the +[pre-initdb hook](../tpaexec-hooks#pre-initdb) +to execute tasks before `postgres_data_dir` is created and `initdb` is +run. If the hook initialises `postgres_data_dir`, TPA will find the +`VERSION` file and realise that it does not need to run `initdb` itself. + +You can optionally set `postgres_initdb_opts` to a list of options to +pass to `initdb`: + +```yaml +cluster_vars: + postgres_locale: de_DE.UTF-8 + postgres_initdb_opts: + - --data-checksums +``` + +We recommend always including the `--data-checksums` option (which is +included by default). + +TPA will set `TZ=UTC` in the environment, and set `LC_ALL` to +the `postgres_locale` you specify, when running `initdb`. + +## Separate configuration directory + +By default, `postgres_conf_dir` is equal to `postgres_data_dir`, and the +Postgres configuration files (postgresql.conf, pg_ident.conf, +pg_hba.conf, and the include files in conf.d) are created within the +data directory. If you change `postgres_conf_dir`, TPA will move the +generated configuration files to the new location after running +`initdb`. 
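+
+For example, a sketch that keeps the configuration files separate from
+the data directory (the paths shown are illustrative):
+
+```yaml
+cluster_vars:
+  postgres_data_dir: /opt/postgres/data
+  postgres_conf_dir: /opt/postgres/conf
+```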
diff --git a/product_docs/docs/tpa/23/reference/install_from_source.mdx b/product_docs/docs/tpa/23/reference/install_from_source.mdx new file mode 100644 index 00000000000..22956090610 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/install_from_source.mdx @@ -0,0 +1,58 @@ +--- +title: Installing from source +originalFilePath: install_from_source.md + +--- + +You can define a list of extensions to build and install from their Git +repositories by setting `install_from_source` in config.yml: + +```yaml +cluster_vars: + install_from_source: + - name: ext + git_repository_url: https://repo.example.com/ext.git + git_repository_ref: dev/example + + - name: otherext + git_repository_url: ssh://repo.example.com/otherext.git + git_repository_ref: master + source_directory: /opt/postgres/src/otherext + build_directory: /opt/postgres/build/otherext + build_commands: + - "make -f /opt/postgres/src/otherext/Makefile install" + build_environment: + VAR: value +``` + +TPA will build and install extensions one by one in the order +listed, so you can build extensions that depend on another (such as +pglogical and BDR) by mentioning them in the correct order. + +Each entry must specify a `name`, `git_repository_url`, and +`git_repository_ref` (default: `master`) to build. You can use +[SSH agent forwarding or an HTTPS username/password](git-credentials) +to authenticate to the Git repository; and also set +`source_directory`, `build_directory`, `build_environment`, and +`build_commands` as shown above. + +Run `tpaexec deploy … --skip-tags build-clean` in order to reuse the +build directory when doing repeated deploys. (Otherwise the old build +directory is emptied before starting the build.) You can also configure +[local source directories](../configure-source#local-source-directories) +to speed up your development builds. + +Whenever you run a source build, Postgres will be restarted. + +## Build dependencies + +If you're building from source, TPA will ensure that the basic +Postgres build dependencies are installed. If you need any additional +packages, mention them in [`packages`](packages). For example + +```yaml +cluster_vars: + packages: + common: + - golang-1.16 +``` diff --git a/product_docs/docs/tpa/23/reference/local-repo.mdx b/product_docs/docs/tpa/23/reference/local-repo.mdx new file mode 100644 index 00000000000..cb382ee19ab --- /dev/null +++ b/product_docs/docs/tpa/23/reference/local-repo.mdx @@ -0,0 +1,149 @@ +--- +title: Shipping packages from a local repo +originalFilePath: local-repo.md + +--- + +If you create a local repository within your cluster directory, TPA +will make any packages in the repository available to cluster instances. +This is an easy way to ship extra packages to your cluster. + +Optionally, you can also instruct TPA to configure the instances to +use *only* this repository, i.e., disable all others. In this case, you +must provide *all* packages required during the deployment, starting +from basic dependencies like rsync, Python, and so on. + +## Quickstart + +To configure a cluster with a local repo enabled, run: + +``` +tpaexec configure --enable-local-repo … +``` + +This will generate your cluster configuration and create a `local-repo` +directory and OS-specific subdirectories. See below for details of the +[recommended layout](#local-repo-layout). + +## Disconnected environments + +See instructions for managing clusters in an [air-gapped](air-gapped) +environment. 
+ +## Local repo layout + +By default, TPA will create a `local-repo` directory and OS-specific +subdirectories within it (e.g., `local-repo/Debian/10`), based on the OS +you select for the cluster. We recommend using separate subdirectories +because it makes it easier to accommodate instances running different +distributions. + +For example, a cluster running RedHat 8 might have the following layout: + +```text +local-repo/ +`-- RedHat + |-- 8.5 -> 8 + `-- 8 + `-- repodata +``` + +For each instance, TPA will look for the following subdirectories of +`local-repo` in order and use the first one it finds: + +- `/`, e.g., `RedHat/8.5` +- `/`, e.g., `RedHat/8` +- `/`, e.g., `Ubuntu/focal` +- ``, e.g., `Debian` +- The `local-repo` directory itself. + +If none of these directories exists, of course, TPA will not try to +set up any local repo on target instances. + +This way, you can put RedHat-specific packages under `RedHat/8` and +Ubuntu-specific packages under `Ubuntu/focal`, and instances will use +the right packages automatically. If you don't have instances running +different distributions, they'll all use the same subdirectory. + +## Populating the repository + +Run [`tpaexec download-packages`](tpaexec-download-packages) to +download all the packages required by a cluster into the local-repo. + +You must copy packages into the appropriate repository directory and +generate repository metadata before running `tpaexec deploy`. + +After copying the necessary packages into the repository directory, you +must use an OS-specific tool to generate the repository metadata. + +You must generate the metadata on the control node, i.e., the machine +where you run tpaexec. TPA will copy the metadata and packages to +target instances. + +You must generate the metadata in the subdirectory that the instance +will use, i.e., if you copy packages into `local-repo/Debian/10`, you +must create the metadata in that directory, not in `local-repo/Debian`. + +### Debian/Ubuntu repository metadata + +For Debian-based distributions, install the `dpkg-dev` package: + +```shell +$ sudo apt-get update && sudo apt-get install -y dpkg-dev +``` + +Now you can use `dpkg-scanpackages` to generate the metadata: + +```shell +$ cd local-repo/Debian/buster +# download/copy .deb package files +$ dpkg-scanpackages . | gzip > Packages.gz +``` + +### RedHat repository metadata + +First, install the `createrepo` package: + +```shell +$ sudo yum install -y createrepo +``` + +Now you can use `createrepo` to generate the metadata: + +```shell +$ cd local-repo/RedHat/8 +# download/copy .rpm package files +$ createrepo . +``` + +## Copying the repository + +TPA will use rsync to copy the contents of the repository directory, +including the generated metadata, to a directory on target instances. + +If rsync is not already available on an instance, TPA can install it +(i.e., `apt-get install rsync` or `yum install rsync`). However, if you +have set `use_local_repo_only`, the rsync package must be included in +the local repo. If required, TPA will copy just the rsync package +using scp and install it before copying the rest. + +## Repository configuration + +After copying the contents of the local repo to target instances, +TPA will configure the destination directory as a local (i.e., +path-based, rather than URL-based) repository. + +The idea is that if you provide, say, `example.deb` in the repository +directory, running `apt-get install example` will suffice to install it, +just like any package in any other repository. 
+ +## Package installation + +TPA configures a repository with the contents that you provide, but +if the same package is available from different repositories, it is up +to the package manager to decide which one to install (usually the +latest, unless you specify a particular version). + +(However, if you set `use_local_repo_only: yes`, TPA will disable +all other package repositories, so that instances can only use the +packages that you provide in `local-repo`.) diff --git a/product_docs/docs/tpa/23/reference/manage_ssh_hostkeys.mdx b/product_docs/docs/tpa/23/reference/manage_ssh_hostkeys.mdx new file mode 100644 index 00000000000..f474981712f --- /dev/null +++ b/product_docs/docs/tpa/23/reference/manage_ssh_hostkeys.mdx @@ -0,0 +1,38 @@ +--- +title: Managing SSH host keys +originalFilePath: manage_ssh_hostkeys.md + +--- + +TPA generates a set of SSH host keys while provisioning a cluster. +These keys are stored in the cluster directory, under the `hostkeys` +subdirectory. These host keys are automatically installed into +`/etc/ssh` on AWS EC2 instances and Docker containers. + +By default, these host keys are not installed on +[bare instances](../platform-bare), +but you can set `manage_ssh_hostkeys` to enable it: + +```yaml +instances: +- Name: one + … + platform: bare + vars: + manage_ssh_hostkeys: yes +``` + +You must initially set up `known_hosts` in your cluster directory with +correct entries, as described in the docs for +[bare instances](../platform-bare). TPA will replace the host keys +during deployment. + +The `manage_ssh_hostkeys` setting is meaningful only for bare instances. +The generated host keys will be installed on all other instances. + +## known_hosts + +TPA will add entries for every host and its public host keys to the +global `ssh_known_hosts` file on every instance in the cluster, so that +they can ssh to each other without host key verification prompts, +regardless of whether they have `manage_ssh_hostkeys` set or not. diff --git a/product_docs/docs/tpa/23/reference/packages.mdx b/product_docs/docs/tpa/23/reference/packages.mdx new file mode 100644 index 00000000000..ed5a818ef7e --- /dev/null +++ b/product_docs/docs/tpa/23/reference/packages.mdx @@ -0,0 +1,69 @@ +--- +title: Installing packages +originalFilePath: packages.md + +--- + +TPA installs a batch of non-Postgres-related packages early during +the deployment, then all Postgres-related packages together, and then +packages for optional components separately. This page is about +installing packages like sysstat or strace, which have no dependency on +Postgres packages. + +You can add entries to `packages` under `cluster_vars` or a +particular instance's `vars` in config.yml: + +```yaml +cluster_vars: + packages: + common: + - pkg1 + - pkg2 + Debian: + - debpkg1 + RedHat: + - rhpkg1 + - rhpkg2 + Ubuntu: + - ubpkg1 +``` + +In the example above, TPA will install its own list of +`default_packages` and the packages listed under `packages.common` +on every instance, and the remaining distribution-specific packages +based on which distribution the instance is running. If any of these +packages is not available, the deployment will fail. + +Don't list any packages that depend on Postgres; use +[`extra_postgres_packages`](postgres_installation_method_pkg) +instead. + +## Optional packages + +You can specify a list of `optional_packages` to install. They will be +installed if they are available, and ignored otherwise. 
As with the
+other settings, the `common` entries apply to every instance, whereas
+any other lists apply only to instances running the relevant
+distribution.
+
+```yaml
+optional_packages:
+  common:
+  - pkg1
+  - pkg2
+  Debian:
+  - debpkg4
+```
+
+## Removing packages
+
+You can specify a list of `unwanted_packages` that should be
+removed if they are installed.
+
+```yaml
+unwanted_packages:
+  common:
+  - badpkg1
+  Ubuntu:
+  - badpkg2
+```
diff --git a/product_docs/docs/tpa/23/reference/pem.mdx b/product_docs/docs/tpa/23/reference/pem.mdx
new file mode 100644
index 00000000000..259e762028a
--- /dev/null
+++ b/product_docs/docs/tpa/23/reference/pem.mdx
@@ -0,0 +1,172 @@
+---
+title: Configuring Postgres Enterprise Manager (PEM)
+originalFilePath: pem.md
+
+---
+
+TPA will install and configure PEM when the `tpaexec configure` command
+is run with the `--enable-pem` command line option.
+
+The default behavior with `--enable-pem` is to enable the `pem-agent`
+role for all `postgres` instances in the cluster. The `pem-agent` role
+will also be added to barman nodes when the `--enable-pg-backup-api`
+command line option is used alongside `--enable-pem`.
+
+A dedicated instance named `pemserver` will also be added to the cluster.
+
+Because the PEM server uses a Postgres backend, the pemserver instance
+implicitly uses the `postgres` role as well, which ensures that it gets
+a valid Postgres cluster configured for use as the PEM backend. All
+configuration options available for a normal Postgres instance are also
+valid for PEM's backend Postgres instance. See the following for
+details:
+
+- [Configure pg_hba.conf](pg_hba.conf)
+- [Configure postgresql.conf](postgresql.conf)
+
+Note that PEM is only available via EDB's package repositories and
+therefore requires a valid subscription.
+
+## Supported architectures
+
+PEM is supported with the M1 and BDR-Always-ON architectures via the
+`--enable-pem` command line option to `tpaexec configure`. You can
+optionally edit the generated cluster config (config.yml) and assign or
+remove the `pem-agent` role from any postgres instance in the cluster
+in order to enable or disable PEM there.
+
+Note that the PEM server does not yet support pgextended as a backend.
+
+## PEM configuration
+
+TPA will configure PEM agents and the PEM server with the appropriate
+instance-specific settings, and leave the remaining settings at their
+default values. Some of the configuration options may be exposed for
+user configuration at some point in future.
+
+The PEM server's web interface is configured to run on https on
+port 443, using self-signed certificates.
+
+The default login credentials for the PEM server web interface use the
+Postgres backend database user, which is set to `postgres` for
+PostgreSQL and `enterprisedb` for EPAS clusters by default. You can get
+the login password for the web interface by running
+`tpaexec show-password $clusterdir $user`.
+
+## Shared PEM server
+
+Some deployments may want to use a single PEM server to monitor and
+manage multiple clusters in the organization. A shared PEM server
+deployment is supported via the `pem_shared` variable, which you can
+set via `vars:` under the pem server instance in any cluster config
+that plans to use an existing PEM server. `pem_shared` is a boolean
+variable, so the possible values are true and false (the default). 
When declaring a pemserver instance as +shared, we tell the given cluster config that pemserver instance is in fact +managed by a separate cluster config that provisioned and deployed the pem +server in the first place. So any changes we wanted to make to the pem server +instance including postgres backend for pem would be managed by the cluster +where pemserver instance is NOT declared as a shared pem instance. + +A typical workflow for using a shared pem server across multiple clusters +would look something like this: + +1. Create a tpaexec cluster with a single instance that has `pem-server` + role (call it 'pem-cluster' for this example). We could as easily use + the same workflow in a scenario where pem is provisioned as part of a + larger cluster and not just a single instance that runs as pemserver but + we use a single node cluster because it is easier to use that as an example + and arguably easy to maintain as well. +2. In the other cluster (pg-cluster for example), reference this particular + pemserver from $clusters/pem-cluster as a shared pem server instance and + use `bare` as platform so we are not trying to create a new pemserver instance. + Also specify the IP address of the pemserver that this cluster can + use to access pemserver instance. + + ```yml + - Name: pemserver + node: 5 + role: + - pem-server + platform: bare + public_ip: 13.213.53.205 + private_ip: 10.33.15.102 + vars: + pem_shared: true + ``` +3. Before running deploy in the postgres cluster, make sure that pg-cluster + can access pem server instance via ssh. You can allow this access by copying + pg-cluster's public key to pem server instance via `ssh-copy-id` and then do + an ssh to make sure you can login without having to specify the password. + + ```bash + # add pem-clusters key to the ssh-agent (handy for `aws` platform) + $ cd $clusters/pem-cluster + $ ssh-add id_pem-clutser + $ cd $clusters/pg-cluster + $ ssh-keyscan -4 $pem-server-ip >> known_hosts + $ ssh-copy-id -i id_pg-cluster.pub -o 'UserKnownHostsFile=tpa_known_hosts' $user@$pem-server-ip + $ ssh -F ssh_config pemserver + ``` +4. Update postgresql config on pem server node so it allows connections + from the new pg-cluster. You can modify existing pg_hba.conf on pem + server by adding new entries to `pem_postgres_extra_hba_settings` + under `vars:` in pem-cluster's config.yml. For example: + + ```yml + instances: + - Name: pemserver + location: main + node: 1 + role: + - pem-server + vars: + pem_postgres_extra_hba_settings: + - "# Allow pem connections from pg-cluster1.quire" + - hostssl pem +pem_agent 10.33.15.108/32 cert + - "# Allow pem connections from pg-cluster1.upside" + - hostssl pem +pem_agent 10.33.15.104/32 cert + - "# Allow pem connections from pg-cluster2.zippy" + - hostssl pem +pem_agent 10.33.15.110/32 cert + - "# Allow pem connections from pg-cluster2.utopic" + - hostssl pem +pem_agent 10.33.15.109/32 cert + ``` + + and then run `tpaexec provision $clusters/pem-cluster` followed by + `tpaexec deploy $clusters/pem-cluster`. When complete, nodes from + your new pg-cluster should be able to speak with pem server backend. +5. In order to make sure pem agents from the nodes in pg-cluster can + connect and register with the pem server backend, you must first + `export EDB_PEM_CREDENTIALS_FILE=/path/to/pem/credentials/file` + before you run `tpaexec deploy`. Credentials file is a text file that + contains your access credentials to the pemserver's backend postgres + instance in the `username:password` format. 
+ + ```bash + $ cat pem_creds + postgres:f1I%fw!QmWevdzw#EL#$Ulu1cWhg7&RT + ``` + + If you don't know the backend password, you can get that by using + `show-password` tpaexec command. + + ```bash + tpaexec show-password $pem-clusterdir $user + ``` + + +6. Run `tpaexec deploy $clusters/pg-cluster` so pem is deployed on the + new pg-cluster while using shared pem server instance. + +## Connecting to the PEM UI + +PEM UI runs on https interface so you can connect with a running +instance of PEM server via https://$pem-server-ip/pem. Login credentials +for PEM UI are set to the postgres backend user which uses `postgres` +or `enterprisedb` for `postgresql` and `epas` flavours respectively. +tpaexec's show-password command will show the password for the backend +user. For example: + +```bash +tpaexec show-password $clusterdir $user +``` + +See [PEM documentation](https://www.enterprisedb.com/docs/pem/latest/) +for more details on PEM configuration and usage. diff --git a/product_docs/docs/tpa/23/reference/pg-backup-api.mdx b/product_docs/docs/tpa/23/reference/pg-backup-api.mdx new file mode 100644 index 00000000000..2f36d96c716 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/pg-backup-api.mdx @@ -0,0 +1,52 @@ +--- +title: SSL Certificates +originalFilePath: pg-backup-api.md + +--- + +If you set `enable_pg_backup_api: true` in `config.yml` or use the +`--enable-pg-backup-api` command line option during configure, instances +with the `barman` role will install pg-backup-api and set up an +apache proxy for client cert authentication. This apache proxy will use +an SSL CA generated for the cluster to generate its server and client +certificates. + +```yaml +cluster_vars: + enable_pg_backup_api: true +``` + +pg-backup-api will be installed via packages by default, but you can +also install from a git branch or a local directory. See +[configure-source.md](../configure-source) and +[install_from_source.md](install_from_source) for more details. + +Run `pg-backup-api status` on the barman node running pg-backup-api - if +you get "OK" back, the pg-backup-api service is running. + +To test that the proxy is working, run + +```shell +curl --cert /etc/tpa/pg-backup-api/pg-backup-user.crt \ + --key /etc/tpa/pg-backup-api/pg-backup-user.key \ + -X GET https://{hostname}/diagnose +``` + +If it's working, you'll get a large json output. You can compare this +with the output of `barman diagnose`, they should match exactly. + +The root certificate will be copied to +`/etc/tpa/pg-backup-api/` by default. + +A client certificate and key (`pg-backup-user.crt`and +`pg-backup-user.key`) will be generated for testing (through +`tpaexec test`) or command line from the barman host. See +[Testing](../tpaexec-test). + +An apache proxy server certificate and key (`pg-backup-api.crt` and +`pg-backup-api.key`) will also be generated + +Each service needing to query the api will need to generate its own +client certificate separately. PEM agent role, for instance, generates a +client certificate during it's setup when both `--enable-pem` and +`--enable-pg-backup-api` (or config.yml equivalent) are used. 
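+
+As a quick way to confirm the service is healthy before testing the
+proxy, you can script the status check described above (the output
+shown is illustrative):
+
+```shell
+# run on the barman node that runs pg-backup-api
+$ pg-backup-api status
+OK
+```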
diff --git a/product_docs/docs/tpa/23/reference/pg_hba.conf.mdx b/product_docs/docs/tpa/23/reference/pg_hba.conf.mdx new file mode 100644 index 00000000000..ebaff15e04a --- /dev/null +++ b/product_docs/docs/tpa/23/reference/pg_hba.conf.mdx @@ -0,0 +1,59 @@ +--- +title: pg_hba.conf +originalFilePath: pg_hba.conf.md + +--- + +The Postgres documentation explains the various options available in +[`pg_hba.conf`](https://www.postgresql.org/docs/current/auth-pg-hba-conf.html). + +By default, TPA will generate a sensible `pg_hba.conf` for your +cluster, to allow replication between instances, and connections from +authenticated clients. + +You can add entries to the default configuration by providing a list of +`postgres_hba_settings`: + +```yaml +cluster_vars: + postgres_hba_settings: + - "# let authenticated users connect from anywhere" + - hostssl all all 0.0.0.0/0 scram-sha-256 +``` + +You can override the default `local all all peer` line in pg_hba.conf by +setting `postgres_hba_local_auth_method: md5`. + +If you don't want any of the default entries, you can change +`postgres_hba_template`: + +```yaml +cluster_vars: + postgres_hba_template: pg_hba.lines.j2 + postgres_hba_settings: + - "# my lines of text" + - "# and nothing but my lines" + - "# …not even any clients!" + - hostssl all all 0.0.0.0/0 reject +``` + +You can even create `templates/my_hba.j2` in your cluster directory and +set: + +```yaml +cluster_vars: + postgres_hba_template: my_hba.j2 +``` + +If you just want to leave the existing `pg_hba.conf` alone, you can do +that too: + +```yaml +cluster_vars: + postgres_hba_template: '' +``` + +Although it is possible to configure `pg_hba.conf` to be different on +different instances, we generally recommend a uniform configuration, so +as to avoid problems with access and replication after any +topology-changing events such as switchovers and failovers. diff --git a/product_docs/docs/tpa/23/reference/pg_ident.conf.mdx b/product_docs/docs/tpa/23/reference/pg_ident.conf.mdx new file mode 100644 index 00000000000..1d9a8ca0136 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/pg_ident.conf.mdx @@ -0,0 +1,25 @@ +--- +title: pg_ident.conf +originalFilePath: pg_ident.conf.md + +--- + +You should not normally need to change `pg_ident.conf`, and by default, +TPA will not modify it. + +You can set `postgres_ident_template` to replace `pg_ident.conf` with +whatever content you like. + +```yaml +cluster_vars: + pg_ident_template: ident.j2 +``` + +You will also need to create `templates/ident.j2` in the cluster +directory: + +```jinja2 +{% for u in ['unixuser1', 'unixuser2'] %} +mymap {{ u }} dbusername +{% endfor %} +``` diff --git a/product_docs/docs/tpa/23/reference/pgbouncer.mdx b/product_docs/docs/tpa/23/reference/pgbouncer.mdx new file mode 100644 index 00000000000..7134c373795 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/pgbouncer.mdx @@ -0,0 +1,59 @@ +--- +title: Configuring pgbouncer +originalFilePath: pgbouncer.md + +--- + +TPA will install and configure pgbouncer on instances whose `role` +contains `pgbouncer`. + +By default, pgbouncer listens for connections on port 6432 and forwards +connections to `127.0.0.1:5432` (which may be either Postgres or +[haproxy](haproxy), depending on the architecture). + +You can set the following variables on any `pgbouncer` instance. 
+ +| Variable | Default value | Description | +| --------------------------- | --------------------- | -------------------------------------------------------------------------------------------------------------------------- | +| `pgbouncer_port` | 6432 | The TCP port pgbouncer should listen on | +| `pgbouncer_backend` | 127.0.0.1 | A Postgres server to connect to | +| `pgbouncer_backend_port` | 5432 | The port that the `pgbouncer_backend` listens on | +| `pgbouncer_max_client_conn` | `max_connections`×0.9 | The maximum number of connections allowed; the default is derived from the backend's `max_connections` setting if possible | +| `pgbouncer_auth_user` | pgbouncer_auth_user | Postgres user to use for authentication | + +## Databases + +By default, TPA will generate +`/etc/pgbouncer/pgbouncer.databases.ini` with a single wildcard `*` +entry under `[databases]` to forward all connections to the backend +server. You can set `pgbouncer_databases` as shown in the example below +to change the database configuration. + +## Authentication + +PgBouncer will connect to Postgres as the `pgbouncer_auth_user` and +execute the (already configured) `auth_query` to authenticate users. + +## Example + +```yaml +instances: +- Name: one + vars: + max_connections: 300 +- Name: two +- Name: proxy + role: + - pgbouncer + vars: + pgbouncer_backend: one + pgbouncer_databases: + - name: dbname + options: + pool_mode: transaction + dbname: otherdb + - name: bdrdb + options: + host: two + port: 6543 +``` diff --git a/product_docs/docs/tpa/23/reference/pgd-proxy.mdx b/product_docs/docs/tpa/23/reference/pgd-proxy.mdx new file mode 100644 index 00000000000..f9f6becbef2 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/pgd-proxy.mdx @@ -0,0 +1,79 @@ +--- +title: Configuring pgd-proxy +originalFilePath: pgd-proxy.md + +--- + +TPA will install and configure pgd-proxy for the PGD-Always-ON +architecture with BDR 5 on any instance with `pgd-proxy` in its `role`. + +(By default, the [PGD-Always-ON architecture](../architecture-PGD-Always-ON) +includes standalone `pgd-proxy` instances in each location, but using +the `--cohost-proxies` configure option will install pgd-proxy on the +BDR instances instead.) + +## Configuration + +`pgd-proxy` is configured at BDR level via SQL functions. + +| Hash | Function | Description | +| ------------------- | ------------------------------- | --------------------------------------------------------------------- | +| `pgd_proxy_options` | `bdr.alter_proxy_option()` | pgd-proxy configuration, e.g. port | +| `bdr_node_groups` | `bdr.alter_node_group_option()` | configuration for the proxy's node group, e.g. `enable_proxy_routing` | +| `bdr_node_options` | `bdr.alter_node_option()` | routing configuration for individual BDR nodes | + +See the BDR documentation for more details. + +### bdr_node_groups + +Group-level options related to pgd-proxy can be set under +`bdr_node_groups` along with other node group options: + +``` +cluster_vars: + bdr_node_groups: + - name: group1 + options: + enable_proxy_routing: true +``` + +Note that `enable_proxy_routing` must be explicitly set to `true` for pgd-proxy to be enabled for the group. 
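+
+For reference, the group option above maps to the
+`bdr.alter_node_group_option()` function named in the table. Roughly,
+TPA applies something equivalent to the following SQL; this is only a
+sketch, and the exact arguments may differ between BDR/PGD versions:
+
+```sql
+-- illustrative only; see the BDR documentation for the exact usage
+SELECT bdr.alter_node_group_option('group1', 'enable_proxy_routing', 'true');
+```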
+ +### bdr_node_options + +Node-level options related to pgd-proxy can be set under +`bdr_node_options` on any BDR instance: + +``` +instances: +- Name: first + vars: + bdr_node_options: + route_priority: 42 +``` + +### pgd_proxy_options + +Options for a pgd-proxy instance itself, rather than the group or nodes +it is attached to, can be set under `default_pgd_proxy_options` under +`cluster_vars` (which applies to all proxies), or under +`pgd_proxy_options` on any pgd-proxy instance: + +``` +cluster_vars: + default_pgd_proxy_options: + listen_port: 6432 + +instances: +- Name: someproxy + vars: + pgd_proxy_options: + fallback_groups: + - somegroup +``` + +In this case, `someproxy` ends up with the `listen_port` setting from +`cluster_vars` and its own `fallback_groups` setting. However, it could +also override the default `listen_port` by defining a different value +alongside `fallback_groups`; this instance-level setting would take +precedence over the defaults in `cluster_vars`. diff --git a/product_docs/docs/tpa/23/reference/pglogical.mdx b/product_docs/docs/tpa/23/reference/pglogical.mdx new file mode 100644 index 00000000000..ee62cbcc0a8 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/pglogical.mdx @@ -0,0 +1,263 @@ +--- +title: pglogical configuration +originalFilePath: pglogical.md + +--- + +TPA can configure pglogical replication sets (publications) and +subscriptions with pglogical v2 and pglogical v3. + +```yaml +instances: +- node: 1 + Name: kazoo + … + vars: + publications: + - type: pglogical + database: example + name: some_publication_name + replication_sets: + - name: custom_replication_set + … + +- node: 2 + Name: keeper + vars: + subscriptions: + - type: pglogical + database: example + name: some_subscription_name + publication: + name: some_publication_name + replication_sets: + - default + - default_insert_only + - custom_replication_set + … +``` + +The pglogical extension will be created by default if you define +publications or subscriptions with `type: pglogical`, but it is up to +you to determine which version will be installed (e.g., subscribe to the +`products/pglogical3/release` repository for pglogical3). + +## Introduction + +TPA can configure everything needed to replicate changes between +instances using pglogical, and can also alter the replication setup +based on config.yml changes. + +To publish changes, you define an entry with `type: pglogical` in +`publications`. To subscribe to these changes, you define an entry +with `type: pglogical` in `subscriptions`, as shown above. + +Pglogical does not have a named publication entity (in the sense that +built-in logical replication has `CREATE PUBLICATION`). A publication +in config.yml just assigns a name to a collection of replication sets, +and subscriptions can use this name to refer to the desired provider. + +To use pglogical replication, both publishers and subscribers need a +named local pglogical node. TPA will create this node with +`pglogical.create_node()` if it does not exist. For publications, the +publication name is used as the pglogical node name. There can be only +one pglogical node in any given database, so you can have only one entry +in `publications` per database. + +However, pglogical subscriptions *do* have a name of their own. TPA +will create subscriptions with the given `name`, and use a default +value for the pglogical node name based on the instance's name and the +name of the database in which the subscription is created. 
You can +specify a different `node_name` if required—for example, when you have +configured a publication in the same database, so that all subscriptions +in that database must share the same pglogical node. + +TPA does some basic validation of the configuration—it will point +out the error if you spell `replication_sets` as `replciation_sets`, or +try to subscribe to a publication that is not defined, but it is your +responsibility to specify a meaningful set of publications and +subscriptions. + +TPA will configure pglogical after creating users, extensions, and +databases, but before any BDR configuration. You can set +[`postgres_users`](postgres_users) and +[`postgres_databases`](postgres_databases) to create databases +for replication, and use the +[`postgres-config-final`](../tpaexec-hooks#postgres-config-final) +hook to populate the databases before pglogical is configured. + +## Publications + +An entry in `publications` must specify a `name` and `database`, +and may specify a list of named `replication_sets` with optional +attributes, as well as a list of table or sequence names. + +```yaml +publications: +- type: pglogical + database: example + name: some_publication_name + replication_sets: + - name: default + replicate_insert: true + replicate_update: true + replicate_delete: true + replicate_truncate: true + autoadd_tables: false + autoadd_sequences: false + autoadd_existing: true + - name: custom_replication_set + tables: + - name: sometable + - name: '"some-schema".othertable' + columns: [a, b, c] + row_filter: 'a > 42' + synchronize_data: true + sequences: + - name: someseq + synchronize_data: true + - name: '"some-schema".otherseq' +``` + +Each replication set may specify optional attributes such as +`replicate_insert` and `autoadd_existing`. If specified, they will +be included as named parameters in the call to +`pglogical.create_replication_set()`, otherwise they will be left out +and the replication set will be created with pglogical's defaults instead. + +Apart from manipulating the list of relations belonging to the +replication set using the `autoadd_*` parameters in pglogical3, you +can also explicitly specify a list of tables or sequences. The name of +each relation may be schema-qualified (unqualified names are assumed to +be in `public`), and the entry may include optional attributes such as +`row_filter` (for tables only) or `synchronize_data`, as shown +above. + +## Subscriptions + +An entry in `subscriptions` must specify a `name` and `database`, +define a publication to subscribe to, and may specify other optional +attributes of the subscription. + +```yaml +subscriptions: +- type: pglogical + database: example + name: some_subscription_name + node_name: optional_pglogical_node_name + publication: + name: some_publication_name + # Optional attributes: + synchronize_structure: true + synchronize_data: true + forward_origins: ['all'] + strip_origins: false + apply_delay: '1 second' + writer: 'heap' + writer_options: + - 'magic' + - 'key=value' + - 'just-a-string' + # Optional attributes that can be changed for an existing + # subscription: + replication_sets: + - default + - default_insert_only + - custom_replication_set + enabled: true +``` + +A subscription can set `publication.name` (as shown above) to define +which publication to subscribe to. If there is more than one publication +with that name (across the entire cluster), you may specify the name of +an instance to disambiguate. 
If you want to refer to publications by +name, don't create multiple publications with the same name on the same +instance. + +```yaml +- type: pglogical + … + publication: + name: some_publication_name + instance: kazoo + + # OR + + provider_dsn: "host=… dbname=…" +``` + +Instead of referring to publications by name, you may explicitly specify +a `provider_dsn` instead. In this case, the given DSN is passed to +`pglogical.create_subscription()` directly (and `publication` is +ignored). You can use this mechanism to subscribe to instances outside +the TPA cluster. + +The other attributes in the example above are optional. If defined, they +will be included as named parameters in the call to +`pglogical.create_subscription()`, otherwise they will be left out. +(Some attributes shown are specific to pglogical3.) + +## Configuration changes + +For publications, you can add or remove replication sets, change the +attributes of a replication set, or change its membership (the tables +and sequences it contains). + +If you change `replicate_*` or `autoadd_*`, TPA will call +`pglogical.alter_replication_set()` accordingly (but note that you +cannot change `autoadd_existing` for existing replication sets, and +the `autoadd_*` parameters are all pglogical3-specific). + +If you change the list of `tables` or `sequences` for a replication +set, TPA will reconcile these changes by calling +`pglogical.alter_replication_set_{add,remove}_{table,sequence}()` as +needed. + +However, if you change `synchronize_data` or other attributes for a +relation (table or sequence) that is already a member of a replication +set, TPA will not propagate the changes (e.g., by dropping the table +and re-adding it with a different configuration). + +For subscriptions, you can only change the list of `replication_sets` +and enable or disable the subscription (`enabled: false`). + +In both cases, any replication sets that exist but are not mentioned in +the configuration will be removed (with +`pglogical.alter_subscription_remove_replication_set()` on the +subscriber, or `pglogical.drop_replication_set()` on the +publisher—but the default replication sets named `default`, +`default_insert_only`, and `ddl_sql` will not be dropped.) + +If you edit config.yml, remember to run `tpaexec provision` before +running `tpaexec deploy`. + +## Interaction with BDR + +It is possible to use BDR and pglogical together in the same database if +you exercise caution. + +BDR3 uses pglogical3 internally, and will create a pglogical node if one +does not exist. There can be only one pglogical node per database, so if +you configure a pglogical publication in `bdr_database`, the +instance's `bdr_node_name` must be the same as the publication's +`name`. Otherwise, the node will be created for the publication +first, and `bdr.create_node()` will fail later with an error about a +node name conflict. Any `subscriptions` in `bdr_database` must use +the same `node_name` too. + +Earlier versions of BDR do not use pglogical, so these considerations do +not apply. + +## Limitations + +- There is currently no support for + `pglogical.replication_set_{add,remove}_ddl()` + +- There is currently no support for + `pglogical.replication_set_add_all_{tables,sequences}()` + +- There is currently no support for + `pglogical.alter_subscription_{interface,writer_options}()` or + `pglogical.alter_subscription_{add,remove}_log()` + +- pglogical v1 support is not presently tested. 
diff --git a/product_docs/docs/tpa/23/reference/pgpass.mdx b/product_docs/docs/tpa/23/reference/pgpass.mdx new file mode 100644 index 00000000000..f1c1eea411f --- /dev/null +++ b/product_docs/docs/tpa/23/reference/pgpass.mdx @@ -0,0 +1,24 @@ +--- +title: Configuring .pgpass +originalFilePath: pgpass.md + +--- + +TPA will create `~postgres/.pgpass` by default with the passwords +for `postgres` and `repmgr` in it, for use between cluster instances. +You can set `pgpass_users` to create entries for a different list of +users. + +You can also include the `postgres/pgpass` role from hook scripts to +create your own `.pgpass` file: + +```yaml +- include_role: name=postgres/pgpass + vars: + pgpassfile: ~otheruser/.pgpass + pgpass_owner: otheruser + pgpass_group: somegroup + pgpass_users: + - xyzuser + - pqruser +``` diff --git a/product_docs/docs/tpa/23/reference/platform-shared.mdx b/product_docs/docs/tpa/23/reference/platform-shared.mdx new file mode 100644 index 00000000000..e1cf0fcd363 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/platform-shared.mdx @@ -0,0 +1,5 @@ +--- +title: Shared instances +originalFilePath: platform-shared.md + +--- diff --git a/product_docs/docs/tpa/23/reference/postgres_databases.mdx b/product_docs/docs/tpa/23/reference/postgres_databases.mdx new file mode 100644 index 00000000000..75539f722f7 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/postgres_databases.mdx @@ -0,0 +1,59 @@ +--- +title: Creating Postgres databases +originalFilePath: postgres_databases.md + +--- + +To create Postgres databases during deployment, add entries to the list +of `postgres_databases` under `cluster_vars` or a particular +instance's `vars` in config.yml: + +```yaml +cluster_vars: + postgres_databases: + - name: exampledb + + - name: complexdb + owner: example + encoding: UTF8 + lc_collate: de_DE.UTF-8 + lc_ctype: de_DE.UTF-8 + template: template0 + extensions: + - name: hstore + - name: dblink + languages: + - name: plperl + - name: plpython + tablespace: exampletablespace +``` + +The example above would create two databases (apart from any databases +that TPA itself decides to create, such as `bdr_database`). + +Each entry must specify the `name` of the database to create. All +other attributes are optional. + +The `owner` is `postgres` by default, but you can set it to any +valid username (the users in [`postgres_users`](postgres_users) +will have been created by this time). + +The `encoding`, `lc_collate`, and `lc_ctype` values default to the +`postgres_locale` set at the time of running initdb (the default is to +use the target system's LC_ALL or LANG setting). If you are creating a +database with non-default locale settings, you will also need to specify +`template: template0`. + +You can optionally specify the default `tablespace` for a database; the +tablespace must already exist +(see [`postgres_tablespaces`](postgres_tablespaces)). + +You can specify optional lists of `extensions` and `languages` to create +within each database (in addition to any extensions or languages +inherited from the template database). Any packages required must be +installed already, for example by including them in +[`extra_postgres_packages`](postgres_installation_method_pkg). + +TPA will not drop existing databases that are not mentioned in +`postgres_databases`, and it may create additional databases if required +(e.g., for BDR). 
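+
+As noted above, `postgres_databases` can also be set under a particular
+instance's `vars` instead of `cluster_vars`. A minimal sketch (the
+instance and database names are illustrative):
+
+```yaml
+instances:
+- Name: one
+  vars:
+    postgres_databases:
+    - name: appdb
+```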
diff --git a/product_docs/docs/tpa/23/reference/postgres_installation_method_pkg.mdx b/product_docs/docs/tpa/23/reference/postgres_installation_method_pkg.mdx new file mode 100644 index 00000000000..7933e3f4cf2 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/postgres_installation_method_pkg.mdx @@ -0,0 +1,36 @@ +--- +title: Installing Postgres-related packages +originalFilePath: postgres_installation_method_pkg.md + +--- + +TPA installs a batch of non-Postgres-related packages early during +the deployment, then all Postgres-related packages together, and then +packages for optional components separately. This page is about +installing packages like pglogical that depend on Postgres itself. + +To install extra packages that depend on Postgres (e.g., Postgis), list +them under `extra_postgres_packages` in `cluster_vars` or a +particular instance's `vars` in config.yml: + +```yaml +cluster_vars: + extra_postgres_packages: + common: + - postgres-pkg1 + - postgres-pkg2 + Debian: + - postgres-deb-pkg1 + RedHat: + - postgres11-rhpkg1 + - postgres11-rhpkg2 + Ubuntu: + - ubpkg1 +``` + +The packages listed under `packages.common` will be installed on every +instance, together with the default list of Postgres packages, and any +distribution-specific packages you specify. + +There's a separate page about +[compiling and installing Postgres from source](postgres_installation_method_src). diff --git a/product_docs/docs/tpa/23/reference/postgres_installation_method_src.mdx b/product_docs/docs/tpa/23/reference/postgres_installation_method_src.mdx new file mode 100644 index 00000000000..851b7e6f563 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/postgres_installation_method_src.mdx @@ -0,0 +1,95 @@ +--- +title: Postgres source installation +originalFilePath: postgres_installation_method_src.md + +--- + +TPA will compile and install Postgres from source if you set +`postgres_installation_method` to `src`. This feature is meant for use +in development and testing, and allows you to switch between packaged +and source builds within an identically-configured cluster. + +Even here, you do not need to change the defaults, which will give you a +working cluster with debugging enabled. + +## Git repository + +The default settings will build and install Postgres from the community +Git repository, using the `REL_xx_STABLE` branch corresponding to your +`postgres_version`. You can specify a different repository or branch +(any valid git reference) as follows: + +```yaml +cluster_vars: + postgres_git_url: git://git.postgresql.org/git/postgresql.git + postgres_git_ref: REL_12_STABLE +``` + +The default git.postgresql.org repository does not require +authentication, but if necessary, you can use +[SSH agent forwarding or an HTTPS username/password](git-credentials) +to authenticate to other repositories. + +The repository will be cloned into `postgres_src_dir` (default: +`/opt/postgres/src/postgres`), or updated with `git pull` if the +directory already exists (e.g., if you are re-deploying). + +### Build customisation + +By default, TPA will configure and build Postgres with debugging +enabled and sensible defaults in `postgres_build_dir` (default: +`/opt/postgres/build/postgres`). 
You can change various settings to +customise the build: + +```yaml +cluster_vars: + postgres_extra_configure_env: + CFLAGS: "-O3" + postgres_extra_configure_opts: + - --with-llvm + - --disable-tap-tests +``` + +This will run `./configure` with the options in +`postgres_extra_configure_opts` and the settings from +`postgres_extra_configure_env` defined in the environment. Some +options are specified by default (e.g., `--with-debug`), but can be +negated by the corresponding `--disable-xxx` or `--without-xxx` +options. Building `--without-openssl` is not supported. + +If required, you can also change the following default build commands: + +```yaml +cluster_vars: + postgres_make_command: "make -s" + postgres_build_targets: + - "all" + - "-C contrib all" + postgres_install_targets: + - "install" + - "-C contrib install" +``` + +Run `tpaexec deploy … --skip-tags build-clean` in order to reuse the +build directory when doing repeated deploys. (Otherwise the old build +directory is emptied before starting the build.) You can also configure +[local source directories](../configure-source#local-source-directories) +to speed up your development builds. + +Whenever you run a source build, Postgres will be restarted. + +## Additional components + +Even if you install Postgres from packages, you can compile and install +extensions from source. There's a separate page about how to configure +[`install_from_source`](install_from_source). + +If you install Postgres from source, however, you will need to install +extensions from source as well, because the extension packages typically +depend on the Postgres package(s) being installed. + +## Package installation + +There's a separate page about +[installing Postgres and Postgres-related packages](postgres_installation_method_pkg) +with `postgres_installation_method: pkg` (the default). diff --git a/product_docs/docs/tpa/23/reference/postgres_tablespaces.mdx b/product_docs/docs/tpa/23/reference/postgres_tablespaces.mdx new file mode 100644 index 00000000000..221a4b5e08c --- /dev/null +++ b/product_docs/docs/tpa/23/reference/postgres_tablespaces.mdx @@ -0,0 +1,53 @@ +--- +title: Creating Postgres tablespaces +originalFilePath: postgres_tablespaces.md + +--- + +To create Postgres tablespaces during deployment, define their names and +locations in `postgres_tablespaces` under `cluster_vars` or a particular +instance's `vars` in config.yml. + +If you [define volumes](volumes) with +`volume_for: postgres_tablespace` set and a `tablespace_name` defined, +they will be added as default entries to `postgres_tablespaces`. + +```yaml +cluster_vars: + postgres_tablespaces: + explicit: + location: /some/path + +instances: +- Name: example + … + volumes: + - device_name: /dev/xvdh + … + vars: + volume_for: postgres_tablespace + tablespace_name: implicit +``` + +The example above would create two tablespaces: explicit (at /some/path) +and implicit (at /opt/postgres/tablespaces/implicit/tablespace_data by +default, unless you specify a different mountpoint for the volume). + +Every `postgres_tablespace` volume must have `tablespace_name` defined; +the tablespace location will be derived from the volume's mountpoint. + +Every entry in `postgres_tablespaces` must specify a tablespace name (as +the key) and its `location`. If you are specifying tablespace locations +explicitly, do not put tablespaces inside PGDATA, and do not use any +volume mountpoint directly as a tablespace location (`lost+found` will +confuse some tools into thinking the directory is not empty). 
+
+By default, the tablespace `owner` is `postgres`, but you can set it to
+any valid username (the users in [`postgres_users`](postgres_users)
+will have been created by this time).
+
+Streaming replicas must have the same `postgres_tablespace` volumes and
+the same `postgres_tablespaces` setting as their upstream instance.
+
+You can set the default tablespace for a database in
+[`postgres_databases`](postgres_databases).
diff --git a/product_docs/docs/tpa/23/reference/postgres_user.mdx b/product_docs/docs/tpa/23/reference/postgres_user.mdx
new file mode 100644
index 00000000000..22f99e6a596
--- /dev/null
+++ b/product_docs/docs/tpa/23/reference/postgres_user.mdx
@@ -0,0 +1,63 @@
+---
+title: The postgres Unix user
+originalFilePath: postgres_user.md
+
+---
+
+This page documents how the postgres user and its home directory are
+configured.
+
+There's a separate page about how to create
+[Postgres users in the database](postgres_users).
+
+## Shell configuration
+
+TPA will install a `.bashrc` file and ensure that it's also included
+by the `.profile` or `.bash_profile` files.
+
+It will set a prompt that includes the username, hostname, and working
+directory, ensure that `postgres_bin_dir` is in the `PATH`, and set
+`PGDATA` to the location of `postgres_data_dir`.
+
+You can optionally specify `extra_bashrc_lines` to append arbitrary
+lines to `.bashrc`. (Use the YAML multi-line string syntax `>-` to avoid
+having to worry about quoting and escaping shell metacharacters.)
+
+```yaml
+cluster_vars:
+  extra_bashrc_lines:
+  - alias la=ls\ -la
+  - >-
+      export PATH="$PATH":/some/other/dir
+```
+
+It will edit sudoers to allow
+`sudo systemctl start/stop/reload/restart/status postgres`, and also
+change `ulimits` to allow unlimited core dumps and raise the file
+descriptor limits.
+
+## SSH keys
+
+TPA will use `ssh-keygen` to generate and install an SSH keypair for
+the postgres user, and edit `.ssh/authorized_keys` so that the instances
+in the cluster can ssh to each other as `postgres`.
+
+## TLS certificates
+
+By default, TPA will generate a private key and a self-signed TLS
+certificate for use within the cluster. If you create `cluster_name.key`
+and `cluster_name.crt` files within your cluster directory, it will use
+that key and certificate instead of generating one.
+
+We strongly recommend either using the self-signed certificate
+(perfectly sufficient to ensure that traffic between clients and server
+is encrypted in transit) or making some more secure alternative
+arrangement to install the TLS private key and certificate on the
+instances (where the private key does not leave the instances). The
+details depend on your certificate-signing infrastructure.
+
+## Username
+
+The `postgres_user` and `postgres_group` settings (both `postgres` by
+default) are used consistently everywhere. You can change them if you
+need to run Postgres as a different user for some reason.
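+
+For example, a minimal sketch of overriding both settings (assuming
+they are set cluster-wide under `cluster_vars`; the names are
+illustrative):
+
+```yaml
+cluster_vars:
+  postgres_user: pgowner
+  postgres_group: pgowner
+```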
diff --git a/product_docs/docs/tpa/23/reference/postgres_users.mdx b/product_docs/docs/tpa/23/reference/postgres_users.mdx new file mode 100644 index 00000000000..ea54bc0c01d --- /dev/null +++ b/product_docs/docs/tpa/23/reference/postgres_users.mdx @@ -0,0 +1,65 @@ +--- +title: Creating Postgres users +originalFilePath: postgres_users.md + +--- + +To create Postgres users during deployment, add entries to the list of +`postgres_users` under `cluster_vars` or a particular instance's +`vars` in config.yml: + +```yaml +cluster_vars: + postgres_users: + - username: example + + - username: otheruser + generate_password: true + role_attrs: + - superuser + - replication + granted_roles: + - r1 + - r2 +``` + +The example above would create two users (apart from any users that +TPA itself decides to create, such as repmgr or barman). + +Each entry must specify the `username` to create. + +Any roles in the `granted_roles` list will be granted to the +newly-created user. + +The `role_attrs` list may contain certain +[CREATE ROLE options](https://www.postgresql.org/docs/12/sql-createrole.html) +such as `[NO]SUPERUSER`, `[NO]CREATEDB`, `[NO]LOGIN` (to create a +user or a role) etc. + +## Password generation + +By default, TPA will generate a random password for the user, and +store it in a vault-encrypted variable named `_password` in +the cluster's inventory. You can retrieve the value later: + +```bash +$ tpaexec show-password ~/clusters/speedy example +beePh~iez6lie4thi5KaiG%eghaeT]ai +``` + +You cannot explicitly specify a password in config.yml, but you can +store a different `_password` in the inventory instead: + +```bash +$ tpaexec store-password ~/clusters/speedy example --random +$ tpaexec show-password ~/clusters/speedy example +)>tkc}}k1y4&epaJ?;NJ:l'uT{C7D*

- + 'any 2 ("first", "second", "third")' + bdr.global_lock_statement_timeout: 60s +``` + +This is most useful with settings that TPA does not recognise +natively, but you can use it for any parameter (e.g., +`effective_cache_size` can be set as a variable, but +`authentication_timeout` cannot). + +These settings will be written to `conf.d/9900-role-settings.conf`, and +therefore take priority over variables set in any other way. + +If you make changes to values under `postgres_conf_settings`, TPA +has no way to know whether the a reload is sufficient to effect the +changes, or if a restart is required. Therefore it will always restart +the server to activate the changes. This is why it's always preferable +to use variables directly whenever possible. + +## shared_buffers + +By default, TPA will set `shared_buffers` to 25% of the available memory +(this is just a rule of thumb, not a recommendation). You can override this +default by setting `shared_buffers_ratio: 0.35` to use a different proportion, +or by setting `shared_buffers_mb: 796` to a specific number of MB, or by +specifying an exact value directly, e.g., `shared_buffers: "2GB"`. + +## effective_cache_size + +By default, TPA will set `effective_cache_size` to 50% of the available +memory. You can override this default by setting +`effective_cache_size_ratio: 0.35` to use a different proportion, or by setting +`effective_cache_size_mb: 796` to a specific number of MB, or by specifying an +exact value directly, e.g., `effective_cache_size: "8GB"`. + +## shared_preload_libraries + +TPA maintains an internal list of extensions that require entries in +`shared_preload_libraries` to work, and if you include any such +extensions in `postgres_extensions`, it will automatically update +`shared_preload_libraries` for you. + +If you are using unrecognised extensions that require preloading, you +can add them to `preload_extensions`: + +```yaml +cluster_vars: + preload_extensions: + - myext + - otherext +``` + +Now if you add `myext` to `postgres_extensions`, +`shared_preload_libraries` will include `myext`. + +By default, `shared_preload_libraries` is set in +`conf.d/8888-shared_preload_libraries.conf`. + +Setting `shared_preload_libraries` directly as a variable is not +supported. You should not normally need to set it, but if unavoidable, +you can set a fully-quoted value under +[`postgres_conf_settings`](#postgres_conf_settings). In this case, the +value is set in `conf.d/9900-tpa_postgres_conf_settings.conf`. + +## Making changes by hand + +There are two ways you can override anything in the TPA-generated +configuration. + +The first (and recommended) option is to use `ALTER SYSTEM`, which +always takes precedence over anything in the configuration files: + +```sql +# ALTER SYSTEM SET bdr.global_lock_statement_timeout TO '60s'; +``` + +You can also edit `conf.d/9999-override.conf`: + +```bash +$ echo "bdr.global_lock_statement_timeout='60s'" >> conf.d/9999-override.conf +``` + +All other files under `conf.d` are subject to be overwritten during +deployment if the configuration changes, but TPA will never change +`9999-override.conf` after initially creating the empty file. + +Depending on which settings you change, you may need to execute +`SELECT pg_reload_conf()` or restart the server for the changes to take +effect. + +## Generating postgresql.conf from scratch + +By default, TPA will leave the default (i.e., `initdb`-generated) +postgresql.conf file alone other than adding the `include_dir`. 
You +should not ordinarily need to override this behaviour, but you can set +`postgres_conf_template` to do so: + +```yaml +cluster_vars: + postgres_conf_template: 'pgconf.j2' +``` + +Now the `templates/pgconf.j2` in your cluster directory will be used to +generate postgresql.conf. diff --git a/product_docs/docs/tpa/23/reference/python.mdx b/product_docs/docs/tpa/23/reference/python.mdx new file mode 100644 index 00000000000..b7302f4e47e --- /dev/null +++ b/product_docs/docs/tpa/23/reference/python.mdx @@ -0,0 +1,48 @@ +--- +title: Python environment +originalFilePath: python.md + +--- + +TPA decides which Python interpreter to use based on the +[distribution it detects](distributions) on a target instance. It +will use Python 3 wherever possible, and fall back to Python 2 only when +unavoidable. + +The `tpaexec configure` command will set `preferred_python_version` +according to the distribution. + +| Distribution | Python 2 | Python 3 | +| ------------------- | -------- | -------- | +| Debian 10/buster | ✓ | ✓ (3.7) | +| Debian 9/stretch | ✓ | ✓ (3.5) | +| Debian 8/jessie | ✓ | ✗ (3.4) | +| Ubuntu 16.04/xenial | ✓ | ✓ (3.5) | +| Ubuntu 18.04/bionic | ✓ | ✓ (3.6) | +| Ubuntu 20.04/focal | ✗ | ✓ (3.8) | +| Ubuntu 22.04/jammy | ✗ | ✓ (3.10) | +| RHEL 7.x | ✓ | ✗ (3.6) | +| RHEL 8.x | ✗ | ✓ (3.6) | + +Ubuntu 20.04, 22.04 and RHEL 8.x can be used only with Python 3. + +RHEL 7.x ships with Python 3.6, but the librpm bindings for Python 3 are +not available, so TPA must use Python 2 instead. Debian 8 does not +have the Python 3.5+ required to support Ansible. + +You can decide for other distributions whether you prefer `python2` or +`python3`, but the default for new clusters is `python3`. + +## Backwards compatibility + +For compatibility with existing clusters, the default value of +`preferred_python_version` is `python2`, but you can explicitly choose +`python3` even on systems that were already deployed with `python2`. + +```yaml +cluster_vars: + preferred_python_version: python3 +``` + +TPA will ignore this setting on distributions where it cannot use +Python 3. diff --git a/product_docs/docs/tpa/23/reference/repmgr.mdx b/product_docs/docs/tpa/23/reference/repmgr.mdx new file mode 100644 index 00000000000..3477c15b8a8 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/repmgr.mdx @@ -0,0 +1,37 @@ +--- +title: Configuring repmgr +originalFilePath: repmgr.md + +--- + +TPA will install repmgr on all postgres instances that have the +`failover_manager` instance variable set to `repmgr`; this is the +default setting. + +The directory of the `repmgr` configuration file defaults to +`/etc/repmgr/`, where `` is the major version +of postgres being installed on this instance, but can be +changed by setting the `repmgr_conf_dir` variable for the instance. +The configuration file itself is always called `repmgr.conf`. + +The default repmgr configuration will set up automatic failover +between instances configured with the role `primary` and the role +`replica`. 
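+
+For example, a minimal sketch of selecting repmgr explicitly and
+changing the configuration directory for one instance (the directory
+path is illustrative):
+
+```yaml
+cluster_vars:
+  failover_manager: repmgr
+
+instances:
+- Name: one
+  vars:
+    repmgr_conf_dir: /opt/repmgr/conf
+```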
+
+## repmgr configuration
+
+The following instance variables can be set:
+
+- `repmgr_priority`: sets `priority` in the config file
+- `repmgr_location`: sets `location` in the config file
+- `repmgr_reconnect_attempts`: sets `reconnect_attempts` in the config file, default `6`
+- `repmgr_reconnect_interval`: sets `reconnect_interval` in the config file, default `10`
+- `repmgr_use_slots`: sets `use_replication_slots` in the config file, default `1`
+- `repmgr_failover`: sets `failover` in the config file, default `automatic`
+
+Any extra settings in `repmgr_conf_settings` will also be passed through
+into the repmgr config file.
+
+## repmgr on BDR instances
+
+On BDR instances, `repmgr_failover` will be set to `manual` by default.
diff --git a/product_docs/docs/tpa/23/reference/ssh_key_file.mdx b/product_docs/docs/tpa/23/reference/ssh_key_file.mdx
new file mode 100644
index 00000000000..652e293105e
--- /dev/null
+++ b/product_docs/docs/tpa/23/reference/ssh_key_file.mdx
@@ -0,0 +1,21 @@
+---
+title: ssh_key_file
+originalFilePath: ssh_key_file.md
+
+---
+
+By default, `tpaexec provision` will use `ssh-keygen` to generate a new
+SSH keypair for the cluster (into files named `id_cluster_name` and
+`id_cluster_name.pub` inside the cluster directory).
+
+If you want to use an existing key instead, you can set `ssh_key_file`
+at the top level of config.yml to the location of an SSH private key
+file. The corresponding public key must be available with an extension
+of `.pub` at the same location:
+
+```yaml
+ssh_key_file: ~/.ssh/id_rsa
+```
+
+(If this file does not already exist, it will be created by `ssh-keygen`
+during provisioning.)
diff --git a/product_docs/docs/tpa/23/reference/sysctl_values.mdx b/product_docs/docs/tpa/23/reference/sysctl_values.mdx
new file mode 100644
index 00000000000..d2409e03b63
--- /dev/null
+++ b/product_docs/docs/tpa/23/reference/sysctl_values.mdx
@@ -0,0 +1,25 @@
+---
+title: Setting sysctl values
+originalFilePath: sysctl_values.md
+
+---
+
+By default, TPA sets various sysctl values on target instances, and
+includes them in `/etc/sysctl.conf` so that they persist across reboots.
+
+You can optionally specify your own values in `sysctl_values`:
+
+```yaml
+cluster_vars:
+  sysctl_values:
+    kernel.core_pattern: core.%e.%p.%t
+    vm.dirty_bytes: 4294967296
+    vm.zone_reclaim_mode: 0
+```
+
+Any values you specify will take precedence over TPA's default
+values for that variable (if any). The settings will first be added to
+`sysctl.conf` line-by-line, and finally loaded with `sysctl -p`.
+
+Docker and lxd instances do not support setting sysctls, so TPA will
+skip this step altogether for those platforms.
diff --git a/product_docs/docs/tpa/23/reference/target_environment.mdx b/product_docs/docs/tpa/23/reference/target_environment.mdx
new file mode 100644
index 00000000000..6343c58f39d
--- /dev/null
+++ b/product_docs/docs/tpa/23/reference/target_environment.mdx
@@ -0,0 +1,23 @@
+---
+title: Environment
+originalFilePath: target_environment.md
+
+---
+
+You can set `target_environment` to specify environment variables that
+TPA should set on the target instances during deployment (e.g., to
+specify an HTTPS proxy, as shown below).
+
+```yaml
+cluster_vars:
+  target_environment:
+    https_proxy: https://proxy.example:8080
+```
+
+TPA will ensure these settings are present in the environment (along
+with any others it needs) during deployment and the later execution of
+any cluster management commands. 
+ +These environment settings are not persistent, but you can instead use +[`extra_bashrc_lines`](postgres_user) to set environment variables +for the postgres user. diff --git a/product_docs/docs/tpa/23/reference/tpa_2q_repositories.mdx b/product_docs/docs/tpa/23/reference/tpa_2q_repositories.mdx new file mode 100644 index 00000000000..b215f2bdfcc --- /dev/null +++ b/product_docs/docs/tpa/23/reference/tpa_2q_repositories.mdx @@ -0,0 +1,41 @@ +--- +title: Configuring 2ndQuadrant repositories +originalFilePath: tpa_2q_repositories.md + +--- + +This page explains how to configure 2ndQuadrant package repositories on +any system. + +For more details on the EDB and 2ndQuadrant package sources used by +TPA see [this page](2q_and_edb_repositories). + +To specify the complete list of 2ndQuadrant repositories to install on +each instance in addition to the 2ndQuadrant public repository, set +`tpa_2q_repositories` to a list of 2ndQuadrant repository names: + +```yaml +cluster_vars: + tpa_2q_repositories: + - products/pglogical3/release + - products/bdr3/release +``` + +This example will install the pglogical3 and bdr3 release repositories. +On Debian and Ubuntu systems, it will use the APT repository, and on +RedHat systems, it will use the YUM repository. + +To use +[2ndQuadrant repositories](https://techsupport.enterprisedb.com/customer_portal/sw/), +you must `export TPA_2Q_SUBSCRIPTION_TOKEN=xxx` before you run +tpaexec. You can get your subscription token from the 2ndQuadrant +Portal, under "Company info" in the left menu, then "Company". Some +repositories are available only by prior arrangement. + +The `dl/default/release` repository is always installed by default, +unless you + +- explicitly set `tpa_2q_repositories: []`, or +- have at least one entry in `edb_repositories`. + +Either or the above will result in no 2ndQuadrant repositories being installed. diff --git a/product_docs/docs/tpa/23/reference/tpaexec-archive-logs.mdx b/product_docs/docs/tpa/23/reference/tpaexec-archive-logs.mdx new file mode 100644 index 00000000000..3afca083ac7 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/tpaexec-archive-logs.mdx @@ -0,0 +1,101 @@ +--- +title: tpaexec archive-logs +originalFilePath: tpaexec-archive-logs.md + +--- + +To create a log directory and archive logs from instances, run + +```bash +tpaexec archive-logs +``` + +This will create a logs/YYYYMMDDHHMMss/ directory in your cluster directory +and download a tar.gz archive of all the files under /var/log on each instance +in the cluster into a separate directory. + +## Prerequisites + +If you have an existing cluster you can run `tpaexec archive-logs` +immediately. But if you are configuring a new cluster, you must at least +[provision](../tpaexec-provision) the cluster. You will get more logs if +you also [deploy](../tpaexec-deploy) the cluster. + +## Quickstart + +```bash +[tpa]$ tpaexec archive-logs ~/clusters/speedy + +PLAY [Prepare local host archive] ******************************************* + +TASK [Collect facts] ******************************************************** +ok: [localhost] + +TASK [Set time stamp] ******************************************************* +ok: [localhost] + +TASK [Create local log archive directory] *********************************** +changed: [localhost] + +PLAY [Archive log files from target instances] ****************************** + +... 
+ +TASK [Remove remote archives] *********************************************** +changed: [kinship] +changed: [khaki] +changed: [uncivil] +changed: [urchin] + +PLAY RECAP ****************************************************************** +khaki : ok=3 changed=3 unreachable=0 failed=0 +kinship : ok=3 changed=3 unreachable=0 failed=0 +localhost : ok=3 changed=1 unreachable=0 failed=0 +uncivil : ok=3 changed=3 unreachable=0 failed=0 +urchin : ok=3 changed=3 unreachable=0 failed=0 +``` + +You can append `-v`, `-vv`, etc. to the command if you want more verbose output. + +## Generated files + +You can find the logs for each instance under the cluster directory: + +```bash +~/clusters/speedy/logs/ +`-- 220220306T185049 + |-- khaki-logs-20220306T185049.tar.gz + |-- kinship-logs-20220306T185049.tar.gz + |-- uncivil-logs-20220306T185049.tar.gz + `-- urchin-logs-20220306T185049.tar.gz +``` + +Archive contents example: + +```bash +khaki-logs +|-- anaconda +| |-- anaconda.log +| |-- dbus.log +| |-- dnf.librepo.log +| |-- hawkey.log +| |-- journal.log +| |-- ks-script-ipdkisn0.log +| |-- ks-script-jr03uzns.log +| |-- ks-script-mh2iidvh.log +| |-- lvm.log +| |-- packaging.log +| |-- program.log +| |-- storage.log +| |-- syslog +| `-- X.log +|-- btmp +|-- dnf.librepo.log +|-- dnf.log +|-- dnf.rpm.log +|-- hawkey.log +|-- lastlog +|-- private +|-- tpaexec.log +`-- wtmp +``` diff --git a/product_docs/docs/tpa/23/reference/tpaexec-commands.mdx b/product_docs/docs/tpa/23/reference/tpaexec-commands.mdx new file mode 100644 index 00000000000..7f3aec312eb --- /dev/null +++ b/product_docs/docs/tpa/23/reference/tpaexec-commands.mdx @@ -0,0 +1,53 @@ +--- +title: TPA custom commands +originalFilePath: tpaexec-commands.md + +--- + +You can define custom commands that perform tasks specific to your +environment on the instances in a TPA cluster. + +You can use this mechanism to automate any processes that apply to your +cluster. These commands can be invoked against your cluster directory, +like any built-in cluster management command. Having a uniform way to +define and run such processes reduces the likelihood of errors caused by +misunderstandings and operator error, or process documentation that was +correct in the past, but has drifted away from reality since then. + +Writing Ansible playbooks means that you can implement arbitrarily +complex tasks; following the custom command conventions means you can +take advantage of various facts that are set based on your config.yml +and the cluster discovery tasks that TPA performs, and not have to +think about details like connections, authentication, and other basic +features. + +This makes it much easier to write resilient, idempotent commands in a +way that ad-hoc shell scripts (could be, but) usually aren't. + +## Quickstart + +- Create `commands/mycmd.yml` within your cluster directory +- Run `tpaexec mycmd /path/to/cluster` + +## Example + +Here's an example of a command that runs a single command on all +instances in the cluster. Depending on the use-case, you can write +commands that target different hosts (e.g., `hosts: role_postgres` to +run only on Postgres instances), or run additional tasks and evaluate +conditions to determine exactly what to do. 
+ +```yaml +--- +# Always start with this +- import_playbook: "{{ tpa_dir }}/architectures/lib/init.yml" + tags: always + +- name: Perform custom command tasks + hosts: all + tasks: + - name: Display last five lines of syslog + command: tail -5 /var/log/syslog + become_user: root + become: yes +``` diff --git a/product_docs/docs/tpa/23/reference/tpaexec-download-packages.mdx b/product_docs/docs/tpa/23/reference/tpaexec-download-packages.mdx new file mode 100644 index 00000000000..e10d9e2d86b --- /dev/null +++ b/product_docs/docs/tpa/23/reference/tpaexec-download-packages.mdx @@ -0,0 +1,63 @@ +--- +title: tpaexec download-packages +originalFilePath: tpaexec-download-packages.md + +--- + +The purpose of the downloader is to provide the packages required to do +a full installation of a TPA cluster from an existing configuration. +This is useful when you want to ship packages to secure clusters that do +not have internet access, or avoid downloading packages repeatedly for +test clusters. + +The downloader will download the full dependency tree of packages +required, and the resulting package repository will include metadata +files for the target distribution package manager, so can be used +exclusively to build clusters. At this time package managers Apt and YUM +are supported. + +## Usage + +An existing cluster configuration needs to exist which can be achieved +using the `tpaexec configure` command. No specific options are required +to use the downloader. See [configuring a cluster](../configure-cluster) +. + +Execute the download-packages subcommand to start the download process. +Provide the OS and OS version that should be used by the downloader. + +```shell +tpaexec download-packages cluster-dir --os RedHat --os-version 8 +``` + +This can also be expressed as a specific docker image. It is strongly +recommended that you use one of the tpa images prefixed like the example +below. + +```shell +tpaexec download-packages cluster-dir --docker-image tpa/redhat:8 +``` + +The downloader will place files downloaded in the directory `local-repo` +by default. It is possible to download to alternative directory by using +the option `--download-dir path`. + +## Using the result + +The contents of the `local-repo` directory is populated with a structure +determined by ansible according to the OS contained in the docker image. +For example, the docker image `tpa/redhat:8` would have the following: + +``` +cluster-dir/ +`-- local-repo + `-- RedHat + `-- 8 + |-- *.rpm + `-- repodata + `-- *repodata-files* +``` + +You can use this in the cluster as is or copy it to a target control +node. See [recommendations for installing to an air-gapped environment](air-gapped). A [local-repo](local-repo) will be detected and used +automatically by TPA. diff --git a/product_docs/docs/tpa/23/reference/tpaexec-support.mdx b/product_docs/docs/tpa/23/reference/tpaexec-support.mdx new file mode 100644 index 00000000000..994710a9489 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/tpaexec-support.mdx @@ -0,0 +1,38 @@ +--- +title: TPA capabilities and supported software +originalFilePath: tpaexec-support.md + +--- + +- [Python requirements](python) +- [Supported distributions](distributions) + +## Supported software + +TPA can install and configure the following major components. 
+ +- Postgres 14, 13, 12, 11, 10 + +- EPAS (EDB Postgres Advanced Server) 14, 13, 12 + +- BDR 4, 3.7, 3.6, and earlier versions (deprecated) + +- pglogical 3, 2 (open source) + +- HARP 2 + +- repmgr + +- Barman + +- pgbouncer + +- haproxy (supported only for BDR 3.x) + +- Failover Manager (EFM) + +Support for the following components is in development. + +- pgbackrest + +- Postgres Enterprise Manager (PEM) diff --git a/product_docs/docs/tpa/23/reference/tpaexec-tests.mdx b/product_docs/docs/tpa/23/reference/tpaexec-tests.mdx new file mode 100644 index 00000000000..4d704925d92 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/tpaexec-tests.mdx @@ -0,0 +1,121 @@ +--- +title: TPA custom tests +originalFilePath: tpaexec-tests.md + +--- + +You can easily define in-depth tests specific to your environment and +application to augment TPA's [builtin tests](../tpaexec-test). + +We strongly recommend writing tests for any tasks, no matter how simple, +that you would run on your cluster to reassure yourself that everything +is working as you expect. Having a uniform and repeatable way to run +such tests ensures that you don't miss out on anything important, +whether you're dealing with a crisis or just doing routine cluster +management. + +If you write tests that target cluster instances by their configured +role (or other properties), you can be sure that all applicable tests +will be run on the right instances. No need to look up or remember how +many replicas to check the replication status on, nor which servers are +running pgbouncer, or any other such details that are an invitation to +making mistakes when you are checking things by hand. + +Tests must not make any significant changes to the cluster. If it's not +something you would think of doing on a production server, it probably +shouldn't be in a test. + +## Quickstart + +- Create `tests/mytest.yml` within your cluster directory +- Run `tpaexec test /path/to/cluster mytest` + +You can also create tests in some other location and use them across +clusters with the `--include-tests-from /other/path` option to +`tpaexec test`. + +(Run `tpaexec help test` for usage information.) + +## Example + +Here's how to write a test that is executed on all Postgres instances +(note `hosts: role_postgres` instead of `hosts: all`). + +You can use arbitrary Ansible tasks to collect information from the +cluster and perform tests. Just write tasks that will fail if some +expectation is not met (`assert`, `fail … when`, etc.). + +```yaml +--- +- name: Perform my custom tests + hosts: role_postgres + tasks: + + # Always start with this + - include_role: + name: test + tasks_from: prereqs.yml + + # Make sure that the PGDATA/PG_VERSION file exists. (This is just a + # simplified example, not something that actually needs testing.) + - name: Perform simple test + command: "test -f {{ postgres_data_dir }}/PG_VERSION" + become_user: "{{ postgres_user }}" + become: yes + + - name: Run pg_controldata + command: > + {{ postgres_bin_dir }}/pg_controldata {{ postgres_data_dir }} + register: controldata + become_user: "{{ postgres_user }}" + become: yes + + # Write output to clusterdir/$timestamp/$hostname/pg_controldata.txt + - name: Record pg_controldata output + include_role: + name: test + tasks_from: output.yml + vars: + output_file: pg_controldata.txt + content: | + {{ controldata.stdout }} +``` + +You can use the builtin `output.yml` as shown above to record arbitrary +test output in a timestamped test directory in your cluster directory. 
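As a further illustration of targeting instances by role, here is a minimal sketch (not taken from TPA itself) of a test that checks that every streaming replica is in recovery. It assumes that your inventory provides a `role_replica` group analogous to the `role_postgres` group used above, and that `psql` can connect over the local socket as the postgres user:

```yaml
---
- name: Check that all replicas are in recovery
  hosts: role_replica
  tasks:

    # Always start with this
    - include_role:
        name: test
        tasks_from: prereqs.yml

    # Ask each server whether it is currently in recovery
    - name: Query recovery status
      command: >
        {{ postgres_bin_dir }}/psql -tAc "SELECT pg_is_in_recovery()"
      register: recovery
      become_user: "{{ postgres_user }}"
      become: yes

    # Fail the test if any targeted instance is not in recovery
    - name: Assert that the instance is in recovery
      assert:
        that: recovery.stdout | trim == 't'
```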
+ +Each test must be a complete Ansible playbook (i.e., a list of plays, +not just a list of tasks). It will be imported and executed after the +basic TPA setup tasks. + +## Destructive tests + +Tests should not, by default, make any significant changes to a cluster. +(Even if they do something like creating a table to test replication, +they must be careful to clean up after themselves.) + +Any test that makes changes to a cluster that would be unacceptable on a +production cluster MUST be marked as `destructive`. These may be tests +that you run only in development, or during the initial cluster "burn +in" process. + +You can define "destructive" tests by setting `destructive: yes` when +including `prereqs.yml` in your test: + +```yaml +- hosts: … + tasks: + - include_role: + name: test + tasks_from: prereqs.yml + vars: + destructive: yes +``` + +If someone then runs `tpaexec test /path/to/cluster mytest`, they will +get an error asking them to confirm execution using the +`--destroy-this-cluster` option. + +(Note: using `--destroy-this-cluster` signifies an awareness of the risk +of running the command. It does not guarantee that the test will +actually destroy the cluster.) diff --git a/product_docs/docs/tpa/23/reference/volumes.mdx b/product_docs/docs/tpa/23/reference/volumes.mdx new file mode 100644 index 00000000000..48435f25778 --- /dev/null +++ b/product_docs/docs/tpa/23/reference/volumes.mdx @@ -0,0 +1,318 @@ +--- +title: Filesystem configuration +originalFilePath: volumes.md + +--- + +TPA allows you to define a list of `volumes` attached to each +instance. + +This list comprises both platform-specific settings that are used during +provisioning and filesystem-level settings used during deployment. + +First, `tpaexec provision` will use the information to create and +attach volumes to the instance (if applicable; see platform-specific +sections below for details). Then it will write a simplified list of +volumes (containing only non-platform-specific settings) as a host var +for the instance. Finally, `tpaexec deploy` will act on the simplified +list to set up and mount filesystems, if required. + +Here's a moderately complex example from an AWS cluster: + +```yaml +instances: +- Name: one + … + volumes: + - device_name: root + volume_type: gp2 + volume_size: 32 + - raid_device: /dev/md0 + device_name: /dev/xvdf + volume_type: io2 + volume_size: 64 + raid_units: 2 + raid_level: 1 + iops: 5000 + vars: + volume_for: postgres_data + encryption: luks + - raid_device: /dev/md1 + device_name: /dev/xvdh + ephemeral: ephemeral0 + raid_units: all + vars: + mountpoint: /mnt/scratch +``` + +In this example, the EC2 instance will end up with a 32GB EBS root +volume, a 64GB RAID-1 volume comprising two provisioned-iops EBS volumes +mounted as /opt/postgres/data, and a /tmp/scratch filesystem comprising +all available instance-store (“ephemeral”) volumes, whose number and +size are determined by the instance type. + +The details are documented in the section on AWS below, but settings +like `volume_type` and `volume_size` are used during provisioning, while +settings under `vars` like `volume_for` or `mountpoint` are written to +the inventory for use during deployment. + +## default_volumes + +Volumes are properties of an instance. You cannot set them in +`cluster_vars`, because they contain platform-specific settings. + +The +[`instance_defaults`](../configure-cluster#instance_defaults) +mechanism makes special allowances for volume definitions. 
Since volume +definitions in a large cluster may be quite repetitive (especially since +we recommend that instances in a cluster be configured as close to each +other as possible, you can specify `default_volumes` as shown here: + +```yaml +instance_defaults: + default_volumes: + - device_name: root + volume_type: gp2 + volume_size: 32 + - device_name: /dev/xvdf + volume_size: 100 + +instances: +- Name: one + … +- Name: two + volumes: + - device_name: /dev/xvdf + volume_size: 64 + - device_name: /dev/xvdg + volume_size: 64 + … +- Name: three + volumes: + - device_name: /dev/xvdf + volume_type: none +- Name: four + volumes: [] +``` + +Here every instance will have a 32GB root volume and a 100GB additional +volume by default (as is the case for instance `one`, which does not +specify anything different). Instance `two` will have the same root +volume, but it overrides `/dev/xvdf` to be 64GB instead, and has another +64GB volume in addition. Instance `three` will have the same root +volume, but no additional volume because it sets `volume_type: none` for +the default `/dev/xvdf`. Instance `four` will have no volumes at all. + +An instance starts off with whatever is specified in `default_volumes`, +and its `volumes` entries can override a default entry with the same +`device_name`, remove a volume by setting `volume_type` to `none`, add +new volumes with different names, or reject the defaults altogether. + +(This behaviour of merging two lists is specific to `default_volumes`. +If you set any other list in both `instance_defaults` and `instances`, +the latter will override the former completely.) + +## Platform AWS + +On AWS EC2 instances, you can attach EBS volumes. + +```yaml +instances: +- Name: one + … + volumes: + - device_name: root + volume_type: gp2 + volume_size: 32 + encrypted: yes + … + - device_name: /dev/xvdf + volume_type: io1 + volume_size: 32 + iops: 10000 + delete_on_termination: false + … + - device_name: /dev/xvdg + ephemeral: ephemeral0 + … +``` + +TPA translates a `device_name` of `root` to `/dev/sda` or +`/dev/xvda` based on the instance type, so that you don't need to +remember (or change) which one to use. + +The `volume_type` specifies the EBS volume type, e.g., `gp2` (for +“general-purpose” EBS volumes), `io1` for provisioned-IOPS volumes (in +which case you must also set `iops: 5000`), etc. + +The `volume_size` specifies the size of the volume in gigabytes. + +Set `encrypted: yes` to enable EBS encryption at rest. (This is an AWS +feature, enabled by default in newly-generated TPA configurations, +and is different from [LUKS encryption](#luks-encryption), explained +below.) + +Set `delete_on_termination` to `false` to prevent the volume from being +destroyed when the attached instance is terminated (which is the default +behaviour). + +Set `ephemeral: ephemeralN` to use a physically-attached +[instance store volume](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html), +formerly known as an ephemeral volume. The number, type, and size of +available instance store volumes depends on the instance type. Not all +instances have instance store volumes. Use instance store volumes only +for testing or temporary data, and EBS volumes for any data that you +care about. + +For an EBS volume, you can also set `snapshot: snap-xxxxxxxx` to attach +a volume from an existing snapshot. Volumes restored from snapshots may +be extraordinarily slow until enough data has been read from S3 and +cached locally. 
(In particular, you can spin up a new instance with +`PGDATA` from a snapshot, but expect it to take several hours before it +is ready to handle your full load.) + +If you set `attach_existing: yes` for a volume, and there is an existing +unattached EBS volume with matching Name/type/size/iops, a new volume +will not be created when launching the instance, but instead the +existing one will be attached to the instance the first time it starts. +Reattached EBS volumes do not suffer from the performance limitations of +volumes created from snapshots. + +## Platform bare + +TPA has no control over what volumes may be attached to +pre-provisioned `bare` instances, but if you define `volumes` with the +appropriate `device_name`, it will handle `mkfs` and `mount` for the +devices if required. + +## Platform Docker + +Docker containers can have attached volumes, but they are bind-mounted +directories, not regular block devices. They do not need to be +separately initialised or mounted. As such, the configuration looks +quite different. + +```yaml +instances: +- Name: one + platform: docker + … + volumes: + - /host/path/to/dir:/tmp/container/path:ro + - named_volume:/mnt/somevol:rw +``` + +You may recognise these volume specifications as arguments to +`docker run -v`. + +The volumes are attached when the container is created, and there are no +further actions during deployment. + +## RAID arrays + +On AWS EC2 instances, you can define RAID volumes: + +```yaml +instances: +- Name: one + … + volumes: + - raid_device: /dev/md0 + device_name: /dev/xvdf + raid_units: 2 + raid_level: 1 + volume_type: gp2 + volume_size: 100 + vars: + volume_for: postgres_data +``` + +This example will attach 4×100GB EBS gp2 volumes (`/dev/xvd[f-i]`) and +assemble them into a RAID-1 volume named `/dev/md0`. The handling of +`volume_for` or `mountpoint` during deployment happens as with any other +volume. + +TPA does not currently support the creation and assembly of RAID +arrays on other platforms, but you can use an existing array by adding +an entry to volumes with `device_name: /dev/md0` or `/dev/mapper/xyz`. +TPA will handle `mkfs` and `mount` as with any other block device. + +## LUKS encryption + +TPA can set up a LUKS-encrypted device: + +```yaml +instances: +- Name: one + … + volumes: + - device_name: /dev/xyz + vars: + encryption: luks + luks_volume: mappedname + volume_for: … +``` + +If a volume with `encryption: luks` set is not already initialised, +TPA will use `cryptsetup` to first `luksFormat` and then `luksOpen` +it to map it under `/dev/mapper/mappedname` before handling filesystem +creation as with any other device. + +(To avoid any possibility of data loss, TPA will refuse to set up +LUKS encryption on a device that contains a valid filesystem already.) + +If you create a LUKS-encrypted `volume_for: postgres_data`, TPA will +configure Postgres to not start automatically at boot. You can use +`tpaexec start-postgres clustername` to mount the volume and start +Postgres (and `stop-postgres` to stop Postgres and unmap the volume). + +The LUKS passphrase is generated locally and stored in the vault. + +## Filesystem creation and mounting + +If any `device` does not contain a valid filesystem, it will be +initialised with `mkfs`. 
+ +```yaml +instances: +- Name: one + … + volumes: + - device_name: /dev/xyz + vars: + volume_for: … + fstype: ext4 + fsopts: + - -cc + - -m 2 + mountopts: 'defaults,relatime,nosuid' + readahead: 65536 + owner: root + group: root + mode: 0755 +``` + +You can specify the `fstype` (default: ext4), `fsopts` to be passed to +mkfs (default: none), and `mountopts` to be passed to mount and written +to fstab (see below). + +TPA will set the readahead for the device to 16MB by default (and +make the value persist across reboots), but you can specify a different +value for the volume as shown above. + +There are two ways to determine where a volume is mounted. You can +either specify a `mountpoint` explicitly, or you can set `volume_for` to +`postgres_data`, `postgres_wal`, `postgres_tablespace` or `barman_data`, +and TPA will translate the setting into an appropriate mountpoint +for the system. + +Once the `mountpoint` is determined, the `device` will be mounted there +with the given `mountopts` (default: `defaults,noatime`). An entry will +also be created for the filesystem in `/etc/fstab`. + +You may optionally specify `owner`, `group`, or `mode` for the volume, +and these attributes will be set on the `mountpoint`. Remember that at +this very early stage of deployment, you cannot count on the `postgres` +user to exist. In any case, TPA will (separately) ensure that any +directories needed by Postgres have the right ownership and permissions, +so you don't have to do it yourself. diff --git a/product_docs/docs/tpa/23/reference/yum_repositories.mdx b/product_docs/docs/tpa/23/reference/yum_repositories.mdx new file mode 100644 index 00000000000..52355b98d1a --- /dev/null +++ b/product_docs/docs/tpa/23/reference/yum_repositories.mdx @@ -0,0 +1,72 @@ +--- +title: Configuring YUM repositories +originalFilePath: yum_repositories.md + +--- + +This page explains how to configure YUM package repositories on RedHat +systems. + +You can define named repositories in `yum_repositories`, and decide +which ones to use by listing the names in `yum_repository_list`: + +```yaml +cluster_vars: + yum_repositories: + Example: + rpm_url: >- + https://repo.example.com/repos/Example/example-repo.rpm + + Other: + description: "Optional repository description" + baseurl: https://other.example.com/repos/Other/$basearch + gpgkey: + https://other.example.com/repos/Other/gpg.XXXXXXXXXXXXXXXX.key + + yum_repository_list: + - EPEL + - PGDG + - Example + - Other +``` + +This example shows two ways to define a YUM repository. + +If the repository has a “repo RPM” (a package that customarily installs +the necessary `/etc/yum.repos.d/*.repo` file and any GPG keys needed to +verify signed packages from the repository), you can just point to it. + +Otherwise, you can specify a description, a `baseurl`, and a `gpgkey` +URL, and TPAexec will create a `/etc/yum.repos.d/Other.repo` file for +you based on this information. + +The EPEL and PGDG repositories are defined by default. The EPEL +repository is required for correct operation, so you must always +include EPEL in `yum_repository_list`. You should also include PGDG if +you want to install PGDG packages. + +You can set `yum_repository_list: []` to not install any repositories +(but things will break without an alternative source of EPEL packages). 
+ +If you need to perform any special steps to configure repository access, +you can use a [pre-deploy hook](../tpaexec-hooks) to create the .repo +file yourself: + +```yaml +- name: Define Example repository + copy: + dest: /etc/yum.repos.d/example.repo + owner: root + group: root + mode: 0644 + content: | + [example] + name=Example repo + baseurl=https://repo.example.com/repos/Example/ + enabled=1 + gpgkey=https://repo.example.com/repokey.asc + gpgcheck=1 +``` + +In this case, you do not need to list the repository in +`yum_repository_list`. diff --git a/product_docs/docs/tpa/23/rel_notes/index.mdx new file mode 100644 index 00000000000..aacb4d917d8 --- /dev/null +++ b/product_docs/docs/tpa/23/rel_notes/index.mdx @@ -0,0 +1,10 @@ +--- +title: "Release notes" +--- + +The Trusted Postgres Architect documentation describes the latest version of Trusted Postgres Architect 23. + +| Version | Release date | +| ------- | ------------ | +| [23.12.0](tpa_23.12.0_rel_notes) | 21 Feb 2023 | + diff --git a/product_docs/docs/tpa/23/rel_notes/tpa_23.12.0_rel_notes.mdx new file mode 100644 index 00000000000..e78c7ecd5b6 --- /dev/null +++ b/product_docs/docs/tpa/23/rel_notes/tpa_23.12.0_rel_notes.mdx @@ -0,0 +1,28 @@ +--- +title: Trusted Postgres Architect 23.12.0 release notes +navTitle: "Version 23.12.0" +--- + + +New features, enhancements, bug fixes, and other changes in Trusted Postgres Architect 23.12.0 include the following: + +| Type | Description | +| ---- |------------ | +| Feature | Introduce full support for EDB Postgres Distributed 5, including Commit At Most Once (CAMO) configuration support based on commit scopes. | +| Feature | Introduce support for EDB Postgres Extended repository and packages. | +| Enhancement |

Preliminary support for configuring multi-region AWS clusters.

Multi-region clusters require manual setup of VPCs and VPC peering.

| +| Enhancement | Enable proxy routing (and, therefore, subgroup RAFT) automatically for `--active-locations`. Removes the configure option to enable subgroup RAFT globally. | +| Bug fix | Ensure the EDB_SUBSCRIPTION_TOKEN is not logged.| +| Bug fix | Allow the user to suppress addition of the products/default/release repo to tpa_2q_repositories. | +| Bug fix |

Ensure that nodes subscribe to bdr_child_group, if available.

In clusters with multiple subgroups, TPA did not expect instances to be subscribed to the replication sets for both the top group and the subgroup, so it would incorrectly remove the latter from the node's subscribed replication sets.

| +| Bug fix |

Fail reliably with a useful error if Postgres doesn't start.

Due to an Ansible bug, the deployment wouldn't fail if Postgres did not start on some instances, but did start on others (for example, due to a difference in the configuration). Continuing on with the deployment resulted in errors when trying to access cluster_facts for the failed hosts later.

| +| Bug fix |

Don't call `bdr.alter_node_replication_sets()` on witnesses for BDR 4.3 and later.

This adjusts to a new restriction in BDR versions where witness nodes are not handled with a custom replication set configuration.

| +| Bug fix | Replace hardcoded "barman" references to enable use of the barman_{user,group} settings to customize the barman user and home directory. |

Sort endpoints in pgd-proxy config to avoid file rewrites.

This will likely require a pgd-proxy restart on the next deploy (but it will avoid unnecessary future rewrites/restarts on subsequent deploys).

| +| Bug fix | Fix an error while installing rsync from a local-repo on RH systems. +| Bug fix | Fix an error with Apache WSGI module configuration for PEM 9 on Debian systems. | +| Bug fix | Don't remove the bdr extension if it has been created on purpose, even if it is unused. | + diff --git a/product_docs/docs/tpa/23/tower.mdx b/product_docs/docs/tpa/23/tower.mdx new file mode 100644 index 00000000000..18fa92feea6 --- /dev/null +++ b/product_docs/docs/tpa/23/tower.mdx @@ -0,0 +1,120 @@ +--- +navTitle: Ansible Tower (Automation Controller) +title: TPA and Ansible Tower/Ansible Automation Controller +originalFilePath: tower.md + +--- + +TPA can be used with RedHat Ansible Automation Controller (formerly +known as Ansible Tower) by running the configure and provision steps on +a standalone machine, treating the cluster as a bare-metal one, and then +importing the resulting inventory and other generated files into Tower. + +A TPA installation on Tower instance, which includes its own virtual +environment (tpa-venv), is then used for the deployment step in Tower. + +This document describes the appropriate procedure for Ansible Tower +version 3.8. + +## Preparing Tower for working with TPA + +### Setting up a TPA virtual environment + +- Install tpaexec on your Tower server under /opt/EDB/TPA. This is the + default location when TPA is installed from package. +- Run `tpaexec setup` which will create a virtual environment under + /opt/EDB/TPA/tpa-venv and install the required packages. +- Add TPA directory (/opt/EDB/TPA) to the "CUSTOM VIRTUAL ENVIRONMENT PATHS" + field in the System Settings page of Tower UI. + +### Creating the TPA_2Q_SUBSCRIPTION_TOKEN credential type + +Create the custom credential type called "TPA_2Q_SUBSCRIPTION_TOKEN" +as described below: + +- Go to the Credentials Type page in Tower UI. + +- Set the "NAME" field to "TPA_2Q_SUBSCRIPTION_TOKEN". + +- Paste this to "INPUT CONFIGURATION" field: + + ```yaml + fields: + - id: tpa_2q_sub_token + type: string + label: TPA_2Q_SUBSCRIPTION_TOKEN + required: + - tpa_2q_subscription_token + ``` + +- Paste this to "INJECTOR CONFIGURATION" field: + + ```yaml + env: + TPA_2Q_SUBSCRIPTION_TOKEN: '{{ tpa_2q_sub_token }}' + ``` + +- Click "SAVE" button. + +## Setting up a cluster + +- Ensure the hosts you intend to use for your cluster are known to your + Tower installation, that you have a Credential that can access them over + SSH, and that they have ansible_host, public_ip and private_ip set. + +- On a machine with tpaexec installed, prepare a file with a list of + your hostnames, one per line: + +```text +mercury +venus +mars +jupiter +saturn +neptune +``` + +- Run `tpaexec configure`: + +```bash +[tpa]$ tpaexec configure \ + --platform bare \ + --use-ansible-tower https://aac.example.com/api \ + --tower-git-repository ssh://git@git.example.com/example \ + --hostnames-from \ + --architecture BDR-Always-ON \ + --layout bronze \ + --harp-consensus-protocol bdr +``` + + The API endpoint is currently accepted and added to config.yml but + not used. The git repository will be used to import the cluster data + into Tower; tpaexec will create its own branch in the repository + for this cluster so it doesn't matter what branches already exist + in it. (This allows you to use the same repository for all of your + clusters.) All other options to `tpaexec configure`, as described in + [Configuration](tpaexec-configure) are still valid. + + This will create a cluster directory named after your cluster. 
+ +- config.yml will now include the top-level dictionary `ansible_tower`, + which will cause `tpaexec provision` to treat the cluster as a Tower + cluster. + +- Edit config.yml if you need to make any other changes. + +- Run `tpaexec provision` to generate inventory and other related files. + +- Add a Project in Tower with the git repository. + +- Add the inventory from the project as an external source to your + inventory. + +- Create a Template in Tower specifying the TPA virtual environment, + your inventory, and the newly created project. Also include any required + credentials to reach your hosts, and a credential with a TPA + subscription token (TPA_2Q_SUBSCRIPTION_TOKEN). + +- Set one additional variable: `tpa_dir: /opt/EDB/TPA` + +- Run a job based on the new Template to deploy your cluster. diff --git a/product_docs/docs/tpa/23/tpaexec-configure.mdx b/product_docs/docs/tpa/23/tpaexec-configure.mdx new file mode 100644 index 00000000000..e419c25b7df --- /dev/null +++ b/product_docs/docs/tpa/23/tpaexec-configure.mdx @@ -0,0 +1,513 @@ +--- +navTitle: Configuration +title: Cluster configuration +originalFilePath: tpaexec-configure.md + +--- + +The `tpaexec configure` command generates a YAML cluster configuration +file that is required by subsequent stages in the provision/deploy/test +cycle. + +## Quickstart + +```bash +[tpa]$ tpaexec configure ~/clusters/speedy --architecture M1 +``` + +This command will create a directory named `~/clusters/speedy` and +generate a configuration file named `config.yml` that follows the +layout of the architecture named M1 (single primary, N replicas). +It will create a git repository in the new directory and make an initial +commit containing the generated `config.yml`. + +The command also accepts various options (some specific to the selected +architecture or platform) to modify the configuration, but the defaults +are sensible and intended to be usable straightaway. You are encouraged +to read the generated config.yml and fine-tune the configuration to suit +your needs. (Here's an overview of [configuration settings that affect +the deployment](configure-instance).) + +It's possible to write config.yml entirely by hand, but it's much easier +to edit the generated file. + +## Configuration options + +The first argument must be the cluster directory, e.g., `speedy` or +`~/clusters/speedy` (the cluster will be named speedy in both cases). +We recommend that you keep all your clusters in a common directory, +e.g., `~/clusters` in the example above. + +The next argument must be `--architecture ` to select an +architecture, e.g., +[M1](architecture-M1) or +[BDR-Always-ON](architecture-BDR-Always-ON). +For a complete list of architectures, run +`tpaexec info architectures`. + +The arguments above are always mandatory. The rest of the options +described here may be safely omitted, as in the example above; the +defaults will lead to a usable result. + +Run `tpaexec help configure-options` for a list of common options. + +### Architecture-specific options + +The architecture you select determines what other options are accepted. +Typically, each architecture accepts some unique options as well as the +generic options described below. + +For example, with M1 you can use `--num-cascaded-replicas 3` to create +a cluster with three cascaded replicas. Please consult the +documentation for an architecture for a list of options that it accepts +(or, in some cases, requires). 
+ +### Platform options + +Next, you may use `--platform ` to select a platform, e.g., +[aws](platform-aws) or [bare](platform-bare). + +An architecture may or may not support a particular platform. If not, it +will fail to configure the cluster. + +The choice of platform affects the interpretation of certain options. +For example, if you choose aws, the arguments to +`--region ` and +`--instance-type ` +must be a valid +[AWS region name](https://docs.aws.amazon.com/general/latest/gr/rande.html) +and +[EC2 instance type](https://aws.amazon.com/ec2/instance-types/) +respectively. Please refer to the platform documentation for more details. + +If you do not explicitly select a platform, the default is currently +aws. + +**Note:** TPA fully supports creating clusters with instances on +different platforms, but `tpaexec configure` cannot currently generate +such a configuration. You must edit config.yml to specify multiple +platforms. + +### Owner + +Specify `--owner ` to associate the cluster (by some +platform-specific means, e.g., AWS tags) with the name of a person +responsible for it. This is especially important for cloud platforms. By +default, the owner is set to the login name of the user running +`tpaexec provision`. + +(You may use your initials, or "Firstname Lastname", or anything else +that identifies you uniquely.) + +### Region + +Specify `--region ` to select a region. + +This option is meaningful only for cloud platforms. The default for AWS +is eu-west-1. + +**Note:** TPA fully supports creating clusters that span multiple +regions, but `tpaexec configure` cannot currently generate such a +configuration. You must edit config.yml to specify multiple regions. + +### Network configuration + +By default, each cluster will be configured with a number of randomly selected +`/28` subnets from the CIDR range `10.33.0.0/16`, depending on the selected +architecture. + +Specify `--network 192.168.0.0/16` to assign subnets from a different network. + +**Note:** On AWS clusters, this corresponds to the VPC CIDR. +See [aws](platform-aws#vpc-required) documentation for details. + +Specify `--subnet-prefix 26` to assign subnets of a different size, /26 instead +of /28 in this case. + +Specify `--no-shuffle-subnets` to allocate subnets from the start of the +network CIDR range, without randomisation, e.g. `10.33.0.0/28`, then +`10.33.0.16/28` and so on. + +Specify `--exclude-subnets-from ` to exclude subnets that are +already used in existing cluster config.yml files. You can specify this +argument multiple times for each directory. + +**Note:** These options are not meaningful for the "bare" platform, where +TPA will not alter the network configuration of existing servers. + +### Instance type + +Specify `--instance-type ` to select an instance type. + +This option is meaningful only for cloud platforms. The default for AWS +is t3.micro. + +### Disk space + +Specify `--root-volume-size 64` to set the size of the root volume in +GB. (Depending on the platform, there may be a minimum size required for +the root volume.) + +The `--postgres-volume-size ` and +`--barman-volume-size ` options are available to set the sizes +of the Postgres and Barman volumes on those architectures and platforms +that support separate volumes for Postgres and Barman. + +None of these options is meaningful for the "bare" platform, where +TPA has no control over volume sizes. 
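For orientation, here is roughly how these options are reflected in the generated config.yml on AWS. This is a sketch based on the full example later on this page, not something you need to write yourself:

```yaml
instance_defaults:
  platform: aws
  default_volumes:
    - device_name: root
      volume_type: gp2
      volume_size: 32        # --root-volume-size 32
    - device_name: /dev/xvdf
      volume_type: gp2
      volume_size: 64        # --postgres-volume-size 64
      vars:
        volume_for: postgres_data
```

The `--barman-volume-size` option similarly adds a volume with `volume_for: barman_data` to the Barman instance, as shown in the example configuration later on this page.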
+ +### Hostnames + +By default, `tpaexec configure` will randomly select as many hostnames +as it needs from a pre-approved list of several dozen names. This should +be enough for most clusters. + +Specify `--hostnames-from ` to select hostnames from a file +with one name per line. The file must contain at least as many valid +hostnames as there are instances in your cluster. Each line may contain +an optional IP address after the name; if present, this address will be +set as the `ip_address` for the corresponding instance in `config.yml`. + +Use `--hostnames-pattern '…pattern…'` to limit the selection to +lines matching an egrep pattern. + +Use `--hostnames-sorted-by="--dictionary-order"` to select a sort(1) +option other than `--random-sort` (which is the default). + +Use `--hostnames-unsorted` to not sort hostnames at all. In this case, +they will be assigned in the order they are found in the hostnames file. +This is the default when a hostnames file is explicitly specified. + +Hostnames may contain only letters (a-z), digits (0-9), and '-'. They +may be FQDNs, depending on the selected platform. Hostnames should be +in lowercase; any uppercase characters will be converted to lowercase +internally, and any references to these hostnames in config.yml (e.g., +`upstream: hostname`) must use the lowercase version. + +## Software selection + +### Distribution + +Specify `--distribution ` to select a distribution. + +The selected platform determines which distributions are available, and +which one is used by default. + +In general, you should be able to use "Debian", "RedHat", and "Ubuntu" +to select TPA images that have Postgres and other software preinstalled +(to reduce deployment times). To use stock distribution images instead, +append "-minimal" to the label, e.g., "Debian-minimal". + +This option is not meaningful for the "bare" platform, where TPA has +no control over which distribution is installed. + +### 2ndQuadrant and EDB repositories + +TPA can enable any 2ndQuadrant or EDB software repository that you have +access to through a subscription. + +By default, it will install the 2ndQuadrant public repository (which +does not need a subscription) and add on any product repositories that +the architecture may require (e.g., the BDR repository). + +More detailed explanation of how TPA uses 2ndQuadrant and EDB +repositories is available [here](reference/2q_and_edb_repositories) + +Specify `--2Q-repositories source/name/maturity …` or +`--edb-repositories repository …` to specify the complete list of +2ndQuadrant or EDB repositories to install on each instance in addition +to the 2ndQuadrant public repository. + +If any EDB repositories are specified, any 2ndQuadrant ones will be +ignored. + +Use this option with care. TPA will configure the named repositories +with no attempt to make sure the combination is appropriate. + +To use these options, you must `export TPA_2Q_SUBSCRIPTION_TOKEN=xxx` +or `export EDB_SUBSCRIPTION_TOKEN=xxx` before you run tpaexec. +You can get a 2ndQuadrant token from the 2ndQuadrant Portal under +"Company info" in the left menu, then "Company". You can get an EDB +token from enterprisedb.com/repos. + +### Local repository support + +Use `--enable-local-repo` to create a local package repository from +which to ship packages to target instances. 
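For example, a command line such as `--2Q-repositories products/pglogical3/release products/bdr3/release` is recorded in the generated config.yml as a `cluster_vars` entry along these lines (the repository names here are just the ones used as examples elsewhere in this documentation):

```yaml
cluster_vars:
  tpa_2q_repositories:
    - products/pglogical3/release
    - products/bdr3/release
```

See [Configuring 2ndQuadrant repositories](reference/tpa_2q_repositories) for what these names mean and how the default `dl/default/release` repository is handled.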
+ +In environments with restricted network access, you can instead use +`--use-local-repo-only` to create a local repository and disable all +other package repositories on target instances, so that packages are +installed only from the local repository. + +The page about [Local repository support](reference/local-repo) has more +details. + +### Software versions + +You may optionally specify `--postgres-version 13` or any other +supported major version of Postgres. + +By default, we always install the latest version of every package. This +is usually the desired behaviour, but in some testing scenarios, it may +be necessary to select specific package versions using any of the +following options: + +1. `--postgres-package-version 10.4-2.pgdg90+1` +2. `--repmgr-package-version 4.0.5-1.pgdg90+1` +3. `--barman-package-version 2.4-1.pgdg90+1` +4. `--pglogical-package-version '2.2.0*'` +5. `--bdr-package-version '3.0.2*'` +6. `--pgbouncer-package-version '1.8*'` + +You may use any version specifier that apt or yum would accept. + +If your version does not match, try appending a `*` wildcard. This +is often necessary when the package version has an epoch qualifier +like `2:...`. + +You may optionally specify `--epas` which sets `postgresql_flavour` to +`epas` in the generated config.yml. This means that tpaexec will install +EDB Postgres Advanced Server (requires EDB repository access) +instead of community Postgres (the default). + +Since EPAS supports both Oracle and postgres compatiblity features, +by default, EPAS initializes the cluster in `redwood` i.e. Oracle +compatibility mode. In order to initialize the cluster in postgres +mode, you may optionally specify `--no-redwood` which sets +`epas_redwood_compat` to False in the generated config.yml. + +You may also specify `--extra-packages p1 p2 …` or +`--extra-postgres-packages p1 p2 …` to install additional packages. +The former lists packages to install along with system packages, while +the latter lists packages to install later along with postgres packages. +(If you mention packages that depend on Postgres in the former list, the +installation will fail because Postgres will not yet be installed.) The +arguments are passed on to the package manager for installation without +any modifications. + +The `--extra-optional-packages p1 p2 …` option behaves like +`--extra-packages`, but it is not an error if the named packages +cannot be installed. + +### Known issue with wildcard use + +Please note that the use of wildcards in `*_package_version` when added +permanently to `config.yml`, can result in unexpected updates to +installed software during `tpaexec deploy` on nodes with RHEL 8 and +above (or derivative OSs which use dnf such as Rocky Linux). +When deploy runs on an existing cluster that already has packages +installed ansible may be unable to match the full package version. +For example, if the value for `bdr_package_version` was set to `3.6*` +then ansible would not be able to match this to an installed version of +BDR, it would assume no package is installed, and it would attempt to +install the latest version available of the package with the same name +in the configured repository, e.g. 3.7. + +We are aware of this limitation as an ansible dnf module bug and hope +to address this in a future release of TPA. + +### Building and installing from source + +If you specify `--install-from-source postgres`, Postgres will be +built and installed from a git repository instead of installed from +packages. 
Use `2ndqpostgres` instead of `postgres` to build and +install 2ndQPostgres. By default, this will build the appropriate +`REL_nnn_STABLE` branch. + +You may use `--install-from-source 2ndqpostgres pglogical3 bdr3` to +build and install all three components from source, or just use +`--install-from-source pglogical3 bdr3` to use packages for +2ndQPostgres, but build and install pglogical v3 and BDR v3 from source. +By default, this will build the `master` branch of pglogical and BDR. + +To build a different branch, append `:branchname` to the corresponding +argument. For example `--install-from-source 2ndqpostgres:dev/xxx`, or +`pglogical:bug/nnnn`. + +You may not be able to install packages that depend on a package that +you chose to replace with a source installation instead. For example, +BDR v3 packages depend on pglogical v3 packages, so you can't install +pglogical from its source repository and BDR from packages. Likewise, +you can't install Postgres from source and pglogical from packages. + +## Overrides + +You may optionally specify `--overrides-from a.yml …` to load one or +more YAML files with settings to merge into the generated config.yml. + +Any file specified here is first expanded as a Jinja2 template, and the +result is loaded as a YAML data structure, and merged recursively into +the arguments used to generate config.yml (comprising architecture and +platform defaults and arguments from the command-line). This process is +repeated for each additional override file specified; this means that +settings defined by one file will be visible to any subsequent files. + +For example, your override file might contain: + +``` +cluster_tags: + some_tag: "{{ lookup('env', 'SOME_ENV_VAR') }}" + +cluster_vars: + synchronous_commit: remote_write + postgres_conf_settings: + effective_cache_size: 4GB +``` + +These settings will augment `cluster_tags` and `cluster_vars` that +would otherwise be in config.yml. Settings are merged recursively, so +`cluster_tags` will end up containing both the default Owner tag as +well as `some_tag`. Similarly, the `effective_cache_size` setting +will override that variable, leaving other `postgres_conf_settings` +(if any) unaffected. In other words, you can set or override specific +subkeys in config.yml, but you can't empty or replace `cluster_tags` +or any other hash altogether. + +The merging only applies to hash structures, so you cannot use this +mechanism to change the list of `instances` within config.yml. It is +most useful to augment `cluster_vars` and `instance_defaults` with +common settings for your environment. + +That said, the mechanism does not enforce any restrictions, so please +exercise due caution. It is a good idea to generate two configurations +with and without the overrides and diff the two config.yml files to make +sure you understand the effect of all the overrides. + +## Ansible Tower + +Use the `--use-ansible-tower` and `--tower-git-repository` options to +create a cluster adapted for deployment with Ansible Tower. See [Ansible +Tower](tower) for details. + +## git repository + +By default, a git repository is created with an initial branch named +after the cluster, and a single commit is made, with the configure +options you used in the commit message. If you don't have git in your +`$PATH`, tpaexec will not raise an error but the repository will not be +created. To suppress creation of the git repository, use the `--no-git` +option. 
(Note that in an Ansible Tower cluster, a git repository is +required and will be created later by `tpaexec provision` if it does not +already exist.) + +## Examples + +Let's see what happens when we run the following command: + +```bash +[tpa]$ tpaexec configure ~/clusters/speedy --architecture M1 \ + --num-cascaded-replicas 2 --distribution Debian-minimal \ + --platform aws --region us-east-1 --subnet-pattern 10.33.x.x/28 \ + --instance-type t2.medium --root-volume-size 32 \ + --postgres-volume-size 64 --barman-volume-size 128 \ + --postgres-version 9.6 +[tpa]$ +``` + +There is no output, so there were no errors. The cluster directory has +been created and populated. + +```bash +$ ls ~/clusters/speedy +total 8 +drwxr-xr-x 2 ams ams 4096 Aug 4 16:23 commands +-rw-r--r-- 1 ams ams 1374 Aug 4 16:23 config.yml +lrwxrwxrwx 1 ams ams 51 Aug 4 16:23 deploy.yml -> + /home/ams/work/2ndq/TPA/architectures/M1/deploy.yml +``` + +The cluster configuration is in config.yml, and its neighbours are links +to architecture-specific support files that you need not interact with +directly. Here's what the configuration looks like: + +```yaml +--- +architecture: M1 + +cluster_name: speedy +cluster_tags: {} + +ec2_vpc: + Name: Test + cidr: 10.33.0.0/16 + +ec2_ami: + Name: debian-stretch-hvm-x86_64-gp2-2018-08-20-85640 + Owner: 379101102735 + +ec2_vpc_subnets: + us-east-1: + 10.33.161.64/28: + az: us-east-1a + 10.33.189.80/28: + az: us-east-1b + +cluster_vars: + postgres_version: 9.6 + tpa_2q_repositories: [] + vpn_network: 192.168.33.0/24 + +instance_defaults: + platform: aws + type: t2.medium + region: us-east-1 + default_volumes: + - device_name: root + volume_type: gp2 + volume_size: 32 + - device_name: /dev/xvdf + volume_type: gp2 + volume_size: 64 + vars: + volume_for: postgres_data + vars: + ansible_user: admin + +instances: + - node: 1 + Name: quirk + role: primary + subnet: 10.33.161.64/28 + + - node: 2 + Name: keeper + role: replica + upstream: quirk + backup: zealot + subnet: 10.33.161.64/28 + + - node: 3 + Name: zealot + role: + - barman + - log-server + - openvpn-server + - monitoring-server + - witness + volumes: + - device_name: /dev/xvdf + volume_type: gp2 + volume_size: 128 + vars: + volume_for: barman_data + subnet: 10.33.189.80/28 + + - node: 4 + Name: quaver + role: replica + upstream: keeper + subnet: 10.33.189.80/28 + + - node: 5 + Name: quavery + role: replica + upstream: keeper + subnet: 10.33.189.80/28 + +``` + +The next step is to run [`tpaexec provision`](tpaexec-provision) +or learn more about how to customise the configuration of +[the cluster as a whole](configure-cluster) or how to configure an +[individual instance](configure-instance). diff --git a/product_docs/docs/tpa/23/tpaexec-deploy.mdx b/product_docs/docs/tpa/23/tpaexec-deploy.mdx new file mode 100644 index 00000000000..522bb26342b --- /dev/null +++ b/product_docs/docs/tpa/23/tpaexec-deploy.mdx @@ -0,0 +1,102 @@ +--- +navTitle: Deployment +title: tpaexec deploy +originalFilePath: tpaexec-deploy.md + +--- + +Deployment is the process of installing and configuring Postgres and +other software on the cluster's servers. This includes setting up +replication, backups, and so on. + +At the end of the deployment stage, Postgres will be up and running +along with other components like repmgr, Barman, pgbouncer, etc. +(depending on the architecture selected). 
+ +## Prerequisites + +Before you can run `tpaexec deploy`, you must have already run +[`tpaexec configure`](tpaexec-configure) to generate the cluster +configuration and then provisioned the servers with +[`tpaexec provision`](tpaexec-provision). + +Before deployment, you must +`export TPA_2Q_SUBSCRIPTION_TOKEN=xxx` to enable any 2ndQuadrant +repositories that require subscription. You can use the subscription +token that you used to [install TPA](INSTALL) itself. If you +forget to do this, an error message will soon remind you. + +## Quickstart + +```bash +[tpa]$ tpaexec deploy ~/clusters/speedy -v +Using /opt/EDB/TPA/ansible/ansible.cfg as config file + +PLAY [Basic initialisation and fact discovery] *************************************** +... + +PLAY [Set up TPA cluster nodes] ****************************************************** +... + +PLAY RECAP *************************************************************************** +zealot : ok=281 changed=116 unreachable=0 failed=0 +keeper : ok=284 changed=96 unreachable=0 failed=0 +quaver : ok=260 changed=89 unreachable=0 failed=0 +quavery : ok=260 changed=88 unreachable=0 failed=0 +quirk : ok=262 changed=100 unreachable=0 failed=0 + + +real 7m1.907s +user 3m2.492s +sys 1m5.318s +``` + +This command produces a great deal of output and may take a long time +(depending primarily on the latency between the host running tpaexec and +the hosts in the cluster, as well as how long it takes the instances to +download the packages they need to install). We recommend that you use +at least one `-v` during deployment. The output is also logged to +`ansible.log` in the cluster directory. + +The exact number of hosts, tasks, and changed tasks may of course vary. + +The deploy command takes no options itself—any options you provide after +the cluster name are passed on unmodified to Ansible (e.g., `-v`). + +Those who are familiar with Ansible may be concerned by the occasional +red "failed" task output scrolling by. Rest assured that if the process +does not stop soon afterwards, the error is of no consequence, and the +code will recover from it automatically. + +When the deployment is complete, you can run +[`tpaexec test`](tpaexec-test) to verify the installation. + +## Selective deployment + +You can limit the deployment to a subset of your hosts by setting +`deploy_hosts` to a comma-separated list of instance names: + +```bash +[tpa]$ tpaexec deploy ~/clusters/speedy -v -e deploy_hosts=keeper,quaver +``` + +This will run the deployment on the given instances, though it will also +initially execute some tasks on other hosts to collect information about +the state of the cluster. + +(Setting `deploy_hosts` is the recommended alternative to using +Ansible's `--limit` option, which TPA does not support.) + +## deploy.yml + +The deployment process is architecture-specific. Here's an overview of +the various +[configuration settings that affect the deployment](configure-instance). +If you are familiar with Ansible playbooks, you can follow along as +tpaexec applies various roles to the cluster's instances. + +Unlike config.yml, deploy.yml is not designed to be edited (and is +usually a link into the architectures directory). Even if you want to +extend the deployment process to run your own Ansible tasks, you should +do so by [creating hooks](tpaexec-hooks). This protects you from +future implementation changes within a particular architecture. 
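For instance, a minimal `hooks/post-deploy.yml` in your cluster directory might contain nothing more than a task list like the following (purely illustrative; see [TPA hooks](tpaexec-hooks) for the full list of available hooks and when they run):

```yaml
---
# hooks/post-deploy.yml: a list of tasks, run on every host at the end
# of the deployment. The marker file used here is only an illustration.
- name: Record the time of the last successful deploy
  copy:
    dest: /etc/tpa-last-deploy
    content: "{{ ansible_date_time.iso8601 }}\n"
    owner: root
    group: root
    mode: "0644"
  become: yes
```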
diff --git a/product_docs/docs/tpa/23/tpaexec-hooks.mdx b/product_docs/docs/tpa/23/tpaexec-hooks.mdx new file mode 100644 index 00000000000..ee577717466 --- /dev/null +++ b/product_docs/docs/tpa/23/tpaexec-hooks.mdx @@ -0,0 +1,154 @@ +--- +navTitle: Deployment hooks +title: TPA hooks +originalFilePath: tpaexec-hooks.md + +--- + +TPA can set up fully-functional clusters with no user intervention, +and already provides a broad variety of +[settings to control your cluster configuration](configure-instance), +including custom repositories and packages, custom Postgres +configuration (both pg_hba.conf and postgresql.conf), and so on. + +You can write hook scripts to address specific needs that are not met by +the available configuration settings. Hooks allow you to execute +arbitrary Ansible tasks during the deployment. + +Hooks are the ultimate extension mechanism for TPA, and there is no +limit to what you can do with them. Please use them with caution, and +keep in mind the additional maintenance burden you are taking on. The +TPA developers have no insight into your hook code, and cannot +guarantee compatibility between releases beyond invoking hooks at the +expected stage. + +## Summary + +If you create files with specific names under the `hooks` subdirectory +of your cluster directory, TPA will invoke them at various stages of +the deployment process, as described below. + +```bash +$ mkdir ~/clusters/speedy/hooks +$ cat > ~/clusters/speedy/hooks/pre-deploy.yml +--- +- debug: msg="hello world!" +``` + +Hook scripts are invoked with `include_tasks`, so they are expected to +be YAML files containing a list of Ansible tasks (not a playbook, which +contains a list of plays). Unless otherwise documented below, hooks are +unconditionally executed for all hosts in the deployment. + +## General-purpose hooks + +### pre-deploy + +TPA invokes `hooks/pre-deploy.yml` immediately after bootstrapping +Python—but before doing anything else like configuring repositories and +installing packages. This is the earliest stage at which you can execute +your own code. + +You can use this hook to set up custom repository configuration, beyond +what you can do with +[`apt_repositories`](reference/apt_repositories) or +[`yum_repositories`](reference/yum_repositories). + +### post-repo + +TPA invokes `hooks/post-repo.yml` after configuring package +repositories. You can use it to make corrections to the repository +configuration before beginning to install packages. + +### pre-initdb + +TPA invokes `hooks/pre-initdb.yml` before deciding whether or not to +[run initdb to create PGDATA](reference/initdb) if it does not exist. You +should not ordinarily need to use this hook (but if you use it to create +`PGDATA` yourself, then TPA will skip `initdb`). + +### postgres-config + +TPA invokes `hooks/postgres-config.yml` after generating Postgres +configuration files, including pg_hba.conf and the files in conf.d, but +before the server has been started. + +You can use this hook, for example, to create additional configuration +files under `conf.d`. + +### postgres-config-final + +TPA invokes `hooks/postgres-config-final.yml` after starting +Postgres and creating users, databases, and extensions. You can use this +hook to execute SQL commands, for example, to perform custom extension +configuration or create database objects. + +### harp-config + +TPA invokes `hooks/harp-config.yml` after generating HARP configuration +files, but before the HARP service has been started. 
+ +You can use this hook, for example, to perform any customizations to the HARP +proxy that are not provided by the built-in interface of TPA. + +Please note that this hook will be run in any node that installs HARP packages, +including BDR nodes. + +### post-deploy + +TPA invokes `hooks/post-deploy.yml` at the end of the deployment. + +You can go on to do whatever you want after this stage. + +If you use this hook to make changes to any configuration files that +were generated or altered during the TPA deployment, you run the +risk that the next `tpaexec deploy` will overwrite your changes (since +TPA doesn't know what your hook might have done). + +## BDR3 hooks + +These hooks are specific to BDRv3 deployments. + +### bdr-pre-node-creation + +TPA invokes `hooks/bdr-pre-node-creation.yml` on all instances +before creating a BDR node on any instance for the first time. The hook +will not be invoked if all required BDR nodes already exist. + +### bdr-post-group-creation + +TPA invokes `hooks/bdr-post-group-creation.yml` on all instances +after creating any BDR node group on the `first_bdr_primary` instance. +The hook will not be invoked if the required BDR groups already exist. + +### bdr-pre-group-join + +TPA invokes `hooks/bdr-pre-group-join.yml` on all instances +after creating, changing or removing the replication sets and +configuring the required subscriptions, before the node join. + +You can use this hook to execute SQL commands and perform other +adjustments to the replication set configuration and subscriptions that +might be required before the node join starts. + +For example, you can adjust the BDR witness replication set to +automatically add new tables and create DDL filters in general. + +## Other hooks + +### postgres-pre-update, postgres-post-update + +The [`update-postgres`](tpaexec-update-postgres) command invokes +`hooks/postgres-pre-update.yml` on a particular instance before it +installs any packages, and invokes `hooks/postgres-post-update.yml` +after the package installation is complete. Both hooks are invoked only +on the instance being updated. + +You can use these hooks to customise the update process for your +environment (e.g., to install other packages and stop and restart +services that TPA does not manage). + +## New hooks + +EDB adds new hooks to TPA as the need arises. If your use case is not +covered by the existing hooks, please contact us to discuss the matter. diff --git a/product_docs/docs/tpa/23/tpaexec-provision.mdx b/product_docs/docs/tpa/23/tpaexec-provision.mdx new file mode 100644 index 00000000000..6bf88bfc1be --- /dev/null +++ b/product_docs/docs/tpa/23/tpaexec-provision.mdx @@ -0,0 +1,195 @@ +--- +navTitle: Provisioning +title: tpaexec provision +originalFilePath: tpaexec-provision.md + +--- + +Provision creates instances and other resources required by the cluster. + +The exact details of this process depend both on +the architecture (e.g. [M1](architecture-M1)) +and platform (e.g. [AWS](platform-aws)) +that you selected while configuring the cluster. + +At the end of the provisioning stage, you will have the required number +of instances with the basic operating system installed, which TPA +can access via ssh (with sudo to root). + +## Prerequisites + +Before you can provision a cluster, you must generate the cluster +configuration with [`tpaexec configure`](tpaexec-configure) +(and edit config.yml to fine-tune the configuration if needed). + +You may need additional platform-dependent steps. 
For example, you need +to obtain an AWS API access keypair to provision EC2 instances, or set +up LXD or Docker to provision containers. Consult the platform +documentation for details. + +## Quickstart + +```bash +[tpa]$ tpaexec provision ~/clusters/speedy + +PLAY [Provision cluster] ********************************************************** +... + +TASK [Set up EC2 instances] ******************************************************* +changed: [localhost] => (item=us-east-1:quirk) +changed: [localhost] => (item=us-east-1:keeper) +changed: [localhost] => (item=us-east-1:zealot) +changed: [localhost] => (item=us-east-1:quaver) +changed: [localhost] => (item=us-east-1:quavery) +... + +TASK [Generate ssh_config file for the cluster] *********************************** +changed: [localhost] + +PLAY RECAP ************************************************************************ +localhost : ok=128 changed=20 unreachable=0 failed=0 + + +real 2m19.386s +user 0m51.819s +sys 0m27.852s +``` + +This command will produce lots of output (append `-v`, `-vv`, etc. +to the command if you want even more verbose output). The output is also +logged to `ansible.log` in the cluster directory. This can be overriden +by setting the environment variable `ANSIBLE_LOG_PATH` to the path and name of +the desired logfile. + +If it completes without error, you may proceed to run +[`tpaexec deploy`](tpaexec-deploy) to install and configure +software. + +## Options + +When provisioning cloud instances, it is especially important to make +sure instances are directly traceable to a human responsible for them. +By default, TPA will tag EC2 instances as being owned by the login +name of the user running `tpaexec provision`. + +Specify `--owner ` to change the name (e.g., if your username +happens to be something generic, like postgres or ec2-user). You may use +initials, or "Firstname Lastname", or anything else to uniquely identify +a person. + +Any other options you specify are passed on to Ansible. + +## Accessing the instances + +After provisioning completes, you should be able to SSH to the instances +(after a brief delay to allow the instances to boot up and install their +SSH host keys). As shown in the output above, tpaexec will generate an +ssh_config file for you to use. + +```bash +[tpa]$ cd ~/clusters/speedy +[tpa]$ cat ssh_config +Host * + Port 22 + IdentitiesOnly yes + IdentityFile "id_speedy" + UserKnownHostsFile "known_hosts tpa_known_hosts" + ServerAliveInterval 60 + +Host quirk + User admin + HostName 54.227.207.189 +Host keeper + User admin + HostName 34.229.111.196 +Host zealot + User admin + HostName 18.207.108.211 +Host quaver + User admin + HostName 54.236.36.251 +Host quavery + User admin + HostName 34.200.214.150 +[tpa]$ ssh -F ssh_config quirk +Linux quirk 4.9.0-6-amd64 #1 SMP Debian 4.9.82-1+deb9u3 (2018-03-02) x86_64 + +The programs included with the Debian GNU/Linux system are free software; +the exact distribution terms for each program are described in the +individual files in /usr/share/doc/*/copyright. + +Debian GNU/Linux comes with ABSOLUTELY NO WARRANTY, to the extent +permitted by applicable law. +Last login: Sat Aug 4 12:31:28 2018 from 136.243.148.74 +admin@quirk:~$ sudo -i +root@quirk:~# +``` + +You can run [`tpaexec deploy`](tpaexec-deploy) immediately after +provisioning. It will wait as long as required for the instances to come +up. You do not need to wait for the instances to come up, or ssh in to +them before you start deployment. 
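+
+Since `deploy` does its own waiting, it is safe to chain the two steps.
+A minimal sketch, using the example cluster directory above (the `&&`
+chaining is ordinary shell usage, not a TPA feature):
+
+```bash
+[tpa]$ tpaexec provision ~/clusters/speedy && \
+       tpaexec deploy ~/clusters/speedy
+```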
+
+## Generated files
+
+During the provisioning process, a number of new files will be created
+in the cluster directory:
+
+```bash
+[tpa]$ ls ~/clusters/speedy
+total 240
+-rw-r--r-- 1 ams ams 193098 Aug 4 17:59 ansible.log
+drwxr-xr-x 2 ams ams 4096 Aug 4 17:38 commands
+-rw-r--r-- 1 ams ams 1442 Aug 4 17:54 config.yml
+lrwxrwxrwx 1 ams ams 51 Aug 4 17:38 deploy.yml ->
+ /opt/EDB/TPA/architectures/M1/deploy.yml
+drwxr-xr-x 2 ams ams 4096 Aug 4 17:38 hostkeys
+-rw------- 1 ams ams 1675 Aug 4 17:38 id_speedy
+-rw------- 1 ams ams 1438 Aug 4 17:38 id_speedy.ppk
+-rw-r--r-- 1 ams ams 393 Aug 4 17:38 id_speedy.pub
+drwxr-xr-x 4 ams ams 4096 Aug 4 17:50 inventory
+-rw-r--r-- 1 ams ams 2928 Aug 4 17:50 tpa_known_hosts
+-rw-r--r-- 1 ams ams 410 Aug 4 17:50 ssh_config
+-rw-r--r-- 1 ams ams 3395 Aug 4 17:59 vars.json
+drwxr-xr-x 2 ams ams 4096 Aug 4 17:38 vault
+```
+
+We've already studied the ssh_config file, which refers to the `id_*`
+files (an SSH keypair generated for the cluster) and `tpa_known_hosts`
+(the signatures of the `hostkeys/` installed on the instances).
+
+The `vars.json` file may be used by `tpaexec provision` on
+subsequent invocations with `--cached`.
+
+The `inventory/` directory contains static and dynamic inventory files
+as well as group and host variable definitions from config.yml.
+
+```bash
+[tpa]$ cat inventory/00-speedy
+[tag_Cluster_speedy]
+quirk ansible_host=54.227.207.189 node=1 platform=aws
+keeper ansible_host=34.229.111.196 node=2 platform=aws
+zealot ansible_host=18.207.108.211 node=3 platform=aws
+quaver ansible_host=54.236.36.251 node=4 platform=aws
+quavery ansible_host=34.200.214.150 node=5 platform=aws
+
+[tpa]$ cat inventory/group_vars/tag_Cluster_speedy/01-speedy.yml
+cluster_name: speedy
+cluster_tag: tag_Cluster_speedy
+postgres_version: 9.6
+tpa_version: v3.0-451-g10647888
+tpa_2q_repositories: []
+vpn_network: 192.168.33.0/24
+
+[tpa]$ cat inventory/host_vars/zealot/02-topology.yml
+role:
+- barman
+- log-server
+- openvpn-server
+- monitoring-server
+- witness
+upstream: quirk
+```
+
+If you now change a variable in config.yml and rerun provision, these
+files will be updated. If you don't change the configuration, it won't
+do anything. If you add a new instance in config.yml and rerun, it will
+bring up the new instance without affecting the existing ones.
diff --git a/product_docs/docs/tpa/23/tpaexec-rehydrate.mdx b/product_docs/docs/tpa/23/tpaexec-rehydrate.mdx
new file mode 100644
index 00000000000..7aae614d3f6
--- /dev/null
+++ b/product_docs/docs/tpa/23/tpaexec-rehydrate.mdx
@@ -0,0 +1,158 @@
+---
+navTitle: Rehydration
+title: tpaexec rehydrate
+originalFilePath: tpaexec-rehydrate.md
+
+---
+
+The `tpaexec rehydrate` command rebuilds AWS EC2 instances with an
+updated machine image (AMI), and allows for the rapid deployment of
+security patches and OS upgrades to a cluster managed by TPA.
+
+Given a new AMI with all the required changes, this command terminates
+an instance, replaces it with a newly-provisioned instance that uses the
+new image, and attaches the data volumes from the old instance before
+recreating the configuration of the server exactly (based on
+`config.yml`).
+
+Publishing up-to-date images and requiring servers to be rebuilt from
+scratch on a regular schedule is an alternative to allowing a fleet of
+servers to download and install individual security updates themselves.
+It makes it simpler to track the state of each server at a glance, and +discourages any manual changes to individual servers (they would be +wiped out during the instance replacement). + +TPA makes it simple to minimise disruption to the cluster as a whole +during the rehydration, even though the process must necessarily involve +downtime for individual servers as they are terminated and replaced. On +a [streaming replication cluster](architecture-M1), you can rehydrate +the replicas first, then use [`tpaexec switchover`](tpaexec-switchover) +to convert the primary to a replica before rehydrating it. On +[BDR-Always-ON clusters](architecture-BDR-Always-ON), you can [remove +each server from the haproxy server pool](tpaexec-server-pool) before +rehydrating it, then add it back afterwards. + +If you just want to install minor-version updates to Postgres and +associated components, you can use the +[`tpaexec update-postgres`](tpaexec-update-postgres) command instead. + +## Prerequisites + +To be able to rehydrate an instance, you must specify +`delete_on_termination: no` and `attach_existing: yes` for each of its +data volumes in `config.yml`. (The new instance will necessarily have a +new EBS root volume.) + +By default, when you terminate an EC2 instance, the EBS volumes attached +to it are also terminated. In this case, since we want to reattach them +to a new instance, we must disable `delete_on_termination`. Setting +`attach_existing` makes TPA search for old volumes when provisioning +a new instance and, if found, attach them to the instance after it's +running. + +Do not stop or terminate the old instance manually; the +`tpaexec rehydrate` command will do this after verifying that the +instance can be safely rehydrated. + +## Example + +Let's assume you have an AWS cluster configuration in `~/clusters/night`. + +### Change the configuration + +First, you must edit `config.yml` and specify the new AMI. For example: + +```yaml +ec2_ami: + Name: RHEL-8.3_HVM-20210209-x86_64-0-Hourly2-GP2 + Owner: '309956199498' +``` + +Check that `delete_on_termination` is disabled for each data volume. If +the parameter is not present, you can check its value through the AWS +EC2 management console. Click on 'Instances', select an instance, then +open the 'Description' tab and scroll down to 'Block devices', and click +on an EBS volume. If the "Delete on termination" flag is set to true, +you can [change it using `awscli`](#appendix). Also check +`attach_existing` and set it to `yes` if it isn't set already. + +Here's an example with both attributes correctly set: + +```yaml +instances: +- node: 1 + Name: vlad + subnet: 10.33.14.0/24 + role: primary + volumes: + - device_name: /dev/xvdf + volume_type: gp2 + volume_size: 16 + attach_existing: yes + delete_on_termination: false + vars: + volume_for: postgres_data + mountpoint: /var/lib/pgsql +``` + +(Note that volume parameters may be set in `instance_defaults` as well +as under specific instances. Search for `volumes:` and make sure all of +the relevant volumes have these two attributes set.) + +### Start the rehydration + +Here's the syntax for the rehydrate command: + +```bash +$ tpaexec rehydrate ~/clusters/night instancename +``` + +You can specify a single instance name or a comma-separated list of +instance names (but you cannot rehydrate all of the instances in the +cluster at once). 
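+
+For example, to rehydrate the `vlad` instance from the configuration
+above, or `vlad` together with a second, hypothetical instance named
+`igor`:
+
+```bash
+# Rehydrate a single instance
+$ tpaexec rehydrate ~/clusters/night vlad
+
+# Rehydrate two instances in one invocation (comma-separated list)
+$ tpaexec rehydrate ~/clusters/night vlad,igor
+```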
+ +The command will first check that every non-root EBS volume attached to +the instance (or instances) being rehydrated has the +`delete_on_termination` flag set to false. If this is not the case, it +will stop with an error before any instance is terminated. + +If the volume attributes are set correctly, the command will first +terminate each of the instances, then run provision and deploy to +replace them with new instances using the new AMI. + +## Rehydrate in phases + +In order to maintain cluster continuity, we recommend rehydrating the +cluster in phases. + +For example, in a [cluster that uses streaming +replication](architecture-M1) with a primary instance, two replicas, +and a Barman backup server, you could rehydrate the Barman instance and +one replica first, then another replica, then +[switchover](tpaexec-switchover) from the primary to one of the +rehydrated replicas, rehydrate the former primary, and (optionally), +switchover back to the original primary. This sequence ensures that one +primary and one replica are always available. + +## Appendix + +#### Using awscli to change volume attributes + +First, find the instance and EBS volume in the AWS management console. +Click on 'Instances', select an instance, open the 'Description' tab and +scroll down to 'Block devices', and select an EBS volume. To disable +`delete_on_termination`, run the following command after substituting +the correct values for the `--region`, `--instance-id`, and block device +name: + +```bash +$ aws ec2 modify-instance-attribute \ + --region eu-west-1 --instance-id i-XXXXXXXXXXXXXXXXX \ + --block-device-mappings \ + '[{"DeviceName": "/dev/xvdf", "Ebs": {"DeleteOnTermination": false}}]' +``` + +Do this for each of the data volumes for the instance, and after a brief +delay, you should be able to see the changes in the management console, +and `tpaexec rehydrate` will also detect that the instance can be safely +rehydrated. diff --git a/product_docs/docs/tpa/23/tpaexec-server-pool.mdx b/product_docs/docs/tpa/23/tpaexec-server-pool.mdx new file mode 100644 index 00000000000..05bd863f79d --- /dev/null +++ b/product_docs/docs/tpa/23/tpaexec-server-pool.mdx @@ -0,0 +1,46 @@ +--- +navTitle: Server pool management +title: BDR/HAProxy server pool management +originalFilePath: tpaexec-server-pool.md + +--- + +The `tpaexec pool-disable-server` and `pool-enable-server` commands +allow a BDR instance in a [BDR-Always-ON +cluster](architecture-BDR-Always-ON) to be temporarily removed from +the HAProxy active server pool for maintenance, and then added back +afterwards. + +These commands follow the same process as [rolling +updates](tpaexec-update-postgres) by default, so +`pool-disable-server` will wait for active transactions against a BDR +instance to complete and for pgbouncer to direct new connections to +another instance before completing. Use the `--nowait` option if you +don't want to wait for active sessions to end. + +Running `pool-disable-server` immediately followed by +`pool-enable-server` on an instance will have the effect of moving all +active traffic to a different instance (in essence, a switchover). This +allows you to run online maintenace tasks such as long-running VACUUM +commands, while maintaining instance availability. + +If there are multiple HAProxy servers configured with the same set of +`haproxy_backend_servers`, this command will remove or add the given +server to the pool of every relevant proxy in parallel. 
+ +## Example + +```bash +$ tpaexec pool-disable-server ~/clusters/clockwork orange # --nowait + +# HAProxy will no longer direct any traffic to the BDR instance named +# 'orange', so you can perform maintenance on it (e.g., run `tpaexec +# rehydrate`). + +$ tpaexec pool-enable-server ~/clusters/clockwork orange +``` + +When you remove an instance from the server pool, HAProxy will not +direct any traffic to it, even if the other instance(s) in the pool +fail. You must remember to add the server back to the active pool once +the maintenance activity is concluded. diff --git a/product_docs/docs/tpa/23/tpaexec-switchover.mdx b/product_docs/docs/tpa/23/tpaexec-switchover.mdx new file mode 100644 index 00000000000..b890b7a7732 --- /dev/null +++ b/product_docs/docs/tpa/23/tpaexec-switchover.mdx @@ -0,0 +1,39 @@ +--- +navTitle: Switchover +title: tpaexec switchover +originalFilePath: tpaexec-switchover.md + +--- + +The `tpaexec switchover` command performs a controlled switchover +between a primary and a replica in a [cluster that uses streaming +replication](architecture-M1). After you run this command, the +selected replica is promoted to be the new primary, the former primary +becomes a new replica, and any other replicas in the cluster will be +reconfigured to follow the new primary. + +The command checks that the cluster is healthy before switching roles, +and is designed to be run without having to shut down any repmgr +services beforehand. + +(This is equivalent to running `repmgr standby switchover` with the +`--siblings-follow` option.) + +## Example + +This command will make `replicaname` be the new primary in +`~/clusters/speedy`: + +```bash +$ tpaexec switchover ~/clusters/speedy replicaname +``` + +## Architecture options + +This command is applicable only to [M1 clusters](architecture-M1) +that have a single writable primary instance and one or more read-only +replicas. + +For BDR-Always-ON clusters, use the +[HAProxy server pool management commands](tpaexec-server-pool) to +perform maintenance on BDR instances. diff --git a/product_docs/docs/tpa/23/tpaexec-test.mdx b/product_docs/docs/tpa/23/tpaexec-test.mdx new file mode 100644 index 00000000000..f520da45c89 --- /dev/null +++ b/product_docs/docs/tpa/23/tpaexec-test.mdx @@ -0,0 +1,25 @@ +--- +navTitle: Testing +title: tpaexec test +originalFilePath: tpaexec-test.md + +--- + +Now we run architecture-specific tests against a deployed cluster to +verify the installation. At the end of this stage, we have a +fully-functioning cluster. + +You must have already run `tpaexec configure`, `tpaexec provision`, +and `tpaexec deploy` successfully before you can run `tpaexec test`. + +## Quickstart + +```bash +[tpa]$ tpaexec test ~/clusters/speedy -v +``` + +Output is once again logged to `ansible.log` in the cluster directory. + +If this command succeeds, your cluster works. + +Congratulations. diff --git a/product_docs/docs/tpa/23/tpaexec-update-postgres.mdx b/product_docs/docs/tpa/23/tpaexec-update-postgres.mdx new file mode 100644 index 00000000000..5364f210d83 --- /dev/null +++ b/product_docs/docs/tpa/23/tpaexec-update-postgres.mdx @@ -0,0 +1,100 @@ +--- +navTitle: Rolling updates +title: TPA rolling updates +originalFilePath: tpaexec-update-postgres.md + +--- + +The `tpaexec update-postgres` command performs a minor-version update +of Postgres and related packages without interrupting overall cluster +operations. 
Individual instances will be stopped and restarted during +the update, but queries will be routed in such a way as to allow +applications to continue database operations. + +The exact procedure is architecture-specific. + +## BDR-Always-ON + +For BDR clusters, the update process goes through the cluster instances +one by one and does the following: + +1. Tell haproxy the server is under maintenance. +2. If the instance was the active server, request pgbouncer to reconnect + and wait for active sessions to be closed. +3. Stop Postgres, update packages, and restart Postgres. +4. Finally, mark the server as "ready" again to receive requests through + haproxy. + +BDR logical standby or physical replica instances are updated without +any haproxy or pgbouncer interaction. Non-Postgres instances in the +cluster are left alone. + +You can control the order in which the cluster's instances are updated +by defining the `update_hosts` variable: + +``` +$ tpaexec update-postgres ~/clusters/speedy \ + -e update_hosts=quirk,keeper,quaver +``` + +This may be useful to minimise lead/shadow switchovers during the update +by listing the active BDR primary instances last, so that the shadow +servers are updated first. + +If your environment requires additional actions, the +[postgres-pre-update and postgres-post-update hooks](tpaexec-hooks) +allow you to execute custom Ansible tasks before and after the package +installation step. + +## M1 + +For M1 clusters, `update-postgres` will first update the streaming +replicas one by one, then perform a [switchover](tpaexec-switchover) +from the primary to one of the replicas, update the primary, and +switchover back to it again. + +## Package version selection + +By default, `tpaexec update-postgres` will update to the latest +available versions of the installed packages if you did not explicitly +specify any package versions (e.g., Postgres, BDR, or pglogical) when +you created the cluster. + +If you did select specific versions, for example by using any of the +`--xxx-package-version` options (e.g., postgres, bdr, pglogical) to +[`tpaexec configure`](tpaexec-configure), or by defining +`xxx_package_version` variables in config.yml, the update will do +nothing because the installed packages already satisfy the requested +versions. + +In this case, you must edit config.yml, remove the version settings, and +re-run `tpaexec provision`. The update will then install the latest +available packages. You can still update to a specific version by +specifying versions on the command line as shown below: + +``` +$ tpaexec update-postgres ~/clusters/speedy -vv \ + -e postgres_package_version="2:11.6r2ndq1.6.13*" \ + -e pglogical_package_version="2:3.6.11*" \ + -e bdr_package_version="2:3.6.11*" +``` + +Please note that version syntax here depends on your OS distribution and +package manager. In particular, yum accepts `*xyz*` wildcards, while +apt only understands `xyz*` (as in the example above). + +Note: see limitations of using wildcards in package_version in +[tpaexec-configure](tpaexec-configure#known-issue-with-wildcard-use). + +It is your responsibility to ensure that the combination of Postgres, +BDR, and pglogical package versions that you request are sensible. That +is, they should work together, and there should be an upgrade path from +what you have installed to the new versions. 
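+
+For reference, this is roughly what such pinned settings look like in
+config.yml (that is, the settings you would remove in the scenario
+described above). The variable names follow the `xxx_package_version`
+pattern; the version strings and the placement under `cluster_vars` are
+illustrative assumptions, so adapt them to your own configuration:
+
+```yaml
+cluster_vars:
+  # Illustrative values only; use versions available in your repositories
+  postgres_package_version: "2:11.6r2ndq1.6.13*"
+  bdr_package_version: "2:3.6.11*"
+  pglogical_package_version: "2:3.6.11*"
+```
+
+Remember that if you remove or change these settings, you must re-run
+`tpaexec provision` before the next `tpaexec update-postgres`, as noted
+above.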
+ +For BDR clusters, it is a good idea to explicitly specify exact versions +for all three components (Postgres, BDR, pglogical) rather than rely on +the package manager's dependency resolution to select the correct +dependencies. + +We strongly recommend testing the update in a QA environment before +running it in production. diff --git a/scripts/source/dispatch_product.py b/scripts/source/dispatch_product.py index 736ed6b80f1..a0946b7daeb 100755 --- a/scripts/source/dispatch_product.py +++ b/scripts/source/dispatch_product.py @@ -17,6 +17,7 @@ "EnterpriseDB/bdr": f"node {args.workspace}/destination/scripts/source/bdr.js {args.workspace}/source {args.workspace}/destination --unhandled-rejections=strict", "EnterpriseDB/pglogical": f"node {args.workspace}/destination/scripts/source/pglogical.js {args.workspace}/source {args.workspace}/destination --unhandled-rejections=strict", "EnterpriseDB/harp": f"rsync -a --delete {args.workspace}/source/docs/user_guide/ {args.workspace}/destination/product_docs/docs/harp/2.0/", + "EnterpriseDB/tpaexec": f"{args.workspace}/destination/scripts/source/process-tpa-docs.sh {args.workspace}/source {args.workspace}/destination", } ret = os.system( diff --git a/scripts/source/process-tpa-docs.sh b/scripts/source/process-tpa-docs.sh new file mode 100755 index 00000000000..9f16993cd5f --- /dev/null +++ b/scripts/source/process-tpa-docs.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +TPAVERSION=23 + +if [ -z $1 ] || [ -z $2 ] +then + echo "the path to the source and destination checkouts must be provided" + exit 1 +fi + +# convert inputs to actual directory names, in case a relative path is passed in. +SOURCE_CHECKOUT=`cd $1 && pwd` +DESTINATION_CHECKOUT=`cd $2 && pwd` + +cd $DESTINATION_CHECKOUT/product_docs/docs/tpa/$TPAVERSION/ +node $DESTINATION_CHECKOUT/scripts/source/files-to-ignore.mjs \ + "$DESTINATION_CHECKOUT/product_docs/docs/tpa/$TPAVERSION/" \ + > $SOURCE_CHECKOUT/files-to-ignore.txt + +cd $SOURCE_CHECKOUT/docs/ + +node $DESTINATION_CHECKOUT/scripts/source/tpaexec.js src + +node $DESTINATION_CHECKOUT/scripts/source/merge-indexes.mjs \ + "$SOURCE_CHECKOUT/docs/src/index.mdx" \ + "$DESTINATION_CHECKOUT/product_docs/docs/tpa/$TPAVERSION/index.mdx" \ + "$SOURCE_CHECKOUT/docs/src/index.mdx" \ + >> $SOURCE_CHECKOUT/files-to-ignore.txt + +node $DESTINATION_CHECKOUT/scripts/source/merge-indexes.mjs \ + "$SOURCE_CHECKOUT/docs/src/reference/index.mdx" \ + "$DESTINATION_CHECKOUT/product_docs/docs/tpa/$TPAVERSION/reference/index.mdx" \ + "$SOURCE_CHECKOUT/docs/src/reference/index.mdx" \ + >> $SOURCE_CHECKOUT/files-to-ignore.txt + +rsync -av --delete --exclude="*.md" --exclude="architectures" --exclude="templates" --exclude-from=$SOURCE_CHECKOUT/files-to-ignore.txt src/ $DESTINATION_CHECKOUT/product_docs/docs/tpa/$TPAVERSION/ diff --git a/scripts/source/tpaexec.js b/scripts/source/tpaexec.js new file mode 100644 index 00000000000..5f47300b5f0 --- /dev/null +++ b/scripts/source/tpaexec.js @@ -0,0 +1,291 @@ +// run: node scripts/source/tpaexec.js source_path" +// purpose: +// Import and convert the tpa docs to EDB Docs -style MDX +// +const path = require("path"); +const fs = require("fs/promises"); +const { read, write } = require("to-vfile"); +const remarkParse = require("remark-parse"); +const mdx = require("remark-mdx"); +const unified = require("unified"); +const remarkFrontmatter = require("remark-frontmatter"); +const remarkStringify = require("remark-stringify"); +const admonitions = require("remark-admonitions"); +const yaml = require("js-yaml"); +const visit = 
require("unist-util-visit"); +const visitAncestors = require("unist-util-visit-parents"); +const mdast2string = require("mdast-util-to-string"); +const { exec } = require("child_process"); +const isAbsoluteUrl = require("is-absolute-url"); + +const fileToMetadata = {}; +const args = process.argv.slice(2); +const basePath = path.resolve(args[0], ""); +const referenceMarkdownFiles = []; + +(async () => { + const processor = unified() + .use(remarkParse) + .use(remarkStringify, { emphasis: "*", bullet: "-", fences: true }) + .use(admonitions, { tag: "!!!", icons: "none", infima: true }) + .use(remarkFrontmatter) + .use(mdx) + .use(transformer); + + const processEntry = async (dirEntry, destPath, indexFilename) => { + if (typeof dirEntry === "string") dirEntry = { "": dirEntry }; + for (const [navTitle, dest] of Object.entries(dirEntry)) { + if (!dest) { + console.warn("don't know how to process entry: ", dirEntry); + continue; + } + + // subsection + // + if (dest instanceof Array) { + let subDestPath = destPath; + let subIndexFilename = indexFilename; + // special handling: if navTitle ends with a slash, put contents in subdirectory + if (navTitle.endsWith("/")) { + fileToMetadata[indexFilename] = { + navigation: [], + ...fileToMetadata[indexFilename], + }; + fileToMetadata[indexFilename].navigation.push( + navTitle.replace(/\/$/, ""), + ); + + subIndexFilename = path.relative(basePath, "/dev/null"); + fileToMetadata[subIndexFilename] = { + title: navTitle.replace(/\/$/, ""), + }; + subDestPath = path.resolve(destPath, navTitle); + } + // default: add section break + else { + fileToMetadata[indexFilename] = { + navigation: [], + ...fileToMetadata[indexFilename], + }; + fileToMetadata[indexFilename].navigation.push("#" + navTitle); + } + + for (const subEntry of dest) { + await processEntry(subEntry, subDestPath, subIndexFilename); + } + + // write dummy index + if (navTitle.endsWith("/")) { + await process( + subIndexFilename, + path.resolve(subDestPath, "index.mdx"), + ); + } + continue; + } + + // normal entry + // + const fileAbsolutePath = path.resolve(basePath, dest); + const filename = path.relative(basePath, fileAbsolutePath); + const destFilepath = path.resolve( + destPath, + filename.replace(/\//g, "_") + "x", + ); + + fileToMetadata[filename] = { ...fileToMetadata[filename], navTitle }; + fileToMetadata[indexFilename] = { + navigation: [], + ...fileToMetadata[indexFilename], + }; + fileToMetadata[indexFilename].navigation.push( + path.basename(destFilepath, ".mdx"), + ); + + if (filename === indexFilename) continue; + await process(fileAbsolutePath, destFilepath); + } + }; + + const process = async (fileAbsolutePath, destFilepath) => { + let file = await read(fileAbsolutePath); + file.contents = stripEmptyComments(file.contents.toString()); + file = await processor.process(file); + file.path = destFilepath; + try { + await fs.mkdir(path.dirname(file.path), { recursive: true }); + } catch {} + await write(file); + }; + + const mdIndex = yaml.load( + await fs.readFile(path.resolve(basePath, "../tpa.yml"), "utf8"), + ); + + const markdownToProcess = mdIndex.nav; + const indexFilename = "index.md"; + + // look for markdown files in the root but not in the index, add them under "reference" + // + function findDirEntry(filename, entries) { + for (const dirEntry of entries) { + for (const [navTitle, dest] of Object.entries(dirEntry)) { + let result = null; + if (dest instanceof Array) { + result = findDirEntry(filename, dest); + } + if (dest === filename) { + result = {}; + 
result[navTitle] = dest; + } + if (result) return result; + } + } + } + const { globby } = await import("globby"); + const allMarkdown = await globby(path.join(basePath, "*.md")); + for (const mdxPath of allMarkdown) { + if (findDirEntry(path.basename(mdxPath), markdownToProcess)) continue; + referenceMarkdownFiles.push(path.basename(mdxPath)); + } + if (referenceMarkdownFiles.length) + markdownToProcess.push({ "reference/": referenceMarkdownFiles }); + + for (const dirEntry of markdownToProcess) { + if (!dirEntry) continue; + await processEntry(dirEntry, basePath, indexFilename); + } + fileToMetadata[indexFilename].navTitle = fileToMetadata[indexFilename].title = + "TPA"; + + // write out index w/ navigation tree + await process( + path.resolve(basePath, indexFilename), + path.resolve(basePath, "index.mdx"), + ); +})(); + +// GPP leaves the files littered with these; they alter parsing by flipping sections to HTML context +// remove them BEFORE parsing to avoid issues +function stripEmptyComments(rawMarkdown) { + return rawMarkdown.replace(//g, ""); +} + +// Transforms: +// - identify title +// - identify navTitle +// - Create frontmatter YAML from above +// + +function transformer() { + return (tree, file) => { + const filename = path.relative(basePath, file.path); + const metadata = fileToMetadata[filename]; + if (!metadata) console.warn(`No metadata for ${filename}`); + let title = ""; + for (let i = 0; i < tree.children.length; ++i) { + const node = tree.children[i]; + if (node.type === "heading" && node.depth === 1) { + title = mdast2string(node); + tree.children.splice(i--, 1); + } + } + + // Apart from , there shouldn't be any JSX in these - so look for it and remove it. + // Warn about these, except for comments + visit(tree, "jsx", (node, index, parent) => { + // todo: use HAST parser here - this is not reliable + + // strip (potentially NON-EMPTY) HTML comments - these are not valid in JSX + const newValue = node.value.replace(/(?=/g, ""); + if (newValue !== node.value) { + node.value = newValue; + if (newValue.trim()) return; + } + + // ignore placeholder + if (node.value.match(/^ file === filename, + ); + visit(tree, ["link", "image"], (node) => { + let url = node.url || node.src; + if (isAbsoluteUrl(url) || url[0] === "/") return; + url = url.replace(/\.md(?=$|\?|#)/, ""); + const parsed = new URL(url, "base:/reference/"); + if (parsed.protocol !== "base:" || parsed.pathname === "/reference/") + return; + if ( + referenceMarkdownFiles.find( + (file) => + path.basename(file, ".md") === + parsed.pathname.replace(/^\/reference\//, ""), + ) + ) { + if (!isInReferences) url = parsed.href.replace(/^base:\//, ""); + } else if (isInReferences) { + url = parsed.href.replace(/^base:\/reference\//, "../"); + } + if (node.url) node.url = url; + else node.src = url; + }); + + // MDExtra anchors: + // - identify + // - remove + // - create explicit anchor preceding removal in container block + const anchorRE = /{#([^}]+)}/; + visitAncestors(tree, "text", (node, ancestors) => { + let anchor = node.value.match(anchorRE); + if (!anchor) return; + anchor = anchor[1]; + node.value = node.value.replace(anchorRE, ""); + + const blockTypes = ["root", "paragraph", "listItem", "blockquote"]; + for ( + let i = ancestors.length - 1, + parent = ancestors[ancestors.length - 1], + child = node; + i >= 0; + --i, child = parent, parent = ancestors[i] + ) { + if (!blockTypes.includes(parent.type)) continue; + anchor = { type: "jsx", value: `
` }; + parent.children.splice(parent.children.indexOf(child), 0, anchor); + break; + } + }); + + // images: strip Markdown Extra attribute block + visit(tree, "image", (node, index, parent) => { + const attrRE = /{[^}]+}/; + if (/{[^}]+?}/.test(parent.children[index + 1]?.value)) + parent.children[index + 1].value = parent.children[ + index + 1 + ].value.replace(attrRE, ""); + }); + + if (!metadata.title) metadata.title = title; + if ( + metadata.title?.trim() === metadata.navTitle?.trim() || + !metadata.navTitle?.trim()?.length + ) + delete metadata.navTitle; + metadata.originalFilePath = filename; + tree.children.unshift({ type: "yaml", value: yaml.dump(metadata) }); + }; +} diff --git a/src/components/card-decks.js b/src/components/card-decks.js index 75a258f6b19..cc1870480a4 100644 --- a/src/components/card-decks.js +++ b/src/components/card-decks.js @@ -64,27 +64,30 @@ const SimpleCard = ({ card }) => ( ); -const CardDecks = ({ cards, cardType = "simple" }) => { +const CardDecks = ({ cards, cardType = "simple", deckTitle = "" }) => { return ( -
- {cards.map((card) => { - return ( - - {cardType === "full" ? ( - - ) : ( - - )} - - ); - })} -
+ <> + {deckTitle &&

{deckTitle}

} +
+ {cards.map((card) => { + return ( + + {cardType === "full" ? ( + + ) : ( + + )} + + ); + })} +
+ ); }; diff --git a/src/constants/products.js b/src/constants/products.js index 856a1d37490..242ad8c70ab 100644 --- a/src/constants/products.js +++ b/src/constants/products.js @@ -57,4 +57,6 @@ export const products = { }, repmgr: { name: "repmgr", iconName: IconNames.HIGH_AVAILABILITY }, slony: { name: "Slony Replication", iconName: IconNames.NETWORK2 }, + tde: { name: "Transparent Data Encryption", iconName: IconNames.SECURITY }, + tpa: { name: "Trusted Postgres Architect", iconName: IconNames.INSTANCES }, }; diff --git a/src/pages/index.js b/src/pages/index.js index 45388f13ba6..b6f60930df5 100644 --- a/src/pages/index.js +++ b/src/pages/index.js @@ -309,6 +309,12 @@ const Page = () => ( Replication Server + + + Trusted Postgres Architect + + + { if (!node || !node.items) return null; if (Object.values(TileModes).includes(mode) && mode !== TileModes.None) { - const tiles = node.items.map((n) => getCards(n, mode === "simple" ? 0 : 1)); + const decks = {}; + let currentDeckName = ""; + for (let item of node.items) { + if (!item.path) { + currentDeckName = item.title; + } else { + decks[currentDeckName] = decks[currentDeckName] || []; + decks[currentDeckName].push(getCards(item, mode === "simple" ? 0 : 1)); + } + } - return ; + return Object.keys(decks).map((deckName) => { + return ( + + ); + }); } return null; };