diff --git a/build-sources.json b/build-sources.json index 6e2460b8192..e300a299746 100644 --- a/build-sources.json +++ b/build-sources.json @@ -10,6 +10,8 @@ "edb_plus": true, "efm": true, "epas": true, + "pgd": true, + "pgd_cli": true, "eprs": true, "hadoop_data_adapter": true, "jdbc_connector": true, diff --git a/gatsby-config.js b/gatsby-config.js index 8565b94e937..fb80bdd3039 100644 --- a/gatsby-config.js +++ b/gatsby-config.js @@ -27,6 +27,8 @@ const sourceToPluginConfig = { edb_plus: { name: "edb_plus", path: "product_docs/docs/edb_plus" }, efm: { name: "efm", path: "product_docs/docs/efm" }, epas: { name: "epas", path: "product_docs/docs/epas" }, + pgd: { name: "pgd", path: "product_docs/docs/pgd" }, + pgd_cli: { name: "edb_pgd", path: "product_docs/docs/pgd_cli" }, eprs: { name: "eprs", path: "product_docs/docs/eprs" }, hadoop_data_adapter: { name: "hadoop_data_adapter", @@ -71,6 +73,7 @@ const sourceToPluginConfig = { }, pem: { name: "pem", path: "product_docs/docs/pem" }, pgbouncer: { name: "pgbouncer", path: "product_docs/docs/pgbouncer" }, + pgd_cli: { name: "pgd_cli", path: "product_docs/docs/pgd_cli" }, pglogical: { name: "pglogical", path: "product_docs/docs/pglogical" }, pgpool: { name: "pgpool", path: "product_docs/docs/pgpool" }, postgis: { name: "postgis", path: "product_docs/docs/postgis" }, diff --git a/product_docs/docs/bdr/4.0/appusage.mdx b/product_docs/docs/bdr/4.0/appusage.mdx deleted file mode 100644 index cd2a3989b89..00000000000 --- a/product_docs/docs/bdr/4.0/appusage.mdx +++ /dev/null @@ -1,691 +0,0 @@ ---- -title: Application Usage -originalFilePath: appusage.md - ---- - -This chapter looks at BDR from an application or user perspective. - -Setting up nodes is discussed in a later chapter, as is replication -of DDL, and various options for controlling replication using -replication sets. - -## Application Behavior - -BDR supports replicating changes made on one node to other nodes. 
- -BDR will, by default, replicate all changes from INSERTs, UPDATEs, DELETEs -and TRUNCATEs from the source node to other nodes. Only the final changes -will be sent, after all triggers and rules have been processed. For example, -INSERT ... ON CONFLICT UPDATE will send either an INSERT or an UPDATE -depending on what occurred on the origin. If an UPDATE or DELETE affects -zero rows, then no changes will be sent. - -INSERTs can be replicated without any pre-conditions. - -For UPDATEs and DELETEs to be replicated on other nodes, we must be able to -identify the unique rows affected. BDR requires that a table have either -a PRIMARY KEY defined, a UNIQUE constraint or have an explicit REPLICA IDENTITY -defined on specfic column(s). If one of those is not defined, a WARNING will be -generated, and later UPDATEs or DELETEs will be explicitly blocked. -If REPLICA IDENTITY FULL is defined for a table, then a unique index is not required; -in that case, UPDATEs and DELETEs are allowed and will use the first non-unique -index that is live, valid, not deferred and does not have expressions or WHERE -clauses, otherwise a sequential scan will be used. - -TRUNCATE can be used even without a defined replication identity. -Replication of TRUNCATE commands is supported, but some care must be taken -when truncating groups of tables connected by foreign keys. When replicating -a truncate action, the subscriber will truncate the same group of tables that -was truncated on the origin, either explicitly specified or implicitly -collected via CASCADE, except in cases where replication sets are defined, -see [Replication Sets](repsets) chapter for further details and examples. -This will work correctly if all affected tables are part of the same -subscription. But if some tables to be truncated on the subscriber have -foreign-key links to tables that are not part of the same (or any) -replication set, then the application of the truncate action on the -subscriber will fail. 
- -Row-level locks taken implicitly by INSERT, UPDATE and DELETE commands will -be replicated as the changes are made. -Table-level locks taken implicitly by INSERT, UPDATE, DELETE and TRUNCATE -commands will also be replicated. -Explicit row-level locking (SELECT ... FOR UPDATE/FOR SHARE) by user sessions -is not replicated, nor are advisory locks. Information stored by transactions -running in SERIALIZABLE mode is not replicated to other nodes; the -transaction isolation level of SERIALIAZABLE is supported but transactions -will not be serialized across nodes, in the presence of concurrent -transactions on multiple nodes. - -If DML is executed on multiple nodes concurrently then potential conflicts -could occur if executing with asynchronous replication and these must be -must be either handled or avoided. Various avoidance mechanisms are possible, -discussed in the chapter on [Conflicts](conflicts) which is also -required reading. - -Sequences need special handling, described in the [Sequences](sequences) -chapter. - -Binary data in BYTEA columns is replicated normally, allowing "blobs" of data -up to 1GB in size. Use of the PostgreSQL "Large object" facility is not -supported in BDR. - -Rules execute only on the origin node, so are not executed during apply, -even if they are enabled for replicas. - -Replication is only possible from base tables to base tables. That is, the -tables on the source and target on the subscription side must be -tables, not views, materialized views, or foreign tables. Attempts to -replicate tables other than base tables will result in an error. -DML changes that are made through updatable views are resolved through to -base tables on the origin and then applied to the same base table name on -the target. - -BDR supports partitioned tables transparently, meaning that a partitioned -table can be added to a replication set and -changes that involve any of the partitions will be replicated downstream. 
- -By default, triggers execute only on the origin node. For example, an INSERT -trigger executes on the origin node and is ignored when we apply the change on -the target node. You can specify that triggers should execute on both the origin -node at execution time and on the target when it is replicated ("apply time") -by using `ALTER TABLE ... ENABLE ALWAYS TRIGGER`, or use the `REPLICA` option -to execute only at apply time, `ALTER TABLE ... ENABLE REPLICA TRIGGER`. - -Some types of trigger are not executed on apply, even if they exist on a -table and are currently enabled. Trigger types not executed are - -- Statement-level triggers (FOR EACH STATEMENT) -- Per-column UPDATE triggers (UPDATE OF column_name [, ...]) - -BDR replication apply uses the system-level default search_path. Replica -triggers, stream triggers and index expression functions may assume -other search_path settings which will then fail when they execute on apply. -To ensure this does not occur, resolve object references clearly using -either the default search_path only, always use fully qualified references to -objects, e.g. schema.objectname, or set the search path for a function using -`ALTER FUNCTION ... SET search_path = ...` for the functions affected. - -Note that BDR assumes that there are no issues related to text or other -collatable datatypes, i.e. all collations in use are available on all -nodes and the default collation is the same on all nodes. Replication of -changes uses equality searches to locate Replica Identity values, so this -will not have any effect except where unique indexes are explicitly defined -with non-matching collation qualifiers. Row filters might be affected -by differences in collations if collatable expressions were used. - -BDR handling of very-long "toasted" data within PostgreSQL is transparent to -the user. Note that the TOAST "chunkid" values will likely differ between -the same row on different nodes, but that does not cause any problems. 
- -BDR cannot work correctly if Replica Identity columns are marked as "external". - -PostgreSQL allows CHECK() constraints that contain volatile functions. Since -BDR re-executes CHECK() constraints on apply, any subsequent re-execution that -doesn't return the same result as previously will cause data divergence. - -BDR does not restrict the use of Foreign Keys; cascading FKs are allowed. - -## Non-replicated statements - -None of the following user commands are replicated by BDR, so their effects -occur on the local/origin node only: - -- Cursor operations (DECLARE, CLOSE, FETCH) -- Execution commands (DO, CALL, PREPARE, EXECUTE, EXPLAIN) -- Session management (DEALLOCATE, DISCARD, LOAD) -- Parameter commands (SET, SHOW) -- Constraint manipulation (SET CONSTRAINTS) -- Locking commands (LOCK) -- Table Maintenance commands (VACUUM, ANALYZE, CLUSTER, REINDEX) -- Async operations (NOTIFY, LISTEN, UNLISTEN) - -Note that since the `NOTIFY` SQL command and the `pg_notify()` functions -are not replicated, notifications are *not* reliable in case of failover. -This means that notifications could easily be lost at failover if a -transaction is committed just at the point the server crashes. -Applications running `LISTEN` may miss notifications in case of failover. -This is regrettably true in standard PostgreSQL replication and BDR does -not yet improve on this. CAMO and Eager replication options do not -allow the `NOTIFY` SQL command or the `pg_notify()` function. - -## DML and DDL Replication - -Note that BDR does not replicate the DML statement, it replicates the changes -caused by the DML statement. So for example, an UPDATE that changed -two rows would replicate two changes, whereas a DELETE that did not -remove any rows would not replicate anything. This means that the results -of execution of volatile statements are replicated, ensuring there is no -divergence between nodes as might occur with statement-based replication. 
- -DDL replication works differently to DML. For DDL, BDR replicates the -statement, which is then executed on all nodes. So a DROP TABLE IF EXISTS -might not replicate anything on the local node, but the statement is -still sent to other nodes for execution if DDL replication is enabled. -Full details are covered in their own chapter: [DDL replication]. - -BDR goes to great lengths to ensure that intermixed DML and DDL -statements work correctly, even within the same transaction. - -## Replicating between different release levels - -BDR is designed to replicate between nodes that have different major -versions of PostgreSQL. This is a feature designed to allow major -version upgrades without downtime. - -BDR is also designed to replicate between nodes that have different -versions of BDR software. This is a feature designed to allow version -upgrades and maintenance without downtime. - -However, while it's possible to join a node with a major version in -a cluster, you can not add a node with a minor version if the cluster -uses a newer protocol version, this will return error. - -Both of the above features may be affected by specific restrictions; -any known incompatibilities will be described in the release notes. - -## Replicating between nodes with differences - -By default, DDL will automatically be sent to all nodes. This can be -controlled manually, as described in [DDL Replication](ddl), which -could be used to create differences between database schemas across nodes. -BDR is designed to allow replication to continue even while minor -differences exist between nodes. These features are designed to allow -application schema migration without downtime, or to allow logical -standby nodes for reporting or testing. - -Currently, replication requires the same table name on all nodes. A future -feature may allow a mapping between different table names. 
- -It is possible to replicate between tables with dissimilar partitioning -definitions, such as a source which is a normal table replicating to a -partitioned table, including support for updates that change partitions -on the target. It can be faster if the partitioning definition is the -same on the source and target since dynamic partition routing need not be -executed at apply time. -Further details are available in the chapter on Replication Sets. - -By default, all columns are replicated. -BDR replicates data columns based on the column name. If a column -has the same name but a different datatype, we attempt to cast from the source -type to the target type, if casts have been defined that allow that. - -BDR supports replicating between tables that have a different number of columns. - -If the target has missing column(s) from the source then BDR will raise -a target_column_missing conflict, for which the default conflict resolver -is ignore_if_null. This will throw an ERROR if a non-NULL value arrives. -Alternatively, a node can also be configured with a conflict resolver of ignore. -This setting will not throw an ERROR, just silently ignore any additional -columns. - -If the target has additional column(s) not seen in the source record then BDR will -raise a source_column_missing conflict, for which the default conflict resolver -is use_default_value. Replication will proceed if the additional columns -have a default, either NULL (if nullable) or a default expression, but will -throw an ERROR and halt replication if not. - -Transform triggers can also be used on tables to provide default values -or alter the incoming data in various ways before apply. - -If the source and the target have different constraints, then -replication will be attempted, but it might fail if the rows from -source cannot be applied to the target. Row filters may help here. - -Replicating data from one schema to a more relaxed schema won't cause failures. 
-Replicating data from a schema to a more restrictive schema will be a source of -potential failures. -The right way to solve this is to place a constraint on the more relaxed side, -so bad data is prevented from being entered. That way, no bad data ever arrives -via replication, so it will never fail the transform into the more restrictive -schema. For example, if one schema has a column of type TEXT and another schema -defines the same column as XML, add a CHECK constraint onto the TEXT column -that enforces that the text is XML. - -A table may be defined with different indexes on each node. By default, the -index definitions will be replicated. Refer to [DDL Replication](ddl) to -specify how to create an index only on a subset of nodes, or just locally. - -Storage parameters, such as fillfactor and toast_tuple_target, may differ -between nodes for a table without problems. An exception to that is the -value of a table's storage parameter `user_catalog_table` must be identical -on all nodes. - -A table being replicated should be owned by the same user/role on each node. -Refer to [Security and Roles](security) for further discussion. - -Roles may have different passwords for connection on each node, though -by default changes to roles are replicated to each node. Refer to -[DDL Replication](ddl) to specify how to alter a role password only on a -subset of nodes, or just locally. - -## Comparison between nodes with differences - -Livecompare is a tool used for data comparison on a database, against BDR and -non-BDR nodes. It needs a minimum number of two connections to compare against -and reach a final result. - -From Livecompare 1.3 , you could configure with `all_bdr_nodes` set. This will -save you from clarifying all the relevant DSNs for each separate node in the -cluster. A BDR cluster has N amount of nodes with connection information, but -its only the initial and output connection that livecompare 1.3+ needs in order -to complete its job. 
Setting `logical_replication_mode` will state how all the -nodes are communicating. - -All the configuration is done within a .ini file, named bdrLC.ini for example. -Templates for this configuration file can be seen within the -`/etc/2ndq-livecompare/` location, where they were placed after the package -install. - -During the execution of LiveCompare, you will see N+1 progress bars, N being -the number of processes. Once all the tables are sourced a time will display, -as the transactions per second (tps) has been measured. This will continue to -count the time, giving you an estimate, then a total execution time at the end. - -This tool has a lot of customisation and filters. Such as tables, schemas and -replication_sets. LiveCompare can use stop-start without losing context -information, so it can be run at convenient times. After the comparison, a -summary and a DML script are generated so the user can review it. Please apply -the DML to fix the found differences, if any. - -## General Rules for Applications - -As discussed above, BDR uses replica identity values to identify the rows to -be changed. -Applications can cause difficulties if they insert, delete, and then later -re-use the same unique identifiers. -This is known as the ABA Problem. BDR cannot know whether the rows are the -current row, the last row, or much older rows. -See . - -Similarly, since BDR uses table names to identify the table against which -changes will be replayed, a similar ABA problem exists with applications that -CREATE, then DROP, and then later re-use the same object names. - -These issues give rise to some simple rules for applications to follow: - -1. Use unique identifiers for rows (INSERT) -2. Avoid modification of unique identifiers (UPDATE) -3. Avoid reuse of deleted unique identifiers -4. Avoid reuse of dropped object names - -In the general case, breaking those rules can lead to data anomalies and -divergence. 
Applications can break those rules as long as certain conditions -are met, but use caution: although anomalies can be unlikely, they are not -impossible. For example, a row value can be reused as long as the DELETE has -been replayed on all nodes, including down nodes. This might normally occur in -less than a second, but could potentially take days if a severe issue occurred -on one node that prevented it from restarting correctly. - -## Timing Considerations and Synchronous Replication - -Being asynchronous by default, peer nodes may lag behind making it's -possible for a client connected to multiple BDR nodes or switching -between them to read stale data. - -A [queue wait function](functions#bdrwait_for_apply_queue) is -provided for clients or proxies to prevent such stale reads. - -The synchronous replication features of Postgres are available to BDR -as well. In addition, BDR provides multiple variants for more synchronous -replication. Please refer to the -[Durability & Performance Options](durability) -chapter for an overview and comparison of all variants available and -its different modes. - -## Application Testing - -BDR applications can be tested using the following programs, -in addition to other techniques. - -- [TPAexec] -- [pgbench with CAMO/Failover options] -- [isolationtester with multi-node access] - -### TPAexec - -TPAexec is the system used by EDB to deploy reference TPA -architectures, including those based on Postgres-BDR. - -TPAexec includes test suites for each reference architecture; it also -simplifies creating and managing a local collection of tests to be run -against a TPA cluster, using a syntax as in the following example: - -``` -tpaexec test mycluster mytest -``` - -We strongly recommend that developers write their own multi-node suite -of TPAexec tests which verify the main expected properties of the -application. 
- -### pgbench with CAMO/Failover options - -In EDB Postgres Extended, the pgbench has been extended to allow users to -run failover tests while using CAMO or regular BDR deployments. The following -new options have been added: - -``` --m, --mode=regular|camo|failover -mode in which pgbench should run (default: regular) - ---retry -retry transactions on failover -``` - -in addition to the above options, the connection information about the -peer node for failover must be specified in [DSN -form](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING). - -- Use `-m camo` or `-m failover` to specify the mode for pgbench. - The `-m failover` specification can be used to test failover in - regular BDR deployments. - -- Use `--retry` to specify whether transactions should be retried when - failover happens with `-m failover` mode. This is enabled by default - for `-m camo` mode. - -Here's an example invocation in a CAMO environment: - -```sh - pgbench -m camo -p $node1_port -h $node1_host bdrdemo \ - "host=$node2_host user=postgres port=$node2_port dbname=bdrdemo" -``` - -The above command will run in `camo` mode. It will connect to `node1` and run the tests; if the -connection to `node1` connection is lost, then pgbench will connect to -`node2`. It will query `node2` to get the status of in-flight transactions. -Aborted and in-flight transactions will be retried in `camo` mode. - -In `failover` mode, if `--retry` is specified then in-flight transactions will be retried. In -this scenario there is no way to find the status of in-flight transactions. - -### isolationtester with multi-node access - -isolationtester has been extended to allow users to run tests on multiple -sessions and on multiple nodes. This is used for internal BDR testing, -though it is also available for use with user application testing. 
- -``` -$ isolationtester \ - --outputdir=./iso_output \ - --create-role=logical \ - --dbname=postgres \ - --server 'd1=dbname=node1' \ - --server 'd2=dbname=node2' \ - --server 'd3=dbname=node3' -``` - -Isolation tests are a set of tests run for examining concurrent behaviors in -PostgreSQL. These tests require running multiple interacting transactions, -which requires management of multiple concurrent connections, and therefore -can't be tested using the normal `pg_regress` program. The name "isolation" -comes from the fact that the original motivation was to test the -serializable isolation level; but tests for other sorts of concurrent -behaviors have been added as well. - -It is built using PGXS as an external module. -On installation, it creates isolationtester binary file which is run by -`pg_isolation_regress` to perform concurrent regression tests and observe -results. - -`pg_isolation_regress` is a tool similar to `pg_regress`, but instead of using -psql to execute a test, it uses isolationtester. It accepts all the same -command-line arguments as `pg_regress`. It has been modified to accept multiple -hosts as parameters. It then passes these host conninfo's along with server names -to isolationtester binary. Isolation tester compares these server names with the -names specified in each session in the spec files and runs given tests on -respective servers. - -To define tests with overlapping transactions, we use test specification -files with a custom syntax, which is described in the next section. To add -a new test, place a spec file in the specs/ subdirectory, add the expected -output in the expected/ subdirectory, and add the test's name to the Makefile. - -Isolationtester is a program that uses libpq to open multiple connections, -and executes a test specified by a spec file. A libpq connection string -specifies the server and database to connect to; defaults derived from -environment variables are used otherwise. 
- -Specification consists of five parts, tested in this order: - -`server ""` - - This defines the name of the servers that the sessions will run on. - There can be zero or more server "" specifications. - The conninfo corresponding to the names is provided via the command to - run isolationtester. This is described in `quickstart_isolationtest.md`. - This part is optional. - -`setup { }` - - The given SQL block is executed once, in one session only, before running - the test. Create any test tables or other required objects here. This - part is optional. Multiple setup blocks are allowed if needed; each is - run separately, in the given order. (The reason for allowing multiple - setup blocks is that each block is run as a single PQexec submission, - and some statements such as VACUUM cannot be combined with others in such - a block.) - -`teardown { }` - - The teardown SQL block is executed once after the test is finished. Use - this to clean up in preparation for the next permutation, e.g dropping - any test tables created by setup. This part is optional. - -`session ""` - - There are normally several "session" parts in a spec file. Each - session is executed in its own connection. A session part consists - of three parts: setup, teardown and one or more "steps". The per-session - setup and teardown parts have the same syntax as the per-test setup and - teardown described above, but they are executed in each session. The - setup part typically contains a "BEGIN" command to begin a transaction. - - Additionally, a session part also consists of `connect_to` specification. - This points to server name specified in the beginning which - indicates the server on which this session runs. - - `connect_to ""` - - Each step has the syntax - - `step "" { }` - - where `` is a name identifying this step, and SQL is a SQL statement - (or statements, separated by semicolons) that is executed in the step. - Step names must be unique across the whole spec file. 
- -`permutation ""` - - A permutation line specifies a list of steps that are run in that order. - Any number of permutation lines can appear. If no permutation lines are - given, the test program automatically generates all possible orderings - of the steps from each session (running the steps of any one session in - order). Note that the list of steps in a manually specified - "permutation" line doesn't actually have to be a permutation of the - available steps; it could for instance repeat some steps more than once, - or leave others out. - -Lines beginning with a # are considered comments. - -For each permutation of the session steps (whether these are manually -specified in the spec file, or automatically generated), the isolation -tester runs the main setup part, then per-session setup parts, then -the selected session steps, then per-session teardown, then the main -teardown script. Each selected step is sent to the connection associated -with its session. - -To run isolation tests in a BDR environment that ran all prerequisite make -commands, follow the below steps, - -1. Run `make isolationcheck-install` to install the isolationtester submodule - -2. You can run isolation regression tests using either - of the following commands from the bdr-private repo - - `make isolationcheck-installcheck` - `make isolationcheck-makecheck` - -A. To run isolationcheck-installcheck, you need to have two or more postgresql -servers running. Pass the conninfo's of servers to `pg_isolation_regress` -in BDR Makefile. - Ex: `pg_isolation_regress --server 'd1=host=myhost dbname=mydb port=5434' - --server 'd2=host=myhost1 dbname=mydb port=5432'` - -Now, add a .spec file containing tests in specs/isolation directory -of bdr-private/ repo. Add .out file in expected/isolation directory of -bdr-private/ repo. - -Then run - `make isolationcheck-installcheck` - -B. 
Isolationcheck-makecheck currently supports running isolation tests on a -single instance by setting up BDR between multiple databases. - -You need to pass appropriate database names, conninfos of bdr instances -to `pg_isolation_regress` in BDR Makefile as follows: - `pg_isolation_regress --dbname=db1,db2 --server 'd1=dbname=db1' - --server 'd2=dbname=db2'` - -Then run - `make isolationcheck-makecheck` - -Each step may contain commands that block until further action has been taken -(most likely, some other session runs a step that unblocks it or causes a -deadlock). A test that uses this ability must manually specify valid -permutations, i.e. those that would not expect a blocked session to execute a -command. If a test fails to follow that rule, isolationtester will cancel it -after 300 seconds. If the cancel doesn't work, isolationtester will exit -uncleanly after a total of 375 seconds of wait time. Testing invalid -permutations should be avoided because they can make the isolation tests take -a very long time to run, and they serve no useful testing purpose. - -Note that isolationtester recognizes that a command has blocked by looking -to see if it is shown as waiting in the `pg_locks` view; therefore, only -blocks on heavyweight locks will be detected. - -## Performance Testing & Tuning - -BDR allows you to issue write transactions onto multiple master nodes. -Bringing those writes back together onto each node has a cost in -performance that you should be aware of. - -First, replaying changes from another node has a CPU cost, an I/O cost -and it will generate WAL records. The resource usage is usually less -than in the original transaction since CPU overheads are lower as a result -of not needing to re-execute SQL. In the case of UPDATE and DELETE -transactions there may be I/O costs on replay if data isn't cached. - -Second, replaying changes holds table-level and row-level locks that can -produce contention against local workloads. 
The CRDT (Conflict-free Replicated Data Types) and CLCD (Column-Level Conflict Detection) features -ensure you get the correct answers even for concurrent updates, but they -don't remove the normal locking overheads. If you get locking contention, -try to avoid conflicting updates and/or keep transactions as short as -possible. A heavily updated row within a larger transaction will cause -a bottleneck on performance for that transaction. Complex applications -require some thought to maintain scalability. - -If you think you're having performance problems, you're encouraged -to develop performance tests using the benchmarking tools above. pgbench -allows you to write custom test scripts specific to your use case -so you can understand the overheads of your SQL and measure the impact -of concurrent execution. - -So if "BDR is running slow", then we suggest the following: - -1. Write a custom test script for pgbench, as close as you can make it - to the production system's problem case. -2. Run the script on one node to give you a baseline figure. -3. Run the script on as many nodes as occurs in production, using the - same number of sessions in total as you did on one node. This will - show you the effect of moving to multiple nodes. -4. Increase the number of sessions for the above 2 tests, so you can - plot the effect of increased contention on your application. -5. Make sure your tests are long enough to account for replication delays. -6. Ensure that replication delay isn't growing during your tests. - -Use all of the normal Postgres tuning features to improve the speed -of critical parts of your application. - -## Assessing Suitability - -BDR is compatible with PostgreSQL, but not all PostgreSQL applications are -suitable for use on distributed databases. Most applications are already, or -can be easily modified to become BDR compliant. Users can undertake an -assessment activity in which they can point their application to a BDR-enabled -setup. 
BDR provides a few knobs which can be set during the assessment period. -These will aid in the process of deciding suitability of their application in -a BDR-enabled environment. - -### Assessing updates of Primary Key/Replica Identity - -BDR cannot currently perform conflict resolution where the PRIMARY KEY is changed -by an UPDATE operation. It is permissible to update the primary key, but you must -ensure that no conflict with existing values is possible. - -BDR provides the following configuration parameter to assess how frequently -the primary key/replica identity of any table is being subjected to update -operations. - -Note that these configuration parameters must only be used for assessment only. -They can be used on a single node BDR instance, but must not be used on a production -BDR cluster with two or more nodes replicating to each other. In fact, a node -may fail to start or a new node will fail to join the cluster if any of the -assessment parameters are set to anything other than `IGNORE`. - -```sql -bdr.assess_update_replica_identity = IGNORE (default) | LOG | WARNING | ERROR -``` - -By enabling this parameter during the assessment period, one can log updates to -the key/replica identity values of a row. One can also potentially block such -updates, if desired. E.g. - -```sql -CREATE TABLE public.test(g int primary key, h int); -INSERT INTO test VALUES (1, 1); - -SET bdr.assess_update_replica_identity TO 'error'; -UPDATE test SET g = 4 WHERE g = 1; -ERROR: bdr_assess: update of key/replica identity of table public.test -``` - -Apply worker processes will always ignore any settings for this parameter. - -### Assessing use of LOCK on tables or in SELECT queries - -Because BDR writer processes operate much like normal user sessions, they are subject to -the usual rules around row and table locking. This can sometimes lead to BDR writer -processes waiting on locks held by user transactions, or even by each other. 
- -BDR provides the following configuration parameter to assess if the application -is taking explicit locks. - -```sql -bdr.assess_lock_statement = IGNORE (default) | LOG | WARNING | ERROR -``` - -Two types of locks that can be tracked are: - -- explicit table-level locking (LOCK TABLE ...) by user sessions -- explicit row-level locking (SELECT ... FOR UPDATE/FOR SHARE) by user sessions - -By enabling this parameter during the assessment period, one can track (or block) such explicit -locking activity. E.g. - -```sql -CREATE TABLE public.test(g int primary key, h int); -INSERT INTO test VALUES (1, 1); - -SET bdr.assess_lock_statement TO 'error'; -SELECT * FROM test FOR UPDATE; -ERROR: bdr_assess: "SELECT FOR UPDATE" invoked on a BDR node - -SELECT * FROM test FOR SHARE; -ERROR: bdr_assess: "SELECT FOR SHARE" invoked on a BDR node - -SET bdr.assess_lock_statement TO 'warning'; -LOCK TABLE test IN ACCESS SHARE MODE; -WARNING: bdr_assess: "LOCK STATEMENT" invoked on a BDR node -``` diff --git a/product_docs/docs/bdr/4.0/camo.mdx b/product_docs/docs/bdr/4.0/camo.mdx deleted file mode 100644 index aa227481377..00000000000 --- a/product_docs/docs/bdr/4.0/camo.mdx +++ /dev/null @@ -1,8 +0,0 @@ ---- -navTitle: Commit at Most Once -title: Commit At Most Once -originalFilePath: camo.md - ---- - - diff --git a/product_docs/docs/bdr/4.0/camo_clients.mdx b/product_docs/docs/bdr/4.0/camo_clients.mdx deleted file mode 100644 index 11fc9d3327f..00000000000 --- a/product_docs/docs/bdr/4.0/camo_clients.mdx +++ /dev/null @@ -1,8 +0,0 @@ ---- -navTitle: 'Appendix D: CAMO Reference Clients' -title: 'Appendix D: CAMO Reference Client Implementations' -originalFilePath: camo_clients.md - ---- - - diff --git a/product_docs/docs/bdr/4.0/configuration.mdx b/product_docs/docs/bdr/4.0/configuration.mdx deleted file mode 100644 index 780698c121a..00000000000 --- a/product_docs/docs/bdr/4.0/configuration.mdx +++ /dev/null @@ -1,498 +0,0 @@ ---- -navTitle: Configuration -title: PostgreSQL 
Configuration for BDR -originalFilePath: configuration.md - ---- - -There are several PostgreSQL configuration parameters that affect BDR -nodes. Note that these parameters could be set differently on each node, -though that is not recommended, in general. - -## PostgreSQL Settings for BDR - -BDR requires these PostgreSQL settings to run correctly: - -- `wal_level` - Must be set to `logical`, since BDR relies upon logical decoding. -- `shared_preload_libraries` - This must contain `bdr`, though may also contain - other entries before or afterwards, as needed, however 'pglogical' must not - be included -- `track_commit_timestamp` - Must be set to 'on' for conflict resolution to - retrieve the timestamp for each conflicting row. - -BDR requires these PostgreSQL settings to be set to appropriate values, -which vary according to the size and scale of the cluster. - -- `logical_decoding_work_mem` - memory buffer size used by logical decoding. - Transactions larger than this will overflow the buffer and be stored - temporarily on local disk. Default 64MB, but can be set much higher. -- `max_worker_processes` - BDR uses background workers for replication - and maintenance tasks, so there need to be enough worker slots for it to - work correctly. The formula for the correct minimal number of workers is: - one per PostgreSQL instance + one per database on that instance + four - per BDR-enabled database + one per peer node in the BDR group + one for each - writer enabled per peer node in the BDR group, for each database. - Additional worker processes may be needed temporarily when node is being - removed from a BDR group. -- `max_wal_senders` - Two needed per every peer node. -- `max_replication_slots` - Same as `max_wal_senders`. -- `wal_sender_timeout` and `wal_receiver_timeout` - Determine how - quickly a node considers its CAMO partner as disconnected or - reconnected; see [CAMO Failure Scenarios](camo#failure-scenarios) for - details. 
- -Note that in normal running for a group with N peer nodes, BDR will require -N slots and WAL senders. During synchronization, BDR will temporarily use another -N - 1 slots and WAL senders, so be careful to set the above parameters high enough -to cater for this occasional peak demand. - -With parallel apply turned on, the number of slots needs to be increased to -N slots from above formula \* writers. This is because the `max_replication_slots` -also sets maximum number of replication origins and some of the functionality -of parallel apply uses extra origin per writer. - -When the [Decoding Worker](nodes#decoding-worker) is enabled, this -process will require one extra replication slot per BDR group. - -The general safe recommended value on a 4 node BDR Group with a single database -is just to set `max_replication_slots` and `max_worker_processes` to something -like `50` and `max_wal_senders` to at least `10`. - -Note also that changing these parameters requires restarting the local node: -`max_worker_processes`, `max_wal_senders`, `max_replication_slots`. - -Applications may also wish to set these parameters. Please see chapter on -[Durability & Performance Options] for further discussion. - -- `synchronous_commit` - affects the durability and performance of BDR replication - in a similar way to [physical replication](https://www.postgresql.org/docs/11/runtime-config-wal.html#GUC-SYNCHRONOUS-COMMIT). -- `synchronous_standby_names` - same as above - -## BDR Specific Settings - -There are also BDR specific configuration settings that can be set. -Unless noted otherwise, values may be set by any user at any time. 
- -### Conflict Handling - -- `bdr.default_conflict_detection` - Sets the default conflict detection method - for newly created tables; accepts same values as - [bdr.alter_table_conflict_detection()](conflicts#bdralter_table_conflict_detection) - -### Global Sequence Parameters - -- `bdr.default_sequence_kind` - Sets the default [sequence kind](sequences). - -### DDL Handling - -- `bdr.default_replica_identity` - Sets the default value for `REPLICA IDENTITY` - on newly created tables. The `REPLICA IDENTITY` defines which information is - written to the write-ahead log to identify rows which are updated or deleted. - - The accepted values are: - - - `DEFAULT` - records the old values of the columns of the primary key, - if any (this is the default PostgreSQL behavior). - - `FULL` - records the old values of all columns in the row. - - `NOTHING` - records no information about the old row. - - See [PostgreSQL documentation](https://www.postgresql.org/docs/current/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY) - for more details. - - BDR can not replicate `UPDATE`s and `DELETE`s on tables without a `PRIMARY KEY` - or `UNIQUE` constraint, unless the replica identity for the table is `FULL`, - either by table-specific configuration or via `bdr.default_replica_identity`. - - If bdr.default_replica_identity is DEFAULT and there is a `UNIQUE` - constraint on the table, it will not be automatically picked up as `REPLICA - IDENTITY`. It needs to be set explicitly at the time of creating the table, - or afterwards as described in the documentation above. - - Setting the replica identity of table(s) to `FULL` increases the volume of - WAL written and the amount of data replicated on the wire for the table. - -- `bdr.ddl_replication` - Automatically replicate DDL across nodes (default - "on"). - - This parameter can be only set by bdr_superuser or superuser roles. 
- - Running DDL or calling BDR administration functions with - `bdr.ddl_replication = off` can create situations where replication stops - until an administrator can intervene. See [the DDL replication chapter](ddl) - for details. - - A `LOG`-level log message is emitted to the PostgreSQL server logs whenever - `bdr.ddl_replication` is set to `off`. Additionally, a `WARNING-level` - message is written whenever replication of captured DDL commands or BDR - replication functions is skipped due to this setting. - -- `bdr.role_replication` - Automatically replicate ROLE commands across nodes - (default "on"). This parameter is settable by a superuser only. This setting - only works if `bdr.ddl_replication` is turned on as well. - - Turning this off without using external methods to ensure roles are in sync - across all nodes may cause replicated DDL to interrupt replication until - the administrator intervenes. - - See [Role manipulation statements in the DDL replication chapter](ddl#Role_manipulation_statements) - for details. - -- `bdr.ddl_locking` - Configures the operation mode of global locking for DDL. - - This parameter can be only set by bdr_superuser or superuser roles. - - Possible options are: - - - off - do not use global locking for DDL operations - - on - use global locking for all DDL operations - - dml - only use global locking for DDL operations that need to prevent - writes by taking the global DML lock for a relation - - A `LOG`-level log message is emitted to the PostgreSQL server logs - whenever `bdr.ddl_replication` is set to `off`. Additionally, a `WARNING` - message is written whenever any global locking steps are skipped due to - this setting. It is normal for some statements to result in two `WARNING`s, - one for skipping the DML lock and one for skipping the DDL lock. - -- `bdr.truncate_locking` - False by default, this configuration option sets the - TRUNCATE command's locking behavior. 
Determines whether (when true) TRUNCATE - obeys the bdr.ddl_locking setting. - -### Global Locking - -- `bdr.ddl_locking` - Described above. -- `bdr.global_lock_max_locks` - Maximum number of global locks that can be - held on a node (default 1000). May only be set at Postgres server start. -- `bdr.global_lock_timeout` - Sets the maximum allowed duration of any wait - for a global lock (default 10 minutes). A value of zero disables this timeout. -- `bdr.global_lock_statement_timeout` - Sets the maximum allowed duration of - any statement holding a global lock (default 60 minutes). - A value of zero disables this timeout. -- `bdr.global_lock_idle_timeout` - Sets the maximum allowed duration of - idle time in transaction holding a global lock (default 10 minutes). - A value of zero disables this timeout. - -### Node Management - -- `bdr.replay_progress_frequency` - Interval for sending replication position - info to the rest of the cluster (default 1 minute). - -- `bdr.standby_slot_names` - Require these slots to receive and confirm - replication changes before any other ones. This is useful primarily when - using physical standbys for failover or when using subscribe-only nodes. - -### Generic Replication - -- `bdr.writers_per_subscription` - Default number of writers per - subscription (in BDR this can also be changed by - `bdr.alter_node_group_config` for a group). - -- `bdr.max_writers_per_subscription` - Maximum number of writers - per subscription (sets upper limit for the setting above). - -- `bdr.xact_replication` - Replicate current transaction (default "on"). - - Turning this off will make the whole transaction local only, which - means the transaction will not be visible to logical decoding by - BDR and all other downstream targets of logical decoding. Data will - not be transferred to any other node, including logical standby nodes. - - This parameter can be only set by the bdr_superuser or superuser roles. 
- - This parameter can only be set inside the current transaction using the - `SET LOCAL` command unless `bdr.permit_unsafe_commands = on`. - -!!! Note - Even with transaction replication disabled, WAL will be generated - but those changes will be filtered away on the origin. - -!!! Warning - Turning off `bdr.xact_replication` *will* lead to data - inconsistency between nodes, and should only be used to recover from - data divergence between nodes or in - replication situations where changes on single nodes are required for - replication to continue. Use at your own risk. - -- `bdr.permit_unsafe_commands` - Option to override safety check on commands - that are deemed unsafe for general use. - - Requires `bdr_superuser` or PostgreSQL superuser. - -!!! Warning - The commands that are normally not considered safe may either - produce inconsistent results or break replication altogether. Use at your - own risk. - -- `bdr.batch_inserts` - How many consecutive inserts to one table within - a single transaction turns on batch processing of inserts for that table. - - This option allows replication of large data loads as COPY internally, - rather than set of inserts. It also how the initial data during node join - is copied. - -- `bdr.maximum_clock_skew` - - This specifies what should be considered as the maximum difference between - the incoming transaction commit timestamp and the current time on the - subscriber before triggering `bdr.maximum_clock_skew_action`. - - This checks if the timestamp of the currently replayed transaction is in the - future compared to the current time on the subscriber; and if it is, and the - difference is larger than `bdr.maximum_clock_skew`, it will do the action - specified by the `bdr.maximum_clock_skew_action` setting. - - The default is -1, which means: ignore clock skew (the check is turned - off). 
It is valid to set 0: when the clocks on all servers are synchronized,
the fact that we are replaying the transaction means it has been committed in
the past.
- -Without this safeguard, two anomalies are possible where a commit can be -received by a subscriber then vanish from the provider on failover because -the failover candidate hadn't received it yet: - -- For 1+ subscribers, the subscriber may have applied the change but the new - provider may execute new transactions that conflict with the received change, - as it never happened as far as the provider is concerned; - -and/or - -- For 2+ subscribers, at the time of failover, not all subscribers have applied - the change.The subscribers now have inconsistent and irreconcilable states - because the subscribers that didn't receive the commit have no way to get it - now. - -Setting `bdr.standby_slot_names` will (by design) cause subscribers to -lag behind the provider if the provider's failover-candidate replica(s) are not -keeping up. Monitoring is thus essential. - -Another use-case where the `bdr.standby_slot_names` is useful is when using -subscriber-only, to ensure that the subscriber-only node doesn't move ahead -of any of the other BDR nodes. - -### `bdr.standby_slots_min_confirmed` - -Controls how many of the `bdr.standby_slot_names` have to confirm before -we send data to BDR subscribers. - -### `bdr.writer_input_queue_size` - -This option is used to specify the size of the shared memory queue used -by the receiver to send data to the writer process. If the writer process is -stalled or making slow progress, then the queue might get filled up, stalling -the receiver process too. So it's important to provide enough shared memory for -this queue. The default is 1MB and the maximum allowed size is 1GB. While any -storage size specifier can be used to set the GUC, the default is kB. - -### `bdr.writer_output_queue_size` - -This option is used to specify the size of the shared memory queue used -by the receiver to receive data from the writer process. 
Since the writer is -not expected to send a large amount of data, a relatively smaller sized queue -should be enough. The default is 32kB and the maximum allowed size is 1MB. -While any storage size specifier can be used to set the GUC, the default is -kB. - -### `bdr.min_worker_backoff_delay` - -Rate limit BDR background worker launches by preventing a given worker -from being relaunched more often than every -`bdr.min_worker_backoff_delay` milliseconds. Time-unit suffixes are supported. - -!!! Note - This setting currently only affects receiver worker, which means it - primarily affects how fast a subscription will try to reconnect on error - or connection failure. - -The default is 1s. The delay is a time limit applied from launch-to-launch, -so the default value of `1s` limits of workers to at most once (re)launches -per second. - -If the backoff delay setting is changed and the PostgreSQL configuration is -reloaded then all current backoff waits will be reset. Additionally, the -`bdr.worker_task_reset_backoff_all()` function is provided to allow the -administrator to force all backoff intervals to immediately expire. - -A tracking table in shared memory is maintained to remember the last launch -time of each type of worker. This tracking table is not persistent; it is -cleared by PostgreSQL restarts, including soft-restarts during crash recovery -after an unclean backend exit. - -The view [`bdr.worker_tasks`](monitoring#bdr.worker_tasks) -may be used to inspect this state so the administrator can see any backoff -rate-limiting currently in effect. - -For rate limiting purposes, workers are classified by "task". This key consists -of the worker role, database oid, subscription id, subscription writer id, -extension library name and function name, extension-supplied worker name, and -the remote relation id for sync writers. `NULL` is used where a given -classifier does not apply, e.g. 
manager workers don't have a subscription ID -and receivers don't have a writer id. - -### CRDTs - -- `bdr.crdt_raw_value` - Sets the output format of [CRDT Data Types](crdt). - The default output (when this setting is `off`) is to return only the current - value of the base CRDT type (for example, a bigint for `crdt_pncounter`). - When set to `on`, the returned value represents the full representation of - the CRDT value, which can for example include the state from multiple nodes. - -### Max Prepared Transactions - -- `max_prepared_transactions` - Needs to be set high enough to cope - with the maximum number of concurrent prepared transactions across - the cluster due to explicit two-phase commits, CAMO or Eager - transactions. Exceeding the limit prevents a node from running a - local two-phase commit or CAMO transaction, and will prevent all - Eager transactions on the cluster. - May only be set at Postgres server start. - -### Eager Replication - -- `bdr.commit_scope` - Setting the commit scope to `global` enables - [eager all node replication](eager) (default `local`). - -- `bdr.global_commit_timeout` - Timeout for both stages of a global - two-phase commit (default 60s) as well as for CAMO-protected transactions - in their commit phase, as a limit for how long to wait for the CAMO - partner. - -### Commit at Most Once - -- `bdr.enable_camo` - Used to enable and control the CAMO feature. - Defaults to `off`. CAMO can be switched on per transaction by - setting this to `remote_write`, `remote_commit_async`, or - `remote_commit_flush`. For backwards-compatibility, the values - `on`, `true`, and `1` set the safest `remote_commit_flush` mode. - While `false` or `0` also disable CAMO. -- `bdr.standby_dsn` - Allows manual override of the connection - string (DSN) to reach the CAMO partner, in case it has changed since - the crash of the local node. Should usually be unset. - May only be set at Postgres server start. 
-- `bdr.camo_local_mode_delay` - The commit delay that applies in - CAMO's Local mode to emulate the overhead that normally occurs with - the CAMO partner having to confirm transactions. Defaults to 5 ms. - Setting to 0 disables this feature. -- `bdr.camo_enable_client_warnings` - Emit warnings if an activity is - carried out in the database for which CAMO properties cannot be - guaranteed. This is enabled by default. Well-informed users can choose - to disable this to reduce the amount of warnings going into their logs. -- `synchronous_replication_availability` - Can optionally be `async` - for increased availability by allowing a node to continue and - commit after its CAMO partner got disconnected. Under the default - value of `wait`, the node will wait indefinitely, and proceed to - commit only after the CAMO partner reconnects and sends - confirmation. - -### Transaction streaming - -- `bdr.default_streaming_mode` - used to control transaction streaming by - the subscriber node. Permissible values are: `off`, `writer`, `file`, `auto`. - Defaults to `auto`. If set to `off`, the subscriber will not request - transaction streaming. If set to one of the other permissible values, the - subscriber will request transaction streaming and the publisher will provide - this if it supports them and configured at group level. For - more details, see [Transaction Streaming](transaction-streaming). - -### Timestamp-based Snapshots - -- `snapshot_timestamp` - Turns on the usage of - [timestamp-based snapshots](tssnapshots) and sets the timestamp to use. -- `bdr.timestamp_snapshot_keep` - For how long to keep valid snapshots for the - timestamp-based snapshot usage (default 0, meaning do not keep past snapshots). - -### Monitoring and Logging - -- `bdr.debug_level` - Defines the log level that BDR uses to write - its debug messages. The default value is `debug2`. If you want to see - detailed BDR debug output, set `bdr.debug_level = 'log'`. 
- -- `bdr.trace_level` - Similar to the above, this defines the log level - to use for BDR trace messages. Enabling tracing on all nodes of a - BDR cluster may help EDB Support to diagnose issues. - May only be set at Postgres server start. - -!!! Warning - Setting `bdr.debug_level` or `bdr.trace_level` to a value >= - `log_min_messages` can produce a very large volume of log output, so it should not - be enabled long term in production unless plans are in place for log filtering, - archival and rotation to prevent disk space exhaustion. - -- `bdr.track_subscription_apply` - Track apply statistics for - each subscription. -- `bdr.track_relation_apply` - Track apply statistics for each - relation. -- `bdr.track_apply_lock_timing` - Track lock timing when tracking - statistics for relations. - -### Internals - -- `bdr.raft_keep_min_entries` - The minimum number of entries to keep in the - Raft log when doing log compaction (default 100). The value of 0 will disable - log compaction. **WARNING: If log compaction is disabled, the log will - grow in size forever.** May only be set at Postgres server start. -- `bdr.raft_response_timeout` - To account for network failures, the - Raft consensus protocol implemented will time out requests after a - certain amount of time. This timeout defaults to 30 seconds. -- `bdr.raft_log_min_apply_duration` - To move the state machine - forward, Raft appends entries to its internal log. During normal - operation, appending takes only a few milliseconds. This poses an - upper threshold on the duration of that append action, above which - an `INFO` message is logged. This may indicate an - actual problem. Default value of this parameter is 3000 ms. -- `bdr.raft_log_min_message_duration` - When to log a consensus request. - Measure round trip time of a bdr consensus request and log an - `INFO` message if the time exceeds this parameter. Default value - of this parameter is 5000 ms. 
-- `bdr.raft_group_max_connections` - The maximum number of connections - across all BDR groups for a Postgres server. These connections carry - bdr consensus requests between the groups' nodes. Default value of this - parameter is 100 connections. May only be set at Postgres server start. -- `bdr.backwards_compatibility` - Specifies the version to be - backwards-compatible to, in the same numerical format as used by - `bdr.bdr_version_num`, e.g. `30618`. Enables exact behavior of a - former BDR version, even if this has generally unwanted effects. - Defaults to the current BDR version. Since this changes from release - to release, we advise against explicit use within the configuration - file unless the value is different to the current version. -- `bdr.track_replication_estimates` - Track replication estimates in terms - of apply rates and catchup intervals for peer nodes. This information can - be used by protocols like CAMO to estimate the readiness of a - peer node. This parameter is enabled by default. -- `bdr.lag_tracker_apply_rate_weight` - We monitor how far behind peer nodes - are in terms of applying WAL from the local node, and calculate a moving - average of the apply rates for the lag tracking. This parameter specifies - how much contribution newer calculated values have in this moving average - calculation. Default value is 0.1. diff --git a/product_docs/docs/bdr/4.0/durability.mdx b/product_docs/docs/bdr/4.0/durability.mdx deleted file mode 100644 index eef394d9e8a..00000000000 --- a/product_docs/docs/bdr/4.0/durability.mdx +++ /dev/null @@ -1,193 +0,0 @@ ---- -title: Durability & Performance Options -originalFilePath: durability.md - ---- - -## Overview - -Synchronous or *Eager Replication* synchronizes between at least two -nodes of the cluster before committing a transaction. 
This provides -three properties of interest to applications, which are related, but -can all be implemented individually: - -- *Durability*: writing to multiple nodes increases crash resilience - and allows the data to be recovered after a crash and restart. -- *Visibility*: with the commit confirmation to the client, the database - guarantees immediate visibility of the committed transaction on some - sets of nodes. -- *No Conflicts After Commit*: the client can rely on the transaction to - eventually be applied on all nodes without further conflicts, or get - an abort directly informing the client of an error. - -BDR integrates with the `synchronous_commit` option of -Postgres itself, providing a variant of synchronous replication, -which can be used between BDR nodes. BDR also offers two additional -replication modes: - -- Commit At Most Once (CAMO). This feature solves the problem with knowing - whether your transaction has COMMITed (and replicated) or not in case of - certain errors during COMMIT. Normally, it might be hard to know whether - or not the COMMIT was processed in. With this feature, your application can - find out what happened, even if your new database connection is to node - than your previous connection. For more info about this feature see the - [Commit At Most Once](camo) chapter. -- Eager Replication. This is an optional feature to avoid replication - conflicts. Every transaction is applied on *all nodes* simultaneously, - and commits only if no replication conflicts are detected. This feature does - reduce performance, but provides very strong consistency guarantees. - For more info about this feature see the [Eager All-Node Replication](eager) - chapter. 
- -Postgres itself provides [Physical Streaming Replication](https://www.postgresql.org/docs/current/warm-standby.html#STREAMING-REPLICATION) -(PSR), which is uni-directional, but offers a [synchronous variant](https://www.postgresql.org/docs/current/warm-standby.html#SYNCHRONOUS-REPLICATION) -that can used in combination with BDR. - -This chapter covers the various forms of synchronous or eager -replication and its timing aspects. - -## Comparison - -Most options for synchronous replication available to -BDR allow for different levels of synchronization, offering different -trade-offs between performance and protection against node or network -outages. - -The following table summarizes what a client can expect from a peer -node replicated to after having received a COMMIT confirmation from -the origin node the transaction was issued to. - -| Variant | Mode | Received | Visible | Durable | -| ------- | ----------------------- | -------- | ------- | ------- | -| PGL/BDR | off (default) | no | no | no | -| PGL/BDR | remote_write (2) | yes | no | no | -| PGL/BDR | on (2) | yes | yes | yes | -| PGL/BDR | remote_apply (2) | yes | yes | yes | -| PSR | remote_write (2) | yes | no | no (1) | -| PSR | on (2) | yes | no | yes | -| PSR | remote_apply (2) | yes | yes | yes | -| CAMO | remote_write (2) | yes | no | no | -| CAMO | remote_commit_async (2) | yes | yes | no | -| CAMO | remote_commit_flush (2) | yes | yes | yes | -| Eager | n/a | yes | yes | yes | - -*(1) written to the OS, durable if the OS remains running and only -Postgres crashes.* - -*(2) unless switched to Local mode (if allowed) by setting -`synchronous_replication_availability` to `async'`, otherwise the -values for the asynchronous BDR default apply.* - -Reception ensures the peer will be able to eventually apply all -changes of the transaction without requiring any further -communication, i.e. even in the face of a full or partial network -outage. All modes considered synchronous provide this protection. 
- -Visibility implies the transaction was applied remotely, and any possible -conflicts with concurrent transactions have been resolved. Without -durability, i.e. prior to persisting the transaction, a crash of the -peer node may revert this state (and require re-transmission and -re-application of the changes). - -Durability relates to the peer node's storage and provides protection -against loss of data after a crash and recovery of the peer node. If -the transaction has already been visible before the crash, it will be -recovered to be visible, again. Otherwise, the transaction's payload -is persisted and the peer node will be able to apply the transaction -eventually (without requiring any re-transmission of data). - -## Internal Timing of Operations - -For a better understanding of how the different modes work, it is -helpful to realize PSR and BDR apply transactions rather -differently. - -With physical streaming replication, the order of operations is: - -- origin flushes a commit record to WAL, making the transaction - visible locally -- peer node receives changes and issues a write -- peer flushes the received changes to disk -- peer applies changes, making the transaction visible locally - -With BDR, the order of operations is different: - -- origin flushes a commit record to WAL, making the transaction - visible locally -- peer node receives changes into its apply queue in memory -- peer applies changes, making the transaction visible locally -- peer persists the transaction by flushing to disk - -For CAMO and Eager All Node Replication, note that the origin node -waits for a confirmation prior to making the transaction visible -locally. 
The order of operations is: - -- origin flushes a prepare or pre-commit record to WAL -- peer node receives changes into its apply queue in memory -- peer applies changes, making the transaction visible locally -- peer persists the transaction by flushing to disk -- origin commits and makes the transaction visible locally - -The following table summarizes the differences. - -| Variant | Order of apply vs persist on peer nodes | Replication before or after origin WAL commit record write | -| :------ | :-------------------------------------: | :--------------------------------------------------------- | -| PSR | persist first | after | -| BDR | apply first | after | -| CAMO | apply first | before (triggered by pre-commit) | -| Eager | apply first | before (triggered by prepare) | - -## Configuration - -The following table provides an overview of which configuration -settings are required to be set to a non-default value (req) or -optional (opt), but affecting a specific variant. - -| setting (GUC) | PSR | PGL | CAMO | Eager | -| ------------------------------------ | :-: | :-: | :--: | :---: | -| synchronous_standby_names | req | req | n/a | n/a | -| synchronous_commit | opt | opt | n/a | n/a | -| synchronous_replication_availability | opt | opt | opt | n/a | -| bdr.enable_camo | n/a | n/a | req | n/a | -| bdr.commit_scope | n/a | n/a | n/a | req | -| bdr.global_commit_timeout | n/a | n/a | opt | opt | - -## Planned Shutdown and Restarts - -When using PGL or CAMO in combination with `remote_write`, care must be taken -with planned shutdown or restart. By default, the apply queue is consumed -prior to shutting down. However, in the `immediate` shutdown mode, the queue -is discarded at shutdown, leading to the stopped node "forgetting" -transactions in the queue. A concurrent failure of another node could -lead to loss of data, as if both nodes failed. 
- -To ensure the apply queue gets flushed to disk, please use either -`smart` or `fast` shutdown for maintenance tasks. This maintains the -required synchronization level and prevents loss of data. - -## Synchronous Replication using BDR - -### Usage - -To enable synchronous replication using BDR, the application -names of the relevant BDR peer nodes need to be added to -`synchronous_standby_names`. The use of `FIRST x` or `ANY x` offers a -lot of flexibility, if this does not conflict with the requirements of -non-BDR standby nodes. - -Once added, the level of synchronization can be configured per -transaction via `synchronous_commit`, which defaults to `on` - meaning that -adding to `synchronous_standby_names` already enables synchronous -replication. Setting `synchronous_commit` to `local` or `off` turns -off synchronous replication. - -Due to BDR applying the transaction before persisting it, the -values `on` and `remote_apply` are equivalent (for logical -replication). - -### Limitations - -BDR uses the same configuration (and internal mechanisms) as -Physical Streaming Replication, therefore the needs for (physical, -non-BDR) standbys need to be considered when configuring synchronous -replication between BDR nodes. 
diff --git a/product_docs/docs/bdr/4.0/eager.mdx b/product_docs/docs/bdr/4.0/eager.mdx deleted file mode 100644 index dfdd69bd832..00000000000 --- a/product_docs/docs/bdr/4.0/eager.mdx +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Eager Replication -originalFilePath: eager.md - ---- - - diff --git a/product_docs/docs/bdr/4.0/isolation_details.mdx b/product_docs/docs/bdr/4.0/isolation_details.mdx deleted file mode 100644 index 794a8bd5704..00000000000 --- a/product_docs/docs/bdr/4.0/isolation_details.mdx +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: 'Appendix B: Conflict Details' -originalFilePath: isolation/details.md - ---- - - diff --git a/product_docs/docs/bdr/4.0/known-issues.mdx b/product_docs/docs/bdr/4.0/known-issues.mdx deleted file mode 100644 index d11e2f72a9b..00000000000 --- a/product_docs/docs/bdr/4.0/known-issues.mdx +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: 'Appendix C: Known Issues' -originalFilePath: known-issues.md - ---- - -This section discusses currently known issues in BDR4. - -## Data Consistency - -Please remember to read about [Conflicts](conflicts) to understand -the implications of the asynchronous operation mode in terms of data -consistency. - -## List of Issues - -In the remaining part of this section we list a number of known issues -that are tracked in BDR's ticketing system, each marked with an -unique identifier. - -- If the resolver for the `update_origin_change` conflict - is set to `skip`, and `synchronous_commit=remote_apply` is used, and - concurrent updates of the same row are repeatedly applied on two - different nodes, then one of the update statements might hang due - to a deadlock with the BDR writer. 
As mentioned in the - [Conflicts](conflicts) chapter, `skip` is not the default - resolver for the `update_origin_change` conflict, and this - combination is not intended to be used in production: it discards - one of the two conflicting updates based on the order of arrival - on that node, which is likely to cause a divergent cluster. - In the rare situation that you do choose to use the `skip` - conflict resolver, please note the issue with the use of the - `remote_apply` mode. - -- A `galloc` sequence might skip some chunks if the - sequence is created in a rolled back transaction and then created - again with the same name, or if it is created and dropped when DDL - replication is not active and then it is created again when DDL - replication is active. - The impact of the problem is mild, because the sequence - guarantees are not violated; the sequence will only skip some - initial chunks. Also, as a workaround the user can specify the - starting value for the sequence as an argument to the - `bdr.alter_sequence_set_kind()` function. - -- Upgrades from BDR 3.7 are only supported by adding new nodes, - and **not** through in-place upgrade of the same data directory. - This is planned to be resolved in a future 4.0 minor release. - -- The `bdr.monitor_local_replslots()` function may return CRITICAL result - saying "There is at least 1 BDR replication slot which is missing" even if - all slots exists in presence of logical standbys or subscribe-only node - groups. 
- -- Decoding Worker feature does not work with CAMO/EAGER - -- Decoding Worker works only with the default replication sets diff --git a/product_docs/docs/bdr/4.0/release-notes.mdx b/product_docs/docs/bdr/4.0/release-notes.mdx deleted file mode 100644 index 5f96fe9e6b1..00000000000 --- a/product_docs/docs/bdr/4.0/release-notes.mdx +++ /dev/null @@ -1,316 +0,0 @@ ---- -title: Release Notes -originalFilePath: release-notes.md - ---- - -## BDR 4.0.2 - -This is a maintenance release for BDR 4.0 which includes minor -improvements as well as fixes for issues identified in previous -versions. - -### Improvements - -- Add `bdr.max_worker_backoff_delay` (BDR-1767) - This changes the handling of the backoff delay to exponentially - increase from `bdr.min_worker_backoff_delay` to - `bdr.max_worker_backoff_delay` in presence of repeated errors. This - reduces log spam and in some cases also prevents unnecessary - connection attempts. - -- Add `execute_locally` option to `bdr.replicate_ddl_command()` (RT73533) - This allows optional queueing of ddl commands for replication to other groups - without executing it locally. - -- Change ERROR on consensus issue during JOIN to WARNING - The reporting of these transient errors was confusing as they were - also shown in bdr.worker_errors. These are now changed to WARNINGs. - -### Resolved Issues - -- WAL decoder confirms end LSN of the running transactions record (BDR-1264) - Confirm end LSN of the running transactions record processed by WAL - decoder so that the WAL decoder slot remains up to date and WAL senders - get the candidate in timely manner. - -- Don't wait for autopartition tasks to complete on parting nodes (BDR-1867) - When a node has started parting process, it makes no sense to wait for - autopartition tasks on such nodes to finish since it's not part of the group - anymore. 
- -- Improve handling of node name reuse during parallel join (RT74789) - Nodes now have a generation number so that it's easier to identify the name - reuse even if the node record is received as part of a snapshot. - -- Fix locking and snapshot use during node management in the BDR manager - process (RT74789) - When processing multiple actions in the state machine, make sure to reacquire - the lock on the processed node and update the snapshot to make sure all updates - happening through consensus are taken into account. - -- Improve cleanup of catalogs on local node drop - Drop all groups, not only the primary one and drop all the node state - history info as well. - -- Improve error checking for join request in bdr_init_physical - Previously bdr_init_physical would simply wait forever when there was any issue - with the consensus request, now we do same checking as the logical join does. - -- Improve handling of various timeouts and sleeps in consensus - This reduces the amount of new consensus votes needed when processing many consensus - requests or time consuming consensus requests, for example during join of a - new node. - -- Fix handling of `wal_receiver_timeout` (BDR-1848) - The `wal_receiver_timeout` has not been triggered correctly due to a regression - in BDR 3.7 and 4.0. - -- Limit the `bdr.standby_slot_names` check when reporting flush position only to - physical slots (RT77985, RT78290) - Otherwise flush progress is not reported in presence of disconnected nodes when - using `bdr.standby_slot_names`. - -- Fix replication of data types created during bootstrap (BDR-1784) - -- Fix replication of arrays of builtin types that don't have binary transfer support (BDR-1042) - -- Prevent CAMO configuration warnings if CAMO is not being used (BDR-1825) - -### Upgrades - -This release supports upgrading from the following versions of BDR: - -- 4.0.0 and higher - -The upgrade path from BDR 3.7 is not currently stable and needs to be -considered beta. 
Tests should be performed with at least BDR 3.7.15. - -Please make sure you read and understand the process and limitations described -in the [Upgrade Guide](upgrades) before upgrading. - -## BDR 4.0.1 - -This is a maintenance release for BDR 4.0 which includes minor -improvements as well as fixes for issues identified in previous -versions. - -### Improvements - -- Reduce frequency of CAMO partner connection attempts - In case of a failure to connect to a CAMO partner to verify its - configuration and check the status of transactions, do not retry - immediately (leading to a fully busy pglogical manager process), but - throttle down repeated attempts to reconnect and checks to once per - minute. - -- Implement buffered read for LCR segment file (BDR-1422) - Implement LCR segment file buffering so that multiple LCR chunks - can be read at a time. This should reduce - I/O and improve CPU usage of Wal Senders when using the Decoding Worker. - -- Avoid unnecessary LCR segment reads (BDR-1426) - BDR now attempts to only read new LCR segments when there is at - least one available. This reduces I/O load when Decoding Worker is - enabled. - -- Performance of COPY replication including the initial COPY during join has - been greatly improved for partitioned tables (BDR-1479) - For large tables this can improve the load times by order of magnitude or - more. - -### Resolved Issues - -- Fix the parallel apply worker selection (BDR-1761) - This makes parallel apply work again. In 4.0.0 parallel apply was never - in effect due to this bug. - -- Fix Raft snapshot handling of `bdr.camo_pairs` (BDR-1753) - The previous release would not correctly propagate changes to the CAMO pair - configuration when they were received via Raft snapshot. 
- -- Correctly handle Raft snapshots from BDR 3.7 after upgrades (BDR-1754) - Upgrading a CAMO configured cluster taking into account the - `bdr.camo_pairs` in the snapshot while still excluding the ability - to perform in place upgrade of a cluster (due to upgrade - limitations unrelated to CAMO). - -- Switch from CAMO to Local Mode only after timeouts (RT74892) - Do not use the `catchup_interval` estimate when switching from CAMO - protected to Local Mode, as that could induce inadvertent switching - due to load spikes. Use the estimate only when switching from Local - Mode back to CAMO protected (to prevent toggling forth and back due - to lag on the CAMO partner). - -- Fix replication set cache invalidation when published replication set list - have changed (BDR-1715) - In previous versions we could use stale information about which replication - sets (and as a result which tables) should be published until the subscription - has reconnected. - -- Prevent duplicate values generated locally by galloc sequence in high - concurrency situations when the new chunk is used (RT76528) - The galloc sequence could have temporarily produce duplicate value when - switching which chunk is used locally (but not across nodes) if there were - multiple sessions waiting for the new value. This is now fixed. - -- Address memory leak on streaming transactions (BDR-1479) - For large transaction this reduces memory usage and I/O considerably when using - the streaming transactions feature. This primarily improves performance of - COPY replication. - -- Don't leave slot behind after PART_CATCHUP phase of node parting when the - catchup source has changed while the node was parting (BDR-1716) - When node is being removed (parted) from BDR group, we do so called catchup - in order to forward any missing changes from that node between remaining nodes - in order to keep the data on all nodes consistent. This requires an additional - replication slot to be created temporarily. 
Normally this replication slot - is removed at the end of the catchup phase, however in certain scenarios - where we have to change the source node for the changes, this slot could - have previously been left behind. From this version, this slot is always - correctly removed. - -- Ensure that the group slot is moved forward when there is only one node in - the BDR group - This prevents disk exhaustion due to WAL accumulation when the group is left - running with just single BDR node for a prolonged period of time. This is not - recommended setup but the WAL accumulation was not intentional. - -- Advance Raft protocol version when there is only one node in the BDR group - Single node clusters would otherwise always stay on oldest support protocol - until another node was added. This could limit available feature set on that - single node. - -### Upgrades - -This release supports upgrading from the following versions of BDR: - -- 3.7.14 -- 4.0.0 and higher - -Please make sure you read and understand the process and limitations described -in the [Upgrade Guide](upgrades) before upgrading. - -## BDR 4.0.0 - -BDR 4.0 is a new major version of BDR and adopted with this release number is -semantic versioning (for details see semver.org). The two previous major -versions are 3.7 and 3.6. 
- -### Improvements - -- BDR on EDB Postgres Advanced 14 now supports the following features - which were previously only available on EDB Postgres Extended: - - - Commit At Most Once - a consistency feature helping - an application to commit each transaction only once, even in the - presence of node failures - - Eager Replication - synchronizes between the nodes of the cluster before - committing a transaction to provide conflict free replication - - Decoding Worker - separation of decoding into separate worker from wal - senders allowing for better scalability with many nodes - - Estimates for Replication Catch-up times - - Timestamp-based Snapshots - providing consistent reads across multiple - nodes for retrieving data as they appeared or will appear at a given time - - Automated dynamic configuration of row freezing to improve consistency - of UPDATE/DELETE conflicts resolution in certain corner cases - - Assessment checks - - Support for handling missing partitions as conflicts rather than errors - - Advanced DDL Handling for NOT VALID constraints and ALTER TABLE - -- BDR on community version of PostgreSQL 12-14 now supports the following features - which were previously only available on EDB Postgres Advanced or - EDB Postgres Extended: - - - Conflict-free Replicated Data Types - additional data types which provide - mathematically proven consistency in asynchronous multi-master update - scenarios - - Column Level Conflict Resolution - ability to use per column last-update - wins resolution so that UPDATEs on different fields can be "merged" without - losing either of them - - Transform Triggers - triggers that are executed on the incoming stream of - data providing ability to modify it or to do advanced programmatic filtering - - Conflict triggers - triggers which are called when conflict is detected, - providing a way to use custom conflict resolution techniques - - CREATE TABLE AS replication - - Parallel Apply - allow multiple writers to apply the incoming 
changes - -- Support streaming of large transactions. - - This allows BDR to stream a large transaction (greater than - `logical_decoding_work_mem` in size) either to a file on the downstream - or to a writer process. This ensures that the transaction is decoded - even before it's committed, thus improving parallelism. Further, the - transaction can even be applied concurrently if streamed straight to a - writer. This improves parallelism even more. - - When large transactions are streamed to files, they are decoded and - the decoded changes are sent to the downstream even before they are - committed. The changes are written to a set of files and applied when - the transaction finally commits. If the transaction aborts, the changes - are discarded, thus wasting resources on both upstream and downstream. - - Sub-transactions are also handled automatically. - - This feature is available on PostgreSQL 14, EDB Postgres Extended 13+ and - EDB Postgres Advanced 14, see [Feature Compatibility](feature-matrix) - appendix for more details on which features can be used on which versions - of Postgres. - -- The differences that existed in earlier versions of BDR between standard - and enterprise edition have been removed. With BDR 4.0 there is one extension - for each supported Postgres distribution and version, i.e., PostgreSQL v12-14, - EDB Postgres Extended v12-14, and EDB Postgres Advanced 12-14. - - Not all features are available on all versions of PostgreSQL, the available - features are reported via feature flags using either `bdr_config` command - line utility or `bdr.bdr_features()` database function. See - [Feature Compatibility](feature-matrix) appendix for more details. - -- There is no pglogical 4.0 extension that corresponds to the BDR 4.0 extension. - BDR no longer has a requirement for pglogical. - - This means also that only BDR extension and schema exist and any - configuration parameters were renamed from `pglogical.` to `bdr.`. 
- -- Some configuration options have change defaults for better post-install - experience: - - - Parallel apply is now enabled by default (with 2 writers). Allows for - better performance, especially with streaming enabled. - - `COPY` and `CREATE INDEX CONCURRENTLY` are now streamed directly to writer - in parallel (on Postgres versions where streaming is supported) to all - available nodes by default, eliminating or at least reducing replication - lag spikes after these operations. - - The timeout for global locks have been increased to 10 minutes - - The `bdr.min_worker_backoff_delay` now defaults to 1s so that - subscriptions retry connection only once per second on error - -- Greatly reduced the chance of false positives in conflict detection during - node join for table that use origin based conflict detection - -- Move configuration of CAMO pairs to Raft (CAMO) - - To reduce chances of misconfiguration and make CAMO pairs within the - BDR cluster known globally, move the CAMO configuration from the - individual node's postgresql.conf to BDR system catalogs managed by - Raft. This for example can prevent against inadvertently dropping a - node that's still configured to be a CAMO partner for another active - node. - - Please see the [Upgrades chapter](upgrades#upgrading-a-camo-enable-cluster) - for details on the upgrade process. - - This deprecates GUCs `bdr.camo_partner_of` and `bdr.camo_origin_for` - and replaces the functions `bdr.get_configured_camo_origin_for()` - and `get_configured_camo_partner_of` with - `bdr.get_configured_camo_partner`. - -### Upgrades - -This release supports upgrading from the following version of BDR: - -- 3.7.13.1 - -Please make sure you read and understand the process and limitations described -in the [Upgrade Guide](upgrades) before upgrading. 
diff --git a/product_docs/docs/bdr/4.0/upgrades.mdx b/product_docs/docs/bdr/4.0/upgrades.mdx deleted file mode 100644 index 0e8f803d712..00000000000 --- a/product_docs/docs/bdr/4.0/upgrades.mdx +++ /dev/null @@ -1,355 +0,0 @@ ---- -navTitle: Upgrades -title: Application Schema Upgrades -originalFilePath: upgrades.md - ---- - -In this chapter we discuss upgrading software on a BDR cluster and how -to minimize downtime for applications during the upgrade. - -## Overview - -BDR cluster has two sets of software, the underlying PostgreSQL software -or some flavor of it and the PGLogical/BDR software. We will discuss -upgrading either or both of these softwares versions to their supported -major releases. - -To upgrade a BDR cluster, the following steps need to be performed on -each node: - -- plan the upgrade -- prepare for the upgrade -- upgrade the server software -- restart Postgres -- check and validate the upgrade - -## Upgrade Planning - -While the BDR 3.7 release supports PostgreSQL 11 - 13, BDR 4.0 supports -PostgreSQL versions 12 - 14. Please refer to (product-matrix.md) -page for the full list compatible software. Since BDR 4.0 supports newer -PostgreSQL releases, while upgrading from BDR 3.7 to BDR 4.0, it's also -recommended to upgrade to a newer PostgreSQL release. - -There are broadly two ways to upgrade the BDR version. - -- Upgrading one node at a time to the newer BDR version. -- Joining a new node running a newer version of the BDR software and - then optionally drop one of the old nodes. - -If you are only interested in upgrading the BDR software, any of the two -methods can be used. But if you also want to upgrade the PostgreSQL -version, then the second method must be used. - -!!! Warning - At this point in time, both of these two methods still have their - issues and need to be considered beta. We do not recommend to - upgrade from BDR 3.7 to BDR 4.0 just yet. This restriction will - be lifted in future versions of BDR 4. 
- -### Rolling Server Software Upgrades - -A rolling upgrade is the process where the below [Server -Software Upgrade](#Server-Software-Upgrade) is performed on each node in the -BDR Group one after another, while keeping the replication working. - -An upgrade to 4.0 is only supported from 3.7 in beta testing mode, -using a specific minimum -maintenance release (e.g. 3.7.15). Please consult the Release Notes -for the actual required minimum version. So if a node -is running with an older 3.7 release, it must first be upgraded to -the minimum required version and can only then be upgraded to 4.0. - -Just as with a single-node database, it's possible to stop all nodes, -perform the upgrade on all nodes and only then restart the entire -cluster. This strategy of upgrading all nodes at the same time avoids -running with mixed BDR versions and therefore is the simplest, but -obviously incurs some downtime. - -During the upgrade process, the application can be switched over to a node -which is currently not being upgraded to provide continuous availability of -the BDR group for applications. - -While the cluster is going through a rolling upgrade, replication happens -between mixed versions of BDR. For example, nodeA will have BDR 3.7.11, while -nodeB and nodeC will have 4.0.0. In this state, the replication and group -management will use the protocol and features from the oldest version (3.7.11 -in case of this example), so any new features provided by the newer version -which require changes in the protocol will be disabled. Once all nodes are -upgraded to the same version, the new features are automatically enabled. - -A BDR cluster is designed to be easily upgradeable. Most BDR releases -support rolling upgrades, which means running part of the cluster on one -release level and the remaining part of the cluster on a second, compatible, -release level. 
- -A rolling upgrade starts with a cluster with all nodes at a prior release, -then proceeds by upgrading one node at a time to the newer release, until -all nodes are at the newer release. Should problems occur, do not attempt -to downgrade without contacting Technical Support to discuss and provide -options. - -An upgrade process may take an extended period of time when the user decides -caution is required to reduce business risk, though this should not take any -longer than 30 days without discussion and explicit agreement from Technical -Support to extend the period of coexistence of two release levels. - -In case of problems during upgrade, do not initiate a second upgrade to a -newer/different release level. Two upgrades should never occur concurrently -in normal usage. Nodes should never be upgraded to a third release without -specific and explicit instructions from Technical Support. A case where -that might occur is if an upgrade failed for some reason and a Hot Fix was -required to continue the current cluster upgrade process to successful -conclusion. BDR has been designed and tested with more than 2 release -levels, but this cannot be relied upon for production usage except in -specific cases. - -### Rolling Upgrade Using Node Join - -The other method of upgrading BDR software, along with or without upgrading -the underlying PostgreSQL major version, is to join a new node -to the cluster and later drop one of the existing nodes running -the older version of the software. Even with this method, some features -that are available only in the newer version of the software may remain -unavailable until all nodes are finally upgraded to the newer versions. - -A new node running this release of BDR 4.0 can join a 3.7 cluster, -where each node in the cluster is running the latest 3.7.x version of -BDR. 
The joining node may run any of the supported PostgreSQL versions -12-14 but mixing of PostgreSQL, EDB Postgres Extended and EDB Postgres Advanced -is currently not supported. - -Care must be taken to not use features that are available only in -the newer PostgreSQL versions, until all nodes are upgraded to the -newer and same release of PostgreSQL. This is especially true for any -new DDL syntax that may have been added to newer release of PostgreSQL. - -Note that `bdr_init_physical` makes a byte-by-byte copy of the source node. -So it cannot be used while upgrading from one major PostgreSQL version -to another. In fact, currently `bdr_init_physical` requires that even -the BDR version of the source and the joining node is exactly the same. So -it cannot be used for rolling upgrades via joining a new node method. In -all such cases, a logical join must be used. - -### Upgrading a CAMO-Enabled cluster - -CAMO protection requires at least one of the nodes of a CAMO pair to -be operational. For upgrades, we recommend to ensure that no CAMO -protected transactions are running concurrent to the upgrade, or to -use a rolling upgrade strategy, giving the nodes enough time to -reconcile in between the upgrades and the corresponding node downtime -due to the upgrade. - -Configuration of CAMO pairs has changed significantly compared to BDR -3.7: instead of GUCs in postgresql.conf, the pairing is now stored in -BDR system catalog `bdr.camo_pairs`. To upgrade a BDR cluster with -CAMO pairs from 3.7 to 4.0, the following steps need to be performed: - -- Add new upgraded nodes to the cluster in a rolling upgrade fashion. -- Configure the new CAMO pairs with the `bdr.add_camo_pair()` function. -- Switch the application to the new nodes. -- Drop the old nodes. - -!!!Note - `bdr.camo_partner_of` and `bdr.camo_origin_for` configurations are no - longer needed in BDR 4.0. 
- -## Upgrade Preparation - -Each major release of BDR contains several changes that may affect -compatibility with -previous releases. These may affect the Postgres configuration, -deployment scripts as well as applications using BDR. We recommend to -consider and possibly adjust in advance of the upgrade. - -### pglogical - -There is no pglogical4 and BDR4 will not work if any version of pglogical is -loaded via `shared_preload_libraries` to the same instance of Postgres. - -### Node Management - -The `bdr.create_node_group()` function has seen a number of changes: - -- It is now possible to create sub-groups, resulting in a tree-of-groups - structure of the whole BDR cluster. Monitoring views were updated - accordingly. -- The deprecated parameters `insert_to_update`, `update_to_insert`, - `ignore_redundant_updates`, `check_full_tuple` and `apply_delay` were - removed. - Use `bdr.alter_node_set_conflict_resolver()` instead of `insert_to_update`, - `update_to_insert`. The `check_full_tuple` is no longer needed as it is - handled automatically based on table conflict detection configuration. - -### Conflicts - -The configuration of conflict resolution and logging is now copied from -join source node to the newly joining node, rather than using defaults on the -new node. - -The default conflict resolution for some of the conflict types was changed. -See (conflicts.md#default-conflict-resolvers) for the new defaults. - -The conflict logging interfaces have changed from `bdr.alter_node_add_log_config` -and `bdr.alter_node_remove_log_config` to `bdr.alter_node_set_log_config`. - -The default conflict logging table is now named `bdr.conflict_history` and the -old `bdr.apply_log` no longer exists. The new table is partitioned using the -Autopartition feature of BDR. - -All conflicts are now logged by default to both log file and the conflict -table. - -Deprecated functions `bdr.row_version_tracking_enable()` and -`bdr.row_version_tracking_disable()` were removed. 
Use -`bdr.alter_table_conflict_detection()` instead. - -Some of the configuration for conflict handling is no longer stored in -`pglogical` schema. Any diagnostic queries that were using the `pglogical` -tables directly will have to switch to appropriate tables in `bdr` schema. -Queries using `bdr.node_group`, `bdr.local_node_summary`, -`bdr.local_node_summary` or -`bdr.node_local_info` will need to use the new columns `sub_repsets` and -`pub_repsets` instead of `replication_sets`. - -### Removed Or Renamed Settings (GUCs) - -All the `pglogical.` prefixed configuration variables were renamed to use `bdr.` -prefix instead. - -## Server Software Upgrade - -The upgrade of BDR software on individual nodes happens in-place. There is no -need for backup and restore when upgrading the BDR extension. -!!! - -!!! Warning - This method cannot be currently used for upgrading BDR 3.7 to 4.0. Only - way to upgrade 3.7 to 4.0 is to join 4.0 nodes into BDR 3.7 cluster as - described in [Rolling Upgrade Using Node Join](#rolling_upgrade_using_node_join) - section. This restriction may be lifted in future versions of BDR 4. - -The first step in the upgrade is to install the new version of the BDR packages, which -will install both the new binary and the extension SQL script. This step depends -on the operating system used - -### Restart Postgres - -Upgrading the binary and extension scripts by itself does not upgrade BDR -in the running instance of PostgreSQL. To do that, the PostgreSQL instance -needs to be restarted so that the new BDR binary can be loaded (the BDR binary -is loaded at the start of the PostgreSQL server). After that, the node is -upgraded. The extension SQL upgrade scripts are executed automatically as -needed. - -!!! Warning - It's important to never run the `ALTER EXTENSION ... 
UPDATE` command before the - PostgreSQL instance is restarted, as that will only upgrade the SQL-visible - extension but keep the old binary, which can cause unpredictable behaviour or - even crashes. The `ALTER EXTENSION ... UPDATE` command should never be needed; - BDR4 maintains the SQL-visible extension automatically as needed. - -### Upgrade Check and Validation - -After this procedure, your BDR node is upgraded. You can verify the current -version of the BDR4 binary like this: - -```sql -SELECT bdr.bdr_version(); -``` - -Always check the [monitoring](monitoring) after upgrade -of a node to confirm that the upgraded node is working as expected. - -## Database Encoding - -We recommend using `UTF-8` encoding in all replicated databases. -BDR does not support replication between databases with different -encoding. There is currently no supported path to upgrade/alter encoding. - -Similar to the upgrade of BDR itself, there are two approaches to -upgrading the application schema. The simpler option is to stop all -applications affected, perform the schema upgrade and restart the -application upgraded to use the new schema variant. Again, this -imposes some downtime. - -To eliminate this downtime, BDR offers ways to perform a rolling -application schema upgrade as documented in the following section. - -## Rolling Application Schema Upgrades - -By default, DDL will automatically be sent to all nodes. This can be -controlled manually, as described in [DDL Replication](ddl), which -could be used to create differences between database schemas across nodes. -BDR is designed to allow replication to continue even while minor -differences exist between nodes. These features are designed to allow -application schema migration without downtime, or to allow logical -standby nodes for reporting or testing. - -!!! Warning - Application Schema Upgrades are managed by the user, not by BDR. - Careful scripting will be required to make this work correctly - on production clusters. 
Extensive testing is advised. - -Details of this are covered here -[Replicating between nodes with differences](appusage). - -When one node runs DDL that adds a new table, nodes that have not -yet received the latest DDL will need to cope with the extra table. -In view of this, the appropriate setting for rolling schema upgrades -is to configure all nodes to apply the `skip` resolver in case of a -`target_table_missing` conflict. This must be performed before any -node has additional tables added, and is intended to be a permanent -setting. - -This is done with the following query, that must be **executed -separately on each node**, after replacing `node1` with the actual -node name: - -```sql -SELECT bdr.alter_node_set_conflict_resolver('node1', - 'target_table_missing', 'skip'); -``` - -When one node runs DDL that adds a column to a table, nodes that have not -yet received the latest DDL will need to cope with the extra columns. -In view of this, the appropriate setting for rolling schema -upgrades is to configure all nodes to apply the `ignore` resolver in -case of a `target_column_missing` conflict. This must be performed -before one node has additional columns added and is intended to be a -permanent setting. - -This is done with the following query, that must be **executed -separately on each node**, after replacing `node1` with the actual -node name: - -```sql -SELECT bdr.alter_node_set_conflict_resolver('node1', - 'target_column_missing', 'ignore'); -``` - -When one node runs DDL that removes a column from a table, nodes that -have not yet received the latest DDL will need to cope with the missing column. -This situation will cause a `source_column_missing` conflict, which uses -the `use_default_value` resolver. Thus, columns that neither -accept NULLs nor have a DEFAULT value will require a two step process: - -1. Remove NOT NULL constraint or add a DEFAULT value for a column - on all nodes. -2. Remove the column. 
- -Constraints can be removed in a rolling manner. -There is currently no supported way for coping with adding table -constraints in a rolling manner, one node at a time. - -When one node runs a DDL that changes the type of an existing column, -depending on the existence of binary coercibility between the current -type and the target type, the operation may not rewrite the underlying -table data. In that case, it will be only a metadata update of the -underlying column type. Rewrite of a table is normally restricted. -However, in controlled DBA environments, it is possible to change -the type of a column to an automatically castable one by adopting -a rolling upgrade for the type of this column in a non-replicated -environment on all the nodes, one by one. More details are provided in the -[ALTER TABLE](ddl) section. diff --git a/product_docs/docs/bdr/4/appusage.mdx b/product_docs/docs/bdr/4/appusage.mdx new file mode 100644 index 00000000000..65e7625fa3d --- /dev/null +++ b/product_docs/docs/bdr/4/appusage.mdx @@ -0,0 +1,680 @@ +--- +title: Application use +--- + +Learn about the application from a user perspective. + +## Application behavior + +BDR supports replicating changes made on one node to other nodes. + +BDRs, by default, replicate all changes from INSERT, UPDATE, DELETE +and TRUNCATE operations from the source node to other nodes. Only the final changes +are sent, after all triggers and rules are processed. For example, +`INSERT ... ON CONFLICT UPDATE` sends either an insert or an update +depending on what occurred on the origin. If an update or delete affects +zero rows, then no changes are sent. + +INSERT can be replicated without any preconditions. + +For updates and deletes to replicate on other nodes, we must be able to +identify the unique rows affected. BDR requires that a table have either +a PRIMARY KEY defined, a UNIQUE constraint, or an explicit REPLICA IDENTITY +defined on specific columns. 
If one of those isn't defined, a warning is
+generated, and later updates or deletes are explicitly blocked.
+If REPLICA IDENTITY FULL is defined for a table, then a unique index isn't required.
+In that case, updates and deletes are allowed and use the first non-unique
+index that is live, valid, not deferred, and doesn't have expressions or WHERE
+clauses. Otherwise, a sequential scan is used.
+
+You can use TRUNCATE even without a defined replication identity.
+Replication of TRUNCATE commands is supported, but take care
+when truncating groups of tables connected by foreign keys. When replicating
+a truncate action, the subscriber truncates the same group of tables that
+was truncated on the origin, either explicitly specified or implicitly
+collected by CASCADE, except in cases where replication sets are defined.
+See [Replication sets](repsets) for further details and examples.
+This works correctly if all affected tables are part of the same
+subscription. But if some tables to be truncated on the subscriber have
+foreign-key links to tables that aren't part of the same (or any)
+replication set, then applying the truncate action on the
+subscriber fails.
+
+Row-level locks taken implicitly by INSERT, UPDATE, and DELETE commands are
+replicated as the changes are made.
+Table-level locks taken implicitly by INSERT, UPDATE, DELETE, and TRUNCATE
+commands are also replicated.
+Explicit row-level locking (`SELECT ... FOR UPDATE/FOR SHARE`) by user sessions
+isn't replicated, nor are advisory locks. Information stored by transactions
+running in SERIALIZABLE mode isn't replicated to other nodes. The
+transaction isolation level of SERIALIZABLE is supported, but transactions
+aren't serialized across nodes in the presence of concurrent
+transactions on multiple nodes.
+
+If DML is executed on multiple nodes concurrently, then potential conflicts
+might occur if executing with asynchronous replication. These must be
+either handled or avoided.
Various avoidance mechanisms are possible, +discussed in [Conflicts](conflicts). + +Sequences need special handling, described in [Sequences](sequences). + +Binary data in BYTEA columns is replicated normally, allowing "blobs" of data +up to 1 GB in size. Use of the PostgreSQL "large object" facility isn't +supported in BDR. + +Rules execute only on the origin node so aren't executed during apply, +even if they're enabled for replicas. + +Replication is possible only from base tables to base tables. That is, the +tables on the source and target on the subscription side must be +tables, not views, materialized views, or foreign tables. Attempts to +replicate tables other than base tables result in an error. +DML changes that are made through updatable views are resolved to +base tables on the origin and then applied to the same base table name on +the target. + +BDR supports partitioned tables transparently, meaning that a partitioned +table can be added to a replication set and +changes that involve any of the partitions are replicated downstream. + +By default, triggers execute only on the origin node. For example, an INSERT +trigger executes on the origin node and is ignored when you apply the change on +the target node. You can specify for triggers to execute on both the origin +node at execution time and on the target when it's replicated ("apply time") +by using `ALTER TABLE ... ENABLE ALWAYS TRIGGER`, or use the `REPLICA` option +to execute only at apply time: `ALTER TABLE ... ENABLE REPLICA TRIGGER`. + +Some types of trigger aren't executed on apply, even if they exist on a +table and are currently enabled. Trigger types not executed are: + +- Statement-level triggers (`FOR EACH STATEMENT`) +- Per-column UPDATE triggers (`UPDATE OF column_name [, ...]`) + +BDR replication apply uses the system-level default search_path. 
Replica
+triggers, stream triggers, and index expression functions can assume
+other search_path settings that then fail when they execute on apply.
+To prevent this from occurring, resolve object references clearly using
+either only the default search_path, always use fully qualified references to
+objects, e.g., schema.objectname, or set the search path for a function using
+`ALTER FUNCTION ... SET search_path = ...` for the functions affected.
+
+BDR assumes that there are no issues related to text or other
+collatable datatypes, i.e., all collations in use are available on all
+nodes, and the default collation is the same on all nodes. Replication of
+changes uses equality searches to locate Replica Identity values, so this
+doesn't have any effect except where unique indexes are explicitly defined
+with nonmatching collation qualifiers. Row filters might be affected
+by differences in collations if collatable expressions were used.
+
+BDR handling of very long "toasted" data in PostgreSQL is transparent to
+the user. The TOAST "chunkid" values likely differ between
+the same row on different nodes, but that doesn't cause any problems.
+
+BDR can't work correctly if Replica Identity columns are marked as external.
+
+PostgreSQL allows CHECK() constraints that contain volatile functions. Since
+BDR re-executes CHECK() constraints on apply, any subsequent re-execution that
+doesn't return the same result as previously causes data divergence.
+
+BDR doesn't restrict the use of foreign keys. Cascading FKs are allowed.
+ +## Nonreplicated statements + +None of the following user commands are replicated by BDR, so their effects +occur on the local/origin node only: + +- Cursor operations (DECLARE, CLOSE, FETCH) +- Execution commands (DO, CALL, PREPARE, EXECUTE, EXPLAIN) +- Session management (DEALLOCATE, DISCARD, LOAD) +- Parameter commands (SET, SHOW) +- Constraint manipulation (SET CONSTRAINTS) +- Locking commands (LOCK) +- Table maintenance commands (VACUUM, ANALYZE, CLUSTER, REINDEX) +- Async operations (NOTIFY, LISTEN, UNLISTEN) + +Since the `NOTIFY` SQL command and the `pg_notify()` functions +aren't replicated, notifications aren't reliable in case of failover. +This means that notifications can easily be lost at failover if a +transaction is committed just when the server crashes. +Applications running `LISTEN` might miss notifications in case of failover. +This is true in standard PostgreSQL replication, and BDR doesn't +yet improve on this. CAMO and Eager Replication options don't +allow the `NOTIFY` SQL command or the `pg_notify()` function. + +## DML and DDL replication + +BDR doesn't replicate the DML statement. It replicates the changes +caused by the DML statement. For example, an UPDATE that changed +two rows replicates two changes, whereas a DELETE that didn't +remove any rows doesn't replicate anything. This means that the results +of executing volatile statements are replicated, ensuring there's no +divergence between nodes as might occur with statement-based replication. + +DDL replication works differently to DML. For DDL, BDR replicates the +statement, which then executes on all nodes. So a `DROP TABLE IF EXISTS` +might not replicate anything on the local node, but the statement is +still sent to other nodes for execution if DDL replication is enabled. +Full details are covered in [DDL replication](ddl). + +BDR works to ensure that intermixed DML and DDL +statements work correctly, even in the same transaction. 
+ +## Replicating between different release levels + +BDR is designed to replicate between nodes that have different major +versions of PostgreSQL. This feature is designed to allow major +version upgrades without downtime. + +BDR is also designed to replicate between nodes that have different +versions of BDR software. This feature is designed to allow version +upgrades and maintenance without downtime. + +However, while it's possible to join a node with a major version in +a cluster, you can't add a node with a minor version if the cluster +uses a newer protocol version. This returns an error. + +Both of these features might be affected by specific restrictions. +See [Release notes](release-notes) for any known incompatibilities. + +## Replicating between nodes with differences + +By default, DDL is automatically sent to all nodes. You can control this manually, as described in [DDL Replication](ddl), and you could use it to create differences between database schemas across nodes. +BDR is designed to allow replication to continue even with minor +differences between nodes. These features are designed to allow +application schema migration without downtime or to allow logical +standby nodes for reporting or testing. + +Currently, replication requires the same table name on all nodes. A future +feature might allow a mapping between different table names. + +It is possible to replicate between tables with dissimilar partitioning +definitions, such as a source that is a normal table replicating to a +partitioned table, including support for updates that change partitions +on the target. It can be faster if the partitioning definition is the +same on the source and target since dynamic partition routing doesn't need to execute at apply time. +For details, see [Replication sets](repsets). + +By default, all columns are replicated. +BDR replicates data columns based on the column name. 
If a column +has the same name but a different datatype, we attempt to cast from the source +type to the target type, if casts were defined that allow that. + +BDR supports replicating between tables that have a different number of columns. + +If the target has missing columns from the source, then BDR raises +a `target_column_missing` conflict, for which the default conflict resolver +is `ignore_if_null`. This throws an error if a non-NULL value arrives. +Alternatively, you can also configure a node with a conflict resolver of `ignore`. +This setting doesn't throw an error but silently ignores any additional +columns. + +If the target has additional columns not seen in the source record, then BDR +raises a `source_column_missing` conflict, for which the default conflict resolver +is `use_default_value`. Replication proceeds if the additional columns +have a default, either NULL (if nullable) or a default expression, but +throws an error and halts replication if not. + +Transform triggers can also be used on tables to provide default values +or alter the incoming data in various ways before apply. + +If the source and the target have different constraints, then +replication is attempted, but it might fail if the rows from +source can't be applied to the target. Row filters can help here. + +Replicating data from one schema to a more relaxed schema won't cause failures. +Replicating data from a schema to a more restrictive schema can be a source of +potential failures. +The right way to solve this is to place a constraint on the more relaxed side, +so bad data can't be entered. That way, no bad data ever arrives +by replication, so it never fails the transform into the more restrictive +schema. For example, if one schema has a column of type TEXT and another schema +defines the same column as XML, add a CHECK constraint onto the TEXT column +to enforce that the text is XML. + +You can define a table with different indexes on each node. 
By default, the +index definitions are replicated. See [DDL replication](ddl) to +specify how to create an index only on a subset of nodes or just locally. + +Storage parameters, such as `fillfactor` and `toast_tuple_target`, can differ +between nodes for a table without problems. An exception to that is the +value of a table's storage parameter `user_catalog_table` must be identical +on all nodes. + +A table being replicated must be owned by the same user/role on each node. +See [Security and roles](security) for further discussion. + +Roles can have different passwords for connection on each node, although +by default changes to roles are replicated to each node. See +[DDL replication](ddl) to specify how to alter a role password only on a +subset of nodes or locally. + +## Comparison between nodes with differences + +LiveCompare is a tool for data comparison on a database, against BDR and +non-BDR nodes. It needs a minimum of two connections to compare against +and reach a final result. + +Since LiveCompare 1.3, you can configure with `all_bdr_nodes` set. This +saves you from clarifying all the relevant DSNs for each separate node in the +cluster. A BDR cluster has N amount of nodes with connection information, but +it's only the initial and output connection that LiveCompare 1.3+ needs +to complete its job. Setting `logical_replication_mode` states how all the +nodes are communicating. + +All the configuration is done in a `.ini` file, named `bdrLC.ini`, for example. +Find templates for this configuration file in +`/etc/2ndq-livecompare/`. + +While LiveCompare executes, you see N+1 progress bars, N being +the number of processes. Once all the tables are sourced, a time displays, +as the transactions per second (tps) was measured. This continues to +count the time, giving you an estimate and then a total execution time at the end. + +This tool offers a lot of customization and filters, such as tables, schemas, and +replication_sets. 
LiveCompare can use stop-start without losing context
+information, so it can run at convenient times. After the comparison, a
+summary and a DML script are generated so you can review it. Apply
+the DML to fix any differences found.
+
+## General rules for applications
+
+BDR uses replica identity values to identify the rows to
+change.
+Applications can cause difficulties if they insert, delete, and then later
+reuse the same unique identifiers.
+This is known as the [ABA problem](https://en.wikipedia.org/wiki/ABA_problem). BDR can't know whether the rows are the
+current row, the last row, or much older rows.
+
+Similarly, since BDR uses table names to identify the table against which
+changes are replayed, a similar ABA problem exists with applications that
+create, drop, and then later reuse the same object names.
+
+These issues give rise to some simple rules for applications to follow:
+
+- Use unique identifiers for rows (INSERT).
+- Avoid modifying unique identifiers (UPDATE).
+- Avoid reusing deleted unique identifiers.
+- Avoid reusing dropped object names.
+
+In the general case, breaking those rules can lead to data anomalies and
+divergence. Applications can break those rules as long as certain conditions
+are met, but use caution: while anomalies are unlikely, they aren't
+impossible. For example, a row value can be reused as long as the DELETE was replayed on all nodes, including down nodes. This might normally occur in
+less than a second but can take days if a severe issue occurred
+on one node that prevented it from restarting correctly.
+
+## Timing considerations and synchronous replication
+
+Being asynchronous by default, peer nodes might lag behind, making it
+possible for a client connected to multiple BDR nodes or switching
+between them to read stale data.
+
+A [queue wait function](functions#bdrwait_for_apply_queue) is
+provided for clients or proxies to prevent such stale reads.
+
+The synchronous replication features of Postgres are available to BDR
+as well. In addition, BDR provides multiple variants for more synchronous
+replication. See
+[Durability and performance options](durability) for an overview and comparison of all variants available and
+its different modes.
+
+## Application testing
+
+You can test BDR applications using the following programs,
+in addition to other techniques.
+
+- [TPAexec]
+- [pgbench with CAMO/Failover options]
+- [isolationtester with multi-node access]
+
+### TPAexec
+
+TPAexec is the system used by EDB to deploy reference TPA
+architectures, including those based on Postgres-BDR.
+
+TPAexec includes test suites for each reference architecture. It also
+simplifies creating and managing a local collection of tests to run
+against a TPA cluster, using a syntax like the following:
+
+```
+tpaexec test mycluster mytest
+```
+
+We strongly recommend that developers write their own multi-node suite
+of TPAexec tests that verify the main expected properties of the
+application.
+
+### pgbench with CAMO/Failover options
+
+In EDB Postgres Extended, pgbench was extended to allow users to
+run failover tests while using CAMO or regular BDR deployments. The following options were added:
+
+```
+-m, --mode=regular|camo|failover
+mode in which pgbench should run (default: regular)
+
+--retry
+retry transactions on failover
+```
+
+In addition to these options, the connection information about the
+peer node for failover must be specified in [DSN
+form](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING).
+
+- Use `-m camo` or `-m failover` to specify the mode for pgbench.
+  You can use the `-m failover` specification to test failover in
+  regular BDR deployments.
+
+- Use `--retry` to specify whether to retry transactions when
+  failover happens with `-m failover` mode. This option is enabled by default
+  for `-m camo` mode.
+ +Here's an example in a CAMO environment: + +```sh + pgbench -m camo -p $node1_port -h $node1_host bdrdemo \ + "host=$node2_host user=postgres port=$node2_port dbname=bdrdemo" +``` + +This command runs in camo mode. It connects to node1 and runs the tests. If the +connection to node1 is lost, then pgbench connects to +node2. It queries node2 to get the status of in-flight transactions. +Aborted and in-flight transactions are retried in camo mode. + +In failover mode, if `--retry` is specified, then in-flight transactions are retried. In +this scenario there's no way to find the status of in-flight transactions. + +### isolationtester with multi-node access + +isolationtester was extended to allow users to run tests on multiple +sessions and on multiple nodes. This is used for internal BDR testing, +although it's also available for use with user application testing. + +``` +$ isolationtester \ + --outputdir=./iso_output \ + --create-role=logical \ + --dbname=postgres \ + --server 'd1=dbname=node1' \ + --server 'd2=dbname=node2' \ + --server 'd3=dbname=node3' +``` + +Isolation tests are a set of tests for examining concurrent behaviors in +PostgreSQL. These tests require running multiple interacting transactions, +which requires managing multiple concurrent connections and therefore +can't be tested using the normal `pg_regress` program. The name "isolation" +comes from the fact that the original motivation was to test the +serializable isolation level. Tests for other sorts of concurrent +behaviors were added as well. + +It's built using PGXS as an external module. +On installation, it creates the `isolationtester` binary file, which is run by +`pg_isolation_regress` to perform concurrent regression tests and observe +results. + +`pg_isolation_regress` is a tool similar to `pg_regress`, but instead of using +psql to execute a test, it uses isolationtester. It accepts all the same +command-line arguments as `pg_regress`. 
It was modified to accept multiple +hosts as parameters. It then passes the host conninfo along with server names +to the `isolationtester` binary. Isolation tester compares these server names with the +names specified in each session in the spec files and runs given tests on +respective servers. + +To define tests with overlapping transactions, we use test specification +files with a custom syntax. To add +a new test, place a spec file in the `specs/` subdirectory, add the expected +output in the `expected/` subdirectory, and add the test's name to the makefile. + +Isolationtester is a program that uses libpq to open multiple connections +and executes a test specified by a spec file. A libpq connection string +specifies the server and database to connect to. Defaults derived from +environment variables are used otherwise. + +Specification consists of five parts, tested in this order: + +`server ""` + + This defines the name of the servers for the sessions to run on. + There can be zero or more server `""` specifications. + The conninfo corresponding to the names is provided by the command to + run isolationtester. This is described in `quickstart_isolationtest.md`. + This part is optional. + +`setup { }` + + The given SQL block is executed once, in one session only, before running + the test. Create any test tables or other required objects here. This + part is optional. Multiple setup blocks are allowed if needed. Each is + run separately, in the given order. The reason for allowing multiple + setup blocks is that each block is run as a single PQexec submission, + and some statements such as VACUUM can't be combined with others in such + a block. + +`teardown { }` + + The teardown SQL block is executed once after the test is finished. Use + this to clean up in preparation for the next permutation, such as dropping + any test tables created by setup. This part is optional. + +`session ""` + + There are normally several "session" parts in a spec file. 
Each + session is executed in its own connection. A session part consists + of three parts: setup, teardown, and one or more "steps." The per-session + setup and teardown parts have the same syntax as the per-test setup and + teardown, but they are executed in each session. The + setup part typically contains a BEGIN command to begin a transaction. + + A session part also consists of `connect_to` specification. + This points to a server name specified in the beginning that + indicates the server on which this session runs. + + `connect_to ""` + + Each step has the syntax: + + `step "" { }` + + where `` is a name identifying this step, and SQL is a SQL statement + (or statements, separated by semicolons) that's executed in the step. + Step names must be unique across the whole spec file. + +`permutation ""` + + A permutation line specifies a list of steps that are run in that order. + Any number of permutation lines can appear. If no permutation lines are + given, the test program automatically generates all possible orderings + of the steps from each session (running the steps of any one session in + order). The list of steps in a manually specified + "permutation" line doesn't actually have to be a permutation of the + available steps. It can, for instance, repeat some steps more than once + or leave others out. + +Lines beginning with a # are comments. + +For each permutation of the session steps (whether these are manually +specified in the spec file or automatically generated), the isolation +tester runs: + +1. The main setup part +1. Per-session setup parts +1. The selected session steps +1. Per-session teardown +1. The main teardown script + +Each selected step is sent to the connection associated +with its session. + +To run isolation tests in a BDR environment that ran all prerequisite make +commands: + +1. Run `make isolationcheck-install` to install the isolationtester submodule. + +2. 
You can run isolation regression tests using either
+  of the following commands from the bdr-private repo:
+
+  `make isolationcheck-installcheck`
+  `make isolationcheck-makecheck`
+
+To run `isolationcheck-installcheck`, you need to have two or more PostgreSQL
+servers running. Pass the conninfo of each server to `pg_isolation_regress`
+in the BDR makefile.
+  Ex: `pg_isolation_regress --server 'd1=host=myhost dbname=mydb port=5434'
+  --server 'd2=host=myhost1 dbname=mydb port=5432'`
+
+Next, add a `.spec` file containing tests in the `specs/isolation` directory
+of the `bdr-private/` repo. Add a `.out` file in `expected/isolation` directory of
+the `bdr-private/` repo.
+
+Then run
+  `make isolationcheck-installcheck`
+
+`Isolationcheck-makecheck` currently supports running isolation tests on a
+single instance by setting up BDR between multiple databases.
+
+You need to pass appropriate database names and the conninfo of bdr instances
+to `pg_isolation_regress` in the BDR makefile as follows:
+  `pg_isolation_regress --dbname=db1,db2 --server 'd1=dbname=db1'
+  --server 'd2=dbname=db2'`
+
+Then run
+  `make isolationcheck-makecheck`
+
+Each step can contain commands that block until further action has been taken
+(most likely, some other session runs a step that unblocks it or causes a
+deadlock). A test that uses this ability must manually specify valid
+permutations, that is, those that don't expect a blocked session to execute a
+command. If a test doesn't follow that rule, isolationtester cancels it
+after 300 seconds. If the cancel doesn't work, isolationtester exits
+uncleanly after 375 seconds of wait time. Avoid testing invalid
+permutations because they can make the isolation tests take
+a very long time to run, and they serve no useful testing purpose.
+
+isolationtester recognizes that a command has blocked by checking whether it is shown as waiting in the `pg_locks` view. Therefore, only
+blocks on heavyweight locks are detected.
+ +## Performance testing and tuning + +BDR allows you to issue write transactions onto multiple master nodes. +Bringing those writes back together onto each node has a cost in +performance. + +First, replaying changes from another node has a CPU cost, an I/O cost, +and it generates WAL records. The resource use is usually less +than in the original transaction since CPU overheads are lower as a result +of not needing to reexecute SQL. In the case of UPDATE and DELETE +transactions, there might be I/O costs on replay if data isn't cached. + +Second, replaying changes holds table-level and row-level locks that can +produce contention against local workloads. The conflict-free replicated data types (CRDT) and column-level conflict detection (CLCD) features +ensure you get the correct answers even for concurrent updates, but they +don't remove the normal locking overheads. If you get locking contention, +try to avoid conflicting updates or keep transactions as short as +possible. A heavily updated row in a larger transaction causes +a bottleneck on performance for that transaction. Complex applications +require some thought to maintain scalability. + +If you think you're having performance problems, +develop performance tests using the benchmarking tools. pgbench +allows you to write custom test scripts specific to your use case +so you can understand the overheads of your SQL and measure the impact +of concurrent execution. + +If BDR is running slow, then we suggest the following: + +1. Write a custom test script for pgbench, as close as you can make it + to the production system's problem case. +2. Run the script on one node to give you a baseline figure. +3. Run the script on as many nodes as occurs in production, using the + same number of sessions in total as you did on one node. This + shows you the effect of moving to multiple nodes. +4. Increase the number of sessions for these two tests so you can + plot the effect of increased contention on your application. 
+5. Make sure your tests are long enough to account for replication delays. +6. Ensure that replication delay isn't growing during your tests. + +Use all of the normal Postgres tuning features to improve the speed +of critical parts of your application. + +## Assessing suitability + +BDR is compatible with PostgreSQL, but not all PostgreSQL applications are +suitable for use on distributed databases. Most applications are already or +can easily be modified to become BDR compliant. You can undertake an +assessment activity in which you can point your application to a BDR-enabled +setup. BDR provides a few knobs that can be set during the assessment period. +These aid in the process of deciding suitability of your application in +a BDR-enabled environment. + +### Assessing updates of primary key/replica identity + +BDR can't currently perform conflict resolution where the PRIMARY KEY is changed +by an UPDATE operation. You can update the primary key, but you must +ensure that no conflict with existing values is possible. + +BDR provides the following configuration parameter to assess how frequently +the primary key/replica identity of any table is being subjected to update +operations. + +Use these configuration parameters only for assessment. +You can use them on a single node BDR instance, but don't use them on a production +BDR cluster with two or more nodes replicating to each other. In fact, a node +might fail to start or a new node fail to join the cluster if any of the +assessment parameters are set to anything other than `IGNORE`. + +```sql +bdr.assess_update_replica_identity = IGNORE (default) | LOG | WARNING | ERROR +``` + +By enabling this parameter during the assessment period, you can log updates to +the key/replica identity values of a row. You can also potentially block such +updates, if desired. 
For example: + +```sql +CREATE TABLE public.test(g int primary key, h int); +INSERT INTO test VALUES (1, 1); + +SET bdr.assess_update_replica_identity TO 'error'; +UPDATE test SET g = 4 WHERE g = 1; +ERROR: bdr_assess: update of key/replica identity of table public.test +``` + +Apply worker processes always ignore any settings for this parameter. + +### Assessing use of LOCK on tables or in SELECT queries + +Because BDR writer processes operate much like normal user sessions, they're subject to +the usual rules around row and table locking. This can sometimes lead to BDR writer +processes waiting on locks held by user transactions or even by each other. + +BDR provides the following configuration parameter to assess if the application +is taking explicit locks: + +```sql +bdr.assess_lock_statement = IGNORE (default) | LOG | WARNING | ERROR +``` + +Two types of locks that you can track are: + +- Explicit table-level locking (`LOCK TABLE ...`) by user sessions +- Explicit row-level locking (`SELECT ... FOR UPDATE/FOR SHARE`) by user sessions + +By enabling this parameter during the assessment period, you can track (or block) such explicit +locking activity. 
For example: + +```sql +CREATE TABLE public.test(g int primary key, h int); +INSERT INTO test VALUES (1, 1); + +SET bdr.assess_lock_statement TO 'error'; +SELECT * FROM test FOR UPDATE; +ERROR: bdr_assess: "SELECT FOR UPDATE" invoked on a BDR node + +SELECT * FROM test FOR SHARE; +ERROR: bdr_assess: "SELECT FOR SHARE" invoked on a BDR node + +SET bdr.assess_lock_statement TO 'warning'; +LOCK TABLE test IN ACCESS SHARE MODE; +WARNING: bdr_assess: "LOCK STATEMENT" invoked on a BDR node +``` diff --git a/product_docs/docs/bdr/4/camo.mdx b/product_docs/docs/bdr/4/camo.mdx new file mode 100644 index 00000000000..1144dac6bf9 --- /dev/null +++ b/product_docs/docs/bdr/4/camo.mdx @@ -0,0 +1,583 @@ +--- +navTitle: Commit At Most Once +title: Commit At Most Once +--- + +The objective of the Commit At Most Once (CAMO) feature is to prevent +the application from committing more than once. + +Without CAMO, when a client loses connection after a COMMIT is +submitted, the application might not receive a reply from the server +and is therefore unsure whether the transaction committed. + +The application can't easily decide between the two options of: + +- Retrying the transaction with the same data, since this can in some cases cause the data to be entered twice + +- Not retrying the transaction and risk that the data doesn't get + processed at all + +Either of those is a critical error with high-value data. + +One way to avoid this situation is to make sure that the transaction +includes at least one `INSERT` into a table with a unique index, but +that depends on the application design and requires application- +specific error-handling logic, so it isn't effective in all cases. + +The CAMO feature in BDR offers a more general solution and doesn't require an `INSERT`. When activated by +`bdr.enable_camo` or `bdr.commit_scope`, the application +receives a message containing the transaction identifier, if already +assigned. 
 Otherwise, the first write statement in a transaction
+sends that information to the client.
+If the application sends an explicit COMMIT, the protocol ensures that the application receives the notification
+of the transaction identifier before the COMMIT is sent.
+If the server doesn't reply to the COMMIT, the application can
+handle this error by using the transaction identifier to request
+the final status of the transaction from another BDR node.
+If the prior transaction status is known, then the application can safely
+decide whether to retry the transaction.
+
+CAMO works in one of two modes:
+
+- Pair mode
+- With Eager All Node Replication
+
+In pair mode, CAMO works by creating a pair of partner nodes that
+are two BDR master nodes from the same top-level BDR group. In this operation mode,
+each node in the pair knows the outcome of any recent transaction executed
+on the other peer and especially (for our need) knows the outcome of any
+transaction disconnected during COMMIT.
+The node that receives the transactions from
+the application might be referred to as "origin" and the node that confirms these transactions as "partner."
+However, there's no difference in the CAMO configuration for the nodes in the
+CAMO pair. The pair is symmetric.
+
+When combined with [Eager All-Node Replication](eager), CAMO
+enables every peer (that is, a full BDR master node) to act as a CAMO partner.
+No designated CAMO partner must be configured in this mode.
+
+!!! Warning
+    CAMO requires changes to the user's application
+    to take advantage of the advanced error handling. Enabling a parameter isn't enough to gain protection. Reference client implementations
+    are provided to customers upon request.
+
+## Requirements
+
+To use CAMO, an application must issue an explicit COMMIT message as
+a separate request (not as part of a multi-statement request). 
+
+CAMO can't provide status for transactions issued from procedures
+or from single-statement transactions that use implicit commits.
+
+## Configuration
+
+Assume an existing BDR cluster consists of the nodes `node1` and
+`node2`. Both nodes are part of a BDR-enabled database called `bdrdemo`, and both are part
+of the same node group `mygroup`. You can configure the nodes
+to be CAMO partners for each other.
+
+1. Create the BDR cluster where nodes `node1` and `node2` are part of the
+`mygroup` node group.
+1. Run the function `bdr.add_camo_pair()` on one node:
+
+   ```sql
+   SELECT bdr.add_camo_pair('mygroup', 'node1', 'node2');
+   ```
+
+1. Adjust the application to use the COMMIT error handling that CAMO suggests.
+
+We don't recommend enabling CAMO at the server level, as this imposes
+higher latency for all transactions, even when not needed. Instead,
+selectively enable it for individual transactions
+by turning on CAMO at the session or transaction level.
+
+To enable CAMO at the session level:
+
+```sql
+SET bdr.enable_camo = 'remote_commit_flush';
+```
+
+To enable CAMO for individual transactions, after starting the
+transaction and before committing it:
+
+```sql
+SET LOCAL bdr.enable_camo = 'remote_commit_flush';
+```
+
+Valid values for `bdr.enable_camo` that enable CAMO are:
+
+* `off` (default)
+* `remote_write`
+* `remote_commit_async`
+* `remote_commit_flush` or `on`
+
+See the [Comparison](durability#Comparison) of synchronous replication
+modes for details about how each mode behaves.
+Setting `bdr.enable_camo = off` disables this feature, which is the default.
+
+### CAMO with Eager All-Node Replication
+
+To use CAMO with Eager All-Node Replication, no changes are required
+on either node. It is enough to enable the global commit
+scope after the start of the transaction. You don't need to set
+`bdr.enable_camo`.
+
+```sql
+BEGIN;
+SET LOCAL bdr.commit_scope = 'global';
+... 
+
+COMMIT;
+```
+
+The application still needs to be adjusted to use COMMIT error
+handling as specified but is free to connect to any available BDR
+node to query the transaction's status.
+
+## Failure scenarios
+
+Different failure scenarios occur in different
+configurations.
+
+### Data persistence at receiver side
+
+By default, a PGL writer operates in
+`bdr.synchronous_commit = off` mode when applying transactions
+from remote nodes. This holds true for CAMO as well, meaning that
+transactions are confirmed to the origin node possibly before reaching
+the disk of the CAMO partner. In case of a crash or hardware failure,
+it is possible for a confirmed transaction to be unrecoverable on the
+CAMO partner by itself. This isn't an issue as long as the CAMO
+origin node remains operational, as it redistributes the
+transaction once the CAMO partner node recovers.
+
+This in turn means CAMO can protect against a single-node failure,
+which is correct for local mode as well, or even in combination
+with remote write.
+
+To cover an outage of both nodes of a CAMO pair, you can use
+`bdr.synchronous_commit = local` to enforce a flush prior to the
+pre-commit confirmation. This doesn't work with
+either remote write or local mode and has a performance
+impact due to I/O requirements on the CAMO partner in the
+latency sensitive commit path.
+
+### Local mode
+
+When `synchronous_replication_availability = 'async'`, a node
+(i.e., master) detects whether its CAMO partner is
+ready. If not, it temporarily switches to local mode.
+When in local mode, a node commits transactions locally until
+switching back to CAMO mode.
+
+This doesn't allow COMMIT status to be retrieved, but it does
+let you choose availability over consistency. This mode
+can tolerate a single-node failure. In case both nodes of a CAMO pair
+fail, they might choose incongruent commit decisions to maintain
+availability, leading to data inconsistencies. 
+ +For a CAMO partner to switch to ready, it needs to be connected, and +the estimated catchup interval needs to drop below +`bdr.global_commit_timeout`. The current readiness status of a CAMO +partner can be checked with `bdr.is_camo_partner_ready`, while +`bdr.node_replication_rates` provides the current estimate of the catchup +time. + +The switch from CAMO protected to local mode is only ever triggered by +an actual CAMO transaction either because the commit exceeds the +`bdr.global_commit_timeout` or, in case the CAMO partner is already +known, disconnected at the time of commit. This switch is independent +of the estimated catchup interval. If the CAMO pair is configured to +require Raft to switch to local mode, this switch requires a +majority of nodes to be operational (see the `require_raft` flag for +[bdr.add_camo_pair](camo#adding-a-camo-pair)). This can prevent a +split brain situation due to an isolated node from switching to local +mode. If `require_raft` isn't set for the CAMO pair, the origin node +switches to local mode immediately. + +You can configure the detection on the sending node using PostgreSQL +settings controlling keep-alives and timeouts on the TCP connection to +the CAMO partner. +The `wal_sender_timeout` is the time that a node waits +for a CAMO partner until switching to local mode. Additionally, +the `bdr.global_commit_timeout` setting puts a per-transaction +limit on the maximum delay a COMMIT can incur due to the +CAMO partner being unreachable. It might be lower than the +`wal_sender_timeout`, which influences synchronous standbys as +well, and for which a good compromise between responsiveness and +stability must be found. + +The switch from local mode to CAMO mode depends on the CAMO partner +node, which initiates the connection. The CAMO partner tries to +reconnect at least every 30 seconds. After connectivity is +reestablished, it might therefore take up to 30 seconds until the CAMO +partner connects back to its origin node. 
Any lag that accumulated on +the CAMO partner further delays the switch back to CAMO protected +mode. + +Unlike during normal CAMO operation, in local mode there's no +additional commit overhead. This can be problematic, as it allows the +node to continuously process more transactions than the CAMO +pair can normally process. Even if the CAMO partner eventually +reconnects and applies transactions, its lag only ever increases +in such a situation, preventing reestablishing the CAMO protection. +To artificially throttle transactional throughput, BDR provides the +`bdr.camo_local_mode_delay` setting, which allows you to delay a COMMIT in +local mode by an arbitrary amount of time. We recommend measuring +commit times in normal CAMO mode during expected workloads and +configuring this delay accordingly. The default is 5 ms, which reflects +a local network and a relatively quick CAMO partner response. + +Consider the choice of whether to allow local mode in view of +the architecture and the availability requirements. The following examples provide some detail. + +### Example: Symmetric node pair + +This example considers a setup with two BDR nodes that are the +CAMO partner of each other. +This is the only possible configuration starting with BDR4. + +This configuration enables CAMO behavior on both nodes. It's +therefore suitable for workload patterns where it is acceptable to +write concurrently on more than one node, such as in cases that aren't +likely to generate conflicts. + +#### With local mode + +If local mode is allowed, there's no single point of failure. When one node fails: + +- The other node can determine the status of all transactions that + were disconnected during COMMIT on the failed node. +- New write transactions are allowed: + - If the second node also fails, then the outcome of those + transactions that were being committed at that time is + unknown. 
+ +#### Without local mode + +If local mode isn't allowed, then each node requires the other node +for committing transactions, that is, each node is a single point of +failure. When one node fails: + +- The other node can determine the status of all transactions that + were disconnected during COMMIT on the failed node. +- New write transactions are prevented until the node recovers. + +## Application use + +### Overview and requirements + +CAMO relies on a retry loop and specific error handling +on the client side. There are three aspects to it: + +* The result of a transaction's COMMIT needs to be checked and, in + case of a temporary error, the client must retry the transaction. +* Prior to COMMIT, the client must retrieve a global + identifier for the transaction, consisting of a node id and a + transaction id (both 32-bit integers). +* If the current server fails while attempting a COMMIT of a transaction, + the application must connect to its CAMO partner, retrieve the status + of that transaction, and retry depending on the response. + +The application must store the global transaction +identifier only for the purpose of verifying the transaction status in +case of disconnection during COMMIT. In particular, the application +doesn't need an additional persistence layer. If the application +fails, it needs only the information in the database to restart. + +### Adding a CAMO pair + +The function `bdr.add_camo_pair()` configures an existing pair of BDR +nodes to work as a symmetric CAMO pair. + +The `require_raft` option controls how and when to switch to local +mode in case `synchronous_replication_availability` is set to `async`, +allowing such a switch in general. + +#### Synopsis + +```sql +bdr.add_camo_pair(node_group text, left_node text, right_node text, + require_raft bool) +``` + +!!! Note + The names `left` and `right` have no special meaning. + +!!! 
 Note
+    Since BDR version 4.0, only symmetric CAMO configurations are
+    supported, that is, both nodes of the pair act as a CAMO partner for
+    each other.
+
+### Changing the configuration of a CAMO pair
+
+The function `bdr.alter_camo_pair()` allows you to toggle the
+`require_raft` flag. You can't currently change
+the nodes of a pairing. You must instead use `bdr.remove_camo_pair` followed by
+`bdr.add_camo_pair`.
+
+#### Synopsis
+
+```sql
+bdr.alter_camo_pair(node_group text, left_node text, right_node text,
+                require_raft bool)
+```
+
+### Removing a CAMO pair
+
+The function `bdr.remove_camo_pair()` removes a CAMO pairing of two
+nodes and disallows future use of CAMO transactions by
+`bdr.enable_camo` on those two nodes.
+
+#### Synopsis
+
+```sql
+bdr.remove_camo_pair(node_group text, left_node text, right_node text)
+```
+
+!!! Note
+    The names `left` and `right` have no special meaning.
+
+### CAMO partner connection status
+
+The function `bdr.is_camo_partner_connected` allows checking the
+connection status of a CAMO partner node configured in pair mode.
+There currently is no equivalent for CAMO used with
+Eager Replication.
+
+#### Synopsis
+
+```sql
+bdr.is_camo_partner_connected()
+```
+
+#### Return value
+
+A Boolean value indicating whether the CAMO partner is currently
+connected to a WAL sender process on the local node and therefore can
+receive transactional data and send back confirmations.
+
+### CAMO partner readiness
+
+The function `bdr.is_camo_partner_ready` allows checking the readiness
+status of a CAMO partner node configured in pair mode. Underneath,
+this triggers the switch to and from local mode.
+
+#### Synopsis
+
+```sql
+bdr.is_camo_partner_ready()
+```
+
+#### Return value
+
+A Boolean value indicating whether the CAMO partner can reasonably be
+expected to confirm transactions originating from the local node in a
+timely manner (before `bdr.global_commit_timeout` expires).
+
+!!! 
Note + This function queries the past or current state. A + positive return value doesn't indicate whether the CAMO partner can + confirm future transactions. + +### Fetch the CAMO partner + +This function shows the local node's CAMO partner (configured by pair +mode). + +```sql +bdr.get_configured_camo_partner() +``` + +### Wait for consumption of the apply queue from the CAMO partner + +The function `bdr.wait_for_camo_partner_queue` is a wrapper of +`bdr.wait_for_apply_queue` defaulting to query the CAMO partner node. +It yields an error if the local node isn't part of a CAMO pair. + +#### Synopsis + +```sql +bdr.wait_for_camo_partner_queue() +``` + +### Transaction status between CAMO nodes + +This function enables a wait for CAMO transactions to be fully resolved. + +```sql +bdr.camo_transactions_resolved() +``` + +### Transaction status query function + +To check the status of a transaction that was being committed when the node +failed, the application must use this function: + +```sql +bdr.logical_transaction_status(node_id, xid, require_camo_partner) +``` + +With CAMO used in pair mode, use this function only on +a node that's part of a CAMO pair. Along with Eager +replication, you can use it on all nodes. + +In both cases, you must call the function within 15 minutes after +the commit was issued. The CAMO partner must regularly purge +such meta-information and therefore can't provide correct answers for +older transactions. + +Before querying the status of a transaction, this function waits for +the receive queue to be consumed and fully applied. This prevents +early negative answers for transactions that were +received but not yet applied. + +Despite its name, it's not always a read-only operation. +If the status is unknown, the CAMO partner decides whether to +commit or abort the transaction, storing that decision locally to +ensure consistency going forward. + +The client must not call this function before +attempting to commit on the origin. 
Otherwise the transaction might be +forced to roll back. + +#### Synopsis + +```sql +bdr.logical_transaction_status(node_id OID, + xid OID, + require_camo_partner BOOL DEFAULT true) +``` + +#### Parameters + +- `node_id` — The node id of the BDR node the transaction originates + from, usually retrieved by the client before COMMIT from the PQ + parameter `bdr.local_node_id`. +- `xid` — The transaction id on the origin node, usually retrieved by + the client before COMMIT from the PQ parameter `transaction_id` + (requires `enable_camo` to be set to `on`, `remote_write`, + `remote_commit_async`, or `remote_commit_flush`. See + [Commit At Most Once settings](configuration#commit-at-most-once)) +- `require_camo_partner` — Defaults to true and enables configuration + checks. Set to false to disable these checks and query the + status of a transaction that was protected by Eager All-Node + Replication. + +#### Return value + +The function returns one of these results: + +- `'committed'::TEXT` — The transaction was committed, is visible + on both nodes of the CAMO pair, and will eventually be replicated to + all other BDR nodes. No need for the client to retry it. + +- `'aborted'::TEXT` — The transaction was aborted and will not be + replicated to any other BDR node. The client needs to either + retry it or escalate the failure to commit the transaction. + +- `'in progress'::TEXT` — The transaction is still in progress on this + local node and wasn't committed or aborted yet. The transaction might be in the COMMIT phase, waiting for + the CAMO partner to confirm or deny the commit. The recommended + client reaction is to disconnect from the origin node and reconnect + to the CAMO partner to query that instead. With a load balancer or proxy + in between, where the client lacks control over which node gets + queried, the client can only poll repeatedly until the status + switches to either `'committed'` or `'aborted'`. 
+ + For Eager All-Node Replication, peer nodes yield this result for + transactions that aren't yet committed or aborted. This means that + even transactions not yet replicated (or not even started on the + origin node) might yield an `in progress` result on a peer BDR node in + this case. However, the client must not query the transaction + status prior to attempting to commit on the origin. + +- `'unknown'::TEXT` — The transaction specified is unknown, either + because it's in the future, not replicated to that specific node + yet, or too far in the past. The status of such a transaction is + not yet or no longer known. This return value is a sign of improper + use by the client. + +The client must be prepared to retry the function call on error. + +## Interaction with DDL and global locks + +Transactions protected by CAMO can contain DDL operations. However, DDL uses global locks, which already provide some +synchronization among nodes. See +[DDL locking details](ddl#ddl-locking-details) for more +information. + +Combining CAMO with DDL imposes a higher latency and also +increases the chance of global deadlocks. We therefore recommend using a +relatively low `bdr.global_lock_timeout`, which aborts the DDL and +therefore resolves a deadlock in a reasonable amount of time. + +### Nontransactional DDL + +The following DDL operations aren't allowed in a transaction +block and therefore can't benefit from CAMO protection. 
For +these, CAMO is automatically disabled internally: + +* all concurrent index operations (`CREATE`, `DROP`, and `REINDEX`) +* `REINDEX DATABASE`, `REINDEX SCHEMA`, and `REINDEX SYSTEM` +* `VACUUM` +* `CLUSTER` without any parameter +* `ALTER TABLE DETACH PARTITION CONCURRENTLY` +* `ALTER TYPE [enum] ADD VALUE` +* `ALTER SYSTEM` +* `CREATE` and `DROP DATABASE` +* `CREATE` and `DROP TABLESPACE` +* `ALTER DATABASE [db] TABLESPACE` + +## CAMO limitations + +- CAMO is designed to query the results of a recently failed COMMIT on +the origin node, so in case of disconnection, code the application +to immediately request the transaction status from the CAMO partner. +Have as little delay as possible after the failure before +requesting the status. Applications must not rely on CAMO decisions +being stored for longer than 15 minutes. + +- If the application forgets the global identifier assigned, for example +as a result of a restart, there's no easy way to recover +it. Therefore, we recommend that applications wait for outstanding +transactions to end before shutting down. + +- For the client to apply proper checks, a transaction protected by CAMO +can't be a single statement with implicit transaction control. You also can't +use CAMO with a transaction-controlling procedure or +in a `DO` block that tries to start or end transactions. + +- CAMO resolves commit status but doesn't yet resolve pending +notifications on commit. CAMO and Eager replication options don't +allow the `NOTIFY` SQL command or the `pg_notify()` function. +They also don't allow `LISTEN` or `UNLISTEN`. + +- When replaying changes, CAMO transactions may detect conflicts just +the same as other transactions. If timestamp conflict detection is used, +the CAMO transaction uses the timestamp of the prepare on the origin +node, which is before the transaction becomes visible on the origin +node itself. 
+ +## Performance implications + +CAMO extends the Postgres replication protocol by adding a +message roundtrip at commit. Applications have a higher +commit latency than with asynchronous replication, mostly determined +by the roundtrip time between involved nodes. Increasing the number +of concurrent sessions can help to increase parallelism to +obtain reasonable transaction throughput. + +The CAMO partner confirming transactions must store transaction +states. Compared to non-CAMO operation, this might require an +additional seek for each transaction applied from the origin. + +## Client application testing + +Proper use of CAMO on the client side isn't trivial. We strongly +recommend testing the application behavior with the BDR +cluster against failure scenarios such as node crashes or network +outages. + +## CAMO versus group commit + +CAMO doesn't currently work with +[group commit](group_commit). diff --git a/product_docs/docs/bdr/4.0/catalogs.mdx b/product_docs/docs/bdr/4/catalogs.mdx similarity index 84% rename from product_docs/docs/bdr/4.0/catalogs.mdx rename to product_docs/docs/bdr/4/catalogs.mdx index d8287b872ad..466ceb510a1 100644 --- a/product_docs/docs/bdr/4.0/catalogs.mdx +++ b/product_docs/docs/bdr/4/catalogs.mdx @@ -1,13 +1,11 @@ --- -title: Catalogs and Views -originalFilePath: catalogs.md +title: Catalogs and views ---- -This section contains a listing of system catalogs and views used by BDR in -alphabetical order. +--- +Catalogs and views are presented here in alphabetical order. -## User-Visible Catalogs and Views +## User-visible catalogs and views ### `bdr.conflict_history` @@ -15,47 +13,47 @@ This table is the default table where conflicts are logged. The table is RANGE partitioned on column `local_time` and is managed by Autopartition. The default data retention period is 30 days. 
-Access to this table is possible by any table owner, who may see all +Access to this table is possible by any table owner, who can see all conflicts for the tables they own, restricted by row-level security. -For further details see [Logging Conflicts to a Table](conflicts). +For details, see [Logging conflicts to a table](conflicts). -#### `bdr.conflict_history` Columns +#### `bdr.conflict_history` columns | Name | Type | Description | | ----------------------- | ------------------------ | ------------------------------------------------------------------------------------------------------------------------ | -| sub_id | oid | which subscription produced this conflict; can be joined to `bdr.subscription` table | -| local_xid | xid | local transaction of the replication process at the time of conflict | -| local_lsn | pg_lsn | local transaction of the replication process at the time of conflict | -| local_time | timestamp with time zone | local time of the conflict | -| remote_xid | xid | transaction which produced the conflicting change on the remote node (an origin) | -| remote_commit_lsn | pg_lsn | commit lsn of the transaction which produced the conflicting change on the remote node (an origin) | -| remote_commit_time | timestamp with time zone | commit timestamp of the transaction which produced the conflicting change on the remote node (an origin) | -| conflict_type | text | detected type of the conflict (see [List of Conflict Types]) | -| conflict_resolution | text | conflict resolution chosen (see [List of Conflict Resolutions]) | -| conflict_index | regclass | conflicting index (only valid if the index wasn't dropped since) | -| reloid | oid | conflicting relation (only valid if the index wasn't dropped since) | -| nspname | text | name of the schema for the relation on which the conflict has occurred at the time of conflict (does not follow renames) | -| relname | text | name of the relation on which the conflict has occurred at the time of conflict 
(does not follow renames) | -| key_tuple | json | json representation of the key used for matching the row | -| remote_tuple | json | json representation of an incoming conflicting row | -| local_tuple | json | json representation of the local conflicting row | -| apply_tuple | json | json representation of the resulting (the one that has been applied) row | -| local_tuple_xmin | xid | transaction which produced the local conflicting row (if `local_tuple` is set and the row is not frozen) | -| local_tuple_node_id | oid | node which produced the local conflicting row (if `local_tuple` is set and the row is not frozen) | -| local_tuple_commit_time | timestamp with time zone | last known change timestamp of the local conflicting row (if `local_tuple` is set and the row is not frozen) | +| sub_id | oid | Which subscription produced this conflict; can be joined to `bdr.subscription` table | +| local_xid | xid | Local transaction of the replication process at the time of conflict | +| local_lsn | pg_lsn | Local transaction of the replication process at the time of conflict | +| local_time | timestamp with time zone | Local time of the conflict | +| remote_xid | xid | Transaction that produced the conflicting change on the remote node (an origin) | +| remote_commit_lsn | pg_lsn | Commit LSN of the transaction which produced the conflicting change on the remote node (an origin) | +| remote_commit_time | timestamp with time zone | Commit timestamp of the transaction that produced the conflicting change on the remote node (an origin) | +| conflict_type | text | Detected type of the conflict | +| conflict_resolution | text | Conflict resolution chosen | +| conflict_index | regclass | Conflicting index (valid only if the index wasn't dropped since) | +| reloid | oid | Conflicting relation (valid only if the index wasn't dropped since) | +| nspname | text | Name of the schema for the relation on which the conflict has occurred at the time of conflict (doesn't follow renames) | 
+| relname | text | Name of the relation on which the conflict has occurred at the time of conflict (does not follow renames) | +| key_tuple | json | Json representation of the key used for matching the row | +| remote_tuple | json | Json representation of an incoming conflicting row | +| local_tuple | json | Json representation of the local conflicting row | +| apply_tuple | json | Json representation of the resulting (the one that has been applied) row | +| local_tuple_xmin | xid | Transaction that produced the local conflicting row (if `local_tuple` is set and the row isn't frozen) | +| local_tuple_node_id | oid | Node that produced the local conflicting row (if `local_tuple` is set and the row isn't frozen) | +| local_tuple_commit_time | timestamp with time zone | Last known change timestamp of the local conflicting row (if `local_tuple` is set and the row isn't frozen) | ### `bdr.conflict_history_summary` A view containing user-readable details on row conflict. -#### `bdr.conflict_history_summary` Columns +#### `bdr.conflict_history_summary` columns | Name | Type | Description | | ----------------------- | ------------------------ | -------------------------- | | nspname | text | Name of the schema | | relname | text | Name of the table | -| local_time | timestamp with time zone | local time of the conflict | +| local_time | timestamp with time zone | Local time of the conflict | | local_tuple_commit_time | timestamp with time zone | Time of local commit | | remote_commit_time | timestamp with time zone | Time of remote commit | | conflict_type | text | Type of conflict | @@ -63,7 +61,7 @@ A view containing user-readable details on row conflict. ### `bdr.consensus_kv_data` -A persistent storage for the internal Raft based KV store used by +A persistent storage for the internal Raft-based KV store used by `bdr.consensus_kv_store()` and `bdr.consensus_kv_fetch()` interfaces. 
#### `bdr.consensus_kv_data` Columns @@ -79,19 +77,19 @@ A persistent storage for the internal Raft based KV store used by ### `bdr.camo_decision_journal` A persistent journal of decisions resolved by a CAMO partner node -after a failover, in case `bdr.logical_transaction_status` got -invoked. Unlike `bdr.node_pre_commit`, this does not cover -transactions processed under normal operational conditions (i.e. both +after a failover, in case `bdr.logical_transaction_status` was +invoked. Unlike `bdr.node_pre_commit`, this doesn't cover +transactions processed under normal operational conditions (i.e., both nodes of a CAMO pair are running and connected). Entries in this journal -are not ever cleaned up automatically. This is a purely diagnostic -tool that the system does not depend on in any way. +aren't ever cleaned up automatically. This is a diagnostic +tool that the system doesn't depend on. -#### `bdr.camo_decision_journal` Columns +#### `bdr.camo_decision_journal` columns | Name | Type | Description | | -------------- | ----------- | ---------------------------------------------- | | origin_node_id | oid | OID of the node where the transaction executed | -| origin_xid | oid | Transaction Id on the remote origin node | +| origin_xid | oid | Transaction ID on the remote origin node | | decision | char | 'c' for commit, 'a' for abort | | decision_ts | timestamptz | Decision time | @@ -103,20 +101,20 @@ This table lists merge ("handlers") functions for all CRDT data types. | Name | Type | Description | | ------------- | ------- | --------------------------------- | -| crdt_type_id | regtype | CRDT data type id | +| crdt_type_id | regtype | CRDT data type ID | | crdt_merge_id | regproc | Merge function for this data type | ### `bdr.ddl_replication` This view lists DDL replication configuration as set up by current [DDL filters](repsets#ddl-replication-filtering). 
-#### `bdr.ddl_replication` Columns +#### `bdr.ddl_replication` columns | Name | Type | Description | | ------------ | ---- | ------------------------------------------------------------ | | set_ddl_name | name | Name of DDL filter | -| set_ddl_tag | text | Which command tags it applies on (regular expression) | -| set_ddl_role | text | Which roles it applies to (regular expression) | +| set_ddl_tag | text | The command tags it applies on (regular expression) | +| set_ddl_role | text | The roles it applies to (regular expression) | | set_name | name | Name of the replication set for which this filter is defined | ### `bdr.depend` @@ -130,40 +128,40 @@ managing global consensus. As for the `bdr.global_consensus_response_journal` catalog, the payload is stored in a binary encoded format, which can be decoded -with the `bdr.decode_message_payload()` function; see the +with the `bdr.decode_message_payload()` function. See the [`bdr.global_consensus_journal_details`] view for more details. -#### `bdr.global_consensus_journal` Columns +#### `bdr.global_consensus_journal` columns | Name | Type | Description | | ------------- | ----- | --------------------------------------- | -| log_index | int8 | Id of the journal entry | +| log_index | int8 | ID of the journal entry | | term | int8 | Raft term | -| origin | oid | Id of node where the request originated | -| req_id | int8 | Id for the request | +| origin | oid | ID of node where the request originated | +| req_id | int8 | ID for the request | | req_payload | bytea | Payload for the request | | trace_context | bytea | Trace context for the request | ### `bdr.global_consensus_journal_details` -This view presents Raft messages that were sent, and the corresponding +This view presents Raft messages that were sent and the corresponding responses, using the `bdr.decode_message_payload()` function to decode their payloads. 
-#### `bdr.global_consensus_journal_details` Columns +#### `bdr.global_consensus_journal_details` columns | Name | Type | Description | | ------------------------ | ----- | --------------------------------------------- | -| log_index | int8 | Id of the journal entry | +| log_index | int8 | ID of the journal entry | | term | int8 | Raft term | -| request_id | int8 | Id of the request | -| origin_id | oid | Id of the node where the request originated | +| request_id | int8 | ID of the request | +| origin_id | oid | ID of the node where the request originated | | req_payload | bytea | Payload of the request | | origin_node_name | name | Name of the node where the request originated | -| message_type_no | oid | Id of the BDR message type for the request | +| message_type_no | oid | ID of the BDR message type for the request | | message_type | text | Name of the BDR message type for the request | | message_payload | text | BDR message payload for the request | -| response_message_type_no | oid | Id of the BDR message type for the response | +| response_message_type_no | oid | ID of the BDR message type for the response | | response_message_type | text | Name of the BDR message type for the response | | response_payload | text | BDR message payload for the response | | response_errcode_no | text | SQLSTATE for the response | @@ -177,14 +175,14 @@ that were received while managing global consensus. As for the `bdr.global_consensus_journal` catalog, the payload is stored in a binary-encoded format, which can be decoded with the -`bdr.decode_message_payload()` function; see the +`bdr.decode_message_payload()` function. See the [`bdr.global_consensus_journal_details`] view for more details. 
-#### `bdr.global_consensus_response_journal` Columns +#### `bdr.global_consensus_response_journal` columns | Name | Type | Description | | ------------- | ----- | ------------------------------ | -| log_index | int8 | Id of the journal entry | +| log_index | int8 | ID of the journal entry | | res_status | oid | Status code for the response | | res_payload | bytea | Payload for the response | | trace_context | bytea | Trace context for the response | @@ -194,13 +192,13 @@ stored in a binary-encoded format, which can be decoded with the This catalog table stores the information needed for recovering the global lock state on server restart. -For monitoring usage, operators should prefer the -[`bdr.global_locks`](#bdrglobal_locks) view, because the visible rows -in `bdr.global_lock` do not necessarily reflect all global locking activity. +For monitoring usage, the +[`bdr.global_locks`](#bdrglobal_locks) view is preferable because the visible rows +in `bdr.global_lock` don't necessarily reflect all global locking activity. -Do not modify the contents of this table: it is an important BDR catalog. +Don't modify the contents of this table. It is an important BDR catalog. -#### `bdr.global_lock` Columns +#### `bdr.global_lock` columns | Name | Type | Description | | -------------- | ------- | ---------------------------------------------------------------------------- | @@ -210,20 +208,20 @@ Do not modify the contents of this table: it is an important BDR catalog. 
| nspname | name | Schema name for the locked relation | | relname | name | Relation name for the locked relation | | groupid | oid | OID of the top level group (for Advisory locks) | -| key1 | integer | First 32-bit key or lower order 32-bits of 64-bit key (for Advisory locks) | -| key2 | integer | Second 32-bit key or higher order 32-bits of 64-bit key (for Advisory locks) | -| key_is_bigint | boolean | True if 64-bit integer key is used (for Advisory locks) | +| key1 | integer | First 32-bit key or lower order 32-bits of 64-bit key (for advisory locks) | +| key2 | integer | Second 32-bit key or higher order 32-bits of 64-bit key (for advisory locks) | +| key_is_bigint | boolean | True if 64-bit integer key is used (for advisory locks) | ### `bdr.global_locks` A view containing active global locks on this node. The `bdr.global_locks` view -exposes BDR's shared-memory lock state tracking, giving administrators a greater +exposes BDR's shared-memory lock state tracking, giving administrators greater insight into BDR's global locking activity and progress. -See [Monitoring Global Locks](monitoring#Monitoring-Global-Locks) +See [Monitoring global locks](monitoring#monitoring-global-locks) for more information about global locking. -#### `bdr.global_locks` Columns +#### `bdr.global_locks` columns | Name | Type | Description | | -------------------------- | ----------- | ----------------------------------------------------------------- | @@ -235,7 +233,7 @@ for more information about global locking. 
| `acquire_stage` | text | Internal state of the lock acquisition process | | `waiters` | int4 | List of backends waiting for the same global lock | | `global_lock_request_time` | timestamptz | Time this global lock acquire was initiated by origin node | -| `local_lock_request_time` | timestamptz | Time the local node started trying to acquire the local-lock | +| `local_lock_request_time` | timestamptz | Time the local node started trying to acquire the local lock | | `last_state_change_time` | timestamptz | Time `acquire_stage` last changed | Column details: @@ -246,26 +244,26 @@ Column details: - `origin_node_id` and `origin_node_name`: If these are the same as the local node's ID and name, then the local node is the initiator of the global DDL - lock, i.e. it is the node running the acquiring transaction. If these fields + lock, i.e., it is the node running the acquiring transaction. If these fields specify a different node, then the local node is instead trying to acquire its local DDL lock to satisfy a global DDL lock request from a remote node. - `pid`: The process ID of the process that requested the global DDL lock, - if the local node is the requesting node. Null on other nodes; query the + if the local node is the requesting node. Null on other nodes. Query the origin node to determine the locker pid. - `global_lock_request_time`: The timestamp at which the global-lock request - initiator started the process of acquiring a global lock. May be null if - unknown on the current node. This time is stamped at the very beginning - of the DDL lock request, and includes the time taken for DDL epoch management + initiator started the process of acquiring a global lock. Can be null if + unknown on the current node. This time is stamped at the beginning + of the DDL lock request and includes the time taken for DDL epoch management and any required flushes of pending-replication queues. Currently only known on origin node. 
- `local_lock_request_time`: The timestamp at which the local node started trying to acquire the local lock for this global lock. This includes the - time taken for the heavyweight session lock acquire, but does NOT include - any time taken on DDL epochs or queue flushing. If the lock is re-acquired - after local node restart, this will be the node restart time. + time taken for the heavyweight session lock acquire but doesn't include + any time taken on DDL epochs or queue flushing. If the lock is reacquired + after local node restart, it becomes the node restart time. - `last_state_change_time`: The timestamp at which the `bdr.global_locks.acquire_stage` field last changed for this global lock @@ -276,11 +274,11 @@ Column details: This catalog table contains consensus snapshots created or received by the local node. -#### `bdr.local_consensus_snapshot` Columns +#### `bdr.local_consensus_snapshot` columns | Name | Type | Description | | --------- | ----- | ----------------------- | -| log_index | int8 | Id of the journal entry | +| log_index | int8 | ID of the journal entry | | log_term | int8 | Raft term | | snapshot | bytea | Raft snapshot data | @@ -288,11 +286,11 @@ the local node. This catalog table stores the current state of Raft on the local node. -#### `bdr.local_consensus_state` Columns +#### `bdr.local_consensus_state` columns | Name | Type | Description | | ----------------- | ---- | ----------------------------------- | -| node_id | oid | Id of the node | +| node_id | oid | ID of the node | | current_term | int8 | Raft term | | apply_index | int8 | Raft apply index | | voted_for | oid | Vote cast by this node in this term | @@ -300,13 +298,13 @@ This catalog table stores the current state of Raft on the local node. ### `bdr.local_node` -This table identifies the local node in current database of current Postgres instance. +This table identifies the local node in the current database of the current Postgres instance. 
-#### `bdr.local_node` Columns +#### `bdr.local_node` columns | Name | Type | Description | | ----------- | ------- | --------------------------- | -| node_id | oid | Id of the node | +| node_id | oid | ID of the node | | pub_repsets | text\[] | Published replication sets | | sub_repsets | text\[] | Subscribed replication sets | @@ -319,23 +317,23 @@ local node. Information about status of either subscription or table synchronization process. -#### `bdr.local_sync_status` Columns +#### `bdr.local_sync_status` columns | Name | Type | Description | | ----------------- | ------ | -------------------------------------------------------- | -| sync_kind | char | What kind of synchronization is/was done | -| sync_subid | oid | Id of subscription doing the synchronization | +| sync_kind | char | The kind of synchronization done | +| sync_subid | oid | ID of subscription doing the synchronization | | sync_nspname | name | Schema name of the synchronized table (if any) | | sync_relname | name | Name of the synchronized table (if any) | | sync_status | char | Current state of the synchronization | -| sync_remote_relid | oid | Id of the synchronized table (if any) on the upstream | +| sync_remote_relid | oid | ID of the synchronized table (if any) on the upstream | | sync_end_lsn | pg_lsn | Position at which the synchronization state last changed | ### `bdr.network_path_info` A catalog view that stores user-defined information on network costs between node locations. -#### `bdr.network_path_info` Columns +#### `bdr.network_path_info` columns | Name | Type | Description | | --------------- | ------- | ------------------------------------------ | @@ -350,14 +348,14 @@ A catalog view that stores user-defined information on network costs between nod This table lists all the BDR nodes in the cluster. 
-#### `bdr.node` Columns +#### `bdr.node` columns | Name | Type | Description | | --------------------- | ------ | --------------------------------------------------------------------------- | -| node_id | oid | Id of the node | +| node_id | oid | ID of the node | | node_name | name | Name of the node | -| node_group_id | oid | Id of the node group | -| source_node_id | oid | Id of the source node | +| node_group_id | oid | ID of the node group | +| source_node_id | oid | ID of the source node | | synchronize_structure | "char" | Schema synchronization done during the join | | node_state | oid | Consistent state of the node | | target_state | oid | State that the node is trying to reach (during join or promotion) | @@ -368,30 +366,30 @@ This table lists all the BDR nodes in the cluster. ### `bdr.node_catchup_info` -This catalog table records relevant catch-up information on each node, either +This catalog table records relevant catchup information on each node, either if it is related to the join or part procedure. -#### `bdr.node_catchup_info` Columns +#### `bdr.node_catchup_info` columns | Name | Type | Description | | -------------- | ------ | -------------------------------------------------------------------------- | -| node_id | oid | Id of the node | -| node_source_id | oid | Id of the node used as source for the data | +| node_id | oid | ID of the node | +| node_source_id | oid | ID of the node used as source for the data | | slot_name | name | Slot used for this source | | min_node_lsn | pg_lsn | Minimum LSN at which the node can switch to direct replay from a peer node | | catchup_state | oid | Status code of the catchup state | -| origin_node_id | oid | Id of the node from which we want transactions | +| origin_node_id | oid | ID of the node from which we want transactions | If a node(node_id) needs missing data from a parting node(origin_node_id), -it can get it from a node that already has it(node_source_id) via forwarding. 
-The records in this table will persist until the node(node_id) is a member of +it can get it from a node that already has it(node_source_id) by forwarding. +The records in this table persist until the node(node_id) is a member of the BDR cluster. ### `bdr.node_conflict_resolvers` Currently configured conflict resolution for all known conflict types. -#### `bdr.node_conflict_resolvers` Columns +#### `bdr.node_conflict_resolvers` columns | Name | Type | Description | | ----------------- | ---- | ------------------------------------ | @@ -402,7 +400,7 @@ Currently configured conflict resolution for all known conflict types. This catalog table lists all the BDR node groups. -#### `bdr.node_group` Columns +#### `bdr.node_group` columns | Name | Type | Description | | ----------------------------- | -------- | --------------------------------------------------------------------------------------------- | @@ -414,7 +412,7 @@ This catalog table lists all the BDR node groups. | node_group_flags | int | The group flags | | node_group_uuid | uuid | The uuid of the group | | node_group_apply_delay | interval | How long a subscriber waits before applying changes from the provider | -| node_group_check_constraints | bool | Whether the apply process should check constraints when applying data | +| node_group_check_constraints | bool | Whether the apply process checks constraints when applying data | | node_group_num_writers | int | Number of writers to use for subscriptions backing this node group | | node_group_enable_wal_decoder | bool | Whether the group has enable_wal_decoder set | | node_group_streaming_mode | char | Transaction streaming setting: 'O' - off, 'F' - file, 'W' - writer, 'A' - auto, 'D' - default | @@ -424,7 +422,7 @@ This catalog table lists all the BDR node groups. A view showing default replication sets create for BDR groups. See also `bdr.replication_sets`. 
-#### `bdr.node_group_replication_sets` Columns +#### `bdr.node_group_replication_sets` columns | Name | Type | Description | | ------------------ | ------- | ------------------------------------------------------------------------------------ | @@ -439,12 +437,12 @@ A view showing default replication sets create for BDR groups. See also A catalog table used to store per-node configuration that's specific to the local node (as opposed to global view of per-node configuration). -#### `bdr.node_local_info` Columns +#### `bdr.node_local_info` columns | Name | Type | Description | | ------------- | ---- | ----------------------------------------------------------------------- | | node_id | oid | The OID of the node (including the local node) | -| applied_state | oid | Internal id of the node state | +| applied_state | oid | Internal ID of the node state | | ddl_epoch | int8 | Last epoch number processed by the node | | slot_name | name | Name of the slot used to connect to that node (NULL for the local node) | @@ -457,39 +455,39 @@ A catalog view that stores user-defined information on node locations. | Name | Type | Description | | --------------- | ---- | --------------------------- | | node_group_name | name | Name of the BDR group | -| node_id | oid | Id of the node | -| node_region | text | User supplied region name | -| node_location | text | User supplied location name | +| node_id | oid | ID of the node | +| node_region | text | User-supplied region name | +| node_location | text | User-supplied location name | ### `bdr.node_log_config` A catalog view that stores information on the conflict logging configurations. 
-#### `bdr.node_log_config` Columns +#### `bdr.node_log_config` columns | Name | Description | | ----------------- | --------------------------------------------------------- | -| log_name | name of the logging configuration | -| log_to_file | whether it logs to the server log file | -| log_to_table | whether it logs to a table, and which table is the target | -| log_conflict_type | which conflict types it logs, if NULL means all | -| log_conflict_res | which conflict resolutions it logs, if NULL means all | +| log_name | Name of the logging configuration | +| log_to_file | Whether it logs to the server log file | +| log_to_table | Whether it logs to a table, and which table is the target | +| log_conflict_type | Which conflict types it logs, if NULL means all | +| log_conflict_res | Which conflict resolutions it logs, if NULL means all | ### `bdr.node_peer_progress` Catalog used to keep track of every node's progress in the replication stream. Every node in the cluster regularly broadcasts its progress every `bdr.replay_progress_frequency` milliseconds to all other nodes (default -is 60000 ms - i.e 1 minute). Expect N \* (N-1) rows in this relation. +is 60000 ms, i.e., 1 minute). Expect N \* (N-1) rows in this relation. -You may be more interested in the `bdr.node_slots` view for monitoring +You might be more interested in the `bdr.node_slots` view for monitoring purposes. See also [Monitoring](monitoring). 
-#### `bdr.node_peer_progress` Columns +#### `bdr.node_peer_progress` columns | Name | Type | Description | | ----------------------- | ----------- | ------------------------------------------------------------------------------------ | -| node_id | oid | The OID of the originating node which reported this position info | +| node_id | oid | The OID of the originating node that reported this position info | | peer_node_id | oid | The OID of the node's peer (remote node) for which this position info was reported | | last_update_sent_time | timestamptz | The time at which the report was sent by the originating node | | last_update_recv_time | timestamptz | The time at which the report was received by the local server | @@ -502,26 +500,26 @@ purposes. See also [Monitoring](monitoring). ### `bdr.node_pre_commit` Used internally on a node configured as a Commit At Most Once (CAMO) -partner. Shows the decisions a CAMO partner took on transactions in +partner. Shows the decisions a CAMO partner took on transactions in the last 15 minutes. -#### `bdr.node_pre_commit` Columns +#### `bdr.node_pre_commit` columns | Name | Type | Description | | -------------- | ----------- | ---------------------------------------------- | | origin_node_id | oid | OID of the node where the transaction executed | -| origin_xid | oid | Transaction Id on the remote origin node | +| origin_xid | oid | Transaction ID on the remote origin node | | decision | char | 'c' for commit, 'a' for abort | -| local_xid | xid | Transaction Id on the local node | -| commit_ts | timestamptz | commit timestamp of the transaction | -| decision_ts | timestamptz | decision time | +| local_xid | xid | Transaction ID on the local node | +| commit_ts | timestamptz | Commit timestamp of the transaction | +| decision_ts | timestamptz | Decision time | ### `bdr.node_replication_rates` This view contains information about outgoing replication activity from a -given node +given node. 
-#### `bdr.node_replication_rates` Columns +#### `bdr.node_replication_rates` columns | Column | Type | Description | | ---------------- | -------- | ---------------------------------------------------------------------------------------------------- | @@ -533,11 +531,11 @@ given node | replay_lag_bytes | int8 | Bytes difference between replay_lsn and current WAL write position on origin | | replay_lag_size | text | Human-readable bytes difference between replay_lsn and current WAL write position | | apply_rate | bigint | LSNs being applied per second at the peer node | -| catchup_interval | interval | Approximate time required for the peer node to catchup to all the changes that are yet to be applied | +| catchup_interval | interval | Approximate time required for the peer node to catch up to all the changes that are yet to be applied | !!! Note - The `replay_lag` is set immediately to zero after reconnect; we suggest - as a workaround to use `replay_lag_bytes`, `replay_lag_size` or + The `replay_lag` is set immediately to zero after reconnect. + As a workaround, use `replay_lag_bytes`, `replay_lag_size`, or `catchup_interval`. ### `bdr.node_slots` @@ -545,10 +543,10 @@ given node This view contains information about replication slots used in the current database by BDR. -See [Monitoring Outgoing Replication](monitoring#Monitoring-Outgoing-Replication) +See [Monitoring outgoing replication](monitoring#monitoring-outgoing-replication) for guidance on the use and interpretation of this view's fields. -#### `bdr.node_slots` Columns +#### `bdr.node_slots` columns | Name | Type | Description | | ------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------- | @@ -560,7 +558,7 @@ for guidance on the use and interpretation of this view's fields. 
| origin_id | oid | The OID of the origin node | | target_id | oid | The OID of the target node | | local_slot_name | name | Name of the replication slot according to BDR | -| slot_name | name | Name of the slot according to Postgres (should be same as above) | +| slot_name | name | Name of the slot according to Postgres (same as above) | | is_group_slot | boolean | True if the slot is the node-group crash recovery slot for this node (see ["Group Replication Slot"]\(nodes.md#Group Replication Slot)) | | is_decoder_slot | boolean | Is this slot used by Decoding Worker | | plugin | name | Logical decoding plugin using this slot (should be pglogical_output or bdr) | @@ -599,15 +597,15 @@ for guidance on the use and interpretation of this view's fields. | replay_lag_size | text | Human-readable bytes difference between replay_lsn and current WAL write position | !!! Note - The `replay_lag` is set immediately to zero after reconnect; we suggest - as a workaround to use `replay_lag_bytes` or `replay_lag_size`. + The `replay_lag` is set immediately to zero after reconnect. + As a workaround, use `replay_lag_bytes` or `replay_lag_size`. ### `bdr.node_summary` This view contains summary information about all BDR nodes known to the local node. -#### `bdr.node_summary` Columns +#### `bdr.node_summary` columns | Name | Type | Description | | ---------------------- | ---- | --------------------------------------------------------------------------- | @@ -615,7 +613,7 @@ node. 
| node_group_name | name | Name of the BDR group the node is part of | | interface_connstr | text | Connection string to the node | | peer_state_name | text | Consistent state of the node in human readable form | -| peer_target_state_name | text | State which the node is trying to reach (during join or promotion) | +| peer_target_state_name | text | State that the node is trying to reach (during join or promotion) | | node_seq_id | int4 | Sequence identifier of the node used for generating unique sequence numbers | | node_local_dbname | name | Database name of the node | | set_repl_ops | text | Which operations does the default replication set replicate | @@ -624,9 +622,9 @@ node. ### `bdr.queue` -This table stores historical record of replicated DDL statements. +This table stores the historical record of replicated DDL statements. -#### `bdr.queue` Columns +#### `bdr.queue` columns | Name | Type | Description | | ---------------- | ----------- | -------------------------------------------------------------- | @@ -638,42 +636,42 @@ This table stores historical record of replicated DDL statements. ### `bdr.replication_set` -A table that stores replication set configuration. It's recommended to check the -`bdr.replication_sets` view instead for user queries. +A table that stores replication set configuration. For user queries, we recommend instead checking the +`bdr.replication_sets` view. 
-#### `bdr.replication_set` Columns +#### `bdr.replication_set` columns | Name | Type | Description | | ------------------ | ------- | ------------------------------------------------------------------------------ | | set_id | oid | The OID of the replication set | -| set_nodeid | oid | Oid of the node (always local node oid currently) | +| set_nodeid | oid | OID of the node (always local node OID currently) | | set_name | name | Name of the replication set | | replicate_insert | boolean | Indicates if the replication set replicates INSERTs | | replicate_update | boolean | Indicates if the replication set replicates UPDATEs | | replicate_delete | boolean | Indicates if the replication set replicates DELETEs | | replicate_truncate | boolean | Indicates if the replication set replicates TRUNCATEs | | set_isinternal | boolean | Reserved | -| set_autoadd_tables | boolean | Indicates if new tables will be automatically added to this replication set | -| set_autoadd_seqs | boolean | Indicates if new sequences will be automatically added to this replication set | +| set_autoadd_tables | boolean | Indicates if new tables are automatically added to this replication set | +| set_autoadd_seqs | boolean | Indicates if new sequences are automatically added to this replication set | ### `bdr.replication_set_table` -A table that stores replication set table membership. It's recommended to check -the `bdr.tables` view instead for user queries. +A table that stores replication set table membership. For user queries, we recommend instead checking +the `bdr.tables` view. 
-#### `bdr.replication_set_table` Columns +#### `bdr.replication_set_table` columns | Name | Type | Description | | -------------- | ------------ | --------------------------------- | | set_id | oid | The OID of the replication set | -| set_reloid | regclass | Local id of the table | +| set_reloid | regclass | Local ID of the table | | set_att_list | text\[] | Reserved | | set_row_filter | pg_node_tree | Compiled row filtering expression | ### `bdr.replication_set_ddl` -A table that stores replication set ddl replication filters. It's recommended -to check the `bdr.ddl_replication` view instead for user queries. +A table that stores replication set DDL replication filters. For user queries, we recommend +instead checking the `bdr.ddl_replication` view. #### `bdr.replication_set_ddl` Columns @@ -686,10 +684,10 @@ to check the `bdr.ddl_replication` view instead for user queries. ### `bdr.replication_sets` -A view showing replication sets defined in the BDR group, even if they are not +A view showing replication sets defined in the BDR group, even if they aren't currently used by any node. -#### `bdr.replication_sets` Columns +#### `bdr.replication_sets` columns | Name | Type | Description | | ------------------ | ------- | ------------------------------------------------------------------------------ | @@ -699,14 +697,14 @@ currently used by any node. 
| replicate_update | boolean | Indicates if the replication set replicates UPDATEs | | replicate_delete | boolean | Indicates if the replication set replicates DELETEs | | replicate_truncate | boolean | Indicates if the replication set replicates TRUNCATEs | -| set_autoadd_tables | boolean | Indicates if new tables will be automatically added to this replication set | -| set_autoadd_seqs | boolean | Indicates if new sequences will be automatically added to this replication set | +| set_autoadd_tables | boolean | Indicates if new tables are automatically added to this replication set | +| set_autoadd_seqs | boolean | Indicates if new sequences are automatically added to this replication set | ### `bdr.schema_changes` -A simple view to show all the changes to schemas within BDR. +A simple view to show all the changes to schemas in BDR. -#### `bdr.schema_changes` Columns +#### `bdr.schema_changes` columns | Name | Type | Description | | ------------------------ | ------------ | ------------------------- | @@ -722,7 +720,7 @@ A simple view to show all the changes to schemas within BDR. A view to see the allocation details for galloc sequences. -#### `bdr.sequence_alloc` Columns +#### `bdr.sequence_alloc` columns | Name | Type | Description | | ------------------- | ----------- | ------------------------------------------------ | @@ -734,9 +732,9 @@ A view to see the allocation details for galloc sequences. ### `bdr.schema_changes` -A simple view to show all the changes to schemas within BDR. +A simple view to show all the changes to schemas in BDR. -#### `bdr.schema_changes` Columns +#### `bdr.schema_changes` columns | Name | Type | Description | | ------------------------ | ------------ | ------------------------- | @@ -752,7 +750,7 @@ A simple view to show all the changes to schemas within BDR. A view to see the sequences allocated. 
-#### `bdr.sequence_alloc` Columns +#### `bdr.sequence_alloc` columns | Name | Type | Description | | ------------------- | ----------- | ------------------------------------------------ | @@ -765,9 +763,9 @@ A view to see the sequences allocated. ### `bdr.sequences` This view lists all sequences with their kind, excluding sequences -for internal BDR book-keeping. +for internal BDR bookkeeping. -#### `bdr.sequences` Columns +#### `bdr.sequences` columns | Name | Type | Description | | ------- | ---- | ----------------------------------------------------- | @@ -779,21 +777,21 @@ for internal BDR book-keeping. Dynamic activity for each backend or worker process. -This contains the same information as pg_stat_activity, except wait_event +This contains the same information as `pg_stat_activity`, except `wait_event` is set correctly when the wait relates to BDR. ### `bdr.stat_relation` -Apply statistics for each relation. Only contains data if the tracking +Apply statistics for each relation. Contains data only if the tracking is enabled and something was replicated for a given relation. -#### `bdr.stat_relation` Columns +#### `bdr.stat_relation` columns | Column | Type | Description | | ------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------- | | nspname | name | Name of the relation's schema | | relname | name | Name of the relation | -| relid | oid | Oid of the relation | +| relid | oid | OID of the relation | | total_time | double precision | Total time spent processing replication for the relation | | ninsert | bigint | Number of inserts replicated for the relation | | nupdate | bigint | Number of updates replicated for the relation | @@ -809,15 +807,15 @@ is enabled and something was replicated for a given relation. ### `bdr.stat_subscription` -Apply statistics for each subscription. 
Only contains data if the tracking +Apply statistics for each subscription. Contains data only if the tracking is enabled. -#### `bdr.stat_subscription` Columns +#### `bdr.stat_subscription` columns | Column | Type | Description | | -------------------- | ------------------------ | ------------------------------------------------------------------------------------------------------------------- | | sub_name | name | Name of the subscription | -| subid | oid | Oid of the subscription | +| subid | oid | OID of the subscription | | nconnect | bigint | Number of times this subscription has connected upstream | | ncommit | bigint | Number of commits this subscription did | | nabort | bigint | Number of aborts writer did for this subscription | @@ -851,18 +849,18 @@ is enabled. ### `bdr.subscription` This catalog table lists all the subscriptions owned by the local BDR -node, and which mode they are in. +node and their modes. -#### `bdr.subscription` Columns +#### `bdr.subscription` columns | Name | Type | Description | | ----------------- | -------- | -------------------------------------------------------------------------------- | -| sub_id | oid | Id of the subscription | +| sub_id | oid | ID of the subscription | | sub_name | name | Name of the subscription | -| nodegroup_id | oid | Id of nodegroup | -| origin_node_id | oid | Id of origin node | -| source_node_id | oid | Id of source node | -| target_node_id | oid | Id of target node | +| nodegroup_id | oid | ID of nodegroup | +| origin_node_id | oid | ID of origin node | +| source_node_id | oid | ID of source node | +| target_node_id | oid | ID of target node | | subscription_mode | char | Mode of subscription | | sub_enabled | bool | Whether the subscription is enabled (should be replication) | | apply_delay | interval | How much behind should the apply of changes on this subscription be (normally 0) | @@ -878,7 +876,7 @@ node, and which mode they are in. 
This view contains summary information about all BDR subscriptions that the local node has to other nodes. -#### `bdr.subscription_summary` Columns +#### `bdr.subscription_summary` columns | Name | Type | Description | | -------------------------- | ----------- | ---------------------------------------------------------------------------------------- | @@ -908,18 +906,18 @@ local node has to other nodes. This view shows incoming replication status between the local node and all other nodes in the BDR cluster. We consider replication to be -blocked when the subscription has restarted from the same LSN at least +blocked when the subscription restarted from the same LSN at least twice and not a single transaction is yet applied after the current -upstream connection was established. If the very first transaction after +upstream connection was established. If the first transaction after restart is very big and still being applied, the `replication_blocked` -result maybe wrong. +result might be wrong. If this is a logical standby node, then only the status for its upstream -node is shown. Similarly, replication status is not shown for +node is shown. Similarly, replication status isn't shown for subscriber-only nodes since they never send replication changes to other nodes. -#### `bdr.replication_status` Columns +#### `bdr.replication_status` columns | Column | Type | Description | | ------------------- | ------------------------ | --------------------------------------------------------------- | @@ -938,10 +936,10 @@ nodes. ### `bdr.tables` This view lists information about table membership in replication sets. -If a table exists in multiple replication sets, it will appear multiple times +If a table exists in multiple replication sets, it appears multiple times in this table. 
-#### `bdr.tables` Columns +#### `bdr.tables` columns | Name | Type | Description | | ------------------ | ------- | --------------------------------------------------------------------------------- | @@ -958,10 +956,10 @@ in this table. ### `bdr.trigger` -Within this view, you can see all the stream triggers created. +In this view, you can see all the stream triggers created. Often triggers here are created from `bdr.create_conflict_trigger`. -#### `bdr.trigger` Columns +#### `bdr.trigger` columns | Name | Type | Description | | -------------- | -------- | ----------------------------- | @@ -973,7 +971,7 @@ Often triggers here are created from `bdr.create_conflict_trigger`. ### `bdr.triggers` -An expanded view of `bdr.trigger` with more easy to read columns. +An expanded view of `bdr.trigger` with columns that are easier to read. | Name | Type | Description | | ------------------ | ------------------ | ----------------------- | @@ -994,10 +992,10 @@ into the state of BDR workers. | Name | Type | Description | | ----------------------- | ----------- | --------------------------------------------------------- | -| worker_pid | int | Process Id of the worker process | +| worker_pid | int | Process ID of the worker process | | worker_role | int | Numeric representation of worker role | | worker_role_name | text | Name of the worker role | -| worker_subid | oid | Subscription Id if the worker is associated with one | +| worker_subid | oid | Subscription ID if the worker is associated with one | | worker_commit_timestamp | timestamptz | Last commit timestamp processed by this worker if any | | worker_local_timestamp | timestamptz | Local time at which the above commit was processed if any | @@ -1016,24 +1014,24 @@ A persistent log of errors from BDR background worker processes. 
| sub_name | name | Name of the subscription | | worker_role | int4 | Internal identifier of the role of this worker (1: manager, 2: receive, 3: writer, 4: output, 5: extension) | | worker_role_name | text | Role name | -| worker_pid | int4 | Process id of the worker causing the error | +| worker_pid | int4 | Process ID of the worker causing the error | | error_time | timestamptz | Date and time of the error | | error_age | interval | Duration since error | | error_message | text | Description of the error | | error_context_message | text | Context in which the error happened | -| remoterelid | oid | Oid of remote relation on that node | +| remoterelid | oid | OID of remote relation on that node | ### `bdr.writers` Specific information about BDR writer processes. -#### `bdr.writers` Columns +#### `bdr.writers` columns | Name | Type | Description | | --------------------- | ----------- | ------------------------------------------------------------------ | | sub_name | name | Name of the subscription | -| pid | int | Process Id of the worker process | -| syncing_rel | int | Oid of the relation being synchronized (if any) | +| pid | int | Process ID of the worker process | +| syncing_rel | int | OID of the relation being synchronized (if any) | | streaming_allowed | text | Can this writer be target of direct to writer streaming | | is_streaming | bool | Is there transaction being streamed to this writer | | remote_xid | xid | Remote transaction id of the transaction being processed (if any) | @@ -1052,13 +1050,13 @@ The `bdr.worker_tasks` view shows BDR's current worker launch rate limiting state as well as some basic statistics on background worker launch and registration activity. -Unlike the other views listed here, it is not specific to the current database -and BDR node; state for all BDR nodes on the current PostgreSQL +Unlike the other views listed here, it isn't specific to the current database +and BDR node. 
State for all BDR nodes on the current PostgreSQL instance is shown. Join on the current database to filter it. -`bdr.worker_tasks` does not track walsenders and output plugins. +`bdr.worker_tasks` doesn't track walsenders and output plugins. -#### `bdr.worker_tasks` Columns +#### `bdr.worker_tasks` columns | Column | Type | Description | | --------------------------------- | ------------------------ | ------------------------------------------------------- | @@ -1072,7 +1070,7 @@ instance is shown. Join on the current database to filter it. | task_key_ext_funcname | name | Name of the function entry point | | task_key_ext_workername | name | Name assigned to the worker | | task_key_remoterelid | oid | Identifier of the remote syncing relation, if available | -| task_pid | integer | Process id of the worker | +| task_pid | integer | Process ID of the worker | | task_registered | timestamp with time zone | Worker registration timestamp | | since_registered | interval | Interval since the worker registered | | task_attached | timestamp with time zone | Worker attach timestamp | @@ -1082,13 +1080,13 @@ instance is shown. Join on the current database to filter it. | task_success | boolean | Is worker still running? | | task_next_launch_not_before | timestamp with time zone | Timestamp when the worker will be restarted again | | until_launch_allowed | interval | Time remaining for next launch | -| task_last_launch_requestor_pid | integer | Process id that requested launch | +| task_last_launch_requestor_pid | integer | Process ID that requested launch | | task_last_launch_request_time | timestamp with time zone | Timestamp when the request was made | | since_last_request | interval | Interval since the last request | | task_last_launch_request_approved | boolean | Did the last request succeed? 
| | task_nrequests | integer | Number of requests | | task_nregistrations | integer | Number of registrations | -| task_prev_pid | integer | Process id of the previous generation | +| task_prev_pid | integer | Process ID of the previous generation | | task_prev_registered | timestamp with time zone | Timestamp of the previous registered task | | since_prev_registered | interval | Interval since the previous registration | | task_prev_launched | timestamp with time zone | Timestamp of the previous launch | @@ -1103,29 +1101,29 @@ instance is shown. Join on the current database to filter it. Contains work items created and processed by autopartition worker. The work items are created on only one node and processed on different nodes. -#### `bdr.autopartition_work_queue` Columns +#### `bdr.autopartition_work_queue` columns | Column | Type | Description | | ------------------ | ------ | ------------------------------------------------------------------------------------------------------------------------ | -| ap_wq_workid | bigint | The Unique ID of the work item | +| ap_wq_workid | bigint | The unique ID of the work item | | ap_wq_ruleid | int | ID of the rule listed in autopartition_rules. Rules are specified using bdr.autopartition command | | ap_wq_relname | name | Name of the relation being autopartitioned | -| ap_wq_relnamespace | name | Name of the tablespace specified in rule for this work item. 
| +| ap_wq_relnamespace | name | Name of the tablespace specified in rule for this work item | | ap_wq_partname | name | Name of the partition created by the workitem | | ap_wq_work_kind | char | The work kind can be either 'c' (Create Partition), 'm' (Migrate Partition), 'd' (Drop Partition), 'a' (Alter Partition) | | ap_wq_work_sql | text | SQL query for the work item | -| ap_wq_work_depends | Oid\[] | Oids of the nodes on which the work item depends | +| ap_wq_work_depends | Oid\[] | OIDs of the nodes on which the work item depends | ### `bdr.autopartition_workitem_status` -The status of the work items which is updated locally on each node. +The status of the work items that is updated locally on each node. -#### `bdr.autopartition_workitem_status` Columns +#### `bdr.autopartition_workitem_status` columns | Column | Type | Description | | ----------------- | ----------- | ---------------------------------------------------------------------------------- | | ap_wi_workid | bigint | The ID of the work item | -| ap_wi_nodeid | Oid | Oid of the node on which the work item is being processed | +| ap_wi_nodeid | Oid | OID of the node on which the work item is being processed | | ap_wi_status | char | The status can be either 'q' (Queued), 'c' (Complete), 'f' (Failed), 'u' (Unknown) | | ap_wi_started_at | timestamptz | The start timestamptz of work item | | ap_wi_finished_at | timestamptz | The end timestamptz of work item | @@ -1133,15 +1131,15 @@ The status of the work items which is updated locally on each node. ### `bdr.autopartition_local_work_queue` Contains work items created and processed by autopartition worker. This is -similar to bdr.autopartition_work_queue, except that these work items are for +similar to `bdr.autopartition_work_queue`, except that these work items are for locally managed tables. Each node creates and processes its own local work items, independent of other nodes in the cluster. 
-#### `bdr.autopartition_local_work_queue` Columns +#### `bdr.autopartition_local_work_queue` columns | Column | Type | Description | | ------------------ | ------ | ------------------------------------------------------------------------------------------------------------------------ | -| ap_wq_workid | bigint | The Unique ID of the work item | +| ap_wq_workid | bigint | The unique ID of the work item | | ap_wq_ruleid | int | ID of the rule listed in autopartition_rules. Rules are specified using bdr.autopartition command | | ap_wq_relname | name | Name of the relation being autopartitioned | | ap_wq_relnamespace | name | Name of the tablespace specified in rule for this work item. | @@ -1154,12 +1152,12 @@ items, independent of other nodes in the cluster. The status of the work items for locally managed tables. -#### `bdr.autopartition_local_workitem_status` Columns +#### `bdr.autopartition_local_workitem_status` columns | Column | Type | Description | | ----------------- | ----------- | ---------------------------------------------------------------------------------- | | ap_wi_workid | bigint | The ID of the work item | -| ap_wi_nodeid | Oid | Oid of the node on which the work item is being processed | +| ap_wi_nodeid | Oid | OID of the node on which the work item is being processed | | ap_wi_status | char | The status can be either 'q' (Queued), 'c' (Complete), 'f' (Failed), 'u' (Unknown) | | ap_wi_started_at | timestamptz | The start timestamptz of work item | | ap_wi_finished_at | timestamptz | The end timestamptz of work item | @@ -1168,11 +1166,11 @@ The status of the work items for locally managed tables. Uses `bdr.run_on_all_nodes` to gather CAMO-related information from all nodes. 
-#### `bdr.group_camo_details` Columns +#### `bdr.group_camo_details` columns | Name | Type | Description | | -------------------------- | ---- | ----------------------------------------------------------------------------------- | -| node_id | text | Internal node id | +| node_id | text | Internal node ID | | node_name | text | Name of the node | | camo_partner | text | Node name of the camo partner | | is_camo_partner_connected | text | Connection status | @@ -1186,33 +1184,48 @@ Uses `bdr.run_on_all_nodes` to gather CAMO-related information from all nodes. Information regarding all the CAMO pairs configured in all the cluster. -#### `bdr.camo_pairs` Columns +#### `bdr.camo_pairs` columns | Name | Type | Description | | ------------- | ---- | ------------- | -| node_group_id | oid | Node group id | -| left_node_id | oid | Node id | -| right_node_id | oid | Node id | +| node_group_id | oid | Node group ID | +| left_node_id | oid | Node ID | +| right_node_id | oid | Node ID | +| require_raft | bool | Whether switching to local mode requires majority | !!! Note The names `left` and `right` have no special meaning. - BDR4 can only configure symmetric CAMO configuration, i.e. both nodes + BDR4 can configure only symmetric CAMO configuration, i.e., both nodes in the pair are CAMO partners for each other. +### `bdr.commit_scopes` + +Catalog storing all possible commit scopes that you can use for +`bdr.commit_scope` to enable group commit. 
+ +#### `bdr.commit_scopes` columns + +| Name | Type | Description | +|--------------------------------|------|----------------------------------------- | +| commit_scope_id | oid | ID of the scope to be referenced | +| commit_scope_name | name | Name of the scope to be referenced | +| commit_scope_origin_node_group | oid | Node group for which the rule applies, referenced by ID | +| sync_scope_rule | text | Definition of the scope | + ### `bdr.group_raft_details` Uses `bdr.run_on_all_nodes` to gather Raft Consensus status from all nodes. -#### `bdr.group_raft_details` Columns +#### `bdr.group_raft_details` columns | Name | Type | Description | | ---------------- | ---- | ------------------------------ | -| node_id | oid | Internal node id | +| node_id | oid | Internal node ID | | node_name | name | Name of the node | | state | text | Raft worker state on the node | | leader_id | oid | Node id of the RAFT_LEADER | -| current_term | int | Raft election internal id | -| commit_index | int | Raft snapshot internal id | +| current_term | int | Raft election internal ID | +| commit_index | int | Raft snapshot internal ID | | nodes | int | Number of nodes accessible | | voting_nodes | int | Number of nodes voting | | protocol_version | int | Protocol version for this node | @@ -1221,7 +1234,7 @@ Uses `bdr.run_on_all_nodes` to gather Raft Consensus status from all nodes. Uses `bdr.run_on_all_nodes` to gather BDR slot information from all nodes. -#### `bdr.group_replslots_details` Columns +#### `bdr.group_replslots_details` columns | Name | Type | Description | | --------------- | -------- | ------------------------------------------------------------------------------- | @@ -1243,7 +1256,7 @@ Uses `bdr.run_on_all_nodes` to gather BDR slot information from all nodes. Uses `bdr.run_on_all_nodes` to gather subscription status from all nodes. 
-#### `bdr.group_subscription_summary` Columns +#### `bdr.group_subscription_summary` columns | Name | Type | Description | | -------------------------- | ---- | ---------------------------------------------- | @@ -1256,49 +1269,49 @@ Uses `bdr.run_on_all_nodes` to gather subscription status from all nodes. Uses `bdr.run_on_all_nodes` to gather BDR information from all nodes. -#### `bdr.group_versions_details` Columns +#### `bdr.group_versions_details` columns | Name | Type | Description | | ---------------- | ---- | ------------------------------ | -| node_id | oid | Internal node id | +| node_id | oid | Internal node ID | | node_name | name | Name of the node | | postgres_version | text | PostgreSQL version on the node | | bdr_version | text | BDR version on the node | -## Internal Catalogs and Views +## Internal catalogs and views ### `bdr.ddl_epoch` An internal catalog table holding state per DDL epoch. -#### `bdr.ddl_epoch` Columns +#### `bdr.ddl_epoch` columns | Name | Type | Description | | --------------------- | ----------- | ------------------------------------------------------------------------ | | ddl_epoch | int8 | Monotonically increasing epoch number | -| origin_node_id | oid | Internal node id of the node that requested creation of this epoch | +| origin_node_id | oid | Internal node ID of the node that requested creation of this epoch | | epoch_consume_timeout | timestamptz | Timeout of this epoch | | epoch_consumed | boolean | Switches to true as soon as the local node has fully processed the epoch | ### `bdr.internal_node_pre_commit` -Internal catalog table; please use the `bdr.node_pre_commit` view. +Internal catalog table. Use the `bdr.node_pre_commit` view. ### `bdr.sequence_kind` -An internal state table storing the type of each non-local sequence. The view -`bdr.sequences` is recommended for diagnostic purposes. +An internal state table storing the type of each nonlocal sequence. 
We recommend the view +`bdr.sequences` for diagnostic purposes. -#### `bdr.sequence_kind` Columns +#### `bdr.sequence_kind` columns | Name | Type | Description | | ------- | ---- | ----------------------------------------------------------- | | seqid | oid | Internal OID of the sequence | -| seqkind | char | Internal sequence kind ('l'=local,'t'=timeshard,'g'=galloc) | +| seqkind | char | Internal sequence kind ('l'=local,'t'=timeshard,'s'=snowflakeid,'g'=galloc) | ### `bdr.state_journal` -An internal node state journal. Please use `bdr.state_journal_details` for +An internal node state journal. Use `bdr.state_journal_details` for diagnostic purposes instead. ### `bdr.state_journal_details` @@ -1307,17 +1320,17 @@ Every change of node state of each node is logged permanently in `bdr.state_jour for diagnostic purposes. This view provides node names and human-readable state names and carries all of the information in that journal. -Once a node has successfully joined, the last state entry will be +Once a node has successfully joined, the last state entry is `BDR_PEER_STATE_ACTIVE`. This differs from the state of each replication connection listed in `bdr.node_slots.state`. 
-#### `bdr.state_journal_details` Columns +#### `bdr.state_journal_details` columns | Name | Type | Description | | ------------- | ----------- | -------------------------------------------------------- | | state_counter | oid | Monotonically increasing event counter, per node | -| node_id | oid | Internal node id | +| node_id | oid | Internal node ID | | node_name | name | Name of the node | -| state | oid | Internal state id | +| state | oid | Internal state ID | | state_name | text | Human-readable state name | | entered_time | timestamptz | Point in time the current node observed the state change | diff --git a/product_docs/docs/bdr/4.0/column-level-conflicts.mdx b/product_docs/docs/bdr/4/column-level-conflicts.mdx similarity index 51% rename from product_docs/docs/bdr/4.0/column-level-conflicts.mdx rename to product_docs/docs/bdr/4/column-level-conflicts.mdx index 2a3628743eb..cb22e27afe9 100644 --- a/product_docs/docs/bdr/4.0/column-level-conflicts.mdx +++ b/product_docs/docs/bdr/4/column-level-conflicts.mdx @@ -1,29 +1,29 @@ --- -navTitle: Column-Level Conflict Resolution -title: Column-Level Conflict Detection -originalFilePath: column-level-conflicts.md +navTitle: Column-level conflict resolution +title: Column-level conflict detection + --- By default, conflicts are resolved at row level. That is, when changes from two nodes conflict, we pick either the local or remote tuple and -discard the other one. For example, we may compare commit timestamps for +discard the other one. For example, we might compare commit timestamps for the two conflicting changes and keep the newer one. This ensures that all -nodes converge to the same result, and establishes commit-order-like +nodes converge to the same result and establishes commit-order-like semantics on the whole cluster. -However, in some cases it may be appropriate to resolve conflicts at -the column-level rather than the row-level. 
+However, in some cases it might be appropriate to resolve conflicts at +the column level rather than the row level. -Consider a simple example, where we have a table "t" with two integer -columns "a" and "b", and a single row `(1,1)`. Assume that on one node +Consider a simple example, where we have a table t with two integer +columns a and b and a single row `(1,1)`. Assume that on one node we execute: ```sql UPDATE t SET a = 100 ``` -...while on another node we concurrently (before receiving the preceding +On another node we concurrently (before receiving the preceding `UPDATE`) execute: ```sql @@ -33,43 +33,43 @@ UPDATE t SET b = 100 This results in an `UPDATE-UPDATE` conflict. With the `update_if_newer` conflict resolution, we compare the commit timestamps and keep the new row version. Assuming the second node committed last, we end up with -`(1,100)`, effectively discarding the change to column "a". +`(1,100)`, effectively discarding the change to column a. -For many use cases this is the desired and expected behaviour, but for -some this may be an issue - consider for example a multi-node cluster +For many use cases, this is the desired and expected behavior, but for +some this might be an issue. Consider, for example, a multi-node cluster where each part of the application is connected to a different node, updating a dedicated subset of columns in a shared table. In that case, -the different components may step on each other's toes, overwriting +the different components might step on each other's toes, overwriting their changes. -For such use cases, it may be more appropriate to resolve conflicts on -a given table at the column-level. To achieve that, BDR will track -the timestamp of the last change for each column separately, and use that +For such use cases, it might be more appropriate to resolve conflicts on +a given table at the column level. 
To achieve that, BDR tracks +the timestamp of the last change for each column separately and uses that to pick the most recent value (essentially `update_if_newer`). Applied to the previous example, we'll end up with `(100,100)` on both nodes, despite neither of the nodes ever seeing such a row. -When thinking about column-level conflict resolution, it may be useful +When thinking about column-level conflict resolution, it can be useful to see tables as vertically partitioned, so that each update affects -data in only one slice. This eliminates conflicts between changes to -different subsets of columns. In fact, vertical partitioning may even +data in only one slice. This approach eliminates conflicts between changes to +different subsets of columns. In fact, vertical partitioning can even be a practical alternative to column-level conflict resolution. -Column-level conflict resolution requires the table to have `REPLICA -IDENTITY FULL`. The `bdr.alter_table_conflict_detection` function does check -that, and will fail with an error otherwise. +Column-level conflict resolution requires the table to have +`REPLICA IDENTITY FULL`. The `bdr.alter_table_conflict_detection` function does check +that and fails with an error otherwise. -## Enabling and Disabling Column-Level Conflict Resolution +## Enabling and disabling column-level conflict resolution -The Column-Level Conflict Resolution is managed by the +The column-level conflict resolution is managed by the [bdr.alter_table_conflict_detection()](conflicts#bdralter_table_conflict_detection) function. 
### Example -To illustrate how the `bdr.alter_table_conflict_detection()` is used, consider -this example that creates a trivial table `test_table` and then enable +To see how the `bdr.alter_table_conflict_detection()` is used, consider +this example that creates a trivial table `test_table` and then enables column-level conflict resolution on it: ```sql @@ -81,32 +81,33 @@ ALTER TABLE db=# SELECT bdr.alter_table_conflict_detection( db(# 'my_app.test_table'::regclass, 'column_modify_timestamp', 'cts'); - alter_table_conflict_detection + alter_table_conflict_detection -------------------------------- t db=# \d my_app.test_table ``` -You will see that the function adds a new `cts` column (as specified in -the function call), but it also created two triggers ( `BEFORE INSERT` -and `BEFORE UPDATE` ) that are responsible for maintaining timestamps +The function adds a new `cts` column (as specified in +the function call), but it also created two triggers (`BEFORE INSERT` +and `BEFORE UPDATE`) that are responsible for maintaining timestamps in the new column before each change. -Also worth mentioning is that the new column specifies `NOT NULL` +Also, the new column specifies `NOT NULL` with a default value, which means that `ALTER TABLE ... ADD COLUMN` -does not perform a table rewrite. +doesn't perform a table rewrite. -*Note*: We discourage using columns with the `bdr.column_timestamps` data type -for other purposes as it may have various negative effects -(it switches the table to column-level conflict resolution, which will -not work correctly without the triggers etc.). +!!! Note + We discourage using columns with the `bdr.column_timestamps` data type + for other purposes as it can have negative effects. + For example, it switches the table to column-level conflict resolution, which doesn't + work correctly without the triggers. 
-### Listing Table with Column-Level Conflict Resolution +### Listing table with column-level conflict resolution -Tables having column-level conflict resolution enabled can be listed -with the following query, which detects the presence of a column of -type `bdr.column_timestamp`: +You can list tables having column-level conflict resolution enabled +with the following query. This query detects the presence of a column of +type `bdr.column_timestamp`. ```sql SELECT nc.nspname, c.relname @@ -124,7 +125,7 @@ WHERE NOT pg_is_other_temp_schema(nc.oid) ### bdr.column_timestamps_create -This function creates column-level conflict resolution. This is called within +This function creates column-level conflict resolution. It's called within `column_timestamp_enable`. #### Synopsis @@ -135,47 +136,48 @@ bdr.column_timestamps_create(p_source cstring, p_timestamp timestampstz) #### Parameters -- `p_source` - The two options are 'current' or 'commit'. -- `p_timestamp` - Timestamp is dependent on the source chosen: if 'commit', - then TIMESTAMP_SOURCE_COMMIT; if 'current', then TIMESTAMP_SOURCE_CURRENT. +- `p_source` — The two options are `current` or `commit`. +- `p_timestamp` — Timestamp depends on the source chosen. If `commit`, + then `TIMESTAMP_SOURCE_COMMIT`. If `current`, then `TIMESTAMP_SOURCE_CURRENT`. -## DDL Locking +## DDL locking When enabling or disabling column timestamps on a table, the code uses DDL locking to ensure that there are no pending changes from before the -switch, to ensure we only see conflicts with either timestamps in both -tuples or neither of them. Otherwise, the code might unexpectedly see +switch. This approach ensures we see only conflicts with timestamps in both +tuples or in neither of them. Otherwise, the code might unexpectedly see timestamps in the local tuple and NULL in the remote one. It also ensures that the changes are resolved the same way (column-level or row-level) on all nodes. 
-## Current vs Commit Timestamp +## Current versus commit timestamp -An important question is what timestamp to assign to modified columns. +An important decision is the timestamp to assign to modified columns. By default, the timestamp assigned to modified columns is the current timestamp, as if obtained from `clock_timestamp`. This is simple, and -for many cases it is perfectly correct (e.g. when the conflicting rows +for many cases it is perfectly correct (for example, when the conflicting rows modify non-overlapping subsets of columns). -It may however have various unexpected effects: +It can, however, have various unexpected effects: - The timestamp changes during statement execution, so if an `UPDATE` - affects multiple rows, each will get a slightly different timestamp. - This means that the effects of concurrent changes may get "mixed" in various + affects multiple rows, each gets a slightly different timestamp. + This means that the effects of concurrent changes might get "mixed" in various ways (depending on how exactly the changes performed on different nodes interleave). - The timestamp is unrelated to the commit timestamp, and using it to - resolve conflicts means that the result is not equivalent to the commit order, - which means it likely is not serializable. + resolve conflicts means that the result isn't equivalent to the commit order, + which means it likely can't be serialized. -Note: We may add statement and transaction timestamps in the future, -which would address issues with mixing effects of concurrent statements or -transactions. Still, neither of these options can ever produce results -equivalent to commit order. +!!! Note + We might add statement and transaction timestamps in the future, + which would address issues with mixing effects of concurrent statements or + transactions. Still, neither of these options can ever produce results + equivalent to commit order. 
-It is possible to also use the actual commit timestamp, although this +It's possible to also use the actual commit timestamp, although this feature is currently considered experimental. To use the commit timestamp, set the last parameter to `true` when enabling column-level conflict resolution: @@ -184,23 +186,23 @@ resolution: SELECT bdr.column_timestamps_enable('test_table'::regclass, 'cts', true); ``` -This can also be disabled using `bdr.column_timestamps_disable`. +You can disable it using `bdr.column_timestamps_disable`. -Commit timestamps currently have a couple of restrictions that are -explained in the "Limitations" section. +Commit timestamps currently have restrictions that are +explained in [Limitations](#limitations). -## Inspecting Column Timestamps +## Inspecting column timestamps The column storing timestamps for modified columns is maintained -automatically by triggers, and must not be modified directly. It may -be useful to inspect the current timestamps value, for example while -investigating how a particular conflict was resolved. +automatically by triggers. Don't modify it directly. It can +be useful to inspect the current timestamps value, for example, while +investigating how a conflict was resolved. 
-There are three functions for this purpose: +Three functions are useful for this purpose: - `bdr.column_timestamps_to_text(bdr.column_timestamps)` - This function returns a human-readable representation of the timestamp mapping, and + This function returns a human-readable representation of the timestamp mapping and is used when casting the value to `text`: ```sql @@ -214,12 +216,12 @@ db=# select cts::text from test_table; - `bdr.column_timestamps_to_jsonb(bdr.column_timestamps)` - This function turns a JSONB representation of the timestamps mapping, and is used + This function turns a JSONB representation of the timestamps mapping and is used when casting the value to `jsonb`: ```sql db=# select jsonb_pretty(cts::jsonb) from test_table; - jsonb_pretty + jsonb_pretty --------------------------------------------------- { + "map": { + @@ -234,8 +236,8 @@ db=# select jsonb_pretty(cts::jsonb) from test_table; - `bdr.column_timestamps_resolve(bdr.column_timestamps, xid)` This function updates the mapping with the commit timestamp for the attributes modified - by the most recent transaction (if it already committed). This only - matters when using the commit timestamp. For example in this case, the last + by the most recent transaction (if it already committed). This + matters only when using the commit timestamp. For example, in this case, the last transaction updated the second attribute (with `attnum = 2`): ```sql @@ -252,82 +254,82 @@ db=# select bdr.column_timestamps_resolve(cts, xmin)::jsonb from test_table; (1 row) ``` -## Handling column conflicts using CRDT Data Types +## Handling column conflicts using CRDT data types -By default, column-level conflict resolution simply picks the value with -a higher timestamp and discards the other one. 
It is however possible to -reconcile the conflict in different (more elaborate) ways, for example -using CRDT types that allow "merging" the conflicting values without +By default, column-level conflict resolution picks the value with +a higher timestamp and discards the other one. You can, however, +reconcile the conflict in different, more elaborate ways. For example, you can use +CRDT types that allow merging the conflicting values without discarding any information. -## Limitations +## Notes - The attributes modified by an `UPDATE` are determined by comparing the - old and new row in a trigger. This means that if the attribute does - not change a value, it will not be detected as modified even if it is - explicitly set. For example, `UPDATE t SET a = a` will not mark `a` as - modified for any row. Similarly, `UPDATE t SET a = 1` will not mark + old and new row in a trigger. This means that if the attribute doesn't + change a value, it isn't detected as modified even if it's + explicitly set. For example, `UPDATE t SET a = a` doesn't mark `a` as + modified for any row. Similarly, `UPDATE t SET a = 1` doesn't mark `a` as modified for rows that are already set to `1`. -- For `INSERT` statements, we do not have any old row to compare the new +- For `INSERT` statements, we don't have any old row to compare the new one to, so we consider all attributes to be modified and assign them - a new timestamp. This applies even for columns that were not included - in the `INSERT` statement and received default values. We could detect - which attributes have a default value, but it is not possible to decide if - it was included automatically or specified explicitly by the user. + a new timestamp. This applies even for columns that weren't included + in the `INSERT` statement and received default values. We can detect + which attributes have a default value but can't know if + it was included automatically or specified explicitly. 
- This effectively means column-level conflict resolution does not work - for `INSERT-INSERT` conflicts (even if the `INSERT` statements specify - different subsets of columns, because the newer row will have all - timestamps newer than the older one). + This effectively means column-level conflict resolution doesn't work + for `INSERT-INSERT` conflicts even if the `INSERT` statements specify + different subsets of columns. The newer row has + timestamps that are all newer than the older row. -- By treating the columns independently, it is easy to violate constraints - in a way that would not be possible when all changes happen on the same - node. Consider for example a table like this: +- By treating the columns independently, it's easy to violate constraints + in a way that isn't possible when all changes happen on the same + node. Consider, for example, a table like this: ```sql CREATE TABLE t (id INT PRIMARY KEY, a INT, b INT, CHECK (a > b)); INSERT INTO t VALUES (1, 1000, 1); ``` -...and assume one node does: +Assume one node does: ```sql UPDATE t SET a = 100; ``` -...while another node does concurrently: +Another node concurrently does: ```sql UPDATE t SET b = 500; ``` - Each of those updates is valid when executed on the initial row, and - so will pass on each node. But when replicating to the other node, + Each of those updates is valid when executed on the initial row and + so passes on each node. But when replicating to the other node, the resulting row violates the `CHECK (A > b)` constraint, and the - replication will stop until the issue is resolved manually. + replication stops until the issue is resolved manually. -- The column storing timestamp mapping is managed automatically. Do not - specify or override the value in your queries, as it may result in - unpredictable effects (we do ignore the value where possible anyway). +- The column storing timestamp mapping is managed automatically. 
Don't + specify or override the value in your queries, as it can result in + unpredictable effects. (We do ignore the value where possible anyway.) - The timestamp mapping is maintained by triggers, but the order in which - triggers execute does matter. So if you have custom triggers that modify + triggers execute matters. So if you have custom triggers that modify tuples and are executed after the `pgl_clcd_` triggers, the modified - columns will not be detected correctly. + columns aren't detected correctly. -- When using regular timestamps to order changes/commits, it is possible +- When using regular timestamps to order changes/commits, it's possible that the conflicting changes have exactly the same timestamp (because two or more nodes happened to generate the same timestamp). This risk - is not unique to column-level conflict resolution, as it may happen - even for regular row-level conflict resolution, and we use node id as a - tie-breaker in this situation (the higher node id wins), which ensures that + isn't unique to column-level conflict resolution, as it can happen + even for regular row-level conflict resolution. We use node id as a + tie-breaker in this situation (the higher node id wins), which ensures that the same changes are applied on all nodes. - It is possible that there is a clock skew between different nodes. While it - may induce somewhat unexpected behavior (discarding seemingly newer - changes because the timestamps are inverted), clock skew between nodes can - be managed using the parameters `bdr.maximum_clock_skew` and + can induce somewhat unexpected behavior (discarding seemingly newer + changes because the timestamps are inverted), you can manage clock skew between nodes + using the parameters `bdr.maximum_clock_skew` and `bdr.maximum_clock_skew_action`. 
```sql diff --git a/product_docs/docs/bdr/4/configuration.mdx b/product_docs/docs/bdr/4/configuration.mdx new file mode 100644 index 00000000000..39325ef8bed --- /dev/null +++ b/product_docs/docs/bdr/4/configuration.mdx @@ -0,0 +1,538 @@ +--- +navTitle: Configuration +title: PostgreSQL configuration for BDR + + +--- + +Several PostgreSQL configuration parameters affect BDR +nodes. You can set these parameters differently on each node, +although that isn't generally recommended. + +## PostgreSQL settings for BDR + +BDR requires these PostgreSQL settings to run correctly: + +- `wal_level` — Must be set to `logical`, since BDR relies on logical decoding. +- `shared_preload_libraries` — Must contain `bdr`, although it can contain + other entries before or after, as needed. However, don't include `pglogical`. +- `track_commit_timestamp` — Must be set to `on` for conflict resolution to + retrieve the timestamp for each conflicting row. + +BDR requires these PostgreSQL settings to be set to appropriate values, +which vary according to the size and scale of the cluster. + +- `logical_decoding_work_mem` — Memory buffer size used by logical decoding. + Transactions larger than this overflow the buffer and are stored + temporarily on local disk. Default is 64 MB, but you can set it much higher. +- `max_worker_processes` — BDR uses background workers for replication + and maintenance tasks, so you need enough worker slots for it to + work correctly. The formula for the correct minimal number of workers, for each database, is: + - One per PostgreSQL instance plus + - One per database on that instance plus + - Four per BDR-enabled database plus + - One per peer node in the BDR group plus + - One for each writer-enabled per peer node in the BDR group + You might need more worker processes temporarily when a node is being + removed from a BDR group. +- `max_wal_senders` — Two needed per every peer node. +- `max_replication_slots` — Same as `max_wal_senders`. 
+- `wal_sender_timeout` and `wal_receiver_timeout` — Determines how
+  quickly a node considers its CAMO partner as disconnected or
+  reconnected. See [CAMO failure scenarios](camo#failure-scenarios) for
+  details.
+
+In normal running for a group with N peer nodes, BDR requires
+N slots and WAL senders. During synchronization, BDR temporarily uses another
+N - 1 slots and WAL senders, so be careful to set the parameters high enough
+for this occasional peak demand.
+
+With parallel apply turned on, the number of slots must be increased to
+N slots from the formula \* writers. This is because the `max_replication_slots`
+also sets the maximum number of replication origins, and some of the functionality
+of parallel apply uses extra origin per writer.
+
+When the [decoding worker](nodes#decoding-worker) is enabled, this
+process requires one extra replication slot per BDR group.
+
+The general safe recommended value on a 4-node BDR group with a single database
+is to set `max_replication_slots` and `max_worker_processes` to something
+like `50` and `max_wal_senders` to at least `10`.
+
+Changing these parameters requires restarting the local node:
+`max_worker_processes`, `max_wal_senders`, `max_replication_slots`.
+
+You might also want your applications to set these parameters. See
+[Durability and performance options](durability) for details.
+
+- `synchronous_commit` — Affects the durability and performance of BDR replication
+  in a similar way to [physical replication](https://www.postgresql.org/docs/11/runtime-config-wal.html#GUC-SYNCHRONOUS-COMMIT).
+- `synchronous_standby_names` — Same as above.
+
+## BDR-specific settings
+
+You can also set BDR-specific configuration settings.
+Unless noted otherwise, you can set the values at any time.
+
+### Conflict handling
+
+- `bdr.default_conflict_detection` — Sets the default conflict detection method
+  for newly created tables. 
Accepts same values as + [bdr.alter_table_conflict_detection()](conflicts#bdralter_table_conflict_detection). + +### Global sequence parameters + +- `bdr.default_sequence_kind` — Sets the default [sequence kind](sequences.md). + The default value is `distributed`, which means `snowflakeid` is used + for `int8` sequences (i.e., `bigserial`) and `galloc` sequence for `int4` + (i.e., `serial`) and `int2` sequences. + +### DDL handling + +- `bdr.default_replica_identity` — Sets the default value for `REPLICA IDENTITY` + on newly created tables. The `REPLICA IDENTITY` defines the information + written to the write-ahead log to identify rows that are updated or deleted. + + The accepted values are: + + - `DEFAULT` — Records the old values of the columns of the primary key, + if any (this is the default PostgreSQL behavior). + - `FULL` — Records the old values of all columns in the row. + - `NOTHING` — Records no information about the old row. + + See [PostgreSQL documentation](https://www.postgresql.org/docs/current/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY) + for more details. + + BDR can't replicate `UPDATE` and `DELETE` operations on tables without a `PRIMARY KEY` + or `UNIQUE` constraint. The exception is when the replica identity for the table is `FULL`, + either by table-specific configuration or by `bdr.default_replica_identity`. + + If `bdr.default_replica_identity` is `DEFAULT` and there is a `UNIQUE` + constraint on the table, it isn't automatically picked up as + `REPLICA IDENTITY`. You need to set it explicitly when creating the table + or after, as described above. + + Setting the replica identity of tables to `FULL` increases the volume of + WAL written and the amount of data replicated on the wire for the table. + +- `bdr.ddl_replication` — Automatically replicate DDL across nodes (default is + `on`). + + This parameter can be set only by bdr_superuser or superuser roles. 
+ + Running DDL or calling BDR administration functions with + `bdr.ddl_replication = off` can create situations where replication stops + until an administrator can intervene. See [DDL replication](ddl) + for details. + + A `LOG`-level log message is emitted to the PostgreSQL server logs whenever + `bdr.ddl_replication` is set to `off`. Additionally, a `WARNING-level` + message is written whenever replication of captured DDL commands or BDR + replication functions is skipped due to this setting. + +- `bdr.role_replication` — Automatically replicate ROLE commands across nodes + (default is `on`). Only a superuser can set this parameter. This setting + works only if `bdr.ddl_replication` is turned on as well. + + Turning this off without using external methods to ensure roles are in sync + across all nodes might cause replicated DDL to interrupt replication until + the administrator intervenes. + + See [Role manipulation statements](ddl#Role_manipulation_statements) + for details. + +- `bdr.ddl_locking` — Configures the operation mode of global locking for DDL. + + This parameter can be set only by bdr_superuser or superuser roles. + + Possible options are: + + - off — Don't use global locking for DDL operations. + - on — Use global locking for all DDL operations. + - dml — Use global locking only for DDL operations that need to prevent + writes by taking the global DML lock for a relation. + + A `LOG`-level log message is emitted to the PostgreSQL server logs + whenever `bdr.ddl_replication` is set to `off`. Additionally, a `WARNING` + message is written whenever any global locking steps are skipped due to + this setting. It's normal for some statements to result in two `WARNING` messages: + one for skipping the DML lock and one for skipping the DDL lock. + +- `bdr.truncate_locking` — False by default, this configuration option sets the + TRUNCATE command's locking behavior. Determines whether (when true) TRUNCATE + obeys the `bdr.ddl_locking` setting. 
+
+### Global locking
+
+- `bdr.ddl_locking` — Described above.
+- `bdr.global_lock_max_locks` — Maximum number of global locks that can be
+  held on a node (default 1000). Can be set only at Postgres server start.
+- `bdr.global_lock_timeout` — Sets the maximum allowed duration of any wait
+  for a global lock (default 10 minutes). A value of zero disables this timeout.
+- `bdr.global_lock_statement_timeout` — Sets the maximum allowed duration of
+  any statement holding a global lock (default 60 minutes).
+  A value of zero disables this timeout.
+- `bdr.global_lock_idle_timeout` — Sets the maximum allowed duration of
+  idle time in transaction holding a global lock (default 10 minutes).
+  A value of zero disables this timeout.
+- `bdr.predictive_checks` — Log level for predictive checks (currently used only
+  by global locks). Can be `DEBUG`, `LOG`, `WARNING` (default), or `ERROR`. Predictive checks
+  are early validations for expected cluster state when doing certain operations. You
+  can use them for those operations to fail early rather than wait for
+  timeouts. In global lock terms, BDR checks that there are enough nodes
+  connected and within reasonable lag limit for getting quorum needed by the
+  global lock.
+
+### Node management
+
+- `bdr.replay_progress_frequency` — Interval for sending replication position
+  info to the rest of the cluster (default 1 minute).
+
+- `bdr.standby_slot_names` — Require these slots to receive and confirm
+  replication changes before any other ones. This setting is useful primarily when
+  using physical standbys for failover or when using subscribe-only nodes.
+
+### Generic replication
+
+- `bdr.writers_per_subscription` — Default number of writers per
+  subscription (in BDR, you can also change this with
+  `bdr.alter_node_group_config` for a group).
+
+- `bdr.max_writers_per_subscription` — Maximum number of writers
+  per subscription (sets upper limit for the setting above). 
+ +- `bdr.xact_replication` — Replicate current transaction (default is `on`). + + Turning this off makes the whole transaction local only, which + means the transaction isn't visible to logical decoding by + BDR and all other downstream targets of logical decoding. Data isn't + transferred to any other node, including logical standby nodes. + + This parameter can be set only by the bdr_superuser or superuser roles. + + This parameter can be set only inside the current transaction using the + `SET LOCAL` command unless `bdr.permit_unsafe_commands = on`. + +!!! Note + Even with transaction replication disabled, WAL is generated, + but those changes are filtered away on the origin. + +!!! Warning + Turning off `bdr.xact_replication` leads to data + inconsistency between nodes. Use it only to recover from + data divergence between nodes or in + replication situations where changes on single nodes are required for + replication to continue. Use at your own risk. + +- `bdr.permit_unsafe_commands` — Option to override safety check on commands + that are deemed unsafe for general use. + + Requires `bdr_superuser` or PostgreSQL superuser. + +!!! Warning + The commands that are normally not considered safe can either + produce inconsistent results or break replication altogether. Use at your + own risk. + +- `bdr.batch_inserts` — Number of consecutive inserts to one table in + a single transaction turns on batch processing of inserts for that table. + + This option allows replication of large data loads as COPY internally, + rather than set of inserts. It is also how the initial data during node join + is copied. + +- `bdr.maximum_clock_skew` + + This option specifies the maximum difference between + the incoming transaction commit timestamp and the current time on the + subscriber before triggering `bdr.maximum_clock_skew_action`. + + It checks if the timestamp of the currently replayed transaction is in the + future compared to the current time on the subscriber. 
If it is, and the
+  difference is larger than `bdr.maximum_clock_skew`, it performs the action
+  specified by the `bdr.maximum_clock_skew_action` setting.
+
+  The default is `-1`, which means ignore clock skew (the check is turned
+  off). It's valid to set it to 0 when the clocks on all servers are synchronized.
+  The fact that we are replaying the transaction means it has been committed in
+  the past.
+
+- `bdr.maximum_clock_skew_action`
+
+  This specifies the action to take if a clock skew higher than
+  `bdr.maximum_clock_skew` is detected.
+
+  There are two possible values for this option:
+
+  - `WARN` — Log a warning about this fact. The warnings are logged once per
+    minute (the default) at the maximum to prevent flooding the server log.
+  - `WAIT` — Wait until the current local timestamp is no longer older than
+    remote commit timestamp minus the `bdr.maximum_clock_skew`.
+
+- `bdr.accept_connections` — Option to enable or disable connections to BDR.
+  Defaults to `on`.
+
+  Requires `bdr_superuser` or PostgreSQL superuser.
+
+### `bdr.standby_slot_names`
+
+This option is typically used in failover configurations to ensure that the
+failover-candidate streaming physical replicas for this BDR node
+have received and flushed all changes before they ever become visible to
+subscribers. That guarantees that a commit can't vanish on failover to a
+standby for the provider.
+
+Replication slots whose names are listed in the comma-separated
+`bdr.standby_slot_names` list are treated specially by the walsender
+on a BDR node.
+
+BDR's logical replication walsenders ensure that all local changes
+are sent and flushed to the replication slots in `bdr.standby_slot_names`
+before the node sends those changes to any other BDR replication
+clients. Effectively, it provides a synchronous replication barrier between the
+named list of slots and all other replication clients.
+
+Any replication slot can be listed in `bdr.standby_slot_names`. 
Both +logical and physical slots work, but it's generally used for physical slots. + +Without this safeguard, two anomalies are possible where a commit can be +received by a subscriber and then vanish from the provider on failover because +the failover candidate hadn't received it yet: + +- For 1+ subscribers, the subscriber might have applied the change but the new + provider might execute new transactions that conflict with the received change, + as it never happened as far as the provider is concerned. + +- For 2+ subscribers, at the time of failover, not all subscribers have applied + the change. The subscribers now have inconsistent and irreconcilable states + because the subscribers that didn't receive the commit have no way to get it. + +Setting `bdr.standby_slot_names` by design causes subscribers to +lag behind the provider if the provider's failover-candidate replicas aren't +keeping up. Monitoring is thus essential. + +Another use case where the `bdr.standby_slot_names` is useful is when using +subscriber-only, to ensure that the subscriber-only node doesn't move ahead +of any of the other BDR nodes. + +### `bdr.standby_slots_min_confirmed` + +Controls how many of the `bdr.standby_slot_names` have to confirm before +we send data to BDR subscribers. + +### `bdr.writer_input_queue_size` + +This option specifies the size of the shared memory queue used +by the receiver to send data to the writer process. If the writer process is +stalled or making slow progress, then the queue might get filled up, stalling +the receiver process too. So it's important to provide enough shared memory for +this queue. The default is 1 MB, and the maximum allowed size is 1 GB. While any +storage size specifier can be used to set the GUC, the default is KB. + +### `bdr.writer_output_queue_size` + +This option specifies the size of the shared memory queue used +by the receiver to receive data from the writer process. 
Since the writer
+isn't expected to send a large amount of data, a relatively smaller sized queue
+is enough. The default is 32 KB, and the maximum allowed size is 1 MB.
+While any storage size specifier can be used to set the GUC, the default is
+KB.
+
+### `bdr.min_worker_backoff_delay`
+
+Rate limit BDR background worker launches by preventing a given worker
+from being relaunched more often than every
+`bdr.min_worker_backoff_delay` milliseconds. On repeated errors, the backoff
+increases exponentially with added jitter up to a maximum of
+`bdr.max_worker_backoff_delay`.
+
+Time-unit suffixes are supported.
+
+!!! Note
+    This setting currently affects only receiver worker, which means it
+    primarily affects how fast a subscription tries to reconnect on error
+    or connection failure.
+
+The default for `bdr.min_worker_backoff_delay` is 1 second. For
+`bdr.max_worker_backoff_delay`, it is 1 minute.
+
+If the backoff delay setting is changed and the PostgreSQL configuration is
+reloaded, then all current backoff waits are reset. Additionally, the
+`bdr.worker_task_reset_backoff_all()` function is provided to allow the
+administrator to force all backoff intervals to immediately expire.
+
+A tracking table in shared memory is maintained to remember the last launch
+time of each type of worker. This tracking table isn't persistent. It is
+cleared by PostgreSQL restarts, including soft restarts during crash recovery
+after an unclean backend exit.
+
+You can use the view [`bdr.worker_tasks`](monitoring#bdr.worker_tasks) to inspect this state so the administrator can see any backoff
+rate limiting currently in effect.
+
+For rate limiting purposes, workers are classified by task. This key consists
+of the worker role, database OID, subscription ID, subscription writer ID,
+extension library name and function name, extension-supplied worker name, and
+the remote relation ID for sync writers. 
`NULL` is used where a given +classifier doesn't apply, for example, manager workers don't have a subscription ID +and receivers don't have a writer ID. + +### CRDTs + +- `bdr.crdt_raw_value` — Sets the output format of [CRDT data types](crdt). + The default output (when this setting is `off`) is to return only the current + value of the base CRDT type (for example, a bigint for `crdt_pncounter`). + When set to `on`, the returned value represents the full representation of + the CRDT value, which can, for example, include the state from multiple nodes. + +### Max prepared transactions + +- `max_prepared_transactions` — Needs to be set high enough to cope + with the maximum number of concurrent prepared transactions across + the cluster due to explicit two-phase commits, CAMO, or Eager + transactions. Exceeding the limit prevents a node from running a + local two-phase commit or CAMO transaction and prevents all + Eager transactions on the cluster. + You can set this only at Postgres server start. + +### Eager Replication + +- `bdr.commit_scope` — Setting the commit scope to `global` enables + [eager all node replication](eager) (default `local`). + +- `bdr.global_commit_timeout` — Timeout for both stages of a global + two-phase commit (default 60s) as well as for CAMO-protected transactions + in their commit phase, as a limit for how long to wait for the CAMO + partner. + +### Commit At Most Once + +- `bdr.enable_camo` — Used to enable and control the CAMO feature. + Defaults to `off`. CAMO can be switched on per transaction by + setting this to `remote_write`, `remote_commit_async`, or + `remote_commit_flush`. For backward-compatibility, the values + `on`, `true`, and `1` set the safest `remote_commit_flush` mode, + while `false` or `0` also disable CAMO. +- `bdr.standby_dsn` — Allows manual override of the connection + string (DSN) to reach the CAMO partner, in case it has changed since + the crash of the local node. Is usually unset. 
+ You can set it only at Postgres server start. +- `bdr.camo_local_mode_delay` — The commit delay that applies in + CAMO's local mode to emulate the overhead that normally occurs with + the CAMO partner having to confirm transactions. Defaults to 5 ms. + Set to `0` to disable this feature. +- `bdr.camo_enable_client_warnings` — Emit warnings if an activity is + carried out in the database for which CAMO properties can't be + guaranteed. This is enabled by default. Well-informed users can choose + to disable this to reduce the amount of warnings going into their logs. +- `synchronous_replication_availability` — Can optionally be `async` + for increased availability by allowing a node to continue and + commit after its CAMO partner got disconnected. Under the default + value of `wait`, the node waits indefinitely and proceeds to + commit only after the CAMO partner reconnects and sends + confirmation. + +### Transaction streaming + +- `bdr.default_streaming_mode` — Used to control transaction streaming by + the subscriber node. Permissible values are: `off`, `writer`, `file`, and `auto`. + Defaults to `auto`. If set to `off`, the subscriber doesn't request + transaction streaming. If set to one of the other values, the + subscriber requests transaction streaming and the publisher provides + it if it supports them and if configured at group level. For + more details, see [Transaction streaming](transaction-streaming). + +### Lag control + +- `bdr.lag_control_max_commit_delay` — Maximum acceptable post commit delay that + can be tolerated, in fractional milliseconds. +- `bdr.lag_control_max_lag_size` — Maximum acceptable lag size that can be tolerated, + in kilobytes. +- `bdr.lag_control_max_lag_time` — Maximum acceptable lag time that can be tolerated, + in milliseconds. +- `bdr.lag_control_min_conforming_nodes` — Minimum number of nodes required to stay + below acceptable lag measures. 
+- `bdr.lag_control_commit_delay_adjust` — Commit delay micro adjustment measured as a
+  fraction of the maximum commit delay time. At a default value of 0.01%, it takes
+  100 net increments to reach the maximum commit delay.
+- `bdr.lag_control_sample_interval` — Minimum time between lag samples and
+  commit delay micro adjustments, in milliseconds.
+- `bdr.lag_control_commit_delay_start` — The lag threshold at which commit delay
+  increments start to be applied, expressed as a fraction of acceptable lag measures.
+  At a default value of 1.0%, commit delay increments don't begin until acceptable lag
+  measures are breached.
+
+  By setting a smaller fraction, it might be possible to prevent a breach
+  by "bending the lag curve" earlier so that it's asymptotic with the
+  acceptable lag measure.
+
+### Timestamp-based snapshots
+
+- `snapshot_timestamp` — Turns on the use of
+  [timestamp-based snapshots](tssnapshots) and sets the timestamp to use.
+- `bdr.timestamp_snapshot_keep` — Time to keep valid snapshots for the
+  timestamp-based snapshot use (default is `0`, meaning don't keep past snapshots).
+
+### Monitoring and logging
+
+- `bdr.debug_level` — Defines the log level that BDR uses to write
+  its debug messages. The default value is `debug2`. If you want to see
+  detailed BDR debug output, set `bdr.debug_level = 'log'`.
+
+- `bdr.trace_level` — Similar to the above, this defines the log level
+  to use for BDR trace messages. Enabling tracing on all nodes of a
+  BDR cluster might help EDB Support to diagnose issues.
+  You can set this only at Postgres server start.
+
+!!! Warning
+    Setting `bdr.debug_level` or `bdr.trace_level` to a value >=
+    `log_min_messages` can produce a very large volume of log output, so don't
+    enable it long term in production unless plans are in place for log filtering,
+    archival, and rotation to prevent disk space exhaustion.
+
+- `bdr.track_subscription_apply` — Track apply statistics for
+  each subscription. 
+- `bdr.track_relation_apply` — Track apply statistics for each + relation. +- `bdr.track_apply_lock_timing` — Track lock timing when tracking + statistics for relations. + +### Internals + +- `bdr.raft_keep_min_entries` — The minimum number of entries to keep in the + Raft log when doing log compaction (default 100). The value of 0 disables + log compaction. You can set this only at Postgres server start. + !!! Warning + If log compaction is disabled, the log grows in size forever. +- `bdr.raft_response_timeout` — To account for network failures, the + Raft consensus protocol implemented times out requests after a + certain amount of time. This timeout defaults to 30 seconds. +- `bdr.raft_log_min_apply_duration` — To move the state machine + forward, Raft appends entries to its internal log. During normal + operation, appending takes only a few milliseconds. This poses an + upper threshold on the duration of that append action, above which + an `INFO` message is logged. This can indicate a + problem. Default value of this parameter is 3000 ms. +- `bdr.raft_log_min_message_duration` — When to log a consensus request. + Measure roundtrip time of a bdr consensus request and log an + `INFO` message if the time exceeds this parameter. Default value + of this parameter is 5000 ms. +- `bdr.raft_group_max_connections` — The maximum number of connections + across all BDR groups for a Postgres server. These connections carry + bdr consensus requests between the groups' nodes. Default value of this + parameter is 100 connections. You can set it only at Postgres server start. +- `bdr.backwards_compatibility` — Specifies the version to be + backward compatible to, in the same numerical format as used by + `bdr.bdr_version_num`, e.g., `30618`. Enables exact behavior of a + former BDR version, even if this has generally unwanted effects. + Defaults to the current BDR version. 
Since this changes from release + to release, we advise against explicit use in the configuration + file unless the value is different from the current version. +- `bdr.track_replication_estimates` — Track replication estimates in terms + of apply rates and catchup intervals for peer nodes. Protocols like CAMO can use this information + to estimate the readiness of a + peer node. This parameter is enabled by default. +- `bdr.lag_tracker_apply_rate_weight` — We monitor how far behind peer nodes + are in terms of applying WAL from the local node and calculate a moving + average of the apply rates for the lag tracking. This parameter specifies + how much contribution newer calculated values have in this moving average + calculation. Default value is 0.1. diff --git a/product_docs/docs/bdr/4.0/conflicts.mdx b/product_docs/docs/bdr/4/conflicts.mdx similarity index 99% rename from product_docs/docs/bdr/4.0/conflicts.mdx rename to product_docs/docs/bdr/4/conflicts.mdx index ba229f6a554..478b01d9b52 100644 --- a/product_docs/docs/bdr/4.0/conflicts.mdx +++ b/product_docs/docs/bdr/4/conflicts.mdx @@ -1,6 +1,6 @@ --- title: Conflicts -originalFilePath: conflicts.md + --- @@ -31,7 +31,7 @@ described in the [CLCD](column-level-conflicts) chapter. If you wish to avoid conflicts, you can use these features in BDR. - Conflict-free data types (CRDTs) - described in the [CRDT](crdt) chapter. -- Eager replication - described in the [Eager Replication](eager) chapter. +- Eager Replication - described in the [Eager Replication](eager) chapter. By default, all conflicts are logged to `bdr.conflict_history`. If conflicts are possible then table owners should monitor for them, analyze to see how they @@ -48,7 +48,7 @@ advocates an optimistic approach: avoid conflicts where possible, but allow some types of conflict to occur and resolve them when they arise. !!! Warning "Upgrade Notes" - All the SQL visible interfaces are in the `bdr` schema. 
+ All the SQL visible interfaces are in the `bdr` schema. All the previously deprecated interfaces in the `bdr_conflicts` or `bdr_crdt` schema were removed and will **not** work on 3.7+ nodes or in groups that contain at least one 3.7+ node. @@ -79,7 +79,7 @@ That's part of why it performs well with latency and network partitions. As a result, *transactions on different nodes execute entirely independently from each other*, when using the default, lazy replication. Less independence between nodes can avoid conflicts altogether, which is why BDR also offers -eager replication for when this is important. +Eager Replication for when this is important. ## Types of conflict @@ -198,7 +198,7 @@ INSERT INTO pktest VALUES (3,3); UPDATE pktest SET pk=4 WHERE pk=3; SELECT * FROM pktest; - pk | val + pk | val ----+----- 2 | 1 4 | 3 @@ -226,7 +226,7 @@ node1: UPDATE pktest SET pk=pk+1 WHERE pk = 2; node2: UPDATE pktest SET pk=pk+1 WHERE pk = 4; SELECT * FROM pktest; - pk | val + pk | val ----+----- 3 | 1 5 | 3 @@ -248,7 +248,7 @@ node2: UPDATE pktest SET pk=2 WHERE pk = 3; ```sql node1: SELECT * FROM pktest; - pk | val + pk | val ----+----- 1 | 1 5 | 3 @@ -256,7 +256,7 @@ SELECT * FROM pktest; node2: SELECT * FROM pktest; - pk | val + pk | val ----+----- 2 | 1 5 | 3 @@ -292,7 +292,7 @@ rows both get updated concurrently to the same new primary key. As a result, we recommend strongly against allowing PK UPDATEs in your applications, especially with BDR. If there are parts of your application that change Primary Keys, then to avoid concurrent -changes, make those changes using Eager replication. +changes, make those changes using Eager Replication. !!! Warning In case the conflict resolution of `update_pkey_exists` conflict results @@ -424,7 +424,7 @@ we simply log delete_missing. Later releases will automatically resolve INSERT/DELETE anomalies via re-checks using LiveCompare when delete_missing conflicts occur. 
These can be performed manually by applications by checking -conflict logs or conflict log tables; see later. +the `bdr.conflict_history_summary` view. These conflicts can occur in two main problem use cases: @@ -1146,8 +1146,8 @@ The following conflict resolutions are currently supported for the ## Conflict Logging To ease the diagnosis and handling of multi-master conflicts, BDR -will, by default, log every conflict into the PostgreSQL log file. This behaviour -can be changed with more granularity with the following functions. +will, by default, log every conflict into the `bdr.conflict_history` table. +This behaviour can be changed with more granularity with the following functions. ### bdr.alter_node_set_log_config diff --git a/product_docs/docs/bdr/4.0/crdt.mdx b/product_docs/docs/bdr/4/crdt.mdx similarity index 99% rename from product_docs/docs/bdr/4.0/crdt.mdx rename to product_docs/docs/bdr/4/crdt.mdx index dc67187e1de..07f25ea7ce4 100644 --- a/product_docs/docs/bdr/4.0/crdt.mdx +++ b/product_docs/docs/bdr/4/crdt.mdx @@ -1,7 +1,7 @@ --- navTitle: CRDT Data Types title: Conflict-free Replicated Data Types -originalFilePath: crdt.md + --- @@ -294,7 +294,7 @@ in the current transaction, etc.). ## CRDT Types vs. Conflict Reporting -By default, detected conflicts are written into the server log. Without +By default, detected conflicts are individually reported. Without CRDT types this makes perfect sense, because the conflict resolution essentially throws away one half of the available information (local or remote row, depending on configuration). This presents a data loss. 
diff --git a/product_docs/docs/bdr/4.0/ddl.mdx b/product_docs/docs/bdr/4/ddl.mdx similarity index 99% rename from product_docs/docs/bdr/4.0/ddl.mdx rename to product_docs/docs/bdr/4/ddl.mdx index 64aeae2ca5e..5bf0468aadf 100644 --- a/product_docs/docs/bdr/4.0/ddl.mdx +++ b/product_docs/docs/bdr/4/ddl.mdx @@ -1,6 +1,6 @@ --- title: DDL Replication -originalFilePath: ddl.md + --- @@ -538,7 +538,7 @@ under the following table. | LISTEN | Y | N | N | | LOAD | Y | N | N | | LOAD ROW DATA | Y | Y | DDL | -| LOCK TABLE | Y | N | N | +| LOCK TABLE | Y | N | [Details](#bdr_ddl_lock_lock_stmt) | | MOVE | Y | N | N | | NOTIFY | Y | N | N | | PREPARE | Y | N | N | @@ -565,7 +565,7 @@ under the following table.
-### ALTER SEQUENCE +### ALTER SEQUENCE Generally `ALTER SEQUENCE` is supported, but when using global sequences, some options have no effect. @@ -580,7 +580,7 @@ sub-commands that are not supported.
-#### ALTER TABLE Disallowed Commands +#### ALTER TABLE Disallowed Commands Some variants of `ALTER TABLE` are currently not allowed on a BDR node: @@ -630,7 +630,7 @@ for more details.
-#### ALTER TABLE Locking +#### ALTER TABLE Locking The following variants of `ALTER TABLE` will only take DDL lock and **not** a DML lock: @@ -818,28 +818,28 @@ record those dependencies. See workarounds, below.
-### COMMENT ON +### COMMENT ON All variants of COMMENT ON are allowed, but `COMMENT ON TABLESPACE/DATABASE/LARGE OBJECT` will not be replicated.
-### CREATE SEQUENCE +### CREATE SEQUENCE Generally `CREATE SEQUENCE` is supported, but when using global sequences, some options have no effect.
-### CREATE TABLE +### CREATE TABLE Generally `CREATE TABLE` is supported but `CREATE TABLE WITH OIDS` is not allowed on a BDR node.
-### CREATE TABLE AS and SELECT INTO +### CREATE TABLE AS and SELECT INTO `CREATE TABLE AS` and `SELECT INTO` are only allowed if all sub-commands are also allowed. @@ -851,42 +851,43 @@ effects on the database, there are some restrictions on it.
-#### EXPLAIN ANALYZE Replication +#### EXPLAIN ANALYZE Replication EXPLAIN ANALYZE will follow replication rules of the analyzed statement.
-#### EXPLAIN ANALYZE Locking +#### EXPLAIN ANALYZE Locking EXPLAIN ANALYZE will follow locking rules of the analyzed statement.
-### GRANT and REVOKE +### GRANT and REVOKE Generally `GRANT` and `REVOKE` statements are supported, however `GRANT/REVOKE ON TABLESPACE/LARGE OBJECT` will not be replicated. +
+ ### LOCK TABLE -`LOCK TABLE` is only executed locally and is not replicated. Normal replication -happens after transaction commit, so `LOCK TABLE` would not have any effect -on other nodes. +LOCK TABLE is not replicated, but it may acquire the global DML lock when +`bdr.lock_table_locking` is set `on`. -For globally locking table, users can request a global DML lock explicitly -by calling `bdr.global_lock_table()`. +The `bdr.global_lock_table()` function can also be used to request a global DML +lock explicitly.
-### SECURITY LABEL +### SECURITY LABEL All variants of `SECURITY LABEL` are allowed, but `SECURITY LABEL ON TABLESPACE/DATABASE/LARGE OBJECT` will not be replicated.
-### TRUNCATE Replication +### TRUNCATE Replication `TRUNCATE` command is replicated as DML, not as DDL statement, so whether the `TRUNCATE` on table is replicated depends on replication set settings for @@ -894,7 +895,7 @@ each affected table.
-### TRUNCATE Locking +### TRUNCATE Locking Even though `TRUNCATE` is not replicated same way as other DDL, it may acquire the global DML lock when `bdr.truncate_locking` is set to `on`. diff --git a/product_docs/docs/bdr/4/durability.mdx b/product_docs/docs/bdr/4/durability.mdx new file mode 100644 index 00000000000..f37c672a622 --- /dev/null +++ b/product_docs/docs/bdr/4/durability.mdx @@ -0,0 +1,253 @@ +--- +title: Durability & Performance Options +--- + +## Overview + +Synchronous or *Eager Replication* synchronizes between at least two +nodes of the cluster before committing a transaction. This provides +three properties of interest to applications, which are related, but +can all be implemented individually: + +- *Durability*: writing to multiple nodes increases crash resilience + and allows the data to be recovered after a crash and restart. +- *Visibility*: with the commit confirmation to the client, the database + guarantees immediate visibility of the committed transaction on some + sets of nodes. +- *No Conflicts After Commit*: the client can rely on the transaction to + eventually be applied on all nodes without further conflicts, or get + an abort directly informing the client of an error. + +BDR provides a [Group Commit](group_commit.md) feature to guarantee +durability and visibility by providing a variant of synchronous +replication. This is very similar to Postgres' `synchronous_commit` +feature for physical standbys, but providing a lot more flexibility +for large scale distributed systems. + +In addition to Group Commit, BDR also offers two additional modes +(which cannot currently be combined with Group Commit): + +- Commit At Most Once (CAMO). This feature solves the problem with knowing + whether your transaction has COMMITed (and replicated) or not in case of + node or network failures during COMMIT. Normally, it might be hard to know whether + or not the COMMIT was processed.
With this feature, your application can + find out what happened, even if your new database connection is to a different node + than your previous connection. For more info about this feature see the + [Commit At Most Once](camo) chapter. +- Eager Replication. This is an optional feature to check for conflicts prior + to the commit. Every transaction is applied and prepared on *all nodes* simultaneously, + and commits only if no replication conflicts are detected. This feature does + reduce performance, but provides very strong consistency guarantees. + For more info about this feature see the [Eager All-Node Replication](eager) + chapter. + +Postgres itself provides [Physical Streaming Replication](https://www.postgresql.org/docs/current/warm-standby.html#STREAMING-REPLICATION) +(PSR), which is uni-directional, but offers a [synchronous variant](https://www.postgresql.org/docs/current/warm-standby.html#SYNCHRONOUS-REPLICATION). +For backwards compatibility, BDR still supports configuring synchronous +replication via `synchronous_commit` and `synchronous_standby_names`, see +[Legacy Synchronous Replication](durability.md#legacy-synchronous-replication), +but the use of [Group Commit](group_commit.md) is recommended instead +in all cases. + +This chapter covers the various forms of synchronous or eager +replication and its timing aspects. + +## Terms and Definitions + +This and subsequent chapters refer to BDR nodes taking different +roles. These are implicitly assigned per transaction and are +unrelated even for concurrent transactions. + +* the *origin* is the node that receives the transaction from the + client or application. It is the node processing the transaction + first, initiating replication to other BDR nodes and responding back + to the client with a confirmation or an error. + +* a *partner* node is a BDR node expected to confirm transactions + either according to Group Commit or CAMO requirements. 
+ +* a *commit group* is the group of all BDR nodes involved in the + commit, i.e. the origin and all of its partner nodes, which may be + just a few or all peer nodes. + +## Comparison + +Most options for synchronous replication available to +BDR allow for different levels of synchronization, offering different +trade-offs between performance and protection against node or network +outages. + +The following table summarizes what a client can expect from a peer +node replicated to after having received a COMMIT confirmation from +the origin node the transaction was issued to. The "Mode" column takes +on different meaning depending on the variant. For PSR and Legacy +Synchronous Replication with BDR, it refers to the +`synchronous_commit` setting. For CAMO, it refers to the +`bdr.enable_camo` setting. Lastly, for Group Commit, it refers to the +confirmation requirements of the +[commit scope configuration](group_commit#configuration). + +| Variant | Mode | Received | Visible | Durable | +|--------------|-------------------------------|----------|----------|----------| +| async BDR | off (default) | no | no | no | +| PSR | remote_write (2) | yes | no | no (1) | +| PSR | on (2) | yes | no | yes | +| PSR | remote_apply (2) | yes | yes | yes | +| Group Commit | 'ON received' nodes | yes | no | no | +| Group Commit | 'ON replicated' nodes | yes | no | no | +| Group Commit | 'ON durable' nodes | yes | no | yes | +| Group Commit | 'ON visible' nodes | yes | yes | yes | +| CAMO | remote_write (2) | yes | no | no | +| CAMO | remote_commit_async (2) | yes | yes | no | +| CAMO | remote_commit_flush (2) | yes | yes | yes | +| Eager | n/a | yes | yes | yes | +| legacy (3) | remote_write (2) | yes | no | no | +| legacy (3) | on (2) | yes | yes | yes | +| legacy (3) | remote_apply (2) | yes | yes | yes | + +*(1) written to the OS, durable if the OS remains running and only +Postgres crashes.* + +*(2) unless switched to Local mode (if allowed) by setting 
+`synchronous_replication_availability` to `async`, otherwise the +values for the asynchronous BDR default apply.* + +*(3) not recommended, please consider using Group Commit instead.* + +Reception ensures the normally operating peer will be able to +eventually apply the transaction without requiring any further +communication, i.e. even in the face of a full or partial network +outage. A crash of a peer node may still require a re-transmission of +the transaction, as this confirmation does not involve persistent +storage. All modes considered synchronous provide this protection. + +Visibility implies the transaction was applied remotely and all other +clients will see the results of the transaction on all nodes providing +this guarantee immediately after the commit is confirmed by the origin +node. Without visibility, other clients connected may not see the +results of the transaction and experience stale reads. + +Durability relates to the peer node's storage and provides protection +against loss of data after a crash and recovery of the peer node. +This can either relate to the reception of the data (as with Physical +Streaming Replication) or to visibility (as with Group Commit, CAMO +and Eager). The former eliminates the need for retransmissions after +a crash, while the latter ensures visibility is maintained across +restarts. + +## Internal Timing of Operations + +For a better understanding of how the different modes work, it is +helpful to realize PSR and BDR apply transactions rather +differently.
+ +With physical streaming replication, the order of operations is: + +- origin flushes a commit record to WAL, making the transaction + visible locally +- peer node receives changes and issues a write +- peer flushes the received changes to disk +- peer applies changes, making the transaction visible locally + +With BDR, the order of operations is different: + +- origin flushes a commit record to WAL, making the transaction + visible locally +- peer node receives changes into its apply queue in memory +- peer applies changes, making the transaction visible locally +- peer persists the transaction by flushing to disk + +For Group Commit, CAMO, and Eager, note that the origin node waits for +a certain number of confirmations prior to making the transaction +visible locally. The order of operations is: + +- origin flushes a prepare or pre-commit record to WAL +- peer node receives changes into its apply queue in memory +- peer applies changes, making the transaction visible locally +- peer persists the transaction by flushing to disk +- origin commits and makes the transaction visible locally + +The following table summarizes the differences. + +| Variant | Order of apply vs persist | replication before or after commit | +|:-------------|:-------------------------:|:----------------------------------:| +| PSR | persist first | after WAL flush of commit record | +| BDR | apply first | after WAL flush of commit record | +| Group Commit | apply first | before COMMIT on origin | +| CAMO | apply first | before COMMIT on origin | +| Eager | apply first | before COMMIT on origin | + +## Configuration + +The following table provides an overview of which configuration +settings are required to be set to a non-default value (req) or +optional (opt), but affecting a specific variant. 
+| setting (GUC) | Group Commit | CAMO | Eager | PSR (1) | +|--------------------------------------|:-------------:|:-----:|:------:|:-------:| +| synchronous_standby_names | n/a | n/a | n/a | req | +| synchronous_commit | n/a | n/a | n/a | opt | +| synchronous_replication_availability | n/a | opt | n/a | opt | +| bdr.enable_camo | n/a | req | n/a | n/a | +| bdr.commit_scope | req | n/a | opt | n/a | +| bdr.global_commit_timeout | opt | opt | opt | n/a | + +*(1) values in this column apply also to `synchronous_commit` and +`synchronous_standby_names` being used in combination with BDR.* + +## Planned Shutdown and Restarts + + +When using Group Commit with receive confirmations or CAMO in +combination with `remote_write`, care must be taken +with planned shutdown or restart. By default, the apply queue is consumed +prior to shutting down. However, in the `immediate` shutdown mode, the queue +is discarded at shutdown, leading to the stopped node "forgetting" +transactions in the queue. A concurrent failure of the origin node could +lead to loss of data, as if both nodes failed. + +To ensure the apply queue gets flushed to disk, please use either +`smart` or `fast` shutdown for maintenance tasks. This maintains the +required synchronization level and prevents loss of data. + +## Legacy Synchronous Replication using BDR + +!!! Note + This approach is not recommended. Please consider using + [Group Commit](group_commit.md) instead. + +### Usage + +To enable synchronous replication using BDR, the application +names of the relevant BDR peer nodes need to be added to +`synchronous_standby_names`. The use of `FIRST x` or `ANY x` offers +some flexibility, if this does not conflict with the requirements of +non-BDR standby nodes. + +Once added, the level of synchronization can be configured per +transaction via `synchronous_commit`, which defaults to `on` - meaning that +adding to `synchronous_standby_names` already enables synchronous +replication.
Setting `synchronous_commit` to `local` or `off` turns +off synchronous replication. + +Due to BDR applying the transaction before persisting it, the +values `on` and `remote_apply` are equivalent (for logical +replication). + +### Migration to Group Commit + +The Group Commit feature of BDR is configured independent of +`synchronous_commit` and `synchronous_standby_names`. Instead, the +`bdr.commit_scope` GUC allows to select the scope per transaction. And +instead of `synchronous_standby_names` configured on each node +individually, Group Commit uses globally synchronized Commit Scopes. + +!!! Note + While the grammar for `synchronous_standby_names` and Commit + Scopes looks very similar, it is important to note that the former + does not account for the origin node, but the latter does. + Therefore, for example `synchronous_standby_names = 'ANY 1 (..)'` + is equivalent to a Commit Scope of `ANY 2 (...)`. This choice + makes reasoning about majority easier and reflects that the origin + node also contributes to the durability and visibility of the + transaction. diff --git a/product_docs/docs/bdr/4/eager.mdx b/product_docs/docs/bdr/4/eager.mdx new file mode 100644 index 00000000000..e90111792f6 --- /dev/null +++ b/product_docs/docs/bdr/4/eager.mdx @@ -0,0 +1,162 @@ +--- +title: Eager Replication +--- + +To prevent conflicts after a commit, set the `bdr.commit_scope` +parameter to `global`. The default setting of `local` disables eager +replication, so BDR will apply changes and resolve potential conflicts +post-commit, as described in the [Conflicts chapter](conflicts.md). + +In this mode, BDR uses two-phase commit (2PC) internally to detect and +resolve conflicts prior to the local commit. It turns a normal +`COMMIT` of the application into an implicit two-phase commit, +requiring all peer nodes to prepare the transaction, before the origin +node continues to commit it. 
If at least one node is down or +unreachable during the prepare phase, the commit will time out after +`bdr.global_commit_timeout`, leading to an abort of the +transaction. Note that there is no restriction on the use of +temporary tables, as exists in explicit 2PC in PostgreSQL. + +Once prepared, Eager All-Node Replication employs Raft to reach a +commit decision. In case of failures, this allows a remaining +majority of nodes to reach a congruent commit or abort decision so +they can finish the transaction. This unblocks the objects and +resources locked by the transaction and allows the cluster to proceed. + +In case all nodes remain operational, the origin will confirm the +commit to the client only after all nodes have committed, to ensure that the +transaction is immediately visible on all nodes after the commit. + +## Requirements + +Eager All-Node Replication uses prepared transactions internally; +therefore all replica nodes need to have a `max_prepared_transactions` +configured high enough to be able to handle all incoming transactions +(possibly in addition to local two-phase commit and CAMO transactions; +see [Configuration: Max Prepared Transactions](configuration.md#max-prepared-transactions)). +We recommend to configure it the same on all nodes, and high enough to +cover the maximum number of concurrent transactions across the cluster +for which CAMO or Eager All-Node Replication is used. Other than +that, no special configuration is required, and every BDR cluster can +run Eager All-Node transactions. + +## Usage + +To enable Eager All-Node Replication, the client needs to switch to +global commit scope at session level, or for individual transactions +as shown here: + +```sql +BEGIN; + +... other commands possible... + +SET LOCAL bdr.commit_scope = 'global'; + +... other commands possible... 
+``` + +The client can continue to simply issue a `COMMIT` at the end of the +transaction and let BDR manage the two phases: + +```sql +COMMIT; +``` + +## Error handling + +Given that BDR manages the transaction, the client only needs to check the +result of the `COMMIT` (as is advisable in any case, including single-node +Postgres). + +In case of an origin node failure, the remaining nodes will eventually +(after at least `bdr.global_commit_timeout`) decide to rollback the +globally prepared transaction. Raft prevents inconsistent commit vs. +rollback decisions. This, however, requires a majority of connected +nodes. Disconnected nodes keep the transactions prepared to be able +to eventually commit them (or rollback) as needed to reconcile with the +majority of nodes that may have decided and made further progress. + +### Eager All-Node Replication with CAMO + +Eager All-Node Replication goes beyond CAMO and implies it; there is no +need to additionally enable `bdr.enable_camo`, if `bdr.commit_scope` +is set to `global`. Nor does a CAMO pair need to be +configured via `bdr.add_camo_pair()`. + +Any other active BDR node may be used in the role of a CAMO partner to +query a transaction's status. However, this non-CAMO usage needs to +be indicated to the `bdr.logical_transaction_status` function with a +third argument of `require_camo_partner = false`. Otherwise, it may +complain about a missing CAMO configuration (which is not required for +Eager transactions). + +Other than this difference in configuration and invocation of that +function, the client needs to adhere to the protocol +described for [CAMO](camo.md). See the [reference client +implementations](camo_clients.md). + +### Limitations + +Transactions using Eager Replication cannot yet execute DDL, +nor do they support explicit two-phase commit. +These may be allowed in later releases. +Note that the TRUNCATE command is allowed.
+ +Replacing a crashed and unrecoverable BDR node with its physical +standby is not currently supported in combination with Eager All Node +transactions. + +BDR currently offers a global commit scope only; later releases +will support Eager Replication with fewer nodes for increased +availability. + +It is not possible for Eager All Node replication to be combined with +`synchronous_replication_availability = 'async'`. Trying to configure +both will yield an error. + +The Decoding Worker feature is not currently supported in combination +with Eager All Node transactions. Installations using Eager must keep +`enable_wal_decoder` disabled for the BDR node group using Eager All +Node transactions. + +Synchronous replication uses a mechanism for transaction confirmation +different from Eager. The two are not compatible and must not be used +together. Therefore, whenever using Eager All Node transactions, +please make sure none of the BDR nodes are configured in +`synchronous_standby_names`. Using synchronous replication to a +non-BDR node acting as a physical standby is well possible. + +## Effects of Eager Replication in General + +#### Increased Commit Latency + +Adding a synchronization step means additional communication between +the nodes, resulting in additional latency at commit time. Eager All +Node Replication adds roughly two network round trips (to the furthest +peer node in the worst case). Logical standby nodes and nodes still +in the process of joining or catching up are not included, but will +eventually receive changes. + +Before a peer node can confirm its local preparation of the +transaction, it also needs to apply it locally. This further adds to +the commit latency, depending on the size of the transaction. Note +that this is independent of the `synchronous_commit` setting and +applies whenever `bdr.commit_scope` is set to `global`. + +#### Increased Abort Rate + +!!! 
Note + The performance of Eager Replication is currently known to be + unexpectedly slow (around 10 TPS only). This is expected to be + improved in the next release. + +With single-node Postgres, or even with BDR in its default asynchronous +replication mode, errors at `COMMIT` time are rare. The additional +synchronization step adds a source of errors, so applications need to +be prepared to properly handle such errors (usually by applying a +retry loop). + +The rate of aborts depends solely on the workload. Large transactions +changing many rows are much more likely to conflict with other +concurrent transactions. diff --git a/product_docs/docs/bdr/4.0/feature-matrix.mdx b/product_docs/docs/bdr/4/feature-matrix.mdx similarity index 94% rename from product_docs/docs/bdr/4.0/feature-matrix.mdx rename to product_docs/docs/bdr/4/feature-matrix.mdx index 2cd059161c7..87bf88fe420 100644 --- a/product_docs/docs/bdr/4.0/feature-matrix.mdx +++ b/product_docs/docs/bdr/4/feature-matrix.mdx @@ -1,7 +1,5 @@ --- title: 'Appendix A: Feature Compatibility' -originalFilePath: feature-matrix.md - --- Some features of BDR only work on specific version of Postgres that's capable of @@ -18,6 +16,7 @@ given variant of Postgres and optionally from which version. 
| Decoding Worker | N | 13+ | 14+ | | Assesment Tooling | N | Y | 14+ | | Lag Tracker | N | Y | 14+ | +| Lag Control | N | Y | 14+ | | Timestamp Snapshots | N | Y | 14+ | | Transaction Streaming | 14+ | 13+ | 14+ | | Missing Partition Conflict | N | Y | 14+ | diff --git a/product_docs/docs/bdr/4.0/functions.mdx b/product_docs/docs/bdr/4/functions.mdx similarity index 56% rename from product_docs/docs/bdr/4.0/functions.mdx rename to product_docs/docs/bdr/4/functions.mdx index 44394ec5849..54ddbcd9588 100644 --- a/product_docs/docs/bdr/4.0/functions.mdx +++ b/product_docs/docs/bdr/4/functions.mdx @@ -1,8 +1,6 @@ --- navTitle: System Functions title: BDR System Functions -originalFilePath: functions.md - --- BDR management is primarily accomplished via SQL-callable functions. @@ -34,6 +32,16 @@ value: MAJOR_VERSION * 10000 + MINOR_VERSION * 100 + PATCH_RELEASE ``` +## System Information Functions + +### bdr.get_relation_stats + +Returns the relation information. + +### bdr.get_subscription_stats + +Returns the current subscription statistics. + ## System and Progress Information Parameters BDR exposes some parameters that can be queried via `SHOW` in `psql` @@ -250,7 +258,7 @@ bdr.wait_slot_confirm_lsn(slot_name text DEFAULT NULL, target_lsn pg_lsn DEFAULT #### Parameters - `slot_name` - name of replication slot, or if NULL, all BDR slots (only) -- `target_lsn` - LSN to wait for, or if NULL, use the current write LSN on the +- `target_lsn` - LSN to wait for, or if NULL, use the current write LSN on the local node ### bdr.wait_for_apply_queue @@ -567,7 +575,7 @@ Returns the name of the group slot on the local node. ```sql bdrdb=# SELECT bdr.local_group_slot_name(); - local_group_slot_name + local_group_slot_name ----------------------- bdr_bdrdb_bdrgroup ``` @@ -590,7 +598,7 @@ bdrdb=# SELECT bdr.node_group_type('bdrgroup'); ## Global Advisory Locks -BDR supports global advisory locks. These locks are very similar to +BDR supports global advisory locks. 
These locks are very similar to the advisory locks available in PostgreSQL except that the advisory locks supported by BDR are global in nature. They follow semantics similar to DDL locks. So an advisory lock is obtained by majority consensus and @@ -663,3 +671,541 @@ bdr.global_advisory_unlock(key1 integer, key2 integer) - `key1` - first part of the composite key. - `key2` - second part of the composite key. + +## Monitoring Functions + +### bdr.monitor_group_versions + +To provide a cluster-wide version check, this function uses +BDR version information returned from the view +`bdr.group_version_details`. + +#### Synopsis + +```sql +bdr.monitor_group_versions() +``` + +#### Notes + +This function returns a record with fields `status` and `message`, +as explained in [Monitoring]. + +This function calls `bdr.run_on_all_nodes()`. + +### bdr.monitor_group_raft + +To provide a cluster-wide Raft check, this function uses +BDR Raft information returned from view +`bdr.group_raft_details`. + +#### Synopsis + +```sql +bdr.monitor_group_raft() +``` + +#### Notes + +This function returns a record with fields `status` and `message`, +as explained in [Monitoring]. + +This function calls `bdr.run_on_all_nodes()`. + +### bdr.monitor_local_replslots + +This function uses replication slot status information returned from +view `pg_replication_slots` (slot active or inactive) to provide a +local check considering all replication slots, except the BDR group +slots. + +#### Synopsis + +```sql +bdr.monitor_local_replslots() +``` + +#### Notes + +This function returns a record with fields `status` and `message`, +as explained in [Monitoring Replication Slots](monitoring.md#monitoring-replication-slots). + +### bdr.wal_sender_stats + +If the [Decoding Worker](nodes.md#decoding-worker) is enabled, this +function shows information about the decoder slot and current LCR +(`Logical Change Record`) segment file being read by each WAL sender. 
+ +#### Synopsis + +```sql +bdr.wal_sender_stats() → setof record (pid integer, is_using_lcr boolean, decoder_slot_name TEXT, lcr_file_name TEXT) +``` + +#### Output columns + +- `pid` - PID of the WAL sender (corresponds to `pg_stat_replication`'s `pid` column) + +- `is_using_lcr` - Whether the WAL sender is sending LCR files. The next columns will be `NULL` if `is_using_lcr` is `FALSE`. + +- `decoder_slot_name` - The name of the decoder replication slot. + +- `lcr_file_name` - The name of the current LCR file. + + +### bdr.get_decoding_worker_stat + +If the [Decoding Worker](nodes.md#decoding-worker) is enabled, this function +shows information about the state of the Decoding Worker associated with the +current database. This also provides more granular information about Decoding +Worker progress than is available via `pg_replication_slots`. + +#### Synopsis + +```sql +bdr.get_decoding_worker_stat() → setof record (pid integer, decoded_upto_lsn pg_lsn, waiting BOOL, waiting_for_lsn pg_lsn) +``` + +#### Output columns + +- `pid` - the PID of the Decoding Worker (corresponds to the column `active_pid` in `pg_replication_slots`) + +- `decoded_upto_lsn` - LSN upto which the Decoding Worker has read transactional logs + +- `waiting` - whether the Decoding Worker is waiting for new WAL + +- `waiting_for_lsn` - the LSN of the next expected WAL + +#### Notes + +For further details see [Monitoring WAL senders using LCR](monitoring.md#monitoring-wal-senders-using-lcr). + +### bdr.lag_control + +If [Lag Control](lag-control.mdx#configuration) is enabled, this function +shows information about the commit delay and number of nodes conforming +to their configured lag measure for the local node and current database. 
+ +#### Synopsis + +```sql +bdr.lag_control() +``` + +#### Output columns + +- `commit_delay` - current runtime commit delay in fractional milliseconds + +- `commit_delay_maximum` - configured maximum commit delay in fractional milliseconds + +- `commit_delay_adjustment` - Change to runtime commit delay possible during + a sample interval in fractional milliseconds + +- `conforming_nodes` - current runtime number of nodes conforming to lag measures + +- `conforming_nodes_minimum` - configured minimum number of nodes required t + conform to lag measures, below which a commit delay adjustment is applied + +- `lag_bytes_threshold` - lag size at which a commit delay is applied in kilobytes + +- `lag_bytes_maximum` - configured maximum lag size in kilobytes + +- `lag_time_threshold` - lag time at which a commit delay is applied in milliseconds + +- `lag_time_maximum` - configured maximum lag time in milliseconds + +- `sample_interval` - configured minimum time between lag samples and possible + commit delay adjustments in milliseconds + + +## Internal Functions + +### BDR Message Payload Functions + +bdr.decode_message_response_payload and bdr.decode_message_payload + +These functions decode the consensus payloads to a more human-readable output. + +Used primarily by the `bdr.global_consensus_journal_details` debug view. + +### bdr.get_global_locks + +This function shows information about global locks held on the local node. + +Used to implement the `bdr.global_locks` view, to provide a more detailed +overview of the locks. + +### bdr.get_slot_flush_timestamp + +Retrieves the timestamp of the last flush position confirmation for a given +replication slot. + +Used internally to implement the `bdr.node_slots` view. + +### BDR Internal Function Replication Functions + +bdr.internal_alter_sequence_set_kind, internal_replication_set_add_table, internal_replication_set_remove_table + +Functions used internally for replication of the various function calls. 
+ +No longer used by the current version of BDR. Only exists for backwards +compatibility during rolling upgrades. + +### bdr.internal_submit_join_request + +Submits a consensus request for joining a new node. + +Needed by the BDR group reconfiguration internal mechanisms. + +### bdr.isolation_test_session_is_blocked + +A helper function, extending (and actually invoking) the original +`pg_isolation_test_session_is_blocked` with an additional check for blocks +on global locks. + +Used for isolation/concurrency tests. + +### bdr.local_node_info + +This function displays information for the local node, needed by the BDR group +reconfiguration internal mechanisms. + +The view `bdr.local_node_summary` provides similar information useful for +user consumption. + +### bdr.msgb_connect + +Function for connecting to the connection pooler of another node, +used by the consensus protocol. + +### bdr.msgb_deliver_message + +Function for sending messages to another node's connection pooler, +used by the consensus protocol. + +### bdr.peer_state_name + +This function transforms the node state (`node_state`) into a textual +representation, and is used mainly to implement the `bdr.node_summary` view. + +### bdr.request_replay_progress_update + +Requests the immediate writing of a 'replay progress update' Raft message. +It is used mainly for test purposes, but can be also used to test if the +consensus mechanism is working. + +### bdr.seq_nextval + +Internal implementation of sequence increments. + +This function will be used instead of standard `nextval` in queries which +interact with [BDR Global Sequences]. + +#### Notes + +The following are also internal BDR sequence manipulation functions. +`bdr.seq_currval` and `bdr.sql_lastval` are used automatically. + +### bdr.show_subscription_status + +Retrieves information about the subscription status, and is used mainly to +implement the `bdr.subscription_summary` view. 
+ +### bdr.conflict_resolution_to_string + +Transforms the conflict resolution from oid to text. + +The view `bdr.conflict_history_summary` uses this to give user-friendly information for the +conflict resolution. + +### bdr.conflict_type_to_string + +Transforms the conflict type from oid to text. + +The view `bdr.conflict_history_summary` uses this to give user-friendly information for the +conflict resolution. + +### bdr.get_node_conflict_resolvers + +Display a text string of all the conflict resolvers on the local node. + +### bdr.reset_subscription_stats + +Returns a boolean result after resetting the statistics created by subscriptions, +as viewed by bdr.stat_subscription. + +### bdr.reset_relation_stats + +Returns a boolean result after resetting the relation stats, +as viewed by bdr.stat_relation. + +### bdr.pg_xact_origin + +Return origin id of a given transaction. + +#### Synopsis + +```sql +bdr.pg_xact_origin(xmin xid) +``` + +#### Parameters + +- `xid` - Transaction id whose origin is returned + +### bdr.difference_fix_origin_create + +Creates a replication origin with a given name passed as argument, but adding a `bdr_` prefix. +It returns the internal id of the origin. This performs same functionality +as `pg_replication_origin_create()`, except this requires `bdr_superuser` +rather than postgres superuser permissions. + +#### Synopsis + +### bdr.difference_fix_session_setup + +Marks the current session as replaying from the current origin. +The function uses the pre-created `bdr_local_only_origin` local +replication origin implicitly for the session. It allows replay +progress to be reported. It returns void. This performs the +same functionality as `pg_replication_origin_session_setup()`, +except that this requires `bdr_superuser` rather than postgres +superuser permissions. Note that the earlier form of the function: +`bdr.difference_fix_session_setup(text)` has been deprecated and will be +removed in upcoming releases. 
+ +#### Synopsis + +```sql +bdr.difference_fix_session_setup() +``` + +### bdr.difference_fix_session_reset + +Marks the current session as not replaying from any origin, essentially +resetting the effect of `bdr.difference_fix_session_setup()`. +It returns void. This performs the same functionality as +`pg_replication_origin_session_reset()`, except this requires +`bdr_superuser` rather than postgres superuser permissions. + +#### Synopsis + +```sql +bdr.difference_fix_session_reset() +``` + +### bdr.difference_fix_xact_set_avoid_conflict + +Marks the current transaction as replaying a transaction that has +committed at LSN '0/0' and timestamp '2000-01-01'. This performs +the same functionality as +`pg_replication_origin_xact_setup('0/0', '2000-01-01')`, +except this requires `bdr_superuser` rather than postgres superuser +permissions. + +#### Synopsis + +```sql +bdr.difference_fix_xact_set_avoid_conflict() +``` + +### bdr.resynchronize_table_from_node(node_name name, relation regclass) + +Resynchronizes the relation from a remote node. + +#### Synopsis + +```sql +bdr.resynchronize_table_from_node(node_name name, relation regclass) +``` + +#### Parameters + +- `node_name` - the node from which to copy/resync the relation data. +- `relation` - the relation to be copied from the remote node. + +#### Notes + +This acquires a global DML lock on the relation, truncates the relation +locally, and copies data into it from the remote node. + +The relation must exist on both nodes with the same name and definition. + +Resynchronization of partitioned tables with identical partition definitions, +resynchronization partitioned table to non-partitioned table and vice-versa and +resynchronization of referenced tables by temporarily dropping and recreating +foreign key constraints are all supported. 
+ +After running the function on a referenced table, if the referenced column +data no longer matches the referencing column values, it throws an error and +function should be rerun after resynchronizing the referencing table data. + +Furthermore, it supports resynchronization of table with generated columns, by +computing the generated column values locally, after copying the data from +remote node. + +Currently, row_filters are ignored by this function. + +The `bdr.resynchronize_table_from_node` function can be only executed by +the owner of the table, provided the owner has bdr_superuser privileges. + +### bdr.consensus_kv_store + +Stores value in the consistent KV Store. + +Returns timestamp of the value expiration time. This depends on `ttl`, if `ttl` +is `NULL`, then this will return `infinity` if the value was deleted it will +return `-infinity`. + +#### Synopsis + +```sql +bdr.consensus_kv_store(key text, value jsonb, + prev_value jsonb DEFAULT NULL, ttl int DEFAULT NULL) +``` + +#### Parameters + +- `key` - an arbitrary unique key to insert, update or delete. +- `value` - json value to store, if NULL, any existing record will be deleted +- `prev_value` - if set, the write operation is only done if the current value + is equal to `prev_value`. +- `ttl` - time to live of the new value, in milliseconds. + +#### Notes + +This is an internal function, mainly used by HARP. + +!!! Warning + This function should never be used by user applications. + +### bdr.consensus_kv_fetch + +Fetch value from the consistent KV Store in json format. + +#### Synopsis + +```sql +bdr.consensus_kv_fetch(IN key text) RETURNS jsonb +``` + +#### Parameters + +- `key` - an arbitrary key to fetch. + +#### Notes + +This is an internal function, mainly used by HARP. + +!!! Warning + This function should never be used by user applications. 
+ + +### bdr.alter_subscription_skip_changes_upto + +Because logical replication can replicate across versions, doesn't replicate +global changes like roles, and can replicate selectively, sometimes the logical +replication apply process can encounter an error and stop applying changes. + +Wherever possible such problems should be fixed by making changes to the +target side. `CREATE`ing any missing table that's blocking replication, +`CREATE` a needed role, `GRANT` a necessary permission, etc. But occasionally a +problem can't be fixed that way and it may be necessary to skip entirely over a +transaction. +Changes are skipped as entire transactions, all or nothing. To decide where to +skip to, use log output to find the commit LSN, per the example below, or peek +the change stream with the logical decoding functions. + +Unless a transaction only made one change, it's often necessary to manually +apply the transaction's effects on the target side, so it's important to +save the problem transaction whenever possible. See the example below. + +It's possible to skip over changes without +`bdr.alter_subscription_skip_changes_upto` by using +`pg_catalog.pg_logical_slot_get_binary_changes` to skip to the LSN of interest, +so this is really a convenience function. It does do a faster skip; however, it +may bypass some kinds of errors in logical decoding. + +This function only works on disabled subscriptions. + +The usual sequence of steps is: + +* identify the problem subscription and LSN of the problem commit +* disable the subscription +* save a copy of the transaction(s) using `pg_catalog.pg_logical_slot_peek_changes` *on the source node* (if possible) +* `bdr.alter_subscription_skip_changes_upto` on the target node +* apply repaired or equivalent changes on the target manually, if necessary +* re-enable the subscription + +!!! Warning + It's easy to make problems worse when using this function. Don't + do anything unless you're really, really sure it's the only option. 
+ +#### Synopsis + +```sql + bdr.alter_subscription_skip_changes_upto( + subname text, + skip_upto_and_including pg_lsn + ); +``` + +#### Example + +Apply of a transaction is failing with an ERROR, and you've determined that +lower-impact fixes such as changes on the target side will not resolve this +issue. You determine that you must skip the transaction. + +In the error logs, find the commit record LSN to skip to, as in this +artificial example: + +``` +ERROR: XX000: CONFLICT: target_table_missing; resolver skip_if_recently_dropped returned an error: table does not exist +CONTEXT: during apply of INSERT from remote relation public.break_me in xact with commit-end lsn 0/300AC18 xid 131315 +committs 2021-02-02 15:11:03.913792+01 (action #2) (effective sess origin id=2 lsn=0/300AC18) +while consuming 'I' message from receiver for subscription bdr_regression_bdrgroup_node1_node2 (id=2667578509) +on node node2 (id=3367056606) from upstream node node1 (id=1148549230, reporiginid=2) +``` + +In this portion of log we have the information we need: +the_target_lsn: **0/300AC18** +the_subscription: **bdr_regression_bdrgroup_node1_node2** + +Next, disable the subscription so the apply worker doesn't try to connect to the replication slot: + +```sql + SELECT bdr.alter_subscription_disable('the_subscription'); +``` + +Note that you cannot skip only parts of the transaction, it's all or nothing. So +it's strongly recommended that you save a record of it by `COPY`ing it out on the +provider side first, using the subscription's slot name. 
+ +```sql + \\copy (SELECT * FROM pg_catalog.pg_logical_slot_peek_changes('the_slot_name', + 'the_target_lsn', NULL, 'min_proto_version', '1', 'max_proto_version', '1', + 'startup_params_format', '1', 'proto_format', 'json')) + TO 'transaction_to_drop.csv' WITH (FORMAT csv); +``` + +Note that the example is broken into multiple lines for readability, +but it should be issued in a single line because `\copy` does not +support multi-line commands. + +Now you can skip the change by changing "peek" to "get" above, but +`bdr....skip_changes_upto` does a faster skip that avoids decoding +and outputting all the data: + +```sql + SELECT bdr.alter_subscription_skip_changes_upto('subscription_name', + 'the_target_lsn'); +``` + +If necessary or desired, apply the same changes (or repaired versions of them) +manually to the target node, using the dumped transaction contents as a guide. + +Finally, re-enable the subscription: + +```sql + SELECT bdr.alter_subscription_enable('the_subscription'); +``` diff --git a/product_docs/docs/bdr/4/group-commit.mdx b/product_docs/docs/bdr/4/group-commit.mdx new file mode 100644 index 00000000000..800bc22dcc3 --- /dev/null +++ b/product_docs/docs/bdr/4/group-commit.mdx @@ -0,0 +1,177 @@ +--- +title: Group Commit +--- + +The goal of Group Commit is to provide protection against data loss +in case of single node failures or temporary outages. This is achieved +by requiring more than one BDR node to successfully receive and +confirm a transaction at COMMIT time. + +## Requirements + +During normal operation, Group Commit it completely transparent to the +application. Upon failover, the reconciliation phase needs to be +explicitly triggered or consolidated by either the application or a +proxy in between. HARP provides native support for Group Commit and +will trigger the reconciliation phase, making this equally transparent +to the client. + +On the origin node, a transaction committed with Group Commit uses +two-phase commit underneath. 
Therefore, `max_prepared_transactions` +needs to be configured high enough to handle all such transactions +originating per node. + +## Configuration + +To use Group Commit, a Commit Scope first needs to be defined. This +determines the BDR nodes involved in the commit of a transaction. +Once a scope is established, a transaction may be configured to use +Group Commit as follows: + +```sql +BEGIN; +SET LOCAL bdr.commit_scope = 'example_scope'; +... +COMMIT; +``` + +To complete this example, the Commit Scope in question may previously +have been defined as: + +```sql +SELECT bdr.add_commit_scope( + commit_scope_name := 'example_scope', + origin_node_group := 'example_bdr_group', + rule := 'ANY 2 (example_bdr_group)' +); +``` + +This assumes a *Node Group* named `example_bdr_group` exists and +includes at least two BDR nodes as members (either directly or in +sub-groups). Any transaction committed within the `example_scope` +would require one extra confirmation from a BDR node within the group. +Together with the origin node, this accounts for "ANY 2" nodes out of +the group, on which the transaction is guaranteed to be visible and +durable after the commit. + +### Origin Groups + +Rules for Commit Scopes may depend on which node the transaction is +committed on, i.e. which node acts as the origin for the transaction. +To make this transparent for the application, BDR allows a Commit +Scope to define different rules depending on where the transaction +originates from. + +For example, consider a BDR cluster with nodes spread across two data +centers, a left and a right one. Assume the top-level BDR node group +is called `top_group`. The following commands may be used to setup +sub-groups and create a Commit Scope requiring all nodes in the local +data center to confirm the transaction, but only one node from the +remote one. 
+ +```sql +-- create sub-groups +SELECT bdr.create_node_group(node_group_name := 'left_dc', + parent_group_name := 'top_group', + join_node_group := false); +SELECT bdr.create_node_group(node_group_name := 'right_dc', + parent_group_name := 'top_group', + join_node_group := false); + +-- create a commit scope with individual rules for each sub-group +SELECT bdr.add_commit_scope( + commit_scope_name := 'example_scope', + origin_node_group := 'left_dc', + rule := 'ALL (left_dc) AND ANY 1 (right_dc)' +); +SELECT bdr.add_commit_scope( + commit_scope_name := 'example_scope', + origin_node_group := 'right_dc', + rule := 'ANY 1 (left_dc) AND ALL (right_dc)' +); +``` + +### Confirmation Levels + +BDR nodes can send confirmations for a transaction at different points +in time, similar to [Commit At Most Once](camo.md). In increasing +levels of protection, from the perspective of the confirming node, +these are: + +* `received` - a remote BDR node confirms the transaction immediately + after having received it, prior to starting local application +* `replicated` - confirm after applying changes of the transaction, + but before flushing them to disk +* `durable` - confirm the transaction after all of its changes have + been flushed to disk +* `visible` (default) - confirm the transaction after all of its + changes have been flushed to disk and it has been made visible to + concurrent transactions. 
+ +In rules for Commit Scopes, these confirmation levels may be appended +to the node group definition (in parenthesis) with `ON` as follows: + +* `ANY 1 (right_dc) ON remote_write` +* `ALL (left_dc) ON remote_commit_flush` (default and may as well be + omitted) +* `ALL (left_dc) ON remote_commit_async AND ANY 1 (right_dc) ON remote_write` + +## Reference + +### Commit Scope Grammar + +For reference, the grammar for commit scopes is as follows: + +``` +commit_scope: confirmation + | confirmation AND commit_scope + +confirmation: node_def (ON [received|replicated|durable|visible]) + +node_def: ANY num (node_group [, ...]) + | MAJORITY (node_group [, ...]) + | ALL (node_group [, ...]) +``` + +### Adding a commit scope rule + +The function `bdr.add_commit_scope` creates a rule for the given +commit scope name and origin node group. If the rule is the same for +all nodes in the BDR cluster, invoking this function once for the +top-level node group is sufficient to fully define the commit scope. + +Alternatively, it may be invoked multiple times with the same +`commit_scope_name` but different origin node groups and rules for +commit scopes that vary depending on the origin of the transaction. + +#### Synopsis + +```sql +bdr.add_commit_scope(commit_scope_name NAME, origin_node_group NAME, + rule TEXT) +``` + +### Changing a commit scope rule + +To change a specific rule for a single origin node group within a +commit scope, the function `bdr.alter_commit_scope` may be used. + +#### Synopsis + +```sql +bdr.alter_commit_scope(commit_scope_name NAME, origin_node_group NAME, + rule TEXT) +``` + +### Removing a commit scope rule + +The `bdr.remove_commit_scope` can be used to drop a single rule within +a commit scope. If multiple rules are defined for the commit scope, +this function must be invoked once per rule to fully remove the entire +commit scope. 
+ +#### Synopsis + +```sql +bdr.remove_commit_scope(commit_scope_name NAME, origin_node_group NAME) +``` diff --git a/product_docs/docs/bdr/4.0/img/bdr.png b/product_docs/docs/bdr/4/img/bdr.png similarity index 100% rename from product_docs/docs/bdr/4.0/img/bdr.png rename to product_docs/docs/bdr/4/img/bdr.png diff --git a/product_docs/docs/bdr/4.0/img/frontpage.svg b/product_docs/docs/bdr/4/img/frontpage.svg similarity index 100% rename from product_docs/docs/bdr/4.0/img/frontpage.svg rename to product_docs/docs/bdr/4/img/frontpage.svg diff --git a/product_docs/docs/bdr/4.0/img/nodes.png b/product_docs/docs/bdr/4/img/nodes.png similarity index 100% rename from product_docs/docs/bdr/4.0/img/nodes.png rename to product_docs/docs/bdr/4/img/nodes.png diff --git a/product_docs/docs/bdr/4.0/img/nodes.svg b/product_docs/docs/bdr/4/img/nodes.svg similarity index 100% rename from product_docs/docs/bdr/4.0/img/nodes.svg rename to product_docs/docs/bdr/4/img/nodes.svg diff --git a/product_docs/docs/bdr/4.0/index.mdx b/product_docs/docs/bdr/4/index.mdx similarity index 95% rename from product_docs/docs/bdr/4.0/index.mdx rename to product_docs/docs/bdr/4/index.mdx index f3db6e66998..503c91b8b97 100644 --- a/product_docs/docs/bdr/4.0/index.mdx +++ b/product_docs/docs/bdr/4/index.mdx @@ -2,7 +2,7 @@ navTitle: BDR navigation: - index - - release-notes + - release_notes - overview - appusage - configuration @@ -15,29 +15,26 @@ navigation: - crdt - transaction-streaming - durability + - group-commit - eager - camo + - lag-control - scaling - tssnapshots - repsets - striggers - - backup - - upgrades - twophase - catalogs - - monitoring - functions - feature-matrix - - isolation_details - known-issues - - camo_clients title: BDR (Bi-Directional Replication) directoryDefaults: description: >- BDR (Bi-Directional Replication) is a ground-breaking multi-master replication capability for PostgreSQL clusters that has been in full production status since 2014. 
-originalFilePath: index.md + --- diff --git a/product_docs/docs/bdr/4/known-issues.mdx b/product_docs/docs/bdr/4/known-issues.mdx new file mode 100644 index 00000000000..4ed5e6ed68e --- /dev/null +++ b/product_docs/docs/bdr/4/known-issues.mdx @@ -0,0 +1,112 @@ +--- +title: 'Appendix B: Known Issues' + + +--- + +This section discusses currently known issues in BDR4. + +## Data Consistency + +Please remember to read about [Conflicts](conflicts) to understand +the implications of the asynchronous operation mode in terms of data +consistency. + +## List of Issues + +In this section we list a number of known issues that are tracked in BDR's +ticketing system, that are expected to be re-solved in one of the future +releases. + +- If the resolver for the `update_origin_change` conflict + is set to `skip`, and `synchronous_commit=remote_apply` is used, and + concurrent updates of the same row are repeatedly applied on two + different nodes, then one of the update statements might hang due + to a deadlock with the BDR writer. As mentioned in the + [Conflicts](conflicts) chapter, `skip` is not the default + resolver for the `update_origin_change` conflict, and this + combination is not intended to be used in production: it discards + one of the two conflicting updates based on the order of arrival + on that node, which is likely to cause a divergent cluster. + In the rare situation that you do choose to use the `skip` + conflict resolver, please note the issue with the use of the + `remote_apply` mode. + +- Decoding Worker feature does not work with CAMO/EAGER/Group Commit. + Installations using CAMO/Eager/Group Commit must keep `enable_wal_decoder` + disabled. + +- Decoding Worker works only with the default replication sets + +- Lag Control does not adjust commit delay in any way on a fully + isolated node, i.e. in case all other nodes are unreachable or not + operational. 
As soon as at least one node is connected, Replication + Lag Control will pick up its work and adjust the BDR commit delay, + again. + +- For time based Lag Control, BDR currently uses the lag time (measured + by commit timestamps) rather than the estimated catchup time that is + based on historic apply rate. + +- Changing the CAMO partners in a CAMO pair is not currently possible. + It's only possible to add or remove a pair. + Adding or removing a pair does not need a restart of Postgres or even a + reload of the configuration. + +- Group Commit cannot be combined with [CAMO](camo) or [Eager All Node + replication](eager). Eager Replication currently only works by using the + "global" BDR commit scope. + +- Neither Eager replication nor Group Commit support + `synchronous_replication_availability = 'async'`. + +- Group Commit does not support support for a timeout of the + commit after `bdr.global_commit_timeout`. + +- Transactions using Eager Replication cannot yet execute DDL, + nor do they support explicit two-phase commit. + Note that the TRUNCATE command is allowed. + +- Not all DDL can be run when either CAMO or Group Commit is used. + +- Parallel apply is not currently supported in combination with Group + Commit, please make sure to disable it when using Group Commit by + either setting `num_writers` to 1 for the node group (using + [`bdr.alter_node_group_config`](nodes#bdralter_node_group_config)) or + via the GUC `bdr.writers_per_subscription` (see + [Configuration of Generic Replication](configuration#generic-replication)). + +- There currently is no protection against altering or removing a commit + scope. Running transactions in a commit scope that is concurrently + being altered or removed may lead to the transaction blocking or + replication stalling completely due to an error on the downstream node + attempting to apply the transaction. 
Please ensure that any transactions + using a specific commit scope have finished before altering or removing it. + +## List of Limitations + +This section contains a (non-comprehensive) list of design limitations that are +expected and are by design, and hence are not expected to be resolved in the +future. + +- Replacing a node with it's physical standby does not work for nodes that + use CAMO/Eager/Group Commit. Combining physical standbys and BDR is in + general not recommended, even if otherwise possible. + +- A `galloc` sequence might skip some chunks if the + sequence is created in a rolled back transaction and then created + again with the same name, or if it is created and dropped when DDL + replication is not active and then it is created again when DDL + replication is active. + The impact of the problem is mild, because the sequence + guarantees are not violated; the sequence will only skip some + initial chunks. Also, as a workaround the user can specify the + starting value for the sequence as an argument to the + `bdr.alter_sequence_set_kind()` function. + +- Legacy BDR synchronous replication uses a mechanism for transaction + confirmation different from the one used by CAMO, Eager and Group Commit . + The two are not compatible and must not be used together. Therefore, nodes + that appear in `synchronous_standby_names` must not be part of CAMO, Eager + or Group Commit configuration. Using synchronous replication to other nodes, + including both logical and physical standby is possible. diff --git a/product_docs/docs/bdr/4/lag-control.mdx b/product_docs/docs/bdr/4/lag-control.mdx new file mode 100644 index 00000000000..1b34a7ae0d2 --- /dev/null +++ b/product_docs/docs/bdr/4/lag-control.mdx @@ -0,0 +1,182 @@ +--- +title: Lag Control +--- + +Data throughput of database applications on a BDR origin node can +exceed the rate at which committed data can be safely replicated to +downstream peer nodes. 
If this disparity persists beyond a period of +time, or chronically, in high availability applications then +organizational objectives related to disaster recovery or business +continuity plans may not be satisfied. + +The Replication Lag Control (RLC) feature is designed to regulate this +imbalance using a dynamic rate limiting device so that data flow between +BDR group nodes complies with these organizational objectives. It does so +by controlling the extent of replication lag between BDR nodes. + +Some of these objectives include the following: + +1) Recovery Point Objective (RPC) specifies the maximum tolerated + amount of data that can be lost due to unplanned events usually + expressed as an amount of time. In non-replicated systems, RPC + is used to set backup intervals to limit the risk of lost committed + data due to failure. For replicated systems, RPC determines the + acceptable amount of committed data that has not been safely applied + to one or more peer nodes. + +2) Resource Constraint Objective (RCO) acknowledges that there are finite + storage constraints. This storage includes database files, WAL and + temporary or intermediate files necessary for continued operation. + For replicated systems, as lag increases the demands on these storage + resources also increase. + +3) Group Elasticity Objective (GEO) insures that any node is not + originating new data at a clip that cannot be acceptably saved to + its peer nodes. When that is the case then the detection of that + condition may be used as one metric in the decision to expand the + number of database nodes. Similarly, when that condition abates then + it may influence the decision to contract the number of database nodes. + +Lag Control manages replication lag by controlling the rate at which client +connections may commit READ WRITE transactions. Replication lag is +measured either as lag time or lag size depending on the objectives +to be met. 
Transaction commit rate is regulated using a configured +BDR commit delay time. + +## Requirements + +To get started using Lag Control take the following steps: + +1) Determine the maximum acceptable commit delay time + `bdr.lag_control_max_commit_delay` that can be tolerated for all + database applications. + +2) Decide which lag measure is to be used. Either lag size + `bdr.lag_control_max_lag_size` or lag time `bdr.lag_control_max_lag_time` + must be chosen. + +3) Decide on the number of BDR nodes in the group + `bdr.lag_control_min_conforming_nodes` that must satisfy the lag + measure chosen in step 2) as the minimal acceptable number of nodes. + +## Configuration + +Lag Control is configured on each BDR node in the group using `postgresql.conf` +configuration parameters. For Lag Control to be enabled, +`bdr.lag_control_max_commit_delay` and either `bdr.lag_control_max_lag_size` or +`bdr.lag_control_max_lag_time` must be set to positive non-zero values. + +`bdr.lag_control_max_commit_delay` allows, and encourages, a specification +of milliseconds with a fractional part including a sub-millisecond +setting if appropriate. + +By default, `bdr.lag_control_min_conforming_nodes` is set to one (1). +For a complete list, see [Lag Control](configuration.md) + +## Overview + +Lag Control is a dynamic TPS rate limiting mechanism that operates at the client +connection level. It is designed to be as unobtrusive as possible while +satisfying configured Lag Control constraints. This means that if enough +BDR nodes can replicate changes fast enough to remain below configured +lag measure thresholds then the BDR runtime commit delay will stay fixed +at 0 milliseconds. + +If this is not the case, the BDR runtime commit delay will be minimally +adjusted as high as necessary, but no higher, until the number of +conforming nodes is returned to the minimum threshold. 
+ +Even after the minimum node threshold is reached, Lag Control continues to attempt +to drive the BDR runtime commit delay back to zero. The BDR commit delay +may rise and fall around an equilibrium level most of the time, but if +data throughput or lag apply rates improve then the commit delay will +decrease over time. + +The BDR commit delay is a post commit delay. It occurs after the transaction +has committed and after all Postgres resources locked or acquired by the +transaction have been released. Therefore, the delay does not prevent +concurrent active transactions from observing or modifying its values nor +acquiring its resources. The same guarantee cannot be made for external +resources managed by Postgres extensions. Notwithstanding extension +dependencies, the same guarantee can be made if the BDR extension is listed +before extension-based resource managers in `postgresql.conf`. + +Strictly speaking, the BDR commit delay is not a per transaction delay. +It is the mean value of commit delays over a stream of transactions for a +particular client connection. This technique allows the commit delay and +fine-grained adjustments of the value to escape the coarse granularity of +OS schedulers, clock interrupts and variation due to system load. It also +allows the BDR runtime commit delay to settle within microseconds of the +lowest duration possible to maintain a lag measure threshold. + +Finally, the BDR commit delay should not be conflated with the Postgres +commit delay; they are unrelated, perform different functions and should +not be substituted for each other. + +## Transaction Application + +The BDR commit delay is applied to all READ WRITE transactions that +modify data for user applications. This implies that any transaction +that does not modify data, including declared READ WRITE transactions, +is exempt from the commit delay. + +Asynchronous transaction commit also executes a BDR commit delay. 
This +may appear counterintuitive, but asynchronous commit, by virtue of its +performance, can be one of the greatest sources of replication lag. + +Postgres and BDR auxiliary processes do not delay at transaction commit. +Most notably, BDR writers do not execute a commit delay when applying +remote transactions on the local node. This is by design as BDR writers +contribute nothing to outgoing replication lag and can reduce incoming +replication lag the most by not having their transaction commits throttled +by a delay. + +## Limitations + +The maximum commit delay `bdr.lag_control_max_commit_delay` is a ceiling +value representing a hard limit. This means that under no circumstance +does a commit delay ever exceed the configured value. Conversely, the +maximum lag measures `bdr.lag_control_max_lag_size` and +`bdr.lag_control_max_lag_time` are soft limits that can be exceeded. +When the maximum commit delay is reached, there exists no additional +back-pressure on the lag measures to prevent their continued increase. + +There is no way to exempt origin transactions that do not modify BDR +replication sets from the commit delay. For these transactions, it +may be useful to SET LOCAL the maximum transaction delay to zero (0). + +## Caveats + +Application TPS is one of many factors that can affect replication lag. +Other factors include the average size of transactions for which BDR commit +delay can be less efficacious. In particular, bulk load operations can +cause replication lag to rise, which can trigger a concomitant rise in +the BDR runtime commit delay beyond the level reasonably expected by normal +applications although still under the maximum allowed delay. + +Similarly, an application with a very high OLTP requirement and modest +data changes could be unduly restrained by the acceptable BDR commit delay +setting. 
+ +In these cases, it may be useful to use `SET [SESSION|LOCAL]` command to +custom configure Lag Control settings for those applications or modify +those applications. For example, bulk load operations are sometimes split +into multiple, smaller transactions to limit transaction snapshot duration +and WAL retention size or establish a restart point if the bulk load fails. +In a bow to Lag Control, those transaction commits can also schedule very +long BDR commit delays to allow digestion of the lag contributed by the +prior partial bulk load. + +## Meeting Organizational Objectives + +In the example objectives listed earlier: + +- RPO can be met by setting an appropriate maximum lag time; +- RCO can be met by setting an appropriate maximum lag size; +- GEO can be met by monitoring the BDR runtime commit delay + and the BDR runtime lag measures. + + As mentioned, when the maximum BDR runtime commit delay is + pegged at the BDR configured commit delay limit and the lag + measures consistently exceed their BDR configured maximum + levels, it can be a marker for BDR group expansion. diff --git a/product_docs/docs/bdr/4.0/nodes.mdx b/product_docs/docs/bdr/4/nodes.mdx similarity index 97% rename from product_docs/docs/bdr/4.0/nodes.mdx rename to product_docs/docs/bdr/4/nodes.mdx index 3dc82432eb3..15809b6f8a3 100644 --- a/product_docs/docs/bdr/4.0/nodes.mdx +++ b/product_docs/docs/bdr/4/nodes.mdx @@ -1,6 +1,6 @@ --- title: Node Management -originalFilePath: nodes.md + --- @@ -28,7 +28,7 @@ or URI format: `postgresql://myhost:5432/mydb`. The SQL function `bdr.create_node_group()` is used to create the BDR group from the local node. Doing so activates BDR on that node and allows other nodes to join the BDR group (which consists of only one node at that point). -At the time of creation, you must specify the connection string that other +At the time of creation, you must specify the connection string that other nodes will use to connect to this node. 
Once the node group is created, every further node can join the BDR @@ -66,7 +66,7 @@ The schema synchronization can be optionally skipped using `synchronize_structur parameter of `bdr.join_node_group()` function in which case the schema must exist on the newly joining node already. -We recommend that the source node which has the best connection (i.e. is +We recommend that the source node which has the best connection (i.e. is closest) is selected as the source node for joining, since that lowers the time needed for the join to finish. @@ -237,7 +237,7 @@ slot was last advanced. In extreme cases, this may require a full 16 MB before slots are synced/created on the streaming replica. If a failover or switchover occurs during this interval, the streaming standby cannot be promoted to replace its BDR node, as the -group slot and other dependent slots do not exist yet. +group slot and other dependent slots do not exist yet. The slot sync-up process on the standby solves this by invoking a function on the upstream. This function moves the group slot in the @@ -300,14 +300,14 @@ ensure that divergent nodes are never promoted. ## Physical Standby Nodes -BDR also enables the creation of traditional physical standby failover -nodes. These are commonly intended to directly replace a BDR +BDR also enables the creation of traditional physical standby failover +nodes. These are commonly intended to directly replace a BDR node within the cluster after a short promotion procedure. As with any standard Postgres cluster, a node may have any number of these physical replicas. 
-There are, however, some minimal prerequisites for this to work properly -due to the use of replication slots and other functional requirements in +There are, however, some minimal prerequisites for this to work properly +due to the use of replication slots and other functional requirements in BDR: - The connection between BDR Primary and Standby uses streaming @@ -325,20 +325,20 @@ BDR: - `bdr.standby_slot_names` should specify the physical replication slot used for the Standby's `primary_slot_name`. -While this is enough to produce a working physical standby of a BDR +While this is enough to produce a working physical standby of a BDR node, there are some additional concerns that should be addressed. -Once established, the Standby requires sufficient time and WAL traffic -to trigger an initial copy of the Primary's other BDR-related -replication slots, including the BDR group slot. At minimum, slots on a -Standby are only "live" and will survive a failover if they report +Once established, the Standby requires sufficient time and WAL traffic +to trigger an initial copy of the Primary's other BDR-related +replication slots, including the BDR group slot. At minimum, slots on a +Standby are only "live" and will survive a failover if they report a non-zero `confirmed_flush_lsn` as reported by `pg_replication_slots`. -As a consequence, physical standby nodes in newly initialized BDR -clusters with low amounts of write activity should be checked before -assuming a failover will work normally. Failing to take this precaution -can result in the Standby having an incomplete subset of required -replication slots necessary to function as a BDR node, and thus an +As a consequence, physical standby nodes in newly initialized BDR +clusters with low amounts of write activity should be checked before +assuming a failover will work normally. 
Failing to take this precaution +can result in the Standby having an incomplete subset of required +replication slots necessary to function as a BDR node, and thus an aborted failover. The protection mechanism that ensures physical standby nodes are up to date @@ -365,30 +365,30 @@ the Primary: 2. Inform the BDR cluster of the change in address by executing the [bdr.alter_node_interface] function on all other BDR nodes. -Once this is done, the other BDR nodes will re-establish communication -with the newly promoted Standby -> Primary node. Since replication -slots are only synchronized periodically, this new Primary may reflect -a lower LSN than expected by the existing BDR nodes. If this is the +Once this is done, the other BDR nodes will re-establish communication +with the newly promoted Standby -> Primary node. Since replication +slots are only synchronized periodically, this new Primary may reflect +a lower LSN than expected by the existing BDR nodes. If this is the case, BDR will fast-forward each lagging slot to the last location used by each BDR node. Take special note of the `bdr.standby_slot_names` parameter as -well. Tt is important to set in a BDR cluster where there is a -Primary -> Physical Standby relationship or when using subscriber-only groups. +well. It is important to set in a BDR cluster where there is a +Primary -> Physical Standby relationship or when using subscriber-only groups. -BDR maintains a group slot that always reflects the state of the +BDR maintains a group slot that always reflects the state of the cluster node showing the most lag for any outbound replication. With the addition of a physical -replica, BDR must be informed that there is a non-participating node +replica, BDR must be informed that there is a non-participating node member that will, regardless, affect the state of the group slot. 
-Since the Standby does not directly communicate with the other BDR -nodes, the `standby_slot_names` parameter informs BDR to consider named -slots as necessary constraints on the group slot as well. When set, the -group slot will be held if the Standby shows lag, even if the group +Since the Standby does not directly communicate with the other BDR +nodes, the `standby_slot_names` parameter informs BDR to consider named +slots as necessary constraints on the group slot as well. When set, the +group slot will be held if the Standby shows lag, even if the group slot would have normally been advanced. -As with any physical replica, this type of standby may also be +As with any physical replica, this type of standby may also be configured as a synchronous replica. As a reminder, this requires: - On the Standby: @@ -556,7 +556,7 @@ Catching-up will continue for a period of time that depends upon the amount of missing data from each peer node, which will likely increase over time, depending upon the server workload. -If the amount of write activity on each node is not uniform, the catchup period +If the amount of write activity on each node is not uniform, the catchup period from nodes with more data could take significantly longer than other nodes. Eventually, the slot state will change to `bdr.node_slots.state` = `streaming`. @@ -578,7 +578,7 @@ PANIC: could not write to file "pg_wal/xlogtemp.559": No space left on device ...or report other out-of-disk related symptoms. -In addition, slots for offline nodes also hold back the catalog xmin, preventing +In addition, slots for offline nodes also hold back the catalog xmin, preventing vacuuming of catalog tables. On EDB Postgres Extended Server and EDB Postgres Advanced Server, offline nodes @@ -646,7 +646,7 @@ The group slot name is given by the function `bdr.local_group_slot_name()`. 
The group slot can: -- join new nodes to the BDR group without having all existing nodes +- join new nodes to the BDR group without having all existing nodes up and running (although the majority of nodes should be up), without incurring data loss in case the node which was down during join starts replicating again @@ -725,7 +725,7 @@ DROP EXTENSION bdr; If the database depends on some BDR-specific objects, then the BDR extension cannot be dropped. Examples include: -- Tables using BDR-specific sequences such as timeshard or galloc +- Tables using BDR-specific sequences such as SnowflakeId or galloc - Column using CRDT data types - Views that depend on some BDR catalog tables @@ -901,9 +901,9 @@ which case only metadata for that specific node is removed. !!! Note BDR4 can have a maximum of 1024 node records (both ACTIVE and PARTED) at one time. This is because each node has a unique sequence number - assigned to it, for use by timeshard sequences. PARTED nodes are not - automatically cleaned up at the moment; should this become a problem, - this function can be used to remove those records. + assigned to it, for use by snowflakeid and timeshard sequences. + PARTED nodes are not automatically cleaned up at the moment; should this + become a problem, this function can be used to remove those records. ### bdr.create_node_group @@ -952,7 +952,7 @@ The group creation does not hold any locks. ### bdr.alter_node_group_config -This function changes the configuration parameter(s) of an existing BDR group. +This function changes the configuration parameter(s) of an existing BDR group. Options with NULL value (default for all of them) will not be modified. 
#### Synopsis @@ -966,8 +966,9 @@ bdr.alter_node_group_config(node_group_name text, apply_delay interval DEFAULT NULL, check_constraints boolean DEFAULT NULL, num_writers int DEFAULT NULL, - enable_wal_decoder boolean DEFAULT NULL, - streaming_mode text DEFAULT NULL) + enable_wal_decoder boolean DEFAULT NULL, + streaming_mode text DEFAULT NULL, + default_commit_scope text DEFAULT NULL) ``` #### Parameters @@ -1008,6 +1009,12 @@ bdr.alter_node_group_config(node_group_name text, For more details, see[Transaction Streaming](transaction-streaming). +- `default_commit_scope` - the commit scope to use by default, + initially the 'local' commit scope. This is only applicable to the + top-level node group. Individual rules can be used for different + origin groups of the same commit scope. See the section about + [Origin Groups](group_commit.md) for Group Commit for more details. + #### Notes This function will pass a request to the group consensus mechanism to change @@ -1023,7 +1030,7 @@ This function does not hold any locks. !!! Warning When this function is used to change the `apply_delay` value, the change does not apply to nodes that are already members of the - group. + group. Note that this restriction has little consequence on production usage, because this value is normally not used outside of testing. @@ -1263,7 +1270,8 @@ Returns the number of subscriptions affected by this operation. ```sql bdr.alter_subscription_disable( subscription_name name DEFAULT NULL, - immediate boolean DEFAULT false + immediate boolean DEFAULT false, + fast boolean DEFAULT true ) ``` @@ -1274,6 +1282,9 @@ bdr.alter_subscription_disable( - `immediate` - Immediate is used to force the action immediately, stopping all the workers associated with the disabled subscription. With this option true, this function cannot be run inside of the transaction block. +- `fast` - This argument only influences the behavior of `immediate`. 
+ If set to `true` (the default) it stops all the workers associated with the + disabled subscription without waiting for them to finish current work. + #### Notes @@ -1283,8 +1294,9 @@ This function is not replicated and only affects local node subscriptions This function is transactional - it can be rolled back and any catalog changes can be seen by the current transaction. However, the timing of the subscription worker stopping depends on the value of `immediate`; if set to `true`, the -workers will be stopped immediately; if set to `false`, they will be stopped at -the `COMMIT` time. +workers will receive the stop without waiting for the `COMMIT` and if the `fast` +argument is set to `true` the interruption of the workers will not wait for +current work to finish. ## Node Management Commands diff --git a/product_docs/docs/bdr/4.0/overview.mdx b/product_docs/docs/bdr/4/overview.mdx similarity index 98% rename from product_docs/docs/bdr/4.0/overview.mdx rename to product_docs/docs/bdr/4/overview.mdx index 4b85f739798..07908f8d97f 100644 --- a/product_docs/docs/bdr/4.0/overview.mdx +++ b/product_docs/docs/bdr/4/overview.mdx @@ -1,7 +1,7 @@ --- navTitle: Overview title: Architectural Overview -originalFilePath: overview.md + --- @@ -163,7 +163,7 @@ applications with high contention could perform worse than a single node. These results are consistent with any multi-master technology, they are not a facet or peculiarity of BDR. -Eager replication can avoid conflicts, but is inherently more expensive. +Eager Replication can avoid conflicts, but is inherently more expensive. Changes are sent concurrently to all nodes so that the replication lag is minimised. Adding more nodes means using more CPU for replication, so peak TPS will reduce @@ -190,7 +190,7 @@ architecture. Since we write mainly to only one node, the possibility of contention between is reduced to almost zero and as a result performance impact is much reduced. 
-CAMO is eager replication within the local Group, lazy with regard to other Groups. +CAMO is Eager Replication within the local Group, lazy with regard to other Groups. Secondary applications may execute against the shadow nodes, though these should be reduced or interrupted if the main application begins using that node. diff --git a/product_docs/docs/bdr/4/release_notes/bdr4.0.1_rel_notes.mdx b/product_docs/docs/bdr/4/release_notes/bdr4.0.1_rel_notes.mdx new file mode 100644 index 00000000000..afc1c4bb573 --- /dev/null +++ b/product_docs/docs/bdr/4/release_notes/bdr4.0.1_rel_notes.mdx @@ -0,0 +1,35 @@ +--- +title: "BDR 4.0.1" +--- + +This is a maintenance release for BDR 4.0 which includes minor +improvements as well as fixes for issues identified in previous +versions. + +| Type | Category | Description | +| ---- | -------- | ----------- | +| Improvement | Reliability and operability | Reduce frequency of CAMO partner connection attempts.

In case of a failure to connect to a CAMO partner to verify its configuration and check the status of transactions, do not retry immediately (leading to a fully busy pglogical manager process), but throttle down repeated attempts to reconnect and checks to once per minute.

+| Improvement | Performance and scalability | Implement buffered read for LCR segment file (BDR-1422)

Implement LCR segment file buffering so that multiple LCR chunks can be read at a time. This should reduce I/O and improve CPU usage of Wal Senders when using the Decoding Worker.

+| Improvement | Performance and scalability | Avoid unnecessary LCR segment reads (BDR-1426)

BDR now attempts to only read new LCR segments when there is at least one available. This reduces I/O load when Decoding Worker is enabled.

+| Improvement | Performance and scalability | Performance of COPY replication including the initial COPY during join has been greatly improved for partitioned tables (BDR-1479)

For large tables this can improve the load times by order of magnitude or more.

+| Bug fix | Performance and scalability | Fix the parallel apply worker selection (BDR-1761)

This makes parallel apply work again. In 4.0.0 parallel apply was never in effect due to this bug.

+| Bug fix | Reliability and operability | Fix Raft snapshot handling of `bdr.camo_pairs` (BDR-1753)

The previous release would not correctly propagate changes to the CAMO pair configuration when they were received via Raft snapshot.

+| Bug fix | Reliability and operability | Correctly handle Raft snapshots from BDR 3.7 after upgrades (BDR-1754) +| Bug fix | Reliability and operability | Upgrading a CAMO configured cluster now takes into account the `bdr.camo_pairs` in the snapshot, while still excluding the ability to perform an in-place upgrade of a cluster (due to upgrade limitations unrelated to CAMO). +| Bug fix | Reliability and operability | Switch from CAMO to Local Mode only after timeouts (RT74892)

Do not use the `catchup_interval` estimate when switching from CAMO protected to Local Mode, as that could induce inadvertent switching due to load spikes. Use the estimate only when switching from Local Mode back to CAMO protected (to prevent toggling forth and back due to lag on the CAMO partner).

+| Bug fix | Reliability and operability | Fix replication set cache invalidation when published replication set list has changed (BDR-1715)

In previous versions we could use stale information about which replication sets (and as a result which tables) should be published until the subscription has reconnected.

+| Bug fix | Reliability and operability | Prevent duplicate values generated locally by galloc sequence in high concurrency situations when the new chunk is used (RT76528)

The galloc sequence could have temporarily produced duplicate values when switching which chunk is used locally (but not across nodes) if there were multiple sessions waiting for the new value. This is now fixed.

+| Bug fix | Reliability and operability | Address memory leak on streaming transactions (BDR-1479)

For large transaction this reduces memory usage and I/O considerably when using the streaming transactions feature. This primarily improves performance of COPY replication.

+| Bug fix | Reliability and operability | Don't leave slot behind after PART_CATCHUP phase of node parting when the catchup source has changed while the node was parting (BDR-1716)

When node is being removed (parted) from BDR group, we do so called catchup in order to forward any missing changes from that node between remaining nodes in order to keep the data on all nodes consistent. This requires an additional replication slot to be created temporarily. Normally this replication slot is removed at the end of the catchup phase, however in certain scenarios where we have to change the source node for the changes, this slot could have previously been left behind. From this version, this slot is always correctly removed.

+| Bug fix | Reliability and operability | Ensure that the group slot is moved forward when there is only one node in the BDR group

This prevents disk exhaustion due to WAL accumulation when the group is left running with just single BDR node for a prolonged period of time. This is not recommended setup but the WAL accumulation was not intentional.

+| Bug fix | Reliability and operability | Advance Raft protocol version when there is only one node in the BDR group

Single node clusters would otherwise always stay on oldest support protocol until another node was added. This could limit available feature set on that single node.

+ +## Upgrades + +This release supports upgrading from the following versions of BDR: + +- 3.7.14 +- 4.0.0 and higher + +Please make sure you read and understand the process and limitations described +in the [Upgrade Guide](upgrades) before upgrading. diff --git a/product_docs/docs/bdr/4/release_notes/bdr4.0.2_rel_notes.mdx b/product_docs/docs/bdr/4/release_notes/bdr4.0.2_rel_notes.mdx new file mode 100644 index 00000000000..20cbf7fb0c9 --- /dev/null +++ b/product_docs/docs/bdr/4/release_notes/bdr4.0.2_rel_notes.mdx @@ -0,0 +1,37 @@ +--- +title: "BDR 4.0.2" +--- + +This is a maintenance release for BDR 4.0 which includes minor +improvements as well as fixes for issues identified in previous +versions. + +| Type | Category | Description | +| ---- | -------- | ----------- | +| Improvement | Reliability and operability | Add `bdr.max_worker_backoff_delay` (BDR-1767)

This changes the handling of the backoff delay to exponentially increase from `bdr.min_worker_backoff_delay` to `bdr.max_worker_backoff_delay` in presence of repeated errors. This reduces log spam and in some cases also prevents unnecessary connection attempts.

+| Improvement | User Experience | Add `execute_locally` option to `bdr.replicate_ddl_command()` (RT73533)

This allows optional queueing of ddl commands for replication to other groups without executing it locally.

+| Improvement | User Experience | Change ERROR on consensus issue during JOIN to WARNING

The reporting of these transient errors was confusing as they were also shown in bdr.worker_errors. These are now changed to WARNINGs.

+| Bug fix | Reliability and operability | WAL decoder confirms end LSN of the running transactions record (BDR-1264)

Confirm end LSN of the running transactions record processed by WAL decoder so that the WAL decoder slot remains up to date and WAL senders get the candidate in timely manner.

+| Bug fix | Reliability and operability | Don't wait for autopartition tasks to complete on parting nodes (BDR-1867)

When a node has started parting process, it makes no sense to wait for autopartition tasks on such nodes to finish since it's not part of the group anymore.

+| Bug fix | User Experience | Improve handling of node name reuse during parallel join (RT74789)

Nodes now have a generation number so that it's easier to identify the name reuse even if the node record is received as part of a snapshot.

+| Bug fix | Reliability and operability | Fix locking and snapshot use during node management in the BDR manager process (RT74789)

When processing multiple actions in the state machine, make sure to reacquire the lock on the processed node and update the snapshot to make sure all updates happening through consensus are taken into account.

+| Bug fix | Reliability and operability | Improve cleanup of catalogs on local node drop

Drop all groups, not only the primary one and drop all the node state history info as well.

+| Bug fix | User Experience | Improve error checking for join request in bdr_init_physical

Previously bdr_init_physical would simply wait forever when there was any issue with the consensus request; now we do the same checking as the logical join does.

+| Bug fix | Reliability and operability | Improve handling of various timeouts and sleeps in consensus

This reduces the amount of new consensus votes needed when processing many consensus requests or time consuming consensus requests, for example during join of a new node.

+| Bug fix | Reliability and operability | Fix handling of `wal_receiver_timeout` (BDR-1848)

The `wal_receiver_timeout` has not been triggered correctly due to a regression in BDR 3.7 and 4.0.

+| Bug fix | Reliability and operability | Limit the `bdr.standby_slot_names` check when reporting flush position only to physical slots (RT77985, RT78290)

Otherwise flush progress is not reported in presence of disconnected nodes when using `bdr.standby_slot_names`.

+| Bug fix | Reliability and operability | Fix replication of data types created during bootstrap (BDR-1784) +| Bug fix | Reliability and operability | Fix replication of arrays of builtin types that don't have binary transfer support (BDR-1042) +| Bug fix | Reliability and operability | Prevent CAMO configuration warnings if CAMO is not being used (BDR-1825) + +## Upgrades + +This release supports upgrading from the following versions of BDR: + +- 4.0.0 and higher + +The upgrade path from BDR 3.7 is not currently stable and needs to be +considered beta. Tests should be performed with at least BDR 3.7.15. + +Please make sure you read and understand the process and limitations described +in the [Upgrade Guide](upgrades) before upgrading. diff --git a/product_docs/docs/bdr/4/release_notes/bdr4.1.0_rel_notes.mdx b/product_docs/docs/bdr/4/release_notes/bdr4.1.0_rel_notes.mdx new file mode 100644 index 00000000000..a2fab9c4a28 --- /dev/null +++ b/product_docs/docs/bdr/4/release_notes/bdr4.1.0_rel_notes.mdx @@ -0,0 +1,68 @@ +--- +title: "BDR 4.1.0" +--- + +This is a minor release of BDR 4 which includes new features as well +as fixes for issues identified in previous versions. + +| Type | Category | Description | +| ---- | -------- | ----------- | +| Feature | Reliability and operability | Support in-place major upgrade of Postgres on a BDR node

This BDR release includes a new command-line utility `bdr_pg_upgrade` which uses `pg_upgrade` to do a major version upgrade of Postgres on a BDR node.

This reduces the time and network bandwidth necessary to do major version upgrades of Postgres in a BDR cluster.

+| Feature | Performance and scalability | Replication Lag Control

Add configuration for a replication lag threshold after which the transaction commits get throttled. This allows limiting RPO without incurring the latency impact on every transaction that comes with synchronous replication.

+| Feature | UX / Initial experience | Distributed sequences by default

The default value of `bdr.default_sequence_kind` has been changed to `'distributed'` which is a new kind of sequence that uses SnowFlakeId for `bigserial` and Galloc sequences for `serial` column type.

+| Feature | UX | Simplified synchronous replication configuration

New syntax for specifying the synchronous replication options, with focus on BDR groups and SQL based management (as opposed to config file).

In future versions this will also replace the current Eager Replication and CAMO configuration options.

+| Feature | High availability and disaster recovery | Group Commit

The initial kind of synchronous commit that can be configured via the new configuration syntax.

+| Feature | High availability and disaster recovery | Allow a Raft request to be required for CAMO switching to Local Mode (RT78928)

Add a `require_raft` flag to the CAMO pairing configuration which controls the behavior of switching from CAMO protected to Local Mode, introducing the option to require a majority of nodes to be connected to allow switching to Local Mode.

+| Feature | High availability and disaster recovery | Allow replication to continue on `ALTER TABLE ... DETACH PARTITION CONCURRENTLY` of already detached partition (RT78362)

Similarly to how BDR 4 handles `CREATE INDEX CONCURRENTLY` when the same index already exists, we now allow replication to continue when `ALTER TABLE ... DETACH PARTITION CONCURRENTLY` is received for a partition that has already been detached.

+| Feature | User Experience | Add additional filtering options to DDL filters.

DDL filters allow for replication of different DDL statements to different replication sets. Similar to how table membership in replication set allows DML on different tables to be replicated via different replication sets.

This release adds new controls that make it easier to use the DDL filters:
- query_match - if defined query must match this regex
- exclusive - if true, other matched filters are not taken into consideration (i.e. only the exclusive filter is applied), when multiple exclusive filters match, we throw error

+| Feature | User Experience | Add `bdr.lock_table_locking` configuration variable.

When enabled, this changes the behavior of the `LOCK TABLE` command to take a global DML lock.

+| Feature | Performance and scalability | Implement buffered write for LCR segment file

This should reduce I/O and improve CPU usage of the Decoding Worker.

+| Feature | User Experience | Add support for partial unique index lookups for conflict detection (RT78368).

Indexes on expressions are, however, still not supported for conflict detection.

+| Feature | User Experience | Add additional statistics to `bdr.stat_subscription`:
- nstream_insert => the count of INSERTs on streamed transactions
- nstream_update => the count of UPDATEs on streamed transactions
- nstream_delete => the count of DELETEs on streamed transactions
- nstream_truncate => the count of TRUNCATEs on streamed transactions
- npre_commit_confirmations => the count of pre-commit confirmations, when using CAMO
- npre_commit => the count of pre-commits
- ncommit_prepared => the count of prepared commits with 2PC
- nabort_prepared => the count of aborts of prepared transactions with 2PC +| Feature | User Experience | Add execute_locally option to bdr.replicate_ddl_command (RT73533).

This allows optional queueing of ddl commands for replication to other groups without executing it locally.

+| Feature | User Experience | Add `fast` argument to `bdr.alter_subscription_disable()` (RT79798)

The argument only influences the behavior of `immediate`. When set to `true` (default) it will stop the workers without letting them finish the current work.

+| Feature | User Experience | Keep the `bdr.worker_error` records permanently for all types of workers.

BDR used to remove receiver and writer errors when those workers managed to replicate the LSN that was previously resulting in error. However this was inconsistent with how other workers behaved, as other worker errors were permanent and it also made the troubleshooting of past issues harder. So keep the last error record permanently for every worker type.

+| Feature | User Experience | Simplify `bdr.{add,remove}_camo_pair` functions to return void. +| Feature | Initial Experience | Add connectivity/lag check before taking global lock.

So that application or user does not have to wait for minutes to get lock timeout when there are obvious connectivity issues.

Can be set to DEBUG, LOG, WARNING (default) or ERROR.

+| Feature | Initial Experience | Only log conflicts to conflict log table by default. They are no longer logged to the server log file by default, but this can be overridden. +| Feature | User Experience | Improve reporting of remote errors during node join. +| Feature | Reliability and operability | Make autopartition worker's max naptime configurable. +| Feature | User Experience | Add ability to request partitions upto the given upper bound with autopartition. +| Feature | Initial Experience | Don't try replicate DDL run on subscribe-only node. It has nowhere to replicate so any attempt to do so will fail. This is same as how logical standbys behave. +| Feature | User Experience | Add `bdr.accept_connections` configuration variable. When `false`, walsender connections to replication slots using BDR output plugin will fail. This is useful primarily during restore of single node from backup. +| Bug fix | Reliability and operability | Keep the `lock_timeout` as configured on non-CAMO-partner BDR nodes

A CAMO partner uses a low `lock_timeout` when applying transactions from its origin node. This was inadvertently done for all BDR nodes rather than just the CAMO partner, which may have led to spurious `lock_timeout` errors on pglogical writer processes on normal BDR nodes.

+| Bug fix | User Experience | Show a proper wait event for CAMO / Eager confirmation waits (RT75900)

Show correct "BDR Prepare Phase"/"BDR Commit Phase" in `bdr.stat_activity` instead of the default “unknown wait event”.

+| Bug fix | User Experience | Reduce log for bdr.run_on_nodes (RT80973)

Don't log when setting `bdr.ddl_replication` to off if it's done with the "run_on_nodes" variants of function. This eliminates the flood of logs for monitoring functions.

+| Bug fix | Reliability and operability | Fix replication of arrays of composite types and arrays of builtin types that don't support binary network encoding +| Bug fix | Reliability and operability | Fix replication of data types created during bootstrap +| Bug fix | Performance and scalability | Confirm end LSN of the running transactions record processed by WAL decoder so that the WAL decoder slot remains up to date and WAL sender get the candidate in timely manner. +| Bug fix | Reliability and operability | Don't wait for autopartition tasks to complete on parting nodes +| Bug fix | Reliability and operability | Limit the `bdr.standby_slot_names` check when reporting flush position only to physical slots (RT77985, RT78290)

Otherwise flush progress is not reported in presence of disconnected nodes when using `bdr.standby_slot_names`.

+| Bug fix | Reliability and operability | Request feedback reply from walsender if we are close to wal_receiver_timeout +| Bug fix | Reliability and operability | Don't record dependency of auto-partitioned table on BDR extension more than once.

This resulted in "ERROR: unexpected number of extension dependency records" errors from auto-partition and broken replication on conflicts when this happens.

Note that existing broken tables need to still be fixed manually by removing the double dependency from `pg_depend`

+| Bug fix | Reliability and operability | Improve keepalive handling in receiver.

Don't update position based on keepalive when in middle of streaming transaction as we might lose data on crash if we do that.

There is also new flush and signalling logic that should improve latency in low TPS scenarios. +| Bug fix | Reliability and operability | Only do post `CREATE` commands processing when BDR node exists in the database. +| Bug fix | Reliability and operability | Don't try to log ERROR conflicts to conflict history table. +| Bug fix | Reliability and operability | Fixed segfault where a conflict_slot was being used after it was released during multi-insert (COPY) (RT76439). +| Bug fix | Reliability and operability | Prevent walsender processes spinning when facing lagging standby slots (RT80295, RT78290).

Correct signaling to reset a latch so that a walsender process does not consume 100% of a CPU in case one of the standby slots is lagging behind.

+| Bug fix | Reliability and operability | Fix handling of `wal_sender_timeout` when `bdr.standby_slot_names` are used (RT78290) +| Bug fix | Reliability and operability | Make ALTER TABLE lock the underlying relation only once (RT80204). +| Bug fix | User Experience | Fix reporting of disconnected slots in `bdr.monitor_local_replslots`. They could have been previously reported as missing instead of disconnected. +| Bug fix | Reliability and operability | Fix apply timestamp reporting for down subscriptions in `bdr.get_subscription_progress()` function and in the `bdr.subscription_summary` that uses that function. It would report garbage value before. +| Bug fix | Reliability and operability | Fix snapshot handling in various places in BDR workers. +| Bug fix | User Experience | Be more consistent about reporting timestamps and LSNs as NULLs in monitoring functions when there is no available value for those. +| Bug fix | Reliability and operability | Reduce log information when switching between writer processes. +| Bug fix | Reliability and operability | Don't do superuser check when configuration parameter was specified on PG command-line. We can't do transactions there yet and it's guaranteed to be superuser changed at that stage. +| Bug fix | Reliability and operability | Use 64 bits for calculating lag size in bytes. To eliminate risk of overflow with large lag. + + +### Upgrades + +This release supports upgrading from the following versions of BDR: + +- 4.0.0 and higher +- 3.7.15 +- 3.7.16 + +Please make sure you read and understand the process and limitations described +in the [Upgrade Guide](upgrades) before upgrading. 
diff --git a/product_docs/docs/bdr/4/release_notes/bdr4_rel_notes.mdx b/product_docs/docs/bdr/4/release_notes/bdr4_rel_notes.mdx new file mode 100644 index 00000000000..f012798c2b8 --- /dev/null +++ b/product_docs/docs/bdr/4/release_notes/bdr4_rel_notes.mdx @@ -0,0 +1,27 @@ +--- +title: "BDR 4.0.0" +--- + +BDR 4.0 is a new major version of BDR and adopted with this release number is +semantic versioning (for details see semver.org). The two previous major +versions are 3.7 and 3.6. + +| Type | Category | Description | +| ---- | -------- | ----------- | +| Feature | Compatibility | BDR on EDB Postgres Advanced 14 now supports following features which were previously only available on EDB Postgres Extended:
- Commit At Most Once - a consistency feature helping an application to commit each transaction only once, even in the presence of node failures
- Eager Replication - synchronizes between the nodes of the cluster before committing a transaction to provide conflict free replication
- Decoding Worker - separation of decoding into separate worker from wal senders allowing for better scalability with many nodes
- Estimates for Replication Catch-up times
- Timestamp-based Snapshots - providing consistent reads across multiple nodes for retrieving data as they appeared or will appear at a given time
- Automated dynamic configuration of row freezing to improve consistency of UPDATE/DELETE conflicts resolution in certain corner cases
- Assessment checks
- Support for handling missing partitions as conflicts rather than errors
- Advanced DDL Handling for NOT VALID constraints and ALTER TABLE +| Feature | Compatibility | BDR on community version of PostgreSQL 12-14 now supports following features which were previously only available on EDB Postgres Advanced or EDB Postgres Extended:
- Conflict-free Replicated Data Types - additional data types which provide mathematically proven consistency in asynchronous multi-master update scenarios
- Column Level Conflict Resolution - ability to use per column last-update wins resolution so that UPDATEs on different fields can be "merged" without losing either of them
- Transform Triggers - triggers that are executed on the incoming stream of data providing ability to modify it or to do advanced programmatic filtering
- Conflict triggers - triggers which are called when conflict is detected, providing a way to use custom conflict resolution techniques
- CREATE TABLE AS replication
- Parallel Apply - allow multiple writers to apply the incoming changes +| Feature | Performance | Support streaming of large transactions.

This allows BDR to stream a large transaction (greater than `logical_decoding_work_mem` in size) either to a file on the downstream or to a writer process. This ensures that the transaction is decoded even before it's committed, thus improving parallelism. Further, the transaction can even be applied concurrently if streamed straight to a writer. This improves parallelism even more.

When large transactions are streamed to files, they are decoded and the decoded changes are sent to the downstream even before they are committed. The changes are written to a set of files and applied when the transaction finally commits. If the transaction aborts, the changes are discarded, thus wasting resources on both upstream and downstream.

Sub-transactions are also handled automatically.

This feature is available on PostgreSQL 14, EDB Postgres Extended 13+ and EDB Postgres Advanced 14, see [Feature Compatibility](feature-matrix) appendix for more details on which features can be used on which versions of Postgres.

+| Feature | Compatibility | The differences that existed in earlier versions of BDR between standard and enterprise edition have been removed. With BDR 4.0 there is one extension for each supported Postgres distribution and version, i.e., PostgreSQL v12-14, EDB Postgres Extended v12-14, and EDB Postgres Advanced 12-14.

Not all features are available on all versions of PostgreSQL, the available features are reported via feature flags using either `bdr_config` command line utility or `bdr.bdr_features()` database function. See [Feature Compatibility](feature-matrix) appendix for more details.

+| Feature | User Experience | There is no pglogical 4.0 extension that corresponds to the BDR 4.0 extension. BDR no longer has a requirement for pglogical.

This means also that only BDR extension and schema exist and any configuration parameters were renamed from `pglogical.` to `bdr.`.

+| Feature | Initial experience | Some configuration options have changed defaults for a better post-install experience:
- Parallel apply is now enabled by default (with 2 writers). Allows for better performance, especially with streaming enabled.
- `COPY` and `CREATE INDEX CONCURRENTLY` are now streamed directly to writer in parallel (on Postgres versions where streaming is supported) to all available nodes by default, eliminating or at least reducing replication lag spikes after these operations.
- The timeout for global locks has been increased to 10 minutes
- The `bdr.min_worker_backoff_delay` now defaults to 1s so that subscriptions retry connection only once per second on error +| Feature | Reliability and operability | Greatly reduced the chance of false positives in conflict detection during node join for table that use origin based conflict detection +| Feature | Reliability and operability | Move configuration of CAMO pairs to SQL catalogs

To reduce chances of misconfiguration and make CAMO pairs within the BDR cluster known globally, move the CAMO configuration from the individual node's postgresql.conf to BDR system catalogs managed by Raft. This for example can prevent against inadvertently dropping a node that's still configured to be a CAMO partner for another active node.

Please see the [Upgrades chapter](upgrades#upgrading-a-camo-enable-cluster) for details on the upgrade process.

This deprecates GUCs `bdr.camo_partner_of` and `bdr.camo_origin_for` and replaces the functions `bdr.get_configured_camo_origin_for()` and `get_configured_camo_partner_of` with `bdr.get_configured_camo_partner`.

+ +## Upgrades + +This release supports upgrading from the following version of BDR: + +- 3.7.13.1 + +Please make sure you read and understand the process and limitations described +in the [Upgrade Guide](upgrades) before upgrading. diff --git a/product_docs/docs/bdr/4/release_notes/index.mdx b/product_docs/docs/bdr/4/release_notes/index.mdx new file mode 100644 index 00000000000..0dfb5faf952 --- /dev/null +++ b/product_docs/docs/bdr/4/release_notes/index.mdx @@ -0,0 +1,23 @@ +--- +title: Release Notes +navigation: +- bdr4.1.0_rel_notes +- bdr4.0.2_rel_notes +- bdr4.0.1_rel_notes +- bdr4_rel_notes +--- + +BDR is a PostgreSQL extension providing multi-master replication and data +distribution with advanced conflict management, data-loss protection, and +throughput up to 5X faster than native logical replication, and enables +distributed PostgreSQL clusters with a very high availability. + +The release notes in this section provide information on what was new in each release. + +| Version | Release Date | +| ----------------------- | ------------ | +| [4.1.0](bdr4.1.0_rel_notes) | 2022 May 17 | +| [4.0.2](bdr4.0.2_rel_notes) | 2022 Feb 15 | +| [4.0.1](bdr4.0.1_rel_notes) | 2022 Jan 18 | +| [4.0.0](bdr4_rel_notes) | 2021 Dec 01 | + diff --git a/product_docs/docs/bdr/4.0/repsets.mdx b/product_docs/docs/bdr/4/repsets.mdx similarity index 97% rename from product_docs/docs/bdr/4.0/repsets.mdx rename to product_docs/docs/bdr/4/repsets.mdx index 44dc54d5d13..eefb8abad29 100644 --- a/product_docs/docs/bdr/4.0/repsets.mdx +++ b/product_docs/docs/bdr/4/repsets.mdx @@ -1,6 +1,6 @@ --- title: Replication Sets -originalFilePath: repsets.md + --- @@ -277,16 +277,16 @@ transaction. Do not drop a replication set which is being used by at least another node, because this will stop replication on that node. Should this happen, please unsubscribe the affected node - from that replication set. + from that replication set. 
For the same reason, you should not drop a replication set if there is a join operation in progress, and the node being joined is a member of that replication set; replication set membership is - only checked at the beginning of the join. + only checked at the beginning of the join. This happens because the information on replication set usage is local to each node, so that it can be configured on a node before it joins the group. -You can manage replication set subscription for a node using `alter_node_replication_sets` +You can manage replication set subscription for a node using `alter_node_replication_sets` which is mentioned below. ### bdr.alter_node_replication_sets @@ -461,16 +461,15 @@ Use the following SQL to show those replication sets that the current node publishes and subscribes from: ```sql -SELECT s.node_id, - s.node_name, + SELECT node_id, + node_name, COALESCE( - i.pub_repsets, s.pub_repsets + pub_repsets, pub_repsets ) AS pub_repsets, COALESCE( - i.sub_repsets, s.sub_repsets + sub_repsets, sub_repsets ) AS sub_repsets -FROM bdr.local_node_summary s -INNER JOIN bdr.node_local_info i ON i.node_id = s.node_id; + FROM bdr.local_node_summary; ``` This produces output like this: @@ -490,17 +489,16 @@ the following query: WITH node_repsets AS ( SELECT jsonb_array_elements( bdr.run_on_all_nodes($$ - SELECT s.node_id, - s.node_name, - COALESCE( - i.pub_repsets, s.pub_repsets - ) AS pub_repsets, - COALESCE( - i.sub_repsets, s.sub_repsets - ) AS sub_repsets - FROM bdr.local_node_summary s - INNER JOIN bdr.node_local_info i - ON i.node_id = s.node_id; + SELECT + node_id, + node_name, + COALESCE( + pub_repsets, pub_repsets + ) AS pub_repsets, + COALESCE( + sub_repsets, sub_repsets + ) AS sub_repsets + FROM bdr.local_node_summary; $$)::jsonb ) AS j ) @@ -508,7 +506,7 @@ SELECT j->'response'->'command_tuples'->0->>'node_id' AS node_id, j->'response'->'command_tuples'->0->>'node_name' AS node_name, j->'response'->'command_tuples'->0->>'pub_repsets' AS 
pub_repsets, j->'response'->'command_tuples'->0->>'sub_repsets' AS sub_repsets -FROM node_repsets;; +FROM node_repsets; ``` This will show, for example: diff --git a/product_docs/docs/bdr/4.0/scaling.mdx b/product_docs/docs/bdr/4/scaling.mdx similarity index 99% rename from product_docs/docs/bdr/4.0/scaling.mdx rename to product_docs/docs/bdr/4/scaling.mdx index d3b19c85c2f..dfa2dd38d97 100644 --- a/product_docs/docs/bdr/4.0/scaling.mdx +++ b/product_docs/docs/bdr/4/scaling.mdx @@ -1,7 +1,5 @@ --- title: AutoPartition -originalFilePath: scaling.md - --- AutoPartition allows tables to grow easily to large sizes by automatic @@ -21,7 +19,7 @@ your `search_path`, you will need to schema-qualify the name of each function. range partitioning for a table. If no definition exists, it will be created, otherwise later executions will alter the definition. -`bdr.autopartition()` does not lock the actual table, it only changes the +`bdr.autopartition()` does not lock the actual table, it only changes the definition of when and how new partition maintenance actions will take place. `bdr.autopartition()` leverages the features that allow a partition to be @@ -37,7 +35,7 @@ key of type `timestamp` or `date`, the `partition_increment` must be a valid constant of type `interval`. For example, specifying `1 Day` will cause a new partition to be added each day, with partition bounds that are 1 day apart. -If the partition column is connected to a `timeshard` or `ksuuid` sequence, +If the partition column is connected to a `snowflakeid`, `timeshard` or `ksuuid` sequence, the `partition_increment` must be specified as type `interval`. Otherwise, if the partition key is integer or numeric, then the `partition_increment` must be a valid constant of the same datatype. 
For example, specifying diff --git a/product_docs/docs/bdr/4.0/security.mdx b/product_docs/docs/bdr/4/security.mdx similarity index 99% rename from product_docs/docs/bdr/4.0/security.mdx rename to product_docs/docs/bdr/4/security.mdx index 4789b4de23e..a0bb8d722f3 100644 --- a/product_docs/docs/bdr/4.0/security.mdx +++ b/product_docs/docs/bdr/4/security.mdx @@ -1,6 +1,6 @@ --- title: Security and Roles -originalFilePath: security.md + --- diff --git a/product_docs/docs/bdr/4.0/sequences.mdx b/product_docs/docs/bdr/4/sequences.mdx similarity index 69% rename from product_docs/docs/bdr/4.0/sequences.mdx rename to product_docs/docs/bdr/4/sequences.mdx index 72546b97890..caca78639fb 100644 --- a/product_docs/docs/bdr/4.0/sequences.mdx +++ b/product_docs/docs/bdr/4/sequences.mdx @@ -1,6 +1,6 @@ --- title: Sequences -originalFilePath: sequences.md + --- @@ -37,14 +37,15 @@ them, you must have been granted the `bdr_application` role. There are various possible algorithms for global sequences: -- Timeshard sequences +- SnowflakeId sequences - Globally-allocated range sequences -Timeshard sequences generate values using an algorithm that does not require +SnowflakeId sequences generate values using an algorithm that does not require inter-node communication at any point, so is faster and more robust, as well as having the useful property of recording the timestamp at which they were created. -Timeshard sequences have the restriction that they work only for 64-bit BIGINT + +SnowflakeId sequences have the restriction that they work only for 64-bit BIGINT datatypes and produce values 19 digits long, which may be too long for use in some host language datatypes such as Javascript Integer types. Globally-allocated sequences allocate a local range of values which can @@ -64,27 +65,34 @@ command is executed or when a `serial`, `bigserial` or - `local` (the default) meaning that newly created sequences are the standard PostgreSQL (local) sequences. 
- `galloc` which always creates globally-allocated range sequences. -- `timeshard` which creates time-sharded global sequences for BIGINT sequences, - but will throw ERRORs when used with INTEGER sequences. +- `snowflakeid` which creates global sequences for BIGINT sequences which + consist of time, nodeid and counter components, cannot be used with + INTEGER sequences (so it can be used for `bigserial` but not for `serial`). +- `timeshard` older version of SnowflakeId sequence which is provided for + backwards compatibility only, the SnowflakeId is preferred +- `distributed` special value which can only be used for + `bdr.default_sequence_kind` and will select `snowflakeid` for `int8` + sequences (i.e. `bigserial`) and `galloc` sequence for `int4` + (i.e. `serial`) and `int2` sequences. The `bdr.sequences` view shows information about individual sequence kinds. `currval()` and `lastval()` work correctly for all types of global sequence. -### Timeshard Sequences +### SnowflakeId Sequences -The ids generated by timeshard sequences are loosely time-ordered so they can +The ids generated by SnowflakeId sequences are loosely time-ordered so they can be used to get the approximate order of data insertion, like standard PostgreSQL sequences. Values generated within the same millisecond might be out of order, even on one node. The property of loose time-ordering means they are suitable for use as range partition keys. -Timeshard sequences work on one or more nodes, and do not require any inter-node +SnowflakeId sequences work on one or more nodes, and do not require any inter-node communication after the node join process completes. So they may continue to be used even if there's the risk of extended network partitions, and are not affected by replication lag or inter-node latency. -Timeshard sequences generate unique ids in a different +SnowflakeId sequences generate unique ids in a different way to standard sequences. 
The algorithm uses 3 components for a sequence number. The first component of the sequence is a timestamp at the time of sequence number generation. The second component of @@ -99,37 +107,30 @@ property of sequences, which is that the ordering of the sequence numbers roughly corresponds to the order in which data was inserted into the table. Putting the timestamp first ensures this. -A few limitations and caveats apply to timeshard sequences. +A few limitations and caveats apply to SnowflakeId sequences. -Timeshard sequences are 64-bits wide and need a `bigint` or `bigserial`. +SnowflakeId sequences are 64-bits wide and need a `bigint` or `bigserial`. Values generated will be at least 19 digits long. There is no practical 32-bit `integer` version, so cannot be used with `serial` sequences - use globally-allocated range sequences instead. -There is a limit of 8192 sequence values generated per millisecond on any -given node for any given sequence. If more than 8192 sequences per -millisecond are generated from one sequence on one node, the generated -values will wrap around and could collide. There is no check on that for -performance reasons; the value is not reset to 0 at the start of each ms. -Collision will usually result in a -`UNIQUE` constraint violation on `INSERT` or `UPDATE`. It cannot cause a -replication conflict, because sequence values generated on different nodes -cannot *ever* collide since they contain the nodeid. - -In practice this is harmless; values are not generated fast enough -to trigger this limitation as there will be other -work being done, rows inserted, indexes updated, etc. Despite that, -applications should have a `UNIQUE` constraint in place where they -absolutely rely on a lack of collisions. - -Perhaps more importantly, the timestamp component will run out of values in -the year 2050, and if used in combination with bigint, the values will wrap to -negative numbers in the year 2033. 
This means that sequences generated after 2033 -will have negative values. If you plan to deploy your application beyond this -date, try one of [UUIDs, KSUUIDs and Other Approaches] mentioned below, or -use globally-allocated range sequences instead. - -The `INCREMENT` option on a sequence used as input for timeshard sequences is +For SnowflakeId there is a limit of 4096 sequence values generated per +millisecond on any given node (this means about 4 million sequence values per +second). In case the sequence value generation wraps around within given +millisecond, the SnowflakeId sequence will wait until next millisecond and get +fresh value for that millisecond. + +Since SnowflakeId sequences encode timestamp into sequence value, new sequence +values can only be generated within given time frame (depending on system clock). +The oldest timestamp which can be used 2016-10-07 which is the epoch time for +the SnowflakeId. The values will wrap to negative values in year 2086 and +completely run out of numbers by 2156. + +Since timestamp is important part of SnowflakeId sequence, there is additional +protection from generating sequences with older timestamp than the latest one +used within the lifetime of postgres process (but not between postgres restarts). + +The `INCREMENT` option on a sequence used as input for SnowflakeId sequences is effectively ignored. This could be relevant for applications that do sequence ID caching, like many object-relational mapper (ORM) tools, notably Hibernate. Because the sequence is time-based, this has little practical effect since the @@ -140,7 +141,32 @@ Similarly, the `START`, `MINVALUE`, `MAXVALUE` and `CACHE` settings may be changed on the underlying sequence, but there is no benefit to doing so. The sequence's low 14 bits are used and the rest is discarded, so the value range limits do not affect the function's result. For the same -reason, `setval()` is not useful for timeshard sequences. 
+reason, `setval()` is not useful for SnowflakeId sequences. + +#### Timeshard sequences + +Timeshard sequences are provided for backwards compatibility with existing +installations but are not recommended for new application use. It's recommended +to use the SnowflakeId sequence instead. + +Timeshard is very similar to SnowflakeId, but has different limits and fewer +protections and worse performance. + +The differences between timeshard and SnowflakeId are as following: + + - Timeshard can generate up to 16384 per millisecond (about 16 million per + second) which is more than SnowflakeId, however there is no protection + against wraparound within given millisecond so schemas using the timeshard + sequence should protect use `UNIQUE` constraint when using timeshard values + for given column. + - The timestamp component of timeshard sequence will run out of values in + the year 2050, and if used in combination with bigint, the values will wrap + to negative numbers in the year 2033. This means that sequences generated + after 2033 will have negative values. This is considerably shorter time + span than SnowflakeId and is the main reason why SnowflakeId is preferred. + - Timeshard sequences require occasional disk writes (similar to standard local + sequences), while SnowflakeId are calculated in memory so the SnowflakeId + sequences are in general a little faster than timeshard sequences. ### Globally-allocated range Sequences @@ -151,11 +177,11 @@ space efficiently, but requires that the local node be connected to a majority of the nodes in the cluster for the sequence generator to progress, when the currently assigned local range has been used up. -Unlike timeshard sequences, galloc sequences support all sequence data types -provided by PostgreSQL - smallint, integer and bigint. 
This means that galloc -sequences can be used in environments where 64-bit sequences are problematic, -such as using integers in javascript, since that supports only 53-bit -values, or when the sequence is displayed on output with limited space. +Unlike SnowflakeId sequences, galloc sequences support all sequence data types +provided by PostgreSQL - `smallint`, `integer` and `bigint`. This means that +galloc sequences can be used in environments where 64-bit sequences are +problematic, such as using integers in javascript, since that supports only +53-bit values, or when the sequence is displayed on output with limited space. The range assigned by each voting is currently predetermined based on the datatype the sequence is using: @@ -186,23 +212,82 @@ to or more than the above ranges assigned for each sequence datatype. should not be used. A few limitations apply to galloc sequences. BDR tracks galloc sequences in a -special BDR catalog `bdr.sequence_alloc`. This catalog is required to track the -currently allocated chunks for the galloc sequences. The sequence name and -namespace is stored in this catalog. Since the sequence chunk allocation is -managed via Raft whereas any changes to the sequence name/namespace is managed -via replication stream, BDR currently does not support renaming galloc -sequences, or moving them to another namespace or renaming the namespace that -contains a galloc sequence. The user should be mindful of this limitation while -designing application schema. +special BDR catalog [bdr.sequence_alloc](catalogs.md#bdrsequence_alloc). This +catalog is required to track the currently allocated chunks for the galloc +sequences. The sequence name and namespace is stored in this catalog. 
Since the +sequence chunk allocation is managed via Raft whereas any changes to the +sequence name/namespace is managed via replication stream, BDR currently does +not support renaming galloc sequences, or moving them to another namespace or +renaming the namespace that contains a galloc sequence. The user should be +mindful of this limitation while designing application schema. + +#### Converting a local sequence to a galloc sequence + +Before transforming a local sequence to galloc, you need to take care of several +prerequisites. + +##### 1. Verify that sequence and column data type match + +Check that the sequence's data type matches the data type of the column with +which it will be used. For example, it is possible to create a `bigint` sequence +and assign an `integer` column's default to the `nextval()` returned by that +sequence. With galloc sequences, which for `bigint` are allocated in blocks of +1 000 000 000, this will quickly result in the values returned by `nextval()` +exceeding the `int4` range if more than two nodes are in use. + +The following example demonstrates what can happen: + +```sql +CREATE SEQUENCE int8_seq; + +SELECT sequencename, data_type FROM pg_sequences; + sequencename | data_type +--------------+----------- + int8_seq | bigint +(1 row) + +CREATE TABLE seqtest (id INT NOT NULL PRIMARY KEY); + +ALTER SEQUENCE int8_seq OWNED BY seqtest.id; + +SELECT bdr.alter_sequence_set_kind('public.int8_seq'::regclass, 'galloc', 1); + alter_sequence_set_kind +------------------------- + +(1 row) + +ALTER TABLE seqtest ALTER COLUMN id SET DEFAULT nextval('int8_seq'::regclass); +``` + +After executing `INSERT INTO seqtest VALUES(DEFAULT)` on two nodes, the table will +contain the following values: + +```sql +SELECT * FROM seqtest; + id +------------ + 2 + 2000000002 +(2 rows) +``` + +However, attempting the same operation on a third node will fail with an +`integer out of range` error, as the sequence will have generated the value +`4000000002`. 
The next section contains more details on how chunks of sequences +are allocated. -#### Usage +!!! Tip + The current data type of a sequence can be retrieved from the PostgreSQL + [pg_sequences](https://www.postgresql.org/docs/current/view-pg-sequences.html) + view. The data type of a sequence can be modified with `ALTER SEQUENCE ... AS ...`, + e.g.: `ALTER SEQUENCE public.sequence AS integer`, as long as its current + value has not exceeded the maximum value of the new data type. -Before transforming a local sequence to galloc, you need to take care of these -prerequisites: +##### 2. Set a new start value for the sequence -When sequence kind is altered to galloc, it will be rewritten and restart from +When the sequence kind is altered to `galloc`, it will be rewritten and restart from the defined start value of the local sequence. If this happens on an existing -sequence in a production database you will need to query the current value +sequence in a production database you, will need to query the current value then set the start value appropriately. To assist with this use case, BDR allows users to pass a starting value with the function `bdr.alter_sequence_set_kind()`. If you are already using offset and you have writes from multiple nodes, you @@ -218,11 +303,11 @@ SELECT max((x->'response'->0->>'nextval')::bigint) )::jsonb AS x; -- turn into a galloc sequence -SELECT bdr.alter_sequence_set_kind('public.sequence'::regclass, 'galloc', $MAX+MARGIN); +SELECT bdr.alter_sequence_set_kind('public.sequence'::regclass, 'galloc', $MAX + $MARGIN); ``` -Since users cannot lock a sequence, you must leave a $MARGIN value to allow -operations to continue while the max() value is queried. +Since users cannot lock a sequence, you must leave a `$MARGIN` value to allow +operations to continue while the `max()` value is queried. The `bdr.sequence_alloc` table will give information on the chunk size and what ranges are allocated around the whole cluster. 
@@ -242,7 +327,7 @@ SELECT * FROM bdr.sequence_alloc To see the ranges currently assigned to a given sequence on each node, use these queries: -- Node `Node1` is using range from `333` to `2000333`. +* Node `Node1` is using range from `333` to `2000333`. ```sql SELECT last_value AS range_start, log_cnt AS range_end @@ -260,7 +345,7 @@ SELECT last_value AS range_start, log_cnt AS range_end (1 row) ``` -- Node `Node2` is using range from `2000004` to `4000003`. +* Node `Node2` is using range from `2000004` to `4000003`. ```sql SELECT last_value AS range_start, log_cnt AS range_end @@ -328,9 +413,9 @@ known as KSUUID, which generates values that can be stored using PostgreSQL's standard `UUID` data type. A `KSUUID` value is similar to `UUIDv1` in that it stores both timestamp and random data, following the `UUID` standard. The difference is that `KSUUID` is K-Sortable, meaning that it's weakly -sortable by timestamp. This makes it more useful as a database key as it +sortable by timestamp. This makes it more useful as a database key as it produces more compact `btree` indexes, which improves -the effectiveness of search, and allows natural time-sorting of result data. +the effectiveness of search, and allows natural time-sorting of result data. Unlike `UUIDv1`, `KSUUID` values do not include the MAC of the computer on which they were generated, so there should be no security concerns from using `KSUUID`s. @@ -338,7 +423,7 @@ generated, so there should be no security concerns from using `KSUUID`s. `KSUUID` v2 is now recommended in all cases. Values generated are directly sortable with regular comparison operators. -There are two versions of `KSUUID` in BDR, v1 and v2. +There are two versions of `KSUUID` in BDR, v1 and v2. The legacy `KSUUID` v1 is now deprecated but is kept in order to support existing installations and should not be used for new installations. @@ -352,7 +437,7 @@ In offset-step sequences, a normal PostgreSQL sequence is used on each node. 
Each sequence increments by the same amount and starts at differing offsets. For example with step 1000, node1's sequence generates 1001, 2001, 3001, and so on, node2's generates 1002, 2002, 3002, etc. This scheme works well -even if the nodes cannot communicate for extended periods, but the designer +even if the nodes cannot communicate for extended periods, but the designer must specify a maximum number of nodes when establishing the schema, and it requires per-node configuration. However, mistakes can easily lead to overlapping sequences. @@ -423,19 +508,15 @@ BDR treats this function as `DDL`, so DDL replication and global locking applies if that is currently active. See [DDL Replication]. #### Synopsis - ```sql bdr.alter_sequence_set_kind(seqoid regclass, seqkind text) ``` #### Parameters - - `seqoid` - name or Oid of the sequence to be altered -- `seqkind` - `local` for a standard PostgreSQL sequence, `timeshard` for BDR - global sequence which uses the "time and sharding" based algorithm described in the - [BDR Global Sequences] section, or `galloc` for globally-allocated range - sequences which use consensus between nodes to assign unique ranges of - sequence numbers to each node +- `seqkind` - `local` for a standard PostgreSQL sequence, `snowflakeid` or + `galloc` for globally unique BDR sequences, or `timeshard` for legacy + globally unique sequence #### Notes @@ -463,6 +544,83 @@ The `bdr.alter_sequence_set_kind` function can be only executed by the owner of the sequence, unless `bdr.backwards_compatibility` is set is set to 30618 or below. +### bdr.extract_timestamp_from_snowflakeid + +This function extracts the timestamp component of the `snowflakeid` sequence. +The return value is of type "timestamptz". + +#### Synopsis +```sql +bdr.extract_timestamp_from_snowflakeid(snowflakeid bigint) +``` + +#### Parameters + - `snowflakeid` - value of a snowflakeid sequence + +#### Notes + +This function is only executed on the local node. 
+ +### bdr.extract_nodeid_from_snowflakeid + +This function extracts the nodeid component of the `snowflakeid` sequence. + +#### Synopsis +```sql +bdr.extract_nodeid_from_snowflakeid(snowflakeid bigint) +``` + +#### Parameters + - `snowflakeid` - value of a snowflakeid sequence + +#### Notes + +This function is only executed on the local node. + +### bdr.extract_localseqid_from_snowflakeid + +This function extracts the local sequence value component of the `snowflakeid` sequence. + +#### Synopsis +```sql +bdr.extract_localseqid_from_snowflakeid(snowflakeid bigint) +``` + +#### Parameters + - `snowflakeid` - value of a snowflakeid sequence + +#### Notes + +This function is only executed on the local node. + +### bdr.timestamp_to_snowflakeid + +This function converts a timestamp value to a dummy snowflakeid sequence value. + +This is useful for doing indexed searches or comparisons of values in the +snowflakeid column and for a specific timestamp. + +For example, given a table `foo` with a column `id` which is using a `snowflakeid` +sequence, we can get the number of changes since yesterday midnight like this: + +```sql +SELECT count(1) FROM foo WHERE id > bdr.timestamp_to_snowflakeid('yesterday') +``` + +A query formulated this way will use an index scan on the column `id`. + +#### Synopsis +```sql +bdr.timestamp_to_snowflakeid(ts timestamptz) +``` + +#### Parameters + - `ts` - timestamp to be used for the snowflakeid sequence generation + +#### Notes + +This function is only executed on the local node. + ### bdr.extract_timestamp_from_timeshard  This function extracts the timestamp component of the `timeshard` sequence. 
diff --git a/product_docs/docs/bdr/4.0/striggers.mdx b/product_docs/docs/bdr/4/striggers.mdx similarity index 99% rename from product_docs/docs/bdr/4.0/striggers.mdx rename to product_docs/docs/bdr/4/striggers.mdx index c4edc7c2136..bbb9a6ec5fb 100644 --- a/product_docs/docs/bdr/4.0/striggers.mdx +++ b/product_docs/docs/bdr/4/striggers.mdx @@ -1,6 +1,6 @@ --- title: Stream Triggers -originalFilePath: striggers.md + --- @@ -54,7 +54,7 @@ triggering table. These triggers fire before we have even attempted to locate a matching target row, allowing a very wide range of transforms to be applied efficiently and consistently. -Next, for UPDATE and DELETE changes we locate the target row. If there is no +Next, for UPDATE and DELETE changes we locate the target row. If there is no target row, then there is no further processing for those change types. We then execute any normal triggers that previously have been explicitly enabled @@ -174,10 +174,10 @@ BDR. !!! Warning - Multiple conflict triggers can be specified on a single table, but they should match distinct event, i.e. each conflict should only - match a single conflict trigger. + match a single conflict trigger. - Multiple triggers matching the same event on the same table are not recommended; they might result in inconsistent behaviour, and - will be forbidden in a future release. + will be forbidden in a future release. If the same conflict trigger matches more than one event, the `TG_OP` variable can be used within the trigger to identify the operation that @@ -452,9 +452,6 @@ Execution order for triggers: ## Stream Triggers Manipulation Interfaces -Stream Triggers are managed using SQL interfaces provided as part of -bdr-enterprise extension. - Stream Triggers can only be created on tables with `REPLICA IDENTITY FULL` or tables without any `TOAST`able columns. 
diff --git a/product_docs/docs/bdr/4.0/transaction-streaming.mdx b/product_docs/docs/bdr/4/transaction-streaming.mdx similarity index 99% rename from product_docs/docs/bdr/4.0/transaction-streaming.mdx rename to product_docs/docs/bdr/4/transaction-streaming.mdx index 2f4b0e49f9c..09778d299d2 100644 --- a/product_docs/docs/bdr/4.0/transaction-streaming.mdx +++ b/product_docs/docs/bdr/4/transaction-streaming.mdx @@ -1,7 +1,7 @@ --- navTitle: Transaction Streaming title: Transaction streaming -originalFilePath: transaction-streaming.md + --- diff --git a/product_docs/docs/bdr/4.0/tssnapshots.mdx b/product_docs/docs/bdr/4/tssnapshots.mdx similarity index 98% rename from product_docs/docs/bdr/4.0/tssnapshots.mdx rename to product_docs/docs/bdr/4/tssnapshots.mdx index 8dec9fa701d..d8a4cb9d690 100644 --- a/product_docs/docs/bdr/4.0/tssnapshots.mdx +++ b/product_docs/docs/bdr/4/tssnapshots.mdx @@ -1,6 +1,6 @@ --- title: Timestamp-Based Snapshots -originalFilePath: tssnapshots.md + --- diff --git a/product_docs/docs/bdr/4.0/twophase.mdx b/product_docs/docs/bdr/4/twophase.mdx similarity index 98% rename from product_docs/docs/bdr/4.0/twophase.mdx rename to product_docs/docs/bdr/4/twophase.mdx index bd0ed9d5d38..a6af5008fa0 100644 --- a/product_docs/docs/bdr/4.0/twophase.mdx +++ b/product_docs/docs/bdr/4/twophase.mdx @@ -1,7 +1,7 @@ --- navTitle: Two-Phase Commit title: Explicit Two-Phase Commit (2PC) -originalFilePath: twophase.md + --- diff --git a/product_docs/docs/biganimal/release/getting_started/identity_provider/okta.mdx b/product_docs/docs/biganimal/release/getting_started/identity_provider/okta.mdx index a84181e2872..e5226d39439 100644 --- a/product_docs/docs/biganimal/release/getting_started/identity_provider/okta.mdx +++ b/product_docs/docs/biganimal/release/getting_started/identity_provider/okta.mdx @@ -48,13 +48,12 @@ Unique URL and access code are provided in an email from cloudcare@enterprisedb. 
| <assertion_path>/surname | user.lastName | | | <assertion_path>/givenname | user.firstName | | - Where <assertion_path> is http://schemas.xmlsoap.org/ws/2005/05/identity/claims. 1. On the **Assignments** tab on the Applications page, select **Assign** to assign people or groups to the newly created application. If you need to sign into BigAnimal, be sure to assign yourself. - 1. On the **Sign On** tab, select **View SAML setup instructions** to open a tab with instructions for your application. - 1. Copy from the Identity Provider Single Sign-on URL from step 1 of the SAML setup instructions. - 1. Select **Download certificate** in step 3 of the instructions. + 1. On the **Sign On** tab, select **View SAML setup instructions** to open a tab with instructions for your application. + 1. Copy the Identity Provider Single Sign-on URL from step 1 of the SAML setup instructions. + 1. Select **Download certificate** in step 3 of the instructions. 1. In BigAnimal, on the **Setup Config** tab on the Set Up Identity Provider page: 1. Paste the Identity Provider Single Sign-on URL you copied from Okta into the **Single Sign-On URL** field. diff --git a/product_docs/docs/harp/1.0/index.mdx b/product_docs/docs/harp/1.0/index.mdx deleted file mode 100644 index 7138f7ccbec..00000000000 --- a/product_docs/docs/harp/1.0/index.mdx +++ /dev/null @@ -1,18 +0,0 @@ ---- -navTitle: HARP -title: "High Availability Routing for Postgres (HARP)" -directoryDefaults: - description: "High Availability Routing for Postgres (HARP) is a cluster management tool for Bi-directional Replication (BDR) clusters." ---- - -High Availability Routing for Postgres (HARP) is a cluster management tool for [Bi-directional Replication (BDR)](/bdr/latest) clusters. The core design of the tool is to route all application traffic within a data center to only one lead master at a time. A distributed consensus system is used to determine availability of the BDR nodes. 
On failure or unavailability of the lead master HARP determines a new lead master and changes application traffic routing accordingly. - -Together with the core capabilities of BDR this mechanism of routing application traffic to the lead master node rules out split brain scenarios and allows for fast failover and switchover without risk of data loss. - -HARP requires BDR Enterprise or BDR Standard versions 3.6 and 3.7. - -!!! Note - The documentation for the latest stable release is available here: - [HARP](https://documentation.2ndquadrant.com/harp/release/latest/) - - **This is a protected area of our website, if you need access please [contact us](https://www.enterprisedb.com/contact).** diff --git a/product_docs/docs/harp/2.0/01_release-notes.mdx b/product_docs/docs/harp/2.0/01_release-notes.mdx deleted file mode 100644 index a02de9957bb..00000000000 --- a/product_docs/docs/harp/2.0/01_release-notes.mdx +++ /dev/null @@ -1,168 +0,0 @@ ---- -navTitle: Release Notes -title: Release Notes ---- - -## Release 2.0.3 (2022-3-31) -### Enhancements - -* **HARP Proxy supports read-only user dedicated TLS Certificate** -You may specify a TLS Certificate and Key in the HARP Proxy DSN for the DCS read-only user. (78516, HNG-522) - -### Bug Fixes - -* HARP Proxy will continue to try and connect to DCS instead of exiting after 50 seconds. (75406, HNG-548, HNG-560, HNG-561) - -## Release 2.0.2 (2022-2-24) -### Enhancements - -* **Connection Throttling for Builtin Proxy** -You can now specify the maximum number of connections that could be used by `builtin` proxy. The proxy will adjust downward the number of connections to fit within your calculated system resource limits. (75406, 79250, HNG-489, HNG-498, HNG-503, HNG-508) - -* **CAMO disabled for BDR DCS** -HARP disables CAMO for its connection to the database to avoid performance degradation when using BDR for the Distributed Consensus System (DCS). 
(HNG-438) - -* **Improved Security for HARP Proxy** -You can specify a user with read-only DCS permissions for the `builtin` proxy. (75406, HNG-452) - -* **Start, Stop, Status hooks for managing Postgres** -You can provide start, stop, and status commands in the HARP Manager configuration for starting postgres, stopping postgres, and retrieving the status of postgres. If you do not provide commands then systemd is used by default. (HNG-492) - -* **Pgbouncer has been removed as a dependency for HARP Proxy rpm and deb packages.** -Pgbouncer will not be installed unless you select `pgbouncer` as the `harp_proxy_mode`. (HNG-511) - -* **HARP Manager has improved performance communicating with BDR DCS** -HARP Manager only communicates with BDR DCS using a local UNIX domain socket. (78516,HNG-494) - -* **Builtin proxy is now the default proxy** -If pgbouncer was being used by default in a previous release, the `harp_proxy_mode` must now be specified as `pgbouncer` to continue as the proxy on upgrade. (HNG-511) - -* **Binaries now match service names** -The `harp_manager` binary is now named `harp-manager`. Correspondingly, the ` harp_proxy` binary is now named `harp-proxy`. Symlinks with the previous names are provided. (HNG-514) - -* **Improved Harp Manager defaults** -Lag configuration defaults are now set to be off by default. Duration of leader lease now has a default setting of 6 seconds. Leader lease renewal setting is now defaulted to 2 seconds. (HNG-520) - -### Bug Fixes - -* HARP Manager now stops the database on exit. (HNG-497) -* HARP Proxy no longer leaks connections when using the `builtin` proxy. (75406,HNG-445) -* HARP Proxy no longer erroneously reports “splice: connection reset by peer” when connections are closed for `builtin` proxy. (75406, HNG-445,) -* Harpctl now returns when querying for `builtin` or `pgbouncer` proxy status. (HNG-499) -* `harpctl get cluster` output no longer contains leader and previous leader fields. 
- (HNG-483) -* Harpctl now validates proxy name when executing proxy related commands. (HNG-471) -* Harpctl now reports correct routing status for leader. (HNG-441) -* HARP configuration files now contain the correct configuration parameters for the corresponding proxy-- `builtin` or `pgbouncer`. (78516, HNG-456) -* TPAExec no longer creates a confusing DSN for DCS endpoint with a duplicate user. (78516, HNG-495) -* `request_timeout` configuration parameter does not need unit specified, which are in milliseconds. (78363, HNG-504) -* The `listen_port` and `listen_host` settings can now be configured per proxy instance using TPAExec. (78848, HNG-456) -* Subscriber only nodes,which cannot become leader nodes,are no longer considered for leadership. (78516, HNG-411) - -### Known Issues - -* When a previously isolated shadow node returns back as an active cluster node this triggers a raft election and leadership change. -* Promoting a node may cause a different node to be promoted. A race for leadership occurs between the eligible nodes. The first eligible node will become leader. Use the `--force` option with the promote command to have the desired node become leader. -* Harpctl cannot return the HARP Proxy version if HARP Proxy is configured with read only user access for BDR DCS. The version information cannot be stored by a user with read only permissions. This leads to missing version information for proxy when using harpctl to query version information. -* After fencing the database with the stop database option, if the HARP Manager is restarted and BDR DCS is configured, the database will be restarted, but will be in a fenced state. -* `use_unix_sock` will not work when deploying EDB Postgres Advanced Server. The default UNIX socket directory is not determined correctly for EDB Postgres Advanced Server. 
- -## Release 2.0.1 (2022-1-31) - -### Enhancements - -* BDR consensus now generally available - - HARP offers multiple options for Distributed Consensus Service (DCS) source: etcd and BDR. The BDR consensus option can be used in deployments where etcd is not present. Use of the BDR consensus option is no longer considered beta and is now supported for use in production environments. - -* Transport layer proxy now generally available - - HARP offers multiple proxy options for routing connections between the client application and database: application layer (L7) and transport layer (L4). The network layer 4 or transport layer proxy simply forwards network packets, whereas layer 7 terminates network traffic. The transport layer proxy, previously called simple proxy, is no longer considered beta and is now supported for use in production environments. - -## Release 2.0.0 (2021-12-01) - -### Engine - -* Complete rewrite of system in golang to optimize all operations -* Cluster state can now be bootstrapped or revised via YAML - -### Configuration - -* Rewritten in YAML -* Configuration file changed from `harp.ini` to `config.yml` - -### Enhancements - -* HARP Proxy deprecates need for HAProxy in supported architecture. - - The use of HARP Router to translate DCS contents into appropriate online or - offline states for HTTP-based URI requests meant a load balancer or HAProxy - was necessary to determine the Lead Master. HARP Proxy now does this - automatically without periodic iterative status checks. - -* Utilizes DCS key subscription to respond directly to state changes. - - With relevant cluster state changes, the cluster will respond immediately, thereby resulting in improved failover and switchover times. - -* Compatibility with etcd SSL settings. - - It is now possible to communicate with etcd through SSL encryption. - -* Zero transaction lag on switchover. 
- - The new lead node will not have transactions routed to it until all replicated transactions are replayed, thereby reducing the potential for conflicts. - -* Experimental BDR Consensus layer - - Using BDR Consensus as the Distributed Consensus Service (DCS) reduces amount of change needed for implementations. - -* Experimental Proxy - - Proxy implementation for increased session control. - -## Release 1.0.1 (2021-06-23) - -### Documentation - -* Standardize resolution of the `HARP` acronym - -### Bug fixes - -* Fix CAMO lag check to accommodate cases where `maximum_camo_lag` is set to `0` - -## Release 1.0 (2021-06-15) - -### Enhancements - -* `--dry-run` option added to `harpctl leader set` -* minimum configuration values will be enforced -* `lock_interval` parameter can be specified as fractions of a second -* logging and output improvements -* replication lag query updated to handle parallel apply - -### Bug fixes - -* `harpctl` returns an error code if `leader set` fails -* prevent corner-case failure when node peer progress not returned -* handle potentially empty node record -* catch unhandled exception when deciding the lead node candidate - -## Release 0.2 (2021-02-23) - -This is a maintenance release with following changes: - -* documentation available via the EnterpriseDB customer portal -* report non-availability of nodes other than the lead master -* when using BDR as a DCS layer, fix potential failure situations when a - BDR node is not running -* fixes RPM packaging issue preventing a new start on fresh installations - -## Release 0.1 (2020-08-13) - -This is an initial beta release providing HARP support for BDR, including: - -* Usage of native BDR (3.6.21 and later) as a consensus layer -* Usage of etcd as a consensus layer - -Note that currently HARP does not support operation on a physical streaming -replica when BDR is used as a consensus layer. 
diff --git a/product_docs/docs/harp/2.0/02_overview.mdx b/product_docs/docs/harp/2.0/02_overview.mdx deleted file mode 100644 index d5af1fc4436..00000000000 --- a/product_docs/docs/harp/2.0/02_overview.mdx +++ /dev/null @@ -1,263 +0,0 @@ ---- -navTitle: Overview -title: HARP Functionality Overview ---- - -HARP is a new approach to High Availability for BDR -clusters. It -leverages consensus-driven Quorum to determine the correct connection end-point -in a semi-exclusive manner to prevent unintended multi-node writes from an -application. - -## The Importance of Quorum - -The central purpose of HARP is to enforce full Quorum on any Postgres cluster -it manages. Quorum is merely a term generally applied to a voting body that -mandates a certain minimum of attendees are available to make a decision. Or -perhaps even more simply: Majority Rules. - -In order for any vote to end in a result other than a tie, an odd number of -nodes must constitute the full cluster membership. Quorum however does not -strictly demand this restriction; a simple majority will suffice. This means -that in a cluster of N nodes, Quorum requires a minimum of N/2+1 nodes to hold -a meaningful vote. - -All of this ensures the cluster is always in agreement regarding which node -should be "in charge". For a BDR cluster consisting of multiple nodes, this -determines which node is the primary write target. HARP designates this node -as the Lead Master. - -## Reducing Write Targets - -The consequence of ignoring the concept of Quorum, or applying it -insufficiently, may lead to a Split Brain scenario where the "correct" write -target is ambiguous or unknowable. In a standard Postgres cluster, it is -important that only a single node is ever writable and sending replication -traffic to the remaining nodes. - -Even in Multi-Master capable approaches such as BDR, it can be beneficial to -reduce the amount of necessary conflict management to derive identical data -across the cluster. 
In clusters that consist of multiple BDR nodes per physical -location or region, this usually means a single BDR node acts as a "Leader" and -remaining nodes are "Shadows". These Shadow nodes are still writable, but doing -so is discouraged unless absolutely necessary. - -By leveraging Quorum, it's possible for all nodes to agree exactly which -Postgres node should represent the entire cluster, or a local BDR region. Any -nodes that lose contact with the remainder of the Quorum, or are overruled by -it, by definition cannot become the cluster Leader. - -This prevents Split Brain situations where writes unintentionally reach two -Postgres nodes. Unlike technologies such as VPNs, Proxies, load balancers, or -DNS, a Quorum-derived consensus cannot be circumvented by mis-configuration or -network partitions. So long as it's possible to contact the Consensus layer to -determine the state of the Quorum maintained by HARP, only one target is ever -valid. - -## Basic Architecture - -The design of HARP comes in essentially two parts consisting of a Manager and -a Proxy. The following diagram describes how these interact with a single -Postgres instance: - -![HARP Unit](images/ha-unit.png) - -The Consensus Layer is an external entity where Harp Manager maintains -information it learns about its assigned Postgres node, and HARP Proxy -translates this information to a valid Postgres node target. Because Proxy -obtains the node target from the Consensus Layer, several such instances may -exist independently. - -While using BDR itself as the Consensus Layer, each server node resembles this -variant instead. 
- -![HARP Unit w/BDR Consensus](images/ha-unit-bdr.png) - -In either case, each unit consists of the following elements: - -* A Postgres or EDB instance -* A Consensus Layer resource, meant to track various attributes of the Postgres - instance -* A HARP Manager process to convey the state of the Postgres node to the - Consensus Layer -* A HARP Proxy service that directs traffic to the proper Lead Master node, - as derived from the Consensus Layer - -Not every application stack has access to additional node resources -specifically for the Proxy component, so it can be combined with the -application server to simplify the stack itself. - -This is a typical design using two BDR nodes in a single Data Center organized in a Lead Master / Shadow Master configuration: - -![HARP Cluster](images/ha-ao.png) - -Note that when using BDR itself as the HARP Consensus Layer, at least three -fully qualified BDR nodes must be present to ensure a quorum majority. - -![HARP Cluster w/BDR Consensus](images/ha-ao-bdr.png) - -(Not shown in the above diagram are connections between BDR nodes.) - -## How it Works - -When managing a BDR cluster, HARP maintains at most one "Leader" node per -defined Location. Canonically this is referred to as the Lead Master. Other BDR -nodes which are eligible to take this position are Shadow Master state until -such a time they take the Leader role. - -Applications may contact the current Leader only through the Proxy service. -Since the Consensus Layer requires Quorum agreement before conveying Leader -state, any and all Proxy services will direct traffic to that node. - -At a high level, this is ultimately what prevents application interaction with -multiple nodes simultaneously. - -### Determining a Leader - -As an example, consider the role of Lead Master within a locally subdivided -BDR Always-On group as may exist within a single data center. 
When any -Postgres or Manager resource is started, and after a configurable refresh -interval, the following must occur: - -1. The Manager checks the status of its assigned Postgres resource. - - If Postgres is not running, try again after configurable timeout. - - If Postgres is running, continue. -2. The Manager checks the status of the Leader lease in the Consensus Layer. - - If the lease is unclaimed, acquire it and assign the identity of - the Postgres instance assigned to this Manager. This lease duration is - configurable, but setting it too low may result in unexpected leadership - transitions. - - If the lease is already claimed by us, renew the lease TTL. - - Otherwise do nothing. - -Obviously a lot more happens here, but this simplified version should explain -what's happening. The Leader lease can only be held by one node, and if it's -held elsewhere, HARP Manager gives up and tries again later. - -!!! Note - Depending on the chosen Consensus Layer, rather than repeatedly looping to - check the status of the Leader lease, HARP will subscribe to notifications - instead. In this case, it can respond immediately any time the state of the - lease changes, rather than polling. Currently this functionality is - restricted to the etcd Consensus Layer. - -This means HARP itself does not hold elections or manage Quorum; this is -delegated to the Consensus Layer. The act of obtaining the lease must be -acknowledged by a Quorum of the Consensus Layer, so if the request succeeds, -that node leads the cluster in that Location. - -### Connection Routing - -Once the role of the Lead Master is established, connections are handled -with a similar deterministic result as reflected by HARP Proxy. Consider a case -where HARP Proxy needs to determine the connection target for a particular backend -resource: - -1. HARP Proxy interrogates the Consensus layer for the current Lead Master in - its configured location. -2. 
If this is unset or in transition; - - New client connections to Postgres are barred, but clients will - accumulate and be in a paused state until a Lead Master appears. - - Existing client connections are allowed to complete current transaction, - and are then reverted to a similar pending state as new connections. -3. Client connections are forwarded to the Lead Master. - -Note that the interplay demonstrated in this case does not require any -interaction with either HARP Manager or Postgres. The Consensus Layer itself -is the source of all truth from the Proxy's perspective. - -### Colocation - -The arrangement of the work units is such that their organization is required -to follow these principles: - -1. The Manager and Postgres units must exist concomitantly within the same - node. -2. The contents of the Consensus Layer dictate the prescriptive role of all - operational work units. - -This delegates cluster Quorum responsibilities to the Consensus Layer itself, -while HARP leverages it for critical role assignments and key/value storage. -Neither storage or retrieval will succeed if the Consensus Layer is inoperable -or unreachable, thus preventing rogue Postgres nodes from accepting -connections. - -As a result, the Consensus Layer should generally exist outside of HARP or HARP -managed nodes for maximum safety. Our reference diagrams reflect this in order -to encourage such separation, though it is not required. - -!!! Note - In order to operate and manage cluster state, BDR contains its own - implementation of the Raft Consensus model. HARP may be configured to - leverage this same layer to reduce reliance on external dependencies and - to preserve server resources. However, there are certain drawbacks to this - approach that are discussed in further depth in the section on the - [Consensus Layer](09_consensus-layer). 
- -## Recommended Architecture and Use - -HARP was primarily designed to represent a BDR Always-On architecture which -resides within two (or more) Data Centers and consists of at least five BDR -nodes. This does not count any Logical Standby nodes. - -The current and standard representation of this can be seen in the following -diagram: - -![BDR Always-On Reference Architecture](images/bdr-ao-spec.png) - -In this diagram, HARP Manager would exist on BDR Nodes 1-4. The initial state -of the cluster would be that BDR Node 1 is the Lead master of DC A, and BDR -Node 3 is the Lead Master of DC B. - -This would result in any HARP Proxy resource in DC A connecting to BDR Node 1, -and likewise the HARP Proxy resource in DC B connecting to BDR Node 3. - -!!! Note - While this diagram only shows a single HARP Proxy per DC, this is merely - illustrative and should not be considered a Single Point of Failure. Any - number of HARP Proxy nodes may exist, and they will all direct application - traffic to the same node. - -### Location Configuration - -In order for multiple BDR nodes to be eligible to take the Lead Master lock in -a location, a Location must be defined within the `config.yml` configuration -file. - -To reproduce the diagram above, we would have these lines in the `config.yml` -configuration for BDR Nodes 1 and 2: - -```yaml -location: dca -``` - -And for BDR Nodes 3 and 4: - -```yaml -location: dcb -``` - -This applies to any HARP Proxy nodes which are designated in those respective -data centers as well. - -### BDR 3.7 Compatibility - -BDR 3.7 and above offers more direct Location definition by assigning a -Location to the BDR node itself. This is done by calling the following SQL -API function while connected to the BDR node. 
So for BDR Nodes 1 and 2, we -might do this: - -```sql -SELECT bdr.set_node_location('dca'); -``` - -And for BDR Nodes 3 and 4: - -```sql -SELECT bdr.set_node_location('dcb'); -``` - -Afterwards, future versions of HARP Manager would derive the `location` field -directly from BDR itself. This HARP functionality is not available yet, so we -recommend using this and the setting in `config.yml` until HARP reports -compatibility with this BDR API method. diff --git a/product_docs/docs/harp/2.0/03_installation.mdx b/product_docs/docs/harp/2.0/03_installation.mdx deleted file mode 100644 index 918105dc5dc..00000000000 --- a/product_docs/docs/harp/2.0/03_installation.mdx +++ /dev/null @@ -1,129 +0,0 @@ ---- -navTitle: Installation -title: Installation ---- - -A standard installation of HARP includes two system services: - -* HARP Manager (`harp-manager`) on the node being managed -* HARP Proxy (`harp-proxy`) elsewhere - -There are generally two ways to install and configure these services to manage -Postgres for proper Quorum-based connection routing. - -## Software Versions - -HARP does have dependencies on external software. These must fit a minimum -version as listed here. - -| Software | Min Ver | -|-----------|---------| -| etcd | 3.4 | -| PgBouncer | 1.14 | - -## TPAExec - -The easiest way to install and configure HARP is to use EDB's TPAexec utility -for cluster deployment and management. For details on this software, see the -[TPAexec product page](https://access.2ndquadrant.com/customer_portal/sw/tpa/). - -!!! Note - TPAExec is currently only available through an EULA specifically dedicated - to BDR cluster deployments. If you are unable to access the above URL, - please contact your sales or account representative for more information. 
- -TPAexec itself must be configured to recognize that cluster routing should be -managed through HARP by ensuring the TPA `config.yml` file contains these -attributes: - -```yaml -cluster_vars: - failover_manager: harp -``` - -!!! Note - Versions of TPAexec prior to 21.1 require a slightly different approach: - - ```yaml - cluster_vars: - enable_harp: true - ``` - -After this, HARP will be installed by invoking the regular `tpaexec` commands -for making cluster modifications: - -```bash -tpaexec provision ${CLUSTER_DIR} -tpaexec deploy ${CLUSTER_DIR} -``` - -No other modifications should be necessary, barring cluster-specific -considerations. - - -## Package Installation - -Currently CentOS/RHEL packages are provided via the EDB packaging -infrastructure. For details, see the [HARP product -page](https://access.2ndquadrant.com/customer_portal/sw/harp/). - -### etcd Packages - -Currently `etcd` packages for many popular Linux distributions are not -available via their standard public repositories. EDB has therefore packaged -`etcd` for RHEL and CentOS versions 7 and 8, Debian, and variants such as -Ubuntu LTS. Again, access to our HARP package repository is necessary to use -these libraries. - -## Consensus layer - -HARP requires a distributed consensus layer to operate. Currently this must be -either `bdr` or `etcd`. If using fewer than 3 BDR nodes, it may become -necessary to rely on `etcd`. Otherwise any BDR service outage will reduce the -consensus layer to a single node and thus prevent node consensus and disable -Postgres routing. - -### etcd - -If using `etcd` as the consensus layer, `etcd` must be installed either -directly on the Postgres nodes, or in some separate location they can access. 
- -To set `etcd` as the consensus layer, include this in the HARP `config.yml` -configuration file: - -```yaml -dcs: - driver: etcd - endpoints: - - host1:2379 - - host2:2379 - - host3:2379 -``` - -When using TPAExec, all configured etcd endpoints will be entered here -automatically. - -### BDR - -The `bdr` native consensus layer is available from BDR 3.6.21 and 3.7.3. This -Consensus Layer model requires no supplementary software when managing routing -for a BDR cluster. - -As previously mentioned, to ensure Quorum is possible in the cluster, always -use more than two nodes so BDR's consensus layer remains responsive during node -maintenance or outages. - -To set BDR as the consensus layer, include this in the `config.yml` -configuration file: - -```yaml -dcs: - driver: bdr - endpoints: - - host=host1 dbname=bdrdb user=harp_user - - host=host2 dbname=bdrdb user=harp_user - - host=host3 dbname=bdrdb user=harp_user -``` - -As can be seen here, the endpoints for a BDR consensus layer follow the -standard Postgres DSN connection format. diff --git a/product_docs/docs/harp/2.0/04_configuration.mdx b/product_docs/docs/harp/2.0/04_configuration.mdx deleted file mode 100644 index 4236674aae4..00000000000 --- a/product_docs/docs/harp/2.0/04_configuration.mdx +++ /dev/null @@ -1,536 +0,0 @@ ---- -navTitle: Configuration -title: Configuring HARP for Cluster Management ---- - -The HARP configuration file follows a standard YAML style formatting which has -been simplified for readability. This file can be found in the `/etc/harp` -directory by default, and is named `config.yml` - -The configuration file location can be explicitly provided to all HARP -executables with the `-f`/`--config` argument. 
- -## Standard Configuration - -HARP essentially operates as three components: - -* HARP Manager -* HARP Proxy -* harpctl - -Each of these use the same standard `config.yml` configuration format, which -should always include the following sections: - -* `cluster.name` - The name of the cluster to target for all operations. -* `dcs` - DCS driver and connection configuration for all endpoints. - -Essentially this means a standard preamble will always be included for HARP -operations, and will resemble this: - -```yaml -cluster: - name: mycluster - -dcs: - ... -``` - -Other sections should be considered optional or specific to the named HARP -component. - -### Cluster Name - -The **`name`** entry under the `cluster` heading is required for _all_ -interaction with HARP. Each HARP cluster has a name for both disambiguation -purposes and for labeling data within the DCS for the specific cluster. - -HARP Managers will write information about the cluster here for consumption by -HARP Proxy and harpctl. HARP Proxy services will direct traffic to nodes within -this cluster. The `harpctl` management tool will interact with this cluster. - -### DCS Settings - -Configuring the Consensus Layer is key to HARP functionality. Without the DCS, -HARP has nowhere to store cluster metadata, cannot hold leadership elections, -and so on. Therefore this portion of the configuration is required, though -certain elements are optional. - -All elements should be specified under a section named `dcs` with multiple -supplementary entries that will be described here. - -- **`driver`**: Required type of consensus layer to use. - Currently may be `etcd` or `bdr`. Support for `bdr` as a consensus layer is - experimental. Using `bdr` as the consensus layer reduces the - additional software for consensus storage, but expects a minimum of three - full BDR member nodes to maintain quorum during database maintenance. - -- **`endpoints`**: Required list of connection strings to contact the DCS. 
- Every node of the DCS should be listed here if possible. This ensures HARP - will continue to function so long as a majority of the DCS is still - operational and reachable via the network. - - Format when using `etcd` as the consensus layer is as follows: - - ```yaml - dcs: - endpoints: - - host1:2379 - - host2:2379 - - host3:2379 - ``` - Format when using the experimental `bdr` consensus layer is as follows: - - ```yaml - dcs: - # only DSN format is supported - endpoints: - - "host=host1 port=5432 dbname=bdrdb user=postgres" - - "host=host2 port=5432 dbname=bdrdb user=postgres" - - "host=host3 port=5432 dbname=bdrdb user=postgres" - ``` -Currently, `bdr` consensus layer requires the first endpoint to point to the local postgres instance. - -- **`request_timeout`**: Time in milliseconds to consider a request as failed. - If HARP makes a request to the DCS and receives no response within this time - period, it should consider the operation as failed. This may cause the issue - to be logged as an error or retried, depending on the nature of the request. - Default: 250. - -The following DCS SSL settings only apply when ```driver: etcd``` is set in the -configuration file. - -- **`ssl`**: Either `on` or `off` to enable SSL communication with the DCS. - Default: `off` - -- **`ssl_ca_file`**: Client SSL Certificate Authority (CA) file. - -- **`ssl_cert_file`**: Client SSL certificate file. - -- **`ssl_key_file`**: Client SSL key file. - -#### Example - -Here is an example of how HARP should be configured to contact an etcd DCS -consisting of three nodes: - -```yaml -dcs: - driver: etcd - endpoints: - - host1:2379 - - host2:2379 - - host3:2379 -``` - -### HARP Manager Specific - -Besides the generic service options required for all HARP components, Manager -needs at least one more setting: - -- **`log_level`**: One of `DEBUG`, `INFO`, `WARNING`, `ERROR`, or `CRITICAL` - which may alter the amount of log output from HARP services. 
- -- **`name`**: Required name of the Postgres node represented by this Manager. - Since Manager can only represent a specific node, that node is named here and - also serves to name this Manager. If this is a BDR node, it should match the - value used at node creation when executing the - `bdr.create_node(node_name, ...)` function and as reported by the - `bdr.local_node_summary.node_name` view column. Alphanumeric characters - and underscores only. - -- **`start_command`**: This can be used instead of the information in DCS for - starting the database to be monitored. This is required if using bdr as the - consensus layer. - -- **`status_command`**: This can be used instead of the information in DCS for - the Harp Manager to determine whether or not the database is running. This is - required if using bdr as the consensus layer. - -- **`stop_command`**: This can be used instead of the information in DCS for - stopping the database. - - -Thus a complete configuration example for HARP Manager could resemble this: - -```yaml -cluster: - name: mycluster - -dcs: - driver: etcd - endpoints: - - host1:2379 - - host2:2379 - - host3:2379 - -manager: - name: node1 - log_level: INFO -``` - -Note that this is essentially the DCS contact information, any associated -service customizations, the name of the cluster itself, and the name of the -node. All other settings are associated with the node itself and is stored -within the DCS. - -Please read the section on [Node Bootstrapping](05_bootstrapping) for more about -specific node settings and initializing nodes to be managed by HARP Manager. - -### HARP Proxy Specific - -Some configuration options are specific to HARP Proxy. These affect how the -daemon itself operates, and thus are currently located in the `config.yml` file -itself. - -Proxy-based settings may be specified under a `proxy` heading, and include: - -- **`location`**: Required name of Location HARP Proxy should represent. 
 - HARP Proxy nodes are directly tied to the Location where they are running, as - they always direct traffic to the current Lead Master node. This must be - specified for any defined proxy. - -- **`log_level`**: One of `DEBUG`, `INFO`, `WARNING`, `ERROR`, or `CRITICAL` - which may alter the amount of log output from HARP services. - - * Default: `INFO` - -- **`name`**: Name of this specific Proxy. - Each Proxy node is named to ensure any associated statistics or operating - state are available in status checks and other interactive events. - -- **`type`**: Specifies whether pgbouncer or the experimental built-in passthrough proxy will be used. All proxies must use the same proxy type. It is recommended to only experiment with the simple proxy in combination with the experimental BDR DCS. - May be `pgbouncer` or `builtin`. - - * Default: `pgbouncer` - -- **`pgbouncer_bin_dir`**: Directory where PgBouncer binaries are located. - As HARP utilizes PgBouncer binaries, it needs to know where these are - located. This can be platform or distribution dependent, so has no - default. The assumption is that the appropriate binaries are in the - environment's `PATH` variable otherwise. - -#### Example - -HARP Proxy requires the cluster name, DCS connection settings, location, and -name of the proxy in operation. An example lies below: - -```yaml -cluster: - name: mycluster - -dcs: - driver: etcd - endpoints: - - host1:2379 - - host2:2379 - - host3:2379 - -proxy: - name: proxy1 - location: dc1 - pgbouncer_bin_dir: /usr/sbin -``` - -All other attributes are obtained from the DCS upon Proxy startup. - -## Run-Time Directives - -While it is possible to configure HARP Manager, HARP Proxy, or harpctl with a -minimum of YAML in the `config.yml` file, some customizations are held within -the DCS itself. These values must either be initialized via bootstrap or set -specifically with `harpctl set` directives. - -This section will outline these, and how they may be specified. 
- -### Cluster Wide - -Settings here should be set under a `cluster` YAML heading during bootstrap, or -modified with a `harpctl set cluster` command. - -- **`event_sync_interval`**: Time in milliseconds to wait for synchronization. - When events occur within HARP, they do so asynchronously across the cluster. - HARP Managers start operating immediately when they detect metadata changes, - and HARP Proxies may pause traffic and start reconfiguring endpoints. This is - a safety interval that is meant to roughly approximate the maximum amount of - event time skew that may exist between all HARP components. - - For example, suppose Node A goes offline and HARP Manager on Node B commonly - receives this event 5 milliseconds before Node C. A setting of at least 5ms - would then be necessary to ensure all HARP Manager services have received the - event before they begin to process it. - - This also applies to HARP Proxy. - -### Node Directives - -Most node-oriented settings can be changed and subsequently applied while HARP -Manager is active. These items are retained in the DCS after initial bootstrap, -and thus may be modified without altering a configuration file. - -Settings here should be set under a `node` YAML heading during bootstrap, or -modified with a `harpctl set node` command. - -- **`camo_enforcement`**: Whether CAMO queue state should be strictly enforced. - When set to `strict`, HARP will never allow switchover or failover to a BDR - CAMO partner node unless it is fully caught up with the entire CAMO queue at - the time of the migration. When set to `lag_only`, only standard lag - thresholds such as `maximum_camo_lag` are applied. - -- **`dcs_reconnect_interval`**: The interval, measured in ms, between attempts that a disconnected node tries to reconnect to the DCS. - - * Default 1000. - -- **`dsn`**: Required full connection string to the managed Postgres node. 
 - This parameter applies equally to all HARP services and enables - micro-architectures which run only one service per container. - - !!! Note - HARP sets the `sslmode` argument to `require` by default and will prevent - connections to servers that do not require SSL. To disable this behavior, - explicitly set this parameter to a more permissive value such as - `disable`, `allow`, or `prefer`. - -- **`db_data_dir`**: Required Postgres data directory. - This is required by HARP Manager to start, stop, or reload the Postgres - service. It is also the default location for configuration files, which may - be used at a later time for controlling promotion of streaming replicas. - -- **`db_conf_dir`**: Location of Postgres configuration files. - Some platforms prefer storing Postgres configuration files away from the - Postgres data directory itself. In these cases, this should be set to that - expected location. - -- **`db_log_file`**: Location of Postgres log file. - - * Default `/tmp/pg_ctl.out` - -- **`fence_node_on_dcs_failure`**: In the event HARP is unable to reach the DCS, several readiness keys and the leadership lease itself will expire. This will implicitly prevent a node from routing consideration. However, such a node is not officially fenced, and the manager will not stop monitoring the database if `stop_database_when_fenced` is set to false. - - * Default: False - -- **`leader_lease_duration`**: Amount of time in seconds the Lead Master - lease will persist if not refreshed. This allows any HARP Manager a certain - grace period to refresh the lock, before expiration allows another node to - obtain the Lead Master lock instead. - - * Default: 30 - -- **`lease_refresh_interval`**: Amount of time in milliseconds between - refreshes of the Lead Master lease. This essentially controls the time - between each series of checks HARP Manager performs against its assigned - Postgres node, and when the status of the node is updated in the Consensus - layer. 
 - - * Default: 5000 -- **`max_dcs_failures`**: The amount of DCS request failures before marking a node as fenced according to fence_node_on_dcs_failure. This prevents transient communication disruptions from shutting down database nodes. - - * Default: 10 - -- **`maximum_lag`**: Highest allowable variance (in bytes) between last - recorded LSN of previous Lead Master and this node before being allowed to - take the Lead Master lock. This prevents nodes experiencing terminal amounts - of lag from taking the Lead Master lock. Set to -1 to disable this check. - - * Default: 1048576 (1MB) - -- **`maximum_camo_lag`**: Highest allowable variance (in bytes) between last - received LSN and applied LSN between this node and its CAMO partner(s). - This should only apply to clusters where CAMO is both available and enabled. - Thus this only applies to BDR EE clusters where `pg2q.enable_camo` is set. - Clusters with particularly stringent CAMO apply queue restrictions should set - this very low, or even to 0 to avoid any unapplied CAMO transactions. Set to - -1 to disable this check. - - * Default: 1048576 (1MB) - -- **`ready_status_duration`**: Amount of time in seconds the node's readiness - status will persist if not refreshed. This is a failsafe that will remove a - node from being contacted by HARP Proxy if the HARP Manager in charge of it - stops operating. - - * Default: 30 - -- **`db_bin_dir`**: Directory where Postgres binaries are located. - As HARP utilizes Postgres binaries, such as `pg_ctl`, it needs to know where - these are located. This can be platform or distribution dependent, so has no - default. The assumption is that the appropriate binaries are in the - environment's `PATH` variable otherwise. - -- **`priority`**: Any numeric value. - Any node where this option is set to -1 will be unable to take the Lead Master role, even when attempting to explicitly set the Lead Master using `harpctl`. 
- - * Default: 100 - -- **`stop_database_when_fenced`**: Rather than simply removing a node from all possible routing, stop the database on a node when it is fenced. This is an extra safeguard to prevent data from other sources than HARP Proxy from reaching the database, or in case proxies are unable to disconnect clients for some other reason. - - * Default: False - -- **`consensus_timeout`**: Amount of milliseconds before aborting a read or - write to the consensus layer. In the event the consensus layer loses - quorum or becomes unreachable, we want near-instant errors rather than - infinite timeouts. This prevents blocking behavior in such cases. - Note: When using `bdr` as the consensus layer, the highest recognized timeout - is 1000ms. - - * Default: 250 - -- **`use_unix_socket`**: Specifies that HARP Manager should prefer to use - unix sockets to connect to the database. - - * Default: False - -All of these run-time directives can be modified via `harpctl`. Consider if we -wished to decrease the `lease_refresh_interval` to 100ms on `node1`: - -```bash -harpctl set node node1 lease_refresh_interval=100 -``` - -### Proxy Directives - -Certain settings to the Proxy can be changed while the service is active. These -items are retained in the DCS after initial bootstrap, and thus may be modified -without altering a configuration file. Many of these settings are direct -mappings to their PgBouncer equivalent, and we will note these where relevant. - -Settings here should be set under a `proxies` YAML heading during bootstrap, or -modified with a `harpctl set proxy` command. -Properties set via `harpctl set proxy` require a restart of the proxy. - -- **`auth_file`**: The full path to a PgBouncer-style `userlist.txt` file. - HARP Proxy will use this file to store a `pgbouncer` user which will have - access to PgBouncer's Admin database. This file may be used for other users - as well. 
Proxy will modify this file to add and modify the password for the - `pgbouncer` user. - - * Default `/etc/harp/userlist.txt` - -- **`auth_type`**: What type of Postgres authentication to use for password - matching. This is actually a PgBouncer setting and is not fully compatible - with the Postgres `pg_hba.conf` capabilities. We recommend using `md5`, `pam` - `cert`, or `scram-sha-256`. - - * Default `md5` - -- **`auth_query`**: Query to verify a user’s password with Postgres. - Direct access to `pg_shadow` requires admin rights. It’s preferable to use a - non-superuser that calls a `SECURITY DEFINER` function instead. If using - TPAexec to create a cluster, a function named `pgbouncer_get_auth` will be - installed on all databases within the `pg_catalog` namespace to fulfill this - purpose. - -- **`auth_user`**: If `auth_user` is set, then any user not specified in - `auth_file` will be queried through the `auth_query` query from `pg_shadow` - in the database, using `auth_user`. The password of `auth_user` will be - taken from `auth_file`. - -- **`client_tls_ca_file`**: Root certificate file to validate client - certificates. Requires `client_tls_sslmode` to be set. - -- **`client_tls_cert_file`**: Certificate for private key. Clients can - validate it. Requires `client_tls_sslmode` to be set. - -- **`client_tls_key_file`**: Private key for PgBouncer to accept client - connections. Requires `client_tls_sslmode` to be set. - -- **`client_tls_protocols`**: Which TLS protocol versions are allowed for - client connections. - Allowed values: `tlsv1.0`, `tlsv1.1`, `tlsv1.2`, `tlsv1.3`. - Shortcuts: `all` (tlsv1.0,tlsv1.1,tlsv1.2,tlsv1.3), - `secure` (tlsv1.2,tlsv1.3), `legacy` (all). - - * Default `secure` - -- **`client_tls_sslmode`**: Whether to enable client SSL functionality. - May be one of `disable` `allow` `prefer` `require` `verify-ca` `verify-full`. 
- - * Default `disable` - -- **`database_name`**: Required name that represents which database clients - will use when connecting to HARP Proxy. This is a stable endpoint that will - not change and points to the current node, database name, port, etc., - necessary to connect to the Lead Master. The global value `*` may be used - here so all connections get directed to this target regardless of database - name. - -- **`default_pool_size`**: The maximum amount of active connections to allow - per database / user combination. This is for connection pooling purposes, - but will do nothing in session pooling mode. This is a PgBouncer setting. - - * Default 25 - -- **`ignore_startup_parameters`**: By default, PgBouncer allows only - parameters it can keep track of in startup packets: `client_encoding`, - `datestyle`, `timezone`, and `standard_conforming_strings`. All other - parameters will raise an error. To allow other parameters, they can be - specified here so that PgBouncer knows that they are handled by the admin - and it can ignore them. It is often necessary to set this to - `extra_float_digits` for Java applications to function properly. - - * Default `extra_float_digits` - -- **`listen_address`**: IP address(es) where Proxy should listen for - connections. Used by pgbouncer and builtin proxy. - - * Default 0.0.0.0 - -- **`listen_port`**: System Port where Proxy should listen for connections. - Used by pgbouncer and builtin proxy. - - * Default 6432 - -- **`max_client_conn`**: The total maximum amount of active client - connections that are allowed on the Proxy. This can be many orders of - magnitude greater than `default_pool_size`, as these are all connections that - have yet to be assigned a session, or have released a session for use by - another client connection. This is a PgBouncer setting. - - * Default 100 - -- **`monitor_interval`**: Time in seconds between Proxy checks of PgBouncer. 
- Since HARP Proxy manages PgBouncer as the actual connection management - layer, it needs to periodically check various status and stats to verify - it's still operational. Some of this information may also be logged or - registered to the DCS. - - * Default 5 - -- **`server_tls_protocols`**: Which TLS protocol versions are allowed for - server connections. - Allowed values: `tlsv1.0`, `tlsv1.1`, `tlsv1.2`, `tlsv1.3`. - Shortcuts: `all` (tlsv1.0,tlsv1.1,tlsv1.2,tlsv1.3), - `secure` (tlsv1.2,tlsv1.3), `legacy` (all). - - * Default `secure` - -- **`server_tls_sslmode`**: Whether to enable server SSL functionality. - May be one of `disable` `allow` `prefer` `require` `verify-ca` `verify-full`. - - * Default `disable` - -- **`session_transfer_mode`**: Method by which to transfer sessions. - May be one of `fast` `wait` `reconnect`. - - * Default `wait` - -- **`server_transfer_timeout`**: The number of seconds Harp proxy will wait before giving up on a PAUSE and issuing a KILL command. - - * Default 30 - -The following two options only apply when using the built-in proxy. - -- **`keepalive`**: The number of seconds the built-in proxy will wait before sending a keepalive message to an idle leader connection. - - * Default 5 - - -- **`timeout`**: The number of seconds the built-in proxy will wait before giving up on connecting to the leader. - - * Default 1 - -When using `harpctl` to change any of these settings for all proxies, use the -`global` keyword in place of the proxy name. Example: - -```bash -harpctl set proxy global max_client_conn=1000 -``` diff --git a/product_docs/docs/harp/2.0/05_bootstrapping.mdx b/product_docs/docs/harp/2.0/05_bootstrapping.mdx deleted file mode 100644 index 9808e5eb858..00000000000 --- a/product_docs/docs/harp/2.0/05_bootstrapping.mdx +++ /dev/null @@ -1,199 +0,0 @@ ---- -navTitle: Bootstrapping -title: Cluster Bootstrapping ---- - -In order to use HARP, a minimum amount of metadata must exist in the DCS. 
The -process of "bootstrapping" a cluster essentially means initializing node, -location, and other run-time configuration either all at once, or on a -per-resource basis. - -This entire process is governed through the `harpctl apply` command. For more -information on this, please check the [harpctl docs](08_harpctl). - -This section assumes the DCS is set up and functional. - -!!! Important - While examples in this document imply bootstrapping must be done section - by section, this is not the case. It is possible to combine any or all of - these into a single YAML document and apply it all at once. We simply split - up these sections for illustrative purposes and to simplify. - -## Cluster-Wide Bootstrapping - -Some settings are applied cluster-wide, and can be specified during -bootstrapping. Currently this only applies to the `event_sync_interval` -run-time directive, but others may be added later. - -The format for this is as follows: - -```yaml -cluster: - name: mycluster - event_sync_interval: 100 -``` - -Assuming that file was named `cluster.yml`, you would then apply it with the -following: - -```bash -harpctl apply cluster.yml -``` - -If the cluster name is not already defined within the DCS, this will also -initialize that value. - -!!! Important - The Cluster name parameter specified here will always override the cluster - name supplied in `config.yml`. The assumption is that the bootstrap file - supplies all necessary elements to bootstrap a cluster or some portion of - its larger configuration. A `config.yml` file is primarily meant to control - the execution of HARP Manager, HARP Proxy, or `harpctl` specifically. - -## Location Bootstrapping - -Every HARP node is associated with at most one Location. This Location may be -a single Data Center, a grouped Region consisting of multiple underlying -servers, an Amazon Availability Zone, and so on. 
This is merely a logical -structure that allows HARP to group nodes together such that only one will -represent the nodes in that Location as the Lead Master. - -Thus it is necessary to initialize one or more locations. The format for this -is as follows: - -```yaml -cluster: - name: mycluster - -locations: - - location: dc1 - - location: dc2 -``` - -Assuming that file was named `locations.yml`, you would then apply it with the -following: - -```bash -harpctl apply locations.yml -``` - -Note that when performing any manipulation of the cluster, the name should be -included as a preamble so the changes get directed to the right place. - -Once Locations are bootstrapped, they should show up with a quick examination: - -```bash -> harpctl get locations - -Cluster Location Leader Previous Leader Target Leader Lease Renewals -------- -------- ------ --------------- ------------- -------------- -mycluster dc1 -mycluster dc2 -``` - -As can be seen here, both locations are recognized by HARP and available for -node and Proxy assignment. - -## Node Bootstrapping - -HARP Nodes exist within a named cluster, and must have a designated name. -Beyond this, all other settings are retained within the DCS itself, as they are -dynamic and may affect how HARP interacts with them. To this end, each node -should be bootstrapped using one or more of the run-time directives discussed -in the [Configuration](04_configuration) documentation. - -While bootstrapping a node, there are a few required fields: - -* `name` -* `location` -* `dsn` -* `pg_data_dir` - -Everything else is optional and may depend on the cluster itself. 
Because it -is possible to bootstrap multiple nodes at once, the format generally fits -this structure: - -```yaml -cluster: - name: mycluster - -nodes: - - name: node1 - location: dc1 - dsn: host=node1 dbname=bdrdb user=postgres - pg_data_dir: /db/pgdata - leader_lease_duration: 10 - priority: 500 -``` - -Assuming that file was named `node1.yml`, you would then apply it with the -following: - -```bash -harpctl apply node1.yml -``` - -Once Nodes are bootstrapped, they should show up with a quick examination: - -```bash -> harpctl get nodes - -Cluster Name Location Ready Fenced Allow Routing Routing Status Role Type Lock Duration -------- ---- -------- ----- ------ ------------- -------------- ---- ---- ------------- -mycluster bdra1 dc1 true false true ok primary bdr 30 -``` - -## Proxy Bootstrapping - -Unlike Locations or Nodes, Proxies can also supply a configuration template -that is applied to all proxies within a Location. These are stored in the DCS -under the `global` designation. Each proxy also requires a name to exist as -an instance, but no further customization is necessary unless some setting -needs a specific override. - -This is because there are likely to be multiple proxies that have the same -default configuration settings for the cluster, and repeating these values for -every single proxy shouldn't be necessary. - -Additionally, when bootstrapping the Proxy template, at least one database -should be defined for connection assignments. With these notes in mind, the -format for this is as follows: - -```yaml -cluster: - name: mycluster - -proxies: - monitor_interval: 5 - default_pool_size: 20 - max_client_conn: 1000 - database_name: bdrdb - instances: - - name: proxy1 - - name: proxy2 - default_pool_size: 50 -``` - -This would configure HARP for two proxies: `proxy1` and `proxy2`, of which only -`proxy2` would have a custom `default_pool_size`, while using the global -settings otherwise. 
- -Assuming that file was named `proxy.yml`, you would then apply it with the -following: - -```bash -harpctl apply proxy.yml -``` - -Once the Proxy template is bootstrapped, it should show up with a quick -examination: - -```bash -> harpctl get proxies - -Cluster Name Pool Mode Auth Type Max Client Conn Default Pool Size -------- ---- --------- --------- --------------- ----------------- -mycluster global session md5 1000 20 -mycluster proxy1 session md5 1000 20 -mycluster proxy2 session md5 1000 50 -``` diff --git a/product_docs/docs/harp/2.0/06_harp_manager.mdx b/product_docs/docs/harp/2.0/06_harp_manager.mdx deleted file mode 100644 index cc6d237bcb5..00000000000 --- a/product_docs/docs/harp/2.0/06_harp_manager.mdx +++ /dev/null @@ -1,122 +0,0 @@ ---- -navTitle: HARP Manager -title: HARP Manager ---- - -HARP Manager is a daemon which interacts with the local PostgreSQL / BDR node -and stores information about its state in the Consensus Layer. Manager -determines which node currently holds Leader status for a respective location, -and enforces configuration (lag, CAMO lag, etc.) constraints to prevent -ineligible nodes from Leader consideration. - -Every Postgres node in the cluster should have an associated HARP Manager. -Other nodes may exist, but they will not be able to participate as Lead or -Shadow Master roles, or other functionality HARP supports in the future. - -## How it Works - -Upon starting, HARP Manager will use `pg_ctl` to start Postgres if it is not -already running. After this, it will periodically check the server as defined -by the `node.lease_refresh_interval` setting. HARP Manager collects various -bits of data about Postgres including: - -* The node's current LSN -* If Postgres is running and accepting connections. This particular data point - is considered a lease which must be periodically renewed. If it expires, HARP - Proxy will remove the node from any existing routing. 
-* The current apply LSN position for all upstream BDR peer nodes. -* If CAMO is enabled: - - Name of the CAMO partner - - Peer CAMO state (`is_ready`) - - CAMO queue received and applied LSN positions -* Node type, such as whether the node is BDR or regular Postgres. -* The node's current role, such as a read/write, physical streaming replica, - logical standby, and so on. -* BDR node state, which should be `ACTIVE` except in limited cases. -* BDR Node ID for other metadata gathering. -* Other tracking values. - -!!! Important - When naming BDR nodes within HARP, the BDR node name should match the node - name represented in the `node.name` configuration attribute. This should - have already been done in the bootstrap process. - -The data collected here is fully available to other HARP Manager processes, and -is used to evaluate lag, partner readiness, and other criteria that will direct -switchover and failover behavior. - -After updating the node metadata, HARP Manager will either refresh the Lead -Master lease if it is already held by the local node, or seek to obtain the -lease if it has expired. Since the current state of all nodes is known to all -other nodes, the node which was the previous Lead Master is given automatic -priority ranking if present. If not, all other nodes will list themselves by -LSN lag, node priority, and other criteria, and the most qualified node will seize the Lead Master lease. - -This procedure happens for every defined Location where nodes are present. Thus -for Locations DC1 and DC2, there would be a Lead Master node in each, with a -separate lease and election process for both. - -HARP Manager repeats these Postgres status checks, lease renewals, and -elections repeatedly to ensure the Cluster always has a Lead Master target for -connections from HARP Proxy. - -## Configuration - -HARP Manager expects the `dcs`, `cluster`, and `manager` configuration stanzas. 
-The following is a functional example: - -```yaml -cluster: - name: mycluster - -dcs: - driver: etcd - endpoints: - - host1:2379 - - host2:2379 - - host3:2379 - -manager: - name: node1 - postgres_bin_dir: /usr/lib/postgresql/13/bin -``` - -Changes to the configuration file (default: `/etc/harp/config.yml`) can be -applied by issuing `SIGHUP` to the running instance, or by calling a -service-level reload. - -See [Configuration](04_configuration) for further details. - -## Usage - -This is the basic usage for HARP Manager: - -```bash -Usage of ./harp-manager: - -f string - Optional path to config file (shorthand) - --config string - Optional path to config file -``` - -Note that there are no arguments to launch `harp-manager` as a forked daemon. -This software is designed to be launched through systemd or within a container -as a top-level process. This also means output is directed to STDOUT and STDERR -for capture and access through journald or an attached container terminal. - -## Disabling and Re-enabling HARP Manager Control of Postgres - -It is possible to temporarily pause HARP Manager control of Postgres. This -results in a state where the daemon continues running but does not perform any -operations that could affect existing behavior of the cluster. Re-enabling -management causes it to resume operation. - -An example of temporarily disabling node management would be: - -```bash -harpctl unmanage node node1 -``` - -See the [harpctl](08_harpctl) documentation for more details. - -Node management by HARP Manager is enabled by default. diff --git a/product_docs/docs/harp/2.0/07_harp_proxy.mdx b/product_docs/docs/harp/2.0/07_harp_proxy.mdx deleted file mode 100644 index 998b13af117..00000000000 --- a/product_docs/docs/harp/2.0/07_harp_proxy.mdx +++ /dev/null @@ -1,227 +0,0 @@ ---- -navTitle: HARP Proxy -title: HARP Proxy ---- - -HARP Proxy is a daemon which acts as an abstraction layer between the client -application and Postgres. 
It interfaces with the Consensus Layer to obtain the -identity of the current Lead Master node and directs traffic to that location. -In the event of a planned switchover or unplanned failover, it will -automatically redirect to the new Lead Master node as dictated by the DCS. - -You may select between pgbouncer or builtin for HARP Proxy. When using pgbouncer, -HARP Proxy is an interface layer between the DCS and PgBouncer. As such, PgBouncer -is a prerequisite and should be installed in addition, in order for HARP Proxy to -fully manage its activity. - -The builtin proxy does not require any additional software. When using builtin, -HARP Proxy functions as a level 4 pass-through proxy. -# PgBouncer -## How it Works - -Upon starting, HARP Proxy will launch PgBouncer if it is not already running, -and leave client connections in a paused state. Afterwards, it will contact the -DCS to determine the identity of the Lead Master, configure PgBouncer to use -this as the target for database connections, and resume connection activity. -All application client traffic will then pass through PgBouncer into the -current Lead Master node for the Location where this proxy is operating. - -While PgBouncer is running, HARP Proxy will check its status based on the -`monitor_interval` configuration setting within the DCS, and store it in the -DCS for monitoring purposes. This will allow interrogation with `harpctl` to -retrieve status of all configured proxies, or any one proxy in particular. - -In the event the Lead Master lease is not set, HARP Proxy will pause all -connection traffic until a new Lead Master is established. This also applies -to circumstances when `harpctl promote` is used to invoke a planned transition -to a new Lead Master. It uses a PgBouncer `PAUSE` command for this, so existing -sessions are allowed to complete any pending transactions before they are held -in stasis. 
- -## Configuration - -HARP Proxy expects the `dcs`, `cluster`, and `proxy` configuration stanzas. The -following is a functional example: - -```yaml -cluster: - name: mycluster - -dcs: - driver: etcd - endpoints: - - host1:2379 - - host2:2379 - - host3:2379 - -proxy: - name: proxy1 -``` - -## Usage - -This is the basic usage for HARP Proxy: - -```bash -Usage of ./harp-proxy: - -f string - Optional path to config file (shorthand) - --config string - Optional path to config file -``` - -Note that there are no arguments to launch `harp-proxy` as a forked daemon. -This software is designed to be launched through systemd or within a container -as a top-level process. This also means output is directed to STDOUT and STDERR -for capture and access through journald or an attached container terminal. - -## PgBouncer Configuration File - -Since HARP Proxy currently utilizes PgBouncer for connection management and -redirection, a `pgbouncer.ini` file must exist. HARP Manager builds this file -based on various run-time directives as defined in the -[Proxy Directives](04_configuration) documentation. - -This file will be located in the same folder as the `config.yml` used by HARP -Proxy. Any PgBouncer process launched by HARP Proxy will use this configuration -file, and it may be used for debugging or information purposes. Modifications -to this automatically generated `pgbouncer.ini` file will be lost any time -HARP Proxy is restarted, so use `harpctl set proxy` to alter these settings -instead. Calling `harpctl set proxy` does not update the `pgbouncer.ini` file until the proxy has been restarted. - -## Disabling and Re-enabling HARP Proxy Node Management - -It is possible to temporarily pause HARP Proxy control of PgBouncer. This -results in a state where the daemon continues running but does not perform any -operations that could affect existing behavior of the cluster. Re-enabling -management causes it to resume operation. 
- -An example of temporarily disabling management of a specific proxy would be: - -```bash -harpctl unmanage proxy proxy1 -``` - -See the [harpctl](08_harpctl) documentation for more details. - -Proxy node management is enabled by default. - -## Passthrough User Authentication - -We strongly recommend configuring HARP Proxy to use the `auth_user` and -`auth_query` run-time directives. If these are not set, the PgBouncer -`userlist.txt` file must include username and password hash combinations for -every user PgBouncer needs to authenticate on Postgres' behalf. - -this should *not* be the `pgbouncer` user itself, as this is utilized by HARP -Proxy as an admin-level user in order to operate the underlying PgBouncer -service. - -In clusters administered by TPAexec, a function will be created and installed -in the `pg_catalog` schema in the `template1` database during provisioning. -This means any subsequently created databases will also include the function, -and it will be available to PgBouncer regardless of which database the user is -attempting to contact. - -If TPAexec is not used, we still recommend this function definition: - -```sql -CREATE OR REPLACE FUNCTION pg_catalog.pgbouncer_get_auth(p_usename TEXT) -RETURNS TABLE(username TEXT, password TEXT) AS $$ -BEGIN - RETURN QUERY - SELECT usename::TEXT, passwd::TEXT FROM pg_catalog.pg_shadow - WHERE usename = p_usename; -END; -$$ LANGUAGE plpgsql SECURITY DEFINER - -REVOKE ALL ON FUNCTION pg_catalog.pgbouncer_get_auth(p_usename TEXT) - FROM PUBLIC - -GRANT EXECUTE ON FUNCTION pg_catalog.pgbouncer_get_auth(p_usename TEXT) - TO ; -``` - -Don't forget to substitute `` for the `auth_user` field supplied to -HARP Proxy. 
- -Then in the Bootstrap file, the following will complete the configuration: - -```yaml -cluster: - name: mycluster - -proxies: - monitor_interval: 5 - default_pool_size: 20 - max_client_conn: 1000 - auth_user: pgb_auth - auth_query: "SELECT * FROM pg_catalog.pgbouncer_get_auth($1)" - database_name: bdrdb - instances: - - name: proxy1 - - name: proxy2 -``` - -It is also possible to define these fields with `harpctl set proxy`: - -```bash -harpctl set proxy global auth_user=pgb_auth -``` - -!!! Note - This means the `postgres` or `enterprisedb` OS user that launches HARP - Proxy will need a `.pgpass` file so that `auth_user` can authenticate - against Postgres. - -# Builtin Proxy -## How it Works - -Upon starting, HARP Proxy will listen for incoming connections on the listening -address and listening port specified in the bootstrap file per proxy instance. -All application client traffic will then pass through Builtin Proxy into the -current Lead Master node for the Location where this proxy is operating. - -In the event the Lead Master lease is not set, HARP Proxy will disconnect all -connection traffic until a new Lead Master is established. This also applies -to circumstances when `harpctl promote` is used to invoke a planned transition -to a new Lead Master. The disconnect is immediate. - -## Configuration - -HARP Proxy expects the `dcs`, `cluster`, and `proxy` configuration stanzas. The -following is a functional example: - -```yaml -cluster: - name: mycluster - -dcs: - driver: etcd - endpoints: - - host1:2379 - - host2:2379 - - host3:2379 - -proxy: - name: proxy1 -``` -Each proxy will connect to the DCS to retrieve what hosts and ports to listen on for connections. - -## Usage - -This is the basic usage for HARP Proxy: - -```bash -Usage of ./harp-proxy: - -f string - Optional path to config file (shorthand) - --config string - Optional path to config file -``` - -Note that there are no arguments to launch `harp-proxy` as a forked daemon. 
-This software is designed to be launched through systemd or within a container -as a top-level process. This also means output is directed to STDOUT and STDERR -for capture and access through journald or an attached container terminal. - diff --git a/product_docs/docs/harp/2.0/09_consensus-layer.mdx b/product_docs/docs/harp/2.0/09_consensus-layer.mdx deleted file mode 100644 index b37082b3968..00000000000 --- a/product_docs/docs/harp/2.0/09_consensus-layer.mdx +++ /dev/null @@ -1,144 +0,0 @@ ---- -navTitle: Consensus Layer -title: Consensus Layer Considerations ---- - -HARP is designed so that it can work with different implementations of -Consensus Layer, also known as Distributed Control Systems (DCS). - -Currently the following DCS implementations are supported: - - - etcd - - BDR - -This section provides information specific to HARP's interaction with the -supported DCS implementations. - -## BDR Driver Compatibility - -The `bdr` native consensus layer is available from BDR versions -[3.6.21](/bdr/latest/release-notes/#bdr-3621) -and [3.7.3](/bdr/latest/release-notes/#bdr-373) respectively. - -Note that for the purpose of maintaining a voting Quorum, BDR Logical Standby -nodes do not participate in consensus communications within a BDR cluster. Do -not count these in the total node list to fulfill DCS Quorum requirements. - -## Maintaining Quorum - -Clusters of any architecture require at least n/2 + 1 nodes to maintain -Consensus via a voting Quorum. Thus a 3-node cluster may tolerate the outage of -a single node, a 5-node cluster can tolerate a 2-node outage, and so on. If -consensus is ever lost, HARP will become inoperable because the DCS prevents it -from deterministically identifying which node is the Lead Master within a -particular Location. - -As a result, whichever DCS is chosen, more than half of the nodes must always -be available _cluster-wide_. This can become a non-trivial element when -distributing DCS nodes among two or more Data Centers. 
A Network Partition will -prevent Quorum in any Location that cannot maintain a voting majority, and thus -HARP will cease operations. - -Thus an odd-number of nodes (with a minimum of 3) is crucial when building the -Consensus Layer itself. An ideal case distributes nodes across a minimum of -three independent locations to prevent a single Network Partition from -disrupting Consensus. - -One example configuration is to designate two DCS nodes in two Data Centers -coinciding with the primary BDR nodes, and a fifth DCS node (such as a BDR -Witness) elsewhere. Using such a design, a network partition between the two -BDR Data Centers would not disrupt Consensus thanks to the independently -located node. - -### Multi-Consensus Variant - -HARP itself assumes one Lead Master per configured Location. Normally each -Location is specified within HARP using the `location` configuration setting. -By creating a separate DCS cluster per Location, it becomes possible to emulate -this behavior independently of HARP. - -To accomplish this, HARP should be configured in `config.yml` to use a different -DCS connection target per desired Location. - -HARP nodes in DC-A would use something like this: - -```yaml -location: dca -dcs: - driver: etcd - endpoints: - - dcs-a1:2379 - - dcs-a2:2379 - - dcs-a3:2379 -``` - -While DC-B would use different hostnames corresponding to nodes in its -canonical Location: - -```yaml -location: dcb -dcs: - driver: etcd - endpoints: - - dcs-a1:2379 - - dcs-a2:2379 - - dcs-a3:2379 -``` - -There is no DCS communication between different Data Centers in this design, -and thus a Network Partition between them will not impact HARP operation. A -consequence of this is that HARP is completely unaware of nodes in the other -Location, and each Location operates essentially as a separate HARP cluster. - -This is not possible when using BDR as the DCS, as BDR maintains a Consensus -Layer across all participant nodes. 
- -A possible drawback to this approach is that `harpctl` is unable to interact -with nodes outside of the current Location. It will be impossible to obtain -node information, get or set the Lead Master, or any other operation that -targets the other Location. Essentially this organization renders the -`--location` parameter to `harpctl` unusable. - -### TPAexec and Consensus - -The above considerations are integrated into TPAexec as well. When deploying a -cluster using etcd, it will automatically construct a separate DCS cluster per -Location to facilitate High Availability in favor of strict Consistency. - -Thus this configuration example: - -```yaml -cluster_vars: - failover_manager: harp - harp_consensus_protocol: etcd - -locations: - - Name: first - - Name: second -``` - -Would group any DCS nodes assigned to the `first` location together, and the -`second` location would be a separate cluster. To override this behavior, -configure the `harp_location` implicitly to force a particular grouping. - -Thus this example would return all etcd nodes into a single cohesive DCS layer: - -```yaml -cluster_vars: - failover_manager: harp - harp_consensus_protocol: etcd - -locations: - - Name: first - - Name: second - - Name: all_dcs - -instance_defaults: - vars: - harp_location: all_dcs -``` - -The `harp_location` override may also be necessary to favor specific node -groupings when using cloud providers such as Amazon which favor Availability -Zones within Regions in favor of traditional Data Centers. - diff --git a/product_docs/docs/harp/2.0/index.mdx b/product_docs/docs/harp/2.0/index.mdx deleted file mode 100644 index e20b8634f56..00000000000 --- a/product_docs/docs/harp/2.0/index.mdx +++ /dev/null @@ -1,22 +0,0 @@ ---- -navTitle: HARP -title: "High Availability Routing for Postgres (HARP)" -directoryDefaults: - description: "High Availability Routing for Postgres (HARP) is a cluster management tool for Bi-directional Replication (BDR) clusters." 
---- - -High Availability Routing for Postgres (HARP) is a cluster management tool for -[Bi-directional Replication (BDR)](/bdr/latest) clusters. The core design of -the tool is to route all application traffic within a single data center or -region to only one node at a time. This node, designated the Lead Master, acts -as the principle write target to reduce the potential for data conflicts. - -HARP leverages a distributed consensus model to determine availability of the -BDR nodes in the cluster. On failure or unavailability of the Lead Master, HARP -elects a new Lead Master and redirects application traffic accordingly. - -Together with the core capabilities of BDR, this mechanism of routing -application traffic to the Lead Master node enables fast failover and -switchover without risk of data loss. - -HARP requires BDR versions 3.6 and above. diff --git a/product_docs/docs/harp/2/01_release_notes/harp2.0.1_rel_notes.mdx b/product_docs/docs/harp/2/01_release_notes/harp2.0.1_rel_notes.mdx new file mode 100644 index 00000000000..208b8eb8be2 --- /dev/null +++ b/product_docs/docs/harp/2/01_release_notes/harp2.0.1_rel_notes.mdx @@ -0,0 +1,18 @@ +--- +title: "Version 2.0.1" +--- + +This is a patch release of HARP 2 that includes fixes for issues identified +in previous versions. + +| Type | Description | +| ---- |------------ | +| Enhancement | Support for selecting a leader per location rather than relying on DCS like etcd to have separate setup in different locations. This still requires a majority of nodes to survive loss of a location, so an odd number of both locations and database nodes is recommended.| +| Enhancement | The BDR DCS now uses a push notification from the consensus rather than through polling nodes. This change reduces the time for new leader selection and the load that HARP does on the BDR DCS since it doesn't need to poll in short intervals anymore. 
| Enhancement | TPA now restarts each HARP Proxy one by one and waits until they come back to reduce any downtime incurred by the application during software upgrades. | +| Enhancement | The support for embedding PGBouncer directly into HARP Proxy is now deprecated and will be removed in the next major release of HARP. It's now possible to configure TPA to put PGBouncer on the same node as HARP Proxy and point to that HARP Proxy.| +| Bug Fix | `harpctl promote ` would occasionally promote a different node than the one specified. This has been fixed. [Support Ticket #75406] | +| Bug Fix | Fencing would sometimes fail when using BDR as the Distributed Consensus Service. This has been corrected. | +| Bug Fix | `harpctl apply` no longer turns off routing for leader after the cluster has been established. [Support Ticket #80790] | +| Bug Fix | Harp-manager no longer exits if it cannot start a failed database. Harp-manager will keep retrying with randomly increasing periods. [Support Ticket #78516] | +| Bug Fix | The internal pgbouncer proxy implementation had a memory leak. This has been remediated. | diff --git a/product_docs/docs/harp/2/01_release_notes/harp2.0.2_rel_notes.mdx b/product_docs/docs/harp/2/01_release_notes/harp2.0.2_rel_notes.mdx new file mode 100644 index 00000000000..407bed9c8ec --- /dev/null +++ b/product_docs/docs/harp/2/01_release_notes/harp2.0.2_rel_notes.mdx @@ -0,0 +1,11 @@ +--- +title: "Version 2.0.2" +--- + +This is a patch release of HARP 2 that includes fixes for issues identified +in previous versions. + +| Type | Description | +| ---- |------------ | +| Enhancement | BDR consensus now generally available.

HARP offers multiple options for Distributed Consensus Service (DCS) source: etcd and BDR. The BDR consensus option can be used in deployments where etcd isn't present. Use of the BDR consensus option is no longer considered beta and is now supported for use in production environments.

| +| Enhancement | Transport layer proxy now generally available.

HARP offers multiple proxy options for routing connections between the client application and database: application layer (L7) and transport layer (L4). The network layer 4 or transport layer proxy simply forwards network packets, and layer 7 terminates network traffic. The transport layer proxy, previously called simple proxy, is no longer considered beta and is now supported for use in production environments.

| diff --git a/product_docs/docs/harp/2/01_release_notes/harp2.0.3_rel_notes.mdx b/product_docs/docs/harp/2/01_release_notes/harp2.0.3_rel_notes.mdx new file mode 100644 index 00000000000..75722ff6794 --- /dev/null +++ b/product_docs/docs/harp/2/01_release_notes/harp2.0.3_rel_notes.mdx @@ -0,0 +1,11 @@ +--- +title: "Version 2.0.3" +--- + +This is a patch release of HARP 2 that includes fixes for issues identified +in previous versions. + +| Type | Description | +| ---- |------------ | +| Enhancement | HARP Proxy supports read-only user dedicated TLS Certificate (RT78516) | +| Bug Fix | HARP Proxy continues to try and connect to DCS instead of exiting after 50 seconds. (RT75406) | diff --git a/product_docs/docs/harp/2/01_release_notes/harp2.1.0_rel_notes.mdx b/product_docs/docs/harp/2/01_release_notes/harp2.1.0_rel_notes.mdx new file mode 100644 index 00000000000..6f32304a894 --- /dev/null +++ b/product_docs/docs/harp/2/01_release_notes/harp2.1.0_rel_notes.mdx @@ -0,0 +1,18 @@ +--- +title: "Version 2.1.0" +--- + +This is a minor release of HARP 2 that includes new features as well +as fixes for issues identified in previous versions. + +| Type | Description | +| ---- |------------ | +| Feature | Support for selecting a leader per location rather than relying on DCS like etcd to have separate setup in different locations.

This still requires a majority of nodes to survive loss of a location, so an odd number of both locations and database nodes is recommended.

| +| Feature | The BDR DCS now uses a push notification from the consensus rather than through polling nodes.

This change reduces the time for new leader selection and the load that HARP does on the BDR DCS since it doesn't need to poll in short intervals anymore.

| +| Feature | TPA now restarts each HARP Proxy one by one and waits until they come back to reduce any downtime incurred by the application during software upgrades. | +| Feature | The support for embedding PGBouncer directly into HARP Proxy is now deprecated and will be removed in the next major release of HARP.

It's now possible to configure TPA to put PGBouncer on the same node as HARP Proxy and point to that HARP Proxy.

| +| Bug Fix | `harpctl promote ` would occasionally promote a different node than the one specified. This has been fixed. (RT75406) | +| Bug Fix | Fencing would sometimes fail when using BDR as the Distributed Consensus Service. This has been corrected. | +| Bug Fix | `harpctl apply` no longer turns off routing for leader after the cluster has been established. (RT80790) | +| Bug Fix | Harp-manager no longer exits if it cannot start a failed database. Harp-manager will keep retrying with randomly increasing periods. (RT78516) | +| Bug Fix | The internal pgbouncer proxy implementation had a memory leak. This has been remediated. | diff --git a/product_docs/docs/harp/2/01_release_notes/harp2_rel_notes.mdx b/product_docs/docs/harp/2/01_release_notes/harp2_rel_notes.mdx new file mode 100644 index 00000000000..8f63b7c921b --- /dev/null +++ b/product_docs/docs/harp/2/01_release_notes/harp2_rel_notes.mdx @@ -0,0 +1,18 @@ +--- +title: "Version 2.0.0" +--- + +This is a new major release of HARP that consists of a complete rewrite of the +product. + +| Type | Description | +| ---- |------------ | +| Engine | Complete rewrite of system in golang to optimize all operations | +| Engine | Cluster state can now be bootstrapped or revised via YAML | +| Feature | Configuration now in YAML, configuration file changed from `harp.ini` to `config.yml` | +| Feature | HARP Proxy deprecates need for HAProxy in supported architecture.

The use of HARP Router to translate DCS contents into appropriate online or offline states for HTTP-based URI requests meant a load balancer or HAProxy was necessary to determine the lead master. HARP Proxy now does this automatically without periodic iterative status checks.

| +| Feature | Utilizes DCS key subscription to respond directly to state changes.

With relevant cluster state changes, the cluster responds immediately, resulting in improved failover and switchover times.

| +| Feature | Compatibility with etcd SSL settings.

It is now possible to communicate with etcd through SSL encryption.

| +| Feature | Zero transaction lag on switchover.

Transactions are not routed to the new lead node until all replicated transactions are replayed, thereby reducing the potential for conflicts.

|
+| Feature | Experimental BDR Consensus layer.

Using BDR Consensus as the Distributed Consensus Service (DCS) reduces the amount of change needed for implementations.

|
+| Feature | Experimental built-in proxy.

Proxy implementation for increased session control.

| diff --git a/product_docs/docs/harp/2/01_release_notes/index.mdx b/product_docs/docs/harp/2/01_release_notes/index.mdx new file mode 100644 index 00000000000..d8e0f48b601 --- /dev/null +++ b/product_docs/docs/harp/2/01_release_notes/index.mdx @@ -0,0 +1,25 @@ +--- +title: Release Notes +navigation: +- harp2.1.0_rel_notes +- harp2.0.3_rel_notes +- harp2.0.2_rel_notes +- harp2.0.1_rel_notes +- harp2_rel_notes +--- + +High Availability Routing for Postgres (HARP) is a cluster-management tool for +[Bi-directional Replication (BDR)](/bdr/latest) clusters. The core design of +the tool is to route all application traffic in a single data center or +region to only one node at a time. This node, designated the lead master, acts +as the principal write target to reduce the potential for data conflicts. + +The release notes in this section provide information on what was new in each release. + +| Version | Release Date | +| ----------------------- | ------------ | +| [2.1.0](harp2.1.0_rel_notes) | 2022 May 17 | +| [2.0.3](harp2.0.3_rel_notes) | 2022 Mar 31 | +| [2.0.2](harp2.0.2_rel_notes) | 2022 Feb 24 | +| [2.0.1](harp2.0.1_rel_notes) | 2022 Jan 31 | +| [2.0.0](harp2_rel_notes) | 2021 Dec 01 | diff --git a/product_docs/docs/harp/2/02_overview.mdx b/product_docs/docs/harp/2/02_overview.mdx new file mode 100644 index 00000000000..7db92e093cd --- /dev/null +++ b/product_docs/docs/harp/2/02_overview.mdx @@ -0,0 +1,246 @@ +--- +navTitle: Overview +title: HARP functionality overview +--- + +HARP is a new approach to high availability for BDR +clusters. It leverages a consensus-driven quorum to determine the correct connection endpoint +in a semi-exclusive manner to prevent unintended multi-node writes from an +application. + +## The importance of quorum + +The central purpose of HARP is to enforce full quorum on any Postgres cluster +it manages. Quorum is a term applied to a voting body that +mandates a certain minimum of attendees are available to make a decision. 
More simply: majority rules. + +For any vote to end in a result other than a tie, an odd number of +nodes must constitute the full cluster membership. Quorum, however, doesn't +strictly demand this restriction; a simple majority is enough. This means +that in a cluster of N nodes, quorum requires a minimum of N/2+1 nodes to hold +a meaningful vote. + +All of this ensures the cluster is always in agreement regarding the node +that is "in charge." For a BDR cluster consisting of multiple nodes, this +determines the node that is the primary write target. HARP designates this node +as the lead master. + +## Reducing write targets + +The consequence of ignoring the concept of quorum, or not applying it +well enough, can lead to a "split brain" scenario where the "correct" write +target is ambiguous or unknowable. In a standard Postgres cluster, it's +important that only a single node is ever writable and sending replication +traffic to the remaining nodes. + +Even in multi-master-capable approaches such as BDR, it can be helpful to +reduce the amount of necessary conflict management to derive identical data +across the cluster. In clusters that consist of multiple BDR nodes per physical +location or region, this usually means a single BDR node acts as a "leader" and +remaining nodes are "shadow." These shadow nodes are still writable, but writing to them is discouraged unless absolutely necessary. + +By leveraging quorum, it's possible for all nodes to agree on the exact +Postgres node to represent the entire cluster or a local BDR region. Any +nodes that lose contact with the remainder of the quorum, or are overruled by +it, by definition can't become the cluster leader. + +This restriction prevents split-brain situations where writes unintentionally reach two +Postgres nodes. Unlike technologies such as VPNs, proxies, load balancers, or +DNS, you can't circumvent a quorum-derived consensus by misconfiguration or +network partitions. 
So long as it's possible to contact the consensus layer to +determine the state of the quorum maintained by HARP, only one target is ever +valid. + +## Basic architecture + +The design of HARP comes in essentially two parts, consisting of a manager and +a proxy. The following diagram describes how these interact with a single +Postgres instance: + +![HARP Unit](images/ha-unit.png) + +The consensus layer is an external entity where Harp Manager maintains +information it learns about its assigned Postgres node, and HARP Proxy +translates this information to a valid Postgres node target. Because Proxy +obtains the node target from the consensus layer, several such instances can +exist independently. + +While using BDR as the consensus layer, each server node resembles this +variant instead: + +![HARP Unit w/BDR Consensus](images/ha-unit-bdr.png) + +In either case, each unit consists of the following elements: + +* A Postgres or EDB instance +* A consensus layer resource, meant to track various attributes of the Postgres + instance +* A HARP Manager process to convey the state of the Postgres node to the + consensus layer +* A HARP Proxy service that directs traffic to the proper lead master node, + as derived from the consensus layer + +Not every application stack has access to additional node resources +specifically for the Proxy component, so it can be combined with the +application server to simplify the stack. + +This is a typical design using two BDR nodes in a single data center organized in a lead master/shadow master configuration: + +![HARP Cluster](images/ha-ao.png) + +When using BDR as the HARP consensus layer, at least three +fully qualified BDR nodes must be present to ensure a quorum majority. (Not shown in the diagram are connections between BDR nodes.) + +![HARP Cluster w/BDR Consensus](images/ha-ao-bdr.png) + +## How it works + +When managing a BDR cluster, HARP maintains at most one leader node per +defined location. 
This is referred to as the lead master. Other BDR
+nodes that are eligible to take this position are in a shadow master state until they take the leader role.
+
+Applications can contact the current leader only through the proxy service.
+Since the consensus layer requires quorum agreement before conveying leader
+state, proxy services direct traffic to that node.
+
+At a high level, this mechanism prevents simultaneous application interaction with
+multiple nodes.
+
+### Determining a leader
+
+As an example, consider the role of lead master in a locally subdivided
+BDR Always-On group as can exist in a single data center. When any
+Postgres or Manager resource is started, and after a configurable refresh
+interval, the following must occur:
+
+1. The Manager checks the status of its assigned Postgres resource.
+   - If Postgres isn't running, try again after configurable timeout.
+   - If Postgres is running, continue.
+2. The Manager checks the status of the leader lease in the consensus layer.
+   - If the lease is unclaimed, acquire it and assign the identity of
+     the Postgres instance assigned to this manager. This lease duration is
+     configurable, but setting it too low can result in unexpected leadership
+     transitions.
+   - If the lease is already claimed by us, renew the lease TTL.
+   - Otherwise do nothing.
+
+A lot more occurs, but this simplified version explains
+what's happening. The leader lease can be held by only one node, and if it's
+held elsewhere, HARP Manager gives up and tries again later.
+
+!!! Note
+    Depending on the chosen consensus layer, rather than repeatedly looping to
+    check the status of the leader lease, HARP subscribes to notifications. In this case, it can respond immediately any time the state of the
+    lease changes rather than polling. Currently this functionality is
+    restricted to the etcd consensus layer.
+
+This means HARP itself doesn't hold elections or manage quorum, which is
+delegated to the consensus layer.
A quorum of the consensus layer must acknowledge the act of obtaining the lease, so if the request succeeds, +that node leads the cluster in that location. + +### Connection routing + +Once the role of the lead master is established, connections are handled +with a similar deterministic result as reflected by HARP Proxy. Consider a case +where HARP Proxy needs to determine the connection target for a particular backend +resource: + +1. HARP Proxy interrogates the consensus layer for the current lead master in + its configured location. +2. If this is unset or in transition: + - New client connections to Postgres are barred, but clients + accumulate and are in a paused state until a lead master appears. + - Existing client connections are allowed to complete current transactions + and are then reverted to a similar pending state as new connections. +3. Client connections are forwarded to the lead master. + +The interplay shown in this case doesn't require any +interaction with either HARP Manager or Postgres. The consensus layer +is the source of all truth from the proxy's perspective. + +### Colocation + +The arrangement of the work units is such that their organization must follow these principles: + +1. The manager and Postgres units must exist concomitantly in the same + node. +2. The contents of the consensus layer dictate the prescriptive role of all + operational work units. + +This arrangement delegates cluster quorum responsibilities to the consensus layer, +while HARP leverages it for critical role assignments and key/value storage. +Neither storage nor retrieval succeeds if the consensus layer is inoperable +or unreachable, thus preventing rogue Postgres nodes from accepting +connections. + +As a result, the consensus layer generally exists outside of HARP or HARP-managed nodes for maximum safety. Our reference diagrams show this separation, although it isn't required. + +!!! 
Note + To operate and manage cluster state, BDR contains its own + implementation of the Raft Consensus model. You can configure HARP to + leverage this same layer to reduce reliance on external dependencies and + to preserve server resources. However, certain drawbacks to this + approach are discussed in + [Consensus layer](09_consensus-layer). + +## Recommended architecture and use + +HARP was primarily designed to represent a BDR Always-On architecture that +resides in two or more data centers and consists of at least five BDR +nodes. This configuration doesn't count any logical standby nodes. + +The following diagram shows the current and standard representation: + +![BDR Always-On Reference Architecture](images/bdr-ao-spec.png) + +In this diagram, HARP Manager exists on BDR Nodes 1-4. The initial state +of the cluster is that BDR Node 1 is the lead master of DC A, and BDR +Node 3 is the lead master of DC B. + +This configuration results in any HARP Proxy resource in DC A connecting to BDR Node 1 +and the HARP Proxy resource in DC B connecting to BDR Node 3. + +!!! Note + While this diagram shows only a single HARP Proxy per DC, this is + an example only and should not be considered a single point of failure. Any + number of HARP Proxy nodes can exist, and they all direct application + traffic to the same node. + +### Location configuration + +For multiple BDR nodes to be eligible to take the lead master lock in +a location, you must define a location in the `config.yml` configuration +file. + +To reproduce the BDR Always-On reference architecture shown in the diagram, include these lines in the `config.yml` +configuration for BDR Nodes 1 and 2: + +```yaml +location: dca +``` + +For BDR Nodes 3 and 4, add: + +```yaml +location: dcb +``` + +This applies to any HARP Proxy nodes that are designated in those respective +data centers as well. 
+ +### BDR 3.7 compatibility + +BDR 3.7 and later offers more direct location definition by assigning a +location to the BDR node. This is done by calling the following SQL +API function while connected to the BDR node. So for BDR Nodes 1 and 2, you +might do this: + +```sql +SELECT bdr.set_node_location('dca'); +``` + +And for BDR Nodes 3 and 4: + +```sql +SELECT bdr.set_node_location('dcb'); +``` diff --git a/product_docs/docs/harp/2/03_installation.mdx b/product_docs/docs/harp/2/03_installation.mdx new file mode 100644 index 00000000000..a347de1ba26 --- /dev/null +++ b/product_docs/docs/harp/2/03_installation.mdx @@ -0,0 +1,128 @@ +--- +navTitle: Installation +title: Installation +--- + +A standard installation of HARP includes two system services: + +* HARP Manager (`harp-manager`) on the node being managed +* HARP Proxy (`harp-proxy`) elsewhere + +There are two ways to install and configure these services to manage +Postgres for proper quorum-based connection routing. + +## Software versions + +HARP has dependencies on external software. These must fit a minimum +version as listed here. + +| Software | Min version | +|-----------|---------| +| etcd | 3.4 | +| PgBouncer | 1.14 | + +## TPAExec + +The easiest way to install and configure HARP is to use the EDB TPAexec utility +for cluster deployment and management. For details on this software, see the +[TPAexec product page](https://access.2ndquadrant.com/customer_portal/sw/tpa/). + +!!! Note + TPAExec is currently available only through an EULA specifically dedicated + to BDR cluster deployments. If you can't access the TPAExec URL, + contact your sales or account representative. + +Configure TPAexec to recognize that cluster routing is +managed through HARP by ensuring the TPA `config.yml` file contains these +attributes: + +```yaml +cluster_vars: + failover_manager: harp +``` + +!!! 
Note + Versions of TPAexec earlier than 21.1 require a slightly different approach: + + ```yaml + cluster_vars: + enable_harp: true + ``` + +After this, install HARP by invoking the `tpaexec` commands +for making cluster modifications: + +```bash +tpaexec provision ${CLUSTER_DIR} +tpaexec deploy ${CLUSTER_DIR} +``` + +No other modifications are necessary apart from cluster-specific +considerations. + + +## Package installation + +Currently CentOS/RHEL packages are provided by the EDB packaging +infrastructure. For details, see the [HARP product +page](https://access.2ndquadrant.com/customer_portal/sw/harp/). + +### etcd packages + +Currently `etcd` packages for many popular Linux distributions aren't +available by their standard public repositories. EDB has therefore packaged +`etcd` for RHEL and CentOS versions 7 and 8, Debian, and variants such as +Ubuntu LTS. You need access to our HARP package repository to use +these libraries. + +## Consensus layer + +HARP requires a distributed consensus layer to operate. Currently this must be +either `bdr` or `etcd`. If using fewer than three BDR nodes, you might need to rely on `etcd`. Otherwise any BDR service outage reduces the +consensus layer to a single node and thus prevents node consensus and disables +Postgres routing. + +### etcd + +If you're using `etcd` as the consensus layer, `etcd` must be installed either +directly on the Postgres nodes or in a separate location they can access. + +To set `etcd` as the consensus layer, include this code in the HARP `config.yml` +configuration file: + +```yaml +dcs: + driver: etcd + endpoints: + - host1:2379 + - host2:2379 + - host3:2379 +``` + +When using TPAExec, all configured etcd endpoints are entered here +automatically. + +### BDR + +The `bdr` native consensus layer is available from BDR 3.6.21 and 3.7.3. This +consensus layer model requires no supplementary software when managing routing +for a BDR cluster. 
+
+To ensure quorum is possible in the cluster, always
+use more than two nodes so that BDR's consensus layer remains responsive during node
+maintenance or outages.
+
+To set BDR as the consensus layer, include this in the `config.yml`
+configuration file:
+
+```yaml
+dcs:
+  driver: bdr
+  endpoints:
+    - host=host1 dbname=bdrdb user=harp_user
+    - host=host2 dbname=bdrdb user=harp_user
+    - host=host3 dbname=bdrdb user=harp_user
+```
+
+The endpoints for a BDR consensus layer follow the
+standard Postgres DSN connection format.
diff --git a/product_docs/docs/harp/2/04_configuration.mdx b/product_docs/docs/harp/2/04_configuration.mdx
new file mode 100644
index 00000000000..10ac24fba06
--- /dev/null
+++ b/product_docs/docs/harp/2/04_configuration.mdx
@@ -0,0 +1,540 @@
+---
+navTitle: Configuration
+title: Configuring HARP for cluster management
+---
+
+The HARP configuration file follows a standard YAML-style formatting that was simplified for readability. This file is located in the `/etc/harp`
+directory by default and is named `config.yml`.
+
+You can explicitly provide the configuration file location to all HARP
+executables by using the `-f`/`--config` argument.
+
+## Standard configuration
+
+HARP essentially operates as three components:
+
+* HARP Manager
+* HARP Proxy
+* harpctl
+
+Each of these uses the same standard `config.yml` configuration format, which always includes the following sections:
+
+* `cluster.name` — The name of the cluster to target for all operations.
+* `dcs` — DCS driver and connection configuration for all endpoints.
+
+This means a standard preamble is always included for HARP
+operations, such as the following:
+
+```yaml
+cluster:
+  name: mycluster
+
+dcs:
+  ...
+```
+
+Other sections are optional or specific to the named HARP
+component.
+
+### Cluster name
+
+The `name` entry under the `cluster` heading is required for all
+interaction with HARP.
Each HARP cluster has a name for both disambiguation +and for labeling data in the DCS for the specific cluster. + +HARP Manager writes information about the cluster here for consumption by +HARP Proxy and harpctl. HARP Proxy services direct traffic to nodes in +this cluster. The `harpctl` management tool interacts with this cluster. + +### DCS settings + +Configuring the consensus layer is key to HARP functionality. Without the DCS, +HARP has nowhere to store cluster metadata, can't hold leadership elections, +and so on. Therefore this portion of the configuration is required, and +certain elements are optional. + +Specify all elements under a section named `dcs` with these multiple +supplementary entries: + +- **`driver`**: Required type of consensus layer to use. + Currently can be `etcd` or `bdr`. Support for `bdr` as a consensus layer is + experimental. Using `bdr` as the consensus layer reduces the + additional software for consensus storage but expects a minimum of three + full BDR member nodes to maintain quorum during database maintenance. + +- **`endpoints`**: Required list of connection strings to contact the DCS. + List every node of the DCS here if possible. This ensures HARP + continues to function as long as a majority of the DCS can still + operate and be reached by the network. + + Format when using `etcd` as the consensus layer is as follows: + + ```yaml + dcs: + endpoints: + - host1:2379 + - host2:2379 + - host3:2379 + ``` + Format when using the experimental `bdr` consensus layer is as follows: + + ```yaml + dcs: + # only DSN format is supported + endpoints: + - "host=host1 port=5432 dbname=bdrdb user=postgres" + - "host=host2 port=5432 dbname=bdrdb user=postgres" + - "host=host3 port=5432 dbname=bdrdb user=postgres" + ``` +Currently, `bdr` consensus layer requires the first endpoint to point to the local postgres instance. + +- **`request_timeout`**: Time in milliseconds to consider a request as failed. 
+ If HARP makes a request to the DCS and receives no response in this time, it considers the operation as failed. This can cause the issue + to be logged as an error or retried, depending on the nature of the request. + Default: 250. + +The following DCS SSL settings apply only when ```driver: etcd``` is set in the +configuration file: + +- **`ssl`**: Either `on` or `off` to enable SSL communication with the DCS. + Default: `off` + +- **`ssl_ca_file`**: Client SSL certificate authority (CA) file. + +- **`ssl_cert_file`**: Client SSL certificate file. + +- **`ssl_key_file`**: Client SSL key file. + +#### Example + +This example shows how to configure HARP to contact an etcd DCS +consisting of three nodes: + +```yaml +dcs: + driver: etcd + endpoints: + - host1:2379 + - host2:2379 + - host3:2379 +``` + +### HARP Manager specific + +Besides the generic service options required for all HARP components, Manager +needs other settings: + +- **`log_level`**: One of `DEBUG`, `INFO`, `WARNING`, `ERROR`, or `CRITICAL`, + which might alter the amount of log output from HARP services. + +- **`name`**: Required name of the Postgres node represented by this Manager. + Since Manager can represent only a specific node, that node is named here and + also serves to name this Manager. If this is a BDR node, it must match the + value used at node creation when executing the + `bdr.create_node(node_name, ...)` function and as reported by the + `bdr.local_node_summary.node_name` view column. Alphanumeric characters + and underscores only. + +- **`start_command`**: This can be used instead of the information in DCS for + starting the database to monitor. This is required if using bdr as the + consensus layer. + +- **`status_command`**: This can be used instead of the information in DCS for + the Harp Manager to determine whether the database is running. This is + required if using bdr as the consensus layer. 
+ +- **`stop_command`**: This can be used instead of the information in DCS for + stopping the database. + +- **`db_retry_wait_min`**: The initial time in seconds to wait if Harp Manager cannot + connect to the database before trying again. Harp Manager will increase the + wait time with each attempt, up to the `db_retry_wait_max` value. + +- **`db_retry_wait_max`**: The maximum time in seconds to wait if Harp Manager cannot + connect to the database before trying again. + + +Thus a complete configuration example for HARP Manager might look like this: + +```yaml +cluster: + name: mycluster + +dcs: + driver: etcd + endpoints: + - host1:2379 + - host2:2379 + - host3:2379 + +manager: + name: node1 + log_level: INFO +``` + +This configuration is essentially the DCS contact information, any associated +service customizations, the name of the cluster, and the name of the +node. All other settings are associated with the node and is stored +in the DCS. + +Read the [Node bootstrapping](05_bootstrapping) for more about +specific node settings and initializing nodes to be managed by HARP Manager. + +### HARP Proxy specific + +Some configuration options are specific to HARP Proxy. These affect how the +daemon operates and thus are currently located in `config.yml`. + +Specify Proxy-based settings under a `proxy` heading, and include: + +- **`location`**: Required name of location for HARP Proxy to represent. + HARP Proxy nodes are directly tied to the location where they are running, as + they always direct traffic to the current lead master node. Specify location + for any defined proxy. + +- **`log_level`**: One of `DEBUG`, `INFO`, `WARNING`, `ERROR`, or `CRITICAL`, + which might alter the amount of log output from HARP services. + + * Default: `INFO` + +- **`name`**: Name of this specific proxy. + Each proxy node is named to ensure any associated statistics or operating + state are available in status checks and other interactive events. 
+
+- **`type`**: Specifies whether to use pgbouncer or the experimental built-in passthrough proxy. All proxies must use the same proxy type. We recommend experimenting with only the simple proxy in combination with the experimental BDR DCS.
+  Can be `pgbouncer` or `builtin`.
+
+  * Default: `pgbouncer`
+
+- **`pgbouncer_bin_dir`**: Directory where PgBouncer binaries are located.
+  As HARP uses PgBouncer binaries, it needs to know where they are
+  located. This can depend on the platform or distribution, so it has no
+  default. Otherwise, the assumption is that the appropriate binaries are in the
+  environment's `PATH` variable.
+
+#### Example
+
+HARP Proxy requires the cluster name, DCS connection settings, location, and
+name of the proxy in operation. For example:
+
+```yaml
+cluster:
+  name: mycluster
+
+dcs:
+  driver: etcd
+  endpoints:
+    - host1:2379
+    - host2:2379
+    - host3:2379
+
+proxy:
+  name: proxy1
+  location: dc1
+  pgbouncer_bin_dir: /usr/sbin
+```
+
+All other attributes are obtained from the DCS on proxy startup.
+
+## Runtime directives
+
+While it is possible to configure HARP Manager, HARP Proxy, or harpctl with a
+minimum of YAML in the `config.yml` file, some customizations are held in
+the DCS. These values must either be initialized via bootstrap or set
+specifically with `harpctl set` directives.
+
+### Cluster-wide
+
+Set these settings under a `cluster` YAML heading during bootstrap, or
+modify them with a `harpctl set cluster` command.
+
+- **`event_sync_interval`**: Time in milliseconds to wait for synchronization.
+  When events occur in HARP, they do so asynchronously across the cluster.
+  HARP managers start operating immediately when they detect metadata changes,
+  and HARP proxies might pause traffic and start reconfiguring endpoints. This is
+  a safety interval that roughly approximates the maximum amount of
+  event time skew that exists between all HARP components.
+ + For example, suppose Node A goes offline and HARP Manager on Node B commonly + receives this event 5 milliseconds before Node C. A setting of at least 5 ms + is then needed to ensure all HARP Manager services receive the + event before they begin to process it. + + This also applies to HARP Proxy. + +### Node directives + +You can change most node-oriented settings and then apply them while HARP +Manager is active. These items are retained in the DCS after initial bootstrap, +and thus you can modify them without altering a configuration file. + +Set these settings under a `node` YAML heading during bootstrap, or +modify them with a `harpctl set node` command. + +- **`node_type`**: The type of this database node, either `bdr` or `witness`. You can't promote a + witness node to leader. + +- **`camo_enforcement`**: Whether to strictly enforce CAMO queue state. + When set to `strict`, HARP never allows switchover or failover to a BDR + CAMO partner node unless it's fully caught up with the entire CAMO queue at + the time of the migration. When set to `lag_only`, only standard lag + thresholds such as `maximum_camo_lag` are applied. + +- **`dcs_reconnect_interval`**: The interval, measured in milliseconds, between attempts that a disconnected node tries to reconnect to the DCS. + + * Default: 1000. + +- **`dsn`**: Required full connection string to the managed Postgres node. + This parameter applies equally to all HARP services and enables + micro-architectures that run only one service per container. + + !!! Note + HARP sets the `sslmode` argument to `require` by default and prevents + connections to servers that don't require SSL. To disable this behavior, + explicitly set this parameter to a more permissive value such as + `disable`, `allow`, or `prefer`. + +- **`db_data_dir`**: Required Postgres data directory. + This is required by HARP Manager to start, stop, or reload the Postgres + service. 
It's also the default location for configuration files, which you can use + later for controlling promotion of streaming replicas. + +- **`db_conf_dir`**: Location of Postgres configuration files. + Some platforms prefer storing Postgres configuration files away from the + Postgres data directory. In these cases, set this option to that + expected location. + +- **`db_log_file`**: Location of Postgres log file. + + * Default: `/tmp/pg_ctl.out` + +- **`fence_node_on_dcs_failure`**: If HARP can't reach the DCS, several readiness keys and the leadership lease expire. This implicitly prevents a node from routing consideration. However, such a node isn't officially fenced, and the Manager doesn't stop monitoring the database if `stop_database_when_fenced` is set to `false`. + + * Default: False + +- **`leader_lease_duration`**: Amount of time in seconds the lead master + lease persists if not refreshed. This allows any HARP Manager a certain + grace period to refresh the lock, before expiration allows another node to + obtain the lead master lock instead. + + * Default: 6 + +- **`lease_refresh_interval`**: Amount of time in milliseconds between + refreshes of the lead master lease. This essentially controls the time + between each series of checks HARP Manager performs against its assigned + Postgres node and when the status of the node is updated in the consensus + layer. + + * Default: 2000 +- **`max_dcs_failures`**: The amount of DCS request failures before marking a node as fenced according to `fence_node_on_dcs_failure`. This setting prevents transient communication disruptions from shutting down database nodes. + + * Default: 10 + +- **`maximum_lag`**: Highest allowable variance (in bytes) between last + recorded LSN of previous lead master and this node before being allowed to + take the lead master lock. This setting prevents nodes experiencing terminal amounts + of lag from taking the lead master lock. Set to `-1` to disable this check. 
+ + * Default: -1 + +- **`maximum_camo_lag`**: Highest allowable variance (in bytes) between last + received LSN and applied LSN between this node and its CAMO partners. + This applies only to clusters where CAMO is both available and enabled. + Thus this applies only to BDR EE clusters where `pg2q.enable_camo` is set. + For clusters with particularly stringent CAMO apply queue restrictions, set + this very low or even to `0` to avoid any unapplied CAMO transactions. Set to + `-1` to disable this check. + + * Default: -1 + +- **`ready_status_duration`**: Amount of time in seconds the node's readiness + status persists if not refreshed. This is a failsafe that removes a + node from being contacted by HARP Proxy if the HARP Manager in charge of it + stops operating. + + * Default: 30 + +- **`db_bin_dir`**: Directory where Postgres binaries are located. + As HARP uses Postgres binaries, such as `pg_ctl`, it needs to know where + they're located. This can depend on the platform or distribution, so it has no + default. Otherwise, the assumption is that the appropriate binaries are in the + environment's `PATH` variable. + +- **`priority`**: Any numeric value. + Any node where this option is set to `-1` can't take the lead master role, even when attempting to explicitly set the lead master using `harpctl`. + + * Default: 100 + +- **`stop_database_when_fenced`**: Rather than removing a node from all possible routing, stop the database on a node when it is fenced. This is an extra safeguard to prevent data from other sources than HARP Proxy from reaching the database or in case proxies can't disconnect clients for some other reason. + + * Default: False + +- **`consensus_timeout`**: Amount of milliseconds before aborting a read or + write to the consensus layer. If the consensus layer loses + quorum or becomes unreachable, you want near-instant errors rather than + infinite timeouts. This prevents blocking behavior in such cases. 
+ When using `bdr` as the consensus layer, the highest recognized timeout + is 1000 ms. + + * Default: 250 + +- **`use_unix_socket`**: Specifies for HARP Manager to prefer to use + Unix sockets to connect to the database. + + * Default: False + +All of these runtime directives can be modified via `harpctl`. Consider if you +want to decrease the `lease_refresh_interval` to 100ms on `node1`: + +```bash +harpctl set node node1 lease_refresh_interval=100 +``` + +### Proxy directives + +You can change certain settings to the proxy while the service is active. These +items are retained in the DCS after initial bootstrap, and thus you can modify them +without altering a configuration file. Many of these settings are direct +mappings to their PgBouncer equivalent, and we will note these where relevant. + +Set these settings under a `proxies` YAML heading during bootstrap, or +modify them with a `harpctl set proxy` command. +Properties set by `harpctl set proxy` require a restart of the proxy. + +- **`auth_file`**: The full path to a PgBouncer-style `userlist.txt` file. + HARP Proxy uses this file to store a `pgbouncer` user that has + access to PgBouncer's Admin database. You can use this for other users + as well. Proxy modifies this file to add and modify the password for the + `pgbouncer` user. + + * Default: `/etc/harp/userlist.txt` + +- **`auth_type`**: The type of Postgres authentication to use for password + matching. This is actually a PgBouncer setting and isn't fully compatible + with the Postgres `pg_hba.conf` capabilities. We recommend using `md5`, `pam` + `cert`, or `scram-sha-256`. + + * Default: `md5` + +- **`auth_query`**: Query to verify a user’s password with Postgres. + Direct access to `pg_shadow` requires admin rights. It’s better to use a + non-superuser that calls a `SECURITY DEFINER` function instead. 
If using + TPAexec to create a cluster, a function named `pgbouncer_get_auth` is + installed on all databases in the `pg_catalog` namespace to fulfill this + purpose. + +- **`auth_user`**: If `auth_user` is set, then any user not specified in + `auth_file` is queried through the `auth_query` query from `pg_shadow` + in the database, using `auth_user`. The password of `auth_user` is + taken from `auth_file`. + +- **`client_tls_ca_file`**: Root certificate file to validate client + certificates. Requires `client_tls_sslmode` to be set. + +- **`client_tls_cert_file`**: Certificate for private key. Clients can + validate it. Requires `client_tls_sslmode` to be set. + +- **`client_tls_key_file`**: Private key for PgBouncer to accept client + connections. Requires `client_tls_sslmode` to be set. + +- **`client_tls_protocols`**: TLS protocol versions allowed for + client connections. + Allowed values: `tlsv1.0`, `tlsv1.1`, `tlsv1.2`, `tlsv1.3`. + Shortcuts: `all` (tlsv1.0,tlsv1.1,tlsv1.2,tlsv1.3), + `secure` (tlsv1.2,tlsv1.3), `legacy` (all). + + * Default: `secure` + +- **`client_tls_sslmode`**: Whether to enable client SSL functionality. + Possible values are `disable`, `allow`, `prefer`, `require`, `verify-ca`, and `verify-full`. + + * Default: `disable` + +- **`database_name`**: Required name that represents the database clients + use when connecting to HARP Proxy. This is a stable endpoint that doesn't + change and points to the current node, database name, port, etc., + necessary to connect to the lead master. You can use the global value `*` + here so all connections get directed to this target regardless of database + name. + +- **`default_pool_size`**: The maximum number of active connections to allow + per database/user combination. This is for connection pooling purposes + but does nothing in session pooling mode. This is a PgBouncer setting. 
+ + * Default: 25 + +- **`ignore_startup_parameters`**: By default, PgBouncer allows only + parameters it can keep track of in startup packets: `client_encoding`, + `datestyle`, `timezone`, and `standard_conforming_strings`. All other + parameters raise an error. To allow other parameters, you can specify them here so that PgBouncer knows that they are handled by the admin + and it can ignore them. Often, you need to set this to + `extra_float_digits` for Java applications to function properly. + + * Default: `extra_float_digits` + +- **`listen_address`**: IP addresses where Proxy should listen for + connections. Used by pgbouncer and builtin proxy. + + * Default: 0.0.0.0 + +- **`listen_port`**: System port where Proxy listens for connections. + Used by pgbouncer and builtin proxy. + + * Default: 6432 + +- **`max_client_conn`**: The total maximum number of active client + connections that are allowed on the proxy. This can be many orders of + magnitude greater than `default_pool_size`, as these are all connections that + have yet to be assigned a session or have released a session for use by + another client connection. This is a PgBouncer setting. + + * Default: 100 + +- **`monitor_interval`**: Time in seconds between Proxy checks of PgBouncer. + Since HARP Proxy manages PgBouncer as the actual connection management + layer, it needs to periodically check various status and stats to verify + it's still operational. You can also log or register some of this information to the DCS. + + * Default: 5 + +- **`server_tls_protocols`**: TLS protocol versions are allowed for + server connections. + Allowed values: `tlsv1.0`, `tlsv1.1`, `tlsv1.2`, `tlsv1.3`. + Shortcuts: `all` (tlsv1.0,tlsv1.1,tlsv1.2,tlsv1.3), + `secure` (tlsv1.2,tlsv1.3), `legacy` (all). + + * Default: `secure` + +- **`server_tls_sslmode`**: Whether to enable server SSL functionality. + Possible values are `disable`, `allow`, `prefer`, `require`, `verify-ca`, and `verify-full`. 
+
+  * Default: `disable`
+
+- **`session_transfer_mode`**: Method by which to transfer sessions.
+  Possible values are `fast`, `wait`, and `reconnect`.
+
+  * Default: `wait`
+
+  This property isn't used by the builtin proxy.
+
+- **`server_transfer_timeout`**: The number of seconds Harp Proxy waits before giving up on a PAUSE and issuing a KILL command.
+
+  * Default: 30
+
+The following two options apply only when using the built-in proxy.
+
+- **`keepalive`**: The number of seconds the built-in proxy waits before sending a keepalive message to an idle leader connection.
+
+  * Default: 5
+
+
+- **`timeout`**: The number of seconds the built-in proxy waits before giving up on connecting to the leader.
+
+  * Default: 1
+
+When using `harpctl` to change any of these settings for all proxies, use the
+`global` keyword in place of the proxy name. For example:
+
+```bash
+harpctl set proxy global max_client_conn=1000
+```
diff --git a/product_docs/docs/harp/2/05_bootstrapping.mdx b/product_docs/docs/harp/2/05_bootstrapping.mdx
new file mode 100644
index 00000000000..55d78e8dac4
--- /dev/null
+++ b/product_docs/docs/harp/2/05_bootstrapping.mdx
@@ -0,0 +1,194 @@
+---
+navTitle: Bootstrapping
+title: Cluster bootstrapping
+---
+
+To use HARP, a minimum amount of metadata must exist in the DCS. The
+process of "bootstrapping" a cluster essentially means initializing node,
+location, and other runtime configuration either all at once or on a
+per-resource basis.
+
+This process is governed through the `harpctl apply` command. For more
+information, see [harpctl command-line tool](08_harpctl).
+
+Set up the DCS and make sure it is functional before bootstrapping.
+
+!!! Important
+    You can combine any or all of
+    these examples into a single YAML document and apply it all at once.
+
+## Cluster-wide bootstrapping
+
+Some settings are applied cluster-wide and you can specify them during
+bootstrapping.
Currently this applies only to the `event_sync_interval` +runtime directive, but others might be added later. + +The format is as follows: + +```yaml +cluster: + name: mycluster + event_sync_interval: 100 +``` + +Assuming that file was named `cluster.yml`, you then apply it with the +following: + +```bash +harpctl apply cluster.yml +``` + +If the cluster name isn't already defined in the DCS, this also +initializes that value. + +!!! Important + The cluster name parameter specified here always overrides the cluster + name supplied in `config.yml`. The assumption is that the bootstrap file + supplies all necessary elements to bootstrap a cluster or some portion of + its larger configuration. A `config.yml` file is primarily meant to control + the execution of HARP Manager, HARP Proxy, or `harpctl` specifically. + +## Location bootstrapping + +Every HARP node is associated with at most one location. This location can be +a single data center, a grouped region consisting of multiple underlying +servers, an Amazon availability zone, and so on. This is a logical +structure that allows HARP to group nodes together such that only one +represents the nodes in that location as the lead master. + +Thus it is necessary to initialize one or more locations. The format for this +is as follows: + +```yaml +cluster: + name: mycluster + +locations: + - location: dc1 + - location: dc2 +``` + +Assuming that file was named `locations.yml`, you then apply it with the +following: + +```bash +harpctl apply locations.yml +``` + +When performing any manipulation of the cluster, include the name as a preamble so the changes are directed to the right place. 
+ +Once locations are bootstrapped, they show up with a quick examination: + +```bash +> harpctl get locations + +Cluster Location Leader Previous Leader Target Leader Lease Renewals +------- -------- ------ --------------- ------------- -------------- +mycluster dc1 +mycluster dc2 +``` + +Both locations are recognized by HARP and available for +node and proxy assignment. + +## Node bootstrapping + +HARP nodes exist in a named cluster and must have a designated name. +Beyond this, all other settings are retained in the DCS, as they are +dynamic and can affect how HARP interacts with them. To this end, bootstrap each node +using one or more of the runtime directives discussed +in [Configuration](04_configuration). + +While bootstrapping a node, there are a few required fields: + +* `name` +* `location` +* `dsn` +* `pg_data_dir` + +Everything else is optional and can depend on the cluster. Because you can bootstrap multiple nodes at once, the format generally fits +this structure: + +```yaml +cluster: + name: mycluster + +nodes: + - name: node1 + location: dc1 + dsn: host=node1 dbname=bdrdb user=postgres + pg_data_dir: /db/pgdata + leader_lease_duration: 10 + priority: 500 +``` + +Assuming that file was named `node1.yml`, you then apply it with the +following: + +```bash +harpctl apply node1.yml +``` + +Once nodes are bootstrapped, they show up with a quick examination: + +```bash +> harpctl get nodes + +Cluster Name Location Ready Fenced Allow Routing Routing Status Role Type Lock Duration +------- ---- -------- ----- ------ ------------- -------------- ---- ---- ------------- +mycluster bdra1 dc1 true false true ok primary bdr 30 +``` + +## Proxy bootstrapping + +Unlike locations or nodes, proxies can also supply a configuration template +that is applied to all proxies in a location. These are stored in the DCS +under the `global` designation. 
Each proxy also requires a name to exist as +an instance, but no further customization is needed unless some setting +needs a specific override. + +This is because there are likely to be multiple proxies that have the same +default configuration settings for the cluster, and repeating these values for +every single proxy isn't necessary. + +Additionally, when bootstrapping the proxy template, define at least one database for connection assignments. With these notes in mind, the +format for this is as follows: + +```yaml +cluster: + name: mycluster + +proxies: + monitor_interval: 5 + default_pool_size: 20 + max_client_conn: 1000 + database_name: bdrdb + instances: + - name: proxy1 + - name: proxy2 + default_pool_size: 50 +``` + +This configures HARP for two proxies: `proxy1` and `proxy2`. Only +`proxy2` has a custom `default_pool_size`, while using the global +settings otherwise. + +Assuming that file was named `proxy.yml`, you then apply it with the +following: + +```bash +harpctl apply proxy.yml +``` + +Once the proxy template is bootstrapped, it shows up with a quick +examination: + +```bash +> harpctl get proxies + +Cluster Name Pool Mode Auth Type Max Client Conn Default Pool Size +------- ---- --------- --------- --------------- ----------------- +mycluster global session md5 1000 20 +mycluster proxy1 session md5 1000 20 +mycluster proxy2 session md5 1000 50 +``` diff --git a/product_docs/docs/harp/2/06_harp_manager.mdx b/product_docs/docs/harp/2/06_harp_manager.mdx new file mode 100644 index 00000000000..f13e87c24ec --- /dev/null +++ b/product_docs/docs/harp/2/06_harp_manager.mdx @@ -0,0 +1,126 @@ +--- +navTitle: HARP Manager +title: HARP Manager +--- + +HARP Manager is a daemon that interacts with the local PostgreSQL/BDR node +and stores information about its state in the consensus layer. Manager +determines the node that currently holds leader status for a respective location +and enforces configuration (lag, CAMO lag, etc.) 
constraints to prevent +ineligible nodes from leader consideration. + +Every Postgres node in the cluster must have an associated HARP Manager. +Other nodes can exist, but they can't to participate as lead or +shadow master roles or any other functionality that requires a HARP Manager. + +!!! Important + HARP Manager expects the be used to start and stop the database. Stopping HARP Manager + will stop the database. Starting HARP Manager will start the database if it + isn't already started. If another method is used to stop the database then + HARP Manager will try and restart it. + +## How it works + +Upon starting, HARP Manager uses `pg_ctl` to start Postgres if it isn't +already running. After this, it periodically checks the server as defined +by the `node.lease_refresh_interval` setting. HARP Manager collects various +bits of data about Postgres including: + +* The node's current LSN. +* If Postgres is running and accepting connections. This particular data point + is considered a lease that must be periodically renewed. If it expires, HARP + Proxy removes the node from any existing routing. +* The current apply LSN position for all upstream BDR peer nodes. +* If CAMO is enabled: + - Name of the CAMO partner + - Peer CAMO state (`is_ready`) + - CAMO queue received and applied LSN positions +* Node type, such as whether the node is BDR or regular Postgres. +* The node's current role, such as a read/write, physical streaming replica, + logical standby, and so on. +* BDR node state, which is `ACTIVE` except in limited cases. +* BDR Node ID for other metadata gathering. +* Other tracking values. + +!!! Important + When naming BDR nodes in HARP, the BDR node name must match the node + name represented in the `node.name` configuration attribute. This occurs in the bootstrap process. 
+ +The data collected here is fully available to other HARP Manager processes and +is used to evaluate lag, partner readiness, and other criteria that direct +switchover and failover behavior. + +After updating the node metadata, HARP Manager either refreshes the lead +master lease if it's already held by the local node or seeks to obtain the +lease if it's expired. Since the current state of all nodes is known to all +other nodes, the node that was the previous lead master is given automatic +priority ranking if present. If not, all other nodes list themselves by +LSN lag, node priority, and other criteria, and the most qualified node seizes the lead master lease. + +This procedure happens for every defined location where nodes are present. Thus +for locations DC1 and DC2, there is a lead master node in each, with a +separate lease and election process for both. + +HARP Manager repeats these Postgres status checks, lease renewals, and +elections repeatedly to ensure the cluster always has a lead master target for +connections from HARP Proxy. + +## Configuration + +HARP Manager expects the `dcs`, `cluster`, and `manager` configuration stanzas. +The following is a functional example: + +```yaml +cluster: + name: mycluster + +dcs: + driver: etcd + endpoints: + - host1:2379 + - host2:2379 + - host3:2379 + +manager: + name: node1 + postgres_bin_dir: /usr/lib/postgresql/13/bin +``` + +You can apply changes to the configuration file (default: `/etc/harp/config.yml`) by issuing `SIGHUP` to the running instance or by calling a +service-level reload. + +See [Configuration](04_configuration) for further details. + +## Usage + +This is the basic usage for HARP Manager: + +```bash +Usage of ./harp-manager: + -f string + Optional path to config file (shorthand) + --config string + Optional path to config file +``` + +There are no arguments to launch `harp-manager` as a forked daemon. 
+This software is designed to be launched through systemd or in a container +as a top-level process. This also means output is directed to STDOUT and STDERR +for capture and access through journald or an attached container terminal. + +## Disabling and reenabling HARP Manager control of Postgres + +You can temporarily pause HARP Manager control of Postgres. This +results in a state where the daemon continues running but doesn't perform any +operations that can affect existing behavior of the cluster. Reenabling +management causes it to resume operation. + +An example of temporarily disabling node management is: + +```bash +harpctl unmanage node node1 +``` + +See [harpctl command-line tool](08_harpctl) for more details. + +Node management by HARP Manager is enabled by default. diff --git a/product_docs/docs/harp/2/07_harp_proxy.mdx b/product_docs/docs/harp/2/07_harp_proxy.mdx new file mode 100644 index 00000000000..bb20a18ad54 --- /dev/null +++ b/product_docs/docs/harp/2/07_harp_proxy.mdx @@ -0,0 +1,202 @@ +--- +navTitle: HARP Proxy +title: HARP Proxy +--- + +HARP Proxy is a daemon that acts as an abstraction layer between the client +application and Postgres. It interfaces with the consensus layer to obtain the +identity of the current lead master node and directs traffic to that location. +During a planned switchover or unplanned failover, it +redirects to the new lead master node as dictated by the DCS. + +You can select between `pgbouncer` or `builtin` for HARP Proxy. If you don't specify +a proxy type, the default is `builtin`. When using `pgbouncer`, HARP Proxy is +an interface layer between the DCS and PgBouncer. As such, PgBouncer is a +prerequisite and must also be installed for HARP Proxy to +fully manage its activity. + +The builtin proxy doesn't require any additional software. When using builtin, +HARP Proxy functions as a level 4 pass-through proxy. 
+ +## Builtin proxy: how it works + +Upon starting, HARP Proxy listens for incoming connections on the listening +address and listening port specified in the bootstrap file per proxy instance. +All application client traffic then passes through builtin proxy into the +current lead master node for the location where this proxy is operating. + +If the lead master lease isn't set, HARP Proxy disconnects all +connection traffic until a new lead master is established. This also applies +to circumstances when `harpctl promote` is used to invoke a planned transition +to a new lead master. The disconnect is immediate. + +### Configuration + +Choose the built-in proxy by setting the proxy type to `builtin`. The only +other option that applies to the built-in proxy is `max_client_conn`, which +specifies the maximum allowed client connections. If `max_client_conn` is +higher than what the system can handle, it is lowered to a setting +that's within the capability of the system that the proxy is on. + +## PgBouncer: how it works + +!!! Note + If you need more configurability of pgbouncer than what Harp Proxy provides, + the recommended setup is to use builtin proxy and have pgbouncer point to it. + +Upon starting, HARP Proxy launches PgBouncer if it's not already running +and leaves client connections paused. After, it contacts the +DCS to determine the identity of the lead master, configure PgBouncer to use +this as the target for database connections, and resume connection activity. +All application client traffic then passes through PgBouncer into the +current lead master node for the location where this proxy is operating. + +While PgBouncer is running, HARP Proxy checks its status based on the +`monitor_interval` configuration setting in the DCS and stores it in the +DCS for monitoring purposes. This configuration allows interrogation with `harpctl` to +retrieve status of all configured proxies or any one proxy. 
+ +If the lead master lease isn't set, HARP Proxy pauses all +connection traffic until a new lead master is established. This also applies +to circumstances when `harpctl promote` is used to invoke a planned transition +to a new lead master. It uses a PgBouncer `PAUSE` command for this, so existing +sessions are allowed to complete any pending transactions before they're held +in stasis. + +### PgBouncer configuration file + +When HARP Proxy uses PgBouncer for connection management and redirection, +a `pgbouncer.ini` file must exist. HARP Manager builds this file based on various +runtime directives as defined in [Proxy directives](04_configuration). + +This file is located in the same folder as the `config.yml` used by HARP +Proxy. Any PgBouncer process launched by HARP Proxy uses this configuration +file, and you can use it for debugging or information purposes. Modifications +to this automatically generated `pgbouncer.ini` file are lost any time +HARP Proxy restarts, so use `harpctl set proxy` to alter these settings +instead. Calling `harpctl set proxy` doesn't update the `pgbouncer.ini` file until the proxy restarts. + +### Disabling and reenabling HARP Proxy node management + +You can temporarily pause HARP Proxy control of PgBouncer. This +results in a state where the daemon continues running but doesn't perform any +operations that can affect existing behavior of the cluster. Reenabling +management causes it to resume operation. + +An example of temporarily disabling management of a specific proxy is: + +```bash +harpctl unmanage proxy proxy1 +``` + +See [harpctl command-line tool](08_harpctl) for more details. + +Proxy node management is enabled by default. + +### Passthrough user authentication + +With pgbouncer, we strongly recommend configuring HARP Proxy to use the `auth_user` and +`auth_query` runtime directives. 
If these aren't set, the PgBouncer +`userlist.txt` file must include username and password hash combinations for +every user PgBouncer needs to authenticate on behalf of Postgres. + +Do *not* use the `pgbouncer` user, as this this is used by HARP +Proxy as an admin-level user to operate the underlying PgBouncer +service. + +In clusters administered by TPAexec, a function is created and installed +in the `pg_catalog` schema in the `template1` database during provisioning. +This means any databases created later also include the function, +and it's available to PgBouncer regardless of the database the user is +attempting to contact. + +If TPAexec isn't used, we still recommend this function definition: + +```sql +CREATE OR REPLACE FUNCTION pg_catalog.pgbouncer_get_auth(p_usename TEXT) +RETURNS TABLE(username TEXT, password TEXT) AS $$ +BEGIN + RETURN QUERY + SELECT usename::TEXT, passwd::TEXT FROM pg_catalog.pg_shadow + WHERE usename = p_usename; +END; +$$ LANGUAGE plpgsql SECURITY DEFINER + +REVOKE ALL ON FUNCTION pg_catalog.pgbouncer_get_auth(p_usename TEXT) + FROM PUBLIC + +GRANT EXECUTE ON FUNCTION pg_catalog.pgbouncer_get_auth(p_usename TEXT) + TO ; +``` + +Substitute `` for the `auth_user` field supplied to +HARP Proxy. + +Then in the Bootstrap file, the following completes the configuration: + +```yaml +cluster: + name: mycluster + +proxies: + monitor_interval: 5 + default_pool_size: 20 + max_client_conn: 1000 + auth_user: pgb_auth + type: pgbouncer + auth_query: "SELECT * FROM pg_catalog.pgbouncer_get_auth($1)" + database_name: bdrdb + instances: + - name: proxy1 + - name: proxy2 +``` + +You can also define these fields with `harpctl set proxy`: + +```bash +harpctl set proxy global auth_user=pgb_auth +``` + +!!! Note + This means the `postgres` or `enterprisedb` OS user that launches HARP + Proxy needs a `.pgpass` file so that `auth_user` can authenticate + against Postgres. 
+ +### Configuration + +HARP Proxy expects the `dcs`, `cluster`, and `proxy` configuration stanzas. The +following is a functional example: + +```yaml +cluster: + name: mycluster + +dcs: + driver: etcd + endpoints: + - host1:2379 + - host2:2379 + - host3:2379 + +proxy: + name: proxy1 +``` +Each proxy connects to the DCS to retrieve the hosts and ports to listen on for connections. + +### Usage + +This is the basic usage for HARP Proxy: + +```bash +Usage of ./harp-proxy: + -f string + Optional path to config file (shorthand) + --config string + Optional path to config file +``` + +There are no arguments to launch `harp-proxy` as a forked daemon. +This software is designed to be launched through systemd or in a container +as a top-level process. This also means output is directed to STDOUT and STDERR +for capture and access through journald or an attached container terminal. + diff --git a/product_docs/docs/harp/2.0/08_harpctl.mdx b/product_docs/docs/harp/2/08_harpctl.mdx similarity index 59% rename from product_docs/docs/harp/2.0/08_harpctl.mdx rename to product_docs/docs/harp/2/08_harpctl.mdx index e89d20353d9..3eb2aec25d2 100644 --- a/product_docs/docs/harp/2.0/08_harpctl.mdx +++ b/product_docs/docs/harp/2/08_harpctl.mdx @@ -1,11 +1,11 @@ --- navTitle: harpctl -title: harpctl Command-line Tool +title: harpctl command-line tool --- -`harpctl` is a command-line tool for directly manipulating the Consensus Layer -contents to fit desired cluster geometry. It can be used to e.g. examine node -status, "promote" a node to Lead Master, disable/enable cluster management, +`harpctl` is a command-line tool for directly manipulating the consensus layer +contents to fit desired cluster geometry. You can use it to, for example, examine node +status, "promote" a node to lead master, disable/enable cluster management, bootstrap cluster settings, and so on. ## Synopsis @@ -41,12 +41,12 @@ Use "harpctl [command] --help" for more information about a command. 
``` In addition to this basic synopsis, each of the available commands has its own -series of allowable sub-commands and flags. +series of allowed subcommands and flags. ## Configuration -It's important to be aware that `harpctl` must interact with the Consensus -Layer to operate. This means a certain minimum amount of settings should be +`harpctl` must interact with the consensus +layer to operate. This means a certain minimum amount of settings must be defined in `config.yml` for successful execution. This includes: * `dcs.driver` @@ -76,19 +76,19 @@ Execute `harpctl` like this: harpctl command [flags] ``` -Each command has its own series of sub-commands and flags. Further help for -these are available by executing the command this way: +Each command has its own series of subcommands and flags. Further help for +these are available by executing this command: ```bash -harpctl command --help +harpctl --help ``` ## `harpctl apply` -It is necessary to use an `apply` command to "bootstrap" a HARP cluster using a -file which defines various attributes of the intended cluster. +You must use an `apply` command to "bootstrap" a HARP cluster using a +file that defines various attributes of the intended cluster. -An `apply` command should be executed like this: +Execute an `apply` command like this: ```bash harpctl apply @@ -119,50 +119,50 @@ As seen here, it is good practice to always include a cluster name preamble to ensure all changes target the correct HARP cluster, in case several are operating in the same environment. -Once `apply` completes without error, the node will be integrated with the rest +Once `apply` completes without error, the node is integrated with the rest of the cluster. !!! Note - This command can also be used to bootstrap the entire cluster at once since - all defined sections are applied at the same time. 
However, we do not + You can also use this command to bootstrap the entire cluster at once since + all defined sections are applied at the same time. However, we don't encourage this use for anything but testing as it increases the difficulty of validating each portion of the cluster during initial definition. ## `harpctl fence` -Marks the local or specified node as **fenced**. A node with this status is -essentially completely excluded from the cluster. HARP Proxy will not send it -traffic, its representative HARP Manager will not claim the Lead Master lease, -and further steps are also taken. If running, HARP Manager will stop Postgres +Marks the local or specified node as fenced. A node with this status is +essentially completely excluded from the cluster. HARP Proxy doesn't send it +traffic, its representative HARP Manager doesn't claim the lead master lease, +and further steps are also taken. If running, HARP Manager stops Postgres on the node as well. -A `fence` command should be executed like this: +Execute a `fence` command like this: ```bash harpctl fence () ``` -The node-name itself is optional; if ommitted, `harpctl` will use the name of +The node-name is optional; if omitted, `harpctl` uses the name of the locally configured node. ## `harpctl get` -Fetches information stored in the Consensus Layer for various elements of the -cluster. This includes nodes, locations, the cluster itself, and so on. The +Fetches information stored in the consensus layer for various elements of the +cluster. This includes nodes, locations, the cluster, and so on. 
The full list includes: -* `cluster` - Returns the Cluster state -* `leader` - Returns the current or specified location leader -* `location` - Returns current or specified location information -* `locations` - Returns list of all locations -* `node` - Returns the specified Postgres node -* `nodes` - Returns list of all Postgres nodes -* `proxy` - Returns current or specified proxy information -* `proxies` - Returns list of all Proxy nodes +* `cluster` — Returns the cluster state. +* `leader` — Returns the current or specified location leader. +* `location` — Returns current or specified location information. +* `locations` — Returns list of all locations. +* `node` — Returns the specified Postgres node. +* `nodes` — Returns list of all Postgres nodes. +* `proxy` — Returns current or specified proxy information. +* `proxies` — Returns list of all proxy nodes. ### `harpctl get cluster` -Fetches information stored in the Consensus Layer for the current cluster: +Fetches information stored in the consensus layer for the current cluster: ```bash > harpctl get cluster @@ -174,9 +174,9 @@ mycluster true ### `harpctl get leader` -Fetches node information for the current Lead Master stored in the DCS for the -specified location. If no location is passed, `harpctl` will attempt to -derive it based on the location of the current Node where it was executed. +Fetches node information for the current lead master stored in the DCS for the +specified location. If no location is passed, `harpctl` attempts to +derive it based on the location of the current node where it was executed. Example: @@ -191,8 +191,8 @@ mycluster mynode true primary bdr dc1 false 30 ### `harpctl get location` Fetches location information for the specified location. If no location is -passed, `harpctl` will attempt to derive it based on the location of the -current Node where it was executed. +passed, `harpctl` attempts to derive it based on the location of the +current node where it was executed. 
Example: @@ -267,7 +267,7 @@ mycluster proxy1 session md5 1000 20 ### `harpctl get proxies` Fetches proxy information stored in the DCS for all proxies in the cluster. -Additionally will list the `global` pseudo-proxy for default proxy settings. +Additionally, lists the `global` pseudo-proxy for default proxy settings. Example: @@ -283,80 +283,79 @@ mycluster proxy2 session md5 1500 30 ## `harpctl manage` -In the event a cluster is not in a managed state, instructs all HARP Manager -services to resume monitoring Postgres and updating the Consensus Layer. This -should be done after maintenance is complete following HARP software updates -or other significant changes that could affect the whole cluster. +If a cluster isn't in a managed state, instructs all HARP Manager +services to resume monitoring Postgres and updating the consensus layer. Do this +after maintenance is complete following HARP software updates +or other significant changes that might affect the whole cluster. -A `manage` command should be executed like this: +Execute a `manage` command like this: ```bash harpctl manage cluster ``` !!! Note - Currently it is only possible to enable or disable cluster management at + Currently you can enable or disable cluster management only at the `cluster` level. Later versions will also make it possible to do this for individual nodes or proxies. ## `harpctl promote` -Promotes the next available node that meets leadership requirements to Lead -Master in the current Location. Since this is a requested event, it invokes a +Promotes the next available node that meets leadership requirements to lead +master in the current Location. Since this is a requested event, it invokes a smooth handover where: -1. The existing Lead Master will release the Lead Master lease, provided: +1. 
The existing lead master releases the lead master lease, provided: - If CAMO is enabled, the promoted node must be up to date and CAMO ready, and the CAMO queue must have less than `node.maximum_camo_lag` bytes remaining to be applied. - - Replication lag between the old Lead Master and the promoted node is - less than `node.maximum_lag` -2. The promoted node will be the only valid candidate to take the Lead Master - lease, and will do so as soon as it is released by the current holder. All - other nodes will ignore the unset Lead Master lease. - - If CAMO is enabled, the promoted node will temporarily disable client + - Replication lag between the old lead master and the promoted node is + less than `node.maximum_lag`. +2. The promoted node is the only valid candidate to take the lead master + lease and does so as soon as it is released by the current holder. All + other nodes ignore the unset lead master lease. + - If CAMO is enabled, the promoted node temporarily disables client traffic until the CAMO queue is fully applied, even though it holds the - Lead Master lease. + lead master lease. 3. HARP Proxy, if using pgbouncer, will `PAUSE` connections to allow ongoing - transactions to complete. Once the Lead Master lease is claimed by the promoted node, it - will reconfigure PgBouncer for the new connection target and resume database - traffic. If HARP Proxy is using the builtin proxy it will terminate existing - connections and create new connections to the Lead Master as new connections are + transactions to complete. Once the lead master lease is claimed by the promoted node, it + reconfigures PgBouncer for the new connection target and resumes database + traffic. If HARP Proxy is using the builtin proxy, it terminates existing + connections and creates new connections to the lead master as new connections are requested from the client. 
-A `promote` command should be executed like this: +Execute a `promote` command like this: ```bash harpctl promote () ``` -The `--force` option can be provided to forcibly set a node to Lead Master, -even if it does not meet the criteria for becoming lead master. This will -circumvent any verification of CAMO status or replication lag and cause an +Provide the `--force` option to forcibly set a node to lead master, +even if it doesn't meet the criteria for becoming lead master. This +circumvents any verification of CAMO status or replication lag and causes an immediate transition to the promoted node. This is the only way to specify an exact node for promotion. -Note that the node must be online and operational for this to succeed. This -option should be used with care. +The node must be online and operational for this to succeed. Use this +option with care. ## `harpctl set` Sets a specific attribute in the cluster to the supplied value. This is used to tweak configuration settings for a specific node, proxy, location, or the -cluster itself rather than using `apply`. This can be used for the following +cluster rather than using `apply`. You can use this for the following object types: -* `cluster` - Sets cluster-related attributes. -* `location` - Sets specific location attributes. -* `node` - Sets specific node attributes. -* `proxy` - Sets specific proxy attributes. +* `cluster` — Sets cluster-related attributes. +* `location` — Sets specific location attributes. +* `node` — Sets specific node attributes. +* `proxy` — Sets specific proxy attributes. ### `harpctl set cluster` -Sets cluster-related attributes only. There's only one of these at the moment, -but future versions of HARP may add more. +Sets cluster-related attributes only. Example: @@ -366,9 +365,8 @@ harpctl set cluster event_sync_interval=200 ### `harpctl set node` -Sets node-related attributes for the named node. 
Any options mentioned in the -"Node Directives" section of the [Configuration](04_configuration) -documentation are valid here. +Sets node-related attributes for the named node. Any options mentioned in +[Node directives](04_configuration#node_directives) are valid here. Example: @@ -378,9 +376,8 @@ harpctl set node mynode priority=500 ### `harpctl set proxy` -Sets proxy-related attributes for the named proxy. Any options mentioned in the -"Proxy Directives" section of the [Configuration](04_configuration) -documentation are valid here. +Sets proxy-related attributes for the named proxy. Any options mentioned in the [Proxy directives](04_configuration#proxy_directives) +are valid here. Properties set this way require a restart of the proxy before the new value takes effect. Example: @@ -397,37 +394,36 @@ harpctl set proxy global default_pool_size=10 ## `harpctl unfence` -Removes the **fenced** attribute from the local or specified node. This will -remove all previously applied cluster exclusions from the node so that it can -again receive traffic or hold the Lead Master lease. Postgres will also be -started if it is not running. +Removes the `fenced` attribute from the local or specified node. This +removes all previously applied cluster exclusions from the node so that it can +again receive traffic or hold the lead master lease. Postgres is also started if it isn't running. -An `unfence` command should be executed like this: +Execute an `unfence` command like this: ```bash harpctl unfence () ``` -The node-name itself is optional; if ommitted, `harpctl` will use the name of +The node-name is optional. If you omit it, `harpctl` uses the name of the locally configured node. ## `harpctl unmanage` Instructs all HARP Manager services in the cluster to remain running but no -longer actively monitor Postgres, or modify the contents of the Consensus -Layer. This will mean that any ordinary failover event such as a node outage -will not result in a leadership migration. 
This is intended for system or HARP -maintenance, and should be done prior to making changes to HARP software or -other significant modifications to the cluster. +longer actively monitoring Postgres, or modify the contents of the consensus +layer. This means that any ordinary failover event such as a node outage +doesn't result in a leadership migration. This is intended for system or HARP +maintenance prior to making changes to HARP software or +other significant changes to the cluster. -An `unmanage` command should be executed like this: +Execute an `unmanage` command like this: ```bash harpctl unmanage cluster ``` !!! Note - Currently it is only possible to enable or disable cluster management at + Currently you can enable or disable cluster management at only the `cluster` level. Later versions will also make it possible to do this for individual nodes or proxies. diff --git a/product_docs/docs/harp/2/09_consensus-layer.mdx b/product_docs/docs/harp/2/09_consensus-layer.mdx new file mode 100644 index 00000000000..953b754519a --- /dev/null +++ b/product_docs/docs/harp/2/09_consensus-layer.mdx @@ -0,0 +1,142 @@ +--- +navTitle: Consensus layer +title: Consensus layer considerations +--- + +HARP is designed so that it can work with different implementations of +consensus layer, also known as Distributed Control Systems (DCS). + +Currently the following DCS implementations are supported: + + - etcd + - BDR + +This information is specific to HARP's interaction with the +supported DCS implementations. + +## BDR driver compatibility + +The `bdr` native consensus layer is available from BDR versions +[3.6.21](/bdr/latest/release-notes/#bdr-3621) +and [3.7.3](/bdr/latest/release-notes/#bdr-373). + +For the purpose of maintaining a voting quorum, BDR Logical Standby +nodes don't participate in consensus communications in a BDR cluster. Don't count these in the total node list to fulfill DCS quorum requirements. 
+ +## Maintaining quorum + +Clusters of any architecture require at least n/2 + 1 nodes to maintain +consensus via a voting quorum. Thus a three-node cluster can tolerate the outage of +a single node, a five-node cluster can tolerate a two-node outage, and so on. If +consensus is ever lost, HARP becomes inoperable because the DCS prevents it +from deterministically identifying the node that is the lead master in a +particular location. + +As a result, whichever DCS is chosen, more than half of the nodes must always +be available _cluster-wide_. This can become a non-trivial element when +distributing DCS nodes among two or more data centers. A network partition +prevents quorum in any location that can't maintain a voting majority, and thus +HARP stops working. + +Thus an odd-number of nodes (with a minimum of three) is crucial when building the +consensus layer. An ideal case distributes nodes across a minimum of +three independent locations to prevent a single network partition from +disrupting consensus. + +One example configuration is to designate two DCS nodes in two data centers +coinciding with the primary BDR nodes, and a fifth DCS node (such as a BDR +witness) elsewhere. Using such a design, a network partition between the two +BDR data centers doesn't disrupt consensus thanks to the independently +located node. + +### Multi-consensus variant + +HARP assumes one lead master per configured location. Normally each +location is specified in HARP using the `location` configuration setting. +By creating a separate DCS cluster per location, you can emulate +this behavior independently of HARP. + +To accomplish this, configure HARP in `config.yml` to use a different +DCS connection target per desired Location. 
+ +HARP nodes in DC-A use something like this: + +```yaml +location: dca +dcs: + driver: etcd + endpoints: + - dcs-a1:2379 + - dcs-a2:2379 + - dcs-a3:2379 +``` + +While DC-B uses different hostnames corresponding to nodes in its +canonical location: + +```yaml +location: dcb +dcs: + driver: etcd + endpoints: + - dcs-a1:2379 + - dcs-a2:2379 + - dcs-a3:2379 +``` + +There's no DCS communication between different data centers in this design, +and thus a network partition between them doesn't affect HARP operation. A +consequence of this is that HARP is completely unaware of nodes in the other +location, and each location operates essentially as a separate HARP cluster. + +This isn't possible when using BDR as the DCS, as BDR maintains a consensus +layer across all participant nodes. + +A possible drawback to this approach is that `harpctl` can't interact +with nodes outside of the current location. It's impossible to obtain +node information, get or set the lead master, or perform any other operation that +targets the other location. Essentially this organization renders the +`--location` parameter to `harpctl` unusable. + +### TPAexec and consensus + +These considerations are integrated into TPAexec as well. When deploying a +cluster using etcd, it constructs a separate DCS cluster per +location to facilitate high availability in favor of strict consistency. + +Thus this configuration example groups any DCS nodes assigned to the `first` location together, and the +`second` location is a separate cluster: + +```yaml +cluster_vars: + failover_manager: harp + harp_consensus_protocol: etcd + +locations: + - Name: first + - Name: second +``` + +To override this behavior, +configure the `harp_location` implicitly to force a particular grouping. 
Thus this example combines all etcd nodes into
As an example, consider a VPN subnet where all database hosts are located somewhere in `10.10.*`. In such a case, the easiest approach is to add a specific line: @@ -31,12 +31,12 @@ hostssl all harp_user 10.10.1.1/16 scram-sha-256 !!! Note In this case we've used the more modern `scram-sha-256` authentication - rather than `md5` which is now deprecated. We've also elected to require + rather than `md5`, which is now deprecated. We've also elected to require SSL authentication by specifying `hostssl`. -## BDR Permissions +## BDR permissions -BDR nodes have metadata and views that are only visible when certain roles are +BDR nodes have metadata and views that are visible only when certain roles are granted to the HARP-enabled user. In this case, the HARP user requires the following: @@ -45,23 +45,23 @@ GRANT bdr_monitor TO harp_user; ``` The `bdr_monitor` BDR role is meant for status monitoring tools to maintain -ongoing information on cluster operation, thus is perfectly suited to HARP. +ongoing information on cluster operation, thus it is well-suited to HARP. -## BDR Consensus Permissions +## BDR consensus permissions -When the `dcs.driver` configuration parameter is set to `bdr`, HARP will -utilize BDR itself as the Consensus Layer. As such, it requires access to API -methods that are currently only available to the `bdr_superuser` role. This +When the `dcs.driver` configuration parameter is set to `bdr`, HARP +uses BDR as the consensus layer. As such, it requires access to API +methods that are currently available only to the `bdr_superuser` role. This means the HARP-enabled user requires the following: ```sql GRANT bdr_superuser TO foobar; ``` -This may change in future versions of BDR, but currently access to the BDR -consensus model does require superuser equivalent permission. +Currently access to the BDR consensus model requires superuser equivalent +permission. !!! Important - BDR Superusers *are not* Postgres superusers. 
The `bdr_superuser` role is - merely granted elevated privileges within BDR itself, such as access to + BDR superusers *are not* Postgres superusers. The `bdr_superuser` role is + merely granted elevated privileges in BDR, such as access to restricted functions, tables, views, and other objects. diff --git a/product_docs/docs/harp/2.0/Makefile b/product_docs/docs/harp/2/Makefile similarity index 100% rename from product_docs/docs/harp/2.0/Makefile rename to product_docs/docs/harp/2/Makefile diff --git a/product_docs/docs/harp/2.0/images/bdr-ao-spec.dia b/product_docs/docs/harp/2/images/bdr-ao-spec.dia similarity index 100% rename from product_docs/docs/harp/2.0/images/bdr-ao-spec.dia rename to product_docs/docs/harp/2/images/bdr-ao-spec.dia diff --git a/product_docs/docs/harp/2.0/images/bdr-ao-spec.png b/product_docs/docs/harp/2/images/bdr-ao-spec.png similarity index 100% rename from product_docs/docs/harp/2.0/images/bdr-ao-spec.png rename to product_docs/docs/harp/2/images/bdr-ao-spec.png diff --git a/product_docs/docs/harp/2.0/images/ha-ao-bdr.dia b/product_docs/docs/harp/2/images/ha-ao-bdr.dia similarity index 100% rename from product_docs/docs/harp/2.0/images/ha-ao-bdr.dia rename to product_docs/docs/harp/2/images/ha-ao-bdr.dia diff --git a/product_docs/docs/harp/2.0/images/ha-ao-bdr.png b/product_docs/docs/harp/2/images/ha-ao-bdr.png similarity index 100% rename from product_docs/docs/harp/2.0/images/ha-ao-bdr.png rename to product_docs/docs/harp/2/images/ha-ao-bdr.png diff --git a/product_docs/docs/harp/2.0/images/ha-ao.dia b/product_docs/docs/harp/2/images/ha-ao.dia similarity index 100% rename from product_docs/docs/harp/2.0/images/ha-ao.dia rename to product_docs/docs/harp/2/images/ha-ao.dia diff --git a/product_docs/docs/harp/2.0/images/ha-ao.png b/product_docs/docs/harp/2/images/ha-ao.png similarity index 100% rename from product_docs/docs/harp/2.0/images/ha-ao.png rename to product_docs/docs/harp/2/images/ha-ao.png diff --git 
a/product_docs/docs/harp/2.0/images/ha-unit-bdr.dia b/product_docs/docs/harp/2/images/ha-unit-bdr.dia similarity index 100% rename from product_docs/docs/harp/2.0/images/ha-unit-bdr.dia rename to product_docs/docs/harp/2/images/ha-unit-bdr.dia diff --git a/product_docs/docs/harp/2.0/images/ha-unit-bdr.png b/product_docs/docs/harp/2/images/ha-unit-bdr.png similarity index 100% rename from product_docs/docs/harp/2.0/images/ha-unit-bdr.png rename to product_docs/docs/harp/2/images/ha-unit-bdr.png diff --git a/product_docs/docs/harp/2.0/images/ha-unit.dia b/product_docs/docs/harp/2/images/ha-unit.dia similarity index 100% rename from product_docs/docs/harp/2.0/images/ha-unit.dia rename to product_docs/docs/harp/2/images/ha-unit.dia diff --git a/product_docs/docs/harp/2.0/images/ha-unit.png b/product_docs/docs/harp/2/images/ha-unit.png similarity index 100% rename from product_docs/docs/harp/2.0/images/ha-unit.png rename to product_docs/docs/harp/2/images/ha-unit.png diff --git a/product_docs/docs/harp/2/index.mdx b/product_docs/docs/harp/2/index.mdx new file mode 100644 index 00000000000..7ddbea33d25 --- /dev/null +++ b/product_docs/docs/harp/2/index.mdx @@ -0,0 +1,22 @@ +--- +navTitle: HARP +title: "High Availability Routing for Postgres (HARP)" +directoryDefaults: + description: "High Availability Routing for Postgres (HARP) is a cluster-management tool for Bi-directional Replication (BDR) clusters." +--- + +High Availability Routing for Postgres (HARP) is a cluster-management tool for +[Bi-directional Replication (BDR)](/bdr/latest) clusters. The core design of +the tool is to route all application traffic in a single data center or +region to only one node at a time. This node, designated the lead master, acts +as the principle write target to reduce the potential for data conflicts. + +HARP leverages a distributed consensus model to determine availability of the +BDR nodes in the cluster. 
On failure or unavailability of the lead master, HARP +elects a new lead master and redirects application traffic. + +Together with the core capabilities of BDR, this mechanism of routing +application traffic to the lead master node enables fast failover and +switchover without risk of data loss. + +HARP requires BDR versions 3.6 and later. diff --git a/product_docs/docs/pgd/4/architectures/bronze.mdx b/product_docs/docs/pgd/4/architectures/bronze.mdx new file mode 100644 index 00000000000..7f47bfc48bb --- /dev/null +++ b/product_docs/docs/pgd/4/architectures/bronze.mdx @@ -0,0 +1,15 @@ +--- +title: "AlwaysOn Bronze (single active location - cloud region or on prem data center)" +navTitle: Bronze +--- + +The AlwaysOn Bronze architecture includes the following: + +- Two BDR data nodes +- One BDR witness node that doesn't hold data but is used for consensus +- Two HARP-Proxy nodes for routing application traffic to the "lead" master +- One barman node for backup and recovery can be onsite or offsite + +BDR and HARP-Proxy nodes should be spread across availability zones. + +![Always On Bronze architecture](../images/bronze.png) diff --git a/product_docs/docs/pgd/4/architectures/gold.mdx b/product_docs/docs/pgd/4/architectures/gold.mdx new file mode 100644 index 00000000000..9315d6550ba --- /dev/null +++ b/product_docs/docs/pgd/4/architectures/gold.mdx @@ -0,0 +1,24 @@ +--- +title: "AlwaysOn Gold (two active locations - cloud regions or on prem data centers)" +navTitle: "Gold" +--- + +The AlwaysOn Gold architecture is ideal for production environments where customers are ...... + +This architecture favors local resiliency/redundancy first and remote locations only when and an entire location is offline or fails. 
This architecture enables geo-distributed writes where no/low conflict handling is expected.
Also, EDB’s standard deployment tool for Always On architectures, TPAExec, must be enabled to support the variations before they can be supported in production environments. + +For additional information, see [EDB Postgres Distributed: The Next Generation of PostgreSQL High Availability](https://info.enterprisedb.com/WhitePaperPostgres-BDRTheNextGenerationofPostgreSQLHighAvailability.html?_ga=2.95771928.561267847.1631518032-2107066053.1628865149) and [The End of the Reign of Oracle RAC: EDB Postgres Distributed Always On](https://info.enterprisedb.com/White_Paper_The_End_of_the_Reign_of_OracleRAC_Postgres-BDR_Always_On.html?_ga=2.95771928.561267847.1631518032-2107066053.1628865149). + +## Standard EDB Always On architectures + +EDB has identified four standard architectures: + +- [Always On Bronze](bronze): Single active location (data center or availability zone \[AZ\]) +- [Always On Silver](silver): Single active location with redundant hardware to quickly restore failover capability and a backup in a disaster recovery (DR) location +- [Always On Gold](gold): Two active locations +- [Always On Platinum](platinum): Two active locations with additional redundant hardware in a hot standby mode + +All Always On architectures protect a progressively robust range of failure situations. For example, Always On Bronze protects against local hardware failure but doesn't provide protection from location (data center or AZ) failure. Always On Silver makes sure that a backup is kept at a different location, thus providing some protection in case of the catastrophic loss of a location. However, the database still must be restored from backup first, which might violate recovery time objective (RTO) requirements. Always On Gold provides two active locations connected in a multi-master mesh network, making sure that service remains available even in case a location goes offline. 
Finally, Always On Platinum adds redundant hot standby hardware in both locations to maintain local high availability in case of a hardware failure. + +Each architecture can provide zero recovery point objective (RPO), as data can be streamed synchronously to at least one local master, thus guaranteeing zero data loss in case of local hardware failure. + +Increasing the availability guarantee drives additional cost for hardware and licenses, networking requirements, and operational complexity. Carefully consider your availability and compliance requirements before choosing an architecture. + +## Architecture details + +EDB Postgres Distributed uses a [Raft](https://raft.github.io)-based consensus architecture. While regular database operations (insert, select, delete) don’t require cluster-wide consensus, EDB Postgres Distributed benefits from an odd number of BDR nodes to make decisions that require consensus, such as generating new global sequences, or distributed DDL operations. Even the simpler architectures always have three BDR nodes, even if not all of them are storing data. Always On Gold and Platinum, which use two active locations, introduce a fifth BDR node as a witness node to support the RAFT requirements. + +Applications connect to the standard Always On architectures by way of multi-host connection strings, where each pgBouncer/HAProxy server is a distinct entry in the multi-host connection string. Other connection mechanisms have been successfully deployed in production, but they're not part of the standard Always On architectures. 
+ + + + + + + + + + + + diff --git a/product_docs/docs/pgd/4/architectures/platinum.mdx b/product_docs/docs/pgd/4/architectures/platinum.mdx new file mode 100644 index 00000000000..6dd946c2180 --- /dev/null +++ b/product_docs/docs/pgd/4/architectures/platinum.mdx @@ -0,0 +1,16 @@ +--- +title: "AlwaysOn Platinum (two locations; fast HA restoration)" +navTitle: "Platinum" +--- + +The AlwaysOn Platinum architecture includes the following: + +- Four BDR data nodes (two in location A, two in location B) +- Two logical standby nodes (one in location A, one in location B) +- One witness node in location C (optional but recommended) +- Four HARP-Proxy nodes (two in location A, two in location B) +- Two Barman nodes (one in location A, one in location B) + +Both locations have redundant hot standby hardware to maintain local high availability in case of a hardware failure without waiting to restore a BDR node from backup. + +![Always On Platinum architecture](../images/platinum.png) diff --git a/product_docs/docs/pgd/4/architectures/silver.mdx b/product_docs/docs/pgd/4/architectures/silver.mdx new file mode 100644 index 00000000000..94e69fde68c --- /dev/null +++ b/product_docs/docs/pgd/4/architectures/silver.mdx @@ -0,0 +1,15 @@ +--- +title: "AlwaysOn Silver (single active location - cloud region or on prem data center)" +navTitle: Silver +--- + +The AlwaysOn Silver architecture includes the following: + +- Three BDR data nodes +- Two HARP-Proxy nodes +- Barman offsite (offsite is optional but recommended) + + +BDR and HARP nodes should be spread across AZs, and Barman can be located in location A. 
+ +![Always On Silver architecture](../images/silver.png) diff --git a/product_docs/docs/bdr/4.0/backup.mdx b/product_docs/docs/pgd/4/backup.mdx similarity index 100% rename from product_docs/docs/bdr/4.0/backup.mdx rename to product_docs/docs/pgd/4/backup.mdx diff --git a/product_docs/docs/pgd/4/compatibility_matrix.mdx b/product_docs/docs/pgd/4/compatibility_matrix.mdx new file mode 100644 index 00000000000..67a37b666df --- /dev/null +++ b/product_docs/docs/pgd/4/compatibility_matrix.mdx @@ -0,0 +1,9 @@ +--- +title: "Compatibility matrix" +--- + +| BDR | HARP | Postgres | +| --- | ----- | -------- | +| 4 | 2 | 14 | +| 3.7 | 2 | 11-13 | +| 3.6 | 2 | 11 | \ No newline at end of file diff --git a/product_docs/docs/pgd/4/considerations/choosing_arch.mdx b/product_docs/docs/pgd/4/considerations/choosing_arch.mdx new file mode 100644 index 00000000000..1baef5fc62d --- /dev/null +++ b/product_docs/docs/pgd/4/considerations/choosing_arch.mdx @@ -0,0 +1,44 @@ +--- +title: "Choosing your architecture" +--- + +Use these criteria to help you to select the appropriate Always On architecture. 
+ +| | Always On Bronze | Always On Silver | Always On Gold | Always On - Platinum | +|-----------------------------|------------------|------------------|----------------|----------------------| +| Hardware failure protection | Yes | Yes | Yes | Yes | +| Location failure protection | No (unless Barman is moved offsite)| Yes - Recovery from backup | Yes - instant failover to fully functional site | Yes - instant failover to fully functional site | +| Failover to DR or full DC | DR (if Barman is located offsite); NA otherwise | DR (if Barman is located offsite) | Full DC | Full DC | +| Zero downtime upgrade | Yes | Yes | Yes | Yes | +| Support of availability zones in public/ private cloud | Yes | Yes | Yes | Yes | +| Fast local restoration of high availability after device failure | No; time to restore HA: (1) VM prov + (2) approx 60 min/500GB | Yes; three local BDR nodes allow to maintain HA after device failure | No; time to restore HA: (1) VM prov + (2) approx 60 min/500GB | Yes; logical standbys can quickly be promoted to full BDR nodes | +| Cross data center network traffic | Backup traffic only (if Barman is located offsite); none otherwise | Backup traffic only (if Barman is located offsite); none otherwise | Full replication traffic | Full replication traffic | +| BDR license cost | 2 BDR nodes | 3 BDR nodes | 4 BDR nodes | 4 BDR nodes
2 logical standbys | + +## Deployment and sizing considerations + +For production deployments, EDB recommends a minimum of 12 cores for each BDR server and logical standbys. Witness nodes don't participate in the data replication operation and don't have to meet this requirement. Always size logical standbys exactly like the BDR nodes to avoid performance degradations in case of a node promotion. In production deployments, pgBouncer/HAProxy nodes require a minimum of four cores, with a minimum of two cores being assigned to pgBouncer and HAProxy each. EDB recommends detailed benchmarking of performance requirements, working with EDB’s Professional Services team. + +For development purposes, don't assign BDR nodes fewer than two cores. The sizing of Barman nodes depends on the database size and the data change rate. + +You can deploy BDR nodes, logical standbys, Barman nodes, and pgBouncer/HAProxy nodes on virtual machines or in a bare metal deployment mode. However, maintain anti-affinity between BDR nodes and pgBouncer/HAProxy nodes. +Don't co-locate multiple BDR nodes on VMs that are on the same physical hardware, as that reduces resilience. +Also don't co-locate multiple pgBouncer/HAProxy nodes on VMs on the same physical hardware, as that, too, reduces resilience. + +You can co-locate single pgBouncer/HAProxy nodes with single BDR nodes when deployed as VMs. + +When pgBouncer/HAProxy nodes are co-located with BDR nodes, use separate VMs to ensure proper CPU and memory resource assignment. + +## Clocks and timezones + +BDR has been designed to operate with nodes in multiple timezones, allowing a +truly worldwide database cluster. Individual servers do not need to be configured +with matching timezones, though we do recommend using log_timezone = UTC to +ensure the human readable server log is more accessible and comparable. + +Server clocks should be synchronized using NTP or other solutions. 
+ +Clock synchronization is not critical to performance, as is the case with some +other solutions. Clock skew can impact Origin Conflict Detection, though +BDR provides controls to report and manage any skew that exists. BDR also +provides Row Version Conflict Detection, as described in [Conflict Detection](/bdr/conflicts). diff --git a/product_docs/docs/pgd/4/considerations/choosing_durability.mdx b/product_docs/docs/pgd/4/considerations/choosing_durability.mdx new file mode 100644 index 00000000000..f9ebe0a7f4c --- /dev/null +++ b/product_docs/docs/pgd/4/considerations/choosing_durability.mdx @@ -0,0 +1,16 @@ +--- +title: "Choosing durability" +--- + +These are the considerations for choosing your architecture based on durability: +- Async replication for speed + - Multiple types of conflict detection + - Column-level conflict resolution + - Conflict triggers + - Foreign key handlers + - Commit at most once (CAMO) +- Conflict-free replicated data types (CRDTs) +- Sync replication for consistency + - Eager replication + +For more information, see [Durability](/bdr/latest/durability.mdx). \ No newline at end of file diff --git a/product_docs/docs/pgd/4/considerations/choosing_server.mdx b/product_docs/docs/pgd/4/considerations/choosing_server.mdx new file mode 100644 index 00000000000..68e2ef5537c --- /dev/null +++ b/product_docs/docs/pgd/4/considerations/choosing_server.mdx @@ -0,0 +1,24 @@ +--- +title: "Choosing the database server distribution" +--- + +It is important to note that some key BDR features depend on certain core capabilities being available within the targeted PostgresSQL database server. Therefore, it is essential for the BDR customers to also adopt the PostgresSQL database server flavor that is best suited to their business needs. 
For example, if having the BDR feature "Commit At Most Once (CAMO)" is mission critical to a BDR customer’s use case, they should not adopt the community PostgreSQL flavor for it does not have the core capability required to handle CAMO. The full feature matrix compatibility can be found in Feature Compatibility appendix. + +The following table lists features of BDR and whether they are supported by +given variant of Postgres and optionally from which version. + +| Feature | PostgreSQL | EDB Postgres Extended | EDB Postgres Advanced | +| ----------------------------------------------- | ---------- | --------------------- | --------------------- | +| Commit At Most Once (CAMO) | N | Y | 14+ | +| Eager Replication | N | Y | 14+ | +| Decoding Worker | N | 13+ | 14+ | +| Assesment Tooling | N | Y | 14+ | +| Lag Tracker | N | Y | 14+ | +| Timestamp Snapshots | N | Y | 14+ | +| Transaction Streaming | 14+ | 13+ | 14+ | +| Missing Partition Conflict | N | Y | 14+ | +| No need for UPDATE Trigger on tables with TOAST | N | Y | 14+ | +| Automatically hold back FREEZE | N | Y | 14+ | + + + diff --git a/product_docs/docs/pgd/4/considerations/index.mdx b/product_docs/docs/pgd/4/considerations/index.mdx new file mode 100644 index 00000000000..099116ce5ff --- /dev/null +++ b/product_docs/docs/pgd/4/considerations/index.mdx @@ -0,0 +1,5 @@ +--- +title: "Considerations" +indexCards: simple +--- + diff --git a/product_docs/docs/pgd/4/deployments/index.mdx b/product_docs/docs/pgd/4/deployments/index.mdx new file mode 100644 index 00000000000..1fe02a718f1 --- /dev/null +++ b/product_docs/docs/pgd/4/deployments/index.mdx @@ -0,0 +1,16 @@ +--- +title: "Deployment options" +indexCards: simple + +--- + +You can deploy and install EDB Postgres Distributed products using the following methods: + +- TPAexec is an orchestration tool that uses Ansible to build Postgres clusters as specified by TPA (Trusted Postgres Architecture), a set of reference architectures that document how to set up and 
operate Postgres in various scenarios. TPA represents the best practices followed by EDB (and formerly, 2ndQuadrant), and its recommendations are as applicable to quick testbed setups as to production environments. + +Coming soon: + +- BigAnimal is a fully managed database-as-a-service with built-in Oracle compatibility, running in your cloud account and operated by the Postgres experts. BigAnimal makes it easy to set up, manage, and scale your databases. The addition of extreme high availability support through EDB Postres Distributed allows single-region Always On “Gold” clusters: two BDR groups in different availability zones in a single cloud region, with a witness node in a third availability zone. + +- EDB Postgres for Kubernetes is an operator is designed, developed, and supported by EDB that covers the full lifecycle of a highly available Postgres database clusters with a primary/standby architecture, using native streaming replication. It is based on the open source CloudNativePG operator, and provides additional value such as compatibility with Oracle using EDB Postgres Advanced Server and additional supported platforms such as IBM Power and OpenShift. + diff --git a/product_docs/docs/pgd/4/deployments/tpaexec/configuration_syntax.mdx b/product_docs/docs/pgd/4/deployments/tpaexec/configuration_syntax.mdx new file mode 100644 index 00000000000..b066f933f89 --- /dev/null +++ b/product_docs/docs/pgd/4/deployments/tpaexec/configuration_syntax.mdx @@ -0,0 +1,46 @@ +--- +title: "Configuration syntax" +navTitle: "Configuration syntax" +--- + +---------------- +TAKEN FROM: https://documentation.enterprisedb.com/tpa/release/22.13-1/architecture-BDR-Always-ON/ + +The BDR-Always-ON architecture configuration is intended for use in production. HARP2 is enabled by default in BDR-Always-ON architecture. 
+ +You can select from four variants, using the --layout configure option: + +- bronze: 2×bdr+primary, bdr+witness, barman, 2×harp-proxy + +- silver: bronze, with bdr+witness promoted to bdr+primary, and barman moved to separate location + +- gold: two symmetric locations with 2×bdr+primary, 2×harp-proxy, and barman each; plus a bdr+witness in a third location + +- platinum: gold, but with one bdr+readonly (logical standby) added to each of the main locations + +See [Architectures](/pgd/4/architectures) for the detailed layout diagrams. + +For example, + +``` +[tpa]$ tpaexec configure ~/clusters/bdr \ + --architecture BDR-Always-ON \ + --layout gold \ + --platform aws --region eu-west-1 --instance-type t3.micro \ + --distribution Debian-minimal +``` + +| Flags | Description | +| ------ | ----------- | +| --architecture BDR-Always-ON | Required. Produces a working configuration. | +| --layout layoutname | Required. Specify one of the supported BDR use-case variations: bronze, silver, gold, and platinum. The bronze, gold, and platinum layouts have a BDR witness node to ensure odd number of nodes for Raft consensus majority. Witness nodes do not participate in the data replication. | +| --bdr-node-group groupname | Optional. Set the name of the BDR node group. The default is bdrgroup. | +| --bdr-database dbname | Optional. Set the name of the database with BDR enabled. The default is bdrdb. | +| --enable-camo | Optional. Set the pair of BDR primary instances in each region to be each other's CAMO partners. 
| + + + +You may also specify any of the options described in the online help for configure-options: +``` +tpaexec help configure-options +``` \ No newline at end of file diff --git a/product_docs/docs/pgd/4/deployments/tpaexec/index.mdx b/product_docs/docs/pgd/4/deployments/tpaexec/index.mdx new file mode 100644 index 00000000000..e0dfab176a1 --- /dev/null +++ b/product_docs/docs/pgd/4/deployments/tpaexec/index.mdx @@ -0,0 +1,19 @@ +--- +title: TPAexec +navigation: +- overview +- configuration_syntax +- quick_start + +--- + +The standard way of deploying EDB Distributed Postgres in self managed setting, +including physical and virtual machines, both self-hosted and in the cloud +(EC2) is to use EDB's deployment tool called TPAexec. + +TPAexec is an orchestration tool which can be used to build Postgres clusters +according to a Trusted Postgres Architecture (TPA) specification. + +TPA represents the best practices followed by EDB and its +recommendations are as applicable to quick testbed setups as to production +environments. \ No newline at end of file diff --git a/product_docs/docs/pgd/4/deployments/tpaexec/overview.mdx b/product_docs/docs/pgd/4/deployments/tpaexec/overview.mdx new file mode 100644 index 00000000000..bb0beebaf40 --- /dev/null +++ b/product_docs/docs/pgd/4/deployments/tpaexec/overview.mdx @@ -0,0 +1,56 @@ +--- +title: "TPAexec overview" +--- + +TAKEN FROM: https://documentation.enterprisedb.com/tpa/release/22.13-1/ + +TPAexec is an orchestration tool that uses Ansible to build Postgres clusters as specified by TPA (Trusted Postgres Architecture), a set of reference architectures that document how to set up and operate Postgres in various scenarios. TPA represents the best practices followed by EDB (and formerly, 2ndQuadrant), and its recommendations are as applicable to quick testbed setups as to production environments. 
+ +## Capabilities and supported software + +### Configuration +The `tpaexec configure` command generates a simple YAML configuration file to describe a cluster, based on the options you select. The configuration is ready for immediate use, and you can modify it to better suit your needs. Editing the configuration file is the usual way to make any configuration changes to your cluster, both before and after it's created. + +At this stage, you must select an architecture and a platform for the cluster. An architecture is a recommended layout of servers and software to set up Postgres for a specific purpose. Examples include "M1" (Postgres with a primary and streaming replicas) and "BDR-Always-ON" (Postgres with BDR in an HA configuration). A platform is a means to host the servers to deploy any architecture, e.g., AWS, Docker, or bare-metal servers. + +### Provisioning +The `tpaexec provision` command creates instances and other resources required by the cluster. The details of the process depend on the architecture (for example, M1) and platform (for example, AWS) that you selected while configuring the cluster. + +For example, given AWS access with the necessary privileges, TPAexec provisions EC2 instances, VPCs, subnets, routing tables, internet gateways, security groups, EBS volumes, elastic IPs, and so on. + +You can also "provision" existing servers by selecting the "bare" platform and providing connection details. Whether these are bare metal servers or those provisioned separately on a cloud platform, they can be used just as if they had been created by TPAexec. + +You are not restricted to a single platform—you can spread your cluster out across some AWS instances (in multiple regions) and some on-premise servers, or servers in other data centres, as needed. + +At the end of the provisioning stage, you will have the required number of instances with the basic operating system installed, which TPAexec can access via SSH (with sudo to root). 
+ +### Deployment +The `tpaexec deploy` command installs and configures Postgres and other software on the provisioned servers (which may or may not have been created by TPAexec; but it doesn't matter who created them so long as SSH and sudo access is available). This includes setting up replication, backups, and so on. + +At the end of the deployment stage, Postgres is up and running. + +### Testing +The `tpaexec test` command executes various architecture and platform-specific tests against the deployed cluster to ensure that it is working as expected. + +At the end of the testing stage, you will have a fully-functioning cluster. + +## Incremental changes +TPAexec is carefully designed so that provisioning, deployment, and testing are idempotent. You can run through them, make a change to config.yml, and run through the process again to deploy the change. If nothing has changed in the configuration or on the instances, then rerunning the entire process does not change anything either. + +## Extensible through Ansible +TPAexec supports a variety of configuration options, so you can do a lot just by editing config.yml and re-running provision/deploy/test. Should you need to go beyond what is already implemented, you can write hook scripts to extend the deployment process. + +This mechanism places the full range of Ansible functionality at your disposal during every stage of the deployment. For example, any tasks in hooks/pre-deploy.yml are executed before the main deployment; and there are also post-deploy and many other hooks. + +## Cluster management +Once your cluster is up and running, TPAexec provides convenient cluster management functions, including configuration changes, switchover, and zero-downtime minor-version upgrades. These features make it easier and safer to manage your cluster than making the changes by hand. + +## It's just Postgres +TPAexec can create complex clusters with many features configured, but the result is just Postgres. 
The installation follows some conventions designed to make life simpler, but there is no hidden magic or anything standing in the way between you and the database. You can do everything on a TPA cluster that you could do on any other Postgres installation. + +## Getting started +See these topics to get complete your installation and then configure your first cluster: + +- [Quick setup](quick_start) +- [TPAexec installation](https://documentation.enterprisedb.com/tpa/release/22.13-1/INSTALL/) for more detailed instructions + diff --git a/product_docs/docs/pgd/4/deployments/tpaexec/quick_start.mdx b/product_docs/docs/pgd/4/deployments/tpaexec/quick_start.mdx new file mode 100644 index 00000000000..dbe267893f9 --- /dev/null +++ b/product_docs/docs/pgd/4/deployments/tpaexec/quick_start.mdx @@ -0,0 +1,38 @@ +--- +title: "Quick setup with TPAexec" +navTitle: "Quick setup" +--- + +The following steps setup EDB Distributed Postgres with the Always-ON Silver +architecture using Amazon EC2. + +First, we generate configuration file using the `configure` command: +``` +$ tpaexec configure myedbdpcluster --architecture BDR-Always-ONO --layout Silver --platform aws +``` + +This will create a subdirectory directory in current working directory called +`myedbdpcluster`. In that directory there will be `config.yml` with configuration +TPAexec needs to crate the cluster. You can edit the `config.yml` as needed, +for example changing the ip address range used for servers, or adjusting locations +of nodes. + +Next, we provision the cluster, in the case of the current example, this will +create the EC2 instances, configure VPC, etc: +``` +tpaexec provision myedbdpcluster +``` + +Finally, we can deploy the needed packages, configuration and setup the actual +EDB Distributed Postgres cluster: +``` +tpaexec deploy myedbdpcluster +``` + +After the successful run of the `deploy` command the cluster will be ready to use, +use can connect to it via `psql`, or any other database client. 
+ +It's also possible to run a test that ensures the cluster is running as expected: +``` +tpaexec test myedbdpcluster +``` \ No newline at end of file diff --git a/product_docs/docs/pgd/4/images/bdr-ao-spec.png b/product_docs/docs/pgd/4/images/bdr-ao-spec.png new file mode 100644 index 00000000000..45d87bc5c20 --- /dev/null +++ b/product_docs/docs/pgd/4/images/bdr-ao-spec.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68f18458d6e1bc04715e6a64381c39e142b9d09ac7321eba0bf2d79c2e4ba5d2 +size 44744 diff --git a/product_docs/docs/pgd/4/images/bronze.png b/product_docs/docs/pgd/4/images/bronze.png new file mode 100644 index 00000000000..0804afc253d --- /dev/null +++ b/product_docs/docs/pgd/4/images/bronze.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15cd58fb4d4fe0edb9e4a58c77a995e9bf7ca7e7e93784fd40d07f5a750b6bb3 +size 130314 diff --git a/product_docs/docs/pgd/4/images/gold.png b/product_docs/docs/pgd/4/images/gold.png new file mode 100644 index 00000000000..b01911d1d7a --- /dev/null +++ b/product_docs/docs/pgd/4/images/gold.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11bd0eefd95234f11c08170b4bc177bd9740a4686def330d7f9e91210f089b5c +size 200171 diff --git a/product_docs/docs/pgd/4/images/gold2.png b/product_docs/docs/pgd/4/images/gold2.png new file mode 100644 index 00000000000..13062dcdc16 --- /dev/null +++ b/product_docs/docs/pgd/4/images/gold2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d991c511cfbfd3bdc257da62766a46154933e418950cf6da9b98b0a2df32d2fa +size 228061 diff --git a/product_docs/docs/pgd/4/images/gold3.png b/product_docs/docs/pgd/4/images/gold3.png new file mode 100644 index 00000000000..2fd3fb966a1 --- /dev/null +++ b/product_docs/docs/pgd/4/images/gold3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:369c239432398b32a0a91c0e78b56bf546e6f3632cb25ed354e94b205275ed9e +size 265526 diff --git 
a/product_docs/docs/pgd/4/images/ha-ao-bdr.png b/product_docs/docs/pgd/4/images/ha-ao-bdr.png new file mode 100644 index 00000000000..e53622f1e92 --- /dev/null +++ b/product_docs/docs/pgd/4/images/ha-ao-bdr.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229f3d9b53ef214ea16b9ed886d44271c65d2a0ff72bde1bb92a060aba67df76 +size 9024 diff --git a/product_docs/docs/pgd/4/images/ha-ao.png b/product_docs/docs/pgd/4/images/ha-ao.png new file mode 100644 index 00000000000..97da1a5d298 --- /dev/null +++ b/product_docs/docs/pgd/4/images/ha-ao.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e272de5db5b7678bd849e76e48a2b44b604c3c1ad1b94076000b95acc53d3da +size 9660 diff --git a/product_docs/docs/pgd/4/images/ha-unit-bdr.png b/product_docs/docs/pgd/4/images/ha-unit-bdr.png new file mode 100644 index 00000000000..3203404e3be --- /dev/null +++ b/product_docs/docs/pgd/4/images/ha-unit-bdr.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15d749eb33d3adbd639ca2bd534afabb67cd3bbe7b28b2a7a4f40372119a912c +size 6119 diff --git a/product_docs/docs/pgd/4/images/ha-unit.png b/product_docs/docs/pgd/4/images/ha-unit.png new file mode 100644 index 00000000000..dc18fc9ffbe --- /dev/null +++ b/product_docs/docs/pgd/4/images/ha-unit.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:877f61a90d58a02b20030b76156d80c632c6bb1d34f431975d308a32bbf9d046 +size 7325 diff --git a/product_docs/docs/pgd/4/images/nodes.png b/product_docs/docs/pgd/4/images/nodes.png new file mode 100644 index 00000000000..7f969ed1e71 --- /dev/null +++ b/product_docs/docs/pgd/4/images/nodes.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:264eccb0911c492ba60dccf3f9df14aa93119336b8845b1c772859bd7a031939 +size 45015 diff --git a/product_docs/docs/pgd/4/images/platinum.png b/product_docs/docs/pgd/4/images/platinum.png new file mode 100644 index 00000000000..a5e619459eb --- /dev/null +++ 
b/product_docs/docs/pgd/4/images/platinum.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b7ef0bfb8b3b4179bd37ea299014fc1bff3b738e7d3591080a014481ebb887 +size 335237 diff --git a/product_docs/docs/pgd/4/images/silver.png b/product_docs/docs/pgd/4/images/silver.png new file mode 100644 index 00000000000..2e0badc874e --- /dev/null +++ b/product_docs/docs/pgd/4/images/silver.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c939c611db662ab938e1df6190dc9e1ad2655d060c3cf40fa8e3cbc05706c0ff +size 132604 diff --git a/product_docs/docs/pgd/4/index.mdx b/product_docs/docs/pgd/4/index.mdx new file mode 100644 index 00000000000..d0ae1be601e --- /dev/null +++ b/product_docs/docs/pgd/4/index.mdx @@ -0,0 +1,21 @@ +--- +title: "EDB Postgres Distributed" +navigation: + - rel_notes + - "#Getting started" + - terminology + - "#Planning" + - overview + - architectures + - considerations + - "#Installing" + - deployments + - upgrades + - compatibility_matrix + - "#Using" + - backups + - monitoring +--- + + +EDB Postgres Distributed provides multi-master replication and data distribution with advanced conflict management, data-loss protection, and throughput up to 5X faster than native logical replication, and enables distributed PostgreSQL clusters with high availability up to five 9s. diff --git a/product_docs/docs/bdr/4.0/monitoring.mdx b/product_docs/docs/pgd/4/monitoring.mdx similarity index 100% rename from product_docs/docs/bdr/4.0/monitoring.mdx rename to product_docs/docs/pgd/4/monitoring.mdx diff --git a/product_docs/docs/pgd/4/overview/index.mdx b/product_docs/docs/pgd/4/overview/index.mdx new file mode 100644 index 00000000000..fdf74eb0640 --- /dev/null +++ b/product_docs/docs/pgd/4/overview/index.mdx @@ -0,0 +1,512 @@ +--- +title: "Components and architectures" +--- + +EDB Postgres Distributed provides loosely-coupled multi-master logical replication +using a mesh topology. 
This means that you can write to any server and the +changes will be sent directly, row-by-row to all the +other servers that are part of the same mesh. + +![node diagram](../images/nodes.png) + +By default EDB Postgres Distributed uses asynchronous replication, applying changes on +the peer nodes only after the local commit. Additional levels of synchronicity can +be configured between different nodes, groups of nodes or all nodes by configuring +[CAMO](/bdr/latest/camo), [Group Commit ](/bdr/latest/group-commit) or +[eager all node replication](/bdr/latest/eager) features. + +EDB Postgres Distributed consists of several components that make the whole +cluster work. + +## Postgres server + +Three different Postgres distributions can be used: + +- [PostgreSQL](https://www.postgresql.org/) - open source +- [EDB Postgres Extended Server](https://techsupport.enterprisedb.com/customer_portal/sw/2ndqpostgres/) - PostgreSQL compatible and optimized for replication +- [EDB Postgres Advanced Server](/epas/latest) - Oracle compatible, optimized for replication, and additional enterprise features + +What Postgres distribution and version is right for you depends on the features you need +please refer to the feature matrix for detailed comparison. + +## Postgres extensions + +### BDR + +A Postgres server with the BDR extension installed will be referred to as a BDR +node. BDR nodes be either data nodes or witness nodes. + +Witness nodes don't participate in data replication, and are only used as a +tie-breaker for consensus. + +### pglogical3 (3.6/3.7) + +pglogical is a Postgres extension that provides logical replication using +logical decoding since Postges 9.4. + +Older versions of BDR (3.6/3.7) depend on pglogical3 to provide the replication +channel upon which BDR builds. + +## HARP + +HARP is connection management tool for BDR cluster. 
+ +It leverages consensus-driven quorum to determine the correct connection end-point +in a semi-exclusive manner to prevent unintended multi-node writes from an +application. This reduces the potential for data conflicts. + +## Basic architecture + +### Multiple groups + +A BDR node is a member of at least one **Node Group**, and in the most +basic architecture there is a single node group for the whole BDR +cluster. + +### Multiple masters + +Each node (database) participating in a BDR group both receives +changes from other members and can be written to directly by the user. + +This is distinct from Hot or Warm Standby, where only one primary +server accepts writes, and all the other nodes are standbys that +replicate either from the primary or from another standby. + +You don't have to write to all the masters, all of the time; it's +a frequent configuration to direct writes mostly to just one master. + +### Asynchronous, by default + +Changes made on one BDR node are not replicated to other nodes until +they are committed locally. As a result the data is not exactly the +same on all nodes at any given time; some nodes will have data that +has not yet arrived at other nodes. PostgreSQL's block-based replication +solutions default to asynchronous replication as well. In BDR, +because there are multiple masters and as a result multiple data streams, +data on different nodes might differ even when +`synchronous_commit` and `synchronous_standby_names` are used. + +Additional levels of synchronicity can +be configured between different nodes, groups of nodes or all nodes by configuring +[CAMO](/bdr/latest/camo), [Group Commit ](/bdr/latest/group-commit) or +[eager all node replication](/bdr/latest/eager) features. + +### Mesh topology + +BDR is structured around a mesh network where every node connects to every +other node and all nodes exchange data directly with each other. 
There is no +forwarding of data within BDR except in special circumstances such as node +addition and node removal. Data may arrive from outside the BDR cluster or +be sent onwards using native PostgreSQL logical replication. + +### Logical replication + +Logical replication is a method of replicating data rows and their changes, +based upon their replication identity (usually a primary key). +We use the term *logical* in contrast to *physical* replication, which uses +exact block addresses and byte-by-byte replication. Index changes are not +replicated, thereby avoiding write amplification and reducing bandwidth. + +Logical replication starts by copying a snapshot of the data from the +source node. Once that is done, later commits are sent to other nodes as +they occur in real time. Changes are replicated without re-executing SQL, +so the exact data written is replicated quickly and accurately. + +Nodes apply data in the order in which commits were made on the source node, +ensuring transactional consistency is guaranteed for the changes from +any single node. Changes from different nodes are applied independently of +other nodes to ensure the rapid replication of changes. + +Replicated data is sent in binary form, when it is safe to do so. + +### High availability + +Each master node can be protected by one or more standby nodes, so any node +that goes down can be quickly replaced and continue. Each standby node can +be either a logical or a physical standby node. + +Replication continues between currently connected nodes even if one or more +nodes are currently unavailable. When the node recovers, replication +can restart from where it left off without missing any changes. + +Nodes can run different release levels, negotiating the required protocols +to communicate. As a result, BDR clusters can use rolling upgrades, even +for major versions of database software. + +DDL is automatically replicated across nodes by default. 
DDL execution can +be user controlled to allow rolling application upgrades, if desired. + +### Limits + +BDR can run hundreds of nodes on good enough hardware and network, however +for mesh based deployments it's generally not recommended to run more than +32 nodes in one cluster. +Each master node can be protected by multiple physical or logical standby nodes; +there is no specific limit on the number of standby nodes, +but typical usage would be to have 2-3 standbys per master. Standby nodes don't +add additional connections to the mesh network so they are not included in the +32 node recommendation. + +BDR currently has hard limit of no more than 1000 active nodes. This is both +the current limit for Raft connections allowed and a limitation of nodes +that the distributed sequence algorithm can support. + +BDR places a limit that at most 10 databases in any one PostgreSQL instance +can be BDR nodes across different BDR node groups. However BDR works best if +only one BDR database per PostgreSQL instance is used. + +The minimum recommended number of nodes in BDR cluster is 3, because with +2 nodes the consensus stops working if one of the node stops working. One +of the three nodes can however be a witness node. + +## Architectural options and performance + +### Characterizing BDR performance + +BDR can be configured in a number of different architectures, each of which has +different performance and scalability characteristics. + +The Group is the basic building block of a BDR Group consisting of 2+ nodes +(servers). Within a Group, each node is in a different AZ, with dedicated router +and backup, giving Immediate Switchover and High Availability. Each Group has a +dedicated Replication Set defined on it. If the Group loses a node it is easily +repaired/replaced by copying an existing node from the Group. 
+ +Adding more master nodes to a BDR Group does not result in significant write +throughput increase when most tables are replicated because BDR has to replay +all the writes on all nodes. Because BDR writes are in general more effective +than writes coming from Postgres clients via SQL, some performance increase +can be achieved. Read throughput generally scales linearly with the number of +nodes. + +The following architectures are available: + +- Multimaster/Single group +- BDR AlwaysOn + +The simplest architecture is just to have one group, so let's examine that first: + +### BDR MultiMaster within one group + +By default, BDR will keep one copy of each table on each node in the Group and any +changes will be propagated to all nodes in the Group. + +Since copies of data are everywhere, SELECTs need only ever access the local node. +On a read-only cluster, performance on any one node will not be affected by the +number of nodes. Thus adding nodes will increase linearly the total possible SELECT +throughput. + +INSERTs, UPDATEs and DELETEs (DML) are performed locally, then the changes will +be propagated to all nodes in the Group. The overhead of DML apply is less than the +original execution, so if you run a pure write workload on multiple nodes +concurrently, a multi-node cluster will be able to handle more TPS than a single node. + +Conflict handling has a cost that will act to reduce the throughput. The throughput +is then dependent upon how much contention the application displays in practice. +Applications with very low contention will perform better than a single node; +applications with high contention could perform worse than a single node. +These results are consistent with any multi-master technology, they are not a facet +or peculiarity of BDR. + +Eager Replication can avoid conflicts, but is inherently more expensive. + +Changes are sent concurrently to all nodes so that the replication lag is minimised. 
+Adding more nodes means using more CPU for replication, so peak TPS will reduce +slightly as each new node is added. + +If the workload tries to uses all CPU resources then this will resource constrain +replication, which could then affect the replication lag. + +### BDR AlwaysOn + +The AlwaysOn architecture is built from 2 Groups, in 2 separate regions. Each Group +provides HA and IS, but together they also provide Disaster Recovery (DR), so we refer +to this architecture as AlwaysOn with Very High Availability. + +Tables are created across both Groups, so any change goes to all nodes, not just to +nodes in the local Group. + +One node is the target for the main application. All other nodes are described as +shadow nodes (or "read-write replica"), waiting to take over when needed. If a node +loses contact we switch immediately to a shadow node to continue processing. If a +Group fails, we can switch to the other Group. Scalability is not the goal of this +architecture. + +Since we write mainly to only one node, the possibility of contention between is +reduced to almost zero and as a result performance impact is much reduced. + +CAMO is Eager Replication within the local Group, lazy with regard to other Groups. + +Secondary applications may execute against the shadow nodes, though these should be +reduced or interrupted if the main application begins using that node. + +Future feature: One node is elected as main replicator to other Groups, limiting CPU +overhead of replication as the cluster grows and minimizing the bandwidth to other Groups. + +## Deployment + +BDR is intended to be deployed in one of a small number of known-good configurations, +using either TPAexec or a configuration management approach +and deployment architecture approved by Technical Support. + +Manual deployment is not recommended and may not be supported. + +Please refer to the `TPAexec Architecture User Manual` for your architecture. 
+ +Log messages and documentation are currently available only in English. + +## The importance of Quorum + +The central purpose of HARP is to enforce full Quorum on any Postgres cluster +it manages. Quorum is merely a term generally applied to a voting body that +mandates a certain minimum of attendees are available to make a decision. Or +perhaps even more simply: Majority Rules. + +In order for any vote to end in a result other than a tie, an odd number of +nodes must constitute the full cluster membership. Quorum however does not +strictly demand this restriction; a simple majority will suffice. This means +that in a cluster of N nodes, Quorum requires a minimum of N/2+1 nodes to hold +a meaningful vote. + +All of this ensures the cluster is always in agreement regarding which node +should be "in charge". For a BDR cluster consisting of multiple nodes, this +determines which node is the primary write target. HARP designates this node +as the Lead Master. + +## Reducing write targets + +The consequence of ignoring the concept of Quorum, or applying it +insufficiently, may lead to a Split Brain scenario where the "correct" write +target is ambiguous or unknowable. In a standard Postgres cluster, it is +important that only a single node is ever writable and sending replication +traffic to the remaining nodes. + +Even in Multi-Master capable approaches such as BDR, it can be beneficial to +reduce the amount of necessary conflict management to derive identical data +across the cluster. In clusters that consist of multiple BDR nodes per physical +location or region, this usually means a single BDR node acts as a "Leader" and +remaining nodes are "Shadows". These Shadow nodes are still writable, but doing +so is discouraged unless absolutely necessary. + +By leveraging Quorum, it's possible for all nodes to agree exactly which +Postgres node should represent the entire cluster, or a local BDR region. 
Any +nodes that lose contact with the remainder of the Quorum, or are overruled by +it, by definition cannot become the cluster Leader. + +This prevents Split Brain situations where writes unintentionally reach two +Postgres nodes. Unlike technologies such as VPNs, Proxies, load balancers, or +DNS, a Quorum-derived consensus cannot be circumvented by mis-configuration or +network partitions. So long as it's possible to contact the Consensus layer to +determine the state of the Quorum maintained by HARP, only one target is ever +valid. + +## Basic architecture + +The design of HARP comes in essentially two parts consisting of a Manager and +a Proxy. The following diagram describes how these interact with a single +Postgres instance: + +![HARP Unit](../images/ha-unit.png) + +The Consensus Layer is an external entity where Harp Manager maintains +information it learns about its assigned Postgres node, and HARP Proxy +translates this information to a valid Postgres node target. Because Proxy +obtains the node target from the Consensus Layer, several such instances may +exist independently. + +While using BDR itself as the Consensus Layer, each server node resembles this +variant instead. + +![HARP Unit w/BDR Consensus](../images/ha-unit-bdr.png) + +In either case, each unit consists of the following elements: + +* A Postgres or EDB instance +* A Consensus Layer resource, meant to track various attributes of the Postgres + instance +* A HARP Manager process to convey the state of the Postgres node to the + Consensus Layer +* A HARP Proxy service that directs traffic to the proper Lead Master node, + as derived from the Consensus Layer + +Not every application stack has access to additional node resources +specifically for the Proxy component, so it can be combined with the +application server to simplify the stack itself. 
+ +This is a typical design using two BDR nodes in a single Data Center organized in a Lead Master / Shadow Master configuration: + +![HARP Cluster](../images/ha-ao.png) + +Note that when using BDR itself as the HARP Consensus Layer, at least three +fully qualified BDR nodes must be present to ensure a quorum majority. + +![HARP Cluster w/BDR Consensus](../images/ha-ao-bdr.png) + +(Not shown in the above diagram are connections between BDR nodes.) + +## How it works + +When managing a BDR cluster, HARP maintains at most one "Leader" node per +defined Location. Canonically this is referred to as the Lead Master. Other BDR +nodes which are eligible to take this position are Shadow Master state until +such a time they take the Leader role. + +Applications may contact the current Leader only through the Proxy service. +Since the Consensus Layer requires Quorum agreement before conveying Leader +state, any and all Proxy services will direct traffic to that node. + +At a high level, this is ultimately what prevents application interaction with +multiple nodes simultaneously. + +### Determining a Leader + +As an example, consider the role of Lead Master within a locally subdivided +BDR Always-On group as may exist within a single data center. When any +Postgres or Manager resource is started, and after a configurable refresh +interval, the following must occur: + +1. The Manager checks the status of its assigned Postgres resource. + - If Postgres is not running, try again after configurable timeout. + - If Postgres is running, continue. +2. The Manager checks the status of the Leader lease in the Consensus Layer. + - If the lease is unclaimed, acquire it and assign the identity of + the Postgres instance assigned to this Manager. This lease duration is + configurable, but setting it too low may result in unexpected leadership + transitions. + - If the lease is already claimed by us, renew the lease TTL. + - Otherwise do nothing. 
+ +Obviously a lot more happens here, but this simplified version should explain +what's happening. The Leader lease can only be held by one node, and if it's +held elsewhere, HARP Manager gives up and tries again later. + +!!! Note + Depending on the chosen Consensus Layer, rather than repeatedly looping to + check the status of the Leader lease, HARP will subscribe to notifications + instead. In this case, it can respond immediately any time the state of the + lease changes, rather than polling. Currently this functionality is + restricted to the etcd Consensus Layer. + +This means HARP itself does not hold elections or manage Quorum; this is +delegated to the Consensus Layer. The act of obtaining the lease must be +acknowledged by a Quorum of the Consensus Layer, so if the request succeeds, +that node leads the cluster in that Location. + +### Connection routing + +Once the role of the Lead Master is established, connections are handled +with a similar deterministic result as reflected by HARP Proxy. Consider a case +where HARP Proxy needs to determine the connection target for a particular backend +resource: + +1. HARP Proxy interrogates the Consensus layer for the current Lead Master in + its configured location. +2. If this is unset or in transition; + - New client connections to Postgres are barred, but clients will + accumulate and be in a paused state until a Lead Master appears. + - Existing client connections are allowed to complete current transaction, + and are then reverted to a similar pending state as new connections. +3. Client connections are forwarded to the Lead Master. + +Note that the interplay demonstrated in this case does not require any +interaction with either HARP Manager or Postgres. The Consensus Layer itself +is the source of all truth from the Proxy's perspective. + +### Colocation + +The arrangement of the work units is such that their organization is required +to follow these principles: + +1. 
The Manager and Postgres units must exist concomitantly within the same + node. +2. The contents of the Consensus Layer dictate the prescriptive role of all + operational work units. + +This delegates cluster Quorum responsibilities to the Consensus Layer itself, +while HARP leverages it for critical role assignments and key/value storage. +Neither storage or retrieval will succeed if the Consensus Layer is inoperable +or unreachable, thus preventing rogue Postgres nodes from accepting +connections. + +As a result, the Consensus Layer should generally exist outside of HARP or HARP +managed nodes for maximum safety. Our reference diagrams reflect this in order +to encourage such separation, though it is not required. + +!!! Note + In order to operate and manage cluster state, BDR contains its own + implementation of the Raft Consensus model. HARP may be configured to + leverage this same layer to reduce reliance on external dependencies and + to preserve server resources. However, there are certain drawbacks to this + approach that are discussed in further depth in the section on the + [Consensus Layer](09_consensus-layer). + +## Recommended architecture and use + +HARP was primarily designed to represent a BDR Always-On architecture which +resides within two (or more) Data Centers and consists of at least five BDR +nodes. This does not count any Logical Standby nodes. + +The current and standard representation of this can be seen in the following +diagram: + +![BDR Always-On Reference Architecture](../images/bdr-ao-spec.png) + +In this diagram, HARP Manager would exist on BDR Nodes 1-4. The initial state +of the cluster would be that BDR Node 1 is the Lead master of DC A, and BDR +Node 3 is the Lead Master of DC B. + +This would result in any HARP Proxy resource in DC A connecting to BDR Node 1, +and likewise the HARP Proxy resource in DC B connecting to BDR Node 3. + +!!! 
Note + While this diagram only shows a single HARP Proxy per DC, this is merely + illustrative and should not be considered a Single Point of Failure. Any + number of HARP Proxy nodes may exist, and they will all direct application + traffic to the same node. + +### Location configuration + +In order for multiple BDR nodes to be eligible to take the Lead Master lock in +a location, a Location must be defined within the `config.yml` configuration +file. + +To reproduce the diagram above, we would have these lines in the `config.yml` +configuration for BDR Nodes 1 and 2: + +```yaml +location: dca +``` + +And for BDR Nodes 3 and 4: + +```yaml +location: dcb +``` + +This applies to any HARP Proxy nodes which are designated in those respective +data centers as well. + +### BDR 3.7 compatibility + +BDR 3.7 and above offers more direct Location definition by assigning a +Location to the BDR node itself. This is done by calling the following SQL +API function while connected to the BDR node. So for BDR Nodes 1 and 2, we +might do this: + +```sql +SELECT bdr.set_node_location('dca'); +``` + +And for BDR Nodes 3 and 4: + +```sql +SELECT bdr.set_node_location('dcb'); +``` + +Afterwards, future versions of HARP Manager would derive the `location` field +directly from BDR itself. This HARP functionality is not available yet, so we +recommend using this and the setting in `config.yml` until HARP reports +compatibility with this BDR API method. 
+ + + diff --git a/product_docs/docs/pgd/4/rel_notes/epd_4_rel_notes.mdx b/product_docs/docs/pgd/4/rel_notes/epd_4_rel_notes.mdx new file mode 100644 index 00000000000..68d3000fed0 --- /dev/null +++ b/product_docs/docs/pgd/4/rel_notes/epd_4_rel_notes.mdx @@ -0,0 +1,15 @@ +--- +title: "Release notes for EDB Postgres Distributed version 4" +navTitle: "Version 4" +--- + +EDB Postgres Distributed version 4 includes: + +| Feature | Description | +|--------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Faster Postgres major version upgrade | Postgres Distributed is the only Postgres high availability technology that allows fully online major version upgrades. This functionality is improved to allow upgrades in-place, without migrating data between nodes. Even the largest clusters can now be upgraded to the latest major version within minutes. | +| Replication lag control | If you use asynchronous replication, you can configure a replication lag threshold after which the transactions get throttled. This allows you to keep a low RPO without incurring the latency impact on every transaction that comes with synchronous replication. | +| Distributed sequences by default | Instead of setting up custom sequence handling configuration on every node, you get distributed sequences configured automatically with the best kind of sequence for the data type used. The new SnowflakeID algorithm replaces Timeshard, which had constraints that prevented us from using it as a default. 
|
+| Postgres Distributed Command Line Interface (CLI) | You can get information about your Postgres Distributed cluster right from the CLI, including the current state of the replication, consensus, and nodes. You can also query across the nodes of a cluster. Instead of relying on queries, you can use simplified commands in the CLI. |
+| Simplified synchronous replication configuration | Instead of adjusting multiple different Grand Unified Configurations (GUCs) for each node, you can use a SQL level interface to configure how transactions are replicated and what durability and visibility criteria to use, allowing for all nodes to be configured consistently from a single place. This allows (along with Group Commits) more functionality and flexibility around synchronous replication in the future. |
+| Group Commit | You can configure synchronous replication to require quorum within a Postgres Distributed group before committing a transaction, making EDB a more attractive alternative to Oracle DataGuard or Postgres Synchronous Replication. This allows more functionality around synchronous replication in the future, like Eager Majority. |
\ No newline at end of file
diff --git a/product_docs/docs/pgd/4/rel_notes/index.mdx b/product_docs/docs/pgd/4/rel_notes/index.mdx
new file mode 100644
index 00000000000..a51eb7184d9
--- /dev/null
+++ b/product_docs/docs/pgd/4/rel_notes/index.mdx
@@ -0,0 +1,12 @@
+---
+title: "Release notes"
+---
+
+The EDB Postgres Distributed documentation describes the Always On architectures supported in EDB Postgres Distributed 4.
For details about individual product features see: +- [BDR release notes](/bdr/latest/release_notes) +- [HARP release notes](/harp/latest/01_release_notes) + + +| Version | Release Date | +| -------------------- | ------------ | +| [4](epd_4_rel_notes) | 2022 May 17 | diff --git a/product_docs/docs/pgd/4/terminology.mdx b/product_docs/docs/pgd/4/terminology.mdx new file mode 100644 index 00000000000..c568e92c73f --- /dev/null +++ b/product_docs/docs/pgd/4/terminology.mdx @@ -0,0 +1,93 @@ +--- +title: Terminology +--- + +The terminology that follows is important for understanding EDB Postgres Distributed functionality and the requirements that it addresses in the realms of high availability, replication, and clustering. + +#### Asynchronous replication + +Copies data to cluster members after the transaction completes on the origin node. Asynchronous replication can provide higher performance and lower latency than synchronous replication. However, it introduces the potential for conflicts because of multiple concurrent changes. You must manage any conflicts that arise. + +#### Availability + +The probability that a system will operate satisfactorily at a given time when used in a stated environment. For many people, this is the overall amount of uptime versus downtime for an application. (See also **Nines**) + +#### CAMO or commit-at-most-once + +Wraps Eager Replication with additional transaction management at the application level to guard against a transaction being executed more than once. This is critical for high-value transactions found in payments solutions. It is roughly equivalent to the Oracle feature Transaction Guard. + +#### Clustering + +An approach for high availability in which multiple redundant systems are managed to avoid single points of failure. It appears to the end user as one system. + +#### Data sharding + +Enables scaling out a database by breaking up data into chunks called *shards* and distributing them across separate nodes. 
+
+#### Eager Replication for BDR
+
+Conflict-free replication with all cluster members; technically, this is synchronous logical replication using two-phase commit (2PC).
+
+#### Eventual consistency
+
+A distributed computing consistency model stating that changes to the same item in different cluster members will converge to the same value. With BDR this is achieved through asynchronous logical replication with conflict resolution and conflict-free replicated data types.
+
+#### Failover
+
+The automated process that recognizes a failure in a highly available database cluster and takes action to connect the application to another active database. The goal is to minimize downtime and data loss.
+
+#### Horizontal scaling or scale out
+
+A modern distributed computing approach that manages workloads across multiple nodes, such as scaling out a web server to handle increased traffic.
+
+#### Logical replication
+
+Provides more flexibility than physical replication in terms of selecting the data replicated between databases in a cluster. Also important is that cluster members can be on different versions of the database software.
+
+#### Nines
+
+A measure of availability expressed as a percentage of uptime in a given year. Three nines (99.9%) allows for 43.83 minutes of downtime per month. Four nines (99.99%) allows for 4.38 minutes of downtime per month. Five nines (99.999%) allows for 26.3 seconds of downtime per month.
+
+#### Node
+
+One database server in a cluster. The term "node" differs from the term "database server" because there is more than one node in a cluster. A node includes the database server, the OS, and the physical hardware, which is always separate from other nodes in a high-availability context.
+
+#### Physical replication
+
+Copies all changes from a database to one or more standby cluster members by copying an exact copy of database disk blocks. While fast, this method has downsides.
For example, only one master node can run write transactions. Also, you can use this method only where all cluster members are on the same major version of the database software, in addition to several other more complex restrictions.
+
+#### Read scalability
+
+Can be achieved by introducing one or more read replica nodes to a cluster and having the application direct writes to the primary node and reads to the replica nodes. As the read workload grows, you can increase the number of read replica nodes to maintain performance.
+
+#### Recovery point objective (RPO)
+
+The maximum targeted period in which data might be lost due to a disruption in delivery of an application. A very low or minimal RPO is a driver for very high availability.
+
+#### Recovery time objective (RTO)
+
+The targeted length of time for restoring the disrupted application. A very low or minimal RTO is a driver for very high availability.
+
+#### Single point of failure (SPOF)
+
+The identification of a component in a deployed architecture that has no redundancy and therefore prevents you from achieving higher levels of availability.
+
+#### Switchover
+
+A planned change in connection between the application and the active database node in a cluster, typically done for maintenance.
+
+#### Synchronous replication
+
+When changes are updated at all participating nodes at the same time, typically leveraging two-phase commit. While this approach delivers immediate consistency and avoids conflicts, a performance cost in latency occurs due to the coordination required across nodes.
+
+#### Two-phase commit (2PC)
+
+A multi-step process for achieving consistency across multiple database nodes.
+
+#### Vertical scaling or scale up
+
+A traditional computing approach of increasing a resource (CPU, memory, storage, network) to support a given workload until the physical limits of that architecture are reached, e.g., Oracle Exadata.
+
+#### Write scalability
+
+Occurs when replicating the writes from the original node to other cluster members becomes less expensive. In vertical-scaled architectures, write scalability is possible due to shared resources. However, in horizontal scaled (or nothing-shared) architectures, this is possible only in very limited scenarios.
diff --git a/product_docs/docs/pgd/4/upgrades/bdr_pg_upgrade.mdx b/product_docs/docs/pgd/4/upgrades/bdr_pg_upgrade.mdx
new file mode 100644
index 00000000000..3525b48c35e
--- /dev/null
+++ b/product_docs/docs/pgd/4/upgrades/bdr_pg_upgrade.mdx
@@ -0,0 +1,161 @@
+---
+title: In-place Postgres Major Version Upgrades
+---
+
+Upgrading a BDR Node to a newer major version of Postgres is possible using the
+command-line utility `bdr_pg_upgrade`.
+
+`bdr_pg_upgrade` internally uses the standard [`pg_upgrade`](https://www.postgresql.org/docs/current/pgupgrade.html)
+with BDR specific logic to ensure a smooth upgrade.
+
+## Terminology
+
+Various terminology is used in this documentation to describe the upgrade process and components involved.
+
+*old cluster* - The existing PostgreSQL cluster node to be upgraded, which data will be migrated from.
+
+*new cluster* - The new PostgreSQL cluster, which data will be migrated to. This cluster node must be one (1) major version ahead of the old cluster.
+
+## Precautions
+
+Standard Postgres major version upgrade precautions apply, including the fact
+that all the requirements for [`pg_upgrade`](https://www.postgresql.org/docs/current/pgupgrade.html#id-1.9.5.12.7)
+must be met by both clusters.
+
+Additionally, `bdr_pg_upgrade` should not be used if there are other tools using
+replication slots and replication origins; only BDR slots and origins will be
+restored after the upgrade.
+
+There are several prerequisites for `bdr_pg_upgrade` that have to be met:
+
+- Applications using the old cluster have been disconnected; they can, for example,
+  be redirected to another node in the cluster
+- Peer authentication is configured for both clusters, `bdr_pg_upgrade`
+  requires peer authentication
+- BDR versions on both clusters must be exactly the same and must be version
+  4.1.0 or above
+- The new cluster must be in a shutdown state
+- BDR packages must be installed in the new cluster
+- The new cluster must be already initialized and configured as needed to
+  match the old cluster configuration
+- Databases, tables, and other objects must not exist in the new cluster
+
+It is also recommended to have the old cluster up prior to running `bdr_pg_upgrade`
+as the CLI will start the old cluster if it is shut down.
+
+## Usage
+
+To upgrade to a newer major version of PostgreSQL, the new version must first
+be installed.
+
+### bdr_pg_upgrade command-line
+
+`bdr_pg_upgrade` passes all parameters to `pg_upgrade`. Therefore, you can
+specify any parameters supported by [`pg_upgrade`](https://www.postgresql.org/docs/current/pgupgrade.html#id-1.9.5.12.6).
+
+#### Synopsis
+
+```shell
+bdr_pg_upgrade [OPTION] ...
+```
+
+#### Options
+
+In addition to the options for `pg_upgrade`, the following parameters
+can be passed to `bdr_pg_upgrade`:
+
+- `-b, --old-bindir` - old cluster bin directory (required)
+- `-B, --new-bindir` - new cluster bin directory (required)
+- `-d, --old-datadir` - old cluster data directory (required)
+- `-D, --new-datadir` - new cluster data directory (required)
+- `--database` - BDR database name (required)
+- `-p, --old-port` - old cluster port number
+- `-s, --socketdir` - directory to use for postmaster sockets during upgrade
+- `--check` - only perform checks, do not modify clusters
+
+
+#### Environment Variables
+
+Environment variables can be used in place of command line parameters.
+
+- `PGBINOLD` - old cluster bin directory
+- `PGBINNEW` - new cluster bin directory
+- `PGDATAOLD` - old cluster data directory
+- `PGDATANEW` - new cluster data directory
+- `PGPORTOLD` - old cluster port number
+- `PGSOCKETDIR` - directory to use for postmaster sockets during upgrade
+
+
+### Example
+
+Given a scenario where:
+
+- Old cluster bin directory is `/usr/lib/postgresql/13/bin`
+- New cluster bin directory is `/usr/lib/postgresql/14/bin`
+- Old cluster data directory is `/var/lib/postgresql/13/main`
+- New cluster data directory is `/var/lib/postgresql/14/main`
+- Database name is `bdrdb`
+
+
+The following command could be used to upgrade the cluster:
+
+```
+bdr_pg_upgrade \
+--old-bindir /usr/lib/postgresql/13/bin \
+--new-bindir /usr/lib/postgresql/14/bin \
+--old-datadir /var/lib/postgresql/13/main \
+--new-datadir /var/lib/postgresql/14/main \
+--database bdrdb
+```
+
+### Steps Performed
+
+The following steps are performed when running `bdr_pg_upgrade`.
+
+!!! Note
+    When `--check` is supplied as an argument to `bdr_pg_upgrade`, the CLI
+    will `skip` steps that modify the database.
+
+#### BDR Postgres Checks
+
+| Steps                                           | `--check` supplied |
+| :-----------------------------------------------|:------------------:|
+| Collecting pre-upgrade new cluster control data | `run`              |
+| Checking new cluster state is shutdown          | `run`              |
+| Checking BDR versions                           | `run`              |
+| Starting old cluster (if shutdown)              | `skip`             |
+| Connecting to old cluster                       | `skip`             |
+| Checking if bdr schema exists                   | `skip`             |
+| Turning DDL replication off                     | `skip`             |
+| Terminating connections to database
| `skip`             |
+| Disabling connections to database               | `skip`             |
+| Waiting for all slots to be flushed             | `skip`             |
+| Disconnecting from old cluster                  | `skip`             |
+| Stopping old cluster                            | `skip`             |
+| Starting old cluster with BDR disabled          | `skip`             |
+| Connecting to old cluster                       | `skip`             |
+| Collecting replication origins                  | `skip`             |
+| Collecting replication slots                    | `skip`             |
+| Disconnecting from old cluster                  | `skip`             |
+| Stopping old cluster                            | `skip`             |
+
+#### `pg_upgrade` Steps
+
+Standard `pg_upgrade` steps are performed.
+
+!!! Note
+    `--check` is passed to pg_upgrade if supplied
+
+#### BDR Post-Upgrade Steps
+
+| Steps                                           | `--check` supplied |
+| :-----------------------------------------------|:------------------:|
+| Collecting old cluster control data             | `skip`             |
+| Collecting new cluster control data             | `skip`             |
+| Advancing LSN of new cluster                    | `skip`             |
+| Starting new cluster with BDR disabled          | `skip`             |
+| Connecting to new cluster                       | `skip`             |
+| Creating replication origin (repeated for each origin) | `skip`      |
+| Advancing replication origin (repeated for each origin) | `skip`     |
+| Creating replication slot (repeated for each slot) | `skip`          |
+| Stopping new cluster                            | `skip`             |
diff --git a/product_docs/docs/pgd/4/upgrades/index.mdx b/product_docs/docs/pgd/4/upgrades/index.mdx
new file mode 100644
index 00000000000..21ff8083568
--- /dev/null
+++ b/product_docs/docs/pgd/4/upgrades/index.mdx
@@ -0,0 +1,279 @@
+---
+title: "Upgrading"
+---
+
+Because EDB Postgres Distributed consists of multiple software components,
+the upgrade strategy depends partially on which components are being upgraded.
+
+In general it's possible to upgrade the cluster with almost zero downtime, by
+using an approach called Rolling Upgrade where nodes are upgraded one by one, and
+the application connections are switched over to already upgraded nodes.
+
+It's also possible to stop all nodes, perform the upgrade on all nodes and
+only then restart the entire cluster, just like with a standard PostgreSQL setup.
+This strategy of upgrading all nodes at the same time avoids running with
+mixed versions of software and therefore is the simplest, but obviously incurs
+some downtime and is not recommended unless the Rolling Upgrade is not possible
+for some reason.
+
+To upgrade an EDB Postgres Distributed cluster, perform the following steps:
+
+1. Plan the upgrade.
+1. Prepare for the upgrade.
+1. Upgrade the server software.
+1. Check and validate the upgrade.
+
+## Upgrade Planning
+
+There are broadly two ways to upgrade each node.
+
+* Upgrading nodes in-place to the newer software version, see [Rolling Server Software Upgrades](#rolling-server-software-upgrades).
+
+* Replacing nodes with ones that have the newer version installed, see [Rolling Upgrade Using Node Join](#rolling-upgrade-using-node-join).
+
+Both of these approaches can be done in a rolling manner.
+
+### Rolling Upgrade considerations
+
+While the cluster is going through a rolling upgrade, mixed versions of software
+are running in the cluster. For example, nodeA has BDR 3.7.16, while
+nodeB and nodeC have 4.1.0. In this state, the replication and group
+management uses the protocol and features from the oldest version (3.7.16
+in case of this example), so any new features provided by the newer version
+which require changes in the protocol are disabled. Once all nodes are
+upgraded to the same version, the new features are automatically enabled.
+
+Similarly, when a cluster with WAL decoder enabled nodes is going through a
+rolling upgrade, WAL decoder on a higher version of BDR node produces LCRs
+with a higher pglogical version and WAL decoder on a lower version of BDR node
+produces LCRs with lower pglogical version.
As a result, WAL senders on a higher +version of BDR nodes are not expected to use LCRs due to a mismatch in protocol +versions while on a lower version of BDR nodes, WAL senders may continue to use +LCRs. Once all the BDR nodes are on the same BDR version, WAL senders use +LCRs. + +A rolling upgrade starts with a cluster with all nodes at a prior release, +then proceeds by upgrading one node at a time to the newer release, until +all nodes are at the newer release. There should never be more than two versions +of any component running at the same time, which means the new upgrade must not +be initiated until the previous upgrade process has fully finished on all nodes. + +An upgrade process may take an extended period of time when the user decides +caution is required to reduce business risk, though it's not recommended +to run the mixed versions of the software indefinitely. + +While Rolling Upgrade can be used for upgrading major version of the software +it is not supported to mix PostgreSQL, EDB Postgres Extended and +EDB Postgres Advanced Server in one cluster, so this approach cannot +be used to change the Postgres variant. + +!!! Warning + Downgrades of the EDB Postgres Distributed are *not* supported and require + manual rebuild of the cluster. + +### Rolling Server Software Upgrades + +A rolling upgrade is the process where the [Server +Software Upgrade](#server-software-upgrade) process is performed on each node in the +cluster one after another, while keeping the remainder of the cluster +operational. + +The actual procedure depends on whether the Postgres component is being +upgraded to a new major version or not. + +During the upgrade process, the application can be switched over to a node +which is currently not being upgraded to provide continuous availability of +the database for applications. 
+
+### Rolling Upgrade Using Node Join
+
+The other method of upgrading the server software is to join a new node
+to the cluster and later drop one of the existing nodes running
+the older version of the software.
+
+For this approach, the procedure is always the same, however because it
+includes node join, a potentially large data transfer is required.
+
+Care must be taken to not use features that are available only in
+the newer Postgres versions, until all nodes are upgraded to the
+newer and same release of Postgres. This is especially true for any
+new DDL syntax that may have been added to a newer release of Postgres.
+
+!!! Note
+    `bdr_init_physical` makes a byte-by-byte copy of the source node
+    so it cannot be used while upgrading from one major Postgres version
+    to another. In fact, currently `bdr_init_physical` requires that even the
+    BDR version of the source and the joining node is exactly the same.
+    It cannot be used for rolling upgrades via joining a new node method. Instead, a logical join must be used.
+
+### Upgrading a CAMO-Enabled Cluster
+
+CAMO protection requires at least one of the nodes of a CAMO pair to
+be operational. For upgrades, we recommend to ensure that no CAMO
+protected transactions are running concurrent to the upgrade, or to
+use a rolling upgrade strategy, giving the nodes enough time to
+reconcile in between the upgrades and the corresponding node downtime
+due to the upgrade.
+
+## Upgrade Preparation
+
+Each major release of the software contains several changes that may affect
+compatibility with previous releases. These may affect the Postgres
+configuration, deployment scripts, as well as applications using BDR. We
+recommend considering and possibly adjusting these in advance of the upgrade.
+
+Please see individual changes mentioned in [release notes](/pgd/latest/rel_notes/) and any version-specific
+upgrade notes in this topic.
+
+## Server Software Upgrade
+
+The upgrade of EDB Postgres Distributed on individual nodes happens in-place.
+There is no need for backup and restore when upgrading the BDR extension.
+
+### BDR Extension Upgrade
+
+BDR extension upgrade process consists of a few simple steps.
+
+#### Stop Postgres
+
+During the upgrade of binary packages, it's usually best to stop the running
+Postgres server first to ensure that mixed versions don't get loaded in case
+of unexpected restart during the upgrade.
+
+#### Upgrade Packages
+
+The first step in the upgrade is to install the new version of the BDR packages, which
+installs both the new binary and the extension SQL script. This step is operating system-specific.
+
+#### Start Postgres
+
+Once packages are upgraded the Postgres instance can be started; the BDR
+extension is automatically upgraded upon start when the new binaries
+detect older version of the extension.
+
+### Postgres Upgrade
+
+The process of in-place upgrade of Postgres highly depends on whether you are
+upgrading to a new minor version of Postgres or to a new major version of Postgres.
+
+#### Minor Version Postgres Upgrade
+
+Upgrading to a new minor version of Postgres is similar to [upgrading
+the BDR extension](#bdr-extension-upgrade). Stopping Postgres, upgrading packages,
+and starting Postgres again is typically all that's needed.
+
+However, sometimes additional steps like reindexing may be recommended for
+specific minor version upgrades. Refer to the Release Notes of the
+specific version of Postgres you are upgrading to.
+
+#### Major Version Postgres Upgrade
+
+Upgrading to a new major version of Postgres is a more complicated process.
+
+EDB Postgres Distributed provides a `bdr_pg_upgrade` command line utility,
+which can be used to do an [In-place Postgres Major Version Upgrade](bdr_pg_upgrade).
+
+!!!
Note
+    When upgrading to a new major version of any software, including Postgres, the
+    BDR extension and others, it's always important to ensure the compatibility
+    of your application with the target version of a given software.
+
+## Upgrade Check and Validation
+
+After this procedure, your BDR node is upgraded. You can verify the current
+version of BDR4 binary like this:
+
+```sql
+SELECT bdr.bdr_version();
+```
+
+Always check the [monitoring](monitoring) after upgrade of a node to confirm
+that the upgraded node is working as expected.
+
+## Application Schema Upgrades
+
+Similar to the upgrade of BDR itself, there are two approaches to
+upgrading the application schema. The simpler option is to stop all
+applications affected, perform the schema upgrade, and restart the
+application upgraded to use the new schema variant. Again, this
+imposes some downtime.
+
+To eliminate this downtime, BDR offers ways to perform a rolling
+application schema upgrade.
+
+### Rolling Application Schema Upgrades
+
+By default, DDL will automatically be sent to all nodes. This can be
+controlled manually, as described in
+[DDL Replication](/bdr/latest/ddl/), which
+could be used to create differences between database schemas across nodes.
+BDR is designed to allow replication to continue even while minor
+differences exist between nodes. These features are designed to allow
+application schema migration without downtime, or to allow logical
+standby nodes for reporting or testing.
+
+!!! Warning
+    Rolling Application Schema Upgrades have to be managed outside of BDR.
+    Careful scripting is required to make this work correctly
+    on production clusters. Extensive testing is advised.
+
+See [Replicating between nodes with differences](/bdr/latest/appusage) for details.
+
+When one node runs DDL that adds a new table, nodes that have not
+yet received the latest DDL need to handle the extra table.
+In view of this, the appropriate setting for rolling schema upgrades +is to configure all nodes to apply the `skip` resolver in case of a +`target_table_missing` conflict. This must be performed before any +node has additional tables added and is intended to be a permanent +setting. + +This is done with the following query, that must be **executed +separately on each node**, after replacing `node1` with the actual +node name: + +```sql +SELECT bdr.alter_node_set_conflict_resolver('node1', + 'target_table_missing', 'skip'); +``` + +When one node runs DDL that adds a column to a table, nodes that have not +yet received the latest DDL need to handle the extra columns. +In view of this, the appropriate setting for rolling schema +upgrades is to configure all nodes to apply the `ignore` resolver in +case of a `target_column_missing` conflict. This must be performed +before one node has additional columns added and is intended to be a +permanent setting. + +This is done with the following query, that must be **executed +separately on each node**, after replacing `node1` with the actual +node name: + +```sql +SELECT bdr.alter_node_set_conflict_resolver('node1', + 'target_column_missing', 'ignore'); +``` + +When one node runs DDL that removes a column from a table, nodes that +have not yet received the latest DDL need to handle the missing column. +This situation will cause a `source_column_missing` conflict, which uses +the `use_default_value` resolver. Thus, columns that neither +accept NULLs nor have a DEFAULT value require a two step process: + +1. Remove NOT NULL constraint or add a DEFAULT value for a column + on all nodes. +2. Remove the column. + +Constraints can be removed in a rolling manner. +There is currently no supported way for handling adding table +constraints in a rolling manner, one node at a time. 
+
+When one node runs a DDL that changes the type of an existing column,
+depending on the existence of binary coercibility between the current
+type and the target type, the operation may not rewrite the underlying
+table data. In that case, it will be only a metadata update of the
+underlying column type. Rewrite of a table is normally restricted.
+However, in controlled DBA environments, it is possible to change
+the type of a column to an automatically castable one by adopting
+a rolling upgrade for the type of this column in a non-replicated
+environment on all the nodes, one by one. See [ALTER TABLE](/bdr/latest/ddl) for more details.
diff --git a/product_docs/docs/pgd_cli/1/index.mdx b/product_docs/docs/pgd_cli/1/index.mdx
new file mode 100644
index 00000000000..126ad2882a1
--- /dev/null
+++ b/product_docs/docs/pgd_cli/1/index.mdx
@@ -0,0 +1,129 @@
+---
+title: "EDB Postgres Distributed Command Line Interface"
+navTitle: "PGD CLI"
+navigation:
+
+  - rel_notes
+  - pgd
+  - pgd_check-health
+  - pgd_show-camo
+  - pgd_show-clockskew
+  - pgd_show-events
+  - pgd_show-nodes
+  - pgd_show-raft
+  - pgd_show-replslots
+  - pgd_show-subscriptions
+  - pgd_show-version
+directoryDefaults:
+  description: "The PGD Command Line Interface (CLI) is a tool to manage your BDR cluster"
+---
+
+The EDB Postgres Distributed Command Line Interface (PGD CLI) is a tool to manage your BDR cluster. It allows you to run commands against BDR clusters.
+
+You can use it to inspect and manage cluster resources and to get information on the following:
+
+- node summary
+- BDR/Postgres version on each node
+- replication and subscription details
+- membership and worker events
+- raft consensus details
+- CAMO (Commit at Most Once) details
+- clock skew
+- cluster health summary
+
+## Supported version
+
+BDR 4.1 and later.
+ +## Command line help + +To list the supported commands, enter: + +```sh +pgd help +``` + +For help for a specific command and its parameters, enter `pgd help `. For example: + +```sh +pgd help show-nodes +``` + +## Configuration + +You can either use a configuration file to specify the database connection strings for your cluster or pass the connection string directly to a command. + +### Using a configuration file + +Use the `pgd-config.yml` configuration file to specify the database connection string for your cluster. The configuration file should contain the database connection string for at least one BDR node in the cluster. The cluster name is optional and not validated. + +For example: + +```yaml +cluster: + name: cluster-name + endpoints: + - "host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + - "host=bdr-b1 port=5432 dbname=bdrdb user=postgres " + - "host=bdr-c1 port=5432 dbname=bdrdb user=postgres " +``` + +The `pgd-config.yml`, is located in the `/etc/edb` directory, by default. The PGD CLI searches for `pgd-config.yml` in the following locations (precedence order - higher to lower): + + 1. `/etc/edb` (default) + 2. `$HOME/.edb` + 3. `.` (working directory) + +If you rename the file or move it to another location, specify the new name and location using the optional `-f` or `--config-file` flag. For example: + +```sh + pgd show-nodes -f /opt/my-config.yml +``` + +### Passing a database connection string directly to a command + +Use the `--dsn` flag to pass a database connection string directly to a command. You don't need a configuration file if you pass the connection string with this flag. The flag takes precedence if a configuration file is present. For example: + +```sh +pgd show-nodes --dsn "host=bdr-a1 port=5432 dbname=bdrdb user=postgres " +``` + +## Postgres user privileges + +The PGD CLI requires postgres superuser privileges to run. 
+ +## Installation + +### TPAexec + +The easiest way to install and configure the PGD CLI is to use EDB's TPAexec utility for cluster deployment and management. For more information, see the [TPAexec documentation](/pgd/4/deployments/tpaexec). + +TPAexec installs the PGD CLI package (`edb-pgd-cli`) on each BDR node, by default. It also creates the configuration file (`/etc/edb/pgd-config.yml`) prepopulated with the endpoints of each node in the cluster. +If you wish to install it on any non-BDR instance in the cluster, attach the pgdcli role to that instance in TPAexec's configuration file. + +### Manual + +You can manually install the PGD CLI on any Linux machine using `.deb` and `.rpm` packages available from the [BDR repository](https://techsupport.enterprisedb.com/software_subscriptions/add/products/bdr4/). The package name is `edb-pgd-cli`. For example: + +```sh +# for Debian +sudo apt-get install edb-pgd-cli +``` + +When the PGD CLI is configured by TPAexec, it connects automatically. With a manual installation you need to provide a connection string to the BDR cluster. + +## Output format + +The PGD CLI supports the following output formats: + +| Format | Considerations | +| ------- | -------------- | +| tabular | Default format. Presents the data in tabular form.| +| json | Presents the raw data with no formatting. For some commands, the json output may show more data than shown in the tabular output such as extra fields and more detailed messages. | +| yaml | Same as json except field order is alphabetical. Experimental and may not be fully supported in future versions. | +
 +Use the `-o` or `--output` flag to change the default output format to json or yaml. 
For example: + +```sh +pgd show-nodes -o json +``` diff --git a/product_docs/docs/pgd_cli/1/pgd.mdx b/product_docs/docs/pgd_cli/1/pgd.mdx new file mode 100644 index 00000000000..21bf6811bc2 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd.mdx @@ -0,0 +1,23 @@ +--- +title: pgd +--- + +pgd is the command name for the PGD command line interface. + +### Synopsis + +The EDB Postgres Distributed Command Line Interface (PGD CLI) is a tool to +manage your BDR cluster. It allows you to run commands against BDR clusters. +You can use it to inspect and manage cluster resources. + +### Options + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -h, --help help for pgd + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_check-health.mdx b/product_docs/docs/pgd_cli/1/pgd_check-health.mdx new file mode 100644 index 00000000000..72b9f89fb95 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_check-health.mdx @@ -0,0 +1,79 @@ +--- +title: check-health +--- + +Checks the health of the BDR cluster. + +### Synopsis + +Performs various checks such as if all nodes are accessible, all replication +slots are working, and CAMO pairs are connected. 
+ +```sh +pgd check-health [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down; bdr-a1 + and bdr-b1 are CAMO partners) + + $ pgd check-health + + Check Status Message + ----- ------ ------- + CAMO Critical At least 1 CAMO partner is not connected + ClockSkew Critical Clockskew cannot be determined for at least 1 BDR node pair + Connection Critical The node bdr-b1 is not accessible + Raft Warning There is no RAFT_LEADER, an election might be in progress + Replslots Critical There is at least 1 BDR replication slot which is inactive + Version Warning There is at least 1 node that is not accessible + + + Example 2 (3 node cluster, all nodes are up but system clocks are not in sync) + + $ pgd check-health + + Check Status Message + ----- ------ ------- + CAMO Ok All CAMO pairs are connected + ClockSkew Warning At least 1 BDR node pair has clockskew greater than 2 seconds + Connection Ok All BDR nodes are accessible + Raft Ok Raft Consensus is working correctly + Replslots Ok All BDR replication slots are working correctly + Version Ok All nodes are running same BDR versions + + + Example 3 (3 node cluster, all nodes are up and all checks are Ok) + + $ pgd check-health + + Check Status Message + ----- ------ ------- + CAMO Ok All CAMO pairs are connected + ClockSkew Ok All BDR node pairs have clockskew within permissible limit + Connection Ok All BDR nodes are accessible + Raft Ok Raft Consensus is working correctly + Replslots Ok All BDR replication slots are working correctly + Version Ok All nodes are running same BDR versions + +``` + +### Options + +```text + -h, --help help for check-health +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, 
info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_show-camo.mdx b/product_docs/docs/pgd_cli/1/pgd_show-camo.mdx new file mode 100644 index 00000000000..748ca389ad5 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_show-camo.mdx @@ -0,0 +1,68 @@ +--- +title: show-camo +--- + +Shows BDR CAMO (Commit at Most Once) details. + +### Synopsis + +Shows BDR CAMO (Commit at Most Once) details such as the name of the CAMO +partner, connection and readiness status, any pending and unresolved CAMO +transactions, and differences between apply_lsn and receive_lsn. This command +is available only for EDB Postgres Extended and EDB Postgres Advanced Server +(v14 and later). + +```sh +pgd show-camo [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-b1 are CAMO partner but bdr-b1 is down) + + $ pgd show-camo + + Node CAMO Partner Connected Ready Transactions Resolved Apply LSN Receive LSN Apply Queue Size + ---- ------------ --------- ----- --------------------- --------- ----------- ---------------- + bdr-a1 bdr-b1 false false true 0/E42C99B0 0/E42C99B0 0 + + + Example 2 (3 node cluster, bdr-b1 was down and it has just been restarted) + + $ pgd show-camo + + Node CAMO Partner Connected Ready Transactions Resolved Apply LSN Receive LSN Apply Queue Size + ---- ------------ --------- ----- --------------------- --------- ----------- ---------------- + bdr-a1 bdr-b1 true true true 0/E533DAB8 0/E533DAB8 0 + bdr-b1 bdr-a1 true false true 3/7AE81A28 3/7AE81A28 0 + + + Example 3 (3 node cluster, all nodes are up and in 'streaming' state) + + $ pgd show-camo + + Node CAMO Partner Connected Ready Transactions Resolved Apply LSN Receive LSN Apply Queue Size + ---- ------------ --------- ----- --------------------- --------- ----------- ---------------- + bdr-a1 bdr-b1 true true true 0/E56AE520 0/E56AE520 0 + bdr-b1 bdr-a1 true true true 3/7B180BA8 3/7B180BA8 0 + +``` + +### 
Options + +```text + -h, --help help for show-camo +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_show-clockskew.mdx b/product_docs/docs/pgd_cli/1/pgd_show-clockskew.mdx new file mode 100644 index 00000000000..008aba51571 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_show-clockskew.mdx @@ -0,0 +1,66 @@ +--- +title: show-clockskew +--- + +Shows the status of clock skew between each BDR node pair. + +### Synopsis + +Shows the status of clock skew between each BDR node pair in the cluster. + + Symbol Meaning + ------- -------- + * ok + ~ warning (skew > 2 seconds) + ! critical (skew > 5 seconds) + x down / unreachable + ? unknown + - n/a + +```sh +pgd show-clockskew [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-clockskew + + Node bdr-a1 bdr-b1 bdr-c1 Current Time + ---- ------ ------ ------ ------------ + bdr-a1 * ? * 2022-03-30 07:02:21.334472 + bdr-b1 x * x x + bdr-c1 * ? 
* 2022-03-30 07:02:21.186809 + + + Example 2 (3 node cluster, all nodes are up) + + $ pgd show-clockskew + + Node bdr-a1 bdr-b1 bdr-c1 Current Time + ---- ------ ------ ------ ------------ + bdr-a1 * * * 2022-03-30 07:04:54.147017 + bdr-b1 * * * 2022-03-30 07:04:54.340543 + bdr-c1 * * * 2022-03-30 07:04:53.90451 + +``` + +### Options + +```text + -h, --help help for show-clockskew +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_show-events.mdx b/product_docs/docs/pgd_cli/1/pgd_show-events.mdx new file mode 100644 index 00000000000..19251c3a194 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_show-events.mdx @@ -0,0 +1,60 @@ +--- +title: show-events +--- + +Shows events such as background worker errors and node membership changes. + +### Synopsis + +Shows events such as background worker errors and node membership changes. +Output is sorted by Time column in descending order. Message column is +truncated after a few lines. To view complete message use json output format +('-o json'). + +For more details on each node state, see show-nodes command help +('pgd show-nodes -h'). 
+ +```sh +pgd show-events [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster) + + $ pgd show-events --lines 10 + + Time Observer Node Subject Node Type Message + ---- ------------- ------------ ---- ------- + 2022-04-19 19:45:43.077712+00 bdr-b1 bdr-c1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.066804+00 bdr-c1 bdr-a1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.057598+00 bdr-b1 bdr-a1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.046515+00 bdr-c1 bdr-b1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.033369+00 bdr-a1 bdr-c1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:43.013203+00 bdr-a1 bdr-b1 receiver worker error pglogical worker received fast finish request, exiting + 2022-04-19 19:45:40.024662+00 bdr-c1 bdr-c1 node state change ACTIVE + 2022-04-19 19:45:40.024575+00 bdr-b1 bdr-c1 node state change ACTIVE + 2022-04-19 19:45:40.022788+00 bdr-a1 bdr-c1 node state change ACTIVE + 2022-04-19 19:45:38.961424+00 bdr-c1 bdr-c1 node state change PROMOTING + +``` + +### Options + +```text + -h, --help help for show-events + -n, --lines int show top n lines +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_show-nodes.mdx b/product_docs/docs/pgd_cli/1/pgd_show-nodes.mdx new file mode 100644 index 00000000000..fc1dabd0c29 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_show-nodes.mdx @@ -0,0 
+1,110 @@ +--- +title: show-nodes +--- + +Shows all nodes in the BDR cluster and their summary. + +### Synopsis + +Shows all nodes in the BDR cluster and their summary such as name, node id, +group, and current/target state etc. + +Node States + +* NONE: Node state is unset when the worker starts, expected to be set +quickly to the current known state. +* CREATED: bdr.create_node() has been executed, but the node is not a +member of any BDR cluster yet. +* JOIN_START: bdr.join_node_group() begins to join the local node to an +existing BDR cluster. +* JOINING: The node join has started and is currently at the initial sync +phase, creating the schema and data on the node. +* CATCHUP: Initial sync phase is completed; now the join is at the last step +of retrieving and applying transactions that were performed on the upstream +peer node since the join started. +* STANDBY: Node join has finished, but not yet started to broadcast changes. +All joins spend some time in this state, but if defined as a Logical +Standby, the node will continue in this state. +* PROMOTE: Node was a logical standby and we just called bdr.promote_node to +move the node state to ACTIVE. These two PROMOTE states have to be +coherent to the fact, that only one node can be with a state higher than +STANDBY but lower than ACTIVE. +* PROMOTING: Promotion from logical standby to full BDR node is in progress. +* ACTIVE: The node is a full BDR node and is currently ACTIVE. This is the +most common node status. +* PART_START: Node was ACTIVE or STANDBY and we just called bdr.part_node +to remove the node from the BDR cluster. +* PARTING: Node disconnects from other nodes and plays no further part in +consensus or replication. +* PART_CATCHUP: Non-parting nodes synchronize any missing data from the +recently parted node. +* PARTED: Node parting operation is now complete on all nodes. + +Only one node at a time can be in either of the states PROMOTE or PROMOTING. 
+ +Note that the read-only state of a node, as shown in the Current State or in +the Target State columns, is indicated as STANDBY. + +```sh +pgd show-nodes [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-nodes + + Node Node ID Node Group Current State Target State Status Seq ID + ---- ------- ---------- ------------- ------------ ------ ------ + bdr-a1 3136956818 bdrgroup ACTIVE ACTIVE Up 1 + bdr-b1 2380210996 bdrgroup ACTIVE ACTIVE Unreachable 2 + bdr-c1 1804769977 bdrgroup ACTIVE ACTIVE Up 3 + + + Example 2 (3 node cluster, all nodes are up) + + $ pgd show-nodes + + Node Node ID Node Group Current State Target State Status Seq ID + ---- ------- ---------- ------------- ------------ ------ ------ + bdr-a1 3136956818 bdrgroup ACTIVE ACTIVE Up 1 + bdr-b1 2380210996 bdrgroup ACTIVE ACTIVE Up 2 + bdr-c1 1804769977 bdrgroup ACTIVE ACTIVE Up 3 + + + Example 3 (cluster with witness, logical standby and subscriber-only nodes) + Note: In contrast to logical standby, the subscriber-only nodes are fully + joined node to the cluster + + $ pgd show-nodes + + Node Node ID Node Group Current State Target State Status Seq ID + ---- ------- ---------- ------------- ------------ ------ ------ + bdr-a1 3136956818 bdrgroup ACTIVE ACTIVE Up 1 + bdr-b1 2380210996 bdrgroup ACTIVE ACTIVE Up 2 + witness-c1 1450365472 bdrgroup ACTIVE ACTIVE Up 3 + logical-standby-a1 1140256918 bdrgroup STANDBY STANDBY Up 4 + logical-standby-b1 3541792022 bdrgroup STANDBY STANDBY Up 5 + subscriber-only-c1 2448841809 subscriber-only ACTIVE ACTIVE Up 6 + +``` + +### Options + +```text + -h, --help help for show-nodes +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: 
debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_show-raft.mdx b/product_docs/docs/pgd_cli/1/pgd_show-raft.mdx new file mode 100644 index 00000000000..90bd525a064 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_show-raft.mdx @@ -0,0 +1,89 @@ +--- +title: show-raft +--- + +Shows BDR Raft (consensus protocol) details. + +### Synopsis + +Shows BDR Raft (consensus protocol) details such as Raft state (leader, +follower), Raft election id, and number of voting nodes. + +Note: In some cases such as network partition, output may vary based on the +node to which the CLI is connected. + +```sh +pgd show-raft [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-raft + + Node Raft State Raft Term Commit Index Nodes Voting Nodes Protocol Version + ---- ---------- --------- ------------ ----- ------------ ---------------- + bdr-c1 RAFT_FOLLOWER 29 6081272 3 3 4002 + bdr-a1 RAFT_LEADER 29 6081272 3 3 4002 + bdr-b1 + + + Example 2 (3 node cluster, all nodes are up) + + $ pgd show-raft + + Node Raft State Raft Term Commit Index Nodes Voting Nodes Protocol Version + ---- ---------- --------- ------------ ----- ------------ ---------------- + bdr-c1 RAFT_FOLLOWER 38 6132327 3 3 4002 + bdr-a1 RAFT_LEADER 38 6132331 3 3 4002 + bdr-b1 RAFT_FOLLOWER 38 6132336 3 3 4002 + + + Example 3 (3 node cluster, all nodes are up but bdr-a1 is not able to connect + to other nodes; following is the output when cli is connected to bdr-a1) + + $ pgd show-raft + + Node Raft State Raft Term Commit Index Nodes Voting Nodes Protocol Version + ---- ---------- --------- ------------ ----- ------------ ---------------- + bdr-c1 + bdr-a1 RAFT_FOLLOWER 40 6176769 3 3 4002 + bdr-b1 + + + Example 4 (cluster with witness, logical standby and subscriber-only nodes) + Note: Unlike full-bdr (or witness node), logical standby and subscriber-only + 
nodes don't have raft voting rights. + + $ pgd show-raft + + Node Raft State Raft Term Commit Index Nodes Voting Nodes Protocol Version + ---- ---------- --------- ------------ ----- ------------ ---------------- + bdr-a1 RAFT_LEADER 0 10268 6 3 4003 + bdr-b1 RAFT_FOLLOWER 0 10279 6 3 4003 + witness-c1 RAFT_FOLLOWER 0 10281 6 3 4003 + logical-standby-a1 RAFT_FOLLOWER 0 10281 6 3 4003 + logical-standby-b1 RAFT_FOLLOWER 0 10281 6 3 4003 + subscriber-only-c1 RAFT_FOLLOWER 0 10281 6 3 4003 + +``` + +### Options + +```text + -h, --help help for show-raft +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_show-replslots.mdx b/product_docs/docs/pgd_cli/1/pgd_show-replslots.mdx new file mode 100644 index 00000000000..e347ba62c50 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_show-replslots.mdx @@ -0,0 +1,136 @@ +--- +title: show-replslots +--- + +Shows the status of BDR replication slots. + +### Synopsis + +Shows the status of BDR replication slots. Output with the verbose flag gives +details such as is slot active, replication state (disconnected, streaming, +catchup), and approximate lag. + + Symbol Meaning + ------- -------- + * ok + ~ warning (lag > 10M) + ! critical (lag > 100M OR slot is 'inactive' OR 'disconnected') + x down / unreachable + - n/a + + In matrix view, sometimes byte lag is shown in parentheses. It is a + maxOf(WriteLag, FlushLag, ReplayLag, SentLag). 
+ +```sh +pgd show-replslots [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-replslots + + Node bdr-a1 bdr-b1 bdr-c1 + ---- ------ ------ ------ + bdr-a1 * !(6.6G) * + bdr-b1 x * x + bdr-c1 * !(6.9G) * + + + $ pgd show-replslots --verbose + + Origin Node Target Node Status (active/state) Write Lag (bytes/duration) Flush Lag (bytes/duration) Replay Lag (bytes/duration) Sent Lag (bytes) + ----------- ----------- --------------------- -------------------------- -------------------------- --------------------------- ---------------- + bdr-a1 bdr-b1 f / disconnected 6.6G / 8 days 02:58:36.243723 6.6G / 8 days 02:58:36.243723 6.6G / 8 days 02:58:36.243723 6.6G + bdr-a1 bdr-c1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-c1 bdr-a1 t / streaming 0B / 00:00:00.000812 0B / 00:00:00.000812 0B / 00:00:00.000812 0B + bdr-c1 bdr-b1 f / disconnected 6.9G / 8 days 02:58:36.004415 6.9G / 8 days 02:58:36.004415 6.9G / 8 days 02:58:36.004415 6.9G + + + Example 2 (3 node cluster, bdr-b1 was down and it has just been restarted) + + $ pgd show-replslots + + Node bdr-a1 bdr-b1 bdr-c1 + ---- ------ ------ ------ + bdr-a1 * !(6.9G) * + bdr-b1 * * * + bdr-c1 * !(5.8G) * + + + $ pgd show-replslots --verbose + + Origin Node Target Node Status (active/state) Write Lag (bytes/duration) Flush Lag (bytes/duration) Replay Lag (bytes/duration) Sent Lag (bytes) + ----------- ----------- --------------------- -------------------------- -------------------------- --------------------------- ---------------- + bdr-a1 bdr-b1 t / catchup 6.9G / 00:00:00.000778 6.9G / 00:00:00.000778 6.9G / 00:00:00.000778 6.9G + bdr-a1 bdr-c1 t / streaming 0B / 00:00:00.104121 0B / 00:00:00.104133 0B / 00:00:00.104133 0B + bdr-b1 bdr-a1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-b1 bdr-c1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-c1 bdr-a1 t / streaming 6.8K / 00:00:00 6.8K / 
00:00:00 6.8K / 00:00:00 6.8K + bdr-c1 bdr-b1 t / catchup 5.5G / 00:00:00.008257 5.5G / 00:00:00.008257 5.5G / 00:00:00.008257 5.5G + + + Example 3 (3 node cluster, all nodes are up and in 'streaming' state) + + $ pgd show-replslots + + Node bdr-a1 bdr-b1 bdr-c1 + ---- ------ ------ ------ + bdr-a1 * * * + bdr-b1 * * * + bdr-c1 * * * + + + $ pgd show-replslots --verbose + + Origin Node Target Node Status (active/state) Write Lag (bytes/duration) Flush Lag (bytes/duration) Replay Lag (bytes/duration) Sent Lag (bytes) + ----------- ----------- --------------------- -------------------------- -------------------------- --------------------------- ---------------- + bdr-a1 bdr-b1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-a1 bdr-c1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-b1 bdr-a1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-b1 bdr-c1 t / streaming 0B / 00:00:00 0B / 00:00:00 0B / 00:00:00 0B + bdr-c1 bdr-a1 t / streaming 0B / 00:00:00 528B / 00:00:00 528B / 00:00:00 0B + bdr-c1 bdr-b1 t / streaming 528B / 00:00:00 528B / 00:00:00 528B / 00:00:00 0B + + + Example 4 (cluster with witness, logical standby and subscriber-only nodes; + upstream for logical-standby-a1 is bdr-a1 and for logical-standby-b1 it is bdr-b1) + Note: + 1. A logical standby is sent data only by one source node, but no other + nodes receive replication changes from it + 2. 
Subscriber-only node subscribes to replication changes from other nodes + in the cluster, but no other nodes receive replication changes from it + + $ pgd show-replslots + + Node bdr-a1 bdr-b1 logical-standby-a1 logical-standby-b1 subscriber-only-c1 witness-c1 + ---- ------ ------ ------------------ ------------------ ------------------ ---------- + bdr-a1 * * * - * * + bdr-b1 * * - * * * + logical-standby-a1 - - * - - - + logical-standby-b1 - - - * - - + subscriber-only-c1 - - - - * - + witness-c1 * * - - * * + +``` + +### Options + +```text + -h, --help help for show-replslots + -v, --verbose verbose output (default true) +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_show-subscriptions.mdx b/product_docs/docs/pgd_cli/1/pgd_show-subscriptions.mdx new file mode 100644 index 00000000000..41edd35c983 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_show-subscriptions.mdx @@ -0,0 +1,103 @@ +--- +title: show-subscriptions +--- + +Shows BDR subscription (incoming replication) details. + +### Synopsis + +Shows BDR subscription (incoming replication) details such as origin/target +node, timestamp of the last replayed transaction, and lag between now and the +timestamp of the last replayed transaction. 
+ +```sh +pgd show-subscriptions [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-subscriptions + + Origin Node Target Node Last Transaction Replayed At Lag Duration (seconds) + ----------- ----------- ---------------------------- ---------------------- + bdr-a1 bdr-c1 2022-04-23 13:13:40.854433+00 0.514275 + bdr-b1 bdr-a1 + bdr-b1 bdr-c1 + bdr-c1 bdr-a1 2022-04-23 13:13:40.852233+00 0.335464 + + + Example 2 (3 node cluster, bdr-b1 was down and it has just been restarted) + + $ pgd show-subscriptions + + Origin Node Target Node Last Transaction Replayed At Lag Duration (seconds) + ----------- ----------- ---------------------------- ---------------------- + bdr-a1 bdr-b1 2022-04-23 13:14:45.669254+00 0.001686 + bdr-a1 bdr-c1 2022-04-23 13:14:46.157913+00 -0.002009 + bdr-b1 bdr-a1 + bdr-b1 bdr-c1 + bdr-c1 bdr-a1 2022-04-23 13:14:45.698472+00 0.259521 + bdr-c1 bdr-b1 2022-04-23 13:14:45.667979+00 0.002961 + + + Example 3 (3 node cluster, all nodes are up and in 'streaming' state) + + $ pgd show-subscriptions + + Origin Node Target Node Last Transaction Replayed At Lag Duration (seconds) + ----------- ----------- ---------------------------- ---------------------- + bdr-a1 bdr-b1 2022-04-23 13:15:39.732375+00 0.034462 + bdr-a1 bdr-c1 2022-04-23 13:15:40.179618+00 0.002647 + bdr-b1 bdr-a1 2022-04-23 13:15:39.719994+00 0.305814 + bdr-b1 bdr-c1 2022-04-23 13:15:40.180886+00 0.001379 + bdr-c1 bdr-a1 2022-04-23 13:15:39.714397+00 0.311411 + bdr-c1 bdr-b1 2022-04-23 13:15:39.714397+00 0.052440 + + + Example 4 (cluster with witness, logical standby and subscriber-only nodes; + upstream for logical-standby-a1 is bdr-a1 and for logical-standby-b1 it is bdr-b1) + Note: Logical standby and subscriber-only nodes receive changes but do not + send changes made locally to other nodes + + $ pgd show-subscriptions + + Origin Node Target Node Last Transaction Replayed At Lag Duration (seconds) + ----------- 
----------- ---------------------------- ---------------------- + bdr-a1 bdr-b1 2022-04-23 13:40:49.106411+00 0.853665 + bdr-a1 logical-standby-a1 2022-04-23 13:40:50.72036+00 0.138430 + bdr-a1 logical-standby-b1 + bdr-a1 subscriber-only-c1 2022-04-23 13:40:50.72036+00 0.016226 + bdr-a1 witness-c1 2022-04-23 13:40:50.470142+00 0.001514 + bdr-b1 bdr-a1 2022-04-23 13:40:49.10174+00 1.095422 + bdr-b1 logical-standby-a1 + bdr-b1 logical-standby-b1 2022-04-23 13:40:50.713666+00 0.271213 + bdr-b1 subscriber-only-c1 2022-04-23 13:40:50.713666+00 0.022920 + bdr-b1 witness-c1 2022-04-23 13:40:50.471789+00 -0.000133 + witness-c1 bdr-a1 2022-04-23 13:40:49.107706+00 1.089456 + witness-c1 bdr-b1 2022-04-23 13:40:49.107706+00 0.852370 + witness-c1 logical-standby-a1 + witness-c1 logical-standby-b1 + witness-c1 subscriber-only-c1 2022-04-23 13:40:50.719844+00 0.016742 + +``` + +### Options + +```text + -h, --help help for show-subscriptions +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/pgd_show-version.mdx b/product_docs/docs/pgd_cli/1/pgd_show-version.mdx new file mode 100644 index 00000000000..75e2a99d956 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/pgd_show-version.mdx @@ -0,0 +1,57 @@ +--- +title: show-version +--- + +Shows the version of BDR and Postgres installed on each node. + +### Synopsis + +Shows the version of BDR and Postgres installed on each node in the cluster. 
+ +```sh +pgd show-version [flags] +``` + +### Examples + +```text + + Example 1 (3 node cluster, bdr-a1 and bdr-c1 are up, bdr-b1 is down) + + $ pgd show-version + + Node BDR Version Postgres Version + ---- ----------- ---------------- + bdr-c1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + bdr-a1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + bdr-b1 + + + Example 2 (3 node cluster, all nodes are up) + + $ pgd show-version + + Node BDR Version Postgres Version + ---- ----------- ---------------- + bdr-c1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + bdr-a1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + bdr-b1 4.1.0 14.2 (EDB Postgres Extended Server 14.2.0) (Debian 2:14.2.0edbpge-1.buster+1) + +``` + +### Options + +```text + -h, --help help for show-version +``` + +### Options inherited from parent commands + +```text + -f, --config-file string config file; ignored if + --dsn flag is present (default "/etc/edb/pgd-config.yml") + --dsn string database connection string + e.g."host=bdr-a1 port=5432 dbname=bdrdb user=postgres " + -L, --log-level string logging level: debug, info, warn, error (default "error") + -o, --output string output format: json, yaml +``` diff --git a/product_docs/docs/pgd_cli/1/rel_notes/index.mdx b/product_docs/docs/pgd_cli/1/rel_notes/index.mdx new file mode 100644 index 00000000000..2507dbdfe0b --- /dev/null +++ b/product_docs/docs/pgd_cli/1/rel_notes/index.mdx @@ -0,0 +1,13 @@ +--- +title: "PGD CLI Release notes" +navTitle: Release notes +--- + +PGD CLI is a tool to manage your BDR cluster. It allows you to run commands against BDR clusters. + +The PGD CLI documentation describes the latest version of PGD CLI including minor releases and patches. The release notes in this section provide information on what was new in each release. 
+ + +| Version | Release Date | +| ----------------------- | ------------ | +| [1](pgd_cli_1_rel_notes) | 2022 May 17 | diff --git a/product_docs/docs/pgd_cli/1/rel_notes/pgd_cli_1_rel_notes.mdx b/product_docs/docs/pgd_cli/1/rel_notes/pgd_cli_1_rel_notes.mdx new file mode 100644 index 00000000000..ea3f53a5755 --- /dev/null +++ b/product_docs/docs/pgd_cli/1/rel_notes/pgd_cli_1_rel_notes.mdx @@ -0,0 +1,20 @@ +--- +title: "Release notes for PGD CLI version 1" +navTitle: "Version 1" +--- + +This is an initial release of EDB Postgres Distributed (PGD) Command Line Interface (CLI). PGD CLI version 1 includes: + +| Type | Description | +| ------- | ----------------------------------------------------------------------------------------------- | +| Feature | Initial release of PGD CLI. Includes the following commands: | +| | `pgd check-health` - Checks the health of the BDR cluster. | +| | `pgd show-camo` - Shows BDR CAMO (Commit at Most Once) details. | +| | `pgd show-clockskew` - Shows the status of clock skew between each BDR node pair. | +| | `pgd show-events` - Shows events such as background worker errors and node membership changes. | +| | `pgd show-nodes` - Shows all nodes in the BDR cluster and their summary. | +| | `pgd show-raft` - Shows BDR Raft (consensus protocol) details. | +| | `pgd show-replslots` - Shows the status of BDR replication slots. | +| | `pgd show-subscriptions` - Shows BDR subscription (incoming replication) details. | +| | `pgd show-version` - Shows the version of BDR and Postgres installed on each node. 
| + \ No newline at end of file diff --git a/src/constants/products.js b/src/constants/products.js index e9f3b7651eb..b5f3390eea0 100644 --- a/src/constants/products.js +++ b/src/constants/products.js @@ -14,6 +14,7 @@ export const products = { bart: { name: "Backup and Recovery Tool", iconName: IconNames.EDB_BART }, biganimal: { name: "BigAnimal", iconName: IconNames.BIGANIMAL }, efm: { name: "Failover Manager", iconName: IconNames.EDB_EFM }, + epd: { name: "EDB Postgres Distributed", iconName: IconNames.EDB_EPAS }, eprs: { name: "EDB Replication Server", iconName: IconNames.EDB_EPAS }, postgres_for_kubernetes: { name: "EDB Postgres for Kubernetes", diff --git a/src/pages/index.js b/src/pages/index.js index 5fa835b9225..e18e2c204d1 100644 --- a/src/pages/index.js +++ b/src/pages/index.js @@ -203,14 +203,24 @@ const Page = () => ( - - Replication - - + + EDB Postgres Distributed + + BDR (Bi-Directional Replication) + + HA Routing for Postgres (HARP) + + + EDB Postgres Distributed CLI + + + + Replication + Replication Server pglogical Slony @@ -218,9 +228,7 @@ const Page = () => ( Cluster Management - - High Availability Routing for Postgres (HARP) - + Failover Manager Replication Manager (repmgr) diff --git a/static/_redirects b/static/_redirects index 3087944d757..cc32e262ac9 100644 --- a/static/_redirects +++ b/static/_redirects @@ -30,6 +30,9 @@ /docs/pgpool/1.0/* /docs/pgpool/latest/:splat 301 /docs/postgis/1.0/* /docs/postgis/latest/:splat 301 /docs/slony/1.0/* /docs/slony/latest/:splat 301 +/docs/bdr/4.0/* /docs/bdr/latest/:splat 301 +/docs/harp/2.0/* /docs/harp/latest/:splat 301 +/docs/harp/1.0/* /docs/harp/latest/:splat 301 # BART /docs/bart/2.4/* /docs/bart/latest/ 301