From e36b9122e635db22c186f2a92f8829a766f6498a Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Tue, 2 Jul 2024 16:35:04 -0500 Subject: [PATCH 01/16] lxd/cluster: Replace deprecated Node.Recover ReconfigureMembershipExt takes into consideration a node's dqlite role. Signed-off-by: Wesley Hershberger --- lxd/cluster/recover.go | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go index 5e83351ecce9..0cdd3db09772 100644 --- a/lxd/cluster/recover.go +++ b/lxd/cluster/recover.go @@ -38,7 +38,9 @@ func ListDatabaseNodes(database *db.Node) ([]string, error) { return addresses, nil } -// Recover attempts data recovery on the cluster database. +// Recover rebuilds the dqlite raft configuration leaving only the current +// member in the cluster. Use `Reconfigure` if more members should remain in +// the raft configuration. func Recover(database *db.Node) error { // Figure out if we actually act as dqlite node. var info *db.RaftNode @@ -63,20 +65,16 @@ func Recover(database *db.Node) error { } dir := filepath.Join(database.Dir(), "global") - server, err := dqlite.New( - uint64(info.ID), - info.Address, - dir, - ) - if err != nil { - return fmt.Errorf("Failed to create dqlite server: %w", err) - } cluster := []dqlite.NodeInfo{ - {ID: uint64(info.ID), Address: info.Address}, + { + ID: uint64(info.ID), + Address: info.Address, + Role: client.Voter, + }, } - err = server.Recover(cluster) + err = dqlite.ReconfigureMembershipExt(dir, cluster) if err != nil { return fmt.Errorf("Failed to recover database state: %w", err) } From d29d18298af5269dcb8b786a2408cf0d33c8203e Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Tue, 2 Jul 2024 17:22:43 -0500 Subject: [PATCH 02/16] lxd/db: Add yaml tag to RaftNode ...for cluster recovery Signed-off-by: Wesley Hershberger --- lxd/db/raft.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lxd/db/raft.go b/lxd/db/raft.go index 0ff8598143b9..44cb192216ab 100644 --- a/lxd/db/raft.go +++ b/lxd/db/raft.go @@ -19,7 +19,7 @@ import ( // dqlite client package. type RaftNode struct { client.NodeInfo - Name string + Name string `yaml:"name"` } // RaftRole captures the role of dqlite/raft node. From 34b45e3529201dfea0adc829284e1f1cda2fb8d9 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Fri, 12 Jul 2024 16:55:53 -0500 Subject: [PATCH 03/16] lxd/cluster/recover: Refactor DetermineRaftNode transaction Signed-off-by: Wesley Hershberger --- lxd/cluster/recover.go | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go index 0cdd3db09772..d793241c7b49 100644 --- a/lxd/cluster/recover.go +++ b/lxd/cluster/recover.go @@ -38,24 +38,36 @@ func ListDatabaseNodes(database *db.Node) ([]string, error) { return addresses, nil } -// Recover rebuilds the dqlite raft configuration leaving only the current -// member in the cluster. Use `Reconfigure` if more members should remain in -// the raft configuration. -func Recover(database *db.Node) error { - // Figure out if we actually act as dqlite node. +// Return the entry in the raft_nodes table that corresponds to the local +// `core.https_address`. +// Returns err if no raft_node exists for the local node. 
+func localRaftNode(database *db.Node) (*db.RaftNode, error) { var info *db.RaftNode err := database.Transaction(context.TODO(), func(ctx context.Context, tx *db.NodeTx) error { var err error info, err = node.DetermineRaftNode(ctx, tx) + return err }) if err != nil { - return fmt.Errorf("Failed to determine node role: %w", err) + return nil, fmt.Errorf("Failed to determine cluster member raft role: %w", err) } // If we're not a database node, return an error. if info == nil { - return fmt.Errorf("This LXD instance has no database role") + return nil, fmt.Errorf("This cluster member has no raft role") + } + + return info, nil +} + +// Recover rebuilds the dqlite raft configuration leaving only the current +// member in the cluster. Use `Reconfigure` if more members should remain in +// the raft configuration. +func Recover(database *db.Node) error { + info, err := localRaftNode(database) + if err != nil { + return err } // If this is a standalone node not exposed to the network, return an @@ -127,19 +139,9 @@ func updateLocalAddress(database *db.Node, address string) error { // Reconfigure replaces the entire cluster configuration. // Addresses and node roles may be updated. Node IDs are read-only. func Reconfigure(database *db.Node, raftNodes []db.RaftNode) error { - var info *db.RaftNode - err := database.Transaction(context.TODO(), func(ctx context.Context, tx *db.NodeTx) error { - var err error - info, err = node.DetermineRaftNode(ctx, tx) - - return err - }) + info, err := localRaftNode(database) if err != nil { - return fmt.Errorf("Failed to determine cluster member raft role: %w", err) - } - - if info == nil { - return fmt.Errorf("This cluster member has no raft role") + return err } localAddress := info.Address From d52a788d33c5b425a03ed8ffa1c784a760fe137b Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Fri, 12 Jul 2024 17:00:31 -0500 Subject: [PATCH 04/16] lxd/cluster: Refactor recovery patch.global.sql Signed-off-by: Wesley Hershberger --- lxd/cluster/recover.go | 60 +++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go index d793241c7b49..32667c52214a 100644 --- a/lxd/cluster/recover.go +++ b/lxd/cluster/recover.go @@ -136,6 +136,39 @@ func updateLocalAddress(database *db.Node, address string) error { return nil } +// Create a patch file for the nodes table in the global database; this updates +// the addresses of all cluster members from the list of RaftNode in case they +// were changed during cluster recovery. +func writeGlobalNodesPatch(database *db.Node, nodes []db.RaftNode) error { + // No patch needed if there are no nodes + if len(nodes) < 1 { + return nil + } + + reverter := revert.New() + defer reverter.Fail() + + filePath := filepath.Join(database.Dir(), "patch.global.sql") + file, err := os.OpenFile(filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + return err + } + + defer func() { _ = file.Close() }() + reverter.Add(func() { _ = os.Remove(filePath) }) + + for _, node := range nodes { + _, err = fmt.Fprintf(file, "UPDATE nodes SET address = %q WHERE id = %d;\n", node.Address, node.ID) + if err != nil { + return err + } + } + + reverter.Success() + + return nil +} + // Reconfigure replaces the entire cluster configuration. // Addresses and node roles may be updated. Node IDs are read-only. 
func Reconfigure(database *db.Node, raftNodes []db.RaftNode) error { @@ -179,30 +212,9 @@ func Reconfigure(database *db.Node, raftNodes []db.RaftNode) error { return err } - // Create patch file for global nodes database. - content := "" - for _, node := range nodes { - content += fmt.Sprintf("UPDATE nodes SET address = %q WHERE id = %d;\n", node.Address, node.ID) - } - - if len(content) > 0 { - filePath := filepath.Join(database.Dir(), "patch.global.sql") - file, err := os.OpenFile(filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) - if err != nil { - return err - } - - defer func() { _ = file.Close() }() - - _, err = file.Write([]byte(content)) - if err != nil { - return err - } - - err = file.Close() - if err != nil { - return err - } + err = writeGlobalNodesPatch(database, raftNodes) + if err != nil { + return fmt.Errorf("Failed to create global db patch for cluster recover: %w", err) } return nil From 2525886620a8d4bb6c3409ea85b98631a5480050 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Tue, 2 Jul 2024 17:28:25 -0500 Subject: [PATCH 05/16] lxd/cluster: Write recovery tarball after cluster edit Signed-off-by: Wesley Hershberger --- lxd/cluster/recover.go | 101 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 7 deletions(-) diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go index 32667c52214a..88629d51d64b 100644 --- a/lxd/cluster/recover.go +++ b/lxd/cluster/recover.go @@ -1,6 +1,8 @@ package cluster import ( + "archive/tar" + "compress/gzip" "context" "fmt" "os" @@ -9,11 +11,16 @@ import ( dqlite "github.com/canonical/go-dqlite" "github.com/canonical/go-dqlite/client" + "gopkg.in/yaml.v2" "github.com/canonical/lxd/lxd/db" "github.com/canonical/lxd/lxd/node" + "github.com/canonical/lxd/shared/revert" ) +// RecoveryTarballName is the filename used for recovery tarballs. +const RecoveryTarballName = "lxd_recovery_db.tar.gz" + // ListDatabaseNodes returns a list of database node names. func ListDatabaseNodes(database *db.Node) ([]string, error) { nodes := []db.RaftNode{} @@ -171,10 +178,11 @@ func writeGlobalNodesPatch(database *db.Node, nodes []db.RaftNode) error { // Reconfigure replaces the entire cluster configuration. // Addresses and node roles may be updated. Node IDs are read-only. -func Reconfigure(database *db.Node, raftNodes []db.RaftNode) error { +// Returns the path to the new database state (recovery tarball). +func Reconfigure(database *db.Node, raftNodes []db.RaftNode) (string, error) { info, err := localRaftNode(database) if err != nil { - return err + return "", err } localAddress := info.Address @@ -193,7 +201,7 @@ func Reconfigure(database *db.Node, raftNodes []db.RaftNode) error { if localAddress != info.Address { err := updateLocalAddress(database, localAddress) if err != nil { - return err + return "", err } } @@ -201,7 +209,7 @@ func Reconfigure(database *db.Node, raftNodes []db.RaftNode) error { // Replace cluster configuration in dqlite. err = dqlite.ReconfigureMembershipExt(dir, nodes) if err != nil { - return fmt.Errorf("Failed to recover database state: %w", err) + return "", fmt.Errorf("Failed to recover database state: %w", err) } // Replace cluster configuration in local raft_nodes database. 
@@ -209,15 +217,94 @@ func Reconfigure(database *db.Node, raftNodes []db.RaftNode) error { return tx.ReplaceRaftNodes(raftNodes) }) if err != nil { - return err + return "", err + } + + tarballPath, err := writeRecoveryTarball(database.Dir(), raftNodes) + if err != nil { + return "", fmt.Errorf("Failed to create recovery tarball: copy db manually; %w", err) } err = writeGlobalNodesPatch(database, raftNodes) if err != nil { - return fmt.Errorf("Failed to create global db patch for cluster recover: %w", err) + return "", fmt.Errorf("Failed to create global db patch for cluster recover: %w", err) } - return nil + return tarballPath, nil +} + +// Create a tarball of the global database dir to be copied to all other +// remaining cluster members. +func writeRecoveryTarball(databaseDir string, raftNodes []db.RaftNode) (string, error) { + reverter := revert.New() + defer reverter.Fail() + + tarballPath := filepath.Join(databaseDir, RecoveryTarballName) + + tarball, err := os.Create(tarballPath) + if err != nil { + return "", err + } + + reverter.Add(func() { _ = os.Remove(tarballPath) }) + + gzWriter := gzip.NewWriter(tarball) + tarWriter := tar.NewWriter(gzWriter) + + globalDBDirFS := os.DirFS(filepath.Join(databaseDir, "global")) + + err = tarWriter.AddFS(globalDBDirFS) + if err != nil { + return "", err + } + + raftNodesYaml, err := yaml.Marshal(raftNodes) + if err != nil { + return "", err + } + + raftNodesHeader := tar.Header{ + Typeflag: tar.TypeReg, + Name: "raft_nodes.yaml", + Size: int64(len(raftNodesYaml)), + Mode: 0o644, + Uid: 0, + Gid: 0, + Format: tar.FormatPAX, + } + + err = tarWriter.WriteHeader(&raftNodesHeader) + if err != nil { + return "", err + } + + written, err := tarWriter.Write(raftNodesYaml) + if err != nil { + return "", err + } + + if written != len(raftNodesYaml) { + return "", fmt.Errorf("Wrote %d bytes but expected to write %d", written, len(raftNodesYaml)) + } + + err = tarWriter.Close() + if err != nil { + return "", err + } + + err = gzWriter.Close() + if err != nil { + return "", err + } + + err = tarball.Close() + if err != nil { + return "", err + } + + reverter.Success() + + return tarballPath, nil } // RemoveRaftNode removes a raft node from the raft configuration. From ee6ce68196756b02bb3d32fb9b20650e330a413d Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Thu, 29 Aug 2024 17:18:42 -0500 Subject: [PATCH 06/16] lxd/cluster: Prevent custom patches during cluster recovery Since this is a somewhat arbitrary check, we should make sure to do it before we've mutated the dqlite dir state. Signed-off-by: Wesley Hershberger --- lxd/cluster/recover.go | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go index 88629d51d64b..be798b37b829 100644 --- a/lxd/cluster/recover.go +++ b/lxd/cluster/recover.go @@ -21,6 +21,8 @@ import ( // RecoveryTarballName is the filename used for recovery tarballs. const RecoveryTarballName = "lxd_recovery_db.tar.gz" +const errPatchExists = "Custom patches should not be applied during recovery" + // ListDatabaseNodes returns a list of database node names. 
func ListDatabaseNodes(database *db.Node) ([]string, error) { nodes := []db.RaftNode{} @@ -156,7 +158,13 @@ func writeGlobalNodesPatch(database *db.Node, nodes []db.RaftNode) error { defer reverter.Fail() filePath := filepath.Join(database.Dir(), "patch.global.sql") - file, err := os.OpenFile(filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + + _, err := os.Stat(filePath) + if err == nil { + return fmt.Errorf("Found %s: %s", filePath, errPatchExists) + } + + file, err := os.OpenFile(filePath, os.O_CREATE|os.O_WRONLY, 0644) if err != nil { return err } @@ -197,6 +205,12 @@ func Reconfigure(database *db.Node, raftNodes []db.RaftNode) (string, error) { } } + patchPath := path.Join(database.Dir(), "patch.global.sql") + _, err = os.Stat(patchPath) + if err == nil { + return "", fmt.Errorf("Found %s: %s", patchPath, errPatchExists) + } + // Update cluster.https_address if changed. if localAddress != info.Address { err := updateLocalAddress(database, localAddress) From 539626842343ba0bf889747e13f164f494a77275 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Tue, 9 Jul 2024 17:46:59 -0500 Subject: [PATCH 07/16] lxd/cluster: Implement cluster recovery tarball unpack Signed-off-by: Wesley Hershberger --- lxd/cluster/recover.go | 182 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 181 insertions(+), 1 deletion(-) diff --git a/lxd/cluster/recover.go b/lxd/cluster/recover.go index be798b37b829..70151a828f5e 100644 --- a/lxd/cluster/recover.go +++ b/lxd/cluster/recover.go @@ -5,8 +5,12 @@ import ( "compress/gzip" "context" "fmt" + "io" + "io/fs" "os" + "path" "path/filepath" + "strings" "time" dqlite "github.com/canonical/go-dqlite" @@ -15,12 +19,15 @@ import ( "github.com/canonical/lxd/lxd/db" "github.com/canonical/lxd/lxd/node" + "github.com/canonical/lxd/shared/logger" "github.com/canonical/lxd/shared/revert" ) // RecoveryTarballName is the filename used for recovery tarballs. const RecoveryTarballName = "lxd_recovery_db.tar.gz" +const raftNodesFilename = "raft_nodes.yaml" + const errPatchExists = "Custom patches should not be applied during recovery" // ListDatabaseNodes returns a list of database node names. @@ -279,7 +286,7 @@ func writeRecoveryTarball(databaseDir string, raftNodes []db.RaftNode) (string, raftNodesHeader := tar.Header{ Typeflag: tar.TypeReg, - Name: "raft_nodes.yaml", + Name: raftNodesFilename, Size: int64(len(raftNodesYaml)), Mode: 0o644, Uid: 0, @@ -321,6 +328,114 @@ func writeRecoveryTarball(databaseDir string, raftNodes []db.RaftNode) (string, return tarballPath, nil } +// DatabaseReplaceFromTarball unpacks the tarball found at `tarballPath`, replaces +// the global database, updates the local database with any changed addresses, +// and writes a global patch file to update the global database with any changed +// addresses. 
+func DatabaseReplaceFromTarball(tarballPath string, database *db.Node) error { + globalDBDir := path.Join(database.Dir(), "global") + unpackDir := filepath.Join(database.Dir(), "global.recover") + + logger.Warn("Recovery tarball located; attempting DB recovery", logger.Ctx{"tarball": tarballPath}) + + err := unpackTarball(tarballPath, unpackDir) + if err != nil { + return err + } + + raftNodesYamlPath := path.Join(unpackDir, raftNodesFilename) + raftNodesYaml, err := os.ReadFile(raftNodesYamlPath) + if err != nil { + return err + } + + var incomingRaftNodes []db.RaftNode + err = yaml.Unmarshal(raftNodesYaml, &incomingRaftNodes) + if err != nil { + return fmt.Errorf("Invalid %q", raftNodesYamlPath) + } + + var localRaftNodes []db.RaftNode + err = database.Transaction(context.TODO(), func(ctx context.Context, tx *db.NodeTx) (err error) { + localRaftNodes, err = tx.GetRaftNodes(ctx) + return err + }) + if err != nil { + return err + } + + for _, localNode := range localRaftNodes { + foundLocal := false + for _, incomingNode := range incomingRaftNodes { + foundLocal = localNode.ID == incomingNode.ID && + localNode.Name == incomingNode.Name + + if foundLocal { + break + } + } + + // The incoming tarball should contain a node with the same dqlite ID as + // the local LXD server; we shouldn't unpack a recovery tarball from a + // different cluster. + if !foundLocal { + return fmt.Errorf("Missing cluster member %q in incoming recovery tarball", localNode.Name) + } + } + + // Update our core.https_address if it has changed + localRaftNode, err := localRaftNode(database) + if err != nil { + return err + } + + for _, incomingNode := range incomingRaftNodes { + if incomingNode.ID == localRaftNode.ID { + if incomingNode.Address != localRaftNode.Address { + err = updateLocalAddress(database, incomingNode.Address) + if err != nil { + return err + } + } + + break + } + } + + // Replace cluster configuration in local raft_nodes database. + err = database.Transaction(context.TODO(), func(ctx context.Context, tx *db.NodeTx) error { + return tx.ReplaceRaftNodes(incomingRaftNodes) + }) + if err != nil { + return err + } + + err = writeGlobalNodesPatch(database, incomingRaftNodes) + if err != nil { + return fmt.Errorf("Failed to create global db patch for cluster recover: %w", err) + } + + // Now that we're as sure as we can be that the recovery DB is valid, we can + // replace the existing DB + err = os.RemoveAll(globalDBDir) + if err != nil { + return err + } + + err = os.Rename(unpackDir, globalDBDir) + if err != nil { + return err + } + + // Prevent the database being restored again after subsequent restarts + err = os.Remove(tarballPath) + if err != nil { + return err + } + + return nil +} + // RemoveRaftNode removes a raft node from the raft configuration. 
func RemoveRaftNode(gateway *Gateway, address string) error { nodes, err := gateway.currentRaftNodes() @@ -358,3 +473,68 @@ func RemoveRaftNode(gateway *Gateway, address string) error { return nil } + +func unpackTarball(tarballPath string, destRoot string) error { + reverter := revert.New() + defer reverter.Fail() + + tarball, err := os.Open(tarballPath) + if err != nil { + return err + } + + gzReader, err := gzip.NewReader(tarball) + if err != nil { + return err + } + + tarReader := tar.NewReader(gzReader) + + err = os.MkdirAll(destRoot, 0o755) + if err != nil { + return err + } + + reverter.Add(func() { _ = os.RemoveAll(destRoot) }) + + for { + header, err := tarReader.Next() + if err == io.EOF { + break + } else if err != nil { + return err + } + + // CWE-22 + if strings.Contains(header.Name, "..") { + return fmt.Errorf("Invalid sequence `..` in recovery tarball entry %q", header.Name) + } + + filepath := path.Join(destRoot, header.Name) + + switch header.Typeflag { + case tar.TypeReg: + file, err := os.Create(filepath) + if err != nil { + return err + } + + countWritten, err := io.Copy(file, tarReader) + if countWritten != header.Size { + return fmt.Errorf("Mismatched written (%d) and size (%d) for entry %q in %q", countWritten, header.Size, header.Name, tarballPath) + } else if err != nil { + return err + } + + case tar.TypeDir: + err = os.MkdirAll(filepath, fs.FileMode(header.Mode&int64(fs.ModePerm))) + if err != nil { + return err + } + } + } + + reverter.Success() + + return nil +} From f1b8073c4cd4be983e21ba51e38d54f3635539ef Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Tue, 9 Jul 2024 17:51:09 -0500 Subject: [PATCH 08/16] lxd/daemon: Load recovery tarball on daemon start Signed-off-by: Wesley Hershberger --- lxd/daemon.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lxd/daemon.go b/lxd/daemon.go index 4e183f057ae5..6a225fbd1394 100644 --- a/lxd/daemon.go +++ b/lxd/daemon.go @@ -1198,6 +1198,18 @@ func (d *Daemon) init() error { d.serverCertInt = serverCert } + // If we're clustered, check for an incoming recovery tarball + if d.serverClustered { + tarballPath := filepath.Join(d.db.Node.Dir(), cluster.RecoveryTarballName) + + if shared.PathExists(tarballPath) { + err = cluster.DatabaseReplaceFromTarball(tarballPath, d.db.Node) + if err != nil { + return fmt.Errorf("Failed to load recovery tarball: %w", err) + } + } + } + /* Setup dqlite */ clusterLogLevel := "ERROR" if shared.ValueInSlice("dqlite", trace) { From 62444e20a395ae7cb6be3a94fd526153caf3cb85 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Fri, 12 Jul 2024 17:32:05 -0500 Subject: [PATCH 09/16] lxd/main_cluster: refactor promptConfirmation Signed-off-by: Wesley Hershberger --- lxd/main_cluster.go | 82 +++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 47 deletions(-) diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go index a674443b080f..bc0d793f1d3f 100644 --- a/lxd/main_cluster.go +++ b/lxd/main_cluster.go @@ -25,6 +25,20 @@ import ( "github.com/canonical/lxd/shared/termios" ) +func promptConfirmation(prompt string, opname string) error { + reader := bufio.NewReader(os.Stdin) + fmt.Print(prompt + "Do you want to proceed? 
(yes/no): ") + + input, _ := reader.ReadString('\n') + input = strings.TrimSuffix(input, "\n") + + if !shared.ValueInSlice(strings.ToLower(input), []string{"yes"}) { + return fmt.Errorf("%s operation aborted", opname) + } + + return nil +} + type cmdCluster struct { global *cmdGlobal } @@ -368,6 +382,22 @@ func (c *cmdClusterListDatabase) Run(cmd *cobra.Command, args []string) error { return nil } +const recoverFromQuorumLossPrompt = `You should run this command only if you are *absolutely* certain that this is +the only database node left in your cluster AND that other database nodes will +never come back (i.e. their LXD daemon won't ever be started again). + +This will make this LXD instance the only member of the cluster, and it won't +be possible to perform operations on former cluster members anymore. + +However all information about former cluster members will be preserved in the +database, so you can possibly inspect it for further recovery. + +You'll be able to permanently delete from the database all information about +former cluster members by running "lxc cluster remove --force". + +See https://documentation.ubuntu.com/lxd/en/latest/howto/cluster_recover/#recover-from-quorum-loss for more +info.` + type cmdClusterRecoverFromQuorumLoss struct { global *cmdGlobal flagNonInteractive bool @@ -394,7 +424,7 @@ func (c *cmdClusterRecoverFromQuorumLoss) Run(cmd *cobra.Command, args []string) // Prompt for confirmation unless --quiet was passed. if !c.flagNonInteractive { - err := c.promptConfirmation() + err := promptConfirmation(recoverFromQuorumLossPrompt, "Recover") if err != nil { return err } @@ -410,34 +440,9 @@ func (c *cmdClusterRecoverFromQuorumLoss) Run(cmd *cobra.Command, args []string) return cluster.Recover(db) } -func (c *cmdClusterRecoverFromQuorumLoss) promptConfirmation() error { - reader := bufio.NewReader(os.Stdin) - fmt.Print(`You should run this command only if you are *absolutely* certain that this is -the only database node left in your cluster AND that other database nodes will -never come back (i.e. their LXD daemon won't ever be started again). - -This will make this LXD instance the only member of the cluster, and it won't -be possible to perform operations on former cluster members anymore. - -However all information about former cluster members will be preserved in the -database, so you can possibly inspect it for further recovery. - -You'll be able to permanently delete from the database all information about -former cluster members by running "lxc cluster remove --force". - -See https://documentation.ubuntu.com/lxd/en/latest/howto/cluster_recover/#recover-from-quorum-loss for more -info. - -Do you want to proceed? (yes/no): `) - input, _ := reader.ReadString('\n') - input = strings.TrimSuffix(input, "\n") - - if !shared.ValueInSlice(strings.ToLower(input), []string{"yes"}) { - return fmt.Errorf("Recover operation aborted") - } - - return nil -} +const removeRaftNodePrompt = `You should run this command only if you ended up in an +inconsistent state where a node has been uncleanly removed (i.e. it doesn't show +up in "lxc cluster list" but it's still in the raft configuration).` type cmdClusterRemoveRaftNode struct { global *cmdGlobal @@ -466,7 +471,7 @@ func (c *cmdClusterRemoveRaftNode) Run(cmd *cobra.Command, args []string) error // Prompt for confirmation unless --quiet was passed. 
if !c.flagNonInteractive { - err := c.promptConfirmation() + err := promptConfirmation(removeRaftNodePrompt, "Remove raft node") if err != nil { return err } @@ -485,20 +490,3 @@ func (c *cmdClusterRemoveRaftNode) Run(cmd *cobra.Command, args []string) error return nil } - -func (c *cmdClusterRemoveRaftNode) promptConfirmation() error { - reader := bufio.NewReader(os.Stdin) - fmt.Print(`You should run this command only if you ended up in an -inconsistent state where a node has been uncleanly removed (i.e. it doesn't show -up in "lxc cluster list" but it's still in the raft configuration). - -Do you want to proceed? (yes/no): `) - input, _ := reader.ReadString('\n') - input = strings.TrimSuffix(input, "\n") - - if !shared.ValueInSlice(strings.ToLower(input), []string{"yes"}) { - return fmt.Errorf("Remove raft node operation aborted") - } - - return nil -} From 605ec41bd61410c0a16790a046a0807dce7f3cc0 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Thu, 29 Aug 2024 16:22:18 -0500 Subject: [PATCH 10/16] lxd/main_cluster: Use "member" instead of "node" Signed-off-by: Wesley Hershberger --- lxd/main_cluster.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go index bc0d793f1d3f..ba2718bd4426 100644 --- a/lxd/main_cluster.go +++ b/lxd/main_cluster.go @@ -383,10 +383,10 @@ func (c *cmdClusterListDatabase) Run(cmd *cobra.Command, args []string) error { } const recoverFromQuorumLossPrompt = `You should run this command only if you are *absolutely* certain that this is -the only database node left in your cluster AND that other database nodes will +the only database member left in your cluster AND that other database members will never come back (i.e. their LXD daemon won't ever be started again). -This will make this LXD instance the only member of the cluster, and it won't +This will make this LXD server the only member of the cluster, and it won't be possible to perform operations on former cluster members anymore. However all information about former cluster members will be preserved in the @@ -441,8 +441,8 @@ func (c *cmdClusterRecoverFromQuorumLoss) Run(cmd *cobra.Command, args []string) } const removeRaftNodePrompt = `You should run this command only if you ended up in an -inconsistent state where a node has been uncleanly removed (i.e. it doesn't show -up in "lxc cluster list" but it's still in the raft configuration).` +inconsistent state where a cluster member has been uncleanly removed (i.e. 
it +doesn't show up in "lxc cluster list" but it's still in the raft configuration).` type cmdClusterRemoveRaftNode struct { global *cmdGlobal From 9bb6d220a09d37137e1c6e2e13f4ba77358a99c3 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Fri, 12 Jul 2024 17:39:26 -0500 Subject: [PATCH 11/16] lxd/main_cluster: Prompt before cluster edit Signed-off-by: Wesley Hershberger --- lxd/main_cluster.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go index ba2718bd4426..00215a034873 100644 --- a/lxd/main_cluster.go +++ b/lxd/main_cluster.go @@ -118,6 +118,13 @@ func (c ClusterMember) ToRaftNode() (*db.RaftNode, error) { return node, nil } +const clusterEditPrompt = `You should run this command only if: + - A quorum of cluster members is permanently lost or their addresses have changed + - You are *absolutely* sure all LXD daemons are stopped + - This instance has the most up to date database + +See https://documentation.ubuntu.com/lxd/en/latest/howto/cluster_recover/#reconfigure-the-cluster for more info.` + type cmdClusterEdit struct { global *cmdGlobal } @@ -188,6 +195,11 @@ func (c *cmdClusterEdit) Run(cmd *cobra.Command, args []string) error { return err } } else { + err = promptConfirmation(clusterEditPrompt, "Cluster edit") + if err != nil { + return err + } + if len(config.Members) > 0 { data = []byte(fmt.Sprintf(SegmentComment, segmentID) + "\n\n" + string(data)) } From f44ed5dd764f04b33db135d6edd01563f0e970d7 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Fri, 12 Jul 2024 17:44:54 -0500 Subject: [PATCH 12/16] lxd/main_cluster: Add yaml comment with role explanation Signed-off-by: Wesley Hershberger --- lxd/main_cluster.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go index 00215a034873..f18d7e65a3fb 100644 --- a/lxd/main_cluster.go +++ b/lxd/main_cluster.go @@ -125,6 +125,19 @@ const clusterEditPrompt = `You should run this command only if: See https://documentation.ubuntu.com/lxd/en/latest/howto/cluster_recover/#reconfigure-the-cluster for more info.` +const clusterEditComment = `# Member roles can be modified. Unrecoverable nodes should be given the role "spare". +# +# "voter" - Voting member of the database. A majority of voters is a quorum. +# "stand-by" - Non-voting member of the database; can be promoted to voter. +# "spare" - Not a member of the database. 
+# +# The edit is aborted if: +# - the number of members changes +# - the name of any member changes +# - the ID of any member changes +# - no changes are made +` + type cmdClusterEdit struct { global *cmdGlobal } @@ -201,7 +214,10 @@ func (c *cmdClusterEdit) Run(cmd *cobra.Command, args []string) error { } if len(config.Members) > 0 { - data = []byte(fmt.Sprintf(SegmentComment, segmentID) + "\n\n" + string(data)) + data = []byte( + clusterEditComment + "\n\n" + + fmt.Sprintf(SegmentComment, segmentID) + "\n\n" + + string(data)) } content, err = shared.TextEditor("", data) From f18c023673601750b8ce711f520a260f44311831 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Mon, 15 Jul 2024 15:09:18 -0500 Subject: [PATCH 13/16] lxd/main_cluster: Print instructions after cluster edit Signed-off-by: Wesley Hershberger --- lxd/main_cluster.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go index f18d7e65a3fb..38124c214fee 100644 --- a/lxd/main_cluster.go +++ b/lxd/main_cluster.go @@ -226,6 +226,7 @@ func (c *cmdClusterEdit) Run(cmd *cobra.Command, args []string) error { } } + var tarballPath string for { newConfig := ClusterConfig{} err = yaml.Unmarshal(content, &newConfig) @@ -246,7 +247,7 @@ func (c *cmdClusterEdit) Run(cmd *cobra.Command, args []string) error { if err == nil { err = validateNewConfig(nodes, newNodes) if err == nil { - err = cluster.Reconfigure(database, newNodes) + tarballPath, err = cluster.Reconfigure(database, newNodes) } } } @@ -270,6 +271,10 @@ func (c *cmdClusterEdit) Run(cmd *cobra.Command, args []string) error { break } + fmt.Printf("Cluster changes applied; new database state saved to %s\n\n", tarballPath) + fmt.Printf("*Before* starting any cluster member, copy %s to %s on all remaining cluster members.\n\n", tarballPath, tarballPath) + fmt.Printf("LXD will load this file during startup.\n") + return nil } From 408d7321c6bef9a835aef1790e7215f504d18b5f Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Thu, 11 Jul 2024 17:56:23 -0500 Subject: [PATCH 14/16] test: Copy tarball from `cluster edit` `lxd cluster edit` on each node is no longer supported. 
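A rough sketch of the flow this test now exercises (snap install paths are
assumed, matching the updated howto; any copy mechanism can stand in for scp):

    sudo snap stop lxd            # on every member
    sudo lxd cluster edit         # on one surviving database member only
    # copy the generated tarball to the database directory of the other members
    scp /var/snap/lxd/common/lxd/database/lxd_recovery_db.tar.gz \
        <member>:/var/snap/lxd/common/lxd/database/
    sudo snap start lxd           # on every member; LXD loads the tarball on startup
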
Signed-off-by: Wesley Hershberger --- test/suites/clustering.sh | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/test/suites/clustering.sh b/test/suites/clustering.sh index 65a8e474a75e..c91b26cec5d3 100644 --- a/test/suites/clustering.sh +++ b/test/suites/clustering.sh @@ -3162,28 +3162,19 @@ test_clustering_edit_configuration() { config=$(mktemp -p "${TEST_DIR}" XXX) # Update the cluster configuration with new port numbers LXD_DIR="${LXD_ONE_DIR}" lxd cluster show > "${config}" - sed -e "s/:8443/:9393/" -i "${config}" - LXD_DIR="${LXD_ONE_DIR}" lxd cluster edit < "${config}" - - LXD_DIR="${LXD_TWO_DIR}" lxd cluster show > "${config}" - sed -e "s/:8443/:9393/" -i "${config}" - LXD_DIR="${LXD_TWO_DIR}" lxd cluster edit < "${config}" - - LXD_DIR="${LXD_THREE_DIR}" lxd cluster show > "${config}" - sed -e "s/:8443/:9393/" -i "${config}" - LXD_DIR="${LXD_THREE_DIR}" lxd cluster edit < "${config}" - LXD_DIR="${LXD_FOUR_DIR}" lxd cluster show > "${config}" + # lxd cluster edit generates ${LXD_DIR}/database/lxd_recovery_db.tar.gz sed -e "s/:8443/:9393/" -i "${config}" - LXD_DIR="${LXD_FOUR_DIR}" lxd cluster edit < "${config}" + LXD_DIR="${LXD_ONE_DIR}" lxd cluster edit < "${config}" - LXD_DIR="${LXD_FIVE_DIR}" lxd cluster show > "${config}" - sed -e "s/:8443/:9393/" -i "${config}" - LXD_DIR="${LXD_FIVE_DIR}" lxd cluster edit < "${config}" + for other_dir in "${LXD_TWO_DIR}" "${LXD_THREE_DIR}" "${LXD_FOUR_DIR}" "${LXD_FIVE_DIR}" "${LXD_SIX_DIR}"; do + cp "${LXD_ONE_DIR}/database/lxd_recovery_db.tar.gz" "${other_dir}/database/" + done - LXD_DIR="${LXD_SIX_DIR}" lxd cluster show > "${config}" - sed -e "s/:8443/:9393/" -i "${config}" - LXD_DIR="${LXD_SIX_DIR}" lxd cluster edit < "${config}" + # While it does work to load the recovery DB on the node which generated it, + # we should test to make sure that the recovery operation left the database + # ready to go. + rm "${LXD_ONE_DIR}/database/lxd_recovery_db.tar.gz" # Respawn the nodes LXD_NETNS="${ns1}" respawn_lxd "${LXD_ONE_DIR}" false From b919409d7a08f5b6a2c2ca1c93d6e41c971980a9 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Fri, 21 Jun 2024 17:20:48 -0500 Subject: [PATCH 15/16] doc: cluster edit updates & copy tarball Signed-off-by: Wesley Hershberger --- doc/howto/cluster_recover.md | 88 +++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 32 deletions(-) diff --git a/doc/howto/cluster_recover.md b/doc/howto/cluster_recover.md index 4f119aab9cee..2917a63c015b 100644 --- a/doc/howto/cluster_recover.md +++ b/doc/howto/cluster_recover.md @@ -2,47 +2,50 @@ # How to recover a cluster It might happen that one or several members of your cluster go offline or become unreachable. -In that case, no operations are possible on this member, and neither are operations that require a state change across all members. +If too many cluster members go offline, no operations will be possible on the cluster. See {ref}`clustering-offline-members` and {ref}`cluster-automatic-evacuation` for more information. -If you can bring the offline cluster members back or delete them from the cluster, operation resumes as normal. -If this is not possible, there are a few ways to recover the cluster, depending on the scenario that caused the failure. -See the following sections for details. +If you can bring the offline cluster members back up, operation resumes as normal. +If the cluster members are lost permanently (e.g. disk failure), it is possible +to recover any remaining cluster members. 
```{note} -When your cluster is in a state that needs recovery, most `lxc` commands do not work, because the LXD client cannot connect to the LXD daemon. +When your cluster is in a state that needs recovery, most `lxc` commands do not +work because the LXD database does not respond when a majority of database +voters are inaccessible. + +The commands to recover a cluster are provided directly by the LXD daemon (`lxd`) +because they modify database files directly instead of making requests to the +LXD daemon. -Therefore, the commands to recover the cluster are provided directly by the LXD daemon (`lxd`). Run `lxd cluster --help` for an overview of all available commands. ``` -## Recover from quorum loss +## Database members Every LXD cluster has a specific number of members (configured through {config:option}`server-cluster:cluster.max_voters`) that serve as voting members of the distributed database. -If you permanently lose a majority of these cluster members (for example, you have a three-member cluster and you lose two members), the cluster loses quorum and becomes unavailable. -However, if at least one database member survives, it is possible to recover the cluster. +If you lose a majority of these cluster members (for example, you have a three-member cluster and you lose two members), the cluster loses quorum and becomes unavailable. -To do so, complete the following steps: +To determine which members have (or had) database roles, log on to any surviving member of your cluster and run the following command: -1. Log on to any surviving member of your cluster and run the following command: + sudo lxd cluster list-database + +## Recover from quorum loss - sudo lxd cluster list-database +If only one cluster member with the database role survives, complete the following +steps. See [Reconfigure the cluster](#reconfigure-the-cluster) below for recovering +more than one member. - This command shows which cluster members have one of the database roles. -1. Pick one of the listed database members that is still online as the new leader. - Log on to the machine (if it differs from the one you are already logged on to). 1. Make sure that the LXD daemon is not running on the machine. For example, if you're using the snap: sudo snap stop lxd -1. Log on to all other cluster members that are still online and stop the LXD daemon. -1. On the server that you picked as the new leader, run the following command: +1. Use the following command to reconfigure the database: sudo lxd cluster recover-from-quorum-loss -1. Start the LXD daemon again on all machines, starting with the new leader. - For example, if you're using the snap: +1. Start the LXD daemon again. For example, if you're using the snap: sudo snap start lxd @@ -54,25 +57,34 @@ This can help you with further recovery steps if you need to re-create the lost To permanently delete the cluster members that you have lost, force-remove them. See {ref}`cluster-manage-delete-members`. -## Recover cluster members with changed addresses +## Reconfigure the cluster + +```{important} +It is highly recommended to take a backup of `/var/snap/lxd/common/lxd/database` +(for snap users) or `/var/lib/lxd/lxd/database` (otherwise) before reconfiguring +the cluster. +``` If some members of your cluster are no longer reachable, or if the cluster itself is unreachable due to a change in IP address or listening port number, you can reconfigure the cluster. 
-To do so, edit the cluster configuration on each member of the cluster and change the IP addresses or listening port numbers as required. -You cannot remove any members during this process. -The cluster configuration must contain the description of the full cluster, so you must do the changes for all cluster members on all cluster members. +To do so, choose one database member to edit the cluster configuration. +Once the cluster edit is complete you will need to manually copy the reconfigured global database to every other surviving member. + +You can change the IP addresses or listening port numbers for each member as required. +You cannot add or remove any members during this process. +The cluster configuration must contain the description of the full cluster. -You can edit the {ref}`clustering-member-roles` of the different members, but with the following limitations: +You can edit the {ref}`clustering-member-roles` of the members, but with the following limitations: - A cluster member that does not have a `database*` role cannot become a voter, because it might lack a global database. - At least two members must remain voters (except in the case of a two-member cluster, where one voter suffices), or there will be no quorum. -Log on to each cluster member and complete the following steps: - -1. Stop the LXD daemon. +Before performing the recovery, stop the LXD daemon on all surviving cluster members. For example, if you're using the snap: - sudo snap stop lxd + sudo snap stop lxd + +Complete the following steps on one database member: 1. Run the following command: @@ -100,12 +112,24 @@ Log on to each cluster member and complete the following steps: You can edit the addresses and the roles. -After doing the changes on all cluster members, start the LXD daemon on all members again. -For example, if you're using the snap: +1. When the cluster configuration has been changed on one member, LXD will create + a tarball of the global database (`/var/snap/lxd/common/lxd/database/lxd_recovery_db.tar.gz` + for snap installations or `/var/lib/lxd/database/lxd_recovery_db.tar.gz`). + Copy this recovery tarball to the same path on all remaining cluster members. + + ```{note} + The tarball can be removed from the first member after it is generated, but + it does not have to be. + ``` + +1. Once the tarball has been copied to all remaining cluster members, start the + LXD daemon on all members again. LXD will load the recovery tarball on startup. - sudo snap start lxd + If you're using the snap: + + sudo snap start lxd -The cluster should now be fully available again with all members reporting in. +The cluster should now be fully available again with all surviving members reporting in. No information has been deleted from the database. All information about the cluster members and their instances is still there. From 085afb90f286f1bde5babda1271cecc8a4e30f89 Mon Sep 17 00:00:00 2001 From: Wesley Hershberger Date: Fri, 12 Jul 2024 23:36:07 -0500 Subject: [PATCH 16/16] lxd/main_cluster: Fix linter errors Signed-off-by: Wesley Hershberger --- lxd/main_cluster.go | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/lxd/main_cluster.go b/lxd/main_cluster.go index 38124c214fee..2bf191989928 100644 --- a/lxd/main_cluster.go +++ b/lxd/main_cluster.go @@ -43,6 +43,7 @@ type cmdCluster struct { global *cmdGlobal } +// Command returns a subcommand for administrating a cluster. 
func (c *cmdCluster) Command() *cobra.Command { cmd := &cobra.Command{} cmd.Use = "cluster" @@ -55,8 +56,8 @@ func (c *cmdCluster) Command() *cobra.Command { cmd.AddCommand(listDatabase.Command()) // Recover - recover := cmdClusterRecoverFromQuorumLoss{global: c.global} - cmd.AddCommand(recover.Command()) + clusterRecover := cmdClusterRecoverFromQuorumLoss{global: c.global} + cmd.AddCommand(clusterRecover.Command()) // Remove a raft node. removeRaftNode := cmdClusterRemoveRaftNode{global: c.global} @@ -76,7 +77,7 @@ func (c *cmdCluster) Command() *cobra.Command { return cmd } -const SegmentComment = "# Latest dqlite segment ID: %s" +const segmentComment = "# Latest dqlite segment ID: %s" // ClusterMember is a more human-readable representation of the db.RaftNode struct. type ClusterMember struct { @@ -142,6 +143,7 @@ type cmdClusterEdit struct { global *cmdGlobal } +// Command returns a command for reconfiguring a cluster. func (c *cmdClusterEdit) Command() *cobra.Command { cmd := &cobra.Command{} cmd.Use = "edit" @@ -153,6 +155,7 @@ func (c *cmdClusterEdit) Command() *cobra.Command { return cmd } +// Run executes the command for reconfiguring a cluster. func (c *cmdClusterEdit) Run(cmd *cobra.Command, args []string) error { // Make sure that the daemon is not running. _, err := lxd.ConnectLXDUnix("", nil) @@ -216,7 +219,7 @@ func (c *cmdClusterEdit) Run(cmd *cobra.Command, args []string) error { if len(config.Members) > 0 { data = []byte( clusterEditComment + "\n\n" + - fmt.Sprintf(SegmentComment, segmentID) + "\n\n" + + fmt.Sprintf(segmentComment, segmentID) + "\n\n" + string(data)) } @@ -323,6 +326,7 @@ type cmdClusterShow struct { global *cmdGlobal } +// Command returns a command for showing the current cluster configuration. func (c *cmdClusterShow) Command() *cobra.Command { cmd := &cobra.Command{} cmd.Use = "show" @@ -334,6 +338,7 @@ func (c *cmdClusterShow) Command() *cobra.Command { return cmd } +// Run executes the command for showing the current cluster configuration. func (c *cmdClusterShow) Run(cmd *cobra.Command, args []string) error { database, err := db.OpenNode(filepath.Join(sys.DefaultOS().VarDir, "database"), nil) if err != nil { @@ -368,7 +373,7 @@ func (c *cmdClusterShow) Run(cmd *cobra.Command, args []string) error { } if len(config.Members) > 0 { - fmt.Printf(SegmentComment+"\n\n%s", segmentID, data) + fmt.Printf(segmentComment+"\n\n%s", segmentID, data) } else { fmt.Print(data) } @@ -380,6 +385,7 @@ type cmdClusterListDatabase struct { global *cmdGlobal } +// Command returns a command for showing the database roles of cluster members. func (c *cmdClusterListDatabase) Command() *cobra.Command { cmd := &cobra.Command{} cmd.Use = "list-database" @@ -391,6 +397,7 @@ func (c *cmdClusterListDatabase) Command() *cobra.Command { return cmd } +// Run executes the command for showing the database roles of cluster members. func (c *cmdClusterListDatabase) Run(cmd *cobra.Command, args []string) error { os := sys.DefaultOS() @@ -436,6 +443,7 @@ type cmdClusterRecoverFromQuorumLoss struct { flagNonInteractive bool } +// Command returns a command for rebuilding a cluster based on the current member. func (c *cmdClusterRecoverFromQuorumLoss) Command() *cobra.Command { cmd := &cobra.Command{} cmd.Use = "recover-from-quorum-loss" @@ -448,6 +456,7 @@ func (c *cmdClusterRecoverFromQuorumLoss) Command() *cobra.Command { return cmd } +// Run executes the command for rebuilding a cluster based on the current member. 
 func (c *cmdClusterRecoverFromQuorumLoss) Run(cmd *cobra.Command, args []string) error {
 	// Make sure that the daemon is not running.
 	_, err := lxd.ConnectLXDUnix("", nil)
@@ -482,6 +491,7 @@ type cmdClusterRemoveRaftNode struct {
 	flagNonInteractive bool
 }
 
+// Command returns a command for removing a raft node from the currently running database.
 func (c *cmdClusterRemoveRaftNode) Command() *cobra.Command {
 	cmd := &cobra.Command{}
 	cmd.Use = "remove-raft-node <address>"
 	cmd.Short = "Remove a raft node from the raft configuration"
 	cmd.RunE = c.Run
 	cmd.Flags().BoolVarP(&c.flagNonInteractive, "quiet", "q", false, "Don't require user confirmation")
 
 	return cmd
 }
 
+// Run executes the command for removing a raft node from the currently running database.
 func (c *cmdClusterRemoveRaftNode) Run(cmd *cobra.Command, args []string) error {
 	if len(args) != 1 {
 		_ = cmd.Help()