From 6e21a3d6afccfdf68b1c49351e5f8a2c62b56e2e Mon Sep 17 00:00:00 2001
From: Rebecca Mahany-Horton
Date: Fri, 14 Jun 2024 16:56:45 -0400
Subject: [PATCH] Add rungroup actor to take periodic backups of the launcher database

---
 cmd/launcher/launcher.go              |   3 +
 ee/agent/storage/bbolt/backup.go      | 138 ++++++++++++++++++++++++++
 ee/agent/storage/bbolt/backup_test.go |  52 ++++++++++
 3 files changed, 193 insertions(+)
 create mode 100644 ee/agent/storage/bbolt/backup.go
 create mode 100644 ee/agent/storage/bbolt/backup_test.go

diff --git a/cmd/launcher/launcher.go b/cmd/launcher/launcher.go
index 5dbfe0e44..3dab13e94 100644
--- a/cmd/launcher/launcher.go
+++ b/cmd/launcher/launcher.go
@@ -259,6 +259,9 @@ func runLauncher(ctx context.Context, cancel func(), multiSlogger, systemMultiSl
 	// pickup
 	internal.RecordLauncherVersion(ctx, rootDirectory)
 
+	p := agentbbolt.NewDatabasePhotographer(k)
+	runGroup.Add("databasePhotographer", p.Execute, p.Interrupt)
+
 	// create the certificate pool
 	var rootPool *x509.CertPool
 	if k.RootPEM() != "" {
diff --git a/ee/agent/storage/bbolt/backup.go b/ee/agent/storage/bbolt/backup.go
new file mode 100644
index 000000000..776452d6b
--- /dev/null
+++ b/ee/agent/storage/bbolt/backup.go
@@ -0,0 +1,138 @@
+package agentbbolt
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"sync/atomic"
+	"time"
+
+	"github.com/kolide/launcher/ee/agent/types"
+	"go.etcd.io/bbolt"
+)
+
+// Timing for the backup loop: the first backup is delayed to leave time for
+// enrollment after startup; later backups run once per snapshotInterval.
+const (
+	snapshotInitialDelay = 10 * time.Minute
+	snapshotInterval     = 1 * time.Hour
+)
+
+// photographer is a rungroup actor that periodically copies the launcher
+// bbolt database to a backup file.
+// TODO RM - A better name.
+type photographer struct {
+	knapsack    types.Knapsack
+	slogger     *slog.Logger
+	interrupt   chan struct{}
+	interrupted atomic.Bool // set once by the first Interrupt call
+}
+
+// NewDatabasePhotographer returns a photographer that backs up the database
+// provided by k. The interrupt channel has a buffer of one so that the first
+// call to Interrupt does not block, even when Execute is not receiving.
+func NewDatabasePhotographer(k types.Knapsack) *photographer {
+	return &photographer{
+		knapsack:  k,
+		slogger:   k.Slogger().With("component", "database_photographer"),
+		interrupt: make(chan struct{}, 1),
+	}
+}
+
+// Execute waits for snapshotInitialDelay, then takes a backup immediately and
+// again every snapshotInterval until interrupted. Backup failures are logged
+// and do not stop the loop. Execute always returns nil.
+func (p *photographer) Execute() error {
+	// Wait a little bit after startup before taking first snapshot, to allow for enrollment.
+	// An explicit timer (rather than time.After) lets us release it as soon as we return.
+	initialDelay := time.NewTimer(snapshotInitialDelay)
+	defer initialDelay.Stop()
+
+	select {
+	case <-p.interrupt:
+		p.slogger.Log(context.TODO(), slog.LevelDebug,
+			"received external interrupt during initial delay, stopping",
+		)
+		return nil
+	case <-initialDelay.C:
+	}
+
+	// Take periodic snapshots
+	ticker := time.NewTicker(snapshotInterval)
+	defer ticker.Stop()
+	for {
+		if err := p.backupDb(); err != nil {
+			p.slogger.Log(context.TODO(), slog.LevelWarn,
+				"could not perform periodic database backup",
+				"err", err,
+			)
+		}
+
+		select {
+		case <-ticker.C:
+			continue
+		case <-p.interrupt:
+			p.slogger.Log(context.TODO(), slog.LevelDebug,
+				"interrupt received, exiting execute loop",
+			)
+			return nil
+		}
+	}
+}
+
+// Interrupt signals Execute to stop. It is safe to call more than once and
+// from multiple goroutines: only the first call sends on the interrupt channel.
+func (p *photographer) Interrupt(_ error) {
+	// Swap tests and sets the flag in one atomic step, so concurrent callers cannot
+	// both get past this check and then block sending on the single-slot channel.
+	if p.interrupted.Swap(true) {
+		return
+	}
+
+	p.interrupt <- struct{}{}
+}
+
+// backupDb copies the bbolt database to the backup location via a bbolt View
+// transaction, overwriting any previous backup, then confirms that the backup
+// file exists and is nonempty.
+func (p *photographer) backupDb() error {
+	// Take backup -- it's fine to just overwrite previous backups
+	backupLocation := BackupLauncherDbLocation(p.knapsack.RootDirectory())
+	if err := p.knapsack.BboltDB().View(func(tx *bbolt.Tx) error {
+		return tx.CopyFile(backupLocation, 0600)
+	}); err != nil {
+		return fmt.Errorf("backing up database: %w", err)
+	}
+
+	// Confirm file exists and is nonempty
+	fileInfo, err := os.Stat(backupLocation)
+	if os.IsNotExist(err) {
+		return fmt.Errorf("backup succeeded, but no file at backup location %s", backupLocation)
+	}
+	if err != nil {
+		return fmt.Errorf("checking %s exists after taking backup: %w", backupLocation, err)
+	}
+	if fileInfo.Size() <= 0 {
+		return fmt.Errorf("backup succeeded, but backup database at %s is empty", backupLocation)
+	}
+
+	// Log success
+	p.slogger.Log(context.TODO(), slog.LevelDebug,
+		"took backup",
+		"backup_location", backupLocation,
+	)
+
+	return nil
+}
+
+// LauncherDbLocation returns the path of the launcher database file within rootDir.
+func LauncherDbLocation(rootDir string) string {
+	return filepath.Join(rootDir, "launcher.db")
+}
+
+// BackupLauncherDbLocation returns the path of the launcher database backup file within rootDir.
+func BackupLauncherDbLocation(rootDir string) string {
+	return filepath.Join(rootDir, "launcher.db.bak")
+}
diff --git a/ee/agent/storage/bbolt/backup_test.go b/ee/agent/storage/bbolt/backup_test.go
new file mode 100644
index 000000000..7f2e048e4
--- /dev/null
+++ b/ee/agent/storage/bbolt/backup_test.go
@@ -0,0 +1,52 @@
+package agentbbolt
+
+import (
+	"errors"
+	"testing"
+	"time"
+
+	typesmocks "github.com/kolide/launcher/ee/agent/types/mocks"
+	"github.com/kolide/launcher/pkg/log/multislogger"
+	"github.com/stretchr/testify/require"
+)
+
+func TestInterrupt_Multiple(t *testing.T) {
+	t.Parallel()
+
+	testKnapsack := typesmocks.NewKnapsack(t)
+	testKnapsack.On("Slogger").Return(multislogger.NewNopLogger())
+
+	p := NewDatabasePhotographer(testKnapsack)
+
+	// Start and then interrupt
+	go p.Execute()
+	p.Interrupt(errors.New("test error"))
+
+	// Confirm we can call Interrupt multiple times without blocking
+	interruptComplete := make(chan struct{})
+	expectedInterrupts := 3
+	for i := 0; i < expectedInterrupts; i += 1 {
+		go func() {
+			p.Interrupt(nil)
+			interruptComplete <- struct{}{}
+		}()
+	}
+
+	receivedInterrupts := 0
+	for {
+		if receivedInterrupts >= expectedInterrupts {
+			break
+		}
+
+		select {
+		case <-interruptComplete:
+			receivedInterrupts += 1
+			continue
+		case <-time.After(5 * time.Second):
+			t.Errorf("could not call interrupt multiple times and return within 5 seconds -- received %d interrupts before timeout", receivedInterrupts)
+			t.FailNow()
+		}
+	}
+
+	require.Equal(t, expectedInterrupts, receivedInterrupts)
+}