Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Take and use backups of the launcher database #1755

Merged
merged 14 commits into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cmd/launcher/launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,9 @@ func runLauncher(ctx context.Context, cancel func(), multiSlogger, systemMultiSl
// this. Note that the timeout is documented as failing
// unimplemented on windows, though empirically it seems to
// work.
agentbbolt.UseBackupDbIfNeeded(rootDirectory, slogger)
boltOptions := &bbolt.Options{Timeout: time.Duration(30) * time.Second}
db, err := bbolt.Open(filepath.Join(rootDirectory, "launcher.db"), 0600, boltOptions)
db, err := bbolt.Open(agentbbolt.LauncherDbLocation(rootDirectory), 0600, boltOptions)
if err != nil {
return fmt.Errorf("open launcher db: %w", err)
}
Expand Down Expand Up @@ -259,6 +260,9 @@ func runLauncher(ctx context.Context, cancel func(), multiSlogger, systemMultiSl
// pickup
internal.RecordLauncherVersion(ctx, rootDirectory)

dbBackupSaver := agentbbolt.NewDatabaseBackupSaver(k)
runGroup.Add("dbBackupSaver", dbBackupSaver.Execute, dbBackupSaver.Interrupt)

// create the certificate pool
var rootPool *x509.CertPool
if k.RootPEM() != "" {
Expand Down
234 changes: 234 additions & 0 deletions ee/agent/storage/bbolt/backup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
package agentbbolt

import (
"context"
"fmt"
"log/slog"
"os"
"path/filepath"
"time"

"github.com/kolide/launcher/ee/agent/types"
"go.etcd.io/bbolt"
)

const (
// backupInitialDelay is how long Execute waits after startup before taking the first backup.
backupInitialDelay = 10 * time.Minute
// backupInterval is the period of the ticker that drives subsequent backups in Execute.
backupInterval = 1 * time.Hour
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
// numberOfOldBackupsToRetain is the highest numeric suffix (launcher.db.bak.<n>) that rotate keeps;
// a backup already holding that suffix is deleted during rotation.
numberOfOldBackupsToRetain = 3
)

// databaseBackupSaver periodically takes backups of launcher.db.
// It exposes Execute and Interrupt so it can be registered as a rungroup actor.
type databaseBackupSaver struct {
// knapsack supplies the root directory and the open bbolt database to back up.
knapsack types.Knapsack
// slogger is the knapsack logger tagged with component=database_backup_saver.
slogger *slog.Logger
// interrupt carries the shutdown signal from Interrupt to Execute; it is created with a buffer of 1.
interrupt chan struct{}
// interrupted records that Interrupt has already run, so repeat calls do not send again.
interrupted bool
}

// NewDatabaseBackupSaver builds a backup saver around the given knapsack. The
// returned saver logs through the knapsack's slogger, tagged with the
// "database_backup_saver" component, and owns a one-slot interrupt channel.
func NewDatabaseBackupSaver(k types.Knapsack) *databaseBackupSaver {
	saver := new(databaseBackupSaver)
	saver.knapsack = k
	saver.slogger = k.Slogger().With("component", "database_backup_saver")
	// Buffer of one lets Interrupt send without waiting on a receiver.
	saver.interrupt = make(chan struct{}, 1)
	return saver
}

func (d *databaseBackupSaver) Execute() error {
// Wait a little bit after startup before taking first backup, to allow for enrollment
directionless marked this conversation as resolved.
Show resolved Hide resolved
select {
case <-d.interrupt:
d.slogger.Log(context.TODO(), slog.LevelDebug,
"received external interrupt during initial delay, stopping",
)
return nil
case <-time.After(backupInitialDelay):
break
}

// Take periodic backups
ticker := time.NewTicker(backupInterval)
defer ticker.Stop()
for {
if err := d.backupDb(); err != nil {
d.slogger.Log(context.TODO(), slog.LevelWarn,
"could not perform periodic database backup",
"err", err,
)
}

select {
case <-ticker.C:
continue
case <-d.interrupt:
d.slogger.Log(context.TODO(), slog.LevelDebug,
"interrupt received, exiting execute loop",
)
return nil
}
}
}

// Interrupt signals Execute to stop. Only the first call sends on the
// interrupt channel; later calls are no-ops. The error argument is ignored.
func (d *databaseBackupSaver) Interrupt(_ error) {
	if !d.interrupted {
		d.interrupted = true
		d.interrupt <- struct{}{}
	}
}

func (d *databaseBackupSaver) backupDb() error {
// Take backup in temporary location
backupLocation := BackupLauncherDbLocation(d.knapsack.RootDirectory())
tempBackupLocation := fmt.Sprintf("%s.tmp", backupLocation)
defer func() {
// In case we errored out when taking the backup, clean up the temp file
_ = os.Remove(tempBackupLocation)
}()

if err := d.knapsack.BboltDB().View(func(tx *bbolt.Tx) error {
zackattack01 marked this conversation as resolved.
Show resolved Hide resolved
return tx.CopyFile(tempBackupLocation, 0600)
}); err != nil {
return fmt.Errorf("backing up database: %w", err)
}

// Confirm file exists and is nonempty
if exists, err := nonEmptyFileExists(tempBackupLocation); !exists {
return fmt.Errorf("backup succeeded, but nonempty file does not exist at %s", tempBackupLocation)
} else if err != nil {
return fmt.Errorf("backup succeeded, but error checking if file was created at %s: %w", tempBackupLocation, err)
}

// Perform rotation of older backups so we can move this backup to `backupLocation`
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
if err := d.rotate(); err != nil {
return fmt.Errorf("backup succeeded, but rotation did not: %w", err)
}

if err := os.Rename(tempBackupLocation, backupLocation); err != nil {
return fmt.Errorf("renaming temp backup %s to %s after rotation: %w", tempBackupLocation, backupLocation, err)
}

// Log success
d.slogger.Log(context.TODO(), slog.LevelDebug,
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
"took backup",
"backup_location", backupLocation,
)

return nil
}

func (d *databaseBackupSaver) rotate() error {
baseBackupPath := BackupLauncherDbLocation(d.knapsack.RootDirectory())

for i := numberOfOldBackupsToRetain; i > 0; i -= 1 {
currentBackupPath := fmt.Sprintf("%s.%d", baseBackupPath, i)

// This backup doesn't exist yet -- skip it
if _, err := os.Stat(currentBackupPath); err != nil && os.IsNotExist(err) {
continue
}

// If is the oldest backup, delete it so we can rotate a new one into its place
if i == numberOfOldBackupsToRetain {
if err := os.Remove(currentBackupPath); err != nil {
return fmt.Errorf("removing oldest backup %s during rotation: %w", currentBackupPath, err)
}
continue
}
RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved

// Rename from launcher.db.bak.<n> to launcher.db.bak.<n+1>
olderBackupPath := fmt.Sprintf("%s.%d", baseBackupPath, i+1)
if err := os.Rename(currentBackupPath, olderBackupPath); err != nil {
return fmt.Errorf("renaming %s to %s during rotation: %w", currentBackupPath, olderBackupPath, err)
}
}

if err := os.Rename(baseBackupPath, fmt.Sprintf("%s.1", baseBackupPath)); err != nil {
return fmt.Errorf("rotating %s: %w", baseBackupPath, err)
}

return nil
}

// UseBackupDbIfNeeded falls back to the backup database IFF the original database does not exist
// (or is empty) and the backup does. In this case, it renames the backup database to the expected
// filename launcher.db.
//
// The function never returns an error: every outcome is reported through slogger. If the existence
// of either file cannot be determined, nothing is renamed.
func UseBackupDbIfNeeded(rootDir string, slogger *slog.Logger) {
	// Check first to see if the regular database exists
	originalDbLocation := LauncherDbLocation(rootDir)
	originalDbExists, err := nonEmptyFileExists(originalDbLocation)
	if err != nil {
		// Can't determine whether the db exists -- err on the side of not replacing it
		slogger.Log(context.TODO(), slog.LevelWarn,
			"could not determine whether original launcher db exists, not going to use backup",
			"err", err,
		)
		return
	}
	if originalDbExists {
		// DB exists -- we should use that
		slogger.Log(context.TODO(), slog.LevelDebug,
			"launcher.db exists, no need to use backup",
			"db_location", originalDbLocation,
		)
		return
	}

	// Launcher DB doesn't exist -- check to see if the backup does.
	// The error is checked before the boolean: nonEmptyFileExists reports false
	// alongside any error, so the reverse order would mislabel a failed check
	// as a fresh install.
	backupLocation := BackupLauncherDbLocation(rootDir)
	backupDbExists, err := nonEmptyFileExists(backupLocation)
	if err != nil {
		// Couldn't determine if the backup DB exists -- let launcher create a new DB instead.
		slogger.Log(context.TODO(), slog.LevelWarn,
			"could not determine whether backup launcher db exists, not going to use backup",
			"err", err,
		)
		return
	}
	if !backupDbExists {
		// Backup DB doesn't exist either -- this is likely a fresh install.
		// Nothing to do here; launcher should create a new DB.
		slogger.Log(context.TODO(), slog.LevelInfo,
			"both launcher db and backup db do not exist -- likely a fresh install",
		)
		return
	}

	// The backup database exists, and the original one does not. Rename the backup
	// to the original so we can use it.
	if err := os.Rename(backupLocation, originalDbLocation); err != nil {
		slogger.Log(context.TODO(), slog.LevelWarn,
			"could not rename backup db",
			"backup_location", backupLocation,
			"original_location", originalDbLocation,
			"err", err,
		)
		return
	}
	slogger.Log(context.TODO(), slog.LevelInfo,
		"original db does not exist and backup does -- using backup db",
		"backup_location", backupLocation,
		"original_location", originalDbLocation,
	)
}

// LauncherDbLocation returns the path of the launcher database file,
// launcher.db, inside rootDir.
func LauncherDbLocation(rootDir string) string {
	const dbFileName = "launcher.db"
	return filepath.Join(rootDir, dbFileName)
}

// BackupLauncherDbLocation returns the path of the launcher database backup file,
// launcher.db.bak, inside rootDir.
func BackupLauncherDbLocation(rootDir string) string {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hate to say this, but we probably need to keep N and rotate. Otherwise we can probably get inconsistency if there's a reader and writer. :|

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't super worried about that because of how the access remains relatively restricted in this implementation --

  • On launcher startup, before any rungroups execute, launcher may rename launcher.db.bak to launcher.db; nothing else is accessing either db at this time
  • After rungroups begin to execute, this actor is the only thing touching launcher.db.bak -- nothing else will read from it or write to it
  • (Except for the flare checkup -- and data inconsistency probably isn't a huge deal there, and I'm not sure how much we even care about having that checkup anyway)
  • (There's also remote uninstall, but that just wants to delete the file)

Is there something I'm missing?

I'm happy to keep N and rotate if we feel like that's a safer option, regardless.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added rotation in 15183fa -- lmk what you're thinking!

RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
return filepath.Join(rootDir, "launcher.db.bak")
}

// nonEmptyFileExists reports whether path can be stat'ed and has a size
// greater than zero. A path that does not exist yields (false, nil); any
// other stat failure yields (false, err).
func nonEmptyFileExists(path string) (bool, error) {
	info, statErr := os.Stat(path)
	switch {
	case statErr == nil:
		return info.Size() > 0, nil
	case os.IsNotExist(statErr):
		// A missing file is an expected answer, not an error.
		return false, nil
	default:
		return false, statErr
	}
}
Loading
Loading