Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Take and use backups of the launcher database #1755

Merged
merged 14 commits into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cmd/launcher/launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,9 @@ func runLauncher(ctx context.Context, cancel func(), multiSlogger, systemMultiSl
// this. Note that the timeout is documented as failing
// unimplemented on windows, though empirically it seems to
// work.
agentbbolt.UseBackupDbIfNeeded(rootDirectory, slogger)
boltOptions := &bbolt.Options{Timeout: time.Duration(30) * time.Second}
db, err := bbolt.Open(filepath.Join(rootDirectory, "launcher.db"), 0600, boltOptions)
db, err := bbolt.Open(agentbbolt.LauncherDbLocation(rootDirectory), 0600, boltOptions)
if err != nil {
return fmt.Errorf("open launcher db: %w", err)
}
Expand Down Expand Up @@ -259,6 +260,9 @@ func runLauncher(ctx context.Context, cancel func(), multiSlogger, systemMultiSl
// pickup
internal.RecordLauncherVersion(ctx, rootDirectory)

dbBackupSaver := agentbbolt.NewDatabaseBackupSaver(k)
runGroup.Add("dbBackupSaver", dbBackupSaver.Execute, dbBackupSaver.Interrupt)

// create the certificate pool
var rootPool *x509.CertPool
if k.RootPEM() != "" {
Expand Down
185 changes: 185 additions & 0 deletions ee/agent/storage/bbolt/backup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
package agentbbolt

import (
"context"
"fmt"
"log/slog"
"os"
"path/filepath"
"time"

"github.com/kolide/launcher/ee/agent/types"
"go.etcd.io/bbolt"
)

// Timing for the periodic launcher.db backups.
const (
	// backupInterval is the period of the ticker that triggers each backup after the first.
	backupInterval = time.Hour
	// backupInitialDelay is how long Execute waits after starting before taking the first backup.
	backupInitialDelay = 10 * time.Minute
)

// databaseBackupSaver periodically takes backups of launcher.db.
// It exposes an Execute/Interrupt pair: Execute runs the backup loop and
// Interrupt asks that loop to stop.
type databaseBackupSaver struct {
// knapsack supplies the root directory and the bbolt database handle used when taking a backup.
knapsack types.Knapsack
// slogger is the knapsack's logger with component=database_backup_saver attached.
slogger *slog.Logger
// interrupt carries the stop signal from Interrupt to Execute; the constructor gives it a buffer of one.
interrupt chan struct{}
// interrupted is set by the first Interrupt call so that later calls return without signalling again.
interrupted bool
}

// NewDatabaseBackupSaver builds a databaseBackupSaver around the given knapsack.
// The saver logs through the knapsack's slogger, tagged with its component name,
// and owns a one-slot interrupt channel used to stop Execute.
func NewDatabaseBackupSaver(k types.Knapsack) *databaseBackupSaver {
	saver := new(databaseBackupSaver)
	saver.knapsack = k
	saver.slogger = k.Slogger().With("component", "database_backup_saver")
	saver.interrupt = make(chan struct{}, 1)
	return saver
}

func (d *databaseBackupSaver) Execute() error {
// Wait a little bit after startup before taking first backup, to allow for enrollment
directionless marked this conversation as resolved.
Show resolved Hide resolved
select {
case <-d.interrupt:
d.slogger.Log(context.TODO(), slog.LevelDebug,
"received external interrupt during initial delay, stopping",
)
return nil
case <-time.After(backupInitialDelay):
break
}

// Take periodic backups
ticker := time.NewTicker(backupInterval)
defer ticker.Stop()
for {
if err := d.backupDb(); err != nil {
d.slogger.Log(context.TODO(), slog.LevelWarn,
"could not perform periodic database backup",
"err", err,
)
}

select {
case <-ticker.C:
continue
case <-d.interrupt:
d.slogger.Log(context.TODO(), slog.LevelDebug,
"interrupt received, exiting execute loop",
)
return nil
}
}
}

// Interrupt asks Execute to stop. The error argument is ignored. Only the
// first call delivers the stop signal; later calls return immediately.
//
// NOTE(review): interrupted is a plain bool, so concurrent callers can race on
// it -- consider an atomic flag on the struct. The non-blocking send below
// keeps Interrupt from hanging even if two callers both get past the guard.
func (d *databaseBackupSaver) Interrupt(_ error) {
	// Only perform shutdown tasks on first call to interrupt -- no need to repeat on potential extra calls.
	if d.interrupted {
		return
	}
	d.interrupted = true

	// The channel has a one-slot buffer, so the first signal always fits. If the
	// slot is already occupied a stop signal is pending and there is nothing more
	// to deliver -- never block the caller.
	select {
	case d.interrupt <- struct{}{}:
	default:
	}
}

func (d *databaseBackupSaver) backupDb() error {
// Take backup -- it's fine to just overwrite previous backups
backupLocation := BackupLauncherDbLocation(d.knapsack.RootDirectory())
if err := d.knapsack.BboltDB().View(func(tx *bbolt.Tx) error {
zackattack01 marked this conversation as resolved.
Show resolved Hide resolved
return tx.CopyFile(backupLocation, 0600)
directionless marked this conversation as resolved.
Show resolved Hide resolved
}); err != nil {
return fmt.Errorf("backing up database: %w", err)
}

// Confirm file exists and is nonempty
if exists, err := nonEmptyFileExists(backupLocation); !exists {
return fmt.Errorf("backup succeeded, but nonempty file does not exist at %s", backupLocation)
} else if err != nil {
return fmt.Errorf("backup succeeded, but error checking if file was created at %s: %w", backupLocation, err)
} else {
// Log success
d.slogger.Log(context.TODO(), slog.LevelDebug,
"took backup",
"backup_location", backupLocation,
)
}

return nil
}

// UseBackupDbIfNeeded falls back to the backup database IFF the original database does not exist
// and the backup does. In this case, it renames the backup database to the expected filename
// launcher.db.
//
// "Exists" here means a nonempty file is present, so a zero-byte launcher.db is
// treated as missing. If either file cannot be checked, or the rename fails,
// the problem is logged at warn level and nothing is changed. The function
// returns nothing; every outcome is reported through slogger.
func UseBackupDbIfNeeded(rootDir string, slogger *slog.Logger) {
	// Check first to see if the regular database exists
	originalDbLocation := LauncherDbLocation(rootDir)
	originalDbExists, err := nonEmptyFileExists(originalDbLocation)
	if err != nil {
		// Can't determine whether the db exists -- err on the side of not replacing it
		slogger.Log(context.TODO(), slog.LevelWarn,
			"could not determine whether original launcher db exists, not going to use backup",
			"err", err,
		)
		return
	}
	if originalDbExists {
		// DB exists -- we should use that
		slogger.Log(context.TODO(), slog.LevelDebug,
			"launcher.db exists, no need to use backup",
			"db_location", originalDbLocation,
		)
		return
	}

	// Launcher DB doesn't exist -- check to see if the backup does.
	// The error is checked before the boolean: nonEmptyFileExists reports false
	// alongside any error, so checking the boolean first would log a stat
	// failure as a fresh install.
	backupLocation := BackupLauncherDbLocation(rootDir)
	backupDbExists, err := nonEmptyFileExists(backupLocation)
	if err != nil {
		// Couldn't determine if the backup DB exists -- let launcher create a new DB instead.
		slogger.Log(context.TODO(), slog.LevelWarn,
			"could not determine whether backup launcher db exists, not going to use backup",
			"err", err,
		)
		return
	}
	if !backupDbExists {
		// Backup DB doesn't exist either -- this is likely a fresh install.
		// Nothing to do here; launcher should create a new DB.
		slogger.Log(context.TODO(), slog.LevelInfo,
			"both launcher db and backup db do not exist -- likely a fresh install",
		)
		return
	}

	// The backup database exists, and the original one does not. Rename the backup
	// to the original so we can use it.
	if err := os.Rename(backupLocation, originalDbLocation); err != nil {
		slogger.Log(context.TODO(), slog.LevelWarn,
			"could not rename backup db",
			"backup_location", backupLocation,
			"original_location", originalDbLocation,
			"err", err,
		)
		return
	}
	slogger.Log(context.TODO(), slog.LevelInfo,
		"original db does not exist and backup does -- using backup db",
		"backup_location", backupLocation,
		"original_location", originalDbLocation,
	)
}

// LauncherDbLocation returns the path of the launcher database file,
// launcher.db, inside rootDir.
func LauncherDbLocation(rootDir string) string {
	const dbFileName = "launcher.db"
	return filepath.Join(rootDir, dbFileName)
}

// BackupLauncherDbLocation returns the path of the backup database file,
// launcher.db.bak, inside rootDir.
func BackupLauncherDbLocation(rootDir string) string {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hate to say this, but we probably need to keep N and rotate. Otherwise we can probably get inconsistency if there's a reader and writer. :|

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wasn't super worried about that because of how the access remains relatively restricted in this implementation --

  • On launcher startup, before any rungroups execute, launcher may rename launcher.db.bak to launcher.db; nothing else is accessing either db at this time
  • After rungroups begin to execute, this actor is the only thing touching launcher.db.bak -- nothing else will read from it or write to it
  • (Except for the flare checkup -- and data inconsistency probably isn't a huge deal there, and I'm not sure how much we even care about having that checkup anyway)
  • (There's also remote uninstall, but that just wants to delete the file)

Is there something I'm missing?

I'm happy to keep N and rotate if we feel like that's a safer option, regardless.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added rotation in 15183fa -- lmk what you're thinking!

RebeccaMahany marked this conversation as resolved.
Show resolved Hide resolved
return filepath.Join(rootDir, "launcher.db.bak")
}

// nonEmptyFileExists reports whether path can be stat'ed and has a size greater
// than zero. A path that does not exist yields (false, nil); any other stat
// failure yields (false, err).
func nonEmptyFileExists(path string) (bool, error) {
	info, statErr := os.Stat(path)
	switch {
	case statErr == nil:
		return info.Size() > 0, nil
	case os.IsNotExist(statErr):
		// A missing file is an expected outcome, not an error.
		return false, nil
	default:
		return false, statErr
	}
}
150 changes: 150 additions & 0 deletions ee/agent/storage/bbolt/backup_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
package agentbbolt

import (
"errors"
"os"
"testing"
"time"

typesmocks "github.com/kolide/launcher/ee/agent/types/mocks"
"github.com/kolide/launcher/pkg/log/multislogger"
"github.com/stretchr/testify/require"
"go.etcd.io/bbolt"
)

// TestUseBackupDbIfNeeded covers all four combinations of the original and
// backup databases being present. It checks that the backup is renamed into
// place only when the original is missing and the backup exists, and that the
// files are otherwise left exactly as they were.
func TestUseBackupDbIfNeeded(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name                string
		originalDbExists    bool
		backupDbExists      bool
		shouldPerformRename bool
	}

	testCases := []testCase{
		{
			name:                "original exists, backup exists, should use original",
			originalDbExists:    true,
			backupDbExists:      true,
			shouldPerformRename: false,
		},
		{
			name:                "original exists, backup does not exist, should use original",
			originalDbExists:    true,
			backupDbExists:      false,
			shouldPerformRename: false,
		},
		{
			name:                "original does not exist, backup exists, should use backup",
			originalDbExists:    false,
			backupDbExists:      true,
			shouldPerformRename: true,
		},
		{
			name:                "original does not exist, backup does not exist, should use (new) original",
			originalDbExists:    false,
			backupDbExists:      false,
			shouldPerformRename: false,
		},
	}

	for _, tc := range testCases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			// Lay out the starting state in a fresh root directory.
			rootDir := t.TempDir()
			originalPath := LauncherDbLocation(rootDir)
			backupPath := BackupLauncherDbLocation(rootDir)
			if tc.originalDbExists {
				createNonEmptyBboltDb(t, originalPath)
			}
			if tc.backupDbExists {
				createNonEmptyBboltDb(t, backupPath)
			}

			// Exercise the function under test.
			UseBackupDbIfNeeded(rootDir, multislogger.NewNopLogger())

			if tc.shouldPerformRename {
				// The backup must be gone...
				_, backupErr := os.Stat(backupPath)
				require.Error(t, backupErr, "should not be able to stat launcher.db.bak since it should have been renamed")
				require.True(t, os.IsNotExist(backupErr), "checking that launcher.db.bak does not exist, and error is not ErrNotExist")

				// ...and the original must now be present.
				_, originalErr := os.Stat(originalPath)
				require.NoError(t, originalErr, "checking if launcher.db exists")
				return
			}

			// No rename expected: each file should be present exactly when it was
			// present at the start.
			requireUnchanged := func(path string, existedBefore bool, existsMsg string, missingMsg string) {
				_, statErr := os.Stat(path)
				if existedBefore {
					require.NoError(t, statErr, existsMsg)
					return
				}
				require.True(t, os.IsNotExist(statErr), missingMsg)
			}

			requireUnchanged(originalPath, tc.originalDbExists,
				"checking if launcher.db exists",
				"checking that launcher.db does not exist, and error is not ErrNotExist",
			)
			requireUnchanged(backupPath, tc.backupDbExists,
				"checking if launcher.db.bak exists",
				"checking that launcher.db.bak does not exist, and error is not ErrNotExist",
			)
		})
	}
}

// createNonEmptyBboltDb is a test helper that creates a bbolt database file at
// dbFileLocation by opening and then closing it, and returns the modification
// time of the resulting file. Any failure aborts the calling test.
func createNonEmptyBboltDb(t *testing.T, dbFileLocation string) time.Time {
	// Attribute failures to the calling test's line rather than to this helper.
	t.Helper()

	boltOptions := &bbolt.Options{Timeout: time.Duration(5) * time.Second}
	db, err := bbolt.Open(dbFileLocation, 0600, boltOptions)
	require.NoError(t, err, "creating db")
	require.NoError(t, db.Close(), "closing db")

	fi, err := os.Stat(dbFileLocation)
	require.NoError(t, err, "statting db")

	return fi.ModTime()
}

// TestInterrupt_Multiple starts the saver, interrupts it once, and then checks
// that several further concurrent Interrupt calls all return promptly instead
// of blocking.
func TestInterrupt_Multiple(t *testing.T) {
	t.Parallel()

	mockKnapsack := typesmocks.NewKnapsack(t)
	mockKnapsack.On("Slogger").Return(multislogger.NewNopLogger())

	saver := NewDatabaseBackupSaver(mockKnapsack)

	// Kick off the run loop, then deliver the first interrupt.
	go saver.Execute()
	saver.Interrupt(errors.New("test error"))

	// Fire additional interrupts concurrently; each goroutine reports back once
	// its call has returned.
	expectedInterrupts := 3
	done := make(chan struct{})
	for n := 0; n < expectedInterrupts; n++ {
		go func() {
			saver.Interrupt(nil)
			done <- struct{}{}
		}()
	}

	// Collect the completions, allowing up to five seconds for each one.
	receivedInterrupts := 0
	for receivedInterrupts < expectedInterrupts {
		select {
		case <-done:
			receivedInterrupts++
		case <-time.After(5 * time.Second):
			t.Fatalf("could not call interrupt multiple times and return within 5 seconds -- received %d interrupts before timeout", receivedInterrupts)
		}
	}

	require.Equal(t, expectedInterrupts, receivedInterrupts)
}
Loading
Loading