Skip to content

Commit

Permalink
Add rungroup actor to take periodic backups of the launcher database
Browse files Browse the repository at this point in the history
  • Loading branch information
RebeccaMahany committed Jun 14, 2024
1 parent aefa457 commit 6e21a3d
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 0 deletions.
3 changes: 3 additions & 0 deletions cmd/launcher/launcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,9 @@ func runLauncher(ctx context.Context, cancel func(), multiSlogger, systemMultiSl
// pickup
internal.RecordLauncherVersion(ctx, rootDirectory)

p := agentbbolt.NewDatabasePhotographer(k)
runGroup.Add("databasePhotographer", p.Execute, p.Interrupt)

// create the certificate pool
var rootPool *x509.CertPool
if k.RootPEM() != "" {
Expand Down
118 changes: 118 additions & 0 deletions ee/agent/storage/bbolt/backup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package agentbbolt

import (
"context"
"fmt"
"log/slog"
"os"
"path/filepath"
"time"

"github.com/kolide/launcher/ee/agent/types"
"go.etcd.io/bbolt"
)

// Timing for the periodic database snapshots taken by photographer.Execute.
const (
	// snapshotInitialDelay is how long Execute waits after startup before
	// taking the first snapshot, to allow for enrollment.
	snapshotInitialDelay = 10 * time.Minute

	// snapshotInterval is the period of the ticker that triggers each
	// subsequent snapshot.
	snapshotInterval = time.Hour
)

// A photographer takes snapshots: it periodically copies the launcher
// database to a backup file (see backupDb). Its Execute and Interrupt
// methods are what get registered with the launcher run group.
// TODO RM - A better name.
type photographer struct {
// knapsack supplies the root directory and the bbolt database handle used
// when taking a backup, as well as the base logger.
knapsack types.Knapsack
// slogger is the knapsack logger tagged with component=database_photographer.
slogger *slog.Logger
// interrupt carries the stop signal to Execute; NewDatabasePhotographer
// creates it with a buffer of 1.
interrupt chan struct{}
// interrupted marks that an interrupt has already been requested.
// NOTE(review): this is a plain bool with no mutex/atomic guarding it, so it
// is only safe if it is never touched from concurrent goroutines.
interrupted bool
}

// NewDatabasePhotographer builds a photographer backed by the given knapsack.
// The returned value logs through the knapsack's slogger (tagged with
// component=database_photographer) and owns an interrupt channel with a
// buffer of one, so a single interrupt signal can be queued without a
// receiver being ready.
func NewDatabasePhotographer(k types.Knapsack) *photographer {
	logger := k.Slogger().With("component", "database_photographer")
	stop := make(chan struct{}, 1)

	return &photographer{
		knapsack:  k,
		slogger:   logger,
		interrupt: stop,
	}
}

// Execute runs the backup loop until an interrupt is received. It first waits
// snapshotInitialDelay, then takes a backup immediately and again on every
// snapshotInterval tick. Backup failures are logged at warn level and do not
// stop the loop. Execute always returns nil.
func (p *photographer) Execute() error {
	// Wait a little bit after startup before taking first snapshot, to allow for enrollment.
	// An explicit timer (rather than time.After) lets us stop and release it
	// right away if the interrupt arrives during the delay.
	initialDelay := time.NewTimer(snapshotInitialDelay)
	defer initialDelay.Stop()

	select {
	case <-p.interrupt:
		p.slogger.Log(context.TODO(), slog.LevelDebug,
			"received external interrupt during initial delay, stopping",
		)
		return nil
	case <-initialDelay.C:
		// Initial delay elapsed -- fall through to the periodic loop.
	}

	// Take periodic snapshots
	ticker := time.NewTicker(snapshotInterval)
	defer ticker.Stop()
	for {
		// The backup runs at the top of the loop so the first snapshot is taken
		// as soon as the initial delay is over, not one interval later.
		if err := p.backupDb(); err != nil {
			p.slogger.Log(context.TODO(), slog.LevelWarn,
				"could not perform periodic database backup",
				"err", err,
			)
		}

		select {
		case <-ticker.C:
			continue
		case <-p.interrupt:
			p.slogger.Log(context.TODO(), slog.LevelDebug,
				"interrupt received, exiting execute loop",
			)
			return nil
		}
	}
}

// Interrupt asks Execute to stop. It never blocks and is safe to call any
// number of times, including from multiple goroutines at once. The error
// argument is ignored.
func (p *photographer) Interrupt(_ error) {
	// Use a non-blocking send instead of checking the plain-bool interrupted
	// field: reading and writing that field from concurrent callers is a data
	// race. The interrupt channel is created with a buffer of one, so the first
	// call always queues the signal; any call that finds the buffer already
	// full simply falls through. A signal queued after Execute has exited is
	// harmless -- nothing reads it.
	select {
	case p.interrupt <- struct{}{}:
	default:
	}
}

// backupDb copies the launcher database to the backup location under the
// knapsack's root directory, then checks that the resulting file exists and
// is nonempty. It returns an error if the copy fails or if either check
// fails; on success it logs the backup location at debug level.
func (p *photographer) backupDb() error {
	dest := BackupLauncherDbLocation(p.knapsack.RootDirectory())

	// Copy the database from inside a View transaction -- overwriting any
	// previous backup is fine.
	copyDb := func(tx *bbolt.Tx) error {
		return tx.CopyFile(dest, 0600)
	}
	if err := p.knapsack.BboltDB().View(copyDb); err != nil {
		return fmt.Errorf("backing up database: %w", err)
	}

	// Sanity-check the result: the file must be there and must have content.
	info, err := os.Stat(dest)
	switch {
	case os.IsNotExist(err):
		return fmt.Errorf("backup succeeded, but no file at backup location %s", dest)
	case err != nil:
		return fmt.Errorf("checking %s exists after taking backup: %w", dest, err)
	case info.Size() <= 0:
		return fmt.Errorf("backup succeeded, but backup database at %s is empty", dest)
	}

	p.slogger.Log(context.TODO(), slog.LevelDebug,
		"took backup",
		"backup_location", dest,
	)

	return nil
}

// LauncherDbLocation returns the path of the launcher database file,
// launcher.db, inside rootDir.
func LauncherDbLocation(rootDir string) string {
	const dbFileName = "launcher.db"
	return filepath.Join(rootDir, dbFileName)
}

// BackupLauncherDbLocation returns the path of the launcher database backup
// file, launcher.db.bak, inside rootDir.
func BackupLauncherDbLocation(rootDir string) string {
	const backupFileName = "launcher.db.bak"
	return filepath.Join(rootDir, backupFileName)
}
52 changes: 52 additions & 0 deletions ee/agent/storage/bbolt/backup_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package agentbbolt

import (
"errors"
"testing"
"time"

typesmocks "github.com/kolide/launcher/ee/agent/types/mocks"
"github.com/kolide/launcher/pkg/log/multislogger"
"github.com/stretchr/testify/require"
)

// TestInterrupt_Multiple checks that, once Execute has been started and
// interrupted, further calls to Interrupt all return promptly instead of
// blocking.
func TestInterrupt_Multiple(t *testing.T) {
	t.Parallel()

	mockKnapsack := typesmocks.NewKnapsack(t)
	mockKnapsack.On("Slogger").Return(multislogger.NewNopLogger())

	p := NewDatabasePhotographer(mockKnapsack)

	// Kick off the actor, then immediately interrupt it.
	go p.Execute()
	p.Interrupt(errors.New("test error"))

	// Fire several more interrupts concurrently; each goroutine reports back
	// once its Interrupt call has returned.
	expectedInterrupts := 3
	interruptComplete := make(chan struct{})
	for i := 0; i < expectedInterrupts; i++ {
		go func() {
			p.Interrupt(nil)
			interruptComplete <- struct{}{}
		}()
	}

	// Collect the completions, failing if any one takes longer than 5 seconds.
	receivedInterrupts := 0
	for receivedInterrupts < expectedInterrupts {
		select {
		case <-interruptComplete:
			receivedInterrupts++
		case <-time.After(5 * time.Second):
			t.Fatalf("could not call interrupt multiple times and return within 5 seconds -- received %d interrupts before timeout", receivedInterrupts)
		}
	}

	require.Equal(t, expectedInterrupts, receivedInterrupts)
}

0 comments on commit 6e21a3d

Please sign in to comment.