Skip to content

Commit

Permalink
Retry osquery instance launch until successful or shutdown requested (k…
Browse files Browse the repository at this point in the history
  • Loading branch information
RebeccaMahany authored Nov 12, 2024
1 parent c6fe8b7 commit 34bfb61
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 17 deletions.
64 changes: 51 additions & 13 deletions pkg/osquery/runtime/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"log/slog"
"sync"
"time"

"github.com/kolide/launcher/ee/agent/flags/keys"
"github.com/kolide/launcher/ee/agent/types"
Expand All @@ -15,6 +16,8 @@ import (

const (
defaultRegistrationId = "default"

launchRetryDelay = 30 * time.Second
)

type Runner struct {
Expand Down Expand Up @@ -91,18 +94,14 @@ func (r *Runner) Run() error {
func (r *Runner) runInstance(registrationId string) error {
slogger := r.slogger.With("registration_id", registrationId)

// First, launch the instance. Ensure we don't try to restart before launch is complete.
r.instanceLock.Lock()
instance := newInstance(registrationId, r.knapsack, r.serviceClient, r.opts...)
if err := instance.Launch(); err != nil {
r.instanceLock.Unlock()
// First, launch the instance.
instance, err := r.launchInstanceWithRetries(registrationId)
if err != nil {
// We only receive an error on launch if the runner has been shut down -- in that case,
// return now.
return fmt.Errorf("starting instance for %s: %w", registrationId, err)
}

// Now that the instance is running, we can add it to `r.instances` and remove the lock
r.instances[registrationId] = instance
r.instanceLock.Unlock()

// This loop restarts the instance as necessary. It exits when `Shutdown` is called,
// or if the instance exits and cannot be restarted.
for {
Expand All @@ -127,15 +126,54 @@ func (r *Runner) runInstance(registrationId string) error {
"err", err,
)

var launchErr error
instance, launchErr = r.launchInstanceWithRetries(registrationId)
if launchErr != nil {
// We only receive an error on launch if the runner has been shut down -- in that case,
// return now.
return fmt.Errorf("restarting instance for %s after unexpected exit: %w", registrationId, launchErr)
}
}
}

// launchInstanceWithRetries repeatedly tries to create and launch a new osquery instance.
// It will retry until it succeeds, or until the runner is shut down.
func (r *Runner) launchInstanceWithRetries(registrationId string) (*OsqueryInstance, error) {
for {
// Lock to ensure we don't try to restart before launch is complete.
r.instanceLock.Lock()
instance = newInstance(registrationId, r.knapsack, r.serviceClient, r.opts...)
r.instances[registrationId] = instance
if err := instance.Launch(); err != nil {
instance := newInstance(registrationId, r.knapsack, r.serviceClient, r.opts...)
err := instance.Launch()

// Success!
if err == nil {
// Now that the instance is running, we can add it to `r.instances` and remove the lock
r.instances[registrationId] = instance
r.instanceLock.Unlock()
return fmt.Errorf("could not restart osquery instance after unexpected exit: %w", err)

r.slogger.Log(context.TODO(), slog.LevelInfo,
"runner successfully launched instance",
"registration_id", registrationId,
)

return instance, nil
}

// Launching was not successful. Unlock, log the error, and wait to retry.
r.instanceLock.Unlock()
r.slogger.Log(context.TODO(), slog.LevelWarn,
"could not launch instance, will retry after delay",
"err", err,
"registration_id", registrationId,
)

select {
case <-r.shutdown:
return nil, fmt.Errorf("runner received shutdown, halting before successfully launching instance for %s", registrationId)
case <-time.After(launchRetryDelay):
// Continue to retry
continue
}
}
}

Expand Down
19 changes: 15 additions & 4 deletions pkg/osquery/runtime/runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,14 @@ func TestBadBinaryPath(t *testing.T) {
t.Parallel()
rootDirectory := t.TempDir()

logBytes, slogger, opts := setUpTestSlogger(rootDirectory)

k := typesMocks.NewKnapsack(t)
k.On("OsqueryHealthcheckStartupDelay").Return(0 * time.Second).Maybe()
k.On("WatchdogEnabled").Return(false)
k.On("Slogger").Return(multislogger.NewNopLogger())
k.On("RegisterChangeObserver", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything)
k.On("LatestOsquerydPath", mock.Anything).Return("")
k.On("Slogger").Return(slogger)
k.On("LatestOsquerydPath", mock.Anything).Return("") // bad binary path
k.On("RootDirectory").Return(rootDirectory).Maybe()
k.On("OsqueryVerbose").Return(true)
k.On("OsqueryFlags").Return([]string{})
Expand All @@ -133,8 +135,17 @@ func TestBadBinaryPath(t *testing.T) {
k.On("ReadEnrollSecret").Return("", nil).Maybe()
setUpMockStores(t, k)

runner := New(k, mockServiceClient())
assert.Error(t, runner.Run())
runner := New(k, mockServiceClient(), opts...)

// The runner will repeatedly try to launch the instance, so `Run`
// won't return an error until we shut it down. Kick off `Run`,
// wait a while, and confirm we can still shut down.
go runner.Run()
time.Sleep(2 * time.Second)
waitShutdown(t, runner, logBytes)

// Confirm we tried to launch the instance by examining the logs.
require.Contains(t, logBytes.String(), "could not launch instance, will retry after delay")

k.AssertExpectations(t)
}
Expand Down

0 comments on commit 34bfb61

Please sign in to comment.