Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry osquery instance launch until successful or shutdown requested #1952

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 51 additions & 13 deletions pkg/osquery/runtime/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"log/slog"
"sync"
"time"

"github.com/kolide/launcher/ee/agent/flags/keys"
"github.com/kolide/launcher/ee/agent/types"
Expand All @@ -15,6 +16,8 @@ import (

const (
defaultRegistrationId = "default"

launchRetryDelay = 30 * time.Second
)

type Runner struct {
Expand Down Expand Up @@ -91,18 +94,14 @@ func (r *Runner) Run() error {
func (r *Runner) runInstance(registrationId string) error {
slogger := r.slogger.With("registration_id", registrationId)

// First, launch the instance. Ensure we don't try to restart before launch is complete.
r.instanceLock.Lock()
instance := newInstance(registrationId, r.knapsack, r.serviceClient, r.opts...)
if err := instance.Launch(); err != nil {
r.instanceLock.Unlock()
// First, launch the instance.
instance, err := r.launchInstanceWithRetries(registrationId)
if err != nil {
// We only receive an error on launch if the runner has been shut down -- in that case,
// return now.
return fmt.Errorf("starting instance for %s: %w", registrationId, err)
}

// Now that the instance is running, we can add it to `r.instances` and remove the lock
r.instances[registrationId] = instance
r.instanceLock.Unlock()

// This loop restarts the instance as necessary. It exits when `Shutdown` is called,
// or if the instance exits and cannot be restarted.
for {
Expand All @@ -127,15 +126,54 @@ func (r *Runner) runInstance(registrationId string) error {
"err", err,
)

var launchErr error
instance, launchErr = r.launchInstanceWithRetries(registrationId)
if launchErr != nil {
// We only receive an error on launch if the runner has been shut down -- in that case,
// return now.
return fmt.Errorf("restarting instance for %s after unexpected exit: %w", registrationId, launchErr)
}
}
}

// launchInstanceWithRetries repeatedly tries to create and launch a new osquery instance.
// It will retry until it succeeds, or until the runner is shut down.
func (r *Runner) launchInstanceWithRetries(registrationId string) (*OsqueryInstance, error) {
for {
// Lock to ensure we don't try to restart before launch is complete.
r.instanceLock.Lock()
instance = newInstance(registrationId, r.knapsack, r.serviceClient, r.opts...)
r.instances[registrationId] = instance
if err := instance.Launch(); err != nil {
instance := newInstance(registrationId, r.knapsack, r.serviceClient, r.opts...)
err := instance.Launch()

// Success!
if err == nil {
// Now that the instance is running, we can add it to `r.instances` and remove the lock
r.instances[registrationId] = instance
r.instanceLock.Unlock()
return fmt.Errorf("could not restart osquery instance after unexpected exit: %w", err)

r.slogger.Log(context.TODO(), slog.LevelInfo,
"runner successfully launched instance",
"registration_id", registrationId,
)

return instance, nil
}

// Launching was not successful. Unlock, log the error, and wait to retry.
r.instanceLock.Unlock()
r.slogger.Log(context.TODO(), slog.LevelWarn,
"could not launch instance, will retry after delay",
"err", err,
"registration_id", registrationId,
)

select {
case <-r.shutdown:
return nil, fmt.Errorf("runner received shutdown, halting before successfully launching instance for %s", registrationId)
case <-time.After(launchRetryDelay):
// Continue to retry
continue
}
}
}

Expand Down
19 changes: 15 additions & 4 deletions pkg/osquery/runtime/runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,14 @@ func TestBadBinaryPath(t *testing.T) {
t.Parallel()
rootDirectory := t.TempDir()

logBytes, slogger, opts := setUpTestSlogger(rootDirectory)

k := typesMocks.NewKnapsack(t)
k.On("OsqueryHealthcheckStartupDelay").Return(0 * time.Second).Maybe()
k.On("WatchdogEnabled").Return(false)
k.On("Slogger").Return(multislogger.NewNopLogger())
k.On("RegisterChangeObserver", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything)
k.On("LatestOsquerydPath", mock.Anything).Return("")
k.On("Slogger").Return(slogger)
k.On("LatestOsquerydPath", mock.Anything).Return("") // bad binary path
k.On("RootDirectory").Return(rootDirectory).Maybe()
k.On("OsqueryVerbose").Return(true)
k.On("OsqueryFlags").Return([]string{})
Expand All @@ -133,8 +135,17 @@ func TestBadBinaryPath(t *testing.T) {
k.On("ReadEnrollSecret").Return("", nil).Maybe()
setUpMockStores(t, k)

runner := New(k, mockServiceClient())
assert.Error(t, runner.Run())
runner := New(k, mockServiceClient(), opts...)

// The runner will repeatedly try to launch the instance, so `Run`
// won't return an error until we shut it down. Kick off `Run`,
// wait a while, and confirm we can still shut down.
go runner.Run()
time.Sleep(2 * time.Second)
waitShutdown(t, runner, logBytes)

// Confirm we tried to launch the instance by examining the logs.
require.Contains(t, logBytes.String(), "could not launch instance, will retry after delay")

k.AssertExpectations(t)
}
Expand Down
Loading