From a833edf269874c925d3f09310b869af7e56f05d4 Mon Sep 17 00:00:00 2001 From: RebeccaMahany Date: Tue, 17 Dec 2024 15:45:23 -0500 Subject: [PATCH 01/13] Check for and clean up old osquery database lock files before starting process --- pkg/osquery/runtime/osqueryinstance.go | 37 +++++ .../runtime/osqueryinstance_other_test.go | 20 +++ pkg/osquery/runtime/osqueryinstance_test.go | 137 ++++++++++++++++++ .../runtime/osqueryinstance_windows_test.go | 40 +++++ 4 files changed, 234 insertions(+) create mode 100644 pkg/osquery/runtime/osqueryinstance_other_test.go diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index 0ecb3ad4d..61f3d0f97 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -249,6 +249,12 @@ func (i *OsqueryInstance) Launch() error { return fmt.Errorf("could not calculate osquery file paths: %w", err) } + // Make sure no lock files exist that could prevent osquery from starting up + if err := i.cleanUpOldDatabaseLock(ctx, paths); err != nil { + traces.SetError(span, fmt.Errorf("cleaning up old database lock: %w", err)) + return fmt.Errorf("cleaning up old database lock: %w", err) + } + // Populate augeas lenses, if requested if i.opts.augeasLensFunc != nil { if err := os.MkdirAll(paths.augeasPath, 0755); err != nil { @@ -702,6 +708,37 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st return osqueryFilePaths, nil } +// cleanUpOldDatabaseLock checks for the presence of a lock file in the given database and removes it +// if it exists. The lock file can hang around if an osquery process isn't properly terminated, and +// will prevent us from launching a new osquery process. +func (i *OsqueryInstance) cleanUpOldDatabaseLock(ctx context.Context, paths *osqueryFilePaths) error { + lockFilePath := filepath.Join(paths.databasePath, "LOCK") + + lockFileInfo, err := os.Stat(lockFilePath) + if os.IsNotExist(err) { + // No lock file, nothing to do here + return nil + } + + if err := os.Remove(lockFilePath); err != nil { + i.slogger.Log(ctx, slog.LevelInfo, + "detected old lock file but could not remove it", + "lockfile_path", lockFilePath, + "lockfile_modtime", lockFileInfo.ModTime().String(), + "err", err, + ) + return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err) + } + + i.slogger.Log(ctx, slog.LevelInfo, + "detected and removed old osquery db lock file", + "lockfile_path", lockFilePath, + "lockfile_modtime", lockFileInfo.ModTime().String(), + ) + + return nil +} + // createOsquerydCommand uses osqueryOptions to return an *exec.Cmd // which will launch a properly configured osqueryd process. func (i *OsqueryInstance) createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths) (*exec.Cmd, error) { diff --git a/pkg/osquery/runtime/osqueryinstance_other_test.go b/pkg/osquery/runtime/osqueryinstance_other_test.go new file mode 100644 index 000000000..7ce670ba9 --- /dev/null +++ b/pkg/osquery/runtime/osqueryinstance_other_test.go @@ -0,0 +1,20 @@ +//go:build !windows +// +build !windows + +package runtime + +import ( + "syscall" + "testing" + + "github.com/stretchr/testify/require" +) + +func createLockFile(t *testing.T, fileToLock string) { + lockFile, err := syscall.Open(fileToLock, syscall.O_CREAT|syscall.O_RDONLY, 0600) + require.NoError(t, err) + require.NoError(t, syscall.Flock(lockFile, syscall.LOCK_EX|syscall.LOCK_NB)) + t.Cleanup(func() { + syscall.Close(lockFile) + }) +} diff --git a/pkg/osquery/runtime/osqueryinstance_test.go b/pkg/osquery/runtime/osqueryinstance_test.go index 1a6a98478..6df7e6ec3 100644 --- a/pkg/osquery/runtime/osqueryinstance_test.go +++ b/pkg/osquery/runtime/osqueryinstance_test.go @@ -2,7 +2,9 @@ package runtime import ( "context" + "errors" "fmt" + "os" "path/filepath" "runtime" "strings" @@ -12,8 +14,10 @@ import ( "github.com/kolide/kit/ulid" "github.com/kolide/launcher/ee/agent/types" typesMocks "github.com/kolide/launcher/ee/agent/types/mocks" + "github.com/kolide/launcher/pkg/backoff" "github.com/kolide/launcher/pkg/log/multislogger" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" ) @@ -235,3 +239,136 @@ func Test_healthcheckWithRetries(t *testing.T) { // No client available, so healthcheck should fail despite retries require.Error(t, i.healthcheckWithRetries(context.TODO(), 5, 100*time.Millisecond)) } + +func TestLaunch(t *testing.T) { + t.Parallel() + + logBytes, slogger := setUpTestSlogger() + rootDirectory := testRootDirectory(t) + + k := typesMocks.NewKnapsack(t) + k.On("WatchdogEnabled").Return(true) + k.On("WatchdogMemoryLimitMB").Return(150) + k.On("WatchdogUtilizationLimitPercent").Return(20) + k.On("WatchdogDelaySec").Return(120) + k.On("OsqueryFlags").Return([]string{"verbose=true"}) + k.On("OsqueryVerbose").Return(true) + k.On("Slogger").Return(slogger) + k.On("RootDirectory").Return(rootDirectory) + k.On("LoggingInterval").Return(1 * time.Second) + k.On("LogMaxBytesPerBatch").Return(500) + k.On("Transport").Return("jsonrpc") + setUpMockStores(t, k) + k.On("ReadEnrollSecret").Return("", nil) + k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory) + k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second) + + i := newInstance(types.DefaultRegistrationID, k, mockServiceClient()) + go i.Launch() + + // Wait for the instance to become healthy + require.NoError(t, backoff.WaitFor(func() error { + // Instance self-reports as healthy + if err := i.Healthy(); err != nil { + return fmt.Errorf("instance not healthy: %w", err) + } + + // Confirm instance setup is complete + if i.stats == nil || i.stats.ConnectTime == "" { + return errors.New("no connect time set yet") + } + + // Good to go + return nil + }, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String())) + + // Now wait for full shutdown + i.BeginShutdown() + shutdownErr := make(chan error) + go func() { + shutdownErr <- i.WaitShutdown() + }() + + select { + case err := <-shutdownErr: + require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String())) + case <-time.After(1 * time.Minute): + t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String())) + t.FailNow() + } + + k.AssertExpectations(t) +} + +func TestLaunch_WithLockFileCleanup(t *testing.T) { + t.Parallel() + + logBytes, slogger := setUpTestSlogger() + rootDirectory := testRootDirectory(t) + + k := typesMocks.NewKnapsack(t) + k.On("WatchdogEnabled").Return(true) + k.On("WatchdogMemoryLimitMB").Return(150) + k.On("WatchdogUtilizationLimitPercent").Return(20) + k.On("WatchdogDelaySec").Return(120) + k.On("OsqueryFlags").Return([]string{"verbose=true"}) + k.On("OsqueryVerbose").Return(true) + k.On("Slogger").Return(slogger) + k.On("RootDirectory").Return(rootDirectory) + k.On("LoggingInterval").Return(1 * time.Second) + k.On("LogMaxBytesPerBatch").Return(500) + k.On("Transport").Return("jsonrpc") + setUpMockStores(t, k) + k.On("ReadEnrollSecret").Return("", nil) + k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory) + k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second) + + i := newInstance(types.DefaultRegistrationID, k, mockServiceClient()) + + // Create a lock file that would ordinarily prevent osquery from starting up + lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK") + require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx + createLockFile(t, lockFilePath) + + // Start the instance + instanceStartTime := time.Now() + go i.Launch() + + // Wait for the instance to become healthy + require.NoError(t, backoff.WaitFor(func() error { + // Instance self-reports as healthy + if err := i.Healthy(); err != nil { + return fmt.Errorf("instance not healthy: %w", err) + } + + // Confirm instance setup is complete + if i.stats == nil || i.stats.ConnectTime == "" { + return errors.New("no connect time set yet") + } + + // Good to go + return nil + }, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String())) + + // Confirm that the lock file is new -- it should have a modtime after instanceStartTime + lockFileInfo, err := os.Stat(lockFilePath) + require.NoError(t, err) + require.True(t, lockFileInfo.ModTime().After(instanceStartTime), logBytes.String()) + + // Now wait for full shutdown + i.BeginShutdown() + shutdownErr := make(chan error) + go func() { + shutdownErr <- i.WaitShutdown() + }() + + select { + case err := <-shutdownErr: + require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String())) + case <-time.After(1 * time.Minute): + t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String())) + t.FailNow() + } + + k.AssertExpectations(t) +} diff --git a/pkg/osquery/runtime/osqueryinstance_windows_test.go b/pkg/osquery/runtime/osqueryinstance_windows_test.go index d97f00a62..101d9f0b6 100644 --- a/pkg/osquery/runtime/osqueryinstance_windows_test.go +++ b/pkg/osquery/runtime/osqueryinstance_windows_test.go @@ -5,7 +5,9 @@ package runtime import ( "strings" + "syscall" "testing" + "unsafe" "github.com/kolide/launcher/ee/agent/types" typesMocks "github.com/kolide/launcher/ee/agent/types/mocks" @@ -50,3 +52,41 @@ func TestCreateOsqueryCommandEnvVars(t *testing.T) { k.AssertExpectations(t) } + +func createLockFile(t *testing.T, fileToLock string) { + lockedFileName, err := syscall.UTF16PtrFromString(fileToLock) + require.NoError(t, err) + + handle, err := syscall.CreateFile( + lockedFileName, + syscall.GENERIC_READ, + syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE, + nil, + syscall.CREATE_ALWAYS, + syscall.FILE_ATTRIBUTE_NORMAL, + 0, + ) + require.NoError(t, err) + + t.Cleanup(func() { + syscall.Close(handle) + }) + + // See: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-lockfileex + modkernel32 := syscall.NewLazyDLL("kernel32.dll") + procLockFileEx := modkernel32.NewProc("LockFileEx") + overlapped := &syscall.Overlapped{ + HEvent: 0, + } + + r1, _, e1 := syscall.SyscallN( + procLockFileEx.Addr(), + uintptr(handle), + uintptr(2), // LOCKFILE_EXCLUSIVE_LOCK + uintptr(0), // Reserved parameter; must be set to zero. + uintptr(1), // nNumberOfBytesToLockLow -- the low-order 32 bits of the length of the byte range to lock. + uintptr(0), // nNumberOfBytesToLockHigh -- the high-order 32 bits of the length of the byte range to lock. + uintptr(unsafe.Pointer(overlapped)), + ) + require.False(t, r1 == 0, error(e1)) +} From 2c83441a6ef90b53b4f4f88d5614ce1d90758a8e Mon Sep 17 00:00:00 2001 From: RebeccaMahany Date: Tue, 17 Dec 2024 16:53:01 -0500 Subject: [PATCH 02/13] Retry removing lock file --- pkg/osquery/runtime/osqueryinstance.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index 61f3d0f97..457dde59e 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -720,7 +720,15 @@ func (i *OsqueryInstance) cleanUpOldDatabaseLock(ctx context.Context, paths *osq return nil } - if err := os.Remove(lockFilePath); err != nil { + // We do a couple retries here -- Windows complains about removing files in use sometimes. + err = backoff.WaitFor(func() error { + if err := os.Remove(lockFilePath); err != nil { + return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err) + } + return nil + }, 5*time.Second, 500*time.Millisecond) + + if err != nil { i.slogger.Log(ctx, slog.LevelInfo, "detected old lock file but could not remove it", "lockfile_path", lockFilePath, From 5ec73930e15c2966f75629f63755c0d3c9cec7ad Mon Sep 17 00:00:00 2001 From: RebeccaMahany Date: Wed, 18 Dec 2024 12:05:24 -0500 Subject: [PATCH 03/13] No remediate, only detect --- pkg/osquery/runtime/osqueryinstance.go | 46 ++++++-------- pkg/osquery/runtime/osqueryinstance_test.go | 68 +++++---------------- 2 files changed, 32 insertions(+), 82 deletions(-) diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index 457dde59e..aec79e99c 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -249,10 +249,13 @@ func (i *OsqueryInstance) Launch() error { return fmt.Errorf("could not calculate osquery file paths: %w", err) } - // Make sure no lock files exist that could prevent osquery from starting up - if err := i.cleanUpOldDatabaseLock(ctx, paths); err != nil { - traces.SetError(span, fmt.Errorf("cleaning up old database lock: %w", err)) - return fmt.Errorf("cleaning up old database lock: %w", err) + // Check to see whether lock files exist that could prevent osquery from starting up + if _, err := i.detectStaleDatabaseLock(ctx, paths); err != nil { + // A detection error shouldn't prevent us from creating the process -- log it and move on + i.slogger.Log(ctx, slog.LevelInfo, + "error detecting stale database lock", + "err", err, + ) } // Populate augeas lenses, if requested @@ -708,43 +711,28 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st return osqueryFilePaths, nil } -// cleanUpOldDatabaseLock checks for the presence of a lock file in the given database and removes it -// if it exists. The lock file can hang around if an osquery process isn't properly terminated, and -// will prevent us from launching a new osquery process. -func (i *OsqueryInstance) cleanUpOldDatabaseLock(ctx context.Context, paths *osqueryFilePaths) error { +// detectStaleDatabaseLock checks for the presence of a lock file in the given database. +// We have noticed the lock file occasionally hanging around -- we think this can happen +// when osquery doesn't get adequate time to shut down gracefully. The presence of the lock +// file will prevent osquery from starting up entirely. For now, we just detect this stale +// lock file, and log information about it. In the future we will attempt to remediate. +// See: https://github.com/kolide/launcher/issues/2004. +func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *osqueryFilePaths) (bool, error) { lockFilePath := filepath.Join(paths.databasePath, "LOCK") lockFileInfo, err := os.Stat(lockFilePath) if os.IsNotExist(err) { // No lock file, nothing to do here - return nil - } - - // We do a couple retries here -- Windows complains about removing files in use sometimes. - err = backoff.WaitFor(func() error { - if err := os.Remove(lockFilePath); err != nil { - return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err) - } - return nil - }, 5*time.Second, 500*time.Millisecond) - - if err != nil { - i.slogger.Log(ctx, slog.LevelInfo, - "detected old lock file but could not remove it", - "lockfile_path", lockFilePath, - "lockfile_modtime", lockFileInfo.ModTime().String(), - "err", err, - ) - return fmt.Errorf("removing lock file at %s: %w", lockFilePath, err) + return false, nil } i.slogger.Log(ctx, slog.LevelInfo, - "detected and removed old osquery db lock file", + "detected stale osquery db lock file", "lockfile_path", lockFilePath, "lockfile_modtime", lockFileInfo.ModTime().String(), ) - return nil + return true, nil } // createOsquerydCommand uses osqueryOptions to return an *exec.Cmd diff --git a/pkg/osquery/runtime/osqueryinstance_test.go b/pkg/osquery/runtime/osqueryinstance_test.go index 6df7e6ec3..b0bab985f 100644 --- a/pkg/osquery/runtime/osqueryinstance_test.go +++ b/pkg/osquery/runtime/osqueryinstance_test.go @@ -300,75 +300,37 @@ func TestLaunch(t *testing.T) { k.AssertExpectations(t) } -func TestLaunch_WithLockFileCleanup(t *testing.T) { +func Test_detectStaleDatabaseLock(t *testing.T) { t.Parallel() - logBytes, slogger := setUpTestSlogger() + _, slogger := setUpTestSlogger() rootDirectory := testRootDirectory(t) k := typesMocks.NewKnapsack(t) - k.On("WatchdogEnabled").Return(true) - k.On("WatchdogMemoryLimitMB").Return(150) - k.On("WatchdogUtilizationLimitPercent").Return(20) - k.On("WatchdogDelaySec").Return(120) - k.On("OsqueryFlags").Return([]string{"verbose=true"}) - k.On("OsqueryVerbose").Return(true) k.On("Slogger").Return(slogger) k.On("RootDirectory").Return(rootDirectory) - k.On("LoggingInterval").Return(1 * time.Second) - k.On("LogMaxBytesPerBatch").Return(500) - k.On("Transport").Return("jsonrpc") setUpMockStores(t, k) - k.On("ReadEnrollSecret").Return("", nil) - k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory) - k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second) i := newInstance(types.DefaultRegistrationID, k, mockServiceClient()) - // Create a lock file that would ordinarily prevent osquery from starting up + // Calculate paths + paths, err := calculateOsqueryPaths(i.knapsack.RootDirectory(), i.registrationId, i.runId, i.opts) + require.NoError(t, err) + + // Check for stale database lock -- there shouldn't be one + detected, err := i.detectStaleDatabaseLock(context.TODO(), paths) + require.NoError(t, err) + require.False(t, detected) + + // Create a lock file lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK") require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx createLockFile(t, lockFilePath) - // Start the instance - instanceStartTime := time.Now() - go i.Launch() - - // Wait for the instance to become healthy - require.NoError(t, backoff.WaitFor(func() error { - // Instance self-reports as healthy - if err := i.Healthy(); err != nil { - return fmt.Errorf("instance not healthy: %w", err) - } - - // Confirm instance setup is complete - if i.stats == nil || i.stats.ConnectTime == "" { - return errors.New("no connect time set yet") - } - - // Good to go - return nil - }, 30*time.Second, 1*time.Second), fmt.Sprintf("instance not healthy by %s: instance logs:\n\n%s", time.Now().String(), logBytes.String())) - - // Confirm that the lock file is new -- it should have a modtime after instanceStartTime - lockFileInfo, err := os.Stat(lockFilePath) + // Check for a stale database lock again -- now, we should find it + detectedRetry, err := i.detectStaleDatabaseLock(context.TODO(), paths) require.NoError(t, err) - require.True(t, lockFileInfo.ModTime().After(instanceStartTime), logBytes.String()) - - // Now wait for full shutdown - i.BeginShutdown() - shutdownErr := make(chan error) - go func() { - shutdownErr <- i.WaitShutdown() - }() - - select { - case err := <-shutdownErr: - require.True(t, errors.Is(err, context.Canceled), fmt.Sprintf("instance logs:\n\n%s", logBytes.String())) - case <-time.After(1 * time.Minute): - t.Error("instance did not shut down within timeout", fmt.Sprintf("instance logs: %s", logBytes.String())) - t.FailNow() - } + require.True(t, detectedRetry) k.AssertExpectations(t) } From 511dbb0f47e9c12fe31a43229e3dac2fb906a09c Mon Sep 17 00:00:00 2001 From: RebeccaMahany Date: Wed, 18 Dec 2024 13:45:50 -0500 Subject: [PATCH 04/13] Log more info --- pkg/osquery/runtime/osqueryinstance.go | 46 +++++++++++++++++-- pkg/osquery/runtime/runtime_helpers.go | 31 +++++++++++++ .../runtime/runtime_helpers_windows.go | 26 +++++++++++ 3 files changed, 100 insertions(+), 3 deletions(-) diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index aec79e99c..15c0180c4 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -725,12 +725,52 @@ func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *os // No lock file, nothing to do here return false, nil } + if err != nil { + return false, fmt.Errorf("determining whether lock file exists: %w", err) + } - i.slogger.Log(ctx, slog.LevelInfo, - "detected stale osquery db lock file", + infoToLog := []any{ "lockfile_path", lockFilePath, "lockfile_modtime", lockFileInfo.ModTime().String(), - ) + } + + defer func() { + i.slogger.Log(ctx, slog.LevelInfo, + "detected stale osquery db lock file", + infoToLog..., + ) + }() + + // Check to see whether the process holding the file still exists + p, err := getProcessHoldingFile(ctx, lockFilePath) + if err != nil { + infoToLog = append(infoToLog, "err", err) + return false, fmt.Errorf("getting process holding file: %w", err) + } + + // Grab more info to log from the process using the lockfile + infoToLog = append(infoToLog, "pid", p.Pid) + if name, err := p.NameWithContext(ctx); err == nil { + infoToLog = append(infoToLog, "process_name", name) + } + if cmdline, err := p.CmdlineWithContext(ctx); err == nil { + infoToLog = append(infoToLog, "process_cmdline", cmdline) + } + if status, err := p.StatusWithContext(ctx); err == nil { + infoToLog = append(infoToLog, "process_status", status) + } + if isRunning, err := p.IsRunningWithContext(ctx); err == nil { + infoToLog = append(infoToLog, "process_is_running", isRunning) + } + if parent, err := p.ParentWithContext(ctx); err == nil { + infoToLog = append(infoToLog, "process_parent_pid", parent.Pid) + if parentCmdline, err := parent.CmdlineWithContext(ctx); err == nil { + infoToLog = append(infoToLog, "process_parent_cmdline", parentCmdline) + } + if parentStatus, err := p.StatusWithContext(ctx); err == nil { + infoToLog = append(infoToLog, "process_parent_status", parentStatus) + } + } return true, nil } diff --git a/pkg/osquery/runtime/runtime_helpers.go b/pkg/osquery/runtime/runtime_helpers.go index 49ffb614c..392e4a4c5 100644 --- a/pkg/osquery/runtime/runtime_helpers.go +++ b/pkg/osquery/runtime/runtime_helpers.go @@ -4,10 +4,17 @@ package runtime import ( + "context" + "errors" "fmt" "os/exec" "path/filepath" + "strconv" + "strings" "syscall" + + "github.com/kolide/launcher/ee/allowedcmd" + "github.com/shirou/gopsutil/v3/process" ) func setpgid() *syscall.SysProcAttr { @@ -33,3 +40,27 @@ func platformArgs() []string { func isExitOk(_ error) bool { return false } + +func getProcessHoldingFile(ctx context.Context, pathToFile string) (*process.Process, error) { + cmd, err := allowedcmd.Lsof(ctx, "-t", pathToFile) + if err != nil { + return nil, fmt.Errorf("creating lsof command: %w", err) + } + + out, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("running lsof: %w", err) + } + + pidStr := strings.TrimSpace(string(out)) + if pidStr == "" { + return nil, errors.New("no process found using file via lsof") + } + + pid, err := strconv.ParseInt(pidStr, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err) + } + + return process.NewProcess(int32(pid)) +} diff --git a/pkg/osquery/runtime/runtime_helpers_windows.go b/pkg/osquery/runtime/runtime_helpers_windows.go index 7c5ed26e3..bc50363e7 100644 --- a/pkg/osquery/runtime/runtime_helpers_windows.go +++ b/pkg/osquery/runtime/runtime_helpers_windows.go @@ -12,6 +12,7 @@ import ( "github.com/kolide/launcher/ee/allowedcmd" "github.com/pkg/errors" + "github.com/shirou/gopsutil/v3/process" ) func setpgid() *syscall.SysProcAttr { @@ -80,3 +81,28 @@ func isExitOk(err error) bool { } return false } + +func getProcessHoldingFile(ctx context.Context, pathToFile string) (*process.Process, error) { + allProcesses, err := process.ProcessesWithContext(ctx) + if err != nil { + return nil, fmt.Errorf("getting process list: %w", err) + } + + for _, p := range allProcesses { + openFiles, err := p.OpenFilesWithContext(ctx) + if err != nil { + continue + } + + // Check the process's open files to see if this process is the one using the lockfile + for _, f := range openFiles { + if f.Path != pathToFile { + continue + } + + return p, nil + } + } + + return nil, errors.New("no process found using file") +} From e65370a435289609f7e80a3b0e25926e7bcf389e Mon Sep 17 00:00:00 2001 From: RebeccaMahany Date: Wed, 18 Dec 2024 14:17:41 -0500 Subject: [PATCH 05/13] Handle multiple processes holding file --- pkg/osquery/runtime/osqueryinstance.go | 48 +++++++++++-------- pkg/osquery/runtime/runtime_helpers.go | 24 ++++++++-- .../runtime/runtime_helpers_windows.go | 12 +++-- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index 15c0180c4..af180eb6a 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -29,6 +29,7 @@ import ( "github.com/osquery/osquery-go/plugin/config" "github.com/osquery/osquery-go/plugin/distributed" osquerylogger "github.com/osquery/osquery-go/plugin/logger" + "github.com/shirou/gopsutil/v3/process" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" @@ -731,7 +732,7 @@ func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *os infoToLog := []any{ "lockfile_path", lockFilePath, - "lockfile_modtime", lockFileInfo.ModTime().String(), + "lockfile_modtime", lockFileInfo.ModTime().UTC().String(), } defer func() { @@ -742,37 +743,42 @@ func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *os }() // Check to see whether the process holding the file still exists - p, err := getProcessHoldingFile(ctx, lockFilePath) + processes, err := getProcessesHoldingFile(ctx, lockFilePath) if err != nil { infoToLog = append(infoToLog, "err", err) return false, fmt.Errorf("getting process holding file: %w", err) } - // Grab more info to log from the process using the lockfile - infoToLog = append(infoToLog, "pid", p.Pid) - if name, err := p.NameWithContext(ctx); err == nil { - infoToLog = append(infoToLog, "process_name", name) + // Grab more info to log from the processes using the lockfile + processStrs := make([]string, len(processes)) + for i, p := range processes { + processStrs[i] = processStr(ctx, p) } - if cmdline, err := p.CmdlineWithContext(ctx); err == nil { - infoToLog = append(infoToLog, "process_cmdline", cmdline) + infoToLog = append(infoToLog, "processes", processStrs) + + return true, nil +} + +func processStr(ctx context.Context, p *process.Process) string { + name := "unknown" + uids := "unknown" + status := "unknown" + cmdline := "unknown" + + if gotName, err := p.NameWithContext(ctx); err == nil { + name = gotName } - if status, err := p.StatusWithContext(ctx); err == nil { - infoToLog = append(infoToLog, "process_status", status) + if gotUids, err := p.UidsWithContext(ctx); err == nil { + uids = fmt.Sprintf("%v", gotUids) } - if isRunning, err := p.IsRunningWithContext(ctx); err == nil { - infoToLog = append(infoToLog, "process_is_running", isRunning) + if gotStatus, err := p.StatusWithContext(ctx); err == nil { + status = strings.Join(gotStatus, " ") } - if parent, err := p.ParentWithContext(ctx); err == nil { - infoToLog = append(infoToLog, "process_parent_pid", parent.Pid) - if parentCmdline, err := parent.CmdlineWithContext(ctx); err == nil { - infoToLog = append(infoToLog, "process_parent_cmdline", parentCmdline) - } - if parentStatus, err := p.StatusWithContext(ctx); err == nil { - infoToLog = append(infoToLog, "process_parent_status", parentStatus) - } + if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil { + cmdline = gotCmdline } - return true, nil + return fmt.Sprintf("process with name `%s` and PID %d belonging to UIDs %s has current status `%s` (%s)", name, p.Pid, uids, status, cmdline) } // createOsquerydCommand uses osqueryOptions to return an *exec.Cmd diff --git a/pkg/osquery/runtime/runtime_helpers.go b/pkg/osquery/runtime/runtime_helpers.go index 392e4a4c5..500a4c7e2 100644 --- a/pkg/osquery/runtime/runtime_helpers.go +++ b/pkg/osquery/runtime/runtime_helpers.go @@ -41,7 +41,7 @@ func isExitOk(_ error) bool { return false } -func getProcessHoldingFile(ctx context.Context, pathToFile string) (*process.Process, error) { +func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) { cmd, err := allowedcmd.Lsof(ctx, "-t", pathToFile) if err != nil { return nil, fmt.Errorf("creating lsof command: %w", err) @@ -57,10 +57,24 @@ func getProcessHoldingFile(ctx context.Context, pathToFile string) (*process.Pro return nil, errors.New("no process found using file via lsof") } - pid, err := strconv.ParseInt(pidStr, 10, 32) - if err != nil { - return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err) + pidStrs := strings.Split(strings.TrimSpace(string(out)), "\n") + if len(pidStrs) == 0 { + return nil, errors.New("no processes found using file via lsof") + } + + processes := make([]*process.Process, 0) + for _, pidStr := range pidStrs { + pid, err := strconv.ParseInt(pidStr, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err) + } + + p, err := process.NewProcess(int32(pid)) + if err != nil { + return nil, fmt.Errorf("getting process for %d: %w", pid, err) + } + processes = append(processes, p) } - return process.NewProcess(int32(pid)) + return processes, nil } diff --git a/pkg/osquery/runtime/runtime_helpers_windows.go b/pkg/osquery/runtime/runtime_helpers_windows.go index bc50363e7..6e9895e7c 100644 --- a/pkg/osquery/runtime/runtime_helpers_windows.go +++ b/pkg/osquery/runtime/runtime_helpers_windows.go @@ -82,12 +82,13 @@ func isExitOk(err error) bool { return false } -func getProcessHoldingFile(ctx context.Context, pathToFile string) (*process.Process, error) { +func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) { allProcesses, err := process.ProcessesWithContext(ctx) if err != nil { return nil, fmt.Errorf("getting process list: %w", err) } + processes := make([]*process.Process, 0) for _, p := range allProcesses { openFiles, err := p.OpenFilesWithContext(ctx) if err != nil { @@ -100,9 +101,14 @@ func getProcessHoldingFile(ctx context.Context, pathToFile string) (*process.Pro continue } - return p, nil + processes = append(processes, p) + break } } - return nil, errors.New("no process found using file") + if len(processes) == 0 { + return nil, errors.New("no processes found using file") + } + + return processes, nil } From 432caa22f03afdc1acd993f89a3d647093bf5af6 Mon Sep 17 00:00:00 2001 From: Rebecca Mahany-Horton Date: Thu, 19 Dec 2024 11:02:22 -0500 Subject: [PATCH 06/13] Fix open files check --- pkg/osquery/runtime/runtime_helpers_windows.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/osquery/runtime/runtime_helpers_windows.go b/pkg/osquery/runtime/runtime_helpers_windows.go index 6e9895e7c..a1f73901a 100644 --- a/pkg/osquery/runtime/runtime_helpers_windows.go +++ b/pkg/osquery/runtime/runtime_helpers_windows.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "os/exec" + "strings" "syscall" "time" @@ -97,7 +98,9 @@ func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process // Check the process's open files to see if this process is the one using the lockfile for _, f := range openFiles { - if f.Path != pathToFile { + // We check for strings.Contains rather than equals because the open file's path contains + // a `\\?\` prefix. + if !strings.Contains(f.Path, pathToFile) { continue } From a7ce8f04865faee4c2ceaf5e9ba351a84431cbd3 Mon Sep 17 00:00:00 2001 From: Rebecca Mahany-Horton Date: Thu, 19 Dec 2024 11:03:01 -0500 Subject: [PATCH 07/13] Use IsRunning over Status since it's supported on Windows --- pkg/osquery/runtime/osqueryinstance.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index af180eb6a..4c5544bc1 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -762,7 +762,7 @@ func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *os func processStr(ctx context.Context, p *process.Process) string { name := "unknown" uids := "unknown" - status := "unknown" + runningStatus := "unknown" cmdline := "unknown" if gotName, err := p.NameWithContext(ctx); err == nil { @@ -771,14 +771,18 @@ func processStr(ctx context.Context, p *process.Process) string { if gotUids, err := p.UidsWithContext(ctx); err == nil { uids = fmt.Sprintf("%v", gotUids) } - if gotStatus, err := p.StatusWithContext(ctx); err == nil { - status = strings.Join(gotStatus, " ") + if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil { + if gotIsRunning { + runningStatus = "running" + } else { + runningStatus = "not running" + } } if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil { cmdline = gotCmdline } - return fmt.Sprintf("process with name `%s` and PID %d belonging to UIDs %s has current status `%s` (%s)", name, p.Pid, uids, status, cmdline) + return fmt.Sprintf("process with name `%s` and PID %d belonging to UIDs %s has current status `%s` (%s)", name, p.Pid, uids, runningStatus, cmdline) } // createOsquerydCommand uses osqueryOptions to return an *exec.Cmd From f24b66cc564a5910d06b83cd0e6a60a4fbb2cc99 Mon Sep 17 00:00:00 2001 From: Rebecca Mahany-Horton Date: Thu, 19 Dec 2024 11:04:54 -0500 Subject: [PATCH 08/13] Use Username over UIDs since it's supported on Windows --- pkg/osquery/runtime/osqueryinstance.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index 4c5544bc1..b61e808bc 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -761,15 +761,15 @@ func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *os func processStr(ctx context.Context, p *process.Process) string { name := "unknown" - uids := "unknown" + username := "unknown" runningStatus := "unknown" cmdline := "unknown" if gotName, err := p.NameWithContext(ctx); err == nil { name = gotName } - if gotUids, err := p.UidsWithContext(ctx); err == nil { - uids = fmt.Sprintf("%v", gotUids) + if gotUsername, err := p.UsernameWithContext(ctx); err == nil { + username = gotUsername } if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil { if gotIsRunning { @@ -782,7 +782,7 @@ func processStr(ctx context.Context, p *process.Process) string { cmdline = gotCmdline } - return fmt.Sprintf("process with name `%s` and PID %d belonging to UIDs %s has current status `%s` (%s)", name, p.Pid, uids, runningStatus, cmdline) + return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, username, runningStatus, cmdline) } // createOsquerydCommand uses osqueryOptions to return an *exec.Cmd From 025bd99b401a1632af111fe90688ac0131bb2a7d Mon Sep 17 00:00:00 2001 From: Rebecca Mahany-Horton Date: Thu, 19 Dec 2024 11:39:58 -0500 Subject: [PATCH 09/13] Rename var --- pkg/osquery/runtime/osqueryinstance.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index b61e808bc..502127bee 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -761,7 +761,7 @@ func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *os func processStr(ctx context.Context, p *process.Process) string { name := "unknown" - username := "unknown" + processOwner := "unknown" runningStatus := "unknown" cmdline := "unknown" @@ -769,7 +769,7 @@ func processStr(ctx context.Context, p *process.Process) string { name = gotName } if gotUsername, err := p.UsernameWithContext(ctx); err == nil { - username = gotUsername + processOwner = gotUsername } if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil { if gotIsRunning { @@ -782,7 +782,7 @@ func processStr(ctx context.Context, p *process.Process) string { cmdline = gotCmdline } - return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, username, runningStatus, cmdline) + return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, processOwner, runningStatus, cmdline) } // createOsquerydCommand uses osqueryOptions to return an *exec.Cmd From cf1392f0e8b24959713f69deb3aba3f0b1619372 Mon Sep 17 00:00:00 2001 From: Rebecca Mahany-Horton Date: Thu, 19 Dec 2024 11:43:22 -0500 Subject: [PATCH 10/13] Better error messages --- pkg/osquery/runtime/runtime_helpers_windows.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/osquery/runtime/runtime_helpers_windows.go b/pkg/osquery/runtime/runtime_helpers_windows.go index a1f73901a..ee3f4eef3 100644 --- a/pkg/osquery/runtime/runtime_helpers_windows.go +++ b/pkg/osquery/runtime/runtime_helpers_windows.go @@ -88,6 +88,9 @@ func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process if err != nil { return nil, fmt.Errorf("getting process list: %w", err) } + if len(allProcesses) == 0 { + return nil, errors.New("could not get any processes") + } processes := make([]*process.Process, 0) for _, p := range allProcesses { @@ -110,7 +113,7 @@ func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process } if len(processes) == 0 { - return nil, errors.New("no processes found using file") + return nil, fmt.Errorf("no processes found using file %s", pathToFile) } return processes, nil From ea7ea4445f1955f4da8840cf72d9e38894eb6567 Mon Sep 17 00:00:00 2001 From: Rebecca Mahany-Horton Date: Thu, 19 Dec 2024 12:45:42 -0500 Subject: [PATCH 11/13] Try actually opening file too --- pkg/osquery/runtime/osqueryinstance_windows_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/osquery/runtime/osqueryinstance_windows_test.go b/pkg/osquery/runtime/osqueryinstance_windows_test.go index 101d9f0b6..75c1f2330 100644 --- a/pkg/osquery/runtime/osqueryinstance_windows_test.go +++ b/pkg/osquery/runtime/osqueryinstance_windows_test.go @@ -4,6 +4,7 @@ package runtime import ( + "os" "strings" "syscall" "testing" @@ -89,4 +90,11 @@ func createLockFile(t *testing.T, fileToLock string) { uintptr(unsafe.Pointer(overlapped)), ) require.False(t, r1 == 0, error(e1)) + + fh, err := os.Open(fileToLock) + require.NoError(t, err) + + t.Cleanup(func() { + fh.Close() + }) } From fe65f8ac6b7f69e4cef47e3e91231744d545dca7 Mon Sep 17 00:00:00 2001 From: Rebecca Mahany-Horton Date: Thu, 19 Dec 2024 13:00:02 -0500 Subject: [PATCH 12/13] Skip test in GHA --- pkg/osquery/runtime/osqueryinstance_test.go | 4 ++++ pkg/osquery/runtime/osqueryinstance_windows_test.go | 8 -------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pkg/osquery/runtime/osqueryinstance_test.go b/pkg/osquery/runtime/osqueryinstance_test.go index b0bab985f..8eeac44aa 100644 --- a/pkg/osquery/runtime/osqueryinstance_test.go +++ b/pkg/osquery/runtime/osqueryinstance_test.go @@ -303,6 +303,10 @@ func TestLaunch(t *testing.T) { func Test_detectStaleDatabaseLock(t *testing.T) { t.Parallel() + if runtime.GOOS == "windows" && os.Getenv("GITHUB_ACTIONS") == "true" { + t.Skip("Skipping test on GitHub Actions -- test only works locally on Windows") + } + _, slogger := setUpTestSlogger() rootDirectory := testRootDirectory(t) diff --git a/pkg/osquery/runtime/osqueryinstance_windows_test.go b/pkg/osquery/runtime/osqueryinstance_windows_test.go index 75c1f2330..101d9f0b6 100644 --- a/pkg/osquery/runtime/osqueryinstance_windows_test.go +++ b/pkg/osquery/runtime/osqueryinstance_windows_test.go @@ -4,7 +4,6 @@ package runtime import ( - "os" "strings" "syscall" "testing" @@ -90,11 +89,4 @@ func createLockFile(t *testing.T, fileToLock string) { uintptr(unsafe.Pointer(overlapped)), ) require.False(t, r1 == 0, error(e1)) - - fh, err := os.Open(fileToLock) - require.NoError(t, err) - - t.Cleanup(func() { - fh.Close() - }) } From fc409fdb2fa43011aacd03c672b4e953eaf90d05 Mon Sep 17 00:00:00 2001 From: RebeccaMahany Date: Mon, 30 Dec 2024 16:06:51 -0500 Subject: [PATCH 13/13] Fix rebase --- pkg/osquery/runtime/osqueryinstance_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/osquery/runtime/osqueryinstance_test.go b/pkg/osquery/runtime/osqueryinstance_test.go index 5f49c76ce..9bd8c8b11 100644 --- a/pkg/osquery/runtime/osqueryinstance_test.go +++ b/pkg/osquery/runtime/osqueryinstance_test.go @@ -260,7 +260,7 @@ func TestLaunch(t *testing.T) { k.On("Transport").Return("jsonrpc") setUpMockStores(t, k) k.On("ReadEnrollSecret").Return("", nil) - k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinaryDirectory) + k.On("LatestOsquerydPath", mock.Anything).Return(testOsqueryBinary) k.On("OsqueryHealthcheckStartupDelay").Return(10 * time.Second) i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())