diff --git a/ee/log/osquerylogs/log.go b/ee/log/osquerylogs/log.go index f98fa0057..cf08d014c 100644 --- a/ee/log/osquerylogs/log.go +++ b/ee/log/osquerylogs/log.go @@ -9,6 +9,7 @@ import ( "regexp" "strconv" "strings" + "time" "github.com/shirou/gopsutil/v3/host" "github.com/shirou/gopsutil/v3/process" @@ -17,9 +18,10 @@ import ( // OsqueryLogAdapater creates an io.Writer implementation useful for attaching // to the osquery stdout/stderr type OsqueryLogAdapter struct { - slogger slog.Logger - level slog.Level - rootDirectory string + slogger slog.Logger + level slog.Level + rootDirectory string + lastLockfileLogTime time.Time } type Option func(*OsqueryLogAdapter) @@ -30,9 +32,11 @@ func WithLevel(level slog.Level) Option { } } -var callerRegexp = regexp.MustCompile(`[\w.]+:\d+]`) - -var pidRegex = regexp.MustCompile(`Refusing to kill non-osqueryd process (\d+)`) +var ( + callerRegexp = regexp.MustCompile(`[\w.]+:\d+]`) + pidRegex = regexp.MustCompile(`Refusing to kill non-osqueryd process (\d+)`) + lockfileRegex = regexp.MustCompile(`lock file: ([a-zA-Z0-9_\.\s\\\/\-:]*LOCK):`) +) func extractOsqueryCaller(msg string) string { return strings.TrimSuffix(callerRegexp.FindString(msg), "]") @@ -78,6 +82,22 @@ func (l *OsqueryLogAdapter) Write(p []byte) (int, error) { go l.logInfoAboutUnrecognizedProcessLockingPidfile(p) } + // We have noticed the lock file occasionally locked when it shouldn't be -- we think this can happen + // when osquery doesn't get adequate time to shut down gracefully. The held lockfile will prevent + // osquery from starting up entirely. + // See: https://github.com/kolide/launcher/issues/2004. + if bytes.Contains(p, []byte("Rocksdb open failed")) { + // We can get spammed with this log, but we don't want to do all the work to look up info about the process + // using the lockfile each time we see this log -- make sure we only log once every 10 minutes at most. + if time.Since(l.lastLockfileLogTime) > 10*time.Minute { + l.lastLockfileLogTime = time.Now() + l.slogger.Log(context.TODO(), slog.LevelError, + "detected stale lockfile, logging info about file", + ) + go l.logInfoAboutProcessHoldingLockfile(context.TODO(), p) + } + } + msg := strings.TrimSpace(string(p)) caller := extractOsqueryCaller(msg) l.slogger.Log(context.TODO(), level, // nolint:sloglint // it's fine to not have a constant or literal here @@ -202,3 +222,68 @@ func getSliceStat(getFunc func() ([]int32, error)) string { } return fmt.Sprintf("%+v", stat) } + +// logInfoAboutProcessHoldingLockfile logs information about the osquery database's lock file. +func (l *OsqueryLogAdapter) logInfoAboutProcessHoldingLockfile(ctx context.Context, p []byte) { + matches := lockfileRegex.FindAllStringSubmatch(string(p), -1) + if len(matches) < 1 || len(matches[0]) < 2 { + l.slogger.Log(context.TODO(), slog.LevelError, + "could not extract lockfile path from log line", + "log_line", string(p), + ) + + return + } + + lockFilePath := strings.TrimSpace(matches[0][1]) // We want the group, not the full match + infoToLog := []any{ + "lockfile_path", lockFilePath, + } + + defer func() { + l.slogger.Log(ctx, slog.LevelInfo, + "detected stale osquery db lock file", + infoToLog..., + ) + }() + + // Check to see whether the process holding the file still exists + processes, err := getProcessesHoldingFile(ctx, lockFilePath) + if err != nil { + infoToLog = append(infoToLog, "err", err) + return + } + + // Grab more info to log from the processes using the lockfile + processStrs := make([]string, len(processes)) + for i, p := range processes { + processStrs[i] = processStr(ctx, p) + } + infoToLog = append(infoToLog, "processes", processStrs) +} + +func processStr(ctx context.Context, p *process.Process) string { + name := "unknown" + processOwner := "unknown" + runningStatus := "unknown" + cmdline := "unknown" + + if gotName, err := p.NameWithContext(ctx); err == nil { + name = gotName + } + if gotUsername, err := p.UsernameWithContext(ctx); err == nil { + processOwner = gotUsername + } + if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil { + if gotIsRunning { + runningStatus = "running" + } else { + runningStatus = "not running" + } + } + if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil { + cmdline = gotCmdline + } + + return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, processOwner, runningStatus, cmdline) +} diff --git a/ee/log/osquerylogs/log_posix.go b/ee/log/osquerylogs/log_posix.go index 377570e2d..0dca60a3a 100644 --- a/ee/log/osquerylogs/log_posix.go +++ b/ee/log/osquerylogs/log_posix.go @@ -5,11 +5,16 @@ package osquerylogs import ( "context" + "errors" + "fmt" "log/slog" "path/filepath" + "strconv" + "strings" "time" "github.com/kolide/launcher/ee/allowedcmd" + "github.com/shirou/gopsutil/v3/process" ) // runAndLogPs runs ps filtering on the given PID, and logs the output. @@ -110,3 +115,41 @@ func (l *OsqueryLogAdapter) runAndLogLsofOnPidfile() { "output", string(out), ) } + +func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) { + cmd, err := allowedcmd.Lsof(ctx, "-t", pathToFile) + if err != nil { + return nil, fmt.Errorf("creating lsof command: %w", err) + } + + out, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("running lsof: %w", err) + } + + pidStr := strings.TrimSpace(string(out)) + if pidStr == "" { + return nil, errors.New("no process found using file via lsof") + } + + pidStrs := strings.Split(strings.TrimSpace(string(out)), "\n") + if len(pidStrs) == 0 { + return nil, errors.New("no processes found using file via lsof") + } + + processes := make([]*process.Process, 0) + for _, pidStr := range pidStrs { + pid, err := strconv.ParseInt(pidStr, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err) + } + + p, err := process.NewProcess(int32(pid)) + if err != nil { + return nil, fmt.Errorf("getting process for %d: %w", pid, err) + } + processes = append(processes, p) + } + + return processes, nil +} diff --git a/ee/log/osquerylogs/log_windows.go b/ee/log/osquerylogs/log_windows.go index f609dd519..f27be502a 100644 --- a/ee/log/osquerylogs/log_windows.go +++ b/ee/log/osquerylogs/log_windows.go @@ -3,6 +3,15 @@ package osquerylogs +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/shirou/gopsutil/v3/process" +) + func (l *OsqueryLogAdapter) runAndLogPs(_ string) { return } @@ -14,3 +23,39 @@ func (l *OsqueryLogAdapter) runAndLogLsofByPID(_ string) { func (l *OsqueryLogAdapter) runAndLogLsofOnPidfile() { return } + +func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) { + allProcesses, err := process.ProcessesWithContext(ctx) + if err != nil { + return nil, fmt.Errorf("getting process list: %w", err) + } + if len(allProcesses) == 0 { + return nil, errors.New("could not get any processes") + } + + processes := make([]*process.Process, 0) + for _, p := range allProcesses { + openFiles, err := p.OpenFilesWithContext(ctx) + if err != nil { + continue + } + + // Check the process's open files to see if this process is the one using the lockfile + for _, f := range openFiles { + // We check for strings.Contains rather than equals because the open file's path contains + // a `\\?\` prefix. + if !strings.Contains(f.Path, pathToFile) { + continue + } + + processes = append(processes, p) + break + } + } + + if len(processes) == 0 { + return nil, fmt.Errorf("no processes found using file %s", pathToFile) + } + + return processes, nil +} diff --git a/pkg/osquery/runtime/osqueryinstance.go b/pkg/osquery/runtime/osqueryinstance.go index 502127bee..0ecb3ad4d 100644 --- a/pkg/osquery/runtime/osqueryinstance.go +++ b/pkg/osquery/runtime/osqueryinstance.go @@ -29,7 +29,6 @@ import ( "github.com/osquery/osquery-go/plugin/config" "github.com/osquery/osquery-go/plugin/distributed" osquerylogger "github.com/osquery/osquery-go/plugin/logger" - "github.com/shirou/gopsutil/v3/process" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/trace" @@ -250,15 +249,6 @@ func (i *OsqueryInstance) Launch() error { return fmt.Errorf("could not calculate osquery file paths: %w", err) } - // Check to see whether lock files exist that could prevent osquery from starting up - if _, err := i.detectStaleDatabaseLock(ctx, paths); err != nil { - // A detection error shouldn't prevent us from creating the process -- log it and move on - i.slogger.Log(ctx, slog.LevelInfo, - "error detecting stale database lock", - "err", err, - ) - } - // Populate augeas lenses, if requested if i.opts.augeasLensFunc != nil { if err := os.MkdirAll(paths.augeasPath, 0755); err != nil { @@ -712,79 +702,6 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st return osqueryFilePaths, nil } -// detectStaleDatabaseLock checks for the presence of a lock file in the given database. -// We have noticed the lock file occasionally hanging around -- we think this can happen -// when osquery doesn't get adequate time to shut down gracefully. The presence of the lock -// file will prevent osquery from starting up entirely. For now, we just detect this stale -// lock file, and log information about it. In the future we will attempt to remediate. -// See: https://github.com/kolide/launcher/issues/2004. -func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *osqueryFilePaths) (bool, error) { - lockFilePath := filepath.Join(paths.databasePath, "LOCK") - - lockFileInfo, err := os.Stat(lockFilePath) - if os.IsNotExist(err) { - // No lock file, nothing to do here - return false, nil - } - if err != nil { - return false, fmt.Errorf("determining whether lock file exists: %w", err) - } - - infoToLog := []any{ - "lockfile_path", lockFilePath, - "lockfile_modtime", lockFileInfo.ModTime().UTC().String(), - } - - defer func() { - i.slogger.Log(ctx, slog.LevelInfo, - "detected stale osquery db lock file", - infoToLog..., - ) - }() - - // Check to see whether the process holding the file still exists - processes, err := getProcessesHoldingFile(ctx, lockFilePath) - if err != nil { - infoToLog = append(infoToLog, "err", err) - return false, fmt.Errorf("getting process holding file: %w", err) - } - - // Grab more info to log from the processes using the lockfile - processStrs := make([]string, len(processes)) - for i, p := range processes { - processStrs[i] = processStr(ctx, p) - } - infoToLog = append(infoToLog, "processes", processStrs) - - return true, nil -} - -func processStr(ctx context.Context, p *process.Process) string { - name := "unknown" - processOwner := "unknown" - runningStatus := "unknown" - cmdline := "unknown" - - if gotName, err := p.NameWithContext(ctx); err == nil { - name = gotName - } - if gotUsername, err := p.UsernameWithContext(ctx); err == nil { - processOwner = gotUsername - } - if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil { - if gotIsRunning { - runningStatus = "running" - } else { - runningStatus = "not running" - } - } - if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil { - cmdline = gotCmdline - } - - return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, processOwner, runningStatus, cmdline) -} - // createOsquerydCommand uses osqueryOptions to return an *exec.Cmd // which will launch a properly configured osqueryd process. func (i *OsqueryInstance) createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths) (*exec.Cmd, error) { diff --git a/pkg/osquery/runtime/osqueryinstance_other_test.go b/pkg/osquery/runtime/osqueryinstance_other_test.go deleted file mode 100644 index 7ce670ba9..000000000 --- a/pkg/osquery/runtime/osqueryinstance_other_test.go +++ /dev/null @@ -1,20 +0,0 @@ -//go:build !windows -// +build !windows - -package runtime - -import ( - "syscall" - "testing" - - "github.com/stretchr/testify/require" -) - -func createLockFile(t *testing.T, fileToLock string) { - lockFile, err := syscall.Open(fileToLock, syscall.O_CREAT|syscall.O_RDONLY, 0600) - require.NoError(t, err) - require.NoError(t, syscall.Flock(lockFile, syscall.LOCK_EX|syscall.LOCK_NB)) - t.Cleanup(func() { - syscall.Close(lockFile) - }) -} diff --git a/pkg/osquery/runtime/osqueryinstance_test.go b/pkg/osquery/runtime/osqueryinstance_test.go index 9bd8c8b11..fd445c4a5 100644 --- a/pkg/osquery/runtime/osqueryinstance_test.go +++ b/pkg/osquery/runtime/osqueryinstance_test.go @@ -4,7 +4,6 @@ import ( "context" "errors" "fmt" - "os" "path/filepath" "runtime" "strings" @@ -299,42 +298,3 @@ func TestLaunch(t *testing.T) { k.AssertExpectations(t) } - -func Test_detectStaleDatabaseLock(t *testing.T) { - t.Parallel() - - if runtime.GOOS == "windows" && os.Getenv("GITHUB_ACTIONS") == "true" { - t.Skip("Skipping test on GitHub Actions -- test only works locally on Windows") - } - - _, slogger := setUpTestSlogger() - rootDirectory := testRootDirectory(t) - - k := typesMocks.NewKnapsack(t) - k.On("Slogger").Return(slogger) - k.On("RootDirectory").Return(rootDirectory) - setUpMockStores(t, k) - - i := newInstance(types.DefaultRegistrationID, k, mockServiceClient()) - - // Calculate paths - paths, err := calculateOsqueryPaths(i.knapsack.RootDirectory(), i.registrationId, i.runId, i.opts) - require.NoError(t, err) - - // Check for stale database lock -- there shouldn't be one - detected, err := i.detectStaleDatabaseLock(context.TODO(), paths) - require.NoError(t, err) - require.False(t, detected) - - // Create a lock file - lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK") - require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx - createLockFile(t, lockFilePath) - - // Check for a stale database lock again -- now, we should find it - detectedRetry, err := i.detectStaleDatabaseLock(context.TODO(), paths) - require.NoError(t, err) - require.True(t, detectedRetry) - - k.AssertExpectations(t) -} diff --git a/pkg/osquery/runtime/osqueryinstance_windows_test.go b/pkg/osquery/runtime/osqueryinstance_windows_test.go index 1d76e3b8d..d59d6582f 100644 --- a/pkg/osquery/runtime/osqueryinstance_windows_test.go +++ b/pkg/osquery/runtime/osqueryinstance_windows_test.go @@ -5,9 +5,7 @@ package runtime import ( "strings" - "syscall" "testing" - "unsafe" "github.com/kolide/launcher/ee/agent/types" typesMocks "github.com/kolide/launcher/ee/agent/types/mocks" @@ -52,41 +50,3 @@ func TestCreateOsqueryCommandEnvVars(t *testing.T) { k.AssertExpectations(t) } - -func createLockFile(t *testing.T, fileToLock string) { - lockedFileName, err := syscall.UTF16PtrFromString(fileToLock) - require.NoError(t, err) - - handle, err := syscall.CreateFile( - lockedFileName, - syscall.GENERIC_READ, - syscall.FILE_SHARE_READ|syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_DELETE, - nil, - syscall.CREATE_ALWAYS, - syscall.FILE_ATTRIBUTE_NORMAL, - 0, - ) - require.NoError(t, err) - - t.Cleanup(func() { - syscall.Close(handle) - }) - - // See: https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-lockfileex - modkernel32 := syscall.NewLazyDLL("kernel32.dll") - procLockFileEx := modkernel32.NewProc("LockFileEx") - overlapped := &syscall.Overlapped{ - HEvent: 0, - } - - r1, _, e1 := syscall.SyscallN( - procLockFileEx.Addr(), - uintptr(handle), - uintptr(2), // LOCKFILE_EXCLUSIVE_LOCK - uintptr(0), // Reserved parameter; must be set to zero. - uintptr(1), // nNumberOfBytesToLockLow -- the low-order 32 bits of the length of the byte range to lock. - uintptr(0), // nNumberOfBytesToLockHigh -- the high-order 32 bits of the length of the byte range to lock. - uintptr(unsafe.Pointer(overlapped)), - ) - require.False(t, r1 == 0, error(e1)) -} diff --git a/pkg/osquery/runtime/runtime_helpers.go b/pkg/osquery/runtime/runtime_helpers.go index 500a4c7e2..49ffb614c 100644 --- a/pkg/osquery/runtime/runtime_helpers.go +++ b/pkg/osquery/runtime/runtime_helpers.go @@ -4,17 +4,10 @@ package runtime import ( - "context" - "errors" "fmt" "os/exec" "path/filepath" - "strconv" - "strings" "syscall" - - "github.com/kolide/launcher/ee/allowedcmd" - "github.com/shirou/gopsutil/v3/process" ) func setpgid() *syscall.SysProcAttr { @@ -40,41 +33,3 @@ func platformArgs() []string { func isExitOk(_ error) bool { return false } - -func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) { - cmd, err := allowedcmd.Lsof(ctx, "-t", pathToFile) - if err != nil { - return nil, fmt.Errorf("creating lsof command: %w", err) - } - - out, err := cmd.CombinedOutput() - if err != nil { - return nil, fmt.Errorf("running lsof: %w", err) - } - - pidStr := strings.TrimSpace(string(out)) - if pidStr == "" { - return nil, errors.New("no process found using file via lsof") - } - - pidStrs := strings.Split(strings.TrimSpace(string(out)), "\n") - if len(pidStrs) == 0 { - return nil, errors.New("no processes found using file via lsof") - } - - processes := make([]*process.Process, 0) - for _, pidStr := range pidStrs { - pid, err := strconv.ParseInt(pidStr, 10, 32) - if err != nil { - return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err) - } - - p, err := process.NewProcess(int32(pid)) - if err != nil { - return nil, fmt.Errorf("getting process for %d: %w", pid, err) - } - processes = append(processes, p) - } - - return processes, nil -} diff --git a/pkg/osquery/runtime/runtime_helpers_windows.go b/pkg/osquery/runtime/runtime_helpers_windows.go index ee3f4eef3..7c5ed26e3 100644 --- a/pkg/osquery/runtime/runtime_helpers_windows.go +++ b/pkg/osquery/runtime/runtime_helpers_windows.go @@ -7,13 +7,11 @@ import ( "context" "fmt" "os/exec" - "strings" "syscall" "time" "github.com/kolide/launcher/ee/allowedcmd" "github.com/pkg/errors" - "github.com/shirou/gopsutil/v3/process" ) func setpgid() *syscall.SysProcAttr { @@ -82,39 +80,3 @@ func isExitOk(err error) bool { } return false } - -func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) { - allProcesses, err := process.ProcessesWithContext(ctx) - if err != nil { - return nil, fmt.Errorf("getting process list: %w", err) - } - if len(allProcesses) == 0 { - return nil, errors.New("could not get any processes") - } - - processes := make([]*process.Process, 0) - for _, p := range allProcesses { - openFiles, err := p.OpenFilesWithContext(ctx) - if err != nil { - continue - } - - // Check the process's open files to see if this process is the one using the lockfile - for _, f := range openFiles { - // We check for strings.Contains rather than equals because the open file's path contains - // a `\\?\` prefix. - if !strings.Contains(f.Path, pathToFile) { - continue - } - - processes = append(processes, p) - break - } - } - - if len(processes) == 0 { - return nil, fmt.Errorf("no processes found using file %s", pathToFile) - } - - return processes, nil -}