Skip to content

Commit

Permalink
Move lockfile logging to osquery log adapter
Browse files Browse the repository at this point in the history
  • Loading branch information
RebeccaMahany committed Dec 31, 2024
1 parent bb27062 commit 906d99b
Show file tree
Hide file tree
Showing 9 changed files with 169 additions and 269 deletions.
84 changes: 81 additions & 3 deletions ee/log/osquerylogs/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ func WithLevel(level slog.Level) Option {
}
}

var callerRegexp = regexp.MustCompile(`[\w.]+:\d+]`)

var pidRegex = regexp.MustCompile(`Refusing to kill non-osqueryd process (\d+)`)
var (
callerRegexp = regexp.MustCompile(`[\w.]+:\d+]`)
pidRegex = regexp.MustCompile(`Refusing to kill non-osqueryd process (\d+)`)
lockfileRegex = regexp.MustCompile(`Failed to create lock file: ([a-zA-Z0-9_\.\s\\\/\-:]*LOCK):`)
)

func extractOsqueryCaller(msg string) string {
return strings.TrimSuffix(callerRegexp.FindString(msg), "]")
Expand Down Expand Up @@ -78,6 +80,13 @@ func (l *OsqueryLogAdapter) Write(p []byte) (int, error) {
go l.logInfoAboutUnrecognizedProcessLockingPidfile(p)
}

if bytes.Contains(p, []byte("Failed to create lock file")) {
l.slogger.Log(context.TODO(), slog.LevelError,
"detected stale lockfile, logging info about file",
)
go l.logInfoAboutProcessHoldingLockfile(context.TODO(), p)
}

msg := strings.TrimSpace(string(p))
caller := extractOsqueryCaller(msg)
l.slogger.Log(context.TODO(), level, // nolint:sloglint // it's fine to not have a constant or literal here
Expand Down Expand Up @@ -202,3 +211,72 @@ func getSliceStat(getFunc func() ([]int32, error)) string {
}
return fmt.Sprintf("%+v", stat)
}

// logInfoAboutProcessHoldingLockfile logs information about the osquery database's lock file.
// We have noticed the lock file occasionally locked when it shouldn't be -- we think this can happen
// when osquery doesn't get adequate time to shut down gracefully. The held lockfile will prevent
// osquery from starting up entirely.
// See: https://github.com/kolide/launcher/issues/2004.
func (l *OsqueryLogAdapter) logInfoAboutProcessHoldingLockfile(ctx context.Context, p []byte) {
matches := lockfileRegex.FindAllStringSubmatch(string(p), -1)
if len(matches) < 1 || len(matches[0]) < 2 {
l.slogger.Log(context.TODO(), slog.LevelError,
"could not extract lockfile path from log line",
"log_line", string(p),
)

return
}

lockFilePath := strings.TrimSpace(matches[0][1]) // We want the group, not the full match
infoToLog := []any{
"lockfile_path", lockFilePath,
}

defer func() {
l.slogger.Log(ctx, slog.LevelInfo,
"detected stale osquery db lock file",
infoToLog...,
)
}()

// Check to see whether the process holding the file still exists
processes, err := getProcessesHoldingFile(ctx, lockFilePath)
if err != nil {
infoToLog = append(infoToLog, "err", err)
return
}

// Grab more info to log from the processes using the lockfile
processStrs := make([]string, len(processes))
for i, p := range processes {
processStrs[i] = processStr(ctx, p)
}
infoToLog = append(infoToLog, "processes", processStrs)
}

func processStr(ctx context.Context, p *process.Process) string {
name := "unknown"
processOwner := "unknown"
runningStatus := "unknown"
cmdline := "unknown"

if gotName, err := p.NameWithContext(ctx); err == nil {
name = gotName
}
if gotUsername, err := p.UsernameWithContext(ctx); err == nil {
processOwner = gotUsername
}
if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil {
if gotIsRunning {
runningStatus = "running"
} else {
runningStatus = "not running"
}
}
if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil {
cmdline = gotCmdline
}

return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, processOwner, runningStatus, cmdline)
}
43 changes: 43 additions & 0 deletions ee/log/osquerylogs/log_posix.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,16 @@ package osquerylogs

import (
"context"
"errors"
"fmt"
"log/slog"
"path/filepath"
"strconv"
"strings"
"time"

"github.com/kolide/launcher/ee/allowedcmd"
"github.com/shirou/gopsutil/v3/process"
)

// runAndLogPs runs ps filtering on the given PID, and logs the output.
Expand Down Expand Up @@ -110,3 +115,41 @@ func (l *OsqueryLogAdapter) runAndLogLsofOnPidfile() {
"output", string(out),
)
}

func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) {
cmd, err := allowedcmd.Lsof(ctx, "-t", pathToFile)
if err != nil {
return nil, fmt.Errorf("creating lsof command: %w", err)
}

out, err := cmd.CombinedOutput()
if err != nil {
return nil, fmt.Errorf("running lsof: %w", err)
}

pidStr := strings.TrimSpace(string(out))
if pidStr == "" {
return nil, errors.New("no process found using file via lsof")
}

pidStrs := strings.Split(strings.TrimSpace(string(out)), "\n")
if len(pidStrs) == 0 {
return nil, errors.New("no processes found using file via lsof")
}

processes := make([]*process.Process, 0)
for _, pidStr := range pidStrs {
pid, err := strconv.ParseInt(pidStr, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid pid %s: %w", pidStr, err)
}

p, err := process.NewProcess(int32(pid))
if err != nil {
return nil, fmt.Errorf("getting process for %d: %w", pid, err)
}
processes = append(processes, p)
}

return processes, nil
}
45 changes: 45 additions & 0 deletions ee/log/osquerylogs/log_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@

package osquerylogs

import (
"context"
"errors"
"fmt"
"strings"

"github.com/shirou/gopsutil/v3/process"
)

func (l *OsqueryLogAdapter) runAndLogPs(_ string) {
return
}
Expand All @@ -14,3 +23,39 @@ func (l *OsqueryLogAdapter) runAndLogLsofByPID(_ string) {
func (l *OsqueryLogAdapter) runAndLogLsofOnPidfile() {
return
}

func getProcessesHoldingFile(ctx context.Context, pathToFile string) ([]*process.Process, error) {
allProcesses, err := process.ProcessesWithContext(ctx)
if err != nil {
return nil, fmt.Errorf("getting process list: %w", err)
}
if len(allProcesses) == 0 {
return nil, errors.New("could not get any processes")
}

processes := make([]*process.Process, 0)
for _, p := range allProcesses {
openFiles, err := p.OpenFilesWithContext(ctx)
if err != nil {
continue
}

// Check the process's open files to see if this process is the one using the lockfile
for _, f := range openFiles {
// We check for strings.Contains rather than equals because the open file's path contains
// a `\\?\` prefix.
if !strings.Contains(f.Path, pathToFile) {
continue
}

processes = append(processes, p)
break
}
}

if len(processes) == 0 {
return nil, fmt.Errorf("no processes found using file %s", pathToFile)
}

return processes, nil
}
83 changes: 0 additions & 83 deletions pkg/osquery/runtime/osqueryinstance.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
"github.com/osquery/osquery-go/plugin/config"
"github.com/osquery/osquery-go/plugin/distributed"
osquerylogger "github.com/osquery/osquery-go/plugin/logger"
"github.com/shirou/gopsutil/v3/process"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"

Expand Down Expand Up @@ -250,15 +249,6 @@ func (i *OsqueryInstance) Launch() error {
return fmt.Errorf("could not calculate osquery file paths: %w", err)
}

// Check to see whether lock files exist that could prevent osquery from starting up
if _, err := i.detectStaleDatabaseLock(ctx, paths); err != nil {
// A detection error shouldn't prevent us from creating the process -- log it and move on
i.slogger.Log(ctx, slog.LevelInfo,
"error detecting stale database lock",
"err", err,
)
}

// Populate augeas lenses, if requested
if i.opts.augeasLensFunc != nil {
if err := os.MkdirAll(paths.augeasPath, 0755); err != nil {
Expand Down Expand Up @@ -712,79 +702,6 @@ func calculateOsqueryPaths(rootDirectory string, registrationId string, runId st
return osqueryFilePaths, nil
}

// detectStaleDatabaseLock checks for the presence of a lock file in the given database.
// We have noticed the lock file occasionally hanging around -- we think this can happen
// when osquery doesn't get adequate time to shut down gracefully. The presence of the lock
// file will prevent osquery from starting up entirely. For now, we just detect this stale
// lock file, and log information about it. In the future we will attempt to remediate.
// See: https://github.com/kolide/launcher/issues/2004.
func (i *OsqueryInstance) detectStaleDatabaseLock(ctx context.Context, paths *osqueryFilePaths) (bool, error) {
lockFilePath := filepath.Join(paths.databasePath, "LOCK")

lockFileInfo, err := os.Stat(lockFilePath)
if os.IsNotExist(err) {
// No lock file, nothing to do here
return false, nil
}
if err != nil {
return false, fmt.Errorf("determining whether lock file exists: %w", err)
}

infoToLog := []any{
"lockfile_path", lockFilePath,
"lockfile_modtime", lockFileInfo.ModTime().UTC().String(),
}

defer func() {
i.slogger.Log(ctx, slog.LevelInfo,
"detected stale osquery db lock file",
infoToLog...,
)
}()

// Check to see whether the process holding the file still exists
processes, err := getProcessesHoldingFile(ctx, lockFilePath)
if err != nil {
infoToLog = append(infoToLog, "err", err)
return false, fmt.Errorf("getting process holding file: %w", err)
}

// Grab more info to log from the processes using the lockfile
processStrs := make([]string, len(processes))
for i, p := range processes {
processStrs[i] = processStr(ctx, p)
}
infoToLog = append(infoToLog, "processes", processStrs)

return true, nil
}

func processStr(ctx context.Context, p *process.Process) string {
name := "unknown"
processOwner := "unknown"
runningStatus := "unknown"
cmdline := "unknown"

if gotName, err := p.NameWithContext(ctx); err == nil {
name = gotName
}
if gotUsername, err := p.UsernameWithContext(ctx); err == nil {
processOwner = gotUsername
}
if gotIsRunning, err := p.IsRunningWithContext(ctx); err == nil {
if gotIsRunning {
runningStatus = "running"
} else {
runningStatus = "not running"
}
}
if gotCmdline, err := p.CmdlineWithContext(ctx); err == nil {
cmdline = gotCmdline
}

return fmt.Sprintf("process with name `%s` and PID %d belonging to user `%s` has current status `%s` (%s)", name, p.Pid, processOwner, runningStatus, cmdline)
}

// createOsquerydCommand uses osqueryOptions to return an *exec.Cmd
// which will launch a properly configured osqueryd process.
func (i *OsqueryInstance) createOsquerydCommand(osquerydBinary string, paths *osqueryFilePaths) (*exec.Cmd, error) {
Expand Down
20 changes: 0 additions & 20 deletions pkg/osquery/runtime/osqueryinstance_other_test.go

This file was deleted.

40 changes: 0 additions & 40 deletions pkg/osquery/runtime/osqueryinstance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
Expand Down Expand Up @@ -299,42 +298,3 @@ func TestLaunch(t *testing.T) {

k.AssertExpectations(t)
}

func Test_detectStaleDatabaseLock(t *testing.T) {
t.Parallel()

if runtime.GOOS == "windows" && os.Getenv("GITHUB_ACTIONS") == "true" {
t.Skip("Skipping test on GitHub Actions -- test only works locally on Windows")
}

_, slogger := setUpTestSlogger()
rootDirectory := testRootDirectory(t)

k := typesMocks.NewKnapsack(t)
k.On("Slogger").Return(slogger)
k.On("RootDirectory").Return(rootDirectory)
setUpMockStores(t, k)

i := newInstance(types.DefaultRegistrationID, k, mockServiceClient())

// Calculate paths
paths, err := calculateOsqueryPaths(i.knapsack.RootDirectory(), i.registrationId, i.runId, i.opts)
require.NoError(t, err)

// Check for stale database lock -- there shouldn't be one
detected, err := i.detectStaleDatabaseLock(context.TODO(), paths)
require.NoError(t, err)
require.False(t, detected)

// Create a lock file
lockFilePath := filepath.Join(rootDirectory, "osquery.db", "LOCK")
require.NoError(t, os.MkdirAll(filepath.Dir(lockFilePath), 0700)) // drwx
createLockFile(t, lockFilePath)

// Check for a stale database lock again -- now, we should find it
detectedRetry, err := i.detectStaleDatabaseLock(context.TODO(), paths)
require.NoError(t, err)
require.True(t, detectedRetry)

k.AssertExpectations(t)
}
Loading

0 comments on commit 906d99b

Please sign in to comment.