Skip to content

Commit

Permalink
add use_oneshot parameter to speed up check runs with `psutil.Process().oneshot()` (#17817)
Browse files Browse the repository at this point in the history
  • Loading branch information
mrafi97 authored Jun 21, 2024
1 parent 1150aec commit 8cf3d23
Show file tree
Hide file tree
Showing 9 changed files with 138 additions and 57 deletions.
7 changes: 7 additions & 0 deletions process/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,11 @@ files:
value:
type: integer
example: 120
- name: use_oneshot
description: |
If set to `true`, the check uses `psutil.Process().oneshot()` to collect and cache process metrics.
This can help speed up the check completion.
value:
type: boolean
example: true
- template: instances/default
1 change: 1 addition & 0 deletions process/changelog.d/17817.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add `use_oneshot` parameter to speed up check runs with `psutil.Process().oneshot()`
4 changes: 4 additions & 0 deletions process/datadog_checks/process/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,7 @@ def instance_pid_cache_duration():

def instance_try_sudo():
    # Default value for the `try_sudo` instance option.
    return False


def instance_use_oneshot():
    # Default value for the `use_oneshot` instance option: collect process
    # metrics via `psutil.Process().oneshot()` to cache reads and speed up
    # the check run (see the option description in spec.yaml).
    return True
1 change: 1 addition & 0 deletions process/datadog_checks/process/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class InstanceConfig(BaseModel):
tags: Optional[tuple[str, ...]] = None
thresholds: Optional[MappingProxyType[str, Any]] = None
try_sudo: Optional[bool] = None
use_oneshot: Optional[bool] = None
user: Optional[str] = None

@model_validator(mode='before')
Expand Down
6 changes: 6 additions & 0 deletions process/datadog_checks/process/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ instances:
#
# pid_cache_duration: 120

## @param use_oneshot - boolean - optional - default: true
## If set to `true`, the check uses `psutil.Process().oneshot()` to collect and cache process metrics.
## This can help speed up the check completion.
#
# use_oneshot: true

## @param tags - list of strings - optional
## A list of tags to attach to every metric and service check emitted by this instance.
##
Expand Down
123 changes: 67 additions & 56 deletions process/datadog_checks/process/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(self, name, init_config, instances):
self.collect_children = is_affirmative(self.instance.get('collect_children', False))
self.user = self.instance.get('user', False)
self.try_sudo = self.instance.get('try_sudo', False)
self.use_oneshot = is_affirmative(self.instance.get('use_oneshot', True))

# ad stands for access denied
# We cache the PIDs getting this error and don't iterate on them more often than `access_denied_cache_duration`
Expand Down Expand Up @@ -293,65 +294,75 @@ def get_process_state(self, name, pids):

p = self.process_cache[name][pid]

meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms'])
st['rss'].append(meminfo.get('rss'))
st['vms'].append(meminfo.get('vms'))
if self.use_oneshot:
self.log.debug("Using psutil Process.oneshot()")
with p.oneshot():
st = self.run_psutil_methods(pid, p, st, new_process)
else:
st = self.run_psutil_methods(pid, p, st, new_process)

mem_percent = self.psutil_wrapper(p, 'memory_percent')
st['mem_pct'].append(mem_percent)
return st

# will fail on win32 and solaris
shared_mem = self.psutil_wrapper(p, 'memory_info', ['shared']).get('shared')
if shared_mem is not None and meminfo.get('rss') is not None:
st['real'].append(meminfo['rss'] - shared_mem)
else:
st['real'].append(None)

ctxinfo = self.psutil_wrapper(p, 'num_ctx_switches', ['voluntary', 'involuntary'])
st['ctx_swtch_vol'].append(ctxinfo.get('voluntary'))
st['ctx_swtch_invol'].append(ctxinfo.get('involuntary'))

st['thr'].append(self.psutil_wrapper(p, 'num_threads'))

cpu_percent = self.psutil_wrapper(p, 'cpu_percent')
cpu_count = psutil.cpu_count()
if not new_process:
# psutil returns `0.` for `cpu_percent` the
# first time it's sampled on a process,
# so save the value only on non-new processes
st['cpu'].append(cpu_percent)
if cpu_count > 0 and cpu_percent is not None:
st['cpu_norm'].append(cpu_percent / cpu_count)
else:
self.log.debug('could not calculate the normalized cpu pct, cpu_count: %s', cpu_count)
st['open_fd'].append(self.psutil_wrapper(p, 'num_fds'))
st['open_handle'].append(self.psutil_wrapper(p, 'num_handles'))

ioinfo = self.psutil_wrapper(p, 'io_counters', ['read_count', 'write_count', 'read_bytes', 'write_bytes'])
st['r_count'].append(ioinfo.get('read_count'))
st['w_count'].append(ioinfo.get('write_count'))
st['r_bytes'].append(ioinfo.get('read_bytes'))
st['w_bytes'].append(ioinfo.get('write_bytes'))

pagefault_stats = self.get_pagefault_stats(pid)
if pagefault_stats is not None:
(minflt, cminflt, majflt, cmajflt) = pagefault_stats
st['minflt'].append(minflt)
st['cminflt'].append(cminflt)
st['majflt'].append(majflt)
st['cmajflt'].append(cmajflt)
def run_psutil_methods(self, pid, p, st, new_process):
    """Collect psutil metrics for one process and append them to the stats dict.

    Called once per monitored process from `get_process_state`, either inside a
    `p.oneshot()` context (when `use_oneshot` is enabled) or directly.

    :param pid: pid of the process, used for the pagefault-stats lookup.
    :param p: the (cached) psutil.Process object to sample.
    :param st: dict of metric name -> list of samples; one value is appended
               per metric for this process (None when a value is unavailable).
    :param new_process: True when this pid was not in the process cache on the
               previous run.
    :return: the updated `st` dict.
    """
    # `psutil_wrapper` returns a dict of the requested attributes (missing
    # ones come back absent), hence the `.get(...)` lookups below.
    meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms'])
    st['rss'].append(meminfo.get('rss'))
    st['vms'].append(meminfo.get('vms'))

    mem_percent = self.psutil_wrapper(p, 'memory_percent')
    st['mem_pct'].append(mem_percent)

    # will fail on win32 and solaris
    shared_mem = self.psutil_wrapper(p, 'memory_info', ['shared']).get('shared')
    if shared_mem is not None and meminfo.get('rss') is not None:
        st['real'].append(meminfo['rss'] - shared_mem)
    else:
        st['real'].append(None)

    ctxinfo = self.psutil_wrapper(p, 'num_ctx_switches', ['voluntary', 'involuntary'])
    st['ctx_swtch_vol'].append(ctxinfo.get('voluntary'))
    st['ctx_swtch_invol'].append(ctxinfo.get('involuntary'))

    st['thr'].append(self.psutil_wrapper(p, 'num_threads'))

    cpu_percent = self.psutil_wrapper(p, 'cpu_percent')
    cpu_count = psutil.cpu_count()
    if not new_process:
        # psutil returns `0.` for `cpu_percent` the
        # first time it's sampled on a process,
        # so save the value only on non-new processes
        st['cpu'].append(cpu_percent)
        if cpu_count > 0 and cpu_percent is not None:
            st['cpu_norm'].append(cpu_percent / cpu_count)
        else:
            self.log.debug('could not calculate the normalized cpu pct, cpu_count: %s', cpu_count)

    # num_fds is POSIX-only, num_handles is Windows-only; the wrapper yields
    # None for whichever is unsupported on this platform.
    st['open_fd'].append(self.psutil_wrapper(p, 'num_fds'))
    st['open_handle'].append(self.psutil_wrapper(p, 'num_handles'))

    ioinfo = self.psutil_wrapper(p, 'io_counters', ['read_count', 'write_count', 'read_bytes', 'write_bytes'])
    st['r_count'].append(ioinfo.get('read_count'))
    st['w_count'].append(ioinfo.get('write_count'))
    st['r_bytes'].append(ioinfo.get('read_bytes'))
    st['w_bytes'].append(ioinfo.get('write_bytes'))

    pagefault_stats = self.get_pagefault_stats(pid)
    if pagefault_stats is not None:
        (minflt, cminflt, majflt, cmajflt) = pagefault_stats
        st['minflt'].append(minflt)
        st['cminflt'].append(cminflt)
        st['majflt'].append(majflt)
        st['cmajflt'].append(cmajflt)
    else:
        # keep the sample lists aligned even when pagefault stats are missing
        st['minflt'].append(None)
        st['cminflt'].append(None)
        st['majflt'].append(None)
        st['cmajflt'].append(None)

    # calculate process run time
    create_time = self.psutil_wrapper(p, 'create_time')
    if create_time is not None:
        now = time.time()
        run_time = now - create_time
        st['run_time'].append(run_time)

    return st

Expand Down
2 changes: 2 additions & 0 deletions process/hatch.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@

[[envs.default.matrix]]
python = ["2.7", "3.11"]

[envs.bench]
36 changes: 36 additions & 0 deletions process/tests/test_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# (C) Datadog, Inc. 2024-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
from datadog_checks.process import ProcessCheck

from . import common


def _benchmark_check_run(benchmark, dd_run_check, use_oneshot):
    """Build a ProcessCheck matching this (python) process and benchmark a check run."""
    instance = {
        'name': 'py',
        'search_string': ['python'],
        'exact_match': False,
        'ignored_denied_access': True,
        'use_oneshot': use_oneshot,
        'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
    }
    process = ProcessCheck(common.CHECK_NAME, {}, [instance])
    # Warm-up run outside the benchmark so one-time costs (process cache,
    # first cpu_percent sample) don't skew the measurement.
    dd_run_check(process)

    benchmark(dd_run_check, process)


def test_run(benchmark, dd_run_check):
    """Benchmark a check run with `use_oneshot` disabled (per-metric psutil calls)."""
    _benchmark_check_run(benchmark, dd_run_check, use_oneshot=False)


def test_run_oneshot(benchmark, dd_run_check):
    """Benchmark a check run with `use_oneshot` enabled (psutil Process.oneshot caching)."""
    _benchmark_check_run(benchmark, dd_run_check, use_oneshot=True)
15 changes: 14 additions & 1 deletion process/tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ def is_running(self):
def children(self, recursive=False):
    # Mock processes never have children; `recursive` is accepted only to
    # match the psutil.Process.children() signature and is ignored.
    return []

# https://stackoverflow.com/questions/5093382/object-becomes-none-when-using-a-context-manager
def oneshot(self):
    """Stand-in for psutil.Process.oneshot(): returns a no-op context manager."""

    class _NoopOneshot(object):
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, tb):
            # Explicit falsy return: never suppress exceptions.
            return False

    return _NoopOneshot()


class NamedMockProcess(object):
def __init__(self, name):
Expand Down Expand Up @@ -227,7 +238,8 @@ def test_check_missing_process(aggregator, dd_run_check, caplog):
assert "Unable to find process named ['fooprocess', '/usr/bin/foo'] among processes" in caplog.text


def test_check_real_process(aggregator, dd_run_check):
@pytest.mark.parametrize("oneshot", [True, False])
def test_check_real_process(aggregator, dd_run_check, oneshot):
"Check that we detect python running (at least this process)"
from datadog_checks.base.utils.platform import Platform

Expand All @@ -237,6 +249,7 @@ def test_check_real_process(aggregator, dd_run_check):
'exact_match': False,
'ignored_denied_access': True,
'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
'use_oneshot': oneshot,
}
process = ProcessCheck(common.CHECK_NAME, {}, [instance])
expected_tags = generate_expected_tags(instance)
Expand Down

0 comments on commit 8cf3d23

Please sign in to comment.