Skip to content

Commit

Permalink
add use_oneshot parameter to speed up check runs with `psutil.Process().oneshot()` (#17817)
Browse files Browse the repository at this point in the history
  • Loading branch information
mrafi97 authored Jun 21, 2024
1 parent 1150aec commit 8cf3d23
Show file tree
Hide file tree
Showing 9 changed files with 138 additions and 57 deletions.
7 changes: 7 additions & 0 deletions process/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,11 @@ files:
value:
type: integer
example: 120
- name: use_oneshot
description: |
If set to `true`, the check uses `psutil.Process().oneshot()` to collect and cache process metrics.
This can help speed up the check completion.
value:
type: boolean
example: true
- template: instances/default
1 change: 1 addition & 0 deletions process/changelog.d/17817.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add `use_oneshot` parameter to speed up check runs with `psutil.Process().oneshot()`
4 changes: 4 additions & 0 deletions process/datadog_checks/process/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,7 @@ def instance_pid_cache_duration():

def instance_try_sudo():
    # Default value for the `try_sudo` instance option.
    return False


def instance_use_oneshot():
    # Default value for the `use_oneshot` instance option: collect process
    # metrics via `psutil.Process().oneshot()` to cache reads and speed up
    # the check run (see the option description in spec.yaml).
    return True
1 change: 1 addition & 0 deletions process/datadog_checks/process/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class InstanceConfig(BaseModel):
tags: Optional[tuple[str, ...]] = None
thresholds: Optional[MappingProxyType[str, Any]] = None
try_sudo: Optional[bool] = None
use_oneshot: Optional[bool] = None
user: Optional[str] = None

@model_validator(mode='before')
Expand Down
6 changes: 6 additions & 0 deletions process/datadog_checks/process/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ instances:
#
# pid_cache_duration: 120

## @param use_oneshot - boolean - optional - default: true
## If set to `true`, the check uses `psutil.Process().oneshot()` to collect and cache process metrics.
## This can help speed up the check completion.
#
# use_oneshot: true

## @param tags - list of strings - optional
## A list of tags to attach to every metric and service check emitted by this instance.
##
Expand Down
123 changes: 67 additions & 56 deletions process/datadog_checks/process/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(self, name, init_config, instances):
self.collect_children = is_affirmative(self.instance.get('collect_children', False))
self.user = self.instance.get('user', False)
self.try_sudo = self.instance.get('try_sudo', False)
self.use_oneshot = is_affirmative(self.instance.get('use_oneshot', True))

# ad stands for access denied
# We cache the PIDs getting this error and don't iterate on them more often than `access_denied_cache_duration`
Expand Down Expand Up @@ -293,65 +294,75 @@ def get_process_state(self, name, pids):

p = self.process_cache[name][pid]

meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms'])
st['rss'].append(meminfo.get('rss'))
st['vms'].append(meminfo.get('vms'))
if self.use_oneshot:
self.log.debug("Using psutil Process.oneshot()")
with p.oneshot():
st = self.run_psutil_methods(pid, p, st, new_process)
else:
st = self.run_psutil_methods(pid, p, st, new_process)

mem_percent = self.psutil_wrapper(p, 'memory_percent')
st['mem_pct'].append(mem_percent)
return st

# will fail on win32 and solaris
shared_mem = self.psutil_wrapper(p, 'memory_info', ['shared']).get('shared')
if shared_mem is not None and meminfo.get('rss') is not None:
st['real'].append(meminfo['rss'] - shared_mem)
else:
st['real'].append(None)

ctxinfo = self.psutil_wrapper(p, 'num_ctx_switches', ['voluntary', 'involuntary'])
st['ctx_swtch_vol'].append(ctxinfo.get('voluntary'))
st['ctx_swtch_invol'].append(ctxinfo.get('involuntary'))

st['thr'].append(self.psutil_wrapper(p, 'num_threads'))

cpu_percent = self.psutil_wrapper(p, 'cpu_percent')
cpu_count = psutil.cpu_count()
if not new_process:
# psutil returns `0.` for `cpu_percent` the
# first time it's sampled on a process,
# so save the value only on non-new processes
st['cpu'].append(cpu_percent)
if cpu_count > 0 and cpu_percent is not None:
st['cpu_norm'].append(cpu_percent / cpu_count)
else:
self.log.debug('could not calculate the normalized cpu pct, cpu_count: %s', cpu_count)
st['open_fd'].append(self.psutil_wrapper(p, 'num_fds'))
st['open_handle'].append(self.psutil_wrapper(p, 'num_handles'))

ioinfo = self.psutil_wrapper(p, 'io_counters', ['read_count', 'write_count', 'read_bytes', 'write_bytes'])
st['r_count'].append(ioinfo.get('read_count'))
st['w_count'].append(ioinfo.get('write_count'))
st['r_bytes'].append(ioinfo.get('read_bytes'))
st['w_bytes'].append(ioinfo.get('write_bytes'))

pagefault_stats = self.get_pagefault_stats(pid)
if pagefault_stats is not None:
(minflt, cminflt, majflt, cmajflt) = pagefault_stats
st['minflt'].append(minflt)
st['cminflt'].append(cminflt)
st['majflt'].append(majflt)
st['cmajflt'].append(cmajflt)
def run_psutil_methods(self, pid, p, st, new_process):
    """Collect psutil metrics for one process and append them to the stats dict.

    Called once per monitored process from `get_process_state`, either inside a
    `p.oneshot()` context (when `use_oneshot` is enabled) or directly.

    :param pid: pid of the process, used for the pagefault-stats lookup.
    :param p: the (cached) psutil.Process object to sample.
    :param st: dict of metric name -> list of samples; one value is appended
               per metric for this process (None when a value is unavailable).
    :param new_process: True when this pid was not in the process cache on the
               previous run.
    :return: the updated `st` dict.
    """
    # `psutil_wrapper` returns a dict of the requested attributes (missing
    # ones come back absent), hence the `.get(...)` lookups below.
    meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms'])
    st['rss'].append(meminfo.get('rss'))
    st['vms'].append(meminfo.get('vms'))

    mem_percent = self.psutil_wrapper(p, 'memory_percent')
    st['mem_pct'].append(mem_percent)

    # will fail on win32 and solaris
    shared_mem = self.psutil_wrapper(p, 'memory_info', ['shared']).get('shared')
    if shared_mem is not None and meminfo.get('rss') is not None:
        st['real'].append(meminfo['rss'] - shared_mem)
    else:
        st['real'].append(None)

    ctxinfo = self.psutil_wrapper(p, 'num_ctx_switches', ['voluntary', 'involuntary'])
    st['ctx_swtch_vol'].append(ctxinfo.get('voluntary'))
    st['ctx_swtch_invol'].append(ctxinfo.get('involuntary'))

    st['thr'].append(self.psutil_wrapper(p, 'num_threads'))

    cpu_percent = self.psutil_wrapper(p, 'cpu_percent')
    cpu_count = psutil.cpu_count()
    if not new_process:
        # psutil returns `0.` for `cpu_percent` the
        # first time it's sampled on a process,
        # so save the value only on non-new processes
        st['cpu'].append(cpu_percent)
        if cpu_count > 0 and cpu_percent is not None:
            st['cpu_norm'].append(cpu_percent / cpu_count)
        else:
            self.log.debug('could not calculate the normalized cpu pct, cpu_count: %s', cpu_count)

    # num_fds is POSIX-only, num_handles is Windows-only; the wrapper yields
    # None for whichever is unsupported on this platform.
    st['open_fd'].append(self.psutil_wrapper(p, 'num_fds'))
    st['open_handle'].append(self.psutil_wrapper(p, 'num_handles'))

    ioinfo = self.psutil_wrapper(p, 'io_counters', ['read_count', 'write_count', 'read_bytes', 'write_bytes'])
    st['r_count'].append(ioinfo.get('read_count'))
    st['w_count'].append(ioinfo.get('write_count'))
    st['r_bytes'].append(ioinfo.get('read_bytes'))
    st['w_bytes'].append(ioinfo.get('write_bytes'))

    pagefault_stats = self.get_pagefault_stats(pid)
    if pagefault_stats is not None:
        (minflt, cminflt, majflt, cmajflt) = pagefault_stats
        st['minflt'].append(minflt)
        st['cminflt'].append(cminflt)
        st['majflt'].append(majflt)
        st['cmajflt'].append(cmajflt)
    else:
        # keep the sample lists aligned even when pagefault stats are missing
        st['minflt'].append(None)
        st['cminflt'].append(None)
        st['majflt'].append(None)
        st['cmajflt'].append(None)

    # calculate process run time
    create_time = self.psutil_wrapper(p, 'create_time')
    if create_time is not None:
        now = time.time()
        run_time = now - create_time
        st['run_time'].append(run_time)

    return st

Expand Down
2 changes: 2 additions & 0 deletions process/hatch.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@

[[envs.default.matrix]]
python = ["2.7", "3.11"]

[envs.bench]
36 changes: 36 additions & 0 deletions process/tests/test_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# (C) Datadog, Inc. 2024-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
from datadog_checks.process import ProcessCheck

from . import common


def _benchmark_check_run(benchmark, dd_run_check, use_oneshot):
    """Build a ProcessCheck matching this (python) process and benchmark a check run."""
    instance = {
        'name': 'py',
        'search_string': ['python'],
        'exact_match': False,
        'ignored_denied_access': True,
        'use_oneshot': use_oneshot,
        'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
    }
    process = ProcessCheck(common.CHECK_NAME, {}, [instance])
    # Warm-up run outside the benchmark so one-time costs (process cache,
    # first cpu_percent sample) don't skew the measurement.
    dd_run_check(process)

    benchmark(dd_run_check, process)


def test_run(benchmark, dd_run_check):
    """Benchmark a check run with `use_oneshot` disabled (per-metric psutil calls)."""
    _benchmark_check_run(benchmark, dd_run_check, use_oneshot=False)


def test_run_oneshot(benchmark, dd_run_check):
    """Benchmark a check run with `use_oneshot` enabled (psutil Process.oneshot caching)."""
    _benchmark_check_run(benchmark, dd_run_check, use_oneshot=True)
15 changes: 14 additions & 1 deletion process/tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ def is_running(self):
def children(self, recursive=False):
    # Mock processes never have children; `recursive` is accepted only to
    # match the psutil.Process.children() signature and is ignored.
    return []

# https://stackoverflow.com/questions/5093382/object-becomes-none-when-using-a-context-manager
def oneshot(self):
    """Stand-in for psutil.Process.oneshot(): returns a no-op context manager."""

    class _NoopOneshot(object):
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_value, tb):
            # Explicit falsy return: never suppress exceptions.
            return False

    return _NoopOneshot()


class NamedMockProcess(object):
def __init__(self, name):
Expand Down Expand Up @@ -227,7 +238,8 @@ def test_check_missing_process(aggregator, dd_run_check, caplog):
assert "Unable to find process named ['fooprocess', '/usr/bin/foo'] among processes" in caplog.text


def test_check_real_process(aggregator, dd_run_check):
@pytest.mark.parametrize("oneshot", [True, False])
def test_check_real_process(aggregator, dd_run_check, oneshot):
"Check that we detect python running (at least this process)"
from datadog_checks.base.utils.platform import Platform

Expand All @@ -237,6 +249,7 @@ def test_check_real_process(aggregator, dd_run_check):
'exact_match': False,
'ignored_denied_access': True,
'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
'use_oneshot': oneshot,
}
process = ProcessCheck(common.CHECK_NAME, {}, [instance])
expected_tags = generate_expected_tags(instance)
Expand Down

0 comments on commit 8cf3d23

Please sign in to comment.