Skip to content

Commit

Permalink
Active CPU monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Nilsson committed Dec 17, 2024
1 parent 13c79cc commit f479fe9
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 18 deletions.
31 changes: 14 additions & 17 deletions pilot/control/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import re

from collections import namedtuple
from os import environ, getuid
from os import environ, getuid, getpid
from subprocess import (
Popen,
PIPE
Expand All @@ -53,6 +53,7 @@
get_local_oidc_token_info,
update_local_oidc_token_info
)
from pilot.util.psutils import get_process_info
from pilot.util.queuehandling import (
abort_jobs_in_queues,
get_maxwalltime_from_job,
Expand Down Expand Up @@ -84,8 +85,8 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
last_token_check = t_0

# for CPU usage debugging
# cpuchecktime = int(config.Pilot.cpu_check)
# tcpu = t_0
cpuchecktime = int(config.Pilot.cpu_check)
tcpu = t_0
last_minute_check = t_0

queuedata = get_queuedata_from_job(queues)
Expand Down Expand Up @@ -158,15 +159,15 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901
time.sleep(1)

# time to check the CPU usage?
# if is_pilot_check(check='cpu_usage'):
# if int(time.time() - tcpu) > cpuchecktime:
# processes = get_process_info('python3 pilot3/pilot.py', pid=getpid())
# if processes:
# logger.info(f'PID={getpid()} has CPU usage={processes[0]}% CMD={processes[2]}')
# nproc = processes[3]
# if nproc > 1:
# logger.info(f'.. there are {nproc} such processes running')
# tcpu = time.time()
if is_pilot_check(check='cpu_usage'):
if int(time.time() - tcpu) > cpuchecktime:
processes = get_process_info('python3 pilot3/pilot.py', pid=getpid())
if processes:
logger.info(f'PID={getpid()} has CPU usage={processes[0]}% CMD={processes[2]}')
nproc = processes[3]
if nproc > 1:
logger.info(f'.. there are {nproc} such processes running')
tcpu = time.time()

# proceed with running the other checks
run_checks(queues, args)
Expand Down Expand Up @@ -283,11 +284,7 @@ def reached_maxtime_abort(args: Any):
args.graceful_stop.set()


#def log_lifetime(sig, frame, traces):
# logger.info('lifetime: %i used, %i maximum', int(time.time() - traces.pilot['lifetime_start']), traces.pilot['lifetime_max'])


def get_process_info(cmd: str, user: str = "", args: str = 'aufx', pid: int = 0) -> list:
def get_process_info_old(cmd: str, user: str = "", args: str = 'aufx', pid: int = 0) -> list:
"""
Return process info for given command.
Expand Down
39 changes: 38 additions & 1 deletion pilot/util/psutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# under the License.
#
# Authors:
# - Paul Nilsson, [email protected], 2023
# - Paul Nilsson, [email protected], 2023-24

import logging
import os
Expand Down Expand Up @@ -373,3 +373,40 @@ def check_cpu_load():
else:
logger.info("system load is normal")
return False


def get_process_info(cmd: str, user: str = "", pid: int = 0) -> list:
    """
    Return CPU/memory info for the process with the given pid and command.

    All processes reported by psutil are scanned. Every process whose joined
    command line contains `cmd` (and whose username equals `user`, when a user
    is given) is counted. Details are only recorded for a counted process whose
    process id equals `pid`.

    The returned list has the format
    [cpu usage, memory usage, command line, number of matching processes].
    An empty list is returned if psutil is not available, or if no counted
    process has the requested pid.

    :param cmd: command to look for, e.g. 'python3 pilot3/pilot.py' (str)
    :param user: optional user name to restrict the search to (str)
    :param pid: process id of the process to report on (int)
    :return: [cpu_percent, memory_percent, command line, count], or [] (list).
    """
    if not _is_psutil_available:
        logger.warning('psutil not available, cannot check pilot CPU load')
        return []

    details = []
    count = 0

    for process in psutil.process_iter(['pid', 'username', 'cpu_percent', 'memory_percent', 'cmdline']):
        try:
            info = process.info

            # skip processes owned by other users when a user filter was requested
            if user and info['username'] != user:
                continue

            # the command line can be empty or missing; nothing to match against then
            argv = info['cmdline']
            if not argv:
                continue

            full_command = ' '.join(argv)
            if cmd not in full_command:
                continue

            count += 1
            if info['pid'] == pid:
                details = [info['cpu_percent'], info['memory_percent'], full_command]
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue

    # the process count is only appended when the requested pid was found
    if details:
        details.append(count)

    return details

0 comments on commit f479fe9

Please sign in to comment.