Skip to content

Commit

Permalink
Now locating correct payload pid for oom score
Browse files Browse the repository at this point in the history
  • Loading branch information
PalNilsson committed Oct 29, 2024
1 parent 831cfa4 commit 162dc16
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 14 deletions.
2 changes: 1 addition & 1 deletion PILOTVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.9.2.6
3.9.2.13
2 changes: 1 addition & 1 deletion pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
RELEASE = '3' # released number should be fixed at 3 for Pilot 3
VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '8' # build number should be reset to '1' for every new development cycle
BUILD = '13' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down
35 changes: 23 additions & 12 deletions pilot/util/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@
from pilot.util.psutils import (
is_process_running,
get_pid,
get_subprocesses
get_subprocesses,
find_actual_payload_pid
)
from pilot.util.timing import get_time_since
from pilot.util.workernode import (
Expand Down Expand Up @@ -136,8 +137,8 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
if exit_code != 0:
return exit_code, diagnostics

# display OOM process info (once)
display_oom_info(job.pid)
# update the OOM process info to prevent killing processes in the wrong order in case the job is killed (once)
update_oom_info(job.pid, job.transformation)

# should the pilot abort the payload?
exit_code, diagnostics = should_abort_payload(current_time, mt)
Expand Down Expand Up @@ -199,22 +200,32 @@ def still_running(pid):
return running


def display_oom_info(payload_pid):
def update_oom_info(bash_pid, payload_cmd):
"""
Display OOM process info.
Update OOM process info.
:param payload_pid: payload pid (int).
In case the job is killed, the OOM process info should be updated to prevent killing processes in the wrong order.
It will otherwise lead to lingering processes.
:param bash_pid: bash chain pid (int)
:param payload_cmd: payload command (string).
"""
# use the pid of the bash chain to get the actual payload pid which should be a child process
payload_pid = find_actual_payload_pid(bash_pid, payload_cmd)
if not payload_pid:
return

fname = f"/proc/{payload_pid}/oom_score"
fname_adj = fname + "_adj"
payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN'
pilot_score = get_score(os.getpid())

#cmd = "whoami"
#_, stdout, _ = execute(cmd)
#ogger.debug(f"stdout = {stdout}")
#cmd = f"ls -l {fname}"
#_, stdout, _ = execute(cmd)
#ogger.debug(f"stdout = {stdout}")
cmd = "whoami"
_, stdout, _ = execute(cmd)
logger.debug(f"stdout = {stdout}")
cmd = f"ls -l {fname_adj}"
_, stdout, _ = execute(cmd)
logger.debug(f"stdout = {stdout}")

if isinstance(pilot_score, str) and pilot_score == 'UNKNOWN':
logger.warning(f'could not get oom_score for pilot process: {pilot_score}')
Expand Down
30 changes: 30 additions & 0 deletions pilot/util/psutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,33 @@ def find_process_by_jobid(jobid: int) -> int or None:
return proc.pid

return None


def find_actual_payload_pid(bash_pid: int, payload_cmd: str) -> int or None:
    """
    Find the actual payload PID.

    Identify all subprocesses of the given bash PID and search their command lines for the payload command.
    The subprocess list is scanned from the end, and the first PID whose command contains the payload command
    is returned.

    :param bash_pid: bash PID (int)
    :param payload_cmd: payload command (partial) (str)
    :return: payload PID, or None if psutil is unavailable, there are no subprocesses, or nothing matches (int or None).
    """
    if not _is_psutil_available:
        logger.warning('find_actual_payload_pid(): psutil not available - aborting')
        return None

    children = get_subprocesses(bash_pid)
    if not children:
        logger.warning(f'no children found for bash PID {bash_pid}')
        return None

    for pid in reversed(children):  # reverse the order since it's probably the last PID
        cmd = get_command_by_pid(pid)
        logger.debug(f'pid={pid} cmd={cmd}')
        if not cmd:
            # NOTE(review): get_command_by_pid() is defined elsewhere; presumably it returns None (or an empty
            # value) when the process has already exited or cannot be inspected - confirm. Without this guard
            # the membership test below would raise TypeError on None and abort the whole search.
            continue
        if payload_cmd in cmd:
            logger.info(f'found payload PID={pid} for bash PID={bash_pid}')
            return pid

    logger.warning(f'could not find payload PID for bash PID {bash_pid}')
    return None

0 comments on commit 162dc16

Please sign in to comment.