diff --git a/PILOTVERSION b/PILOTVERSION
index 64e22377..c7cd9485 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.6
\ No newline at end of file
+3.9.2.13
\ No newline at end of file
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index d5e9705a..c4abb169 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3'  # released number should be fixed at 3 for Pilot 3
 VERSION = '9'  # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2'  # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '8'  # build number should be reset to '1' for every new development cycle
+BUILD = '13'  # build number should be reset to '1' for every new development cycle
 
 SUCCESS = 0
 FAILURE = 1
diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index c35f0714..d1d0e3b2 100644
--- a/pilot/util/monitoring.py
+++ b/pilot/util/monitoring.py
@@ -62,7 +62,8 @@
 from pilot.util.psutils import (
     is_process_running,
     get_pid,
-    get_subprocesses
+    get_subprocesses,
+    find_actual_payload_pid
 )
 from pilot.util.timing import get_time_since
 from pilot.util.workernode import (
@@ -136,8 +137,8 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
         if exit_code != 0:
             return exit_code, diagnostics
 
-        # display OOM process info (once)
-        display_oom_info(job.pid)
+        # update the OOM process info to prevent killing processes in the wrong order in case the job is killed (once)
+        update_oom_info(job.pid, job.transformation)
 
         # should the pilot abort the payload?
         exit_code, diagnostics = should_abort_payload(current_time, mt)
@@ -199,22 +200,32 @@ def still_running(pid):
     return running
 
 
-def display_oom_info(payload_pid):
+def update_oom_info(bash_pid, payload_cmd):
     """
-    Display OOM process info.
+    Update OOM process info.
 
-    :param payload_pid: payload pid (int).
+    In case the job is killed, the OOM process info should be updated to prevent killing processes in the wrong order.
+    It will otherwise lead to lingering processes.
+
+    :param bash_pid: bash chain pid (int)
+    :param payload_cmd: payload command (string).
     """
+    # use the pid of the bash chain to get the actual payload pid which should be a child process
+    payload_pid = find_actual_payload_pid(bash_pid, payload_cmd)
+    if not payload_pid:
+        return
+
     fname = f"/proc/{payload_pid}/oom_score"
+    fname_adj = fname + "_adj"
     payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN'
     pilot_score = get_score(os.getpid())
 
-    #cmd = "whoami"
-    #_, stdout, _ = execute(cmd)
-    #ogger.debug(f"stdout = {stdout}")
-    #cmd = f"ls -l {fname}"
-    #_, stdout, _ = execute(cmd)
-    #ogger.debug(f"stdout = {stdout}")
+    cmd = "whoami"
+    _, stdout, _ = execute(cmd)
+    logger.debug(f"stdout = {stdout}")
+    cmd = f"ls -l {fname_adj}"
+    _, stdout, _ = execute(cmd)
+    logger.debug(f"stdout = {stdout}")
 
     if isinstance(pilot_score, str) and pilot_score == 'UNKNOWN':
         logger.warning(f'could not get oom_score for pilot process: {pilot_score}')
diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py
index eb70f263..9d8085fa 100644
--- a/pilot/util/psutils.py
+++ b/pilot/util/psutils.py
@@ -291,3 +291,39 @@ def find_process_by_jobid(jobid: int) -> int or None:
                 return proc.pid
 
     return None
+
+
+def find_actual_payload_pid(bash_pid: int, payload_cmd: str) -> int or None:
+    """
+    Find the actual payload PID.
+
+    Identify all subprocesses of the given bash PID and search for the payload command. Return its PID.
+
+    :param bash_pid: bash PID (int)
+    :param payload_cmd: payload command (partial) (str)
+    :return: payload PID (int or None).
+    """
+    if not _is_psutil_available:
+        logger.warning('find_actual_payload_pid(): psutil not available - aborting')
+        return None
+
+    # an empty payload command is a substring of every command line and would match the wrong process
+    if not bash_pid or not payload_cmd:
+        logger.warning(f'cannot search for payload PID (bash PID={bash_pid}, payload command={payload_cmd})')
+        return None
+
+    children = get_subprocesses(bash_pid)
+    if not children:
+        logger.warning(f'no children found for bash PID {bash_pid}')
+        return None
+
+    for pid in reversed(children):  # reverse the order since it's probably the last PID
+        cmd = get_command_by_pid(pid)
+        logger.debug(f'pid={pid} cmd={cmd}')
+        # NOTE(review): guard against a None/empty command (e.g. subprocess already gone) - confirm helper contract
+        if cmd and payload_cmd in cmd:
+            logger.info(f'found payload PID={pid} for bash PID={bash_pid}')
+            return pid
+
+    logger.warning(f'could not find payload PID for bash PID {bash_pid}')
+    return None