From 0204f1b4efca1594131aca6962405f1f13a0221d Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 16 Oct 2024 16:29:47 +0200 Subject: [PATCH] Updated oom_score handling and reporting --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 28 +++++++++++++++++++--------- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 425ed0a8..8cf995ff 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.1.1 \ No newline at end of file +3.9.1.5 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index f5aae5cb..fd659dcc 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '1' # build number should be reset to '1' for every new development cycle +BUILD = '5' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 3cc621bd..03f48c80 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -30,7 +30,7 @@ from signal import SIGKILL from pilot.common.errorcodes import ErrorCodes -from pilot.common.exception import PilotException, MiddlewareImportFailure +from pilot.common.exception import PilotException, MiddlewareImportFailure #, FileHandlingFailure from pilot.util.auxiliary import set_pilot_state #, show_memory_usage from pilot.util.config import config from pilot.util.constants import PILOT_PRE_PAYLOAD @@ -40,7 +40,7 @@ remove_files, get_local_file_size, read_file, - zip_files + zip_files, write_file ) from pilot.util.loopingjob import looping_job from pilot.util.math import ( @@ -135,7 +135,7 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i if exit_code != 0: return exit_code, diagnostics - # display OOM process info + # display OOM process info (once) display_oom_info(job.pid) # should the pilot abort the payload? @@ -204,20 +204,30 @@ def display_oom_info(payload_pid): :param payload_pid: payload pid (int). """ - + fname = f"/proc/{payload_pid}/oom_score_adj" payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN' pilot_score = get_score(os.getpid()) - logger.info(f'oom_score(pilot) = {pilot_score}, oom_score(payload) = {payload_score}') + if isinstance(pilot_score, str) and pilot_score == 'UNKNOWN': + logger.warning(f'could not get oom_score for pilot process: {pilot_score}') + else: + relative_payload_score = "1" + + # write the payload oom_score to the oom_score_adj file + try: + write_file(path=fname, contents=relative_payload_score) + except Exception as e: # FileHandlingFailure + logger.warning(f'could not write oom_score to file: {e}') + + logger.info(f'oom_score(pilot) = {pilot_score}, oom_score(payload) = {payload_score} (attempted writing relative score 1 to {fname})') -def get_score(pid): +def get_score(pid) -> str: """ Get the OOM process score. - :param pid: process id (int). - :return: score (string). + :param pid: process id (int) + :return: score (str). """ - try: score = '%s' % read_file('/proc/%d/oom_score' % pid) except Exception as error: