Skip to content

Commit

Permalink
Updated oom_score handling and reporting
Browse files (browse the repository at this point in the history)
  • Loading branch information
PalNilsson committed Oct 16, 2024
1 parent 8c383c9 commit 0204f1b
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 11 deletions.
2 changes: 1 addition & 1 deletion PILOTVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.9.1.1
3.9.1.5
2 changes: 1 addition & 1 deletion pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
RELEASE = '3' # release number should be fixed at 3 for Pilot 3
VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '1' # build number should be reset to '1' for every new development cycle
BUILD = '5' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down
28 changes: 19 additions & 9 deletions pilot/util/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from signal import SIGKILL

from pilot.common.errorcodes import ErrorCodes
from pilot.common.exception import PilotException, MiddlewareImportFailure
from pilot.common.exception import PilotException, MiddlewareImportFailure #, FileHandlingFailure
from pilot.util.auxiliary import set_pilot_state #, show_memory_usage
from pilot.util.config import config
from pilot.util.constants import PILOT_PRE_PAYLOAD
Expand All @@ -40,7 +40,7 @@
remove_files,
get_local_file_size,
read_file,
zip_files
zip_files, write_file
)
from pilot.util.loopingjob import looping_job
from pilot.util.math import (
Expand Down Expand Up @@ -135,7 +135,7 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
if exit_code != 0:
return exit_code, diagnostics

# display OOM process info
# display OOM process info (once)
display_oom_info(job.pid)

# should the pilot abort the payload?
Expand Down Expand Up @@ -204,20 +204,30 @@ def display_oom_info(payload_pid):
:param payload_pid: payload pid (int).
"""

fname = f"/proc/{payload_pid}/oom_score_adj"
payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN'
pilot_score = get_score(os.getpid())
logger.info(f'oom_score(pilot) = {pilot_score}, oom_score(payload) = {payload_score}')
if isinstance(pilot_score, str) and pilot_score == 'UNKNOWN':
logger.warning(f'could not get oom_score for pilot process: {pilot_score}')
else:
relative_payload_score = "1"

# write the payload oom_score to the oom_score_adj file
try:
write_file(path=fname, contents=relative_payload_score)
except Exception as e: # FileHandlingFailure
logger.warning(f'could not write oom_score to file: {e}')

logger.info(f'oom_score(pilot) = {pilot_score}, oom_score(payload) = {payload_score} (attempted writing relative score 1 to {fname})')


def get_score(pid):
def get_score(pid) -> str:
"""
Get the OOM process score.
:param pid: process id (int).
:return: score (string).
:param pid: process id (int)
:return: score (str).
"""

try:
score = '%s' % read_file('/proc/%d/oom_score' % pid)
except Exception as error:
Expand Down

0 comments on commit 0204f1b

Please sign in to comment.