Skip to content

Commit

Permalink
Improved extraction of dmesg memory error
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Nilsson committed Dec 2, 2024
1 parent e35fcaa commit 9468133
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 2 deletions.
25 changes: 24 additions & 1 deletion pilot/control/payload.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,30 @@ def scan_for_memory_errors(subprocesses: list) -> str:
if search_str in line:
diagnostics = line[line.find(search_str):]
logger.warning(f'found memory error: {diagnostics}')
break

# make sure that this message is for a true subprocess of the pilot
# extract the pid from the message and compare it to the subprocesses list
match = search(r'Killed process (\d+)', diagnostics)
if match:
try:
found_pid = int(match.group(1))
logger.info(f"extracted PID: {found_pid}")

# is it a known subprocess?
if found_pid in subprocesses:
logger.info("PID found in the list of subprocesses")
break
else:
logger.warning("the extracted PID is not a known subprocess of the payload")
diagnostics = ""
# TODO: not checked yet - is the extracted PID a subprocess of the main pilot process itself?

except (ValueError, TypeError, AttributeError) as e:
logger.warning(f"failed to extract PID from the message: {e}")
diagnostics = ""
else:
logger.warning("PID could not be extracted from the message")
diagnostics = ""

if diagnostics:
break
Expand Down
2 changes: 1 addition & 1 deletion pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
RELEASE = '3' # release number should be fixed at 3 for Pilot 3
VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '37' # build number should be reset to '1' for every new development cycle
BUILD = '38' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down

0 comments on commit 9468133

Please sign in to comment.