From 528a6b058ef4b79e7570d34dca35b29628f3b1d1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 18 Oct 2024 11:22:40 +0200 Subject: [PATCH 01/65] Corrected a few type hints, removed unused code --- PILOTVERSION | 2 +- pilot/util/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e3b1f212..fd68bdd9 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.1.13 \ No newline at end of file +3.9.2.1 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index e90371e1..6a121595 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -27,8 +27,8 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '13' # build number should be reset to '1' for every new development cycle +REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '1' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 3e04dc6a6f97e08d2ade414b622f28db16b25e17 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 18 Oct 2024 11:44:43 +0200 Subject: [PATCH 02/65] execute() now using thread synchronization to protect against polling errors ending up in stderr --- pilot/util/container.py | 95 ++--------------------------------------- 1 file changed, 4 insertions(+), 91 deletions(-) diff --git a/pilot/util/container.py b/pilot/util/container.py index e5837d14..8326ba2d 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -131,8 +131,10 @@ def read_output(stream, queue): stdout_thread = threading.Thread(target=read_output, args=(process.stdout, stdout_queue)) stderr_thread = threading.Thread(target=read_output, args=(process.stderr, stderr_queue)) - stdout_thread.start() - stderr_thread.start() + # start the threads and use thread synchronization + with threading.Lock(): + stdout_thread.start() + stderr_thread.start() try: logger.debug(f'subprocess.communicate() will use timeout {timeout} s') @@ -183,95 +185,6 @@ def read_output(stream, queue): return exit_code, stdout, stderr -def execute_old2(executable: Any, **kwargs: dict) -> Any: # noqa: C901 - usecontainer = kwargs.get('usecontainer', False) - job = kwargs.get('job') - obscure = kwargs.get('obscure', '') - - if isinstance(executable, list): - executable = ' '.join(executable) - - if job and job.imagename != "" and "runcontainer" in executable: - usecontainer = False - job.usecontainer = usecontainer - - if usecontainer: - executable, diagnostics = containerise_executable(executable, **kwargs) - if not executable: - return None if kwargs.get('returnproc', False) else -1, "", diagnostics - - if not kwargs.get('mute', False): - print_executable(executable, obscure=obscure) - - timeout = get_timeout(kwargs.get('timeout', None)) - exe = ['/usr/bin/python'] + executable.split() if kwargs.get('mode', 'bash') == 'python' else ['/bin/bash', '-c', executable] - - exit_code = 0 - stdout = '' - stderr = '' - - def read_output(pipe, output_list): - for line in iter(pipe.readline, ''): - output_list.append(line) - pipe.close() - - process = None - with execute_lock: - process = subprocess.Popen(exe, - bufsize=-1, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - cwd=kwargs.get('cwd', 
getcwd()), - preexec_fn=os.setsid, - encoding='utf-8', - errors='replace') - if kwargs.get('returnproc', False): - return process - - stdout_lines = [] - stderr_lines = [] - - stdout_thread = threading.Thread(target=read_output, args=(process.stdout, stdout_lines)) - stderr_thread = threading.Thread(target=read_output, args=(process.stderr, stderr_lines)) - - stdout_thread.start() - stderr_thread.start() - - try: - logger.debug(f'subprocess.communicate() will use timeout {timeout} s') - process.wait(timeout=timeout) - except subprocess.TimeoutExpired as exc: - stderr += f'subprocess communicate sent TimeoutExpired: {exc}' - logger.warning(stderr) - exit_code = errors.COMMANDTIMEDOUT - stderr = kill_all(process, stderr) - except Exception as exc: - logger.warning(f'exception caused when executing command: {executable}: {exc}') - exit_code = errors.UNKNOWNEXCEPTION - stderr = kill_all(process, str(exc)) - else: - exit_code = process.poll() - - stdout_thread.join() - stderr_thread.join() - - stdout = ''.join(stdout_lines) - stderr = ''.join(stderr_lines) - - try: - if process: - process.wait(timeout=60) - except subprocess.TimeoutExpired: - if process: - logger.warning("process did not complete within the timeout of 60s - terminating") - process.terminate() - - if stdout and stdout.endswith('\n'): - stdout = stdout[:-1] - - return exit_code, stdout, stderr - - def execute_nothreads(executable: Any, **kwargs: dict) -> Any: """ Execute the command with its options in the provided executable list using subprocess time-out handler. From 17f1b0abf2757fa13298ad06aa8fd82520272079 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 21 Oct 2024 14:48:49 +0200 Subject: [PATCH 03/65] Using start_new_session instead of preexec_fn in subprocess.Popen(). Comment update --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 2 +- pilot/util/constants.py | 2 +- pilot/util/container.py | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index fd68bdd9..2302c3eb 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.1 \ No newline at end of file +3.9.2.2 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 2317ff7b..056d2c32 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -842,7 +842,7 @@ def run(self) -> tuple[int, str]: # noqa: C901 if stdout and stderr else "General payload setup verification error (check setup logs)" ) - # check for special errors in thw output + # check for special errors in the output exit_code = errors.resolve_transform_error(exit_code, diagnostics) diagnostics = errors.format_diagnostics(exit_code, diagnostics) return exit_code, diagnostics diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6a121595..f6060292 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '1' # build number should be reset to '1' for every new development cycle +BUILD = '2' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/container.py b/pilot/util/container.py index 8326ba2d..a3661230 100644 --- a/pilot/util/container.py +++ 
b/pilot/util/container.py @@ -100,7 +100,8 @@ def execute(executable: Any, **kwargs: dict) -> Any: # noqa: C901 stdout=kwargs.get('stdout', subprocess.PIPE), stderr=kwargs.get('stderr', subprocess.PIPE), cwd=kwargs.get('cwd', getcwd()), - preexec_fn=os.setsid, # setpgrp + start_new_session=True, # alternative to use os.setsid + # preexec_fn=os.setsid, # setpgrp encoding='utf-8', errors='replace') if kwargs.get('returnproc', False): From f58d93cb0076926d5c5c9cd6f095e91ea738ef67 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Oct 2024 14:35:42 +0100 Subject: [PATCH 04/65] Various tests for oom score, now out-commented --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 35 ++++++++++++++++++++++++++--------- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 2302c3eb..64e22377 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.2 \ No newline at end of file +3.9.2.6 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index f6060292..05a3998f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '2' # build number should be reset to '1' for every new development cycle +BUILD = '6' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 9b7e1d14..c35f0714 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -205,22 +205,39 @@ def display_oom_info(payload_pid): :param payload_pid: payload pid (int). """ - #fname = f"/proc/{payload_pid}/oom_score_adj" + fname = f"/proc/{payload_pid}/oom_score" payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN' pilot_score = get_score(os.getpid()) + + #cmd = "whoami" + #_, stdout, _ = execute(cmd) + #ogger.debug(f"stdout = {stdout}") + #cmd = f"ls -l {fname}" + #_, stdout, _ = execute(cmd) + #ogger.debug(f"stdout = {stdout}") + if isinstance(pilot_score, str) and pilot_score == 'UNKNOWN': logger.warning(f'could not get oom_score for pilot process: {pilot_score}') else: - #relative_payload_score = "1" + relative_payload_score = "1" + write_to_oom_score_adj(payload_pid, relative_payload_score) + logger.info(f'oom_score(pilot) = {pilot_score}, oom_score(payload) = {payload_score} (attempted writing relative score 1 to {fname})') + - # write the payload oom_score to the oom_score_adj file - #try: - # write_file(path=fname, contents=relative_payload_score) - #except Exception as e: # FileHandlingFailure - # logger.warning(f'could not write oom_score to file: {e}') +def write_to_oom_score_adj(pid, value): + """Writes the specified value to the oom_score_adj file for the given PID. - #logger.info(f'oom_score(pilot) = {pilot_score}, oom_score(payload) = {payload_score} (attempted writing relative score 1 to {fname})') - logger.info(f'oom_score(pilot) = {pilot_score}, oom_score(payload) = {payload_score}') + Args: + pid: The PID of the process. + value: The value to write to the oom_score_adj file. 
+ """ + + command = f"echo {value} > /proc/{pid}/oom_score_adj" + try: + subprocess.check_call(command, shell=True) + logger.info(f"successfully wrote {value} to /proc/{pid}/oom_score_adj") + except subprocess.CalledProcessError as e: + logger.warning(f"error writing to /proc/{pid}/oom_score_adj: {e}") def get_score(pid) -> str: From 855679032d69b88179f2f574e1ffcea19b497304 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Oct 2024 14:47:08 +0100 Subject: [PATCH 05/65] Added new option for turning off token renewal --- pilot.py | 12 +++++++++++- pilot/control/monitor.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pilot.py b/pilot.py index 5405cdb9..a657ffce 100755 --- a/pilot.py +++ b/pilot.py @@ -147,7 +147,7 @@ def main() -> int: return error.get_error_code() # update the OIDC token if necessary (after queuedata has been downloaded, since PQ.catchall can contain instruction to prevent token renewal) - if 'no_token_renewal' in infosys.queuedata.catchall: + if 'no_token_renewal' in infosys.queuedata.catchall or args.token_renewal is False: logger.info("OIDC token will not be renewed by the pilot") else: update_local_oidc_token_info(args.url, args.port) @@ -441,6 +441,16 @@ def get_args() -> Any: help="Number of getjob requests", ) + # no_token_renewal + arg_parser.add_argument( + "-w", + "--notokenrenewal", + dest="token_renewal", + action="store_false", + default=True, + help="Disable token renewal", + ) + arg_parser.add_argument( "-x", "--getjobfailures", diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 9c92c9ec..89ff1f5f 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -109,7 +109,7 @@ def control(queues: namedtuple, traces: Any, args: object): # noqa: C901 if tokendownloadchecktime and queuedata: if int(time.time() - last_token_check) > tokendownloadchecktime: last_token_check = time.time() - if 'no_token_renewal' in queuedata.catchall: + if 'no_token_renewal' in queuedata.catchall or args.token_renewal is False: logger.info("OIDC token will not be renewed by the pilot") else: update_local_oidc_token_info(args.url, args.port) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 05a3998f..daba270b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '6' # build number should be reset to '1' for every new development cycle +BUILD = '7' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 831cfa4cc92643372d4734d5bfb590d299e847bf Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 28 Oct 2024 18:35:49 +0100 Subject: [PATCH 06/65] Added new option for turning off token renewal --- pilot.py | 20 ++++++++++---------- pilot/util/constants.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pilot.py b/pilot.py index a657ffce..a7e9108c 100755 --- a/pilot.py +++ b/pilot.py @@ -441,16 +441,6 @@ def get_args() -> Any: help="Number of getjob requests", ) - # no_token_renewal - arg_parser.add_argument( - "-w", - "--notokenrenewal", - dest="token_renewal", - action="store_false", - default=True, - help="Disable token renewal", - ) - arg_parser.add_argument( "-x", "--getjobfailures", @@ 
-461,6 +451,16 @@ def get_args() -> Any: help="Maximum number of getjob request failures in Harvester mode", ) + # no_token_renewal + arg_parser.add_argument( + "-y", + "--notokenrenewal", + dest="token_renewal", + action="store_false", + default=True, + help="Disable token renewal", + ) + arg_parser.add_argument( "--subscribe-to-msgsvc", dest="subscribe_to_msgsvc", diff --git a/pilot/util/constants.py b/pilot/util/constants.py index daba270b..d5e9705a 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '7' # build number should be reset to '1' for every new development cycle +BUILD = '8' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 162dc161a8561b30c522c26f9b2c1c004c85044f Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 29 Oct 2024 16:10:34 +0100 Subject: [PATCH 07/65] Now locating correct payload pid for oom score --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 35 +++++++++++++++++++++++------------ pilot/util/psutils.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 14 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 64e22377..c7cd9485 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.6 \ No newline at end of file +3.9.2.13 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index d5e9705a..c4abb169 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '8' # build number should be reset to '1' for every new development cycle +BUILD = '13' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index c35f0714..d1d0e3b2 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -62,7 +62,8 @@ from pilot.util.psutils import ( is_process_running, get_pid, - get_subprocesses + get_subprocesses, + find_actual_payload_pid ) from pilot.util.timing import get_time_since from pilot.util.workernode import ( @@ -136,8 +137,8 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i if exit_code != 0: return exit_code, diagnostics - # display OOM process info (once) - display_oom_info(job.pid) + # update the OOM process info to prevent killing processes in the wrong order in case the job is killed (once) + update_oom_info(job.pid, job.transformation) # should the pilot abort the payload? exit_code, diagnostics = should_abort_payload(current_time, mt) @@ -199,22 +200,32 @@ def still_running(pid): return running -def display_oom_info(payload_pid): +def update_oom_info(bash_pid, payload_cmd): """ - Display OOM process info. + Update OOM process info. - :param payload_pid: payload pid (int). + In case the job is killed, the OOM process info should be updated to prevent killing processes in the wrong order. + It will otherwise lead to lingering processes. 
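As an aside, the oom_score_adj write introduced earlier in this series shells out
via "echo {value} > /proc/{pid}/oom_score_adj". The same effect can be had with a
plain file write, avoiding shell=True; a minimal sketch, with an illustrative
helper name that is not part of the patch:

    import logging

    logger = logging.getLogger(__name__)

    def write_oom_score_adj(pid: int, value: int) -> bool:
        """Write value to /proc/<pid>/oom_score_adj; return True on success."""
        # valid values range from -1000 (never kill) to 1000 (kill first)
        path = f"/proc/{pid}/oom_score_adj"
        try:
            with open(path, "w", encoding="utf-8") as _fd:
                _fd.write(str(value))
        except OSError as exc:  # e.g. EACCES, or ESRCH if the process is gone
            logger.warning(f"could not write {value} to {path}: {exc}")
            return False
        return True

Raising the payload's score relative to the pilot's makes the kernel prefer
killing the payload first, which is the ordering the docstring argues for.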
+ + :param bash_pid: bash chain pid (int) + :param payload_cmd: payload command (string). """ + # use the pid of the bash chain to get the actual payload pid which should be a child process + payload_pid = find_actual_payload_pid(bash_pid, payload_cmd) + if not payload_pid: + return + fname = f"/proc/{payload_pid}/oom_score" + fname_adj = fname + "_adj" payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN' pilot_score = get_score(os.getpid()) - #cmd = "whoami" - #_, stdout, _ = execute(cmd) - #ogger.debug(f"stdout = {stdout}") - #cmd = f"ls -l {fname}" - #_, stdout, _ = execute(cmd) - #ogger.debug(f"stdout = {stdout}") + cmd = "whoami" + _, stdout, _ = execute(cmd) + logger.debug(f"stdout = {stdout}") + cmd = f"ls -l {fname_adj}" + _, stdout, _ = execute(cmd) + logger.debug(f"stdout = {stdout}") if isinstance(pilot_score, str) and pilot_score == 'UNKNOWN': logger.warning(f'could not get oom_score for pilot process: {pilot_score}') diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py index eb70f263..9d8085fa 100644 --- a/pilot/util/psutils.py +++ b/pilot/util/psutils.py @@ -291,3 +291,33 @@ def find_process_by_jobid(jobid: int) -> int or None: return proc.pid return None + + +def find_actual_payload_pid(bash_pid: int, payload_cmd: str) -> int or None: + """ + Find the actual payload PID. + + Identify all subprocesses of the given bash PID and search for the payload command. Return its PID. + + :param bash_pid: bash PID (int) + :param payload_cmd: payload command (partial) (str) + :return: payload PID (int or None). + """ + if not _is_psutil_available: + logger.warning('find_actual_payload_pid(): psutil not available - aborting') + return None + + children = get_subprocesses(bash_pid) + if not children: + logger.warning(f'no children found for bash PID {bash_pid}') + return None + + for pid in reversed(children): # reverse the order since it's probably the last PID + cmd = get_command_by_pid(pid) + logger.debug(f'pid={pid} cmd={cmd}') + if payload_cmd in cmd: + logger.info(f'found payload PID={pid} for bash PID={bash_pid}') + return pid + + logger.warning(f'could not find payload PID for bash PID {bash_pid}') + return None From f48855800c1ed5352db98ca15015faf983f05494 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 30 Oct 2024 10:23:02 +0100 Subject: [PATCH 08/65] Testing oom write --- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 4 +++- pilot/util/psutils.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c4abb169..dc19af83 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '13' # build number should be reset to '1' for every new development cycle +BUILD = '16' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index d1d0e3b2..bbd64286 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -242,13 +242,15 @@ def write_to_oom_score_adj(pid, value): pid: The PID of the process. value: The value to write to the oom_score_adj file. 
""" - command = f"echo {value} > /proc/{pid}/oom_score_adj" try: subprocess.check_call(command, shell=True) logger.info(f"successfully wrote {value} to /proc/{pid}/oom_score_adj") except subprocess.CalledProcessError as e: logger.warning(f"error writing to /proc/{pid}/oom_score_adj: {e}") + ec, stdout, stderr = execute(command) + logger.debug(f"ec = {ec} stdout = {stdout}\nstderr = {stderr}") + _, stdout, _ = execute(f"cat /proc/{pid}/oom_score_adj") def get_score(pid) -> str: diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py index 9d8085fa..ba18e1b1 100644 --- a/pilot/util/psutils.py +++ b/pilot/util/psutils.py @@ -310,9 +310,9 @@ def find_actual_payload_pid(bash_pid: int, payload_cmd: str) -> int or None: children = get_subprocesses(bash_pid) if not children: logger.warning(f'no children found for bash PID {bash_pid}') - return None + return bash_pid - for pid in reversed(children): # reverse the order since it's probably the last PID + for pid in children: cmd = get_command_by_pid(pid) logger.debug(f'pid={pid} cmd={cmd}') if payload_cmd in cmd: From b93f3cce79fd43401affb090375c3b30eb5d2cf0 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 31 Oct 2024 09:13:29 +0100 Subject: [PATCH 09/65] Removed useless arcproxy -i subject --- pilot/user/atlas/proxy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pilot/user/atlas/proxy.py b/pilot/user/atlas/proxy.py index b6defea9..ab397033 100644 --- a/pilot/user/atlas/proxy.py +++ b/pilot/user/atlas/proxy.py @@ -161,9 +161,6 @@ def verify_arcproxy(envsetup: str, limit: int, proxy_id: str = "pilot", test: bo # validityLeft - duration of proxy validity left in seconds. # vomsACvalidityEnd - timestamp when VOMS attribute validity ends. # vomsACvalidityLeft - duration of VOMS attribute validity left in seconds. 
- cmd = f"{envsetup}arcproxy -i subject" - _exit_code, _, _ = execute(cmd, shell=True) # , usecontainer=True, copytool=True) - cmd = f"{envsetup}arcproxy -i validityEnd -i validityLeft -i vomsACvalidityEnd -i vomsACvalidityLeft" _exit_code, stdout, stderr = execute_nothreads(cmd, shell=True) # , usecontainer=True, copytool=True) if stdout is not None: From b1c4549c5ff3819742de9f638c92dd48a3616d77 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 31 Oct 2024 09:26:45 +0100 Subject: [PATCH 10/65] Cleanup --- pilot.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pilot.py b/pilot.py index a7e9108c..67ffd4db 100755 --- a/pilot.py +++ b/pilot.py @@ -182,8 +182,6 @@ def main() -> int: f"pilot.workflow.{args.workflow}", globals(), locals(), [args.workflow], 0 ) - # check if real-time logging is requested for this queue - #rtloggingtype # update the pilot heartbeat file update_pilot_heartbeat(time.time()) From 0b5ff5f3f28f5484467bc1f71ed4cf3964af67fc Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 31 Oct 2024 09:31:15 +0100 Subject: [PATCH 11/65] Added PROXYTOOSHORT --- pilot/common/errorcodes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 123a50ad..76effc16 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -182,6 +182,7 @@ class ErrorCodes: PREEMPTION = 1379 ARCPROXYFAILURE = 1380 ARCPROXYLIBFAILURE = 1381 + PROXYTOOSHORT = 1382 # used at the beginning of the pilot to indicate that the proxy is too short _error_messages = { GENERALERROR: "General pilot error, consult batch log", @@ -326,6 +327,7 @@ class ErrorCodes: PREEMPTION: "Job was preempted", ARCPROXYFAILURE: "General arcproxy failure", ARCPROXYLIBFAILURE: "Arcproxy failure while loading shared libraries", + PROXYTOOSHORT: "Proxy is too short", } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] From dae81f3c332610bca420e5b644ee46e5efd8a9e6 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 31 Oct 2024 09:41:25 +0100 Subject: [PATCH 12/65] Added PROXYTOOSHORT as pilot exit code 80 --- pilot/util/auxiliary.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 7c5e3146..ddcd8547 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -179,6 +179,7 @@ def get_error_code_translation_dictionary() -> dict: """ error_code_translation_dictionary = { -1: [64, "Site offline"], + errors.CVMFSISNOTALIVE: [64, "CVMFS is not responding"], # same exit code as site offline errors.GENERALERROR: [65, "General pilot error, consult batch log"], # added to traces object errors.MKDIR: [66, "Could not create directory"], # added to traces object errors.NOSUCHFILE: [67, "No such file or directory"], # added to traces object @@ -196,7 +197,7 @@ def get_error_code_translation_dictionary() -> dict: errors.MISSINGINPUTFILE: [77, "Missing input file in SE"], # should pilot report this type of error to wrapper? 
errors.PANDAQUEUENOTACTIVE: [78, "PanDA queue is not active"], errors.COMMUNICATIONFAILURE: [79, "PanDA server communication failure"], - errors.CVMFSISNOTALIVE: [64, "CVMFS is not responding"], # same exit code as site offline + errors.PROXYTOOSHORT: [80, "Proxy too short"], # added to traces object errors.KILLSIGNAL: [137, "General kill signal"], # Job terminated by unknown kill signal errors.SIGTERM: [143, "Job killed by signal: SIGTERM"], # 128+15 errors.SIGQUIT: [131, "Job killed by signal: SIGQUIT"], # 128+3 From 15037f9839a8f6d6676e960db64330f419722bf4 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 31 Oct 2024 09:56:50 +0100 Subject: [PATCH 13/65] Implementing and testing proxy too short --- PILOTVERSION | 2 +- pilot/control/job.py | 6 +++--- pilot/user/atlas/proxy.py | 14 ++++++++++++-- pilot/user/generic/proxy.py | 3 ++- pilot/user/rubin/proxy.py | 3 ++- pilot/user/sphenix/proxy.py | 3 ++- pilot/util/constants.py | 2 +- 7 files changed, 23 insertions(+), 10 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c7cd9485..ef95cfb7 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.13 \ No newline at end of file +3.9.2.17 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 506b0ffc..6c8c4c28 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1550,13 +1550,13 @@ def proceed_with_getjob(timefloor: int, starttime: int, jobnumber: int, getjob_r if verify_proxy: userproxy = __import__(f'pilot.user.{pilot_user}.proxy', globals(), locals(), [pilot_user], 0) - # is the proxy still valid? - exit_code, diagnostics = userproxy.verify_proxy(test=False) + # is the proxy still valid? at pilot startup, the proxy lifetime must be at least 72h + exit_code, diagnostics = userproxy.verify_proxy(test=False, pilotstartup=True) if traces.pilot['error_code'] == 0: # careful so we don't overwrite another error code traces.pilot['error_code'] = exit_code if exit_code == errors.ARCPROXYLIBFAILURE: logger.warning("currently ignoring arcproxy library failure") - if exit_code in {errors.NOPROXY, errors.NOVOMSPROXY, errors.CERTIFICATEHASEXPIRED}: + if exit_code in {errors.NOPROXY, errors.NOVOMSPROXY, errors.CERTIFICATEHASEXPIRED, errors.PROXYTOOSHORT}: logger.warning(diagnostics) return False diff --git a/pilot/user/atlas/proxy.py b/pilot/user/atlas/proxy.py index ab397033..20249586 100644 --- a/pilot/user/atlas/proxy.py +++ b/pilot/user/atlas/proxy.py @@ -90,7 +90,7 @@ def get_and_verify_proxy(x509: str, voms_role: str = '', proxy_type: str = '', w return exit_code, diagnostics, x509 -def verify_proxy(limit: int = None, x509: bool = None, proxy_id: str = "pilot", test: bool = False) -> tuple[int, str]: +def verify_proxy(limit: int = None, x509: bool = None, proxy_id: str = "pilot", test: bool = False, pilotstartup: bool = False) -> tuple[int, str]: """ Check for a valid voms/grid proxy longer than N hours. @@ -100,8 +100,11 @@ def verify_proxy(limit: int = None, x509: bool = None, proxy_id: str = "pilot", :param x509: points to the proxy file. If not set (=None) - get proxy file from X509_USER_PROXY environment (bool) :param proxy_id: proxy id (str) :param test: free Boolean test parameter (bool) + :param pilotstartup: free Boolean pilotstartup parameter (bool) :return: exit code (NOPROXY or NOVOMSPROXY) (int), diagnostics (error diagnostics string) (str) (tuple). 
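The checks implemented below compare the remaining proxy lifetime against the
requested limit minus a 20-minute grace period, with the limit forced to 72
hours at pilot startup; schematically (a sketch, not the pilot's actual helper):

    import time

    def proxy_time_ok(validity_end: int, limit_hours: int) -> bool:
        """True if the proxy outlives limit_hours minus a 20-minute grace."""
        seconds_left = validity_end - int(time.time())
        return seconds_left >= limit_hours * 3600 - 20 * 60

    # at pilot startup, proxy_time_ok(validity_end, 72) must hold, otherwise
    # the new PROXYTOOSHORT error (pilot exit code 80) applies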
""" + if pilotstartup: + limit = 72 # 3 days if limit is None: limit = 1 @@ -171,6 +174,8 @@ def verify_arcproxy(envsetup: str, limit: int, proxy_id: str = "pilot", test: bo else: exit_code, diagnostics, validity_end_cert, validity_end = interpret_proxy_info(_exit_code, stdout, stderr, limit) + validity_end = int(time()) + 71 * 3600 # 71 hours test + if proxy_id and validity_end: # setup cache if requested if exit_code == 0: logger.info(f"caching the validity ends from arcproxy: cache[\'{proxy_id}\'] = [{validity_end_cert}, {validity_end}]") @@ -219,7 +224,12 @@ def check_time_left(proxyname: str, validity: int, limit: int) -> tuple[int, str logger.info(f"cache: check {proxyname} validity: wanted={limit}h ({limit * 3600 - 20 * 60}s with grace) " f"left={float(seconds_left) / 3600:.2f}h (now={tnow} validity={validity} left={seconds_left}s)") - if seconds_left < limit * 3600 - 20 * 60: + # special case for limit=72h (3 days) for pilot startup + if limit == 72 and proxyname == "proxy" and seconds_left < limit * 3600 - 20 * 60: + diagnostics = f'proxy is too short for pilot startup: {float(seconds_left) / 3600:.2f}h' + logger.warning(diagnostics) + exit_code = errors.PROXYTOOSHORT + elif seconds_left < limit * 3600 - 20 * 60: diagnostics = f'cert/proxy is about to expire: {float(seconds_left) / 3600:.2f}h' logger.warning(diagnostics) exit_code = errors.CERTIFICATEHASEXPIRED if proxyname == 'cert' else errors.VOMSPROXYABOUTTOEXPIRE diff --git a/pilot/user/generic/proxy.py b/pilot/user/generic/proxy.py index 579f92e0..bc91ae0f 100644 --- a/pilot/user/generic/proxy.py +++ b/pilot/user/generic/proxy.py @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) -def verify_proxy(limit: int = None, x509: str = None, proxy_id: str = "pilot", test: bool = False) -> (int, str): +def verify_proxy(limit: int = None, x509: str = None, proxy_id: str = "pilot", test: bool = False, pilotstartup: bool = False) -> (int, str): """ Check for a valid voms/grid proxy longer than N hours. @@ -37,6 +37,7 @@ def verify_proxy(limit: int = None, x509: str = None, proxy_id: str = "pilot", t :param x509: points to the proxy file. If not set (=None) - get proxy file from X509_USER_PROXY environment (str) :param proxy_id: proxy id (str) :param test: free Boolean test parameter (bool) + :param pilotstartup: free Boolean pilotstartup parameter (bool) :return: exit code (NOPROXY or NOVOMSPROXY), diagnostics (error diagnostics string) (int, str). """ if limit or x509 or proxy_id or test: # to bypass pylint score 0 diff --git a/pilot/user/rubin/proxy.py b/pilot/user/rubin/proxy.py index 13662df0..52426148 100644 --- a/pilot/user/rubin/proxy.py +++ b/pilot/user/rubin/proxy.py @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) -def verify_proxy(limit: int = None, x509: str = None, proxy_id: str = "pilot", test: bool = False) -> (int, str): +def verify_proxy(limit: int = None, x509: str = None, proxy_id: str = "pilot", test: bool = False, pilotstartup: bool = False) -> (int, str): """ Check for a valid voms/grid proxy longer than N hours. @@ -37,6 +37,7 @@ def verify_proxy(limit: int = None, x509: str = None, proxy_id: str = "pilot", t :param x509: points to the proxy file. If not set (=None) - get proxy file from X509_USER_PROXY environment (str) :param proxy_id: proxy id (str) :param test: free Boolean test parameter (bool) + :param pilotstartup: free Boolean pilotstartup parameter (bool) :return: exit code (NOPROXY or NOVOMSPROXY), diagnostics (error diagnostics string) (int, str). 
""" if limit or x509 or proxy_id or test: # to bypass pylint score 0 diff --git a/pilot/user/sphenix/proxy.py b/pilot/user/sphenix/proxy.py index 187f4d80..83fcea49 100644 --- a/pilot/user/sphenix/proxy.py +++ b/pilot/user/sphenix/proxy.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) -def verify_proxy(limit: int = None, x509: bool = None, proxy_id: str = "pilot", test: bool = False) -> (int, str): +def verify_proxy(limit: int = None, x509: bool = None, proxy_id: str = "pilot", test: bool = False, pilotstartup: bool = False) -> (int, str): """ Check for a valid voms/grid proxy longer than N hours. Use `limit` to set required time limit. @@ -37,6 +37,7 @@ def verify_proxy(limit: int = None, x509: bool = None, proxy_id: str = "pilot", :param x509: points to the proxy file. If not set (=None) - get proxy file from X509_USER_PROXY environment (bool) :param proxy_id: proxy id (str) :param test: free Boolean test parameter (bool) + :param pilotstartup: free Boolean pilotstartup parameter (bool) :return: exit code (NOPROXY or NOVOMSPROXY) (int), diagnostics (error diagnostics string) (str). """ if limit or x509 or proxy_id or test: # to bypass pylint score 0 diff --git a/pilot/util/constants.py b/pilot/util/constants.py index dc19af83..7772d928 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '16' # build number should be reset to '1' for every new development cycle +BUILD = '17' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 5ab7204b85c26120236a8b956ab15f3168d5c1b1 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 31 Oct 2024 10:16:32 +0100 Subject: [PATCH 14/65] Removed test --- pilot/user/atlas/proxy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pilot/user/atlas/proxy.py b/pilot/user/atlas/proxy.py index 20249586..19a47fa6 100644 --- a/pilot/user/atlas/proxy.py +++ b/pilot/user/atlas/proxy.py @@ -173,8 +173,7 @@ def verify_arcproxy(envsetup: str, limit: int, proxy_id: str = "pilot", test: bo exit_code = -1 else: exit_code, diagnostics, validity_end_cert, validity_end = interpret_proxy_info(_exit_code, stdout, stderr, limit) - - validity_end = int(time()) + 71 * 3600 # 71 hours test + # validity_end = int(time()) + 71 * 3600 # 71 hours test if proxy_id and validity_end: # setup cache if requested if exit_code == 0: From d89faa3718a9c6a071859695f3319ed24097d13b Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 31 Oct 2024 10:29:30 +0100 Subject: [PATCH 15/65] Updated build number --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ef95cfb7..06cb46f1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.17 \ No newline at end of file +3.9.2.19 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 7772d928..48baa66b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new 
version release, increased for small updates -BUILD = '17' # build number should be reset to '1' for every new development cycle +BUILD = '19' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 5ece001678476f25e57cb16bb15b90611e33fc7c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 31 Oct 2024 14:22:56 +0100 Subject: [PATCH 16/65] Removed proxy name check --- pilot/user/atlas/proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/user/atlas/proxy.py b/pilot/user/atlas/proxy.py index 19a47fa6..7b0295e5 100644 --- a/pilot/user/atlas/proxy.py +++ b/pilot/user/atlas/proxy.py @@ -224,7 +224,7 @@ def check_time_left(proxyname: str, validity: int, limit: int) -> tuple[int, str f"left={float(seconds_left) / 3600:.2f}h (now={tnow} validity={validity} left={seconds_left}s)") # special case for limit=72h (3 days) for pilot startup - if limit == 72 and proxyname == "proxy" and seconds_left < limit * 3600 - 20 * 60: + if limit == 72 and seconds_left < limit * 3600 - 20 * 60: diagnostics = f'proxy is too short for pilot startup: {float(seconds_left) / 3600:.2f}h' logger.warning(diagnostics) exit_code = errors.PROXYTOOSHORT From 3d499d860efc394b8f06813dea766b4e6d157aa0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 1 Nov 2024 15:35:10 +0100 Subject: [PATCH 17/65] Pylint updates --- pilot/util/timer.py | 66 ++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/pilot/util/timer.py b/pilot/util/timer.py index 22e28ee9..841d953b 100644 --- a/pilot/util/timer.py +++ b/pilot/util/timer.py @@ -18,7 +18,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-24 """ Standalone implementation of time-out check on function call. @@ -43,32 +43,32 @@ from pilot.util.auxiliary import TimeoutException -class TimedThread(object): +class TimedThread: """ Thread-based Timer implementation (`threading` module) (shared memory space, GIL limitations, no way to kill thread, Windows compatible) """ - def __init__(self, timeout): + def __init__(self, _timeout): """ :param timeout: timeout value for operation in seconds. 
""" - self.timeout = timeout + self.timeout = _timeout self.is_timeout = False def execute(self, func, args, kwargs): try: ret = (True, func(*args, **kwargs)) - except Exception: + except (TypeError, ValueError, AttributeError, KeyError): ret = (False, sys.exc_info()) self.result = ret return ret - def run(self, func, args, kwargs, timeout=None): + def run(self, func, args, kwargs, _timeout=None): """ :raise: TimeoutException if timeout value is reached before function finished """ @@ -78,30 +78,30 @@ def run(self, func, args, kwargs, timeout=None): thread.start() - timeout = timeout if timeout is not None else self.timeout + _timeout = _timeout if _timeout is not None else self.timeout try: - thread.join(timeout) - except Exception as exc: + thread.join(_timeout) + except (RuntimeError, KeyboardInterrupt) as exc: print(f'exception caught while joining timer thread: {exc}') if thread.is_alive(): self.is_timeout = True - raise TimeoutException("Timeout reached", timeout=timeout) + raise TimeoutException("Timeout reached", timeout=_timeout) ret = self.result if ret[0]: return ret[1] - else: - try: - _r = ret[1][0](ret[1][1]).with_traceback(ret[1][2]) - except AttributeError: - exec("raise ret[1][0], ret[1][1], ret[1][2]") - raise _r + + try: + _r = ret[1][0](ret[1][1]).with_traceback(ret[1][2]) + except AttributeError: + exec("raise ret[1][0], ret[1][1], ret[1][2]") + raise _r -class TimedProcess(object): +class TimedProcess: """ Process-based Timer implementation (`multiprocessing` module). Uses shared Queue to keep result. (completely isolated memory space) @@ -110,22 +110,22 @@ class TimedProcess(object): Traceback data is printed to stderr """ - def __init__(self, timeout): + def __init__(self, _timeout): """ - :param timeout: timeout value for operation in seconds. + :param _timeout: timeout value for operation in seconds. """ - self.timeout = timeout + self.timeout = _timeout self.is_timeout = False - def run(self, func, args, kwargs, timeout=None): + def run(self, func, args, kwargs, _timeout=None): def _execute(func, args, kwargs, queue): try: ret = func(*args, **kwargs) queue.put((True, ret)) - except Exception as e: - print('Exception occurred while executing %s' % func, file=sys.stderr) + except (TypeError, ValueError, AttributeError, KeyError) as e: + print(f'exception occurred while executing {func}', file=sys.stderr) traceback.print_exc(file=sys.stderr) queue.put((False, e)) @@ -137,14 +137,14 @@ def _execute(func, args, kwargs, queue): process.daemon = True process.start() - timeout = timeout if timeout is not None else self.timeout + _timeout = _timeout if _timeout is not None else self.timeout try: - ret = queue.get(block=True, timeout=timeout) - except Empty: + ret = queue.get(block=True, timeout=_timeout) + except Empty as exc: self.is_timeout = True process.terminate() - raise TimeoutException("Timeout reached", timeout=timeout) + raise TimeoutException("Timeout reached", timeout=_timeout) from exc finally: while process.is_alive(): process.join(1) @@ -158,20 +158,20 @@ def _execute(func, args, kwargs, queue): if ret[0]: return ret[1] - else: - raise ret[1] + raise ret[1] Timer = TimedProcess -def timeout(seconds, timer=None): +def timeout(seconds: int, timer: Timer = None): """ Decorator for a function which causes it to timeout (stop execution) once passed given number of seconds. 
- :param timer: timer class (by default is Timer) - :raise: TimeoutException in case of timeout interrupt - """ + :param seconds: timeout value in seconds (int) + :param timer: timer class (None or Timer) + :raise: TimeoutException in case of timeout interrupt. + """ timer = timer or Timer def decorate(function): From 5a02856a0f853fc51fae7408abb4bb56cad82cb6 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 1 Nov 2024 15:42:17 +0100 Subject: [PATCH 18/65] Pylint updates --- pilot/util/parameters.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pilot/util/parameters.py b/pilot/util/parameters.py index 46f47931..614992be 100644 --- a/pilot/util/parameters.py +++ b/pilot/util/parameters.py @@ -17,15 +17,18 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 # This module contains functions that are used with the get_parameters() function defined in the information module. # WARNING: IN GENERAL, NEEDS TO USE PLUG-IN MANAGER +import logging +from typing import Any + from pilot.info import infosys +from pilot.util.config import config -import logging logger = logging.getLogger(__name__) @@ -40,7 +43,6 @@ def get_maximum_input_sizes(): try: _maxinputsizes = infosys.queuedata.maxwdir # normally 14336+2000 MB except TypeError as exc: - from pilot.util.config import config _maxinputsizes = config.Pilot.maximum_input_file_sizes # MB logger.warning(f'could not convert schedconfig value for maxwdir: {exc} (will use default value instead - {_maxinputsizes})') @@ -49,25 +51,24 @@ def get_maximum_input_sizes(): try: _maxinputsizes = int(_maxinputsizes) - except Exception as exc: + except (ValueError, TypeError) as exc: _maxinputsizes = 14336 + 2000 logger.warning(f'failed to convert maxinputsizes to int: {exc} (using value: {_maxinputsizes} MB)') return _maxinputsizes -def convert_to_int(parameter, default=None): +def convert_to_int(parameter: Any, default: Any = None) -> Any: """ Try to convert a given parameter to an integer value. + The default parameter can be used to force the function to always return a given value in case the integer conversion, int(parameter), fails. - :param parameter: parameter (any type). - :param default: None by default (if set, always return an integer; the given value will be returned if - conversion to integer fails). - :return: converted integer. + :param parameter: parameter (Any) + :param default: None by default (Any) + :return: converted integer (Any). """ - try: value = int(parameter) except (ValueError, TypeError): From f49520176ae7c2a934025b9febb67640127535aa Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 1 Nov 2024 15:44:32 +0100 Subject: [PATCH 19/65] Pylint updates --- pilot/user/sphenix/monitoring.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pilot/user/sphenix/monitoring.py b/pilot/user/sphenix/monitoring.py index b42e5917..8ef2750d 100644 --- a/pilot/user/sphenix/monitoring.py +++ b/pilot/user/sphenix/monitoring.py @@ -17,18 +17,20 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-24 -from typing import Any +from pilot.info.jobdata import JobData -def fast_monitor_tasks(job: Any) -> int: +def fast_monitor_tasks(job: JobData) -> int: """ Perform fast monitoring tasks. - :param job: job object (Any) + :param job: job object (JobData) :return: exit code (int). 
""" + if not job: # to bypass pylint warning + pass exit_code = 0 return exit_code From fc76bcec9dff48d29cc8de6a4d55a4e603c293ef Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 1 Nov 2024 15:47:15 +0100 Subject: [PATCH 20/65] Pylint updates --- pilot/user/sphenix/cpu.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pilot/user/sphenix/cpu.py b/pilot/user/sphenix/cpu.py index 2126ab0b..5da6c85e 100644 --- a/pilot/user/sphenix/cpu.py +++ b/pilot/user/sphenix/cpu.py @@ -17,22 +17,26 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 -from typing import Any +import logging + +from pilot.info.jobdata import JobData from pilot.util.container import execute -import logging logger = logging.getLogger(__name__) -def get_core_count(job: Any) -> int: +def get_core_count(job: JobData) -> int: """ Return the core count. - :param job: job object (Any) + :param job: job object (JobData) :return: core count (int). """ + if not job: # to bypass pylint warning + pass + return 0 @@ -46,6 +50,7 @@ def add_core_count(corecount: int, core_counts: list = None) -> list: """ if core_counts is None: core_counts = [] + return core_counts.append(corecount) @@ -58,11 +63,11 @@ def set_core_counts(**kwargs): job = kwargs.get('job', None) if job and job.pgrp: cmd = f"ps axo pgid,psr | sort | grep {job.pgrp} | uniq | awk '{{print $1}}' | grep -x {job.pgrp} | wc -l" - exit_code, stdout, stderr = execute(cmd, mute=True) + _, stdout, _ = execute(cmd, mute=True) logger.debug(f'{cmd}: {stdout}') try: job.actualcorecount = int(stdout) - except Exception as e: + except (ValueError, TypeError) as e: logger.warning(f'failed to convert number of actual cores to int: {e}') else: logger.debug(f'set number of actual cores to: {job.actualcorecount}') From cdfc37f0d6f04e7c689a8cc59653e82a7c066610 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 1 Nov 2024 15:49:35 +0100 Subject: [PATCH 21/65] Pylint updates --- pilot/user/rubin/copytool_definitions.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pilot/user/rubin/copytool_definitions.py b/pilot/user/rubin/copytool_definitions.py index 7a9ef40d..163f2a33 100644 --- a/pilot/user/rubin/copytool_definitions.py +++ b/pilot/user/rubin/copytool_definitions.py @@ -17,34 +17,33 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2022-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2022-24 from hashlib import md5 -def mv_to_final_destination(): +def mv_to_final_destination() -> bool: """ - Is mv allowed to move files to/from final destination? + Decide if mv is allowed to move files to/from final destination. - :return: Boolean. + :return: True if mv is allowed to move files to final destination, False otherwise (bool). """ - return False -def get_path(scope, lfn): +def get_path(scope: str, lfn: str) -> str: """ - Construct a partial Rucio PFN using the scope and the LFN + Construct a partial Rucio PFN using the scope and the LFN. + /md5(:)[0:2]/md5()[2:4]/ E.g. scope = 'user.jwebb2', lfn = 'user.jwebb2.66999._000001.top1outDS.tar' -> 'user/jwebb2/01/9f/user.jwebb2.66999._000001.top1outDS.tar' - :param scope: scope (string). - :param lfn: LFN (string). - :return: partial rucio path (string). + :param scope: scope (str) + :param lfn: LFN (str) + :return: partial rucio path (str). 
""" - s = f'{scope}:{lfn}' hash_hex = md5(s.encode('utf-8')).hexdigest() paths = scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] From ca8683d1b4010813d46d253f556f3ea9f32f9f51 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 1 Nov 2024 15:52:57 +0100 Subject: [PATCH 22/65] Pylint updates --- pilot/user/rubin/diagnose.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pilot/user/rubin/diagnose.py b/pilot/user/rubin/diagnose.py index 34844259..0b44d51f 100644 --- a/pilot/user/rubin/diagnose.py +++ b/pilot/user/rubin/diagnose.py @@ -17,26 +17,26 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-24 # - Tadashi Maeno, tadashi.maeno@cern.ch, 2020 +import logging import os +from pilot.info.jobdata import JobData from pilot.util.config import config from pilot.util.filehandling import read_file, tail -import logging logger = logging.getLogger(__name__) -def interpret(job): +def interpret(job: JobData) -> int: """ Interpret the payload, look for specific errors in the stdout. - :param job: job object + :param job: job object (JobData) :return: exit code (payload) (int). """ - stdout = os.path.join(job.workdir, config.Payload.payloadstdout) if os.path.exists(stdout): message = 'payload stdout dump\n' @@ -55,16 +55,16 @@ def interpret(job): return 0 -def get_log_extracts(job, state): +def get_log_extracts(job: JobData, state: str) -> str: """ Extract special warnings and other info from special logs. + This function also discovers if the payload had any outbound connections. - :param job: job object. - :param state: job state (string). - :return: log extracts (string). + :param job: job object (JobData) + :param state: job state (str) + :return: log extracts (str). """ - logger.info("building log extracts (sent to the server as \'pilotLog\')") # for failed/holding jobs, add extracts from the pilot log file, but always add it to the pilot log itself @@ -72,20 +72,19 @@ def get_log_extracts(job, state): _extracts = get_pilot_log_extracts(job) if _extracts != "": logger.warning(f'detected the following tail of warning/fatal messages in the pilot log:\n{_extracts}') - if state == 'failed' or state == 'holding': + if state in {'failed', 'holding'}: extracts += _extracts return extracts -def get_pilot_log_extracts(job): +def get_pilot_log_extracts(job: JobData) -> str: """ Get the extracts from the pilot log (warning/fatal messages, as well as tail of the log itself). - :param job: job object. - :return: tail of pilot log (string). + :param job: job object (JobData) + :return: tail of pilot log (str). """ - extracts = "" path = os.path.join(job.workdir, config.Pilot.pilotlog) From e20a831a3714fd7492e2d9df108fdc1346b5411f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 1 Nov 2024 15:55:52 +0100 Subject: [PATCH 23/65] Pylint updates --- pilot/user/rubin/loopingjob_definitions.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pilot/user/rubin/loopingjob_definitions.py b/pilot/user/rubin/loopingjob_definitions.py index 664b1744..2bada33a 100644 --- a/pilot/user/rubin/loopingjob_definitions.py +++ b/pilot/user/rubin/loopingjob_definitions.py @@ -20,29 +20,30 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 -def allow_loopingjob_detection(): +def allow_loopingjob_detection() -> bool: """ - Should the looping job detection algorithm be allowed? + Decide if the looping job detection algorithm should be allowed. 
+ The looping job detection algorithm finds recently touched files within the job's workdir. If a found file has not been touched during the allowed time limit (see looping job section in util/default.cfg), the algorithm will kill the job/payload process. - :return: boolean. + :return: True if yes (bool). """ - return True -def remove_unwanted_files(workdir, files): +def remove_unwanted_files(workdir: str, files: list) -> list: """ Remove files from the list that are to be ignored by the looping job algorithm. - :param workdir: working directory (string). Needed in case the find command includes the workdir in the list of - recently touched files. - :param files: list of recently touched files (file names). - :return: filtered files list. - """ + The workdir is needed in case the find command includes the workdir + in the list of recently touched files. + :param workdir: working directory (str) + :param files: list of recently touched files (list) + :return: filtered files (list). + """ _files = [] for _file in files: if not (workdir == _file or From f60beed010673368fa78a907451ab6c5cda80d59 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 4 Nov 2024 17:47:36 +0100 Subject: [PATCH 24/65] Pylint updates --- pilot/user/rubin/esprocessfinegrainedproc.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pilot/user/rubin/esprocessfinegrainedproc.py b/pilot/user/rubin/esprocessfinegrainedproc.py index 9a90ed3d..838593fd 100644 --- a/pilot/user/rubin/esprocessfinegrainedproc.py +++ b/pilot/user/rubin/esprocessfinegrainedproc.py @@ -60,17 +60,17 @@ def __init__(self, max_workers=None, thread_name_prefix='', initializer=None, in self.outputs = {} self._lock = threading.RLock() self.max_workers = max_workers - super(ESRunnerThreadPool, self).__init__(max_workers=max_workers, - thread_name_prefix=thread_name_prefix, - initializer=initializer, - initargs=initargs) + super().__init__(max_workers=max_workers, + thread_name_prefix=thread_name_prefix, + initializer=initializer, + initargs=initargs) def submit(self, fn, *args, **kwargs): - future = super(ESRunnerThreadPool, self).submit(fn, *args, **kwargs) + future = super().submit(fn, *args, **kwargs) return future def run_event(self, fn, event): - future = super(ESRunnerThreadPool, self).submit(fn, event) + future = super().submit(fn, event) with self._lock: self.futures[event['eventRangeID']] = {'event': event, 'future': future} From cbdced94c55f8106b2afb6f97e25c11bcc2ed557 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 6 Nov 2024 11:29:16 +0100 Subject: [PATCH 25/65] IPv4 support in request2() --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/https.py | 106 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 06cb46f1..51c9881a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.19 \ No newline at end of file +3.9.2.22 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 48baa66b..6b9bd5c3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '19' # build number should be reset to '1' for every new development cycle +BUILD = '22' # build 
number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/https.py b/pilot/util/https.py index 70bcacc8..1f5bef31 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -791,6 +791,14 @@ def get_auth_token_content(auth_token: str, key: bool = False) -> str: return auth_token_content +class IPv4HTTPHandler(urllib.request.HTTPHandler): + def http_open(self, req): + return self.do_open(self._create_connection, req) + + def _create_connection(self, host, port=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): + return socket.create_connection((host, port), timeout, source_address, family=socket.AF_INET) + + def request2(url: str = "", data: dict = None, secure: bool = True, @@ -799,6 +807,104 @@ def request2(url: str = "", """ Send a request using HTTPS (using urllib module). + :param url: the URL of the resource (str) + :param data: data to send (dict) + :param secure: use secure connection (bool) + :param compressed: compress data (bool) + :param panda: True for panda server interactions (bool) + :return: server response (str or dict). + """ + if data is None: + data = {} + + ipv = os.environ.get("PILOT_IP_VERSION") + + # https might not have been set up if running in a [middleware] container + if not _ctx.cacert: + https_setup(None, get_pilot_version()) + + # should tokens be used? + auth_token, auth_origin = get_local_oidc_token_info() + use_oidc_token = auth_token and auth_origin and panda + auth_token_content = get_auth_token_content(auth_token) if use_oidc_token else "" + if not auth_token_content and use_oidc_token: + logger.warning('OIDC_AUTH_TOKEN/PANDA_AUTH_TOKEN content could not be read') + return "" + + # get the relevant headers + headers = get_headers(use_oidc_token, auth_token_content, auth_origin) + logger.info(f'headers = {hide_token(headers.copy())}') + logger.info(f'data = {data}') + + # encode data as compressed JSON + if compressed: + rdata_out = BytesIO() + with GzipFile(fileobj=rdata_out, mode="w") as f_gzip: + f_gzip.write(json.dumps(data).encode()) + data_json = rdata_out.getvalue() + else: + data_json = json.dumps(data).encode('utf-8') + + # set up the request + req = urllib.request.Request(url, data_json, headers=headers) + + # create a context with certificate verification + ssl_context = get_ssl_context() + #ssl_context.verify_mode = ssl.CERT_REQUIRED + ssl_context.load_cert_chain(certfile=_ctx.cacert, keyfile=_ctx.cacert) + + if not secure: + ssl_context.verify_mode = False + ssl_context.check_hostname = False + + if ipv == 'IPv4': + logger.info("will use IPv4 in server communication") + opener = urllib.request.build_opener(IPv4HTTPHandler()) + urllib.request.install_opener(opener) + else: + logger.info("will use IPv6 in server communication") + + # ssl_context = ssl.create_default_context(capath=_ctx.capath, cafile=_ctx.cacert) + # Send the request securely + try: + logger.debug('sending data to server') + with urllib.request.urlopen(req, context=ssl_context, timeout=config.Pilot.http_maxtime) as response: + # Handle the response here + logger.debug(f"response.status={response.status}, response.reason={response.reason}") + ret = response.read().decode('utf-8') + if 'getProxy' not in url: + logger.debug(f"response={ret}") + logger.debug('sent request to server') + except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as exc: + logger.warning(f'failed to send request: {exc}') + ret = "" + else: + if secure and isinstance(ret, str): + if ret == 'Succeeded': # this happens 
for sending modeOn (debug mode) + ret = {'StatusCode': '0'} + elif ret.startswith('{') and ret.endswith('}'): + try: + ret = json.loads(ret) + except json.JSONDecodeError as e: + logger.warning(f'failed to parse response: {e}') + else: # response="StatusCode=_some number_" + # Parse the query string into a dictionary + query_dict = parse_qs(ret) + + # Convert lists to single values + ret = {k: v[0] if len(v) == 1 else v for k, v in query_dict.items()} + + return ret + + +def request2_old(url: str = "", + data: dict = None, + secure: bool = True, + compressed: bool = True, + panda: bool = False) -> str or dict: + """ + Send a request using HTTPS (using urllib module). + :param url: the URL of the resource (str) :param data: data to send (dict) :param secure: use secure connection (bool) From b9825c4765832538a084398e5d9304419d488012 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 7 Nov 2024 11:12:52 +0100 Subject: [PATCH 26/65] Added support for http_proxy --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/https.py | 25 +++++++++++++++++++++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 51c9881a..2fb403f4 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.22 \ No newline at end of file +3.9.2.23b \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6b9bd5c3..1334b93b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '22' # build number should be reset to '1' for every new development cycle +BUILD = '23' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/https.py b/pilot/util/https.py index 1f5bef31..9b483e9a 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -859,8 +859,7 @@ def request2(url: str = "", if ipv == 'IPv4': logger.info("will use IPv4 in server communication") - opener = urllib.request.build_opener(IPv4HTTPHandler()) - urllib.request.install_opener(opener) + install_ipv4_opener() else: logger.info("will use IPv6 in server communication") @@ -870,10 +869,10 @@ def request2(url: str = "", logger.debug('sending data to server') with urllib.request.urlopen(req, context=ssl_context, timeout=config.Pilot.http_maxtime) as response: # Handle the response here - logger.debug(f"response.status={response.status}, response.reason={response.reason}") + logger.info(f"response.status={response.status}, response.reason={response.reason}") ret = response.read().decode('utf-8') if 'getProxy' not in url: - logger.debug(f"response={ret}") + logger.info(f"response={ret}") logger.debug('sent request to server') except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as exc: logger.warning(f'failed to send request: {exc}') @@ -897,6 +896,24 @@ def request2(url: str = "", return ret +def install_ipv4_opener(): + """Install the IPv4 opener.""" + http_proxy = os.environ.get("http_proxy") + all_proxy = os.environ.get("all_proxy") + if http_proxy and all_proxy: + logger.info(f"using http_proxy={http_proxy}, all_proxy={all_proxy}") + proxy_handler = urllib.request.ProxyHandler({ + 'http': http_proxy, + 'https': http_proxy, + 'all': all_proxy + }) + opener = 
urllib.request.build_opener(proxy_handler, IPv4HTTPHandler()) + else: + logger.info("no http_proxy found, will use IPv4 without proxy") + opener = urllib.request.build_opener(IPv4HTTPHandler()) + urllib.request.install_opener(opener) + + def request2_old(url: str = "", data: dict = None, secure: bool = True, From 87123d3bc436b8cd71d64bb1cb68c56228b0616b Mon Sep 17 00:00:00 2001 From: Alexey Anisenkov Date: Fri, 8 Nov 2024 19:19:02 +0700 Subject: [PATCH 27/65] enable altstageout for unified queues; nucleus use-case should be checked and verified --- pilot/api/data.py | 6 ++++-- pilot/control/data.py | 22 ++++++++++++++-------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 93cc76f1..c1649af6 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -1121,11 +1121,13 @@ def prepare_destinations(self, files: list, activities: list or str) -> list: " .. will use default ddm=%s as (local) destination; ddm_alt=%s", activity, e.lfn, ddm, ddm_alt) e.ddmendpoint = ddm e.ddmendpoint_alt = ddm_alt - elif e.ddmendpoint not in storages: # fspec.ddmendpoint is not in associated storages => assume it as final (non local) alternative destination + #elif e.ddmendpoint not in storages and is_unified: ## customize nucleus logic if need + # pass + elif e.ddmendpoint not in storages: # fspec.ddmendpoint is not in associated storages => use it as (non local) alternative destination self.logger.info("[prepare_destinations][%s]: Requested fspec.ddmendpoint=%s is not in the list of allowed (local) destinations" " .. will consider default ddm=%s for transfer and tag %s as alt. location", activity, e.ddmendpoint, ddm, e.ddmendpoint) e.ddmendpoint_alt = e.ddmendpoint # verify me - e.ddmendpoint = ddm + e.ddmendpoint = ddm # check/verify nucleus case else: # set corresponding ddmendpoint_alt if exist (next entry in available storages list) cur = storages.index(e.ddmendpoint) ddm_next = storages[cur + 1] if (cur + 1) < len(storages) else storages[0] # cycle storages, take the first elem when reach end diff --git a/pilot/control/data.py b/pilot/control/data.py index c3d1af73..0f916a51 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -917,19 +917,21 @@ def _do_stageout(job: JobData, args: object, xdata: list, activity: list, title: kwargs = {'workdir': job.workdir, 'cwd': job.workdir, 'usecontainer': False, 'job': job, 'output_dir': args.output_dir, 'catchall': job.infosys.queuedata.catchall, 'rucio_host': args.rucio_host} #, mode='stage-out') - is_unified = job.infosys.queuedata.type == 'unified' + #is_unified = job.infosys.queuedata.type == 'unified' # prod analy unification: use destination preferences from PanDA server for unified queues - if not is_unified: - client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) + #if not is_unified: + # client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) - altstageout = not is_unified and job.allow_altstageout() # do not use alt stage-out for unified queues + client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) + + altstageout = job.allow_altstageout() client.transfer(xdata, activity, raise_exception=not altstageout, **kwargs) remain_files = [entry for entry in xdata if entry.require_transfer()] # check if alt stageout can be applied (all remain 
files must have alt storage declared ddmendpoint_alt) has_altstorage = all(entry.ddmendpoint_alt and entry.ddmendpoint != entry.ddmendpoint_alt for entry in remain_files) - logger.info('alt stage-out settings: %s, is_unified=%s, altstageout=%s, remain_files=%s, has_altstorage=%s', - activity, is_unified, altstageout, len(remain_files), has_altstorage) + logger.info('alt stage-out settings: %s, allow_altstageout=%s, remain_files=%s, has_altstorage=%s', + activity, altstageout, len(remain_files), has_altstorage) if altstageout and remain_files and has_altstorage: # apply alternative stageout for failed transfers for entry in remain_files: @@ -992,8 +994,12 @@ def _stage_out_new(job: JobData, args: object) -> bool: logger.info('this job does not have any output files, only stage-out log file') job.stageout = 'log' + is_unified = job.infosys.queuedata.type == 'unified' + is_analysis = job.is_analysis() + activities = ['write_lan_analysis', 'write_lan', 'w'] if is_unified and is_analysis else ['write_lan', 'w'] + if job.stageout != 'log': ## do stage-out output files - if not _do_stageout(job, args, job.outdata, ['pw', 'w'], title='output', + if not _do_stageout(job, args, job.outdata, activities, title='output', ipv=args.internet_protocol_version): is_success = False logger.warning('transfer of output file(s) failed') @@ -1037,7 +1043,7 @@ def _stage_out_new(job: JobData, args: object) -> bool: # write time stamps to pilot timing file add_to_pilot_timing(job.jobid, PILOT_POST_LOG_TAR, time.time(), args) - if not _do_stageout(job, args, [logfile], ['pl', 'pw', 'w'], title='log', + if not _do_stageout(job, args, [logfile], ['pl'] + activities, title='log', ipv=args.internet_protocol_version): is_success = False logger.warning('log transfer failed') From 024cd70f2a41119346577f454f6d6804d488e254 Mon Sep 17 00:00:00 2001 From: Alexey Anisenkov Date: Thu, 14 Nov 2024 17:06:32 +0700 Subject: [PATCH 28/65] modify altstage out logic (exclude nucleus) --- pilot/api/data.py | 45 ++++++++++++++++++++++++++++--------------- pilot/control/data.py | 3 ++- pilot/info/jobdata.py | 5 +++-- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index c1649af6..dc0e8764 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -20,7 +20,7 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 -# - Alexey Anisenkov, anisyonk@cern.ch, 2018-2019 +# - Alexey Anisenkov, anisyonk@cern.ch, 2018-2024 """API for data transfers.""" @@ -1072,7 +1072,7 @@ class StageOutClient(StagingClient): mode = "stage-out" - def prepare_destinations(self, files: list, activities: list or str) -> list: + def prepare_destinations(self, files: list, activities: list or str, alt_exclude: list = []) -> list: """ Resolve destination RSE (filespec.ddmendpoint) for each entry from `files` according to requested `activities`. @@ -1080,6 +1080,7 @@ def prepare_destinations(self, files: list, activities: list or str) -> list: :param files: list of FileSpec objects to be processed (list) :param activities: ordered list of activities to be used to resolve astorages (list or str) + :param alt_exclude: global list of destinations that should be excluded / not used for alternative stage-out :return: updated fspec entries (list). 
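"""

For reference, the hunk below replaces the simple "take the next entry" choice with a cyclic lookup that can skip excluded storages. A minimal standalone sketch of that selection logic, assuming an illustrative storage list (the sample RSE names are invented, not from the patch):

def resolve_alt_destination(storages: list, primary: str, exclude: list = None) -> str or None:
    # walk the storage list circularly, starting after `primary`, and return
    # the first entry that is neither `primary` nor in the exclude list
    if not storages:
        return None
    excluded = set([primary] + (exclude or []))
    start = storages.index(primary) if primary in storages else 0
    for step in range(1, len(storages) + 1):
        candidate = storages[(start + step) % len(storages)]
        if candidate not in excluded:
            return candidate
    return None

# e.g. with 'RSE_B' excluded (say, the nucleus), 'RSE_C' is chosen as alternative to 'RSE_A'
print(resolve_alt_destination(['RSE_A', 'RSE_B', 'RSE_C'], 'RSE_A', exclude=['RSE_B']))  # -> RSE_C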
""" if not self.infosys.queuedata: # infosys is not initialized: not able to fix destination if need, nothing to do @@ -1108,11 +1109,26 @@ def prepare_destinations(self, files: list, activities: list or str) -> list: raise PilotException(f"Failed to resolve destination: no associated storages defined for activity={activity} ({act})", code=ErrorCodes.NOSTORAGE, state='NO_ASTORAGES_DEFINED') - # take the fist choice for now, extend the logic later if need - ddm = storages[0] - ddm_alt = storages[1] if len(storages) > 1 else None + def resolve_alt_destination(primary, exclude=None): + """ resolve alt destination as the next to primary entry not equal to `primary` and `exclude` """ + + cur = storages.index(primary) if primary in storages else 0 + inext = (cur + 1) % len(storages) # cycle storages, take the first elem when reach end + exclude = set([primary] + list(exclude if exclude is not None else [])) + alt = None + for attempt in range(len(exclude) or 1): # apply several tries to jump exclude entries (in case of dublicated data will stack) + inext = (cur + 1) % len(storages) # cycle storages, start from the beginning when reach end + if storages[inext] not in exclude: + alt = storages[inext] + break + cur += 1 + return alt + + # default destination + ddm = storages[0] # take the fist choice for now, extend the logic later if need + ddm_alt = resolve_alt_destination(ddm, exclude=alt_exclude) - self.logger.info(f"[prepare_destinations][{activity}]: allowed (local) destinations: {storages}") + self.logger.info(f"[prepare_destinations][{activity}]: allowed (local) destinations: {storages}, alt_exclude={alt_exclude}") self.logger.info(f"[prepare_destinations][{activity}]: resolved default destination: ddm={ddm}, ddm_alt={ddm_alt}") for e in files: @@ -1125,15 +1141,14 @@ def prepare_destinations(self, files: list, activities: list or str) -> list: # pass elif e.ddmendpoint not in storages: # fspec.ddmendpoint is not in associated storages => use it as (non local) alternative destination self.logger.info("[prepare_destinations][%s]: Requested fspec.ddmendpoint=%s is not in the list of allowed (local) destinations" - " .. will consider default ddm=%s for transfer and tag %s as alt. location", activity, e.ddmendpoint, ddm, e.ddmendpoint) - e.ddmendpoint_alt = e.ddmendpoint # verify me - e.ddmendpoint = ddm # check/verify nucleus case - else: # set corresponding ddmendpoint_alt if exist (next entry in available storages list) - cur = storages.index(e.ddmendpoint) - ddm_next = storages[cur + 1] if (cur + 1) < len(storages) else storages[0] # cycle storages, take the first elem when reach end - e.ddmendpoint_alt = ddm_next if e.ddmendpoint != ddm_next else None - self.logger.info("[prepare_destinations][%s]: set ddmendpoint_alt=%s for fspec.ddmendpoint=%s", - activity, e.ddmendpoint_alt, e.ddmendpoint) + " .. will consider default ddm=%s as primary and set %s as alt. 
location", activity, e.ddmendpoint, ddm, e.ddmendpoint) + e.ddmendpoint_alt = e.ddmendpoint if e.ddmendpoint not in alt_exclude else None + e.ddmendpoint = ddm # use default destination, check/verify nucleus case + else: # set corresponding ddmendpoint_alt if exist (next entry in cycled storages list) + e.ddmendpoint_alt = resolve_alt_destination(e.ddmendpoint, exclude=alt_exclude) + + self.logger.info("[prepare_destinations][%s]: use ddmendpoint_alt=%s for fspec.ddmendpoint=%s", + activity, e.ddmendpoint_alt, e.ddmendpoint) return files diff --git a/pilot/control/data.py b/pilot/control/data.py index 0f916a51..a71e9e53 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -922,7 +922,8 @@ def _do_stageout(job: JobData, args: object, xdata: list, activity: list, title: #if not is_unified: # client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) - client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) + ## FIX ME LATER: split activities: for `astorages` and `copytools` (to unify with ES workflow) + client.prepare_destinations(xdata, activity, alt_exclude=list(filter(None, [job.nucleus]))) altstageout = job.allow_altstageout() client.transfer(xdata, activity, raise_exception=not altstageout, **kwargs) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index 2cc149be..b769f831 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -16,7 +16,7 @@ # under the License. # # Authors: -# - Alexey Anisenkov, anisyonk@cern.ch, 2018-19 +# - Alexey Anisenkov, anisyonk@cern.ch, 2018-24 # - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 # - Wen Guan, wen.guan@cern.ch, 2018 @@ -177,6 +177,7 @@ class JobData(BaseData): noexecstrcnv = None # server instruction to the pilot if it should take payload setup from job parameters swrelease = "" # software release string writetofile = "" # + nucleus = "" # cmtconfig encoded info alrbuserplatform = "" # ALRB_USER_PLATFORM encoded in platform/cmtconfig value @@ -195,7 +196,7 @@ class JobData(BaseData): 'swrelease', 'zipmap', 'imagename', 'imagename_jobdef', 'accessmode', 'transfertype', 'datasetin', ## TO BE DEPRECATED: moved to FileSpec (job.indata) 'infilesguids', 'memorymonitor', 'allownooutput', 'pandasecrets', 'prodproxy', 'alrbuserplatform', - 'debug_command', 'dask_scheduler_ip', 'jupyter_session_ip', 'altstageout'], + 'debug_command', 'dask_scheduler_ip', 'jupyter_session_ip', 'altstageout', 'nucleus'], list: ['piloterrorcodes', 'piloterrordiags', 'workdirsizes', 'zombies', 'corecounts', 'subprocesses', 'logdata', 'outdata', 'indata'], dict: ['status', 'fileinfo', 'metadata', 'utilities', 'overwrite_queuedata', 'sizes', 'preprocess', From 280d47f9eb8dd8eaf0ed860f35eb7354fdd14f51 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 19 Nov 2024 15:13:03 +0100 Subject: [PATCH 29/65] Added function to find lingering methods --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/psutils.py | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 2fb403f4..c0387ef0 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.23b \ No newline at end of file +3.9.2.23 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 1334b93b..2f0ae675 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released 
number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '23' # build number should be reset to '1' for every new development cycle +BUILD = '24' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py index ba18e1b1..dbd4fbbd 100644 --- a/pilot/util/psutils.py +++ b/pilot/util/psutils.py @@ -321,3 +321,23 @@ def find_actual_payload_pid(bash_pid: int, payload_cmd: str) -> int or None: logger.warning(f'could not find payload PID for bash PID {bash_pid}') return None + + +def find_lingering_processes(parent_pid: int) -> list: + """ + Find processes that are still running after the specified parent process has terminated. + + :param parent_pid: The PID of the parent process (int) + :return: A list of lingering process PIDs (list). + """ + if not _is_psutil_available: + logger.warning('psutil not available, cannot find lingering processes - aborting') + return [] + + lingering_processes = [] + parent_process = psutil.Process(parent_pid) + for child in parent_process.children(recursive=True): + if child.status() != psutil.STATUS_ZOMBIE: + lingering_processes.append(child.pid) + + return lingering_processes From e36e57a6d6529b921fe592e3c8a1da613e281110 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 19 Nov 2024 15:27:15 +0100 Subject: [PATCH 30/65] Added list_items() --- pilot/util/auxiliary.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index ddcd8547..9f687e79 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -839,3 +839,13 @@ def uuidgen_t() -> str: :return: A UUID in the format "00000000-0000-0000-0000-000000000000" (str). """ return str(uuid4()) + + +def list_items(items: list): + """ + List the items in the given list as a numbered list. 
+ + :param items: list of items (list) + """ + for i, item in enumerate(items): + logger.info(f'{i + 1}: {item}') From df8ed021a5ab778d8e80a9418ec341a5dbc1a86a Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 19 Nov 2024 15:34:49 +0100 Subject: [PATCH 31/65] Looking for lingering processes after payload has finished --- pilot/control/payloads/generic.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 056d2c32..2cde9668 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -36,7 +36,10 @@ from pilot.common.errorcodes import ErrorCodes from pilot.control.job import send_state from pilot.info import JobData -from pilot.util.auxiliary import set_pilot_state # , show_memory_usage +from pilot.util.auxiliary import ( + set_pilot_state, # , show_memory_usage + list_items +) from pilot.util.config import config from pilot.util.container import execute from pilot.util.constants import ( @@ -56,6 +59,7 @@ read_file ) from pilot.util.processes import kill_processes +from pilot.util.psutils import find_lingering_processes from pilot.util.timing import ( add_to_pilot_timing, get_time_measurement @@ -982,6 +986,16 @@ def run(self) -> tuple[int, str]: # noqa: C901 if self.__job.utilities != {}: self.stop_utilities() + # make sure there are no lingering processes + items = find_lingering_processes() + if items: + list_items() + #logger.warning(f"found lingering processes: {items}") + #for item in items: + # kill_processes(item) + else: + logger.info("found no lingering processes") + if self.__job.is_hpo and state != "failed": # in case there are more hyper-parameter points, move away the previous log files # self.rename_log_files(iteration) From 1c685e95ef2513e8628547adcfe76298af37b4d3 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 19 Nov 2024 16:03:12 +0100 Subject: [PATCH 32/65] Looking for lingering processes after payload has finished --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c0387ef0..8ecb8303 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.23 \ No newline at end of file +3.9.2.24 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 2cde9668..cc75891d 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -987,7 +987,7 @@ def run(self) -> tuple[int, str]: # noqa: C901 self.stop_utilities() # make sure there are no lingering processes - items = find_lingering_processes() + items = find_lingering_processes(os.getpid()) if items: list_items() #logger.warning(f"found lingering processes: {items}") From c54f6c1303a6b146637f56e15dd64b2aa9b0b290 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 19 Nov 2024 18:38:13 +0100 Subject: [PATCH 33/65] Testing lingering processes --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 14 +++++++++++++- pilot/util/constants.py | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 8ecb8303..e8234382 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.24 \ No newline at end of file +3.9.2.25 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index cc75891d..b20f621f 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ 
-956,6 +956,17 @@ def run(self) -> tuple[int, str]: # noqa: C901 f"\n\nfinished pid={proc.pid} exit_code={exit_code} state={self.__job.state}\n" ) + # make sure there are no lingering processes + items = find_lingering_processes(os.getpid()) + if items: + logger.warning("found lingering processes") + list_items() + #logger.warning(f"found lingering processes: {items}") + #for item in items: + # kill_processes(item) + else: + logger.info("(1/2) found no lingering processes") + # stop the utility command (e.g. a coprocess if necessary if proc_co: logger.debug(f"stopping utility command: {utility_cmd}") @@ -989,12 +1000,13 @@ def run(self) -> tuple[int, str]: # noqa: C901 # make sure there are no lingering processes items = find_lingering_processes(os.getpid()) if items: + logger.warning("found lingering processes") list_items() #logger.warning(f"found lingering processes: {items}") #for item in items: # kill_processes(item) else: - logger.info("found no lingering processes") + logger.info("(2/2) found no lingering processes") if self.__job.is_hpo and state != "failed": # in case there are more hyper-parameter points, move away the previous log files diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 2f0ae675..480c137a 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '24' # build number should be reset to '1' for every new development cycle +BUILD = '25' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From b98e6857f71c14055ef6259961c6a04c36f822cc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 20 Nov 2024 10:04:28 +0100 Subject: [PATCH 34/65] Added support for curlgetjob catchall --- pilot/control/job.py | 9 ++++++--- pilot/util/constants.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 6c8c4c28..527815ae 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1684,10 +1684,13 @@ def get_job_definition_from_server(args: Any, taskid: str = "") -> str: cmd = https.get_server_command(args.url, args.port) if cmd != "": logger.info(f'executing server command: {cmd}') - res = https.request2(cmd, data=data, panda=True) # will be a dictionary - logger.debug(f"request2 response: {res}") # should be StatusCode=0 if all is ok - if not res: # fallback to curl solution + if "curlgetjob" in infosys.queuedata.catchall: res = https.request(cmd, data=data) + else: + res = https.request2(cmd, data=data, panda=True) # will be a dictionary + logger.debug(f"request2 response: {res}") # should be StatusCode=0 if all is ok + if not res: # fallback to curl solution + res = https.request(cmd, data=data) return res diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 480c137a..0b997dc5 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '25' # build number should be reset to '1' for every new development cycle 
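As an aside, the curlgetjob change above dispatches between two transports and keeps a fallback. A schematic sketch of that pattern, with stand-in transport callables rather than the pilot's request()/request2() functions:

def fetch_with_fallback(primary, fallback, cmd: str, data: dict, catchall: str = "") -> str:
    # force the fallback (curl-style) transport when the queue catchall asks for it,
    # otherwise try the primary (urllib-style) transport and fall back on an empty result
    if "curlgetjob" in catchall:
        return fallback(cmd, data=data)
    res = primary(cmd, data=data)
    return res if res else fallback(cmd, data=data)

# toy usage: the primary transport returns nothing, so the fallback answers
print(fetch_with_fallback(lambda cmd, data: "", lambda cmd, data: "ok", "getJob", {}))  # -> ok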
+BUILD = '26' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 14a4c67957eca4dce184ca7e93a51fcee060aeea Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 20 Nov 2024 10:17:00 +0100 Subject: [PATCH 35/65] Added missing items --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e8234382..8bde6e70 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.25 \ No newline at end of file +3.9.2.26 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index b20f621f..df62bf6c 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -960,7 +960,7 @@ def run(self) -> tuple[int, str]: # noqa: C901 items = find_lingering_processes(os.getpid()) if items: logger.warning("found lingering processes") - list_items() + list_items(items) #logger.warning(f"found lingering processes: {items}") #for item in items: # kill_processes(item) @@ -1001,7 +1001,7 @@ def run(self) -> tuple[int, str]: # noqa: C901 items = find_lingering_processes(os.getpid()) if items: logger.warning("found lingering processes") - list_items() + list_items(items) #logger.warning(f"found lingering processes: {items}") #for item in items: # kill_processes(item) From 1ddfc3d588a8ef3768db1a8355d9effe72949d96 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 21 Nov 2024 15:24:17 +0100 Subject: [PATCH 36/65] Skipping sending X509 for tokens --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/https.py | 9 +++------ 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 8bde6e70..ae44b459 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.26 \ No newline at end of file +3.9.2.27 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 0b997dc5..fc2e22d3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '26' # build number should be reset to '1' for every new development cycle +BUILD = '27' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/https.py b/pilot/util/https.py index 9b483e9a..16b25499 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -799,11 +799,7 @@ def _create_connection(self, host, port=None, timeout=socket._GLOBAL_DEFAULT_TIM return socket.create_connection((host, port), timeout, source_address, family=socket.AF_INET) -def request2(url: str = "", - data: dict = None, - secure: bool = True, - compressed: bool = True, - panda: bool = False) -> str or dict: +def request2(url: str = "", data: dict = None, secure: bool = True, compressed: bool = True, panda: bool = False) -> str or dict: # noqa: C901 """ Send a request using HTTPS (using urllib module). 
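The hunk that follows makes the X509 client certificate optional: it is only loaded when no OIDC token is presented. A minimal standard-library sketch of that pattern (the proxy path is a placeholder):

import os
import ssl

def build_client_context(cert_path: str, use_oidc_token: bool) -> ssl.SSLContext:
    # create a verifying client context; attach the X509 proxy as client
    # certificate only when no OIDC token will be used instead
    context = ssl.create_default_context()
    if not use_oidc_token and os.path.exists(cert_path):
        context.load_cert_chain(certfile=cert_path, keyfile=cert_path)
    return context

context = build_client_context("/tmp/x509up_u12345", use_oidc_token=True)  # placeholder proxy path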
@@ -851,7 +847,8 @@
     # create a context with certificate verification
     ssl_context = get_ssl_context()
     #ssl_context.verify_mode = ssl.CERT_REQUIRED
-    ssl_context.load_cert_chain(certfile=_ctx.cacert, keyfile=_ctx.cacert)
+    if not use_oidc_token:
+        ssl_context.load_cert_chain(certfile=_ctx.cacert, keyfile=_ctx.cacert)
 
     if not secure:
         ssl_context.verify_mode = False

From aaee5195b0c47fbb59a25f5d84a9e012c4fd48a7 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Fri, 22 Nov 2024 11:52:13 +0100
Subject: [PATCH 37/65] Added threshold to CPU consumption time

---
 pilot/util/processes.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pilot/util/processes.py b/pilot/util/processes.py
index b1c16536..3bf4bdfb 100644
--- a/pilot/util/processes.py
+++ b/pilot/util/processes.py
@@ -582,6 +582,10 @@ def get_instant_cpu_consumption_time(pid: int) -> float:
     if utime and stime and cutime and cstime:
         # sum up all the user+system times for both the main process (pid) and the child processes
         cpu_consumption_time = utime + stime + cutime + cstime
+        max_threshold = 1e6
+        if cpu_consumption_time > max_threshold:
+            logger.warning(f'CPU consumption time={cpu_consumption_time} exceeds sanity threshold={max_threshold}')
+            cpu_consumption_time = 0.0
     else:
         cpu_consumption_time = 0.0

From e2935c1052b0cae4930b5822bb3b2ffc3a3d974b Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Fri, 22 Nov 2024 11:55:07 +0100
Subject: [PATCH 38/65] Added threshold to CPU consumption time, plus debug info in case of trouble

---
 pilot/util/processes.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pilot/util/processes.py b/pilot/util/processes.py
index 3bf4bdfb..495f0507 100644
--- a/pilot/util/processes.py
+++ b/pilot/util/processes.py
@@ -574,7 +574,8 @@ def get_instant_cpu_consumption_time(pid: int) -> float:
     if os.path.exists(path):
         try:
             with open(path, "r", encoding="utf-8") as fp:
-                fields = fp.read().split(' ')[13:17]
+                _read = fp.read()
+                fields = _read.split(' ')[13:17]
             utime, stime, cutime, cstime = [(float(f) / hz) for f in fields]
         except IOError as exc:
             logger.warning(f'exception caught: {exc} (ignored)')
@@ -585,6 +586,8 @@
     if utime and stime and cutime and cstime:
         # sum up all the user+system times for both the main process (pid) and the child processes
         cpu_consumption_time = utime + stime + cutime + cstime
         max_threshold = 1e6
         if cpu_consumption_time > max_threshold:
             logger.warning(f'CPU consumption time={cpu_consumption_time} exceeds sanity threshold={max_threshold}')
+            logger.warning(f"utime={utime} stime={stime} cutime={cutime} cstime={cstime} hz={hz}")
+            logger.warning(f"fp.read()={_read}")
             cpu_consumption_time = 0.0
     else:
         cpu_consumption_time = 0.0

From 321d90f19f13c188bd2674ca509b44d009a3c771 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Fri, 22 Nov 2024 12:06:59 +0100
Subject: [PATCH 39/65] Now aborting monitoring loop if CPU consumption time is 0

---
 pilot/util/monitoring.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index bbd64286..1f11ee14 100644
--- a/pilot/util/monitoring.py
+++ b/pilot/util/monitoring.py
@@ -121,6 +121,9 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
         logger.info(f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})')
     elif _cpuconsumptiontime == -1:
         logger.warning('could not get CPU consumption time')
+    elif _cpuconsumptiontime == 0.0:
+        logger.warning(f'process {job.pid} can no longer be monitored (due to stat problems) - aborting')
+        return 0, ""
     else:
         logger.warning(f'process {job.pid} is no longer using CPU - aborting')
         return 0, ""

From a1fe2c0a14a870a5c7f5b5326cee4de95bc9d758 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Fri, 22 Nov 2024 14:07:02 +0100
Subject: [PATCH 40/65] Fixed time-outs

---
 PILOTVERSION | 2 +-
 pilot/user/atlas/container.py | 112 ++++++++++++++++++++++++++++++++++
 pilot/util/constants.py | 2 +-
 3 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index ae44b459..c59294e0 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.27 \ No newline at end of file
+3.9.2.29 \ No newline at end of file
diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py
index adc7de77..9e48760e 100644
--- a/pilot/user/atlas/container.py
+++ b/pilot/user/atlas/container.py
@@ -27,6 +27,7 @@
 import logging
 import os
 import re
+import select
 import shlex
 import subprocess
 import time
@@ -838,6 +839,117 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> tuple[int,
     python_completed = False  # Flag to track completion of 'python3' process
     lsetup_completed_at = None
 
+    with open(new_path, "w", encoding='utf-8') as file:
+        while True:
+            # Check for timeout (once per second)
+            if time.time() - start_time > lsetup_timeout and not lsetup_completed:
+                logger.warning("timeout for 'lsetup' exceeded - killing script")
+                exit_code = 2  # 'lsetup' timeout
+                process.kill()
+                break
+
+            # Use select to check if there is data to read (to bypass any blocking operation that would prevent time-out checks)
+            ready, _, _ = select.select([process.stdout], [], [], 1.0)
+            if ready:
+                output = process.stdout.readline()  # Read bytes directly
+                if output:
+                    output = output.decode().strip()
+                    file.write(output + "\n")
+                    stdout += output + "\n"
+
+                    # Check for LSETUP_COMPLETED message
+                    if output == "LSETUP_COMPLETED":
+                        lsetup_completed = True
+                        lsetup_completed_at = time.time()
+                        start_time = time.time()  # Reset start time for 'python3' timeout
+
+                    # Check for PYTHON_COMPLETED message
+                    if "PYTHON_COMPLETED" in output:
+                        python_completed = True
+                        match = re.search(r"\d+$", output)
+                        if match:
+                            exit_code = int(match.group())
+                            logger.info(f"python remote open command has completed with exit code {exit_code}")
+                        else:
+                            logger.info("python remote open command has completed without any exit code")
+                        break
+
+            # Timeout for python script after LSETUP_COMPLETED
+            if lsetup_completed and ((time.time() - lsetup_completed_at) > python_script_timeout):
+                logger.warning(f"(1) timeout for 'python3' subscript exceeded - killing script "
+                               f"({time.time()} - {lsetup_completed_at} > {python_script_timeout})")
+                exit_code = 3
+                process.kill()
+                break
+
+            # Timeout for python script after LSETUP_COMPLETED
+            if lsetup_completed and ((time.time() - start_time) > python_script_timeout):
+                logger.warning(f"(2) timeout for 'python3' subscript exceeded - killing script "
+                               f"({time.time()} - {start_time} > {python_script_timeout})")
+                exit_code = 3
+                process.kill()
+                break
+
+            if python_completed:
+                logger.info('aborting since python command has finished')
+                return_code = process.poll()
+                if return_code:
+                    logger.warning(f"script execution completed with return code: {return_code}")
+                    # exit_code = return_code
+                break
+
+            # Check if script has completed normally
+            return_code = process.poll()
+            if return_code is not None:
+                pass
+                # logger.info(f"script execution completed with return code: {return_code}")
+                # exit_code = return_code
+                # break
+
+            time.sleep(0.5)
+
+        # Ensure process is terminated
+        if process.poll() is None:
+            process.terminate()
+
+        # 
Check if 'lsetup' was completed + lsetup_time = int(lsetup_completed_at - lsetup_start_time) if lsetup_completed_at else 0 + + return exit_code, stdout, lsetup_time + + +def execute_remote_file_open_old(path: str, python_script_timeout: int) -> tuple[int, str, int]: # noqa: C901 + """ + Execute the remote file open script. + + :param path: path to container script (str) + :param python_script_timeout: timeout (int) + :return: exit code (int), stdout (str), lsetup time (int) (tuple). + """ + lsetup_timeout = 600 # Timeout for 'lsetup' step + exit_code = 1 + stdout = "" + + # Start the Bash script process with non-blocking I/O + try: + process = subprocess.Popen(["bash", path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, bufsize=0) + fcntl.fcntl(process.stdout.fileno(), fcntl.F_SETFL, os.O_NONBLOCK) # Set non-blocking + except OSError as e: + logger.warning(f"error starting subprocess: {e}") + return exit_code, "", 0 + + # Split the path at the last dot + filename, _ = path.rsplit(".", 1) + + # Create the new path with the desired suffix + new_path = f"{filename}.stdout" + + start_time = time.time() # Track execution start time + lsetup_start_time = start_time + lsetup_completed = False # Flag to track completion of 'lsetup' process + python_completed = False # Flag to track completion of 'python3' process + lsetup_completed_at = None + with open(new_path, "w", encoding='utf-8') as file: while True: # Check for timeout (once per second) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index fc2e22d3..9987fbc8 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '27' # build number should be reset to '1' for every new development cycle +BUILD = '29' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 4c6a70f3a13895068b771978f3caefe74c04d6c1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 22 Nov 2024 14:08:07 +0100 Subject: [PATCH 41/65] CPU consumption time reset to 1.0 in case of stat problem --- pilot/util/processes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 495f0507..07dd2880 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -585,10 +585,10 @@ def get_instant_cpu_consumption_time(pid: int) -> float: cpu_consumption_time = utime + stime + cutime + cstime max_threshold = 1e6 if cpu_consumption_time > max_threshold: - logger.warning(f'CPU consumption time={cpu_consumption_time} exceeds sanity threshold={max_threshold}') + logger.warning(f'CPU consumption time={cpu_consumption_time} exceeds sanity threshold={max_threshold} (reset to 1.0)') logger.warning(f"utime={utime} stime={stime} cutime={cutime} cstime={cstime} hz={hz}") logger.warning(f"fp.read()={_read}") - cpu_consumption_time = 0.0 + cpu_consumption_time = 1.0 else: cpu_consumption_time = 0.0 From 20ac67c58cfb6b2845c1d6dbe8b5aa68ded0cbf0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 22 Nov 2024 18:07:25 +0100 Subject: [PATCH 42/65] Added more debug info --- pilot/util/processes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 07dd2880..01d9aab6 100644 --- 
a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -46,7 +46,7 @@ def find_processes_in_group(cpids: list, pid: int, ps_cache: str = ""): """ Find all processes that belong to the same group using the given ps command output. - Recursively search for the children processes belonging to pid and return their pid's. + Search for the children processes belonging to pid and return their pid's. pid is the parent pid and cpids is a list that has to be initialized before calling this function and it contains the pids of the children AND the parent. @@ -585,7 +585,7 @@ def get_instant_cpu_consumption_time(pid: int) -> float: cpu_consumption_time = utime + stime + cutime + cstime max_threshold = 1e6 if cpu_consumption_time > max_threshold: - logger.warning(f'CPU consumption time={cpu_consumption_time} exceeds sanity threshold={max_threshold} (reset to 1.0)') + logger.warning(f'CPU consumption time={cpu_consumption_time} for pid={pid} exceeds sanity threshold={max_threshold} (reset to 1.0)') logger.warning(f"utime={utime} stime={stime} cutime={cutime} cstime={cstime} hz={hz}") logger.warning(f"fp.read()={_read}") cpu_consumption_time = 1.0 From 5113a065f0aae1282c3a39c209ba85d885073d0a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 22 Nov 2024 18:07:41 +0100 Subject: [PATCH 43/65] Bypassed pylint complaint --- pilot/api/data.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index dc0e8764..61600cc2 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -1072,7 +1072,7 @@ class StageOutClient(StagingClient): mode = "stage-out" - def prepare_destinations(self, files: list, activities: list or str, alt_exclude: list = []) -> list: + def prepare_destinations(self, files: list, activities: list or str, alt_exclude: list = None) -> list: """ Resolve destination RSE (filespec.ddmendpoint) for each entry from `files` according to requested `activities`. @@ -1083,6 +1083,9 @@ def prepare_destinations(self, files: list, activities: list or str, alt_exclude :param alt_exclude: global list of destinations that should be excluded / not used for alternative stage-out :return: updated fspec entries (list). 
""" + if alt_exclude is None: # to bypass pylint complaint if declared as [] above + alt_exclude = [] + if not self.infosys.queuedata: # infosys is not initialized: not able to fix destination if need, nothing to do return files From 6503d2e084154cb2b907893fda208b302758c33e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 25 Nov 2024 11:28:57 +0100 Subject: [PATCH 44/65] Reverted cpu monitoring, removed threshold which is useless --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 4 +++- pilot/util/processes.py | 10 ++-------- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c59294e0..13e05fd4 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.29 \ No newline at end of file +3.9.2.30 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9987fbc8..6baeea60 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '29' # build number should be reset to '1' for every new development cycle +BUILD = '30' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 1f11ee14..56271db0 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -116,12 +116,14 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i else: _cpuconsumptiontime = int(round(cpuconsumptiontime)) if _cpuconsumptiontime > 0: + # make sure there are no sudden jumps in the cpuconsumptiontime + #factor = _cpuconsumptiontime / job.cpuconsumptiontime job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconversionfactor = 1.0 logger.info(f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})') elif _cpuconsumptiontime == -1: logger.warning('could not get CPU consumption time') - elif _cpuconsumptiontime == 0.0: + elif _cpuconsumptiontime == 0: logger.warning(f'process {job.pid} can no longer be monitored (due to stat problems) - aborting') return 0, "" else: diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 01d9aab6..0d2fc252 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -552,6 +552,7 @@ def get_cpu_consumption_time(t0: tuple) -> float: def get_instant_cpu_consumption_time(pid: int) -> float: """ Return the CPU consumption time (system+user time) for a given process, by parsing /prod/pid/stat. + Note 1: the function returns 0.0 if the pid is not set. Note 2: the function must sum up all the user+system times for both the main process (pid) and the child processes, since the main process is most likely spawning new processes. 
@@ -574,8 +575,7 @@ def get_instant_cpu_consumption_time(pid: int) -> float: if os.path.exists(path): try: with open(path, "r", encoding="utf-8") as fp: - _read = fp.read() - fields = _read.split(' ')[13:17] + fields = fp.read().split(' ')[13:17] utime, stime, cutime, cstime = [(float(f) / hz) for f in fields] except IOError as exc: logger.warning(f'exception caught: {exc} (ignored)') @@ -583,12 +583,6 @@ def get_instant_cpu_consumption_time(pid: int) -> float: if utime and stime and cutime and cstime: # sum up all the user+system times for both the main process (pid) and the child processes cpu_consumption_time = utime + stime + cutime + cstime - max_threshold = 1e6 - if cpu_consumption_time > max_threshold: - logger.warning(f'CPU consumption time={cpu_consumption_time} for pid={pid} exceeds sanity threshold={max_threshold} (reset to 1.0)') - logger.warning(f"utime={utime} stime={stime} cutime={cutime} cstime={cstime} hz={hz}") - logger.warning(f"fp.read()={_read}") - cpu_consumption_time = 1.0 else: cpu_consumption_time = 0.0 From 8e571ff832d33f6d82e187187550c0d7f8744d39 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 25 Nov 2024 13:17:53 +0100 Subject: [PATCH 45/65] Added function for checking CPU load --- pilot/util/psutils.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py index dbd4fbbd..837f2e94 100644 --- a/pilot/util/psutils.py +++ b/pilot/util/psutils.py @@ -341,3 +341,24 @@ def find_lingering_processes(parent_pid: int) -> list: lingering_processes.append(child.pid) return lingering_processes + + +def check_cpu_load(): + """ + Check if the system is under heavy CPU load. + + High CPU load is here defined to be above 80%. + + :return: True (system is under heavy CPU load), False (system load is normal). 
+    """
+    if not _is_psutil_available:
+        logger.warning('psutil not available, cannot check CPU load (pretending it is normal)')
+        return False
+
+    cpu_percent = psutil.cpu_percent(interval=1)
+    if cpu_percent > 80:
+        logger.info("system is under heavy CPU load")
+        return True
+    else:
+        logger.info("system load is normal")
+        return False

From 8a5c87204fcee04ef2717276b9090ae976d24cf7 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 25 Nov 2024 13:54:55 +0100
Subject: [PATCH 46/65] Verifying the increase of CPU consumption

---
 pilot/util/monitoring.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index 56271db0..9d1a3944 100644
--- a/pilot/util/monitoring.py
+++ b/pilot/util/monitoring.py
@@ -61,6 +61,7 @@
 )
 from pilot.util.psutils import (
     is_process_running,
+    check_cpu_load,
     get_pid,
     get_subprocesses,
     find_actual_payload_pid
@@ -116,9 +117,19 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
     else:
         _cpuconsumptiontime = int(round(cpuconsumptiontime))
         if _cpuconsumptiontime > 0:
-            # make sure there are no sudden jumps in the cpuconsumptiontime
-            #factor = _cpuconsumptiontime / job.cpuconsumptiontime
-            job.cpuconsumptiontime = int(round(cpuconsumptiontime))
+            if job.cpuconsumptiontime == -1:  # no time set yet so just proceed
+                job.cpuconsumptiontime = _cpuconsumptiontime
+            else:
+                # make sure there are no sudden jumps in the cpuconsumptiontime
+                increase_factor = _cpuconsumptiontime / job.cpuconsumptiontime if job.cpuconsumptiontime > 0 else 1
+                high_cpu_load = check_cpu_load()
+                factor = 10 if high_cpu_load else 5
+                if increase_factor > factor:
+                    logger.warning(f'CPU consumption time increased by a factor of {increase_factor} (over the limit of {factor})')
+                    logger.warning(f"will not consider the new value: {_cpuconsumptiontime}")
+                else:
+                    logger.debug(f'CPU consumption time increased by a factor of {increase_factor} (below the limit of {factor})')
+                    job.cpuconsumptiontime = _cpuconsumptiontime
             job.cpuconversionfactor = 1.0
             logger.info(f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})')

From 3b915af96901ad22c14be22be684bf1937e0907d Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 25 Nov 2024 14:11:31 +0100
Subject: [PATCH 47/65] Refactored job_monitoring_tasks()

---
 PILOTVERSION | 2 +-
 pilot/util/constants.py | 2 +-
 pilot/util/monitoring.py | 80 ++++++++++++++++++++++++----------------
 3 files changed, 50 insertions(+), 34 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 13e05fd4..46be5578 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.30 \ No newline at end of file
+3.9.2.31 \ No newline at end of file
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index 6baeea60..4f302811 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3'   # released number should be fixed at 3 for Pilot 3
 VERSION = '9'   # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2'  # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '30'    # build number should be reset to '1' for every new development cycle
+BUILD = '31'    # build number should be reset to '1' for every new development cycle
 
 SUCCESS = 0
 FAILURE = 1
diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index 9d1a3944..2ca15413 100644
--- 
a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -107,39 +107,10 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i # confirm that the worker node has a proper SC_CLK_TCK (problems seen on MPPMU) check_hz() - try: - cpuconsumptiontime = get_current_cpu_consumption_time(job.pid) - except Exception as error: - diagnostics = f"Exception caught: {error}" - logger.warning(diagnostics) - exit_code = get_exception_error_code(diagnostics) + # set the CPU consumption time for the job + exit_code, diagnostics = set_cpu_consumption_time(job) + if exit_code: return exit_code, diagnostics - else: - _cpuconsumptiontime = int(round(cpuconsumptiontime)) - if _cpuconsumptiontime > 0: - if job.cpuconsumptiontime == -1: # no time set yet so just proceed - job.cpuconsumptiontime = _cpuconsumptiontime - else: - # make sure there are no sudden jumps in the cpuconsumptiontime - increase_factor = _cpuconsumptiontime / job.cpuconsumptiontime if job.cpuconsumptiontime > 0 else 1 - high_cpu_load = check_cpu_load() - factor = 10 if high_cpu_load else 5 - if increase_factor > factor: - logger.warning(f'CPU consumption time increased by a factor of {increase_factor} (over the limit of {factor})') - logger.warning(f"will not consider the new value: {_cpuconsumptiontime}") - else: - logger.debug(f'CPU consumption time increased by a factor of {increase_factor} (below the limit of {factor})') - job.cpuconsumptiontime = _cpuconsumptiontime - job.cpuconversionfactor = 1.0 - logger.info(f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})') - elif _cpuconsumptiontime == -1: - logger.warning('could not get CPU consumption time') - elif _cpuconsumptiontime == 0: - logger.warning(f'process {job.pid} can no longer be monitored (due to stat problems) - aborting') - return 0, "" - else: - logger.warning(f'process {job.pid} is no longer using CPU - aborting') - return 0, "" # keep track of the subprocesses running (store payload subprocess PIDs) store_subprocess_pids(job) @@ -199,6 +170,51 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i return exit_code, diagnostics +def set_cpu_consumption_time(job: JobData) -> tuple[int, str]: + """ + Set the CPU consumption time for the job. + + :param job: job object (JobData) + :return: exit code (int), diagnostics (string). 
+    """
+    try:
+        cpuconsumptiontime = get_current_cpu_consumption_time(job.pid)
+    except Exception as error:
+        diagnostics = f"Exception caught: {error}"
+        logger.warning(diagnostics)
+        exit_code = get_exception_error_code(diagnostics)
+        return exit_code, diagnostics
+    else:
+        _cpuconsumptiontime = int(round(cpuconsumptiontime))
+        if _cpuconsumptiontime > 0:
+            if job.cpuconsumptiontime == -1:  # no time set yet so just proceed
+                job.cpuconsumptiontime = _cpuconsumptiontime
+            else:
+                # make sure there are no sudden jumps in the cpuconsumptiontime
+                increase_factor = _cpuconsumptiontime / job.cpuconsumptiontime if job.cpuconsumptiontime > 0 else 1
+                high_cpu_load = check_cpu_load()
+                factor = 10 if high_cpu_load else 5
+                if increase_factor > factor:
+                    logger.warning(
+                        f'CPU consumption time increased by a factor of {increase_factor} (over the limit of {factor})')
+                    logger.warning(f"will not consider the new value: {_cpuconsumptiontime}")
+                else:
+                    logger.debug(
+                        f'CPU consumption time increased by a factor of {increase_factor} (below the limit of {factor})')
+                    job.cpuconsumptiontime = _cpuconsumptiontime
+            job.cpuconversionfactor = 1.0
+            logger.info(
+                f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})')
+        elif _cpuconsumptiontime == -1:
+            logger.warning('could not get CPU consumption time')
+        elif _cpuconsumptiontime == 0:
+            logger.warning(f'process {job.pid} can no longer be monitored (due to stat problems) - aborting')
+        else:
+            logger.warning(f'process {job.pid} is no longer using CPU - aborting')
+
+    return 0, ""
+
+
 def still_running(pid):
     # verify that the process is still alive

From 2c9290b75d05ba85673fc4d11a072fc4f98953d0 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 25 Nov 2024 15:43:15 +0100
Subject: [PATCH 48/65] Protection against failures

---
 pilot/util/psutils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py
index 837f2e94..b25a473f 100644
--- a/pilot/util/psutils.py
+++ b/pilot/util/psutils.py
@@ -355,7 +355,12 @@ def check_cpu_load():
         logger.warning('psutil not available, cannot check CPU load (pretending it is normal)')
         return False

-    cpu_percent = psutil.cpu_percent(interval=1)
+    try:
+        cpu_percent = psutil.cpu_percent(interval=0.5)
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess) as e:
+        logger.warning(f"Failed to read CPU percent: {e}")
+        logger.info("system is under heavy CPU load (assumed)")
+        return True
     if cpu_percent > 80:
         logger.info("system is under heavy CPU load")
         return True

From a1930657199b931e996bcf924319d5fdecddd36b Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Tue, 26 Nov 2024 07:03:22 +0100
Subject: [PATCH 49/65] Cleanup and comments

---
 PILOTVERSION                      | 2 +-
 pilot/control/data.py             | 2 ++
 pilot/control/payloads/generic.py | 2 --
 pilot/util/constants.py           | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 46be5578..9f922ddf 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.31
\ No newline at end of file
+3.9.2.32
\ No newline at end of file
diff --git a/pilot/control/data.py b/pilot/control/data.py
index a71e9e53..641cc31b 100644
--- a/pilot/control/data.py
+++ b/pilot/control/data.py
@@ -328,6 +328,8 @@ def _stage_in(args: object, job: JobData) -> bool:
             logger.info(" -- lfn=%s, status_code=%s, status=%s", infile.lfn, infile.status_code, status)

     # write time stamps to pilot timing file
+
+    # MOVE THIS TO AFTER REMOTE FILE OPEN HAS BEEN VERIFIED (actually just before the payload starts)
     add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)

     remain_files = [infile for infile in job.indata if infile.status not in ['remote_io', 'transferred', 'no_transfer']]
diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py
index df62bf6c..48aab21f 100644
--- a/pilot/control/payloads/generic.py
+++ b/pilot/control/payloads/generic.py
@@ -492,8 +492,6 @@ def pre_payload(self, job: JobData):
         """
         # write time stamps to pilot timing file
         update_time = time.time()
-        logger.debug(f"setting pre-payload time to {update_time} s")
-        logger.debug(f"gmtime is {time.gmtime(update_time)}")
         add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, update_time, self.__args)

     def post_payload(self, job: JobData):
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index 4f302811..c9f9f022 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3' # released number should be fixed at 3 for Pilot 3
 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '31' # build number should be reset to '1' for every new development cycle
+BUILD = '32' # build number should be reset to '1' for every new development cycle

 SUCCESS = 0
 FAILURE = 1

From 0864ecb153c87c29a7c1d19904b0c874af9f0c99 Mon Sep 17 00:00:00 2001
From: PalNilsson
Date: Tue, 26 Nov 2024 09:41:49 +0100
Subject: [PATCH 50/65] Lingering processes will now be killed

---
 pilot/control/payloads/generic.py | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py
index 48aab21f..e33b893c 100644
--- a/pilot/control/payloads/generic.py
+++ b/pilot/control/payloads/generic.py
@@ -58,7 +58,10 @@
     write_file,
     read_file
 )
-from pilot.util.processes import kill_processes
+from pilot.util.processes import (
+    kill_process,
+    kill_processes,
+)
 from pilot.util.psutils import find_lingering_processes
 from pilot.util.timing import (
     add_to_pilot_timing,
@@ -954,17 +957,6 @@ def run(self) -> tuple[int, str]:  # noqa: C901
                 f"\n\nfinished pid={proc.pid} exit_code={exit_code} state={self.__job.state}\n"
             )

-            # make sure there are no lingering processes
-            items = find_lingering_processes(os.getpid())
-            if items:
-                logger.warning("found lingering processes")
-                list_items(items)
-                #logger.warning(f"found lingering processes: {items}")
-                #for item in items:
-                #    kill_processes(item)
-            else:
-                logger.info("(1/2) found no lingering processes")
-
             # stop the utility command (e.g. a coprocess) if necessary
             if proc_co:
                 logger.debug(f"stopping utility command: {utility_cmd}")
@@ -998,13 +990,12 @@ def run(self) -> tuple[int, str]:  # noqa: C901
             # make sure there are no lingering processes
             items = find_lingering_processes(os.getpid())
             if items:
-                logger.warning("found lingering processes")
+                logger.warning("found lingering processes - will now be removed")
                 list_items(items)
-                #logger.warning(f"found lingering processes: {items}")
-                #for item in items:
-                #    kill_processes(item)
+                for item in items:
+                    kill_process(item, hardkillonly=True)
             else:
-                logger.info("(2/2) found no lingering processes")
+                logger.info("found no lingering processes")

             if self.__job.is_hpo and state != "failed":
                 # in case there are more hyper-parameter points, move away the previous log files

From 5b8790521eafa4460ccba073565f033f195710e8 Mon Sep 17 00:00:00 2001
From: PalNilsson
Date: Tue, 26 Nov 2024 09:48:26 +0100
Subject: [PATCH 51/65] Added exception handling

---
 PILOTVERSION            |  2 +-
 pilot/util/constants.py |  2 +-
 pilot/util/psutils.py   | 14 ++++++++++----
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 9f922ddf..a1c45371 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.32
\ No newline at end of file
+3.9.2.33
\ No newline at end of file
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index c9f9f022..e1af55a8 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3' # released number should be fixed at 3 for Pilot 3
 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '32' # build number should be reset to '1' for every new development cycle
+BUILD = '33' # build number should be reset to '1' for every new development cycle

 SUCCESS = 0
 FAILURE = 1
diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py
index b25a473f..90eb6394 100644
--- a/pilot/util/psutils.py
+++ b/pilot/util/psutils.py
@@ -335,10 +335,16 @@ def find_lingering_processes(parent_pid: int) -> list:
         return []

     lingering_processes = []
-    parent_process = psutil.Process(parent_pid)
-    for child in parent_process.children(recursive=True):
-        if child.status() != psutil.STATUS_ZOMBIE:
-            lingering_processes.append(child.pid)
+    try:
+        parent_process = psutil.Process(parent_pid)
+        for child in parent_process.children(recursive=True):
+            try:
+                if child.status() != psutil.STATUS_ZOMBIE:
+                    lingering_processes.append(child.pid)
+            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess) as e:
+                logger.warning(f"[harmless] failed to get status for child process {child.pid}: {e}")
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess, FileNotFoundError) as e:
+        logger.warning(f"[harmless] failed to get parent process {parent_pid}: {e}")

     return lingering_processes

From a0f114be80158fadb1372eae69132e7d54b027b9 Mon Sep 17 00:00:00 2001
From: PalNilsson
Date: Tue, 26 Nov 2024 09:57:36 +0100
Subject: [PATCH 52/65] Improved log messages

---
 pilot/util/processes.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pilot/util/processes.py b/pilot/util/processes.py
index 0d2fc252..95d4eb8e 100644
--- a/pilot/util/processes.py
+++ b/pilot/util/processes.py
@@ -578,7 +578,9 @@ def get_instant_cpu_consumption_time(pid: int) -> float:
                 fields = fp.read().split(' ')[13:17]
                 utime, stime, cutime, cstime = [(float(f) / hz) for f in fields]
         except IOError as exc:
-            logger.warning(f'exception caught: {exc} (ignored)')
+            logger.warning(f'exception caught: {exc} (ignoring process {pid})')
+    else:
+        logger.debug(f"{path} no longer exists (ignoring terminated process {pid})")

     if utime and stime and cutime and cstime:
         # sum up all the user+system times for both the main process (pid) and the child processes

From 9141f27cb56261f774dc08acbc251e5c851fda87 Mon Sep 17 00:00:00 2001
From: PalNilsson
Date: Tue, 26 Nov 2024 10:15:59 +0100
Subject: [PATCH 53/65] Added verification that /proc/self/statm can be accessed

---
 pilot/util/monitoring.py | 17 +++++++++++------
 pilot/util/processes.py  | 15 +++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index 2ca15413..42dde4d8 100644
--- a/pilot/util/monitoring.py
+++ b/pilot/util/monitoring.py
@@ -37,9 +37,9 @@
 from pilot.util.container import execute
 from pilot.util.filehandling import (
     get_disk_usage,
-    remove_files,
     get_local_file_size,
     read_file,
+    remove_files,
     zip_files,
     #write_file
 )
@@ -54,22 +54,23 @@
     get_maximum_input_sizes
 )
 from pilot.util.processes import (
+    check_proc_access,
     get_current_cpu_consumption_time,
-    kill_processes,
     get_number_of_child_processes,
+    kill_processes,
     reap_zombies
 )
 from pilot.util.psutils import (
     is_process_running,
     check_cpu_load,
+    find_actual_payload_pid,
     get_pid,
     get_subprocesses,
-    find_actual_payload_pid
 )
 from pilot.util.timing import get_time_since
 from pilot.util.workernode import (
+    check_hz,
     get_local_disk_space,
-    check_hz
 )
 from pilot.info import infosys, JobData
@@ -201,10 +202,14 @@ def set_cpu_consumption_time(job: JobData) -> tuple[int, str]:
             else:
                 logger.debug(
                     f'CPU consumption time increased by a factor of {increase_factor} (below the limit of {factor})')
-                job.cpuconsumptiontime = _cpuconsumptiontime
+
+            # make sure that /proc/self/statm still exists, otherwise the job is no longer using CPU, i.e. discard the info
+            if check_proc_access():
+                logger.debug("/proc/self/statm exists - will update the CPU consumption time")
+                job.cpuconsumptiontime = _cpuconsumptiontime
             job.cpuconversionfactor = 1.0
             logger.info(
-                f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})')
+                f'(instant) CPU consumption time for pid={job.pid}: {job.cpuconsumptiontime}')
         elif _cpuconsumptiontime == -1:
             logger.warning('could not get CPU consumption time')
         elif _cpuconsumptiontime == 0:
diff --git a/pilot/util/processes.py b/pilot/util/processes.py
index 95d4eb8e..d6167e63 100644
--- a/pilot/util/processes.py
+++ b/pilot/util/processes.py
@@ -946,3 +946,18 @@ def waitpid(pid: int = -1):
             pass

     logger.info(f'reaping zombies for max {max_timeout} seconds')
     waitpid(pid)
+
+
+def check_proc_access() -> bool:
+    """
+    Verify that /proc/self/statm can be accessed.
+
+    :return: True if /proc/self/statm can be accessed, False otherwise (bool).
+    """
+    try:
+        with open('/proc/self/statm', 'r') as f:
+            _ = f.read()
+        return True
+    except (FileNotFoundError, PermissionError) as e:
+        logger.warning(f"error accessing /proc/self/statm: {e} (CPU consumption time will be discarded)")
+        return False

From ed52d3f76166e7ded5747749c4f0e7d3d3019035 Mon Sep 17 00:00:00 2001
From: PalNilsson
Date: Tue, 26 Nov 2024 10:37:35 +0100
Subject: [PATCH 54/65] Aborting OOM update if the process has terminated

---
 pilot/util/monitoring.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index 42dde4d8..38db1602 100644
--- a/pilot/util/monitoring.py
+++ b/pilot/util/monitoring.py
@@ -253,6 +253,11 @@ def update_oom_info(bash_pid, payload_cmd):
         return

     fname = f"/proc/{payload_pid}/oom_score"
+    # abort if the file does not exist
+    if not os.path.exists(fname):
+        logger.warning(f'oom_score file does not exist: {fname} (abort)')
+        return
+
     fname_adj = fname + "_adj"
     payload_score = get_score(payload_pid) if payload_pid else 'UNKNOWN'
     pilot_score = get_score(os.getpid())

From d302aadc40597e2e8325af7b0ea89649ac8f7e56 Mon Sep 17 00:00:00 2001
From: PalNilsson
Date: Tue, 26 Nov 2024 10:40:47 +0100
Subject: [PATCH 55/65] Updated log message

---
 PILOTVERSION             | 2 +-
 pilot/util/constants.py  | 2 +-
 pilot/util/monitoring.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index a1c45371..397bca28 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.33
\ No newline at end of file
+3.9.2.34
\ No newline at end of file
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index e1af55a8..6959dafa 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3' # released number should be fixed at 3 for Pilot 3
 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '33' # build number should be reset to '1' for every new development cycle
+BUILD = '34' # build number should be reset to '1' for every new development cycle

 SUCCESS = 0
 FAILURE = 1
diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index 38db1602..8471904b 100644
--- a/pilot/util/monitoring.py
+++ b/pilot/util/monitoring.py
@@ -200,8 +200,8 @@ def set_cpu_consumption_time(job: JobData) -> tuple[int, str]:
                     f'CPU consumption time increased by a factor of {increase_factor} (over the limit of {factor})')
                 logger.warning(f"will not consider the new value: {_cpuconsumptiontime}")
             else:
-                logger.debug(
-                    f'CPU consumption time increased by a factor of {increase_factor} (below the limit of {factor})')
+                logger.info(
+                    f'CPU consumption time changed by a factor of {increase_factor} (below the limit of {factor})')

From 2167118f6a797fb662e60fc3c47f9a9588aa5653 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Tue, 26 Nov 2024 12:06:45 +0100
Subject: [PATCH 56/65] Cleanup

---
 pilot/control/payloads/generic.py | 2 --
 pilot/util/constants.py           | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py
index e33b893c..9e6e4ae0 100644
--- a/pilot/control/payloads/generic.py
+++ b/pilot/control/payloads/generic.py
@@ -115,8 +115,6 @@ def pre_setup(self, job: JobData):
         """
         # write time stamps to pilot timing file
         update_time = time.time()
-        logger.debug(f"setting pre-setup time to {update_time} s")
-        logger.debug(f"gmtime is {time.gmtime(update_time)}")
         add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, update_time, self.__args)

     def post_setup(self, job: JobData, update_time: bool = None):
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index 6959dafa..58835e2c 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3' # released number should be fixed at 3 for Pilot 3
 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '34' # build number should be reset to '1' for every new development cycle
+BUILD = '35' # build number should be reset to '1' for every new development cycle

 SUCCESS = 0
 FAILURE = 1

From b8393859479cf6f41497ac55c072ca7fba0ad3b6 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Tue, 26 Nov 2024 12:30:57 +0100
Subject: [PATCH 57/65] Added pilot args object to get_payload_command()

---
 pilot/user/atlas/common.py   | 13 +++++++++++--
 pilot/user/generic/common.py |  6 +++++-
 pilot/user/rubin/common.py   |  6 +++++-
 pilot/user/sphenix/common.py |  6 +++++-
 pilot/util/constants.py      |  2 ++
 5 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py
index 4f114911..853b90ab 100644
--- a/pilot/user/atlas/common.py
+++ b/pilot/user/atlas/common.py
@@ -26,6 +26,7 @@
 import logging
 import os
 import re
+import time

 from collections import defaultdict
 from functools import reduce
@@ -487,14 +488,17 @@ def get_nthreads(catchall: str) -> int:
     return _nthreads if _nthreads else 1


-def get_payload_command(job: JobData) -> str:
+def get_payload_command(job: JobData, args: object = None) -> str:
     """
     Return the full command for executing the payload, including the sourcing of all setup files and setting of environment variables.

     :param job: job object (JobData)
-    :return: command (str).
+    :param args: pilot arguments (object)
+    :return: command (str)
     :raises TrfDownloadFailure: in case of download failure.
     """
+    if not args:  # bypass pylint complaint
+        pass
     # Should the pilot do the setup or does jobPars already contain the information?
     preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams)
@@ -515,6 +519,7 @@ def get_payload_command(job: JobData) -> str:
     exitcode = 0
     diagnostics = ""

+    t0 = time.time()
     try:
         exitcode, diagnostics, not_opened_turls, lsetup_time = open_remote_files(job.indata, job.workdir, get_nthreads(catchall))
     except Exception as exc:
@@ -532,6 +537,10 @@ def get_payload_command(job: JobData) -> str:
         else:
             process_remote_file_traces(path, job, not_opened_turls)  # ignore PyCharm warning, path is str

+    t1 = time.time()
+    dt = int(t1 - t0)
+    logger.info(f'remote file verification finished in {dt} s')
+
     # fail the job if the remote files could not be verified
     if exitcode != 0:
         job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode, msg=diagnostics)
diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py
index 3e2f312e..4c819e7e 100644
--- a/pilot/user/generic/common.py
+++ b/pilot/user/generic/common.py
@@ -27,6 +27,7 @@
 from signal import SIGTERM

 from pilot.common.exception import TrfDownloadFailure
+from pilot.info.jobdata import JobData
 from pilot.util.config import config
 from pilot.util.constants import (
     UTILITY_BEFORE_PAYLOAD,
@@ -64,7 +65,7 @@ def validate(job: object) -> bool:
     return True


-def get_payload_command(job: object) -> str:
+def get_payload_command(job: JobData, args: object = None) -> str:
     """
     Return the full command for executing the payload.

@@ -73,12 +74,15 @@ def get_payload_command(job: object) -> str:
     By default, the full payload command is assumed to be in the job.jobparams.

     :param job: job object (object)
+    :param args: pilot arguments (object)
     :return: command (str).
     """
     # Try to download the trf
     # if job.imagename != "" or "--containerImage" in job.jobparams:
     #     job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer")
     #     logger.warning('overwrote job.transformation, now set to: %s' % job.transformation)
+    if not args:  # bypass pylint complaint
+        pass
     ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir)
     if ec != 0:
         raise TrfDownloadFailure(diagnostics)
diff --git a/pilot/user/rubin/common.py b/pilot/user/rubin/common.py
index d83bed7d..0a853656 100644
--- a/pilot/user/rubin/common.py
+++ b/pilot/user/rubin/common.py
@@ -28,6 +28,7 @@
 from typing import Any

 from pilot.common.exception import TrfDownloadFailure
+from pilot.info.jobdata import JobData
 from pilot.util.config import config
 from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED
 from pilot.util.filehandling import read_file
@@ -60,7 +61,7 @@ def validate(job: Any) -> bool:
     return True


-def get_payload_command(job: object):
+def get_payload_command(job: JobData, args: object = None) -> str:
     """
     Return the full command for executing the payload.

@@ -68,12 +69,15 @@ def get_payload_command(job: object):
     By default, the full payload command is assumed to be in the job.jobparams.

     :param job: job object (object)
+    :param args: pilot arguments (object)
     :return: command (str).
     """
     # Try to download the trf
     # if job.imagename != "" or "--containerImage" in job.jobparams:
     #     job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer")
     #     logger.warning('overwrote job.transformation, now set to: %s' % job.transformation)
+    if not args:  # bypass pylint complaint
+        pass
     ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir)
     if ec != 0:
         raise TrfDownloadFailure(diagnostics)
diff --git a/pilot/user/sphenix/common.py b/pilot/user/sphenix/common.py
index b4aa30ee..fff836bc 100644
--- a/pilot/user/sphenix/common.py
+++ b/pilot/user/sphenix/common.py
@@ -30,6 +30,7 @@
     FileHandlingFailure
 )
 from pilot.info import FileSpec
+from pilot.info.jobdata import JobData
 from pilot.util.config import config
 from pilot.util.constants import (
     UTILITY_AFTER_PAYLOAD_FINISHED,
@@ -76,7 +77,7 @@ def validate(job: object) -> bool:
     return True


-def get_payload_command(job: object) -> str:
+def get_payload_command(job: JobData, args: object = None) -> str:
     """
     Return the full command for executing the payload.

@@ -84,12 +85,15 @@ def get_payload_command(job: object) -> str:
     By default, the full payload command is assumed to be in the job.jobparams.

     :param job: job object (object)
+    :param args: pilot arguments (object)
     :return: command (str).
     """
     # Try to download the trf
     # if job.imagename != "" or "--containerImage" in job.jobparams:
     #     job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer")
     #     logger.warning('overwrote job.transformation, now set to: %s' % job.transformation)
+    if not args:  # to bypass pylint complaint
+        pass
     ec, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir)
     if ec != 0:
         raise TrfDownloadFailure(diagnostics)
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index 58835e2c..9a1ed683 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -65,6 +65,8 @@
 PILOT_POST_FINAL_UPDATE = 'PILOT_POST_FINAL_UPDATE'
 PILOT_END_TIME = 'PILOT_END_TIME'
 PILOT_KILL_SIGNAL = 'PILOT_KILL_SIGNAL'
+PILOT_PRE_REMOTEIO = 'PILOT_PRE_REMOTEIO'
+PILOT_POST_REMOTEIO = 'PILOT_POST_REMOTEIO'

 # Keep track of log transfers
 LOG_TRANSFER_NOT_DONE = 'NOT_DONE'

From 8ebb97b0dff5c3dde01dde89a98ef082807655d3 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Tue, 26 Nov 2024 12:34:05 +0100
Subject: [PATCH 58/65] Added pilot args object to get_payload_command(). Some cleanup

---
 pilot/control/payloads/generic.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py
index 9e6e4ae0..a320dd1c 100644
--- a/pilot/control/payloads/generic.py
+++ b/pilot/control/payloads/generic.py
@@ -505,8 +505,6 @@ def post_payload(self, job: JobData):
         """
         # write time stamps to pilot timing file
         update_time = time.time()
-        logger.debug(f"setting post-payload time to {update_time} s")
-        logger.debug(f"gmtime is {time.gmtime(update_time)}")
         add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, update_time, self.__args)

     def run_command(self, cmd: str, label: str = "") -> Any:
@@ -694,11 +692,10 @@ def wait_graceful(self, args: object, proc: Any) -> int:

         return exit_code

-    def get_payload_command(self, job: JobData) -> str:
+    def get_payload_command(self) -> str:
         """
         Return the payload command string.

-        :param job: job object (JobData)
         :return: command (str).
         """
         cmd = ""
@@ -708,14 +705,14 @@ def get_payload_command(self) -> str:
             user = __import__(
                 f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0
             )
-            cmd = user.get_payload_command(job)  # + 'sleep 900'  # to test looping jobs
+            cmd = user.get_payload_command(self.__job, args=self.__args)  # + 'sleep 900'  # to test looping jobs
         except PilotException as error:
-            self.post_setup(job)
+            self.post_setup(self.__job)
             logger.error(traceback.format_exc())
-            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
+            self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.add_error_code(
                 error.get_error_code()
             )
-            self.__traces.pilot["error_code"] = job.piloterrorcodes[0]
+            self.__traces.pilot["error_code"] = self.__job.piloterrorcodes[0]
             logger.fatal(
                 f"could not define payload command (traces error set to: {self.__traces.pilot['error_code']})"
             )
@@ -802,7 +799,7 @@ def run(self) -> tuple[int, str]:  # noqa: C901
         self.pre_setup(self.__job)

         # get the user defined payload command
-        cmd = self.get_payload_command(self.__job)
+        cmd = self.get_payload_command()
         if not cmd:
             logger.warning("aborting run() since payload command could not be defined")
             return errors.UNKNOWNPAYLOADFAILURE, "undefined payload command"

From 63c70aefe9911be68b7edb07448a0c1ecc431740 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Tue, 26 Nov 2024 12:38:15 +0100
Subject: [PATCH 59/65] Recording time to verify remote files

---
 pilot/user/atlas/common.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py
index 853b90ab..1533f369 100644
--- a/pilot/user/atlas/common.py
+++ b/pilot/user/atlas/common.py
@@ -57,7 +57,9 @@
     UTILITY_AFTER_PAYLOAD_FINISHED,
     UTILITY_AFTER_PAYLOAD_STARTED2,
     UTILITY_BEFORE_STAGEIN,
-    UTILITY_AFTER_PAYLOAD_FINISHED2
+    UTILITY_AFTER_PAYLOAD_FINISHED2,
+    PILOT_PRE_REMOTEIO,
+    PILOT_POST_REMOTEIO
 )
 from pilot.util.container import execute
 from pilot.util.filehandling import (
@@ -82,6 +84,7 @@
     get_trimmed_dictionary,
     is_child
 )
+from pilot.util.timing import add_to_pilot_timing
 from pilot.util.tracereport import TraceReport
 from .container import (
     create_root_container_command,
@@ -519,7 +522,8 @@ def get_payload_command(job: JobData, args: object = None) -> str:
     exitcode = 0
     diagnostics = ""

-    t0 = time.time()
+    t0 = int(time.time())
+    add_to_pilot_timing(job.jobid, PILOT_PRE_REMOTEIO, t0, args)
     try:
         exitcode, diagnostics, not_opened_turls, lsetup_time = open_remote_files(job.indata, job.workdir, get_nthreads(catchall))
     except Exception as exc:
@@ -537,8 +541,9 @@ def get_payload_command(job: JobData, args: object = None) -> str:
         else:
             process_remote_file_traces(path, job, not_opened_turls)  # ignore PyCharm warning, path is str

-    t1 = time.time()
-    dt = int(t1 - t0)
+    t1 = int(time.time())
+    add_to_pilot_timing(job.jobid, PILOT_POST_REMOTEIO, t1, args)
+    dt = t1 - t0
     logger.info(f'remote file verification finished in {dt} s')

From dfaaa074398ce94c45633aa57f60e3619aa82b91 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Tue, 26 Nov 2024 14:42:15 +0100
Subject: [PATCH 60/65] Recording time to verify remote files

---
 PILOTVERSION            |  2 +-
 pilot/util/constants.py |  2 +-
 pilot/util/timing.py    | 26 ++++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 397bca28..ea783d7a 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.34
\ No newline at end of file
+3.9.2.37
\ No newline at end of file
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index 9a1ed683..a803cd25 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3' # released number should be fixed at 3 for Pilot 3
 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '35' # build number should be reset to '1' for every new development cycle
+BUILD = '37' # build number should be reset to '1' for every new development cycle

 SUCCESS = 0
 FAILURE = 1
diff --git a/pilot/util/timing.py b/pilot/util/timing.py
index ee823058..c87afd6d 100644
--- a/pilot/util/timing.py
+++ b/pilot/util/timing.py
@@ -52,6 +52,8 @@
     PILOT_PRE_STAGEOUT,
     PILOT_PRE_FINAL_UPDATE,
     PILOT_START_TIME,
+    PILOT_PRE_REMOTEIO,
+    PILOT_POST_REMOTEIO
 )
 from pilot.util.filehandling import (
     read_json,
@@ -234,6 +236,19 @@ def get_total_pilot_time(job_id: str, args: object) -> int:
     return get_time_difference(job_id, PILOT_START_TIME, PILOT_END_TIME, args)


+def get_total_remoteio_time(job_id: str, args: object) -> int:
+    """
+    Return the total time to verify remote i/o files for the given job_id.
+
+    High-level function that returns the time spent verifying remote i/o files for the given job_id.
+
+    :param job_id: PanDA job id (str)
+    :param args: pilot arguments (object)
+    :return: time in seconds (int).
+    """
+    return get_time_difference(job_id, PILOT_PRE_REMOTEIO, PILOT_POST_REMOTEIO, args)
+
+
 def get_postgetjob_time(job_id: str, args: object) -> int or None:
     """
     Return the post getjob time.
@@ -394,6 +409,17 @@ def timing_report(job_id: str, args: object) -> tuple[int, int, int, int, int, i
     time_payload = get_payload_execution_time(job_id, args)
     time_stageout = get_stageout_time(job_id, args)
     time_log_creation = get_log_creation_time(job_id, args)
+    time_remoteio = get_total_remoteio_time(job_id, args)
+
+    # correct the setup and stagein times if remote i/o verification was done
+    if time_remoteio > 0:
+        logger.info("correcting setup and stagein times since remote i/o verification was done")
+        logger.debug(f"original setup time: {time_setup} s")
+        logger.debug(f"original stagein time: {time_stagein} s")
+        time_setup -= time_remoteio
+        time_stagein += time_remoteio
+        logger.debug(f"corrected setup time: {time_setup} s")
+        logger.debug(f"corrected stagein time: {time_stagein} s")

     logger.info('.' * 30)
     logger.info('. Timing measurements:')

From e35fcaa5294f44d5501d535e55b327f918e0a375 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 2 Dec 2024 10:50:01 +0100
Subject: [PATCH 61/65] Improved exception handling for socket related errors

---
 pilot/util/tracereport.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pilot/util/tracereport.py b/pilot/util/tracereport.py
index d6b25ca5..b3607be4 100644
--- a/pilot/util/tracereport.py
+++ b/pilot/util/tracereport.py
@@ -114,7 +114,7 @@ def init(self, job):

         try:
             hostname = os.environ.get('PANDA_HOSTNAME', socket.gethostname())
-        except socket.herror as exc:
+        except (socket.gaierror, socket.herror) as exc:
             logger.warning(f'unable to detect hostname for trace report: {exc}')
             hostname = os.environ.get('PANDA_HOSTNAME', 'unknown')

@@ -126,7 +126,7 @@ def init(self, job):

         try:
             self['ip'] = socket.gethostbyname(hostname)
-        except socket.herror as exc:
+        except (socket.gaierror, socket.herror) as exc:
             logger.debug(f"unable to detect host IP for trace report: {exc}")
             self['ip'] = '0.0.0.0'

From 94681339cbb592983e5b9f55fe0f9dc0e4b484db Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 2 Dec 2024 11:24:09 +0100
Subject: [PATCH 62/65] Improved extraction of dmesg memory error

---
 pilot/control/payload.py | 25 ++++++++++++++++++++++++-
 pilot/util/constants.py  |  2 +-
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/pilot/control/payload.py b/pilot/control/payload.py
index 7d091073..9607e218 100644
--- a/pilot/control/payload.py
+++ b/pilot/control/payload.py
@@ -744,7 +744,30 @@ def scan_for_memory_errors(subprocesses: list) -> str:
             if search_str in line:
                 diagnostics = line[line.find(search_str):]
                 logger.warning(f'found memory error: {diagnostics}')
-                break
+
+                # make sure that this message is for a true subprocess of the pilot
+                # extract the pid from the message and compare it to the subprocesses list
+                match = search(r'Killed process (\d+)', diagnostics)
+                if match:
+                    try:
+                        found_pid = int(match.group(1))
+                        logger.info(f"extracted PID: {found_pid}")
+
+                        # is it a known subprocess?
+                        if found_pid in subprocesses:
+                            logger.info("PID found in the list of subprocesses")
+                            break
+                        else:
+                            logger.warning("the extracted PID is not a known subprocess of the payload")
+                            diagnostics = ""
+                            # is the extracted PID a subprocess of the main pilot process itself?
+
+                    except (ValueError, TypeError, AttributeError) as e:
+                        logger.warning(f"failed to extract PID from the message: {e}")
+                        diagnostics = ""
+                else:
+                    logger.warning("PID could not be extracted from the message")
+                    diagnostics = ""

         if diagnostics:
             break
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index a803cd25..e7454438 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3' # released number should be fixed at 3 for Pilot 3
 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '37' # build number should be reset to '1' for every new development cycle
+BUILD = '38' # build number should be reset to '1' for every new development cycle

 SUCCESS = 0
 FAILURE = 1

From e8d5b8a32f8d634f6ab2860ab55c96531c8bdfec Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Mon, 2 Dec 2024 17:54:09 +0100
Subject: [PATCH 63/65] Improved error message displayed on the monitor for remote i/o errors

---
 PILOTVERSION               | 2 +-
 pilot/user/atlas/common.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index ea783d7a..4fb479ab 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.37
\ No newline at end of file
+3.9.2.38
\ No newline at end of file
diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py
index 1533f369..660e206b 100644
--- a/pilot/user/atlas/common.py
+++ b/pilot/user/atlas/common.py
@@ -548,6 +548,8 @@ def get_payload_command(job: JobData, args: object = None) -> str:

     # fail the job if the remote files could not be verified
     if exitcode != 0:
+        # improve the error diagnostics
+        diagnostics = errors.format_diagnostics(exitcode, diagnostics)
         job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exitcode, msg=diagnostics)
         raise PilotException(diagnostics, code=exitcode)
     else:

From 61670c5b4a8a7b8316b4da1dc49fdc8f2782fdcd Mon Sep 17 00:00:00 2001
From: PalNilsson
Date: Wed, 4 Dec 2024 09:44:46 +0100
Subject: [PATCH 64/65] Delayed first CPU consumption time

---
 PILOTVERSION             |  2 +-
 pilot/info/jobdata.py    |  1 +
 pilot/util/constants.py  |  2 +-
 pilot/util/monitoring.py | 15 +++++++++++----
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 4fb479ab..8c5d7627 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.38
\ No newline at end of file
+3.9.2.39
\ No newline at end of file
diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py
index b769f831..c9e41ad6 100644
--- a/pilot/info/jobdata.py
+++ b/pilot/info/jobdata.py
@@ -133,6 +133,7 @@ class JobData(BaseData):
     prodproxy = ""  # to keep track of production proxy on unified queues
     completed = False  # True when job has finished or failed, used by https::send_update()
     lsetuptime = 0  # payload setup time (lsetup)
+    runningstart = None  # time when the payload started running (only for internal monitoring purposes, not the actual start time)

     # time variable used for on-the-fly cpu consumption time measurements done by job monitoring
     t0 = None  # payload startup time
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index e7454438..4cc40074 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3' # released number should be fixed at 3 for Pilot 3
 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '38' # build number should be reset to '1' for every new development cycle
+BUILD = '39' # build number should be reset to '1' for every new development cycle

 SUCCESS = 0
 FAILURE = 1
diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index 8471904b..87a131ee 100644
--- a/pilot/util/monitoring.py
+++ b/pilot/util/monitoring.py
@@ -100,7 +100,11 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i

     # update timing info for running jobs (to avoid an update after the job has finished)
     if job.state == 'running':
+        # keep track of the time since the job started running (approximate since it is set here, move later)
+        if not job.runningstart:
+            job.runningstart = current_time

+        # check the disk space
         # make sure that any utility commands are still running (and determine pid of memory monitor - as early as possible)
         if job.utilities != {}:
             utility_monitor(job)
@@ -108,10 +112,13 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
         # confirm that the worker node has a proper SC_CLK_TCK (problems seen on MPPMU)
         check_hz()

-        # set the CPU consumption time for the job
-        exit_code, diagnostics = set_cpu_consumption_time(job)
-        if exit_code:
-            return exit_code, diagnostics
+        # set the CPU consumption time for the job (if it has been running for > 10 s)
+        if job.runningstart and (current_time - job.runningstart) > 10:
+            exit_code, diagnostics = set_cpu_consumption_time(job)
+            if exit_code:
+                return exit_code, diagnostics
+        else:
+            logger.debug('skipping CPU consumption time check since job has not been running for long enough')

         # keep track of the subprocesses running (store payload subprocess PIDs)
         store_subprocess_pids(job)

From 2d4e5b4647197067ec03f28601559b4fcc21c637 Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Thu, 5 Dec 2024 09:42:18 +0100
Subject: [PATCH 65/65] Corrected core reporting

---
 PILOTVERSION             | 2 +-
 pilot/util/constants.py  | 2 +-
 pilot/util/workernode.py | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 8c5d7627..aeb659e0 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.39
\ No newline at end of file
+3.9.2.41
\ No newline at end of file
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index 4cc40074..2b5785b8 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3' # released number should be fixed at 3 for Pilot 3
 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '39' # build number should be reset to '1' for every new development cycle
+BUILD = '41' # build number should be reset to '1' for every new development cycle

 SUCCESS = 0
 FAILURE = 1
diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py
index aeeba599..df7e7227 100644
--- a/pilot/util/workernode.py
+++ b/pilot/util/workernode.py
@@ -381,13 +381,13 @@ def get_cpu_cores(modelstring):
     logger.info(f'found {number_of_cores} cores ({cores_per_socket} cores per socket, {sockets} sockets)')

     logger.debug(f'current model string: {modelstring}')
-    if number_of_cores > 0 and '-Core' not in modelstring:
+    if number_of_cores > 0:
         if '-Core Processor' in modelstring:  # NN-Core info already in string - update it
             pattern = r'(\d+)\-Core Processor'
             _nn = re.findall(pattern, modelstring)
             if _nn:
                 modelstring = modelstring.replace(f'{_nn[0]}-Core', f'{number_of_cores}-Core')
-        if 'Core Processor' in modelstring:
+        elif 'Core Processor' in modelstring:
             modelstring = modelstring.replace('Core', '%d-Core' % number_of_cores)
         elif 'Processor' in modelstring:
             modelstring = modelstring.replace('Processor', '%d-Core Processor' % number_of_cores)
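
A note on the core-reporting fix in PATCH 65/65: the old guard `'-Core' not in modelstring` meant an existing "NN-Core Processor" tag was never refreshed with the detected core count, and the non-exclusive second `if` could rewrite the string twice. The following is a minimal standalone sketch of the corrected rewrite logic; the helper name and the example model strings are hypothetical illustrations, not taken from the pilot source:

    import re

    def rewrite_core_count(modelstring: str, number_of_cores: int) -> str:
        """Sketch of the corrected core-count rewrite in get_cpu_cores() (hypothetical standalone helper)."""
        if number_of_cores <= 0:
            return modelstring
        if '-Core Processor' in modelstring:
            # NN-Core info already in the string - replace the old count with the detected one
            _nn = re.findall(r'(\d+)-Core Processor', modelstring)
            if _nn:
                modelstring = modelstring.replace(f'{_nn[0]}-Core', f'{number_of_cores}-Core')
        elif 'Core Processor' in modelstring:
            # e.g. 'Dual Core Processor' -> 'Dual 16-Core Processor'
            modelstring = modelstring.replace('Core', f'{number_of_cores}-Core')
        elif 'Processor' in modelstring:
            # no core info at all - insert it before 'Processor'
            modelstring = modelstring.replace('Processor', f'{number_of_cores}-Core Processor')
        return modelstring

    # hypothetical example inputs
    print(rewrite_core_count('AMD EPYC 7702 64-Core Processor', 128))      # -> 'AMD EPYC 7702 128-Core Processor'
    print(rewrite_core_count('Intel(R) Xeon(R) Platinum Processor', 16))   # -> 'Intel(R) Xeon(R) Platinum 16-Core Processor'

With the pre-patch guard, the first call would have returned the string unchanged (because '-Core' was already present), which is exactly the mis-reporting the patch corrects.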