From ab0322988e93c49c44295e1f6312a054dfd5fc17 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 30 Apr 2024 17:18:53 +0200 Subject: [PATCH 1/7] New version --- PILOTVERSION | 2 +- pilot/util/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bb737956..1c95e714 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.3.84 \ No newline at end of file +3.7.4.1 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 19f8bf94..dcb89023 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -27,8 +27,8 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '84' # build number should be reset to '1' for every new development cycle +REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '1' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 796c0f4c4a5779d244451f7c77f3d645542e060f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 1 May 2024 16:04:02 +0200 Subject: [PATCH 2/7] Added sleep --- pilot/user/atlas/container.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index f4ff323b..6bc5af8f 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -861,7 +861,7 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str continue # Timeout for python script after LSETUP_COMPLETED - if lsetup_completed and time.time() - start_time > python_script_timeout: + if lsetup_completed and ((time.time() - start_time) > python_script_timeout): logger.warning("timeout for 'python3' subscript exceeded - killing script") process.kill() break @@ -873,6 +873,8 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str exit_code = return_code break + time.sleep(0.5) + # Ensure process is terminated if process.poll() is None: process.terminate() From 3856fdeaef07d5a6c33e3c2354ff75bba79ecc98 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 1 May 2024 16:24:29 +0200 Subject: [PATCH 3/7] Added error code 1378. Now identifying lsetup and remote file open time outs in new code. Refactored resolve trf error function --- pilot/common/errorcodes.py | 55 ++++++++++++++++++++--------------- pilot/user/atlas/common.py | 2 ++ pilot/user/atlas/container.py | 2 ++ 3 files changed, 35 insertions(+), 24 deletions(-) diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 4f059612..4c84c373 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -178,6 +178,7 @@ class ErrorCodes: LEASETIME = 1375 LOGCREATIONTIMEOUT = 1376 CVMFSISNOTALIVE = 1377 + LSETUPTIMEDOUT = 1378 _error_messages = { GENERALERROR: "General pilot error, consult batch log", @@ -317,7 +318,8 @@ class ErrorCodes: REMOTEFILEDICTDOESNOTEXIST: "Remote file open dictionary does not exist", LEASETIME: "Lease time is up", # internal use only LOGCREATIONTIMEOUT: "Log file creation timed out", - CVMFSISNOTALIVE: "CVMFS is not responding" + CVMFSISNOTALIVE: "CVMFS is not responding", + LSETUPTIMEDOUT: "Lsetup command timed out during remote file open" } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] @@ -435,34 +437,39 @@ def resolve_transform_error(self, exit_code: int, stderr: str) -> int: :param stderr: transform stderr (str) :return: pilot error code (int). """ - if exit_code and "Not mounting requested bind point" in stderr: - exit_code = self.SINGULARITYBINDPOINTFAILURE + error_map = { + "Not mounting requested bind point": self.SINGULARITYBINDPOINTFAILURE, + "No more available loop devices": self.SINGULARITYNOLOOPDEVICES, + "Failed to mount image": self.SINGULARITYIMAGEMOUNTFAILURE, + "error: while mounting": self.SINGULARITYIMAGEMOUNTFAILURE, + "Operation not permitted": self.SINGULARITYGENERALFAILURE, + "Failed to create user namespace": self.SINGULARITYFAILEDUSERNAMESPACE, + "Singularity is not installed": self.SINGULARITYNOTINSTALLED, + "Apptainer is not installed": self.APPTAINERNOTINSTALLED, + "cannot create directory": self.MKDIR, + "General payload setup verification error": self.SETUPFAILURE + } + + # Check if stderr contains any known error messages + for error_message, error_code in error_map.items(): + if error_message in stderr: + return error_code + + # Handle specific exit codes + if exit_code == 2: + return self.LSETUPTIMEDOUT + elif exit_code == 3: + return self.REMOTEFILEOPENTIMEDOUT elif exit_code == 251: - exit_code = self.UNKNOWNTRFFAILURE - elif exit_code and "No more available loop devices" in stderr: - exit_code = self.SINGULARITYNOLOOPDEVICES - elif exit_code and ("Failed to mount image" in stderr or "error: while mounting" in stderr): - exit_code = self.SINGULARITYIMAGEMOUNTFAILURE - elif exit_code and "Operation not permitted" in stderr: - exit_code = self.SINGULARITYGENERALFAILURE - elif exit_code and "Failed to create user namespace" in stderr: - exit_code = self.SINGULARITYFAILEDUSERNAMESPACE - elif "Singularity is not installed" in stderr: # exit code should be 64 but not always? - exit_code = self.SINGULARITYNOTINSTALLED - elif "Apptainer is not installed" in stderr: # exit code should be 64 but not always? - exit_code = self.APPTAINERNOTINSTALLED - elif exit_code == 64 and "cannot create directory" in stderr: - exit_code = self.MKDIR - elif exit_code and "General payload setup verification error" in stderr: - exit_code = self.SETUPFAILURE + return self.UNKNOWNTRFFAILURE elif exit_code == -1: - exit_code = self.UNKNOWNTRFFAILURE + return self.UNKNOWNTRFFAILURE elif exit_code == self.COMMANDTIMEDOUT: - pass + return exit_code elif exit_code != 0: - exit_code = self.PAYLOADEXECUTIONFAILURE + return self.PAYLOADEXECUTIONFAILURE - return exit_code + return exit_code # Return original exit code if no specific error is found def extract_stderr_error(self, stderr: str) -> str: """ diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 9216ba13..0d949809 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -268,6 +268,8 @@ def open_remote_files(indata: list, workdir: str, nthreads: int) -> (int, str, l logger.warning(diagnostics) return 11, diagnostics, not_opened + # if execute_remote_file_open() returns exit code 1, it means general error. + # exit code 2 means that lsetup timed out, while 3 means that the python script (actual file open) timed out try: exitcode, stdout = execute_remote_file_open(path, timeout) except PilotException as exc: diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 6bc5af8f..919cdf61 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -833,6 +833,7 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str # Check for timeout (once per second) if time.time() - start_time > lsetup_timeout and not lsetup_completed: logger.warning("timeout for 'lsetup' exceeded - killing script") + exit_code = 2 # 'lsetup' timeout process.kill() break @@ -863,6 +864,7 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str # Timeout for python script after LSETUP_COMPLETED if lsetup_completed and ((time.time() - start_time) > python_script_timeout): logger.warning("timeout for 'python3' subscript exceeded - killing script") + exit_code = 3 # python script timeout process.kill() break From fcb72b5854a1e658a97a96896a80ecc7ce2ab3cc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 1 May 2024 16:27:39 +0200 Subject: [PATCH 4/7] Increased time out from 600 s to 900 s --- pilot/user/atlas/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 0d949809..2bd8a56e 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -321,7 +321,7 @@ def get_timeout_for_remoteio(indata: list) -> int: """ remote_io = [fspec.status == 'remote_io' for fspec in indata] - return len(remote_io) * 30 + 600 + return len(remote_io) * 30 + 900 def parse_remotefileverification_dictionary(workdir: str) -> (int, str, list): From 809830020fb378d2c5a97a483a761ea89efe72dc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 2 May 2024 18:15:17 +0200 Subject: [PATCH 5/7] Added debug info for remote file open timeouts --- pilot/user/atlas/container.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 919cdf61..2e9bc3b7 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -842,10 +842,11 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str output = process.stdout.readline() # Read bytes directly if output is not None: # Check if any output is available (not None) output = output.decode().strip() - logger.info(output) # Print output for monitoring + logger.info(f'remote file open: {output}') # Check for LSETUP_COMPLETED message if output == "LSETUP_COMPLETED": + logger.info('lsetup has completed (resetting start time)') lsetup_completed = True start_time = time.time() # Reset start time for 'python3' timeout @@ -863,7 +864,8 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str # Timeout for python script after LSETUP_COMPLETED if lsetup_completed and ((time.time() - start_time) > python_script_timeout): - logger.warning("timeout for 'python3' subscript exceeded - killing script") + logger.warning(f"timeout for 'python3' subscript exceeded - killing script " + f"({time.time()} - {start_time} > {python_script_timeout})") exit_code = 3 # python script timeout process.kill() break From a41d2c6c1da389e9e38ca0c67c7b6874e0ac4b0a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 2 May 2024 18:19:42 +0200 Subject: [PATCH 6/7] Corrected log message --- pilot/user/atlas/container.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 2e9bc3b7..e74b1ceb 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -873,7 +873,7 @@ def execute_remote_file_open(path: str, python_script_timeout: int) -> (int, str # Check if script has completed normally return_code = process.poll() if return_code is not None: - logger.info("script execution completed with return code: {return_code}") + logger.info(f"script execution completed with return code: {return_code}") exit_code = return_code break From 612f379413e8748a725fc392e45c8a4d580b12d7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 6 May 2024 19:12:52 +0200 Subject: [PATCH 7/7] Corrected cvmfs root --- PILOTVERSION | 2 +- pilot/user/atlas/cvmfs.py | 11 +++++++++-- pilot/util/constants.py | 2 +- pilot/util/cvmfs.py | 6 +++++- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 1c95e714..fe6e3d7f 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.4.1 \ No newline at end of file +3.7.4.2 \ No newline at end of file diff --git a/pilot/user/atlas/cvmfs.py b/pilot/user/atlas/cvmfs.py index 42212a33..03568b11 100644 --- a/pilot/user/atlas/cvmfs.py +++ b/pilot/user/atlas/cvmfs.py @@ -33,8 +33,6 @@ 'CVMFS_BASE/unpacked.cern.ch/logDir/lastUpdate', 'CVMFS_BASE/sft-nightlies.cern.ch/lcg/lastUpdate', ] -# when was the last cvmfs update? -last_update_file = '/cvmfs/sft.cern.ch/lcg/lastUpdate' def get_cvmfs_base_path() -> str: @@ -44,3 +42,12 @@ def get_cvmfs_base_path() -> str: :return: base path for CVMFS (str). """ return get_file_system_root_path() + + +def get_last_update_file() -> str: + """ + Return the last update file. + + :return: last update file (str). + """ + return f'{get_cvmfs_base_path()}/sft.cern.ch/lcg/lastUpdate' diff --git a/pilot/util/constants.py b/pilot/util/constants.py index dcb89023..da65ded0 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '1' # build number should be reset to '1' for every new development cycle +BUILD = '2' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/cvmfs.py b/pilot/util/cvmfs.py index 23d9560b..a869d528 100644 --- a/pilot/util/cvmfs.py +++ b/pilot/util/cvmfs.py @@ -87,7 +87,11 @@ def get_last_update() -> int: """ pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__(f'pilot.user.{pilot_user}.cvmfs', globals(), locals(), [pilot_user], 0) - last_update_file = getattr(user, 'last_update_file', None) + try: + last_update_file = user.get_last_update_file() + except AttributeError: + last_update_file = None + timestamp = None if last_update_file: if os.path.exists(last_update_file):