From df9a4d9ddd21c094430ab9a30707a8775d3e06ca Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 9 Sep 2024 16:50:40 +0200 Subject: [PATCH 1/9] New version --- PILOTVERSION | 2 +- pilot/util/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 4b3244f5..bd3ffbfc 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.8.1.66 \ No newline at end of file +3.8.2.1 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 0a4003b3..9e37aba5 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -27,8 +27,8 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '8' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '66' # build number should be reset to '1' for every new development cycle +REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '1' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From a292eb8d8774cb6db6e2c76b9f4af33d8a9bb945 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 17 Sep 2024 11:53:13 +0200 Subject: [PATCH 2/9] Improved IPv6 info extraction --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/networking.py | 26 +++++++++++++++++++++++--- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bd3ffbfc..ebf6e6f6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.8.2.1 \ No newline at end of file +3.8.2.2 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9e37aba5..8c0bcef3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '8' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '1' # build number should be reset to '1' for every new development cycle +BUILD = '2' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/networking.py b/pilot/util/networking.py index 5e03368d..d081b784 100644 --- a/pilot/util/networking.py +++ b/pilot/util/networking.py @@ -35,7 +35,7 @@ def dump_ipv6_info() -> None: """Dump the IPv6 info to the log.""" cmd = 'ifconfig' if not is_command_available(cmd): - _cmd = '/usr/sbin/ifconfig' + _cmd = '/usr/sbin/ifconfig -a' if not is_command_available(_cmd): logger.warning(f'command {cmd} is not available - this WN might not support IPv6') return @@ -43,15 +43,35 @@ def dump_ipv6_info() -> None: _, stdout, stderr = execute(cmd, timeout=10) if stdout: - ipv6 = extract_ipv6(stdout) + ipv6 = extract_ipv6_addresses(stdout) if ipv6: logger.info(f'IPv6 addresses: {ipv6}') else: - logger.warning('no IPv6 addresses found - this WN does not support IPv6') + logger.warning('no IPv6 addresses were found') else: logger.warning(f'failed to run ifconfig: {stderr}') +def extract_ipv6_addresses(ifconfig_output: str) -> list: + """Extracts IPv6 addresses from ifconfig output. + + Args: + ifconfig_output: The output of the ifconfig command. + + Returns: + A list of IPv6 addresses. + """ + + ipv6_addresses = [] + for line in ifconfig_output.splitlines(): + line = line.strip().replace("\t", " ").replace("\r", "").replace("\n", "") + match = re.search(r"inet6 (.*?)\s", line) + if match and match.group(1) != "::1": # skip loopback address + ipv6_addresses.append(match.group(1)) + + return ipv6_addresses + + def extract_ipv6(ifconfig: str) -> str: """ Extract the IPv6 address from the ifconfig output. From 3e6ef76c0514fa85c043d328471717efc767fc3c Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 18 Sep 2024 11:01:16 +0200 Subject: [PATCH 3/9] Spelling correction. Debug info --- pilot/api/data.py | 2 +- pilot/control/data.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 3305b47f..a0748044 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -133,7 +133,7 @@ def __init__(self, self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', ''), ipv=self.ipv, workdir=self.workdir) if not self.acopytools: - msg = f'failed to initilize StagingClient: no acopytools options found, acopytools={self.acopytools}' + msg = f'failed to initialize StagingClient: no acopytools options found, acopytools={self.acopytools}' logger.error(msg) self.trace_report.update(clientState='BAD_COPYTOOL', stateReason=msg) self.trace_report.send() diff --git a/pilot/control/data.py b/pilot/control/data.py index 3c76a9b9..864d6bfe 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -285,11 +285,13 @@ def _stage_in(args: object, job: JobData) -> bool: logger.info('stage-in will not be done in a container') client, activity = get_stagein_client(job, args, label) + logger.info(f'activity={activity}') use_pcache = job.infosys.queuedata.use_pcache - + logger.debug(f'use_pcache={use_pcache}') # get the proper input file destination (normally job.workdir unless stager workflow) jobworkdir = job.workdir # there is a distinction for mv copy tool on ND vs non-ATLAS workdir = get_proper_input_destination(job.workdir, args.input_destination_dir) + logger.debug(f'workdir={workdir}') kwargs = {'workdir': workdir, 'cwd': job.workdir, 'usecontainer': False, @@ -301,7 +303,9 @@ def _stage_in(args: object, job: JobData) -> bool: 'rucio_host': args.rucio_host, 'jobworkdir': jobworkdir, 'args': args} + logger.debug(f'kwargs={kwargs}') client.prepare_sources(job.indata) + logger.info('prepared sources - will now transfer files') client.transfer(job.indata, activity=activity, **kwargs) except PilotException as error: error_msg = traceback.format_exc() From c9ce7f9a7c25722d7d34ff014724cacc67825604 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 18 Sep 2024 16:11:53 +0200 Subject: [PATCH 4/9] Now using psutils instead of ps command --- pilot/user/atlas/utilities.py | 22 ++++++++++++++-------- pilot/util/constants.py | 2 +- pilot/util/psutils.py | 24 ++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index c73df92a..27d91384 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -41,7 +41,10 @@ ) from pilot.util.parameters import convert_to_int from pilot.util.processes import is_process_running -from pilot.util.psutils import get_command_by_pid +from pilot.util.psutils import ( + get_command_by_pid, + find_process_by_jobid +) from .setup import get_asetup @@ -158,11 +161,9 @@ def get_proper_pid(pid: int, jobid: str, use_container: bool = True) -> int: if not is_process_running(pid): return -1 - ps = get_ps_info() - - # lookup the process id using ps aux + # lookup the process id using ps command or psutils logger.debug(f'attempting to identify pid from job id ({jobid})') - _pid = get_pid_for_jobid(ps, jobid) + _pid = get_pid_for_jobid(jobid) if _pid: logger.debug(f'discovered pid={_pid} for job id {jobid}') cmd = get_command_by_pid(_pid) @@ -188,6 +189,8 @@ def get_ps_info(whoami: str = None, options: str = 'axfo pid,user,args') -> str: """ Return ps info for the given user. + Note: this is a fallback solution in case the pid cannot be found in the psutils lookup. + :param whoami: username (str) :param options: ps options (str) :return: ps aux for given user (str). @@ -200,16 +203,19 @@ def get_ps_info(whoami: str = None, options: str = 'axfo pid,user,args') -> str: return stdout -def get_pid_for_jobid(ps: str, jobid: str) -> int or None: +def get_pid_for_jobid(jobid: str) -> int or None: """ Return the process id for the ps entry that contains the job id. - :param ps: ps command output (str) :param jobid: PanDA job id (str). :return: pid (int) or None if no such process (int or None). """ - pid = None + pid = find_process_by_jobid(jobid) + if pid: + return pid + # fallback to ps command + ps = get_ps_info() for line in ps.split('\n'): if jobid in line and 'xrootd' not in line: # extract pid diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 8c0bcef3..83e17faf 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '8' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '2' # build number should be reset to '1' for every new development cycle +BUILD = '4' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py index f9b606e2..70bea556 100644 --- a/pilot/util/psutils.py +++ b/pilot/util/psutils.py @@ -267,3 +267,27 @@ def get_command_by_pid(pid: int) -> str or None: except psutil.NoSuchProcess: logger.warning(f"process with PID {pid} not found") return None + + +def find_process_by_jobid(jobid: int) -> int or None: + """ + Find the process ID of a process whose command arguments contain the given job ID. + + :param jobid: the job ID to search for (int) + :return: the process ID of the matching process, or None if no match is found (int or None). + """ + if not _is_psutil_available: + logger.warning('find_process_by_jobid(): psutil not available - aborting') + return None + + for proc in psutil.process_iter(): + try: + cmd_line = proc.cmdline() + except psutil.NoSuchProcess: + continue + + for arg in cmd_line: + if str(jobid) in arg: + return proc.pid + + return None From a6acee86a87e6a8a8e814f9d6168c1c838108f94 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 18 Sep 2024 18:21:57 +0200 Subject: [PATCH 5/9] Added debug messages, avoiding xrootd command --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/psutils.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ebf6e6f6..3466543b 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.8.2.2 \ No newline at end of file +3.8.2.5 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 83e17faf..4a6c98f4 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '8' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '4' # build number should be reset to '1' for every new development cycle +BUILD = '5' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py index 70bea556..d54670b3 100644 --- a/pilot/util/psutils.py +++ b/pilot/util/psutils.py @@ -286,8 +286,10 @@ def find_process_by_jobid(jobid: int) -> int or None: except psutil.NoSuchProcess: continue + logger.debug(f'cmd_line={cmd_line}') for arg in cmd_line: - if str(jobid) in arg: + logger.debug(f'arg={arg}') + if str(jobid) in arg and 'xrootd' not in arg: return proc.pid return None From 5685d2a116b7401d604e8ba62b353d0ace1a05a2 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 19 Sep 2024 17:48:51 +0200 Subject: [PATCH 6/9] Added possibility of setting rt logging info in catchall --- PILOTVERSION | 2 +- pilot/control/payload.py | 20 +++++++++++++++++--- pilot/util/constants.py | 2 +- pilot/util/realtimelogger.py | 2 +- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 3466543b..0a24dc84 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.8.2.5 \ No newline at end of file +3.8.2.6 \ No newline at end of file diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 25adce15..7d091073 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -376,13 +376,21 @@ def extract_error_info(error: str) -> (int, str): return error_code, diagnostics -def get_rtlogging() -> str: +def get_rtlogging(catchall: str) -> str: """ - Return the proper rtlogging value from the experiment specific plug-in or the config file. + Return the proper rtlogging value from PQ.catchall, the experiment specific plug-in or the config file. + :param catchall: catchall field from queuedata (str) :return: rtlogging (str). """ + if catchall: + _rtlogging = findall(r'logging=([^,]+)', catchall) + if _rtlogging and ";" in _rtlogging[0]: + logger.info(f"found rtlogging in catchall: {_rtlogging[0]}") + return _rtlogging[0] + rtlogging = None + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() try: user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) @@ -419,7 +427,13 @@ def get_logging_info(job: JobData, args: object) -> dict: info_dic['logname'] = args.realtime_logname if args.realtime_logname else "pilot-log" logserver = args.realtime_logging_server if args.realtime_logging_server else "" - info = findall(r'(\S+)\;(\S+)\:\/\/(\S+)\:(\d+)', get_rtlogging()) + try: + catchall = job.infosys.queuedata.catchall + except Exception as exc: + logger.warning(f'exception caught: {exc}') + catchall = "" + + info = findall(r'(\S+)\;(\S+)\:\/\/(\S+)\:(\d+)', get_rtlogging(catchall)) if not logserver and not info: logger.warning(f"not enough info available for activating real-time logging (info='{info}', logserver='{logserver}')") return {} diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4a6c98f4..b2693f62 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '8' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '5' # build number should be reset to '1' for every new development cycle +BUILD = '6' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/realtimelogger.py b/pilot/util/realtimelogger.py index d06984fd..df085a29 100644 --- a/pilot/util/realtimelogger.py +++ b/pilot/util/realtimelogger.py @@ -121,7 +121,7 @@ def __init__(self, args: Any, info_dic: dict, workdir: str, secrets: str, level: if workdir: # bypass pylint warning - keep workdir for possible future development pass if not info_dic: - logger.warning('info dictionary not set - add \'logging=type:protocol://host:port\' to PQ.catchall)') + logger.warning('info dictionary not set - add \'logging=type;protocol://host:port\' to PQ.catchall)') RealTimeLogger.glogger = None return From 98d1360417feb53165e1b514b8ae5cf17f040fdf Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 23 Sep 2024 12:09:11 +0200 Subject: [PATCH 7/9] Removed debug message --- pilot/util/psutils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py index d54670b3..eb70f263 100644 --- a/pilot/util/psutils.py +++ b/pilot/util/psutils.py @@ -286,9 +286,7 @@ def find_process_by_jobid(jobid: int) -> int or None: except psutil.NoSuchProcess: continue - logger.debug(f'cmd_line={cmd_line}') for arg in cmd_line: - logger.debug(f'arg={arg}') if str(jobid) in arg and 'xrootd' not in arg: return proc.pid From 486af877850f079cf729534c2f853ec9ce2168da Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 23 Sep 2024 13:08:59 +0200 Subject: [PATCH 8/9] Improved exception handling --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/https.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 0a24dc84..6e7d9009 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.8.2.6 \ No newline at end of file +3.8.2.7 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b2693f62..4c42e76e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '8' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '6' # build number should be reset to '1' for every new development cycle +BUILD = '7' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/https.py b/pilot/util/https.py index 9a2da0d2..a26c75ce 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -646,7 +646,7 @@ def get_panda_server(url: str, port: int, update_server: bool = True) -> str: if default in pandaserver: try: rnd = random.choice([socket.getfqdn(vv) for vv in set([v[-1][0] for v in socket.getaddrinfo(default, 25443, socket.AF_INET)])]) - except socket.herror as exc: + except (socket.herror, socket.gaierror) as exc: logger.warning(f'failed to get address from socket: {exc} - will use default server ({pandaserver})') else: pandaserver = pandaserver.replace(default, rnd) From adaedd74cd00ea0126803f3af4349775c5e0151b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 24 Sep 2024 10:53:41 +0200 Subject: [PATCH 9/9] Merge with Alexey's code --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6e7d9009..e88f7664 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.8.2.7 \ No newline at end of file +3.8.2.8 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4c42e76e..e497600c 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '8' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '7' # build number should be reset to '1' for every new development cycle +BUILD = '8' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1