From ad666d1f1f9491ff5e12bfd43776282a9f0a9bb6 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 8 May 2024 10:04:33 +0200 Subject: [PATCH 01/41] New version --- PILOTVERSION | 2 +- pilot/util/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index b53bc731..71e9aaea 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.5.4 \ No newline at end of file +3.7.6.1 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6053694d..056dd905 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -27,8 +27,8 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '5' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '4' # build number should be reset to '1' for every new development cycle +REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '1' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 01a7693428b807cdc1d99907b56fd81c6a2b3de2 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 8 May 2024 15:54:41 +0200 Subject: [PATCH 02/41] Debug info for write_json() --- pilot/util/filehandling.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 307bce9a..cc5afd9b 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -401,14 +401,25 @@ def write_json(filename: str, data: Union[dict, list], sort_keys: bool = True, i """ status = False + logger.debug(f"data: {data} (type={type(data)}") + if os.path.exists(filename): + logger.debug(f'file already exists: {filename} timestamp: {os.path.getmtime(filename)}') + else: + logger.debug(f'writing data to file: {filename} (file does not exist yet)') try: with open(filename, 'w', encoding='utf-8') as _fh: dumpjson(data, _fh, sort_keys=sort_keys, indent=indent, separators=separators) + if os.path.exists(filename): + logger.debug(f'wrote data to file: {filename} timestamp: {os.path.getmtime(filename)}') + else: + logger.warning(f'failed to write data to file: {filename}') except (IOError, TypeError) as exc: - logger.warning(f'exception caught in write_json: {exc}') + logger.warning(f'exception caught (1) in write_json: {exc}') + except Exception as exc: + logger.warning(f'exception caught (2) in write_json: {exc}') else: status = True - + logger.debug(f"write_json status: {status} timestamp: {os.path.getmtime(filename)}") return status From c90726ff66a1f7d0f569abd7f05520522e94d0b4 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 9 May 2024 13:53:25 +0200 Subject: [PATCH 03/41] Corrected problem with handling write_json() return type --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 11 +---------- pilot/util/harvester.py | 4 ++-- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 71e9aaea..0e096134 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.6.1 \ No newline at end of file +3.7.6.2 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 056dd905..fa7ee346 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number 
should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '1' # build number should be reset to '1' for every new development cycle +BUILD = '2' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index cc5afd9b..4a0deb2c 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -401,25 +401,16 @@ def write_json(filename: str, data: Union[dict, list], sort_keys: bool = True, i """ status = False - logger.debug(f"data: {data} (type={type(data)}") - if os.path.exists(filename): - logger.debug(f'file already exists: {filename} timestamp: {os.path.getmtime(filename)}') - else: - logger.debug(f'writing data to file: {filename} (file does not exist yet)') try: with open(filename, 'w', encoding='utf-8') as _fh: dumpjson(data, _fh, sort_keys=sort_keys, indent=indent, separators=separators) - if os.path.exists(filename): - logger.debug(f'wrote data to file: {filename} timestamp: {os.path.getmtime(filename)}') - else: - logger.warning(f'failed to write data to file: {filename}') except (IOError, TypeError) as exc: logger.warning(f'exception caught (1) in write_json: {exc}') except Exception as exc: logger.warning(f'exception caught (2) in write_json: {exc}') else: status = True - logger.debug(f"write_json status: {status} timestamp: {os.path.getmtime(filename)}") + return status diff --git a/pilot/util/harvester.py b/pilot/util/harvester.py index 9b59a796..efa2ee21 100644 --- a/pilot/util/harvester.py +++ b/pilot/util/harvester.py @@ -282,8 +282,8 @@ def publish_work_report(work_report: dict = {}, worker_attributes_file: str = "w if "xml" in work_report: del (work_report["xml"]) - ec = write_json(worker_attributes_file, work_report) - if ec: + status = write_json(worker_attributes_file, work_report) + if not status: logger.error(f"work report publish failed: {work_report}") return False else: From 6622dc156d8a4f4e5d6a70aaffa040268e466c6a Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 10 May 2024 09:58:09 +0200 Subject: [PATCH 04/41] Updated cpu_arch handling --- PILOTVERSION | 2 +- pilot/user/atlas/utilities.py | 3 ++- pilot/util/constants.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 0e096134..45ad4061 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.6.2 \ No newline at end of file +3.7.6.3 \ No newline at end of file diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index d5cea6ea..41cb3e20 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -883,7 +883,8 @@ def filter_output(stdout): # CPU arch script has now been copied, time to execute it # (reset irrelevant stderr) ec, stdout, stderr = execute(cmd) - if ec == 0 and 'RHEL9 and clone support is relatively new' in stderr: + if ec == 0 and ('RHEL9 and clone support is relatively new' in stderr or + 'RHEL8 and clones are not supported for users' in stderr): stderr = '' if ec or stderr: logger.warning(f'ec={ec}, stdout={stdout}, stderr={stderr}') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index fa7ee346..21679fd3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' 
# version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '2' # build number should be reset to '1' for every new development cycle +BUILD = '3' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 6b3b3313c3c957f16f00406214b054882f268d19 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 10 May 2024 10:56:13 +0200 Subject: [PATCH 05/41] Aborting cvmfs check if env var set --- PILOTVERSION | 2 +- pilot.py | 45 +++++++++++++++++++++++++++++------------ pilot/util/constants.py | 2 +- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 45ad4061..12b51479 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.6.3 \ No newline at end of file +3.7.6.4 \ No newline at end of file diff --git a/pilot.py b/pilot.py index ebd356b3..c4b24795 100755 --- a/pilot.py +++ b/pilot.py @@ -121,19 +121,9 @@ def main() -> int: ) # note: assuming IPv6, fallback in place # check cvmfs if available - is_available = is_cvmfs_available() - if is_available is None: - pass # ignore this case - elif is_available is True: - timestamp = get_last_update() - if timestamp and timestamp > 0: - logger.info('CVMFS has been validated') - else: - logger.warning('CVMFS is not responding - aborting pilot') - return errors.CVMFSISNOTALIVE - else: - logger.warning('CVMFS is not alive - aborting pilot') - return errors.CVMFSISNOTALIVE + ec = check_cvmfs(logger) + if ec: + return ec if not args.rucio_host: args.rucio_host = config.Rucio.host @@ -202,6 +192,35 @@ def main() -> int: return exitcode +def check_cvmfs(logger: Any) -> int: + """ + Check if cvmfs is available. + + :param logger: logging object. + :return: exit code (int). + """ + # skip all tests if required + if os.environ.get("NO_CVMFS_OK", False): + logger.info("skipping cvmfs checks") + return 0 + + is_available = is_cvmfs_available() + if is_available is None: + pass # ignore this case + elif is_available is True: + timestamp = get_last_update() + if timestamp and timestamp > 0: + logger.info('CVMFS has been validated') + else: + logger.warning('CVMFS is not responding - aborting pilot') + return errors.CVMFSISNOTALIVE + else: + logger.warning('CVMFS is not alive - aborting pilot') + return errors.CVMFSISNOTALIVE + + return 0 + + def str2bool(var: str) -> bool: """ Convert string to bool. 
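A minimal, standalone sketch of how the NO_CVMFS_OK short-circuit introduced in check_cvmfs() above is expected to behave. The is_cvmfs_available()/get_last_update() stubs and the CVMFSISNOTALIVE value below are illustrative stand-ins for the pilot's real helpers (they are assumptions, not the actual imports used by pilot.py):

import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

CVMFSISNOTALIVE = 1  # illustrative stand-in for errors.CVMFSISNOTALIVE


def is_cvmfs_available():
    """Stub for the pilot's real CVMFS availability probe (assumption)."""
    return True


def get_last_update():
    """Stub for the pilot's real last-update probe; a positive timestamp means alive."""
    return 1715241600


def check_cvmfs(logger):
    """Mirror the short-circuit logic added in this patch."""
    # skip all tests if required
    if os.environ.get("NO_CVMFS_OK", False):
        logger.info("skipping cvmfs checks")
        return 0

    is_available = is_cvmfs_available()
    if is_available is None:
        pass  # ignore this case
    elif is_available is True:
        timestamp = get_last_update()
        if timestamp and timestamp > 0:
            logger.info("CVMFS has been validated")
        else:
            return CVMFSISNOTALIVE
    else:
        return CVMFSISNOTALIVE

    return 0


os.environ["NO_CVMFS_OK"] = "1"
assert check_cvmfs(logger) == 0  # the override skips all probes

One caveat worth noting in review: os.environ.get() returns the raw string, so NO_CVMFS_OK="0" (or any other non-empty value) also disables the checks; only an unset or empty variable keeps them active.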
diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 21679fd3..685879fe 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '3' # build number should be reset to '1' for every new development cycle +BUILD = '4' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From c8da8521a7de8e7a9fbd46d6185380d639f95dfc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 14 May 2024 13:09:51 +0200 Subject: [PATCH 06/41] pylint changes --- doc/components/api/benchmark.rst | 19 ----------- doc/components/api/index.rst | 3 +- pilot/api/analytics.py | 6 +--- pilot/api/benchmark.py | 39 ---------------------- pilot/api/data.py | 25 ++++++++------ pilot/api/services.py | 6 ++-- pilot/user/atlas/common.py | 6 +--- pilot/user/atlas/utilities.py | 56 ++++++++++++-------------------- 8 files changed, 41 insertions(+), 119 deletions(-) delete mode 100644 doc/components/api/benchmark.rst delete mode 100644 pilot/api/benchmark.py diff --git a/doc/components/api/benchmark.rst b/doc/components/api/benchmark.rst deleted file mode 100644 index e96256a6..00000000 --- a/doc/components/api/benchmark.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. - Pilot 2 pilot.api.benchmark doc file - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Paul Nilsson, paul.nilsson@cern.ch, 2018 - -benchmark -========= - -.. automodule:: pilot.api.benchmark - :members: - :private-members: - :special-members: - :undoc-members: diff --git a/doc/components/api/index.rst b/doc/components/api/index.rst index 6812f893..6851eab5 100644 --- a/doc/components/api/index.rst +++ b/doc/components/api/index.rst @@ -7,7 +7,7 @@ http://www.apache.org/licenses/LICENSE-2.0 Authors: - - Paul Nilsson, paul.nilsson@cern.ch, 2017-2018 + - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 api components ============== @@ -16,7 +16,6 @@ api components :maxdepth: 2 analytics - benchmark data memorymonitor services diff --git a/pilot/api/analytics.py b/pilot/api/analytics.py index 34ca9ef6..5561b268 100644 --- a/pilot/api/analytics.py +++ b/pilot/api/analytics.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Functions for performing analytics including fitting of data.""" @@ -379,8 +379,6 @@ def chi2(self): def set_slope(self): """ Calculate and set the slope of the linear fit. - - :return: """ if self._ss2 and self._ss and self._ss != 0: self._slope = self._ss2 / float(self._ss) @@ -398,8 +396,6 @@ def slope(self): def set_intersect(self): """ Calculate and set the intersect of the linear fit. - - :return: """ if self._ym and self._slope and self._xm: self._intersect = self._ym - self._slope * self._xm diff --git a/pilot/api/benchmark.py b/pilot/api/benchmark.py deleted file mode 100644 index cd55e68f..00000000 --- a/pilot/api/benchmark.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 - -"""Benchmark prototype.""" - -from .services import Services - -import logging -logger = logging.getLogger(__name__) - - -class Benchmark(Services): - """Benchmark service class.""" - - def __init__(self, *args): - """ - Init function. - - :param args: - """ - pass diff --git a/pilot/api/data.py b/pilot/api/data.py index b62df256..a0b94f04 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -18,7 +18,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018-2019 @@ -29,7 +29,7 @@ import logging import time from functools import reduce - +from typing import Any try: import requests except ImportError: @@ -56,7 +56,7 @@ from pilot.util.tracereport import TraceReport -class StagingClient(object): +class StagingClient: """Base Staging Client.""" ipv = "IPv6" @@ -80,18 +80,23 @@ class StagingClient(object): # list of allowed schemas to be used for transfers from REMOTE sites remoteinput_allowed_schemas = ['root', 'gsiftp', 'dcap', 'srm', 'storm', 'https'] - def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_copytools='rucio', trace_report=None, ipv='IPv6', workdir=''): + def __init__(self, infosys_instance: Any = None, acopytools: dict = None, logger: Any = None, + default_copytools: str = 'rucio', trace_report: dict = None, ipv: str = 'IPv6', workdir: str = ""): """ Set default/init values. - If `acopytools` is not specified then it will be automatically resolved via infosys. In this case `infosys` requires initialization. + If `acopytools` is not specified then it will be automatically resolved via infosys. In this case `infosys` + requires initialization. + :param infosys_instance: infosys instance to be used for data resolution (Any) :param acopytools: copytool names per activity to be used for transfers. Accepts also list of names or string value without activity passed (dict) - :param logger: logging.Logger object to use for logging (None means no logging) + :param logger: logging.Logger object to use for logging (None means no logging) (Any) :param default_copytools: copytool name(s) to be used in case of unknown activity passed. Accepts either list of names or single string value (str) - :param ipv: internet protocol version (str). + :param trace_report: trace report object (dict) + :param ipv: internet protocol version (str) + :param workdir: working directory (str). 
""" - super(StagingClient, self).__init__() + super().__init__() if not logger: logger = logging.getLogger(__name__ + '.null') @@ -127,12 +132,12 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ raise PilotException("failed to resolve acopytools settings") logger.info('configured copytools per activity: acopytools=%s', self.acopytools) - def allow_mvfinaldest(self, catchall): + def allow_mvfinaldest(self, catchall: str): """ Check if there is an override in catchall to allow mv to final destination. :param catchall: catchall from queuedata (str) - :return: True if 'mv_final_destination' is present in catchall, otherwise False (bool) + :return: True if 'mv_final_destination' is present in catchall, otherwise False (bool). """ return True if catchall and 'mv_final_destination' in catchall else False diff --git a/pilot/api/services.py b/pilot/api/services.py index 3e7e08d1..1aa352fd 100644 --- a/pilot/api/services.py +++ b/pilot/api/services.py @@ -17,9 +17,9 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 -"""High-level base class for Benchmark(), MemoryMonitoring() and Analytics() classes.""" +"""High-level base class for MemoryMonitoring and Analytics classes.""" # from pilot.util.container import execute @@ -28,4 +28,4 @@ class Services: - """High-level base class for Benchmark(), MemoryMonitoring() and Analytics() classes.""" + """High-level base class for MemoryMonitoring( and Analytics classes.""" diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 2bd8a56e..8f38d37c 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2018 """Common functions for ATLAS.""" @@ -101,7 +101,6 @@ post_memory_monitor_action, get_memory_monitor_summary_filename, get_prefetcher_setup, - get_benchmark_setup, get_memory_monitor_output_filename, get_metadata_dict_from_txt, ) @@ -2481,9 +2480,6 @@ def get_utility_command_setup(name: str, job: Any, setup: str = None) -> str: if name == 'Prefetcher': return get_prefetcher_setup(job) - if name == 'Benchmark': - return get_benchmark_setup(job) - return "" diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index 41cb3e20..ba47ba9d 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -17,11 +17,15 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 +"""Functions related to memory monitoring and other utilities.""" + +import logging import os import time from re import search +from typing import Any # from pilot.info import infosys from .setup import get_asetup @@ -31,47 +35,33 @@ from pilot.util.processes import is_process_running from pilot.util.psutils import get_command_by_pid -import logging logger = logging.getLogger(__name__) -def get_benchmark_setup(job): - """ - Return the proper setup for the benchmark command. - - :param job: job object. - :return: setup string for the benchmark command. - """ - - return '' - - -def get_prefetcher_setup(job): +def get_prefetcher_setup(job: Any) -> str: """ Return the proper setup for the Prefetcher. + Prefetcher is a tool used with the Event Streaming Service. - :param job: job object. - :return: setup string for the Prefetcher command. 
+ :param job: job object (Any) + :return: setup string for the Prefetcher command (str). """ - - # add code here .. - - return '' + return "" -def get_network_monitor_setup(setup, job): +def get_network_monitor_setup(setup: str, job: Any) -> str: """ Return the proper setup for the network monitor. + The network monitor is currently setup together with the payload and is start before it. The payload setup should therefore be provided. The network monitor setup is prepended to it. - :param setup: payload setup string. - :param job: job object. - :return: network monitor setup string. + :param setup: payload setup string (str) + :param job: job object (Any) + :return: network monitor setup string (str). """ - - return '' + return "" def get_memory_monitor_summary_filename(selector=None): @@ -811,14 +801,12 @@ def get_memory_values(workdir, name=""): return summary_dictionary -def post_memory_monitor_action(job): +def post_memory_monitor_action(job: Any): """ Perform post action items for memory monitor. - :param job: job object. - :return: + :param job: job object (Any). """ - nap = 3 path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename()) path2 = os.environ.get('PILOT_HOME') @@ -841,10 +829,7 @@ def post_memory_monitor_action(job): def precleanup(): """ Pre-cleanup at the beginning of the job to remove any pre-existing files from previous jobs in the main work dir. - - :return: """ - logger.debug('performing pre-cleanup of potentially pre-existing files from earlier job in main work dir') path = os.path.join(os.environ.get('PILOT_HOME'), get_memory_monitor_summary_filename()) if os.path.exists(path): @@ -852,16 +837,15 @@ def precleanup(): remove(path) -def get_cpu_arch(): +def get_cpu_arch() -> str: """ Return the CPU architecture string. The CPU architecture string is determined by a script (cpu_arch.py), run by the pilot but setup with lsetup. For details about this script, see: https://its.cern.ch/jira/browse/ATLINFR-4844 - :return: CPU arch (string). + :return: CPU arch (str). """ - cpu_arch = '' def filter_output(stdout): From 21e5bc3faadb2a9dff3b57848f62862cf2268cc7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 14 May 2024 13:11:28 +0200 Subject: [PATCH 07/41] pylint changes --- pilot/api/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index a0b94f04..86b401c1 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -149,15 +149,16 @@ def set_acopytools(self): self.acopytools = dict(default=list((self.infosys.queuedata.copytools or {}).keys())) @staticmethod - def get_default_copytools(default_copytools): + def get_default_copytools(default_copytools: str): """ - Get the default copytools. + Get the default copytools as a list. :param default_copytools: default copytools (str) :return: default copytools (str). 
""" if isinstance(default_copytools, str): default_copytools = [default_copytools] if default_copytools else [] + return default_copytools @classmethod From 8394e28060ac7664b670afd808d588454b4a1a89 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 14 May 2024 13:14:58 +0200 Subject: [PATCH 08/41] Changed annoying log message from info to debug --- pilot/control/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 7a42206d..7641dc65 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -3152,7 +3152,7 @@ def send_heartbeat_if_time(job: Any, args: Any, update_time: float) -> int: send_state(job, args, 'running') update_time = time.time() else: - logger.info('will not send any job update') + logger.debug('will not send any job update') return int(update_time) From 9c695c06ef63a98101d5523dc96cc76c29945ca5 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 14 May 2024 13:41:05 +0200 Subject: [PATCH 09/41] Removed running state requirement leading to lost heartbeat --- pilot/control/job.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 7641dc65..483da499 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -3147,9 +3147,9 @@ def send_heartbeat_if_time(job: Any, args: Any, update_time: float) -> int: # check for state==running here, and send explicit 'running' in send_state, rather than sending job.state # since the job state can actually change in the meantime by another thread # job.completed will anyway be checked in https::send_update() - if job.serverstate != 'finished' and job.serverstate != 'failed' and job.state == 'running': - logger.info('will send heartbeat for job in \'running\' state') - send_state(job, args, 'running') + if job.serverstate != 'finished' and job.serverstate != 'failed': + logger.info(f'will send heartbeat for job in \'{job.state}\' state') + send_state(job, args, job.state) update_time = time.time() else: logger.debug('will not send any job update') From c0053a51ea221ad8b9009abd142abd0e837489b1 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 14 May 2024 17:44:23 +0200 Subject: [PATCH 10/41] Pylint changes --- pilot/api/data.py | 113 ++++++++++++++++++++++------------------------ 1 file changed, 54 insertions(+), 59 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 86b401c1..264a6c10 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -139,14 +139,14 @@ def allow_mvfinaldest(self, catchall: str): :param catchall: catchall from queuedata (str) :return: True if 'mv_final_destination' is present in catchall, otherwise False (bool). 
""" - return True if catchall and 'mv_final_destination' in catchall else False + return catchall and 'mv_final_destination' in catchall def set_acopytools(self): """Set the internal acopytools.""" if not self.acopytools: # resolve from queuedata.acopytools using infosys self.acopytools = (self.infosys.queuedata.acopytools or {}).copy() if not self.acopytools: # resolve from queuedata.copytools using infosys - self.acopytools = dict(default=list((self.infosys.queuedata.copytools or {}).keys())) + self.acopytools = {"default": list((self.infosys.queuedata.copytools or {}).keys())} @staticmethod def get_default_copytools(default_copytools: str): @@ -162,51 +162,44 @@ def get_default_copytools(default_copytools: str): return default_copytools @classmethod - def get_preferred_replica(self, replicas, allowed_schemas): + def get_preferred_replica(cls, replicas: list, allowed_schemas: list) -> Any: """ Get preferred replica from the `replicas` list suitable for `allowed_schemas`. - :return: first matched replica or None if not found. + :param replicas: list of replicas (list) + :param allowed_schemas: list of allowed schemas (list) + :return: first matched replica or None if not found (Any). """ for replica in replicas: pfn = replica.get('pfn') for schema in allowed_schemas: if pfn and (not schema or pfn.startswith(f'{schema}://')): return replica + return None - def prepare_sources(self, files, activities=None): + def prepare_sources(self, files: list, activities: Any = None): """ Prepare sources. - Customize/prepare source data for each entry in `files` optionally checking data for requested `activities` - (custom StageClient could extend the logic if need) + Customize/prepare source data for each entry in `files` optionally checking data for requested `activities`. + (custom StageClient could extend the logic if needed). - :param files: list of `FileSpec` objects to be processed - :param activities: string or ordered list of activities to resolve `astorages` (optional) - :return: None. + :param files: list of `FileSpec` objects to be processed (list) + :param activities: string or ordered list of activities to resolve `astorages` (optional) (Any) + :return: (None) """ return None - def prepare_inputddms(self, files, activities=None): + def prepare_inputddms(self, files: list): """ Prepare input DDMs. - Populates filespec.inputddms for each entry from `files` list + Populates filespec.inputddms for each entry from `files` list. :param files: list of `FileSpec` objects - :param activities: sting or ordered list of activities to resolve astorages (optional). """ - activities = activities or 'read_lan' - if isinstance(activities, str): - activities = [activities] - astorages = self.infosys.queuedata.astorages if self.infosys and self.infosys.queuedata else {} - - storages = [] - for a in activities: - storages = astorages.get(a, []) - if storages: - break + storages = astorages.get('read_lan', []) #activity = activities[0] #if not storages: ## ignore empty astorages @@ -219,7 +212,7 @@ def prepare_inputddms(self, files, activities=None): if not fdat.inputddms and fdat.ddmendpoint: fdat.inputddms = [fdat.ddmendpoint] - def print_replicas(self, replicas, label='unsorted'): + def print_replicas(self, replicas: list, label: str = 'unsorted'): """ Print replicas. @@ -239,15 +232,15 @@ def print_replicas(self, replicas, label='unsorted'): break @classmethod - def sort_replicas(self, replicas, inputddms): + def sort_replicas(self, replicas: list, inputddms: list) -> list: """ Sort input replicas. 
Consider first affected replicas from inputddms. - :param replicas: Prioritized list of replicas [(pfn, dat)] - :param inputddms: preferred list of ddmebdpoint - :return: sorted `replicas`. + :param replicas: Prioritized list of replicas [(pfn, dat)] (list) + :param inputddms: preferred list of ddmendpoint (list) + :return: sorted list of `replicas` (list). """ if not inputddms: return replicas @@ -269,21 +262,22 @@ def sort_replicas(self, replicas, inputddms): return xreplicas - def resolve_replicas(self, files, use_vp=False): + def resolve_replicas(self, files: list, use_vp: bool = False) -> list: """ Populate filespec.replicas for each entry from `files` list. fdat.replicas = [{'ddmendpoint':'ddmendpoint', 'pfn':'replica', 'domain':'domain value'}] - :param files: list of `FileSpec` objects - :param use_vp: True for VP jobs (boolean) - :return: files object. + :param files: list of `FileSpec` objects (list) + :param use_vp: True for VP jobs (bool) + :raise: Exception in case of list_replicas() failure + :return: list of files (list). """ logger = self.logger xfiles = [] for fdat in files: - # skip fdat if need for further workflow (e.g. to properly handle OS ddms) + # skip fdat if needed (e.g. to properly handle OS ddms) xfiles.append(fdat) if not xfiles: # no files for replica look-up @@ -336,14 +330,15 @@ def resolve_replicas(self, files, use_vp=False): return files - def list_replicas(self, xfiles, use_vp): + def list_replicas(self, xfiles: list, use_vp: bool) -> list: """ List Rucio replicas. Wrapper around rucio_client.list_replicas() - :param xfiles: files object + :param xfiles: list of files objects (list) :param use_vp: True for VP jobs (bool) + :raise: PilotException in case of list_replicas() failure :return: replicas (list). """ # load replicas from Rucio @@ -380,13 +375,13 @@ def list_replicas(self, xfiles, use_vp): return replicas - def add_replicas(self, fdat, replica): + def add_replicas(self, fdat: Any, replica: Any) -> Any: """ Add the replicas to the fdat structure. - :param fdat: fdat object - :param replica: replica object - :return: updated fdat object. + :param fdat: fdat object (Any) + :param replica: replica object (Any) + :return: updated fdat object (Any). """ fdat.replicas = [] # reset replicas list @@ -589,14 +584,14 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 if remain_files: # failed or incomplete transfer # propagate message from first error back up - errmsg = str(caught_errors[0]) if caught_errors else '' + # errmsg = str(caught_errors[0]) if caught_errors else '' if caught_errors and "Cannot authenticate" in str(caught_errors): code = ErrorCodes.STAGEINAUTHENTICATIONFAILURE elif caught_errors and "bad queue configuration" in str(caught_errors): code = ErrorCodes.BADQUEUECONFIGURATION elif caught_errors and isinstance(caught_errors[0], PilotException): code = caught_errors[0].get_error_code() - errmsg = caught_errors[0].get_last_error() + # errmsg = caught_errors[0].get_last_error() elif caught_errors and isinstance(caught_errors[0], TimeoutException): code = ErrorCodes.STAGEINTIMEOUT if self.mode == 'stage-in' else ErrorCodes.STAGEOUTTIMEOUT # is it stage-in/out? self.logger.warning('caught time-out exception: %s', caught_errors[0]) else: code = ErrorCodes.STAGEINFAILED if self.mode == 'stage-in' else ErrorCodes.STAGEOUTFAILED # is it stage-in/out? details = str(caught_errors) + ":" + f'failed to transfer files using copytools={copytools}' @@ -686,15 +681,15 @@ def resolve_protocols(self, files): return files @classmethod - def resolve_protocol(self, fspec, allowed_schemas=None): + def resolve_protocol(cls, fspec: Any, allowed_schemas: Any = None) -> list: """ Resolve protocol. 
Resolve protocol according to allowed schema - :param fspec: `FileSpec` instance - :param allowed_schemas: list of allowed schemas or any if None - :return: list of dict(endpoint, path, flavour). + :param fspec: `FileSpec` instance (list) + :param allowed_schemas: list of allowed schemas or any if None (Any) + :return: list of dict(endpoint, path, flavour) (list). """ if not fspec.protocols: return [] @@ -715,7 +710,7 @@ class StageInClient(StagingClient): mode = "stage-in" - def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, domain=None): + def resolve_replica(self, fspec: Any, primary_schemas: Any = None, allowed_schemas: Any = None, domain: Any = None) -> dict or None: """ Resolve replica. @@ -724,9 +719,9 @@ def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, dom Primary schemas ignore replica priority (used to resolve direct access replica, which could be not with top priority set). - :param fspec: input `FileSpec` objects - :param allowed_schemas: list of allowed schemas or any if None - :return: dict(surl, ddmendpoint, pfn, domain) or None if replica not found. + :param fspec: input `FileSpec` objects (Any) + :param allowed_schemas: list of allowed schemas or any if None (Any) + :return: dict(surl, ddmendpoint, pfn, domain) or None if replica not found (dict or None). """ if not fspec.replicas: self.logger.warning('resolve_replica() received no fspec.replicas') @@ -769,12 +764,12 @@ def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, dom return {'surl': surl['pfn'], 'ddmendpoint': replica['ddmendpoint'], 'pfn': replica['pfn'], 'domain': replica['domain']} - def get_direct_access_variables(self, job): + def get_direct_access_variables(self, job: Any) -> (bool, str): """ Return the direct access settings for the PQ. - :param job: job object. - :return: allow_direct_access (bool), direct_access_type (string). + :param job: job object (Any) + :return: allow_direct_access (bool), direct_access_type (str). """ allow_direct_access, direct_access_type = False, '' if self.infosys.queuedata: # infosys is initialized @@ -1054,8 +1049,9 @@ def prepare_destinations(self, files, activities): if 'mv' in self.infosys.queuedata.copytools: return files else: - raise PilotException("Failed to resolve destination: no associated storages defined for activity=%s (%s)" - % (activity, ','.join(activities)), code=ErrorCodes.NOSTORAGE, state='NO_ASTORAGES_DEFINED') + act = ','.join(activities) + raise PilotException(f"Failed to resolve destination: no associated storages defined for activity={activity} ({act})", + code=ErrorCodes.NOSTORAGE, state='NO_ASTORAGES_DEFINED') # take the fist choice for now, extend the logic later if need ddm = storages[0] @@ -1077,7 +1073,7 @@ def prepare_destinations(self, files, activities): return files @classmethod - def get_path(self, scope, lfn, prefix='rucio'): + def get_path(cls, scope: str, lfn: str) -> str: """ Construct a partial Rucio PFN using the scope and the LFN. @@ -1085,12 +1081,11 @@ def get_path(self, scope, lfn, prefix='rucio'): :param scope: replica scope (str) :param lfn: repliva LFN (str) - :param prefix: prefix (str). + :return: constructed path (str). 
""" s = f'{scope}:{lfn}' hash_hex = hashlib.md5(s.encode('utf-8')).hexdigest() - #paths = [prefix] + scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] # exclude prefix from the path: this should be properly considered in protocol/AGIS for today paths = scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] paths = [_f for _f in paths if _f] # remove empty parts to avoid double /-chars @@ -1119,8 +1114,8 @@ def resolve_surl(self, fspec, protocol, ddmconf, **kwargs): # path = protocol.get('path', '').rstrip('/') # if not (ddm.is_deterministic or (path and path.endswith('/rucio'))): if not ddm.is_deterministic: - raise PilotException('resolve_surl(): Failed to construct SURL for non deterministic ddm=%s: ' - 'NOT IMPLEMENTED' % fspec.ddmendpoint, code=ErrorCodes.NONDETERMINISTICDDM) + raise PilotException(f'resolve_surl(): Failed to construct SURL for non deterministic ' + f'ddm={fspec.ddmendpoint}: NOT IMPLEMENTED', code=ErrorCodes.NONDETERMINISTICDDM) surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), self.get_path(fspec.scope, fspec.lfn)) return {'surl': surl} From 48f089a64edad82f0a29b49e0673be0c1e8ea421 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 14 May 2024 18:09:37 +0200 Subject: [PATCH 11/41] Pylint changes --- pilot/api/data.py | 54 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 264a6c10..297335be 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -232,7 +232,7 @@ def print_replicas(self, replicas: list, label: str = 'unsorted'): break @classmethod - def sort_replicas(self, replicas: list, inputddms: list) -> list: + def sort_replicas(cls, replicas: list, inputddms: list) -> list: """ Sort input replicas. 
@@ -311,7 +311,7 @@ def resolve_replicas(self, files: list, use_vp: bool = False) -> list: fdat.filesize = replica['bytes'] logger.warning("Filesize value for input file=%s is not defined, assigning info from Rucio replica: filesize=%s", fdat.lfn, replica['bytes']) - for ctype in ['adler32', 'md5']: + for ctype in ('adler32', 'md5'): if fdat.checksum.get(ctype) != replica[ctype] and replica[ctype]: logger.warning("Checksum value of input file=%s mismatched with info got from Rucio replica: checksum=%s, replica.checksum=%s, fdat=%s", fdat.lfn, fdat.checksum, replica[ctype], fdat) @@ -325,8 +325,7 @@ def resolve_replicas(self, files: list, use_vp: bool = False) -> list: self.trace_report.update(clientState="DONE") logger.info('Number of resolved replicas:\n' + - '\n'.join(["lfn=%s: replicas=%s, is_directaccess=%s" - % (f.lfn, len(f.replicas or []), f.is_directaccess(ensure_replica=False)) for f in files])) + '\n'.join([f"lfn={f.lfn}: nr replicas={len(f.replicas or [])}, is_directaccess={f.is_directaccess(ensure_replica=False)}" for f in files])) return files @@ -351,7 +350,7 @@ def list_replicas(self, xfiles: list, use_vp: bool) -> list: query = { 'schemes': ['srm', 'root', 'davs', 'gsiftp', 'https', 'storm', 'file'], - 'dids': [dict(scope=e.scope, name=e.lfn) for e in xfiles], + 'dids': [{"scope": e.scope, "name": e.lfn} for e in xfiles], } query.update(sort='geoip', client_location=location) # reset the schemas for VP jobs @@ -368,7 +367,7 @@ def list_replicas(self, xfiles: list, use_vp: bool) -> list: try: replicas = rucio_client.list_replicas(**query) except Exception as exc: - raise PilotException(f"Failed to get replicas from Rucio: {exc}", code=ErrorCodes.RUCIOLISTREPLICASFAILED) + raise PilotException(f"Failed to get replicas from Rucio: {exc}", code=ErrorCodes.RUCIOLISTREPLICASFAILED) from exc replicas = list(replicas) self.logger.debug(f"replicas received from Rucio: {replicas}") @@ -473,7 +472,7 @@ def detect_client_location(self, use_vp: bool = False) -> dict: self.logger.debug(f'will use client_location={client_location}') return client_location, diagnostics - def transfer_files(self, copytool, files, **kwargs): + def transfer_files(self, copytool, files, activity, **kwargs): """ Transfer the files. @@ -482,6 +481,7 @@ def transfer_files(self, copytool, files, **kwargs): :param copytool: copytool module :param files: list of `FileSpec` objects + :param activity: list of activity names used to determine appropriate copytool (prioritized list) :param kwargs: extra kwargs to be passed to copytool transfer handler :raise: PilotException in case of controlled error. 
""" @@ -513,8 +513,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 break if not copytools: - raise PilotException('failed to resolve copytool by preferred activities=%s, acopytools=%s' % - (activity, self.acopytools)) + raise PilotException(f'failed to resolve copytool by preferred activities={activity}, acopytools={self.acopytools}') # populate inputddms if needed self.prepare_inputddms(files) @@ -548,30 +547,30 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 code=ErrorCodes.UNKNOWNCOPYTOOL) module = self.copytool_modules[name]['module_name'] - self.logger.info('trying to use copytool=%s for activity=%s', name, activity) + self.logger.info(f'trying to use copytool={name} for activity={activity}') copytool = __import__(f'pilot.copytool.{module}', globals(), locals(), [module], 0) #self.trace_report.update(protocol=name) except PilotException as exc: caught_errors.append(exc) - self.logger.debug('error: %s', exc) + self.logger.debug(f'error: {exc}') continue except Exception as exc: - self.logger.warning('failed to import copytool module=%s, error=%s', module, exc) + self.logger.warning(f'failed to import copytool module={module}, error={exc}') continue try: result = self.transfer_files(copytool, remain_files, activity, **kwargs) - self.logger.debug('transfer_files() using copytool=%s completed with result=%s', copytool, str(result)) + self.logger.debug(f'transfer_files() using copytool={copytool} completed with result={result}') break except PilotException as exc: - self.logger.warning('failed to transfer_files() using copytool=%s .. skipped; error=%s', copytool, exc) + self.logger.warning(f'failed to transfer_files() using copytool={copytool} .. skipped; error={exc}') caught_errors.append(exc) except TimeoutException as exc: - self.logger.warning('function timed out: %s', exc) + self.logger.warning(f'function timed out: {exc}') caught_errors.append(exc) except Exception as exc: - self.logger.warning('failed to transfer files using copytool=%s .. skipped; error=%s', copytool, exc) + self.logger.warning(f'failed to transfer files using copytool={copytool} .. skipped; error={exc}') caught_errors.append(exc) import traceback self.logger.error(traceback.format_exc()) @@ -594,7 +593,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 # errmsg = caught_errors[0].get_last_error() elif caught_errors and isinstance(caught_errors[0], TimeoutException): code = ErrorCodes.STAGEINTIMEOUT if self.mode == 'stage-in' else ErrorCodes.STAGEOUTTIMEOUT # is it stage-in/out? - self.logger.warning('caught time-out exception: %s', caught_errors[0]) + self.logger.warning(f'caught time-out exception: {caught_errors[0]}') else: code = ErrorCodes.STAGEINFAILED if self.mode == 'stage-in' else ErrorCodes.STAGEOUTFAILED # is it stage-in/out? details = str(caught_errors) + ":" + f'failed to transfer files using copytools={copytools}' @@ -793,6 +792,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 :param copytool: copytool module :param files: list of `FileSpec` objects + :param activity: list of activity names used to determine appropriate copytool (prioritized list) :param kwargs: extra kwargs to be passed to copytool transfer handler :return: list of processed `FileSpec` objects @@ -978,18 +978,17 @@ def check_availablespace(self, files): :raise: PilotException in case of not enough space or total input size too large. 
""" for f in files: - self.logger.debug('lfn=%s filesize=%d accessmode=%s', f.lfn, f.filesize, f.accessmode) + self.logger.debug(f'lfn={f.lfn} filesize={f.filesize} accessmode={f.accessmode}') maxinputsize = convert_mb_to_b(get_maximum_input_sizes()) totalsize = reduce(lambda x, y: x + y.filesize, files, 0) # verify total filesize if maxinputsize and totalsize > maxinputsize: - error = "too many/too large input files (%s). total file size=%s B > maxinputsize=%s B" % \ - (len(files), totalsize, maxinputsize) + error = f"too many/too large input files ({len(files)}). total file size={totalsize} B > maxinputsize={maxinputsize} B" raise SizeTooLarge(error) - self.logger.info("total input file size=%s B within allowed limit=%s B (zero value means unlimited)", totalsize, maxinputsize) + self.logger.info(f"total input file size={totalsize} B within allowed limit={maxinputsize} B (zero value means unlimited)") # get available space try: @@ -1000,12 +999,11 @@ def check_availablespace(self, files): else: if disk_space: available_space = convert_mb_to_b(disk_space) - self.logger.info("locally available space: %d B", available_space) + self.logger.info("locally available space: {available_space} B") # are we within the limit? if totalsize > available_space: - error = "not enough local space for staging input files and run the job (need %d B, but only have %d B)" % \ - (totalsize, available_space) + error = f"not enough local space for staging input files and run the job (need {totalsize} B, but only have {available_space} B)" raise NoLocalSpace(error) else: self.logger.warning('get_local_disk_space() returned None') @@ -1048,10 +1046,10 @@ def prepare_destinations(self, files, activities): if not storages: if 'mv' in self.infosys.queuedata.copytools: return files - else: - act = ','.join(activities) - raise PilotException(f"Failed to resolve destination: no associated storages defined for activity={activity} ({act})", - code=ErrorCodes.NOSTORAGE, state='NO_ASTORAGES_DEFINED') + + act = ','.join(activities) + raise PilotException(f"Failed to resolve destination: no associated storages defined for activity={activity} ({act})", + code=ErrorCodes.NOSTORAGE, state='NO_ASTORAGES_DEFINED') # take the fist choice for now, extend the logic later if need ddm = storages[0] From 5597a42785863271319b30e6cac9bff4bc180ec6 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 15 May 2024 10:28:44 +0200 Subject: [PATCH 12/41] Pylint updates --- pilot/api/data.py | 50 +++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 297335be..7631b3e4 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -162,13 +162,13 @@ def get_default_copytools(default_copytools: str): return default_copytools @classmethod - def get_preferred_replica(cls, replicas: list, allowed_schemas: list) -> Any: + def get_preferred_replica(cls, replicas: list, allowed_schemas: list) -> Any or None: """ Get preferred replica from the `replicas` list suitable for `allowed_schemas`. :param replicas: list of replicas (list) :param allowed_schemas: list of allowed schemas (list) - :return: first matched replica or None if not found (Any). + :return: first matched replica or None if not found (Any or None). 
""" for replica in replicas: pfn = replica.get('pfn') @@ -472,32 +472,32 @@ def detect_client_location(self, use_vp: bool = False) -> dict: self.logger.debug(f'will use client_location={client_location}') return client_location, diagnostics - def transfer_files(self, copytool, files, activity, **kwargs): + def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: dict): """ Transfer the files. Apply transfer of given `files` using passed `copytool` module. Should be implemented by custom Staging Client. - :param copytool: copytool module - :param files: list of `FileSpec` objects - :param activity: list of activity names used to determine appropriate copytool (prioritized list) - :param kwargs: extra kwargs to be passed to copytool transfer handler + :param copytool: copytool module (Any) + :param files: list of `FileSpec` objects (list) + :param activity: list of activity names used to determine appropriate copytool (list) + :param kwargs: extra kwargs to be passed to copytool transfer handler (dict) :raise: PilotException in case of controlled error. """ raise NotImplementedError() - def transfer(self, files, activity='default', **kwargs): # noqa: C901 + def transfer(self, files: list, activity: list or str = 'default', **kwargs: dict) -> list: # noqa: C901 """ Perform file transfer. Automatically stage passed files using copy tools related to given `activity`. - :param files: list of `FileSpec` objects - :param activity: list of activity names used to determine appropriate copytool (prioritized list) - :param kwargs: extra kwargs to be passed to copytool transfer handler + :param files: list of `FileSpec` objects (list) + :param activity: list of activity names used to determine appropriate copytool (list or str) + :param kwargs: extra kwargs to be passed to copytool transfer handler (dict) :raise: PilotException in case of controlled error - :return: list of processed `FileSpec` objects. + :return: list of processed `FileSpec` objects (list). """ self.trace_report.update(relativeStart=time.time(), transferStart=time.time()) @@ -602,14 +602,14 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 return files - def require_protocols(self, files, copytool, activity, local_dir=''): + def require_protocols(self, files: list, copytool: Any, activity: list or str, local_dir: str = ''): """ Require protocols. Populates fspec.protocols and fspec.turl for each entry in `files` according to preferred fspec.ddm_activity - :param files: list of `FileSpec` objects - :param activity: str or ordered list of transfer activity names to resolve acopytools related data. + :param files: list of `FileSpec` objects (list) + :param activity: str or ordered list of transfer activity names to resolve acopytools related data (list or str). """ allowed_schemas = getattr(copytool, 'allowed_schemas', None) @@ -632,13 +632,13 @@ def require_protocols(self, files, copytool, activity, local_dir=''): protocols = self.resolve_protocol(fspec, allowed_schemas) if not protocols and 'mv' not in self.infosys.queuedata.copytools: # no protocols found error = f'Failed to resolve protocol for file={fspec.lfn}, allowed_schemas={allowed_schemas}, fspec={fspec}' - self.logger.error("resolve_protocol: %s", error) + self.logger.error(f"resolve_protocol: {error}") raise PilotException(error, code=ErrorCodes.NOSTORAGEPROTOCOL) # take first available protocol for copytool: FIX ME LATER if need (do iterate over all allowed protocols?) 
protocol = protocols[0] - self.logger.info("Resolved protocol to be used for transfer: \'%s\': lfn=\'%s\'", protocol, fspec.lfn) + self.logger.info(f"Resolved protocol to be used for transfer: \'{protocol}\': lfn=\'{fspec.lfn}\'") resolve_surl = getattr(copytool, 'resolve_surl', None) if not callable(resolve_surl): @@ -651,14 +651,16 @@ def require_protocols(self, files, copytool, activity, local_dir=''): if r.get('ddmendpoint'): fspec.ddmendpoint = r['ddmendpoint'] - def resolve_protocols(self, files): + def resolve_protocols(self, files: list) -> list: """ Resolve protocols. Populates filespec.protocols for each entry from `files` according to preferred `fspec.ddm_activity` value - :param files: list of `FileSpec` objects + fdat.protocols = [dict(endpoint, path, flavour), ..] - :return: `files` object. + + :param files: list of `FileSpec` objects (list) + :return: list of `files` object (list). """ ddmconf = self.infosys.resolve_storage_data() @@ -666,7 +668,7 @@ def resolve_protocols(self, files): ddm = ddmconf.get(fdat.ddmendpoint) if not ddm: error = f'Failed to resolve output ddmendpoint by name={fdat.ddmendpoint} (from PanDA), please check configuration.' - self.logger.error("resolve_protocols: %s, fspec=%s", error, fdat) + self.logger.error(f"resolve_protocols: {error}, fspec={fdat}") raise PilotException(error, code=ErrorCodes.NOSTORAGE) protocols = [] @@ -719,7 +721,9 @@ def resolve_replica(self, fspec: Any, primary_schemas: Any = None, allowed_schem Primary schemas ignore replica priority (used to resolve direct access replica, which could be not with top priority set). :param fspec: input `FileSpec` objects (Any) + :param primary_schemas: list of primary schemas or any if None (Any) :param allowed_schemas: list of allowed schemas or any if None (Any) + :param domain: domain value to match (Any) :return: dict(surl, ddmendpoint, pfn, domain) or None if replica not found (dict or None). """ if not fspec.replicas: @@ -759,7 +763,7 @@ def resolve_replica(self, fspec: Any, primary_schemas: Any = None, allowed_schem # prefer SRM protocol for surl -- to be verified, can it be deprecated? 
rse_replicas = replicas.get(replica['ddmendpoint'], []) surl = self.get_preferred_replica(rse_replicas, ['srm']) or rse_replicas[0] - self.logger.info("[stage-in] surl (srm replica) from Rucio: pfn=%s, ddmendpoint=%s", surl['pfn'], surl['ddmendpoint']) + self.logger.info(f"[stage-in] surl (srm replica) from Rucio: pfn={surl['pfn']}, ddmendpoint={surl['ddmendpoint']}") return {'surl': surl['pfn'], 'ddmendpoint': replica['ddmendpoint'], 'pfn': replica['pfn'], 'domain': replica['domain']} @@ -782,7 +786,7 @@ def get_direct_access_variables(self, job: Any) -> (bool, str): if job and not job.is_analysis() and job.transfertype != 'direct': # task forbids direct access allow_direct_access = False - self.logger.info('switched off direct access mode for production job since transfertype=%s', job.transfertype) + self.logger.info(f'switched off direct access mode for production job since transfertype={job.transfertype}') return allow_direct_access, direct_access_type From 8613fac8539d94f8352a8f2faff663f8ec8a299a Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 17 May 2024 08:50:53 +0200 Subject: [PATCH 13/41] Pylint changes --- pilot/api/data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 7631b3e4..9517d6ac 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -472,7 +472,7 @@ def detect_client_location(self, use_vp: bool = False) -> dict: self.logger.debug(f'will use client_location={client_location}') return client_location, diagnostics - def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: dict): + def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: dict) -> list: """ Transfer the files. @@ -790,7 +790,7 @@ def get_direct_access_variables(self, job: Any) -> (bool, str): return allow_direct_access, direct_access_type - def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C901 + def transfer_files(self, copytool, files, activity=None, **kwargs) -> list: # noqa: C901 """ Automatically stage in files using the selected copy tool module. @@ -1122,7 +1122,7 @@ def resolve_surl(self, fspec, protocol, ddmconf, **kwargs): surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), self.get_path(fspec.scope, fspec.lfn)) return {'surl': surl} - def transfer_files(self, copytool, files, activity, **kwargs): + def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: dict) -> list: """ Transfer files. From 0dccf740ba12179f2209fe3d532882fe26bdb6a7 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 17 May 2024 09:15:29 +0200 Subject: [PATCH 14/41] Pylint changes --- pilot/api/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 9517d6ac..9b821bd3 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -177,7 +177,7 @@ def get_preferred_replica(cls, replicas: list, allowed_schemas: list) -> Any or return replica return None - def prepare_sources(self, files: list, activities: Any = None): + def prepare_sources(self, files: list, activities: Any = None) -> None: """ Prepare sources. 
From 032bec8b6bf4f518eb820158970f317d771250ad Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 10:15:22 +0200 Subject: [PATCH 15/41] Pylint updates --- pilot/api/data.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 9b821bd3..3132e433 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -911,14 +911,14 @@ def transfer_files(self, copytool, files, activity=None, **kwargs) -> list: # n # return copytool.copy_in_bulk(remain_files, **kwargs) return copytool.copy_in(remain_files, **kwargs) - def set_status_for_direct_access(self, files, workdir): + def set_status_for_direct_access(self, files: list, workdir: str): """ Update the FileSpec status with 'remote_io' for direct access mode. Should be called only once since the function sends traces. - :param files: list of FileSpec objects. - :param workdir: work directory (string). + :param files: list of FileSpec objects (list) + :param workdir: work directory (str). """ for fspec in files: direct_lan = (fspec.domain == 'lan' and fspec.direct_access_lan and @@ -974,11 +974,11 @@ def set_status_for_direct_access(self, files, workdir): else: self.trace_report.send() - def check_availablespace(self, files): + def check_availablespace(self, files: list): """ Verify that enough local space is available to stage in and run the job. - :param files: list of FileSpec objects. + :param files: list of FileSpec objects (list) :raise: PilotException in case of not enough space or total input size too large. """ for f in files: @@ -1018,15 +1018,15 @@ class StageOutClient(StagingClient): mode = "stage-out" - def prepare_destinations(self, files, activities): + def prepare_destinations(self, files: list, activities: list or str) -> list: """ Resolve destination RSE (filespec.ddmendpoint) for each entry from `files` according to requested `activities`. Apply Pilot-side logic to choose proper destination. - :param files: list of FileSpec objects to be processed - :param activities: ordered list of activities to be used to resolve astorages - :return: updated fspec entries. + :param files: list of FileSpec objects to be processed (list) + :param activities: ordered list of activities to be used to resolve astorages (list or str) + :return: updated fspec entries (list). 
""" if not self.infosys.queuedata: # infosys is not initialized: not able to fix destination if need, nothing to do return files @@ -1039,7 +1039,6 @@ def prepare_destinations(self, files, activities): code=ErrorCodes.INTERNALPILOTPROBLEM, state='INTERNAL_ERROR') astorages = self.infosys.queuedata.astorages or {} - storages = None activity = activities[0] for a in activities: @@ -1058,8 +1057,8 @@ def prepare_destinations(self, files, activities): # take the fist choice for now, extend the logic later if need ddm = storages[0] - self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s", activity, storages) - self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s", activity, ddm) + self.logger.info(f"[prepare_destinations][{activity}]: allowed (local) destinations: {storages}") + self.logger.info(f"[prepare_destinations][{activity}]: resolved default destination ddm={ddm}") for e in files: if not e.ddmendpoint: # no preferences => use default destination From ccca637290af4aee431a96ea84eca29dc49d208e Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 17 May 2024 10:55:16 +0200 Subject: [PATCH 16/41] Pylint changes --- pilot/api/data.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 9b821bd3..60f9f0dd 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -439,8 +439,14 @@ def detect_client_location(self, use_vp: bool = False) -> dict: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(("8.8.8.8", 80)) ip = s.getsockname()[0] - except Exception as exc: - diagnostics = f'failed to get socket info: {exc}' + except socket.gaierror as e: + diagnostics = f'failed to get socket info due to address-related error: {e}' + self.logger.warning(diagnostics) + except socket.timeout as e: + diagnostics = f'failed to get socket info due to timeout: {e}' + self.logger.warning(diagnostics) + except socket.error as e: + diagnostics = f'failed to get socket info due to general socket error: {e}' self.logger.warning(diagnostics) client_location['ip'] = ip site = os.environ.get('PILOT_RUCIO_SITENAME', 'unknown') From 476fa9a7202b22428724ed5a77d360469595d69f Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 17 May 2024 11:06:53 +0200 Subject: [PATCH 17/41] Pylint changes --- pilot/api/data.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 60f9f0dd..9b821bd3 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -439,14 +439,8 @@ def detect_client_location(self, use_vp: bool = False) -> dict: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(("8.8.8.8", 80)) ip = s.getsockname()[0] - except socket.gaierror as e: - diagnostics = f'failed to get socket info due to address-related error: {e}' - self.logger.warning(diagnostics) - except socket.timeout as e: - diagnostics = f'failed to get socket info due to timeout: {e}' - self.logger.warning(diagnostics) - except socket.error as e: - diagnostics = f'failed to get socket info due to general socket error: {e}' + except Exception as exc: + diagnostics = f'failed to get socket info: {exc}' self.logger.warning(diagnostics) client_location['ip'] = ip site = os.environ.get('PILOT_RUCIO_SITENAME', 'unknown') From b4f76bec6efb637971f16ccd0aacc6a5ad70d667 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 17 May 2024 11:20:57 +0200 Subject: [PATCH 18/41] Pylint changes --- pilot/api/data.py | 34 ++++++++++++++++++++-------------- 1 file changed, 
20 insertions(+), 14 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 9b821bd3..8bb26603 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -439,9 +439,16 @@ def detect_client_location(self, use_vp: bool = False) -> dict: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(("8.8.8.8", 80)) ip = s.getsockname()[0] - except Exception as exc: - diagnostics = f'failed to get socket info: {exc}' + except socket.gaierror as e: + diagnostics = f'failed to get socket info due to address-related error: {e}' + self.logger.warning(diagnostics) + except socket.timeout as e: + diagnostics = f'failed to get socket info due to timeout: {e}' + self.logger.warning(diagnostics) + except socket.error as e: + diagnostics = f'failed to get socket info due to general socket error: {e}' self.logger.warning(diagnostics) + client_location['ip'] = ip site = os.environ.get('PILOT_RUCIO_SITENAME', 'unknown') client_location['site'] = site @@ -911,14 +918,14 @@ def transfer_files(self, copytool, files, activity=None, **kwargs) -> list: # n # return copytool.copy_in_bulk(remain_files, **kwargs) return copytool.copy_in(remain_files, **kwargs) - def set_status_for_direct_access(self, files, workdir): + def set_status_for_direct_access(self, files: list, workdir: str): """ Update the FileSpec status with 'remote_io' for direct access mode. Should be called only once since the function sends traces. - :param files: list of FileSpec objects. - :param workdir: work directory (string). + :param files: list of FileSpec objects (list) + :param workdir: work directory (str). """ for fspec in files: direct_lan = (fspec.domain == 'lan' and fspec.direct_access_lan and @@ -974,11 +981,11 @@ def set_status_for_direct_access(self, files, workdir): else: self.trace_report.send() - def check_availablespace(self, files): + def check_availablespace(self, files: list): """ Verify that enough local space is available to stage in and run the job. - :param files: list of FileSpec objects. + :param files: list of FileSpec objects (list) :raise: PilotException in case of not enough space or total input size too large. """ for f in files: @@ -1018,15 +1025,15 @@ class StageOutClient(StagingClient): mode = "stage-out" - def prepare_destinations(self, files, activities): + def prepare_destinations(self, files: list, activities: list or str) -> list: """ Resolve destination RSE (filespec.ddmendpoint) for each entry from `files` according to requested `activities`. Apply Pilot-side logic to choose proper destination. - :param files: list of FileSpec objects to be processed - :param activities: ordered list of activities to be used to resolve astorages - :return: updated fspec entries. + :param files: list of FileSpec objects to be processed (list) + :param activities: ordered list of activities to be used to resolve astorages (list or str) + :return: updated fspec entries (list). 
""" if not self.infosys.queuedata: # infosys is not initialized: not able to fix destination if need, nothing to do return files @@ -1039,7 +1046,6 @@ def prepare_destinations(self, files, activities): code=ErrorCodes.INTERNALPILOTPROBLEM, state='INTERNAL_ERROR') astorages = self.infosys.queuedata.astorages or {} - storages = None activity = activities[0] for a in activities: @@ -1058,8 +1064,8 @@ def prepare_destinations(self, files, activities): # take the fist choice for now, extend the logic later if need ddm = storages[0] - self.logger.info("[prepare_destinations][%s]: allowed (local) destinations: %s", activity, storages) - self.logger.info("[prepare_destinations][%s]: resolved default destination ddm=%s", activity, ddm) + self.logger.info(f"[prepare_destinations][{activity}]: allowed (local) destinations: {storages}") + self.logger.info(f"[prepare_destinations][{activity}]: resolved default destination ddm={ddm}") for e in files: if not e.ddmendpoint: # no preferences => use default destination From 89f976f109cf96756b02380030c45d52216e78b5 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 14:53:15 +0200 Subject: [PATCH 19/41] Pylint updates --- pilot/api/data.py | 84 +++++++++++++++++++++++++++++++++------------ pilot/util/https.py | 2 +- 2 files changed, 64 insertions(+), 22 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 8bb26603..02785845 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -472,13 +472,33 @@ def detect_client_location(self, use_vp: bool = False) -> dict: client_location = response.json() # put back the site client_location['site'] = site - except Exception as exc: - diagnostics = f'requests.post failed: {exc}' + except requests.exceptions.Timeout as exc: + diagnostics = f'requests.post timed out: {exc}' + self.logger.warning(diagnostics) + except requests.exceptions.RequestException as exc: + diagnostics = f'requests.post failed with general exception: {exc}' self.logger.warning(diagnostics) self.logger.debug(f'will use client_location={client_location}') return client_location, diagnostics + def resolve_surl(self, fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: + """ + Resolve SURL. + + Only needed in StageOutClient. + + Get final destination SURL for file to be transferred. + Can be customized at the level of specific copytool. + + :param fspec: `FileSpec` object (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddmconf data (dict) + :param kwargs: extra kwargs (dict) + :return: dictionary with keys ('pfn', 'ddmendpoint') (dict). + """ + raise NotImplementedError() + def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: dict) -> list: """ Transfer the files. @@ -616,7 +636,9 @@ def require_protocols(self, files: list, copytool: Any, activity: list or str, l Populates fspec.protocols and fspec.turl for each entry in `files` according to preferred fspec.ddm_activity :param files: list of `FileSpec` objects (list) - :param activity: str or ordered list of transfer activity names to resolve acopytools related data (list or str). + :param copytool: copytool module (Any) + :param activity: list of activity names used to determine appropriate copytool (list or str) + :param local_dir: local directory (str). 
""" allowed_schemas = getattr(copytool, 'allowed_schemas', None) @@ -774,6 +796,23 @@ def resolve_replica(self, fspec: Any, primary_schemas: Any = None, allowed_schem return {'surl': surl['pfn'], 'ddmendpoint': replica['ddmendpoint'], 'pfn': replica['pfn'], 'domain': replica['domain']} + def resolve_surl(self, fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: + """ + Resolve SURL. + + Only needed in StageOutClient. + + Get final destination SURL for file to be transferred. + Can be customized at the level of specific copytool. + + :param fspec: `FileSpec` object (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddmconf data (dict) + :param kwargs: extra kwargs (dict) + :return: dictionary with keys ('pfn', 'ddmendpoint') (dict). + """ + raise NotImplementedError() + def get_direct_access_variables(self, job: Any) -> (bool, str): """ Return the direct access settings for the PQ. @@ -797,16 +836,15 @@ def get_direct_access_variables(self, job: Any) -> (bool, str): return allow_direct_access, direct_access_type - def transfer_files(self, copytool, files, activity=None, **kwargs) -> list: # noqa: C901 + def transfer_files(self, copytool: Any, files: list, activity: list = None, **kwargs: dict) -> list: # noqa: C901 """ Automatically stage in files using the selected copy tool module. - :param copytool: copytool module - :param files: list of `FileSpec` objects - :param activity: list of activity names used to determine appropriate copytool (prioritized list) - :param kwargs: extra kwargs to be passed to copytool transfer handler - - :return: list of processed `FileSpec` objects + :param copytool: copytool module (Any) + :param files: list of `FileSpec` objects (list) + :param activity: list of activity names used to determine appropriate copytool (list or None) + :param kwargs: extra kwargs to be passed to copytool transfer handler (dict) + :return: list of processed `FileSpec` objects (list) :raise: PilotException in case of controlled error. """ if getattr(copytool, 'require_replicas', False) and files: @@ -1100,17 +1138,18 @@ def get_path(cls, scope: str, lfn: str) -> str: return '/'.join(paths) - def resolve_surl(self, fspec, protocol, ddmconf, **kwargs): + def resolve_surl(self, fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: """ Resolve SURL. Get final destination SURL for file to be transferred. Can be customized at the level of specific copytool. - :param protocol: suggested protocol - :param ddmconf: full ddmconf data - :param activity: ordered list of preferred activity names to resolve SE protocols - :return: dict with keys ('pfn', 'ddmendpoint'). + :param fspec: `FileSpec` object (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddmconf data (dict) + :param kwargs: extra kwargs (dict) + :return: dictionary with keys ('pfn', 'ddmendpoint') (dict). """ local_dir = kwargs.get('local_dir', '') if not local_dir: @@ -1126,6 +1165,7 @@ def resolve_surl(self, fspec, protocol, ddmconf, **kwargs): f'ddm={fspec.ddmendpoint}: NOT IMPLEMENTED', code=ErrorCodes.NONDETERMINISTICDDM) surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), self.get_path(fspec.scope, fspec.lfn)) + return {'surl': surl} def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: dict) -> list: @@ -1134,11 +1174,11 @@ def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: d Automatically stage out files using the selected copy tool module. 
- :param copytool: copytool module - :param files: list of `FileSpec` objects - :param activity: ordered list of preferred activity names to resolve SE protocols - :param kwargs: extra kwargs to be passed to copytool transfer handler - :return: the output of the copytool transfer operation + :param copytool: copytool module (Any) + :param files: list of `FileSpec` objects (list) + :param activity: ordered list of preferred activity names to resolve SE protocols (list) + :param kwargs: extra kwargs to be passed to copytool transfer handler (dict) + :return: the output of the copytool transfer operation (list) :raise: PilotException in case of controlled error. """ # check if files exist before actual processing @@ -1172,7 +1212,9 @@ def transfer_files(self, copytool: Any, files: list, activity: list, **kwargs: d try: fspec.checksum[config.File.checksum_type] = calculate_checksum(pfn, algorithm=config.File.checksum_type) - except (FileHandlingFailure, NotImplementedError, Exception) as exc: + except (FileHandlingFailure, NotImplementedError) as exc: + raise exc + except Exception as exc: raise exc # prepare files (resolve protocol/transfer url) diff --git a/pilot/util/https.py b/pilot/util/https.py index 0f9d1506..13a55faf 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -19,7 +19,7 @@ # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 """Functions for https interactions.""" From 4e06c37a0091ad0394470ca9fcc9fb2f950f8373 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 15:17:40 +0200 Subject: [PATCH 20/41] Pylint updates --- pilot/api/es_data.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pilot/api/es_data.py b/pilot/api/es_data.py index 64a8af2a..b8e5adf6 100644 --- a/pilot/api/es_data.py +++ b/pilot/api/es_data.py @@ -19,11 +19,12 @@ # Authors: # - Wen Guan, wen.guan@cern,ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 """API for event service data transfers.""" import logging +from typing import Any from pilot.api.data import StageInClient, StageOutClient @@ -33,14 +34,14 @@ class StageInESClient(StageInClient): """Stage-in client.""" - def __init__(self, *argc, **kwargs): + def __init__(self, **kwargs: dict): """Set default/init values.""" - super(StageInESClient, self).__init__(*argc, **kwargs) + super().__init__(**kwargs) self.copytool_modules.setdefault('objectstore', {'module_name': 'objectstore'}) self.acopytools.setdefault('es_events_read', ['objectstore']) - def prepare_sources(self, files, activities=None): + def prepare_sources(self, files: list, activities: Any = None): """ Prepare sources. @@ -49,13 +50,15 @@ def prepare_sources(self, files, activities=None): If storage_id is specified, replace ddmendpoint by parsing storage_id. - :param files: list of `FileSpec` objects to be processed - :param activities: string or ordered list of activities to resolve `astorages` (optional) - :return: None. 
+ :param files: list of `FileSpec` objects to be processed (list) + :param activities: string or ordered list of activities to resolve `astorages` (optional) (list or str) """ if not self.infosys: self.logger.warning('infosys instance is not initialized: skip calling prepare_sources()') - return None + return + + if activities: + pass for fspec in files: if fspec.storage_token: ## FIX ME LATER: no need to parse each time storage_id, all this stuff should be applied in FileSpec clean method storage_id, path_convention = fspec.get_storage_id_and_path_convention() @@ -65,15 +68,14 @@ def prepare_sources(self, files: list, activities: Any = None): if storage_id: fspec.ddmendpoint = self.infosys.get_ddmendpoint(storage_id) logger.info(f"Processed file with storage id: {fspec}") - return None class StageOutESClient(StageOutClient): """Stage-out client.""" - def __init__(self, *argc, **kwargs): + def __init__(self, **kwargs: dict): """Set default/init values.""" - super(StageOutESClient, self).__init__(*argc, **kwargs) + super().__init__(**kwargs) self.copytool_modules.setdefault('objectstore', {'module_name': 'objectstore'}) self.acopytools.setdefault('es_events', ['objectstore']) From 9f60274edd6d99aa1cb2522ca25c6567d6adc16c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 15:18:47 +0200 Subject: [PATCH 21/41] Pylint updates --- pilot/api/es_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pilot/api/es_data.py b/pilot/api/es_data.py index b8e5adf6..4d3098db 100644 --- a/pilot/api/es_data.py +++ b/pilot/api/es_data.py @@ -59,7 +59,6 @@ def prepare_sources(self, files: list, activities: Any = None): if activities: pass - for fspec in files: if fspec.storage_token: ## FIX ME LATER: no need to parse each time storage_id, all this stuff should be applied in FileSpec clean method storage_id, path_convention = fspec.get_storage_id_and_path_convention() From 672c39565f25b864cc0020c181498055d1438f90 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 15:19:41 +0200 Subject: [PATCH 22/41] Pylint updates --- pilot/api/es_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/api/es_data.py b/pilot/api/es_data.py index 4d3098db..adc28c80 100644 --- a/pilot/api/es_data.py +++ b/pilot/api/es_data.py @@ -58,7 +58,8 @@ def prepare_sources(self, files: list, activities: Any = None): return if activities: - pass + pass # to bypass pylint complaint about activities not used (it cannot be removed) for fspec in files: if fspec.storage_token: ## FIX ME LATER: no need to parse each time storage_id, all this stuff should be applied in FileSpec clean method storage_id, path_convention = fspec.get_storage_id_and_path_convention() From 692954ab6e4cae8c007d9df4e7a2d3995728392c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 15:23:53 +0200 Subject: [PATCH 23/41] Pylint updates --- pilot/api/memorymonitor.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pilot/api/memorymonitor.py b/pilot/api/memorymonitor.py index deed3282..69432150 100644 --- a/pilot/api/memorymonitor.py +++ b/pilot/api/memorymonitor.py @@ -17,14 +17,15 @@ # under the License.
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """API for memory monitoring.""" +import logging from os import getcwd + from .services import Services -import logging logger = logging.getLogger(__name__) @@ -36,14 +37,14 @@ class MemoryMonitoring(Services): workdir = "" # Job work directory _cmd = "" # Memory monitoring command (full path, all options) - def __init__(self, **kwargs): + def __init__(self, **kwargs: dict): """ Init function. - :param kwargs: kwargs dictionary. + :param kwargs: kwargs dictionary (dict). """ - for key in kwargs: - setattr(self, key, kwargs[key]) + for key, value in kwargs.items(): + setattr(self, key, value) if not self.workdir: self.workdir = getcwd() @@ -52,7 +53,7 @@ def __init__(self, **kwargs): user_utility = __import__(f'pilot.user.{self.user}.utilities', globals(), locals(), [self.user], 0) # Python 2/3 self._cmd = user_utility.get_memory_monitor_setup(self.pid, self.workdir) - def get_command(self): + def get_command(self) -> str: """ Return the full command for the memory monitor. @@ -68,11 +69,11 @@ def execute(self): """ return None - def get_filename(self): + def get_filename(self) -> str: """ Return the filename from the memory monitor tool. - :return: fiename (str). + :return: filename (str). """ return "" From 986a78958a6e943cc253ff954d9de82ddb81c679 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 16:28:36 +0200 Subject: [PATCH 24/41] Pylint updates --- pilot/api/memorymonitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/api/memorymonitor.py b/pilot/api/memorymonitor.py index 69432150..87e08c15 100644 --- a/pilot/api/memorymonitor.py +++ b/pilot/api/memorymonitor.py @@ -65,7 +65,7 @@ def execute(self): """ Execute the memory monitor command. - :return: process (currently None). + return: process (currently None). """ return None @@ -81,6 +81,6 @@ def get_results(self): """ Return the results from the memory monitoring. - :return: results (currently None). + return: results (currently None). """ return None From 7c89a7f24e56addb452dfc4097a08c5befcb1012 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 16:42:18 +0200 Subject: [PATCH 25/41] Pylint updates --- pilot/common/errorcodes.py | 39 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 4c84c373..b035adcb 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -17,13 +17,13 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2018 """Error codes set by the pilot.""" import re -from typing import Any, Tuple, List +from typing import Any class ErrorCodes: @@ -36,8 +36,8 @@ class ErrorCodes: """ # global variables shared by all modules/jobs - pilot_error_codes: List[int] = [] - pilot_error_diags: List[str] = [] + pilot_error_codes: list[int] = [] + pilot_error_diags: list[str] = [] # Error code constants (from Pilot 1) GENERALERROR = 1008 @@ -355,7 +355,7 @@ def get_error_message(self, errorcode: int) -> str: """ return self._error_messages.get(errorcode, f"unknown error code: {errorcode}") - def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None) -> Tuple[list, list]: + def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None) -> tuple[list, list]: """ Add pilot error code to list of error codes. @@ -365,10 +365,8 @@ def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None The function also sets the corresponding error message. :param errorcode: pilot error code (int) - :param pilot_error_codes: list of pilot error codes (list of integers) - :param pilot_error_diags: list of pilot error diags (list of strings) :param priority: if set to True, the new errorcode will be added to the error code list first (highest priority) (bool) - :param msg: error message (more detailed) to overwrite standard error message (str). + :param msg: error message (more detailed) to overwrite standard error message (str) :return: pilot_error_codes (list), pilot_error_diags (list). """ # do nothing if the error code has already been added @@ -385,7 +383,7 @@ def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None return pilot_error_codes, pilot_error_diags - def remove_error_code(self, errorcode: int) -> Tuple[list, list]: + def remove_error_code(self, errorcode: int) -> tuple[list, list]: """ Silently remove an error code and its diagnostics from the internal error lists. 
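# Illustration only, not part of the patch: a hedged sketch of how the
# add_error_code()/remove_error_code() pair documented above is typically
# used. Both methods update the class-level pilot_error_codes and
# pilot_error_diags lists and return the updated pair; GENERALERROR (1008)
# is a real constant from this module, while the message is invented.
#
#     from pilot.common.errorcodes import ErrorCodes
#
#     errors = ErrorCodes()
#     codes, diags = errors.add_error_code(errors.GENERALERROR, msg="example diagnostics")
#     # codes now ends with 1008 and diags with the given message
#     codes, diags = errors.remove_error_code(errors.GENERALERROR)
#     # the 1008 entry and its diagnostics have been removed again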
@@ -419,7 +417,7 @@ def report_errors(self) -> str: counter = 0 pilot_error_codes = ErrorCodes.pilot_error_codes pilot_error_diags = ErrorCodes.pilot_error_diags - if pilot_error_codes == []: + if not pilot_error_codes: report = "no pilot errors were reported" else: report = "Nr.\tError code\tError diagnostics" @@ -458,15 +456,15 @@ def resolve_transform_error(self, exit_code: int, stderr: str) -> int: # Handle specific exit codes if exit_code == 2: return self.LSETUPTIMEDOUT - elif exit_code == 3: + if exit_code == 3: return self.REMOTEFILEOPENTIMEDOUT - elif exit_code == 251: + if exit_code == 251: return self.UNKNOWNTRFFAILURE - elif exit_code == -1: + if exit_code == -1: return self.UNKNOWNTRFFAILURE - elif exit_code == self.COMMANDTIMEDOUT: + if exit_code == self.COMMANDTIMEDOUT: return exit_code - elif exit_code != 0: + if exit_code != 0: return self.PAYLOADEXECUTIONFAILURE return exit_code # Return original exit code if no specific error is found @@ -524,7 +522,7 @@ def format_diagnostics(self, code: int, diag: str) -> str: max_message_length = 256 try: standard_message = self._error_messages[code] + ":" - except Exception: + except KeyError: standard_message = "" # extract the relevant info for reporting exceptions @@ -548,18 +546,17 @@ def format_diagnostics(self, code: int, diag: str) -> str: error_message = standard_message + diag[-(max_message_length - len(standard_message)):] else: error_message = standard_message + diag[len(standard_message):][-max_message_length:] + elif len(diag) + len(standard_message) > max_message_length: + error_message = standard_message + diag[:(max_message_length + len(standard_message))] else: - if len(diag) + len(standard_message) > max_message_length: - error_message = standard_message + diag[:(max_message_length + len(standard_message))] - else: - error_message = standard_message + diag + error_message = standard_message + diag if '::' in error_message: error_message = re.sub(':+', ':', error_message) else: error_message = standard_message - except Exception: + except (TypeError, IndexError): error_message = diag return error_message From b296ea73859983d8981bc0b8e40975a82ac9514d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 16:54:08 +0200 Subject: [PATCH 26/41] Pylint updates --- pilot/common/exception.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/pilot/common/exception.py b/pilot/common/exception.py index 373e4aa5..b84a1867 100644 --- a/pilot/common/exception.py +++ b/pilot/common/exception.py @@ -18,15 +18,16 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 """Exceptions set by the pilot.""" import time import threading import traceback +from collections.abc import Callable from sys import exc_info -from typing import Callable, Any, Dict +from typing import Any from .errorcodes import ErrorCodes errors = ErrorCodes() @@ -418,7 +419,7 @@ def __str__(self): class ExcThread(threading.Thread): """Support class that allows for catching exceptions in threads.""" - def __init__(self, bucket: Any, target: Callable, kwargs: Dict[str, Any], name: str): + def __init__(self, bucket: Any, target: Callable, kwargs: dict[str, Any], name: str): """ Set data members. @@ -446,12 +447,24 @@ def run(self): by the run() function and placed in the bucket belonging to the retrieve thread. The bucket is emptied in job.control(). 
""" + # pylint: disable=broad-except try: - self.target(**self.kwargs) + self._target(**self.kwargs) + except ValueError: + print(f'ValueError caught by thread run() function: {exc_info()}') + print(traceback.format_exc()) + print(traceback.print_tb(exc_info()[2])) + self.bucket.put(exc_info()) + print(f"exception has been put in bucket queue belonging to thread \'{self.name}\'") + args = self._kwargs.get('args', None) + if args: + print('setting graceful stop in 10 s since there is no point in continuing') + time.sleep(10) + args.graceful_stop.set() except Exception: # logger object can't be used here for some reason: # IOError: [Errno 2] No such file or directory: '/state/partition1/scratch/PanDA_Pilot2_*/pilotlog.txt' - print(f'exception caught by thread run() function: {exc_info()}') + print(f'unexpected exception caught by thread run() function: {exc_info()}') print(traceback.format_exc()) print(traceback.print_tb(exc_info()[2])) self.bucket.put(exc_info()) @@ -464,12 +477,7 @@ def run(self): args.graceful_stop.set() @property - def target(self) -> Callable: - """Help Pyright understand the type for self._target.""" - return self._target - - @property - def kwargs(self) -> Dict[str, Any]: + def kwargs(self) -> dict[str, Any]: """Help Pyright understand the type for self._kwargs.""" return self._kwargs From 80d0b196648980ba5ce80cc05b0137245b7446b6 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 16:57:00 +0200 Subject: [PATCH 27/41] Pylint updates --- pilot/common/pluginfactory.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pilot/common/pluginfactory.py b/pilot/common/pluginfactory.py index 934ad56a..4396067e 100644 --- a/pilot/common/pluginfactory.py +++ b/pilot/common/pluginfactory.py @@ -22,7 +22,7 @@ """A factory to manage plugins.""" -from typing import Any, Dict +from typing import Any import logging logger = logging.getLogger(__name__) @@ -30,16 +30,18 @@ class PluginFactory: """Plugin factory class.""" - def __init__(self, *args: Any, **kwargs: Any): + def __init__(self, **kwargs: dict): """Set initial values.""" - self.classMap: Dict[str, Any] = {} + if kwargs: # to avoid pylint complaint + pass + self.classMap: dict[str, Any] = {} def get_plugin(self, confs: dict) -> dict: """ Load plugin class. - :param confs: a dict of configurations. - :return: plugin class. + :param confs: a dict of configurations (dict) + :return: plugin class (dict). 
""" class_name = confs['class'] if class_name is None: From 112705fee3ac75321aa667fdb8d32ca1c6cdd98d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 19:56:48 +0200 Subject: [PATCH 28/41] Version update --- PILOTVERSION | 2 +- pilot/util/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 12b51479..5b61602f 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.6.4 \ No newline at end of file +3.7.6.5 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 685879fe..b41a4fb1 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -18,7 +18,7 @@ # # Authors # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Constamts.""" @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '4' # build number should be reset to '1' for every new development cycle +BUILD = '5' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 2e2593c7b20a8ed15110f49abf2393434f6c7a35 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 May 2024 19:58:06 +0200 Subject: [PATCH 29/41] Updated comments --- pilot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot.py b/pilot.py index c4b24795..37575def 100755 --- a/pilot.py +++ b/pilot.py @@ -17,9 +17,9 @@ # under the License. # # Authors: -# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17 +# - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 """This is the entry point for the PanDA Pilot, executed with 'python3 pilot.py '.""" From 01877f6f91c1e8dbfa67d997d3648d9783053fdf Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 May 2024 08:19:26 +0200 Subject: [PATCH 30/41] Pylint updates --- pilot/control/payloads/eventservice.py | 26 +++---- pilot/control/payloads/eventservicemerge.py | 28 ++++---- pilot/control/payloads/generic.py | 80 ++++++++++----------- 3 files changed, 65 insertions(+), 69 deletions(-) diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index e1c61cce..689a7ba8 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 """Executor module for event service payloads.""" @@ -37,17 +37,19 @@ class Executor(generic.Executor): """Executor class for event service payloads.""" - def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): - """ - Set initial values. - - :param args: args object (Any) - :param job: job object (Any) - :param out: stdout file object (TextIO) - :param err: stderr file object (TextIO) - :param traces: traces object (Any). 
- """ - super().__init__(args, job, out, err, traces) + # only define the __init__ function if it actually does anything - otherwise it can be omitted since the + # parent __init__ function will be called automatically + #def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + # """ + # Set initial values. + # + # :param args: args object (Any) + # :param job: job object (Any) + # :param out: stdout file object (TextIO) + # :param err: stderr file object (TextIO) + # :param traces: traces object (Any). + # """ + # super().__init__(args, job, out, err, traces) def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: """ diff --git a/pilot/control/payloads/eventservicemerge.py b/pilot/control/payloads/eventservicemerge.py index 05fb44b0..679fc321 100644 --- a/pilot/control/payloads/eventservicemerge.py +++ b/pilot/control/payloads/eventservicemerge.py @@ -18,13 +18,13 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 """Executor module for event service merge payloads.""" import logging import os -from typing import Any, TextIO +from typing import Any #, TextIO from pilot.control.payloads import generic from pilot.util.container import execute @@ -35,17 +35,19 @@ class Executor(generic.Executor): """Executor class for event service merge payloads.""" - def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): - """ - Set initial values. - - :param args: args object (Any) - :param job: job object (Any) - :param out: stdout file object (TextIO) - :param err: stderr file object (TextIO) - :param traces: traces object (Any). - """ - super().__init__(args, job, out, err, traces) + # only define the __init__ function if it actually does anything - otherwise it can be omitted since the + # parent __init__ function will be called automatically + #def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + # """ + # Set initial values. + # + # :param args: args object (Any) + # :param job: job object (Any) + # :param out: stdout file object (TextIO) + # :param err: stderr file object (TextIO) + # :param traces: traces object (Any). 
+ # """ + # super().__init__(args, job, out, err, traces) def untar_file(self, lfn: str, workdir: str): """ diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index c46ef868..e96cabc7 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -20,7 +20,7 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2018 """Executor module for generic payloads.""" @@ -29,6 +29,7 @@ import os import signal import time +import traceback from subprocess import PIPE from typing import Any, TextIO @@ -82,12 +83,12 @@ def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): self.__out = out # payload stdout file object self.__err = err # payload stderr file object self.__traces = traces - self.__preprocess_stdout_name = '' - self.__preprocess_stderr_name = '' +# self.__preprocess_stdout_name = '' +# self.__preprocess_stderr_name = '' self.__coprocess_stdout_name = 'coprocess_stdout.txt' self.__coprocess_stderr_name = 'coprocess_stderr.txt' - self.__postprocess_stdout_name = '' - self.__postprocess_stderr_name = '' +# self.__postprocess_stdout_name = '' +# self.__postprocess_stderr_name = '' def get_job(self): """ @@ -396,14 +397,14 @@ def write_utility_output(self, workdir: str, step: str, stdout: str, stderr: str """ # dump to file try: - name_stdout = step + '_stdout.txt' - name_stderr = step + '_stderr.txt' - if step == 'preprocess': - self.__preprocess_stdout_name = name_stdout - self.__preprocess_stderr_name = name_stderr - elif step == 'postprocess': - self.__postprocess_stdout_name = name_stdout - self.__postprocess_stderr_name = name_stderr + #name_stdout = step + '_stdout.txt' + #name_stderr = step + '_stderr.txt' + #if step == 'preprocess': + # self.__preprocess_stdout_name = name_stdout + # self.__preprocess_stderr_name = name_stderr + #elif step == 'postprocess': + # self.__postprocess_stdout_name = name_stdout + # self.__postprocess_stderr_name = name_stderr name = os.path.join(workdir, step + '_stdout.txt') write_file(name, stdout, unique=True) except PilotException as error: @@ -533,10 +534,6 @@ def extract_setup(self, cmd: str) -> str: def cut_str_from(_cmd: str, _str: str) -> str: """ Cut the string from the position of the given _cmd. - - :param _cmd: command (str) - :param _str: substring (str) - :return: cut command (str). """ return _cmd[:_cmd.find(_str)] @@ -545,9 +542,6 @@ def cut_str_from_last_semicolon(_cmd: str) -> str: Cut the string from the last semicolon. NOTE: this will not work if jobParams also contain ; - - :param _cmd: command (str) - :return: cut command (str). 
""" # remove any trailing spaces and ;-signs _cmd = _cmd.strip() @@ -608,8 +602,7 @@ def wait_graceful(self, args: Any, proc: Any) -> int: logger.info(f'running: iteration={iteration} pid={proc.pid} exit_code={exit_code}') if exit_code is not None: break - else: - continue + continue return exit_code @@ -628,7 +621,6 @@ def get_payload_command(self, job: Any) -> str: cmd = user.get_payload_command(job) #+ 'sleep 900' # to test looping jobs except PilotException as error: self.post_setup(job) - import traceback logger.error(traceback.format_exc()) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) self.__traces.pilot['error_code'] = job.piloterrorcodes[0] @@ -941,14 +933,14 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: if os.WIFEXITED(status): logger.debug('normal exit') return os.WEXITSTATUS(status) - else: - # Handle abnormal termination if needed - logger.warning('abnormal termination') - return None - else: - # Process doesn't exist - ignore - logger.info(f'process {pid} no longer exists') - return True + + # Handle abnormal termination if needed + logger.warning('abnormal termination') + return None + + # Process doesn't exist - ignore + logger.info(f'process {pid} no longer exists') + return True except OSError as exc: # Handle errors, such as process not found @@ -998,16 +990,16 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: # logger.warning(f"exception caught: {exc}") # return None - def rename_log_files(self, iteration: int): - """ - Rename log files. - - :param iteration: iteration (int). - """ - names = [self.__preprocess_stdout_name, self.__preprocess_stderr_name, - self.__postprocess_stdout_name, self.__postprocess_stderr_name] - for name in names: - if os.path.exists(name): - os.rename(name, name + f'{iteration}') - else: - logger.warning(f'cannot rename {name} since it does not exist') +# def rename_log_files(self, iteration: int): +# """ +# Rename log files. +# +# :param iteration: iteration (int). +# """ +# names = [self.__preprocess_stdout_name, self.__preprocess_stderr_name, +# self.__postprocess_stdout_name, self.__postprocess_stderr_name] +# for name in names: +# if os.path.exists(name): +# os.rename(name, name + f'{iteration}') +# else: +# logger.warning(f'cannot rename {name} since it does not exist') From deb831b711a31cf799c3fc292adf617c815f5994 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 May 2024 08:21:23 +0200 Subject: [PATCH 31/41] Pydocstyle update --- pilot/control/payloads/generic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index e96cabc7..fafb4068 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -532,9 +532,7 @@ def extract_setup(self, cmd: str) -> str: :return: updated secondary command (str). """ def cut_str_from(_cmd: str, _str: str) -> str: - """ - Cut the string from the position of the given _cmd. 
- """ + """Cut the string from the position of the given _cmd.""" return _cmd[:_cmd.find(_str)] def cut_str_from_last_semicolon(_cmd: str) -> str: From 49b6a67b7b8a102a5c00c5947b7e916307cd800f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 May 2024 08:43:15 +0200 Subject: [PATCH 32/41] Black updates --- pilot/common/errorcodes.py | 60 ++- pilot/common/exception.py | 57 +- pilot/common/pluginfactory.py | 9 +- pilot/control/payloads/eventservice.py | 22 +- pilot/control/payloads/eventservicemerge.py | 4 +- pilot/control/payloads/generic.py | 563 +++++++++++++------- 6 files changed, 456 insertions(+), 259 deletions(-) diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index b035adcb..890763a6 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -319,7 +319,7 @@ class ErrorCodes: LEASETIME: "Lease time is up", # internal use only LOGCREATIONTIMEOUT: "Log file creation timed out", CVMFSISNOTALIVE: "CVMFS is not responding", - LSETUPTIMEDOUT: "Lsetup command timed out during remote file open" + LSETUPTIMEDOUT: "Lsetup command timed out during remote file open", } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] @@ -337,12 +337,14 @@ def get_kill_signal_error_code(self, signal: str) -> int: :param signal: signal name (str). :return: Pilot error code (int). """ - signals_dictionary = {'SIGTERM': self.SIGTERM, - 'SIGQUIT': self.SIGQUIT, - 'SIGSEGV': self.SIGSEGV, - 'SIGXCPU': self.SIGXCPU, - 'SIGUSR1': self.SIGUSR1, - 'SIGBUS': self.SIGBUS} + signals_dictionary = { + "SIGTERM": self.SIGTERM, + "SIGQUIT": self.SIGQUIT, + "SIGSEGV": self.SIGSEGV, + "SIGXCPU": self.SIGXCPU, + "SIGUSR1": self.SIGUSR1, + "SIGBUS": self.SIGBUS, + } return signals_dictionary.get(signal, self.KILLSIGNAL) @@ -355,7 +357,9 @@ def get_error_message(self, errorcode: int) -> str: """ return self._error_messages.get(errorcode, f"unknown error code: {errorcode}") - def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None) -> tuple[list, list]: + def add_error_code( + self, errorcode: int, priority: bool = False, msg: Any = None + ) -> tuple[list, list]: """ Add pilot error code to list of error codes. @@ -445,7 +449,7 @@ def resolve_transform_error(self, exit_code: int, stderr: str) -> int: "Singularity is not installed": self.SINGULARITYNOTINSTALLED, "Apptainer is not installed": self.APPTAINERNOTINSTALLED, "cannot create directory": self.MKDIR, - "General payload setup verification error": self.SETUPFAILURE + "General payload setup verification error": self.SETUPFAILURE, } # Check if stderr contains any known error messages @@ -480,7 +484,9 @@ def extract_stderr_error(self, stderr: str) -> str: if "command not found" in stderr: msg = stderr else: - msg = self.get_message_for_pattern([r"ERROR\s*:\s*(.*)", r"Error\s*:\s*(.*)", r"error\s*:\s*(.*)"], stderr) + msg = self.get_message_for_pattern( + [r"ERROR\s*:\s*(.*)", r"Error\s*:\s*(.*)", r"error\s*:\s*(.*)"], stderr + ) return msg def extract_stderr_warning(self, stderr: str) -> str: @@ -490,7 +496,10 @@ def extract_stderr_warning(self, stderr: str) -> str: :param stderr: stderr (str) :return: warning message (str). 
""" - return self.get_message_for_pattern([r"WARNING\s*:\s*(.*)", r"Warning\s*:\s*(.*)", r"warning\s*:\s*(.*)"], stderr) + return self.get_message_for_pattern( + [r"WARNING\s*:\s*(.*)", r"Warning\s*:\s*(.*)", r"warning\s*:\s*(.*)"], + stderr, + ) def get_message_for_pattern(self, patterns: list, stderr: str) -> str: """ @@ -527,14 +536,14 @@ def format_diagnostics(self, code: int, diag: str) -> str: # extract the relevant info for reporting exceptions if "Traceback" in diag: - pattern = 'details:(.+)' + pattern = "details:(.+)" found = re.findall(pattern, diag) if found: diag = found[0] - diag = re.sub(r'\[?PilotException\(\"?\'?', r'', diag) - diag = re.sub(r'\[?StageInFailure\(\"?\'?', r'', diag) - diag = re.sub(r'\[?StageOutFailure\(\"?\'?', r'', diag) - diag = re.sub(' +', ' ', diag) + diag = re.sub(r"\[?PilotException\(\"?\'?", r"", diag) + diag = re.sub(r"\[?StageInFailure\(\"?\'?", r"", diag) + diag = re.sub(r"\[?StageOutFailure\(\"?\'?", r"", diag) + diag = re.sub(" +", " ", diag) try: if diag: @@ -543,16 +552,25 @@ def format_diagnostics(self, code: int, diag: str) -> str: # e.g. "Failed to stage-in file:abcdefghijklmnopqrstuvwxyz0123456789" if standard_message in diag: if len(diag) > max_message_length: - error_message = standard_message + diag[-(max_message_length - len(standard_message)):] + error_message = ( + standard_message + + diag[-(max_message_length - len(standard_message)):] + ) else: - error_message = standard_message + diag[len(standard_message):][-max_message_length:] + error_message = ( + standard_message + + diag[len(standard_message):][-max_message_length:] + ) elif len(diag) + len(standard_message) > max_message_length: - error_message = standard_message + diag[:(max_message_length + len(standard_message))] + error_message = ( + standard_message + + diag[:(max_message_length + len(standard_message))] + ) else: error_message = standard_message + diag - if '::' in error_message: - error_message = re.sub(':+', ':', error_message) + if "::" in error_message: + error_message = re.sub(":+", ":", error_message) else: error_message = standard_message diff --git a/pilot/common/exception.py b/pilot/common/exception.py index b84a1867..b7c5725d 100644 --- a/pilot/common/exception.py +++ b/pilot/common/exception.py @@ -30,6 +30,7 @@ from typing import Any from .errorcodes import ErrorCodes + errors = ErrorCodes() @@ -41,22 +42,26 @@ def __init__(self, *args, **kwargs): super().__init__(args, kwargs) self.args = args self.kwargs = kwargs - code = self.kwargs.get('code', None) + code = self.kwargs.get("code", None) if code: self._errorCode = code else: self._errorCode = errors.UNKNOWNEXCEPTION self._message = errors.get_error_message(self._errorCode) self._error_string = None - self._stack_trace = f'{traceback.format_exc()}' + self._stack_trace = f"{traceback.format_exc()}" def __str__(self): """Set and return the error string for string representation of the class instance.""" try: - self._error_string = f"error code: {self._errorCode}, message: {self._message % self.kwargs}" + self._error_string = ( + f"error code: {self._errorCode}, message: {self._message % self.kwargs}" + ) except TypeError: # at least get the core message out if something happened - self._error_string = f"error code: {self._errorCode}, message: {self._message}" + self._error_string = ( + f"error code: {self._errorCode}, message: {self._message}" + ) if len(self.args) > 0: # If there is a non-kwarg parameter, assume it's the error @@ -64,19 +69,23 @@ def __str__(self): # of the exception message # Convert 
all arguments into their string representations... try: - args = [f'{arg}' for arg in self.args if arg] + args = [f"{arg}" for arg in self.args if arg] except TypeError: - args = [f'{self.args}'] - self._error_string = (self._error_string + "\ndetails: %s" % '\n'.join(args)) + args = [f"{self.args}"] + self._error_string = self._error_string + "\ndetails: %s" % "\n".join(args) return self._error_string.strip() def get_detail(self): """Set and return the error string with the exception details.""" try: - self._error_string = f"error code: {self._errorCode}, message: {self._message % self.kwargs}" + self._error_string = ( + f"error code: {self._errorCode}, message: {self._message % self.kwargs}" + ) except TypeError: # at least get the core message out if something happened - self._error_string = f"error code: {self._errorCode}, message: {self._message}" + self._error_string = ( + f"error code: {self._errorCode}, message: {self._message}" + ) return self._error_string + f"\nstacktrace: {self._stack_trace}" @@ -91,7 +100,7 @@ def get_last_error(self): return self._message -#class NotImplementedError(PilotException): +# class NotImplementedError(PilotException): # """ # Not implemented exception. # """ @@ -419,7 +428,9 @@ def __str__(self): class ExcThread(threading.Thread): """Support class that allows for catching exceptions in threads.""" - def __init__(self, bucket: Any, target: Callable, kwargs: dict[str, Any], name: str): + def __init__( + self, bucket: Any, target: Callable, kwargs: dict[str, Any], name: str + ): """ Set data members. @@ -451,28 +462,36 @@ def run(self): try: self._target(**self.kwargs) except ValueError: - print(f'ValueError caught by thread run() function: {exc_info()}') + print(f"ValueError caught by thread run() function: {exc_info()}") print(traceback.format_exc()) print(traceback.print_tb(exc_info()[2])) self.bucket.put(exc_info()) - print(f"exception has been put in bucket queue belonging to thread \'{self.name}\'") - args = self._kwargs.get('args', None) + print( + f"exception has been put in bucket queue belonging to thread '{self.name}'" + ) + args = self._kwargs.get("args", None) if args: - print('setting graceful stop in 10 s since there is no point in continuing') + print( + "setting graceful stop in 10 s since there is no point in continuing" + ) time.sleep(10) args.graceful_stop.set() except Exception: # logger object can't be used here for some reason: # IOError: [Errno 2] No such file or directory: '/state/partition1/scratch/PanDA_Pilot2_*/pilotlog.txt' - print(f'unexpected exception caught by thread run() function: {exc_info()}') + print(f"unexpected exception caught by thread run() function: {exc_info()}") print(traceback.format_exc()) print(traceback.print_tb(exc_info()[2])) self.bucket.put(exc_info()) - print(f"exception has been put in bucket queue belonging to thread \'{self.name}\'") - args = self._kwargs.get('args', None) + print( + f"exception has been put in bucket queue belonging to thread '{self.name}'" + ) + args = self._kwargs.get("args", None) if args: # the sleep is needed to allow the threads to catch up - print('setting graceful stop in 10 s since there is no point in continuing') + print( + "setting graceful stop in 10 s since there is no point in continuing" + ) time.sleep(10) args.graceful_stop.set() diff --git a/pilot/common/pluginfactory.py b/pilot/common/pluginfactory.py index 4396067e..389ee62b 100644 --- a/pilot/common/pluginfactory.py +++ b/pilot/common/pluginfactory.py @@ -24,6 +24,7 @@ from typing import Any import logging + 
logger = logging.getLogger(__name__) @@ -43,21 +44,21 @@ def get_plugin(self, confs: dict) -> dict: :param confs: a dict of configurations (dict) :return: plugin class (dict). """ - class_name = confs['class'] + class_name = confs["class"] if class_name is None: logger.error(f"class is not defined in confs: {confs}") return {} if class_name not in self.classMap: logger.info(f"trying to import {class_name}") - components = class_name.split('.') - mod = __import__('.'.join(components[:-1])) + components = class_name.split(".") + mod = __import__(".".join(components[:-1])) for comp in components[1:]: mod = getattr(mod, comp) self.classMap[class_name] = mod args = {} - excluded_keys = {'class'} # Use a set to store keys to exclude + excluded_keys = {"class"} # Use a set to store keys to exclude for key in confs: if key in excluded_keys: continue diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 689a7ba8..ebff6c7a 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -39,7 +39,7 @@ class Executor(generic.Executor): # only define the __init__ function if it actually does anything - otherwise it can be omitted since the # parent __init__ function will be called automatically - #def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + # def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): # """ # Set initial values. # @@ -64,8 +64,10 @@ def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: self.pre_setup(job) # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'atlas').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "atlas").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) self.post_setup(job) @@ -76,13 +78,19 @@ def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: try: executable = user.get_payload_command(job) except exception.PilotException: - logger.fatal('could not define payload command') + logger.fatal("could not define payload command") return None logger.info(f"payload execution command: {executable}") try: - payload = {'executable': executable, 'workdir': job.workdir, 'output_file': out, 'error_file': err, 'job': job} + payload = { + "executable": executable, + "workdir": job.workdir, + "output_file": out, + "error_file": err, + "job": job, + } logger.debug("payload: %s", payload) logger.info("starting EventService WorkExecutor") @@ -99,7 +107,7 @@ def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: self.utility_after_payload_started(job) except Exception as error: - logger.error(f'could not execute: {error}') + logger.error(f"could not execute: {error}") return None return executor @@ -115,7 +123,7 @@ def get_executor_type(self) -> dict: """ # executor_type = 'hpo' if job.is_hpo else os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') # return {'executor_type': executor_type} - return {'executor_type': os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic')} + return {"executor_type": os.environ.get("PILOT_ES_EXECUTOR_TYPE", "generic")} def wait_graceful(self, args: Any, proc: Any) -> int: """ diff --git a/pilot/control/payloads/eventservicemerge.py b/pilot/control/payloads/eventservicemerge.py index 679fc321..bd3be12b 100644 --- a/pilot/control/payloads/eventservicemerge.py +++ 
b/pilot/control/payloads/eventservicemerge.py @@ -24,7 +24,7 @@ import logging import os -from typing import Any #, TextIO +from typing import Any # , TextIO from pilot.control.payloads import generic from pilot.util.container import execute @@ -37,7 +37,7 @@ class Executor(generic.Executor): # only define the __init__ function if it actually does anything - otherwise it can be omitted since the # parent __init__ function will be called automatically - #def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + # def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): # """ # Set initial values. # diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index fafb4068..91d98268 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -35,7 +35,7 @@ from pilot.common.errorcodes import ErrorCodes from pilot.control.job import send_state -from pilot.util.auxiliary import set_pilot_state #, show_memory_usage +from pilot.util.auxiliary import set_pilot_state # , show_memory_usage from pilot.util.config import config from pilot.util.container import execute from pilot.util.constants import ( @@ -48,24 +48,18 @@ PILOT_PRE_PAYLOAD, PILOT_POST_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED2, - UTILITY_AFTER_PAYLOAD_FINISHED2 -) -from pilot.util.filehandling import ( - write_file, - read_file + UTILITY_AFTER_PAYLOAD_FINISHED2, ) +from pilot.util.filehandling import write_file, read_file from pilot.util.processes import kill_processes -from pilot.util.timing import ( - add_to_pilot_timing, - get_time_measurement -) +from pilot.util.timing import add_to_pilot_timing, get_time_measurement from pilot.common.exception import PilotException logger = logging.getLogger(__name__) errors = ErrorCodes() -class Executor(): +class Executor: """Executor class for generic payloads.""" def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): @@ -83,12 +77,13 @@ def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): self.__out = out # payload stdout file object self.__err = err # payload stderr file object self.__traces = traces -# self.__preprocess_stdout_name = '' -# self.__preprocess_stderr_name = '' - self.__coprocess_stdout_name = 'coprocess_stdout.txt' - self.__coprocess_stderr_name = 'coprocess_stderr.txt' -# self.__postprocess_stdout_name = '' -# self.__postprocess_stderr_name = '' + # self.__preprocess_stdout_name = '' + # self.__preprocess_stderr_name = '' + self.__coprocess_stdout_name = "coprocess_stdout.txt" + self.__coprocess_stderr_name = "coprocess_stderr.txt" + + # self.__postprocess_stdout_name = '' + # self.__postprocess_stderr_name = '' def get_job(self): """ @@ -106,8 +101,8 @@ def pre_setup(self, job: Any): """ # write time stamps to pilot timing file update_time = time.time() - logger.debug(f'setting pre-setup time to {update_time} s') - logger.debug(f'gmtime is {time.gmtime(update_time)}') + logger.debug(f"setting pre-setup time to {update_time} s") + logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, update_time, self.__args) def post_setup(self, job: Any, update_time: bool = None): @@ -120,8 +115,8 @@ def post_setup(self, job: Any, update_time: bool = None): # write time stamps to pilot timing file if not update_time: update_time = time.time() - logger.debug(f'setting post-setup time to {update_time} s') - logger.debug(f'gmtime is {time.gmtime(update_time)}') + logger.debug(f"setting post-setup 
time to {update_time} s") + logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, update_time, self.__args) def improve_post_setup(self): @@ -136,22 +131,32 @@ def improve_post_setup(self): if not os.path.exists(path): return - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.setup', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.setup", globals(), locals(), [pilot_user], 0 + ) try: end_setup_time = user.get_end_setup_time(path) # since epoch except Exception as exc: - logger.debug(f'caught exception: {exc} (will not update setup time)') + logger.debug(f"caught exception: {exc} (will not update setup time)") end_setup_time = None if end_setup_time: # get the currently stored post-setup time time_measurement_dictionary = self.__args.timing.get(self.__job.jobid, None) - current_post_setup = get_time_measurement(PILOT_POST_SETUP, time_measurement_dictionary, self.__args.timing) + current_post_setup = get_time_measurement( + PILOT_POST_SETUP, time_measurement_dictionary, self.__args.timing + ) if current_post_setup: - logger.info(f'existing post-setup time: {current_post_setup} s (since epoch) (current time: {time.time()})') - logger.debug(f'extracted end time from payload stdout: {end_setup_time} s') + logger.info( + f"existing post-setup time: {current_post_setup} s (since epoch) (current time: {time.time()})" + ) + logger.debug( + f"extracted end time from payload stdout: {end_setup_time} s" + ) diff = end_setup_time - current_post_setup - logger.info(f'payload setup finished {diff} s later than previously recorded') + logger.info( + f"payload setup finished {diff} s later than previously recorded" + ) self.post_setup(self.__job, update_time=end_setup_time) def utility_before_payload(self, job: Any) -> str: @@ -169,15 +174,21 @@ def utility_before_payload(self, job: Any) -> str: cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should we run any additional commands? (e.g. 
special monitoring commands) - cmd_dictionary = user.get_utility_commands(order=UTILITY_BEFORE_PAYLOAD, job=job) + cmd_dictionary = user.get_utility_commands( + order=UTILITY_BEFORE_PAYLOAD, job=job + ) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - _label = cmd_dictionary.get('label', 'utility') - logger.info(f'utility command (\'{_label}\') to be executed before the payload: {cmd}') + _label = cmd_dictionary.get("label", "utility") + logger.info( + f"utility command ('{_label}') to be executed before the payload: {cmd}" + ) return cmd @@ -193,15 +204,19 @@ def utility_with_payload(self, job: Any) -> str: cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be prepended to the payload execution string? cmd_dictionary = user.get_utility_commands(order=UTILITY_WITH_PAYLOAD, job=job) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - _label = cmd_dictionary.get('label', 'utility') - logger.info(f'utility command (\'{_label}\') to be executed with the payload: {cmd}') + _label = cmd_dictionary.get("label", "utility") + logger.info( + f"utility command ('{_label}') to be executed with the payload: {cmd}" + ) return cmd @@ -218,15 +233,19 @@ def get_utility_command(self, order: str = "") -> str: cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be executed after the payload? cmd_dictionary = user.get_utility_commands(order=order, job=self.__job) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - _label = cmd_dictionary.get('label', 'utility') - logger.info(f'utility command (\'{_label}\') to be executed after the payload: {cmd}') + _label = cmd_dictionary.get("label", "utility") + logger.info( + f"utility command ('{_label}') to be executed after the payload: {cmd}" + ) return cmd @@ -237,37 +256,55 @@ def utility_after_payload_started(self, job: Any): :param job: job object (Any). """ # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be executed after the payload? 
- cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) + cmd_dictionary = user.get_utility_commands( + order=UTILITY_AFTER_PAYLOAD_STARTED, job=job + ) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - logger.info(f'utility command to be executed after the payload: {cmd}') + logger.info(f"utility command to be executed after the payload: {cmd}") # how should this command be executed? - utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) + utilitycommand = user.get_utility_command_setup( + cmd_dictionary.get("command"), job + ) if not utilitycommand: - logger.warning('empty utility command - nothing to run') + logger.warning("empty utility command - nothing to run") return try: - proc1 = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, - stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) + proc1 = execute( + utilitycommand, + workdir=job.workdir, + returnproc=True, + usecontainer=False, + stdout=PIPE, + stderr=PIPE, + cwd=job.workdir, + job=job, + ) except Exception as error: - logger.error(f'could not execute: {error}') + logger.error(f"could not execute: {error}") else: # store process handle in job object, and keep track on how many times the command has been launched # also store the full command in case it needs to be restarted later (by the job_monitor() thread) - logger.debug(f'storing process for {utilitycommand}') - job.utilities[cmd_dictionary.get('command')] = [proc1, 1, utilitycommand] + logger.debug(f"storing process for {utilitycommand}") + job.utilities[cmd_dictionary.get("command")] = [ + proc1, + 1, + utilitycommand, + ] # wait for command to start - #time.sleep(1) - #cmd = utilitycommand.split(';')[-1] - #prmon = f'prmon --pid {job.pid}' - #pid = None - #if prmon in cmd: + # time.sleep(1) + # cmd = utilitycommand.split(';')[-1] + # prmon = f'prmon --pid {job.pid}' + # pid = None + # if prmon in cmd: # import subprocess # import re # ps = subprocess.run(['ps', 'aux', str(os.getpid())], stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -279,10 +316,10 @@ def utility_after_payload_started(self, job: Any): # if matches: # pid = matches[0] # break - #if pid: + # if pid: # logger.info(f'{prmon} command has pid={pid} (appending to cmd dictionary)') # job.utilities[cmd_dictionary.get('command')].append(pid) - #else: + # else: # logger.info(f'could not extract any pid from ps for cmd={cmd}') def utility_after_payload_started_new(self, job: Any) -> str: @@ -297,31 +334,35 @@ def utility_after_payload_started_new(self, job: Any) -> str: cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be executed after the payload? - cmd_dictionary = user.get_utility_commands(order=UTILITY_AFTER_PAYLOAD_STARTED, job=job) + cmd_dictionary = user.get_utility_commands( + order=UTILITY_AFTER_PAYLOAD_STARTED, job=job + ) if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - logger.info(f'utility command to be executed after the payload: {cmd}') + logger.info(f"utility command to be executed after the payload: {cmd}") return cmd -# # how should this command be executed? 
-# utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) -# if not utilitycommand: -# logger.warning('empty utility command - nothing to run') -# return -# try: -# proc = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, -# stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) -# except Exception as error: -# logger.error('could not execute: %s', error) -# else: -# # store process handle in job object, and keep track on how many times the command has been launched -# # also store the full command in case it needs to be restarted later (by the job_monitor() thread) -# job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] + # # how should this command be executed? + # utilitycommand = user.get_utility_command_setup(cmd_dictionary.get('command'), job) + # if not utilitycommand: + # logger.warning('empty utility command - nothing to run') + # return + # try: + # proc = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, + # stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) + # except Exception as error: + # logger.error('could not execute: %s', error) + # else: + # # store process handle in job object, and keep track on how many times the command has been launched + # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) + # job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] def utility_after_payload_finished(self, job: Any, order: str) -> (str, str, bool): """ @@ -338,17 +379,23 @@ def utility_after_payload_finished(self, job: Any, order: str) -> (str, str, boo cmd = "" # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) # should any additional commands be prepended to the payload execution string? cmd_dictionary = user.get_utility_commands(order=order, job=job) - label = cmd_dictionary.get('label') if cmd_dictionary else 'unknown' + label = cmd_dictionary.get("label") if cmd_dictionary else "unknown" if cmd_dictionary: cmd = f"{cmd_dictionary.get('command')} {cmd_dictionary.get('args')}" - logger.info(f'utility command (\'{label}\') to be executed after the payload has finished: {cmd}') + logger.info( + f"utility command ('{label}') to be executed after the payload has finished: {cmd}" + ) - ignore_failure = cmd_dictionary.get('ignore_failure') if cmd_dictionary else False + ignore_failure = ( + cmd_dictionary.get("ignore_failure") if cmd_dictionary else False + ) return cmd, label, ignore_failure def execute_utility_command(self, cmd: str, job: Any, label: str) -> int: @@ -360,20 +407,24 @@ def execute_utility_command(self, cmd: str, job: Any, label: str) -> int: :param label: command label (str) :return: exit code (int). 
""" - exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False) + exit_code, stdout, stderr = execute( + cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False + ) if exit_code: ignored_exit_codes = [160, 161, 162] logger.warning( - f'command returned non-zero exit code: {cmd} (exit code = {exit_code}) - see utility logs for details' + f"command returned non-zero exit code: {cmd} (exit code = {exit_code}) - see utility logs for details" ) - if label == 'preprocess': + if label == "preprocess": err = errors.PREPROCESSFAILURE - elif label == 'postprocess': + elif label == "postprocess": err = errors.POSTPROCESSFAILURE else: err = 0 # ie ignore exit_code = 0 - if err and exit_code not in ignored_exit_codes: # ignore no-more-data-points exit codes + if ( + err and exit_code not in ignored_exit_codes + ): # ignore no-more-data-points exit codes job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(err) if exit_code in ignored_exit_codes: job.transexitcode = exit_code @@ -397,28 +448,28 @@ def write_utility_output(self, workdir: str, step: str, stdout: str, stderr: str """ # dump to file try: - #name_stdout = step + '_stdout.txt' - #name_stderr = step + '_stderr.txt' - #if step == 'preprocess': + # name_stdout = step + '_stdout.txt' + # name_stderr = step + '_stderr.txt' + # if step == 'preprocess': # self.__preprocess_stdout_name = name_stdout # self.__preprocess_stderr_name = name_stderr - #elif step == 'postprocess': + # elif step == 'postprocess': # self.__postprocess_stdout_name = name_stdout # self.__postprocess_stderr_name = name_stderr - name = os.path.join(workdir, step + '_stdout.txt') + name = os.path.join(workdir, step + "_stdout.txt") write_file(name, stdout, unique=True) except PilotException as error: - logger.warning(f'failed to write utility stdout to file: {error}, {stdout}') + logger.warning(f"failed to write utility stdout to file: {error}, {stdout}") else: - logger.debug(f'wrote {name}') + logger.debug(f"wrote {name}") try: - name = os.path.join(workdir, step + '_stderr.txt') + name = os.path.join(workdir, step + "_stderr.txt") write_file(name, stderr, unique=True) except PilotException as error: - logger.warning(f'failed to write utility stderr to file: {error}, {stderr}') + logger.warning(f"failed to write utility stderr to file: {error}, {stderr}") else: - logger.debug(f'wrote {name}') + logger.debug(f"wrote {name}") def pre_payload(self, job: Any): """ @@ -430,8 +481,8 @@ def pre_payload(self, job: Any): """ # write time stamps to pilot timing file update_time = time.time() - logger.debug(f'setting pre-payload time to {update_time} s') - logger.debug(f'gmtime is {time.gmtime(update_time)}') + logger.debug(f"setting pre-payload time to {update_time} s") + logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, update_time, self.__args) def post_payload(self, job: Any): @@ -444,8 +495,8 @@ def post_payload(self, job: Any): """ # write time stamps to pilot timing file update_time = time.time() - logger.debug(f'setting post-payload time to {update_time} s') - logger.debug(f'gmtime is {time.gmtime(update_time)}') + logger.debug(f"setting post-payload time to {update_time} s") + logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, update_time, self.__args) def run_command(self, cmd: str, label: str = "") -> Any: @@ -457,29 +508,41 @@ def run_command(self, cmd: str, label: str = "") -> Any: :return: subprocess object 
(Any). """ if label: - logger.info(f'\n\n{label}:\n\n{cmd}\n') - if label == 'coprocess': + logger.info(f"\n\n{label}:\n\n{cmd}\n") + if label == "coprocess": try: - out = open(os.path.join(self.__job.workdir, self.__coprocess_stdout_name), 'wb') - err = open(os.path.join(self.__job.workdir, self.__coprocess_stderr_name), 'wb') + out = open( + os.path.join(self.__job.workdir, self.__coprocess_stdout_name), "wb" + ) + err = open( + os.path.join(self.__job.workdir, self.__coprocess_stderr_name), "wb" + ) except IOError as error: - logger.warning(f'failed to open coprocess stdout/err: {error}') + logger.warning(f"failed to open coprocess stdout/err: {error}") out = None err = None else: out = None err = None try: - proc = execute(cmd, workdir=self.__job.workdir, returnproc=True, stdout=out, stderr=err, - usecontainer=False, cwd=self.__job.workdir, job=self.__job) + proc = execute( + cmd, + workdir=self.__job.workdir, + returnproc=True, + stdout=out, + stderr=err, + usecontainer=False, + cwd=self.__job.workdir, + job=self.__job, + ) except Exception as error: - logger.error(f'could not execute: {error}') + logger.error(f"could not execute: {error}") return None if isinstance(proc, tuple) and not proc[0]: - logger.error('failed to execute command') + logger.error("failed to execute command") return None - logger.info(f'started {label} -- pid={proc.pid} executable={cmd}') + logger.info(f"started {label} -- pid={proc.pid} executable={cmd}") return proc @@ -502,21 +565,29 @@ def run_payload(self, job: Any, cmd: str, out: Any, err: Any) -> Any: logger.info(f"\n\npayload execution command:\n\n{cmd}\n") try: - proc = execute(cmd, workdir=job.workdir, returnproc=True, - usecontainer=True, stdout=out, stderr=err, cwd=job.workdir, job=job) + proc = execute( + cmd, + workdir=job.workdir, + returnproc=True, + usecontainer=True, + stdout=out, + stderr=err, + cwd=job.workdir, + job=job, + ) except Exception as error: - logger.error(f'could not execute: {error}') + logger.error(f"could not execute: {error}") return None if isinstance(proc, tuple) and not proc[0]: - logger.error('failed to execute payload') + logger.error("failed to execute payload") return None - logger.info(f'started -- pid={proc.pid} executable={cmd}') + logger.info(f"started -- pid={proc.pid} executable={cmd}") job.pid = proc.pid job.pgrp = os.getpgid(job.pid) set_pilot_state(job=job, state="running") - #_cmd = self.utility_with_payload(job) + # _cmd = self.utility_with_payload(job) self.utility_after_payload_started(job) @@ -531,9 +602,10 @@ def extract_setup(self, cmd: str) -> str: :param cmd: payload command (str) :return: updated secondary command (str). """ + def cut_str_from(_cmd: str, _str: str) -> str: """Cut the string from the position of the given _cmd.""" - return _cmd[:_cmd.find(_str)] + return _cmd[: _cmd.find(_str)] def cut_str_from_last_semicolon(_cmd: str) -> str: """ @@ -543,16 +615,20 @@ def cut_str_from_last_semicolon(_cmd: str) -> str: """ # remove any trailing spaces and ;-signs _cmd = _cmd.strip() - _cmd = _cmd[:-1] if _cmd.endswith(';') else _cmd - last_bit = _cmd.split(';')[-1] - return _cmd.replace(last_bit.strip(), '') - - if '/' in self.__job.transformation: # e.g. 
http://pandaserver.cern.ch:25080/trf/user/runHPO-00-00-01 - trfname = self.__job.transformation[self.__job.transformation.rfind('/') + 1:] # runHPO-00-00-01 - _trf = './' + trfname + _cmd = _cmd[:-1] if _cmd.endswith(";") else _cmd + last_bit = _cmd.split(";")[-1] + return _cmd.replace(last_bit.strip(), "") + + if ( + "/" in self.__job.transformation + ): # e.g. http://pandaserver.cern.ch:25080/trf/user/runHPO-00-00-01 + trfname = self.__job.transformation[ + self.__job.transformation.rfind("/") + 1: + ] # runHPO-00-00-01 + _trf = "./" + trfname else: trfname = self.__job.transformation - _trf = './' + self.__job.transformation + _trf = "./" + self.__job.transformation if _trf in cmd: setup = cut_str_from(cmd, _trf) @@ -581,7 +657,7 @@ def wait_graceful(self, args: Any, proc: Any) -> int: for _ in range(60): if args.graceful_stop.is_set(): breaker = True - logger.info(f'breaking -- sending SIGTERM to pid={proc.pid}') + logger.info(f"breaking -- sending SIGTERM to pid={proc.pid}") os.killpg(os.getpgid(proc.pid), signal.SIGTERM) break exit_code = proc.poll() @@ -589,7 +665,9 @@ def wait_graceful(self, args: Any, proc: Any) -> int: break time.sleep(1) if breaker: - logger.info(f'breaking -- sleep 10 s before sending SIGKILL pid={proc.pid}') + logger.info( + f"breaking -- sleep 10 s before sending SIGKILL pid={proc.pid}" + ) time.sleep(10) proc.kill() break @@ -597,7 +675,9 @@ def wait_graceful(self, args: Any, proc: Any) -> int: exit_code = proc.poll() if iteration % 10 == 0: - logger.info(f'running: iteration={iteration} pid={proc.pid} exit_code={exit_code}') + logger.info( + f"running: iteration={iteration} pid={proc.pid} exit_code={exit_code}" + ) if exit_code is not None: break continue @@ -614,14 +694,18 @@ def get_payload_command(self, job: Any) -> str: cmd = "" # for testing looping job: cmd = user.get_payload_command(job) + ';sleep 240' try: - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) - cmd = user.get_payload_command(job) #+ 'sleep 900' # to test looping jobs + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) + cmd = user.get_payload_command(job) # + 'sleep 900' # to test looping jobs except PilotException as error: self.post_setup(job) logger.error(traceback.format_exc()) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) - self.__traces.pilot['error_code'] = job.piloterrorcodes[0] + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code( + error.get_error_code() + ) + self.__traces.pilot["error_code"] = job.piloterrorcodes[0] logger.fatal( f"could not define payload command (traces error set to: {self.__traces.pilot['error_code']})" ) @@ -648,20 +732,33 @@ def run_preprocess(self, job: Any): if cmd_before_payload: cmd_before_payload = job.setup + cmd_before_payload logger.info(f"\n\npreprocess execution command:\n\n{cmd_before_payload}\n") - exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') + exit_code = self.execute_utility_command( + cmd_before_payload, job, "preprocess" + ) if exit_code == 160: - logger.warning('no more HP points - time to abort processing loop') + logger.warning("no more HP points - time to abort processing loop") elif exit_code == 161: - logger.warning('no more HP points but at least one point was processed - time to abort processing loop') + logger.warning( + "no 
more HP points but at least one point was processed - time to abort processing loop" + ) elif exit_code == 162: - logger.warning('loop count reached the limit - time to abort processing loop') + logger.warning( + "loop count reached the limit - time to abort processing loop" + ) elif exit_code: # set error code - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) - logger.fatal(f'cannot continue since preprocess failed: exit_code={exit_code}') + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code( + errors.PREPROCESSFAILURE + ) + logger.fatal( + f"cannot continue since preprocess failed: exit_code={exit_code}" + ) else: # in case the preprocess produced a command, chmod it - path = os.path.join(job.workdir, job.containeroptions.get('containerExec', 'does_not_exist')) + path = os.path.join( + job.workdir, + job.containeroptions.get("containerExec", "does_not_exist"), + ) if os.path.exists(path): os.chmod(path, 0o755) @@ -673,8 +770,10 @@ def should_verify_setup(self): :return: should verify setup (bool). """ - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.setup', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.setup", globals(), locals(), [pilot_user], 0 + ) return user.should_verify_setup(self.__job) def run(self) -> (int, str): # noqa: C901 @@ -686,7 +785,7 @@ def run(self) -> (int, str): # noqa: C901 :return: exit code (int), diagnostics (str). """ - diagnostics = '' + diagnostics = "" # get the payload command from the user specific code self.pre_setup(self.__job) @@ -694,46 +793,58 @@ def run(self) -> (int, str): # noqa: C901 # get the user defined payload command cmd = self.get_payload_command(self.__job) if not cmd: - logger.warning('aborting run() since payload command could not be defined') - return errors.UNKNOWNPAYLOADFAILURE, 'undefined payload command' + logger.warning("aborting run() since payload command could not be defined") + return errors.UNKNOWNPAYLOADFAILURE, "undefined payload command" # extract the setup in case the preprocess command needs it self.__job.setup = self.extract_setup(cmd) # should the setup be verified? 
(user defined)
         verify_setup = self.should_verify_setup()
         if verify_setup:
-            logger.debug(f'extracted setup to be verified:\n\n{self.__job.setup}')
+            logger.debug(f"extracted setup to be verified:\n\n{self.__job.setup}")
             try:
                 _cmd = self.__job.setup
                 stdout_filename = os.path.join(self.__job.workdir, "setup.stdout")
                 stderr_filename = os.path.join(self.__job.workdir, "setup.stderr")
-                out = open(stdout_filename, 'wb')
-                err = open(stderr_filename, 'wb')
+                out = open(stdout_filename, "wb")
+                err = open(stderr_filename, "wb")

                 # remove any trailing spaces and ;-signs
                 _cmd = _cmd.strip()
-                trail = ';' if not _cmd.endswith(';') else ''
-                _cmd += trail + 'echo \"Done.\"'
-                exit_code, stdout, stderr = execute(_cmd, workdir=self.__job.workdir, returnproc=False, usecontainer=True,
-                                                    stdout=out, stderr=err, cwd=self.__job.workdir, job=self.__job)
+                trail = ";" if not _cmd.endswith(";") else ""
+                _cmd += trail + 'echo "Done."'
+                exit_code, stdout, stderr = execute(
+                    _cmd,
+                    workdir=self.__job.workdir,
+                    returnproc=False,
+                    usecontainer=True,
+                    stdout=out,
+                    stderr=err,
+                    cwd=self.__job.workdir,
+                    job=self.__job,
+                )
                 if exit_code:
-                    logger.warning(f'setup returned exit code={exit_code}')
-                    diagnostics = stderr + stdout if stdout and stderr else ''
+                    logger.warning(f"setup returned exit code={exit_code}")
+                    diagnostics = stderr + stdout if stdout and stderr else ""
                     if not diagnostics:
                         stdout = read_file(stdout_filename)
                         stderr = read_file(stderr_filename)
-                        diagnostics = stderr + stdout if stdout and stderr else 'General payload setup verification error (check setup logs)'
+                        diagnostics = (
+                            stderr + stdout
+                            if stdout and stderr
+                            else "General payload setup verification error (check setup logs)"
+                        )
                     # check for special errors in the output
                     exit_code = errors.resolve_transform_error(exit_code, diagnostics)
                     diagnostics = errors.format_diagnostics(exit_code, diagnostics)
                     return exit_code, diagnostics
                 if out:
                     out.close()
-                    logger.debug(f'closed {stdout_filename}')
+                    logger.debug(f"closed {stdout_filename}")
                 if err:
                     err.close()
-                    logger.debug(f'closed {stderr_filename}')
+                    logger.debug(f"closed {stderr_filename}")
             except Exception as error:
-                diagnostics = f'could not execute: {error}'
+                diagnostics = f"could not execute: {error}"
                 logger.error(diagnostics)
                 return errors.PAYLOADEXECUTIONEXCEPTION, diagnostics

@@ -743,10 +854,9 @@ def run(self) -> (int, str):  # noqa: C901
         # abort when nothing more to run, or when the preprocess returns a special exit code
         iteration = 0
         while True:
-
-            logger.info(f'payload iteration loop #{iteration + 1}')
-            os.environ['PILOT_EXEC_ITERATION_COUNT'] = f'{iteration}'
-            #if self.__args.debug:
+            logger.info(f"payload iteration loop #{iteration + 1}")
+            os.environ["PILOT_EXEC_ITERATION_COUNT"] = f"{iteration}"
+            # if self.__args.debug:
             #    show_memory_usage()

             # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd
@@ -758,22 +868,24 @@ def run(self) -> (int, str):  # noqa: C901
                     exit_code = 0
                 # wipe the output file list since there won't be any new files
                 # any output files from previous iterations should have been transferred already
-                logger.debug('reset outdata since further output should not be expected after preprocess exit')
+                logger.debug(
+                    "reset outdata since further output should not be expected after preprocess exit"
+                )
                 self.__job.outdata = []
                 break

             if jobparams_pre != jobparams_post:
-                logger.debug('jobparams were updated by utility_before_payload()')
+                logger.debug("jobparams were updated by utility_before_payload()")
                 # must
update cmd cmd = cmd.replace(jobparams_pre, jobparams_post) # now run the main payload, when it finishes, run the postprocess (if necessary) # note: no need to run any main payload in HPO Horovod jobs on Kubernetes - if os.environ.get('HARVESTER_HOROVOD', '') == '': + if os.environ.get("HARVESTER_HOROVOD", "") == "": proc = self.run_payload(self.__job, cmd, self.__out, self.__err) if not proc: # something went wrong, abort exit_code = errors.UNKNOWNEXCEPTION - diagnostics = 'command execution failed, check log for clues' + diagnostics = "command execution failed, check log for clues" break else: proc = None @@ -781,9 +893,11 @@ def run(self) -> (int, str): # noqa: C901 proc_co = None if proc is None: # run the post-process command even if there was no main payload - if os.environ.get('HARVESTER_HOROVOD', '') != '': - logger.info('no need to execute any main payload') - exit_code = self.run_utility_after_payload_finished(exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2) + if os.environ.get("HARVESTER_HOROVOD", "") != "": + logger.info("no need to execute any main payload") + exit_code = self.run_utility_after_payload_finished( + exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2 + ) self.post_payload(self.__job) else: break @@ -793,44 +907,60 @@ def run(self) -> (int, str): # noqa: C901 send_state(self.__job, self.__args, self.__job.state) # note: when sending a state change to the server, the server might respond with 'tobekilled' - if self.__job.state == 'failed': - logger.warning('job state is \'failed\' - abort payload and run()') + if self.__job.state == "failed": + logger.warning("job state is 'failed' - abort payload and run()") kill_processes(proc.pid) break # allow for a secondary command to be started after the payload (e.g. a coprocess) - utility_cmd = self.get_utility_command(order=UTILITY_AFTER_PAYLOAD_STARTED2) + utility_cmd = self.get_utility_command( + order=UTILITY_AFTER_PAYLOAD_STARTED2 + ) if utility_cmd: - logger.debug(f'starting utility command: {utility_cmd}') - label = 'coprocess' if 'coprocess' in utility_cmd else None + logger.debug(f"starting utility command: {utility_cmd}") + label = "coprocess" if "coprocess" in utility_cmd else None proc_co = self.run_command(utility_cmd, label=label) - logger.info('will wait for graceful exit') + logger.info("will wait for graceful exit") exit_code = self.wait_graceful(self.__args, proc) # reset error if Raythena decided to kill payload (no error) if errors.KILLPAYLOAD in self.__job.piloterrorcodes: - logger.debug('ignoring KILLPAYLOAD error') - self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.remove_error_code(errors.KILLPAYLOAD, - pilot_error_codes=self.__job.piloterrorcodes, - pilot_error_diags=self.__job.piloterrordiags) + logger.debug("ignoring KILLPAYLOAD error") + ( + self.__job.piloterrorcodes, + self.__job.piloterrordiags, + ) = errors.remove_error_code( + errors.KILLPAYLOAD, + pilot_error_codes=self.__job.piloterrorcodes, + pilot_error_diags=self.__job.piloterrordiags, + ) exit_code = 0 - state = 'finished' + state = "finished" else: - state = 'finished' if exit_code == 0 else 'failed' + state = "finished" if exit_code == 0 else "failed" set_pilot_state(job=self.__job, state=state) - logger.info(f'\n\nfinished pid={proc.pid} exit_code={exit_code} state={self.__job.state}\n') + logger.info( + f"\n\nfinished pid={proc.pid} exit_code={exit_code} state={self.__job.state}\n" + ) # stop the utility command (e.g. 
a coprocess) if necessary
             if proc_co:
-                logger.debug(f'stopping utility command: {utility_cmd}')
+                logger.debug(f"stopping utility command: {utility_cmd}")
                 kill_processes(proc_co.pid)

             if exit_code is None:
-                logger.warning('detected unset exit_code from wait_graceful - reset to -1')
+                logger.warning(
+                    "detected unset exit_code from wait_graceful - reset to -1"
+                )
                 exit_code = -1

-            for order in (UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2):
-                exit_code = self.run_utility_after_payload_finished(exit_code, state, order)
+            for order in (
+                UTILITY_AFTER_PAYLOAD_FINISHED,
+                UTILITY_AFTER_PAYLOAD_FINISHED2,
+            ):
+                exit_code = self.run_utility_after_payload_finished(
+                    exit_code, state, order
+                )

             # keep track of post-payload timing
             self.post_payload(self.__job)
@@ -843,16 +973,18 @@
             if self.__job.utilities != {}:
                 self.stop_utilities()

-            if self.__job.is_hpo and state != 'failed':
+            if self.__job.is_hpo and state != "failed":
                 # in case there are more hyper-parameter points, move away the previous log files
-                #self.rename_log_files(iteration)
+                # self.rename_log_files(iteration)
                 iteration += 1
             else:
                 break

         return exit_code, diagnostics

-    def run_utility_after_payload_finished(self, exit_code: int, state: str, order: str) -> int:
+    def run_utility_after_payload_finished(
+        self, exit_code: int, state: str, order: str
+    ) -> int:
         """
         Run utility command after the main payload has finished.

@@ -866,18 +998,30 @@ def run_utility_after_payload_finished(self, exit_code: int, state: str, order:
         """
         _exit_code = 0
         try:
-            cmd_after_payload, label, ignore_failure = self.utility_after_payload_finished(self.__job, order)
+            (
+                cmd_after_payload,
+                label,
+                ignore_failure,
+            ) = self.utility_after_payload_finished(self.__job, order)
         except Exception as error:
             logger.error(error)
             ignore_failure = False
         else:
-            if cmd_after_payload and self.__job.postprocess and state != 'failed':
+            if cmd_after_payload and self.__job.postprocess and state != "failed":
                 cmd_after_payload = self.__job.setup + cmd_after_payload
-                logger.info(f"\n\npostprocess execution command:\n\n{cmd_after_payload}\n")
-                _exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label)
+                logger.info(
+                    f"\n\npostprocess execution command:\n\n{cmd_after_payload}\n"
+                )
+                _exit_code = self.execute_utility_command(
+                    cmd_after_payload, self.__job, label
+                )
             elif cmd_after_payload:
-                logger.info(f"\n\npostprocess execution command:\n\n{cmd_after_payload}\n")
-                _exit_code = self.execute_utility_command(cmd_after_payload, self.__job, label)
+                logger.info(
+                    f"\n\npostprocess execution command:\n\n{cmd_after_payload}\n"
+                )
+                _exit_code = self.execute_utility_command(
+                    cmd_after_payload, self.__job, label
+                )

         # only set a new non-zero exit code if exit_code was not already set and ignore_failure is False
         # (e.g.
any Xcache failure should be ignored to prevent job from failing since exit_code might get updated) @@ -888,22 +1032,28 @@ def run_utility_after_payload_finished(self, exit_code: int, state: str, order: def stop_utilities(self): """Stop any running utilities.""" - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) + pilot_user = os.environ.get("PILOT_USER", "generic").lower() + user = __import__( + f"pilot.user.{pilot_user}.common", globals(), locals(), [pilot_user], 0 + ) for utcmd in list(self.__job.utilities.keys()): utproc = self.__job.utilities[utcmd][0] if utproc: # get the pid for prmon - _list = self.__job.utilities.get('MemoryMonitor') + _list = self.__job.utilities.get("MemoryMonitor") if _list: pid = int(_list[-1]) if len(_list) == 4 else utproc.pid - logger.info(f'using pid={pid} to kill prmon') + logger.info(f"using pid={pid} to kill prmon") else: - logger.warning(f'did not find the pid for the memory monitor in the utilities list: {self.__job.utilities}') + logger.warning( + f"did not find the pid for the memory monitor in the utilities list: {self.__job.utilities}" + ) pid = utproc.pid status = self.kill_and_wait_for_process(pid, user, utcmd) - logger.info(f'utility process {utproc.pid} cleanup finished with status={status}') + logger.info( + f"utility process {utproc.pid} cleanup finished with status={status}" + ) user.post_utility_command_action(utcmd, self.__job) def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: @@ -916,7 +1066,7 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: :return: process exit status (int or None). """ sig = user.get_utility_command_kill_signal(utcmd) - logger.info(f"stopping utility process \'{utcmd}\' with signal {sig}") + logger.info(f"stopping utility process '{utcmd}' with signal {sig}") try: # Send SIGUSR1 signal to the process @@ -929,15 +1079,15 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: # Check the exit status of the process if os.WIFEXITED(status): - logger.debug('normal exit') + logger.debug("normal exit") return os.WEXITSTATUS(status) # Handle abnormal termination if needed - logger.warning('abnormal termination') + logger.warning("abnormal termination") return None # Process doesn't exist - ignore - logger.info(f'process {pid} no longer exists') + logger.info(f"process {pid} no longer exists") return True except OSError as exc: @@ -945,6 +1095,7 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: logger.warning(f"Error sending signal to/waiting for process {pid}: {exc}") return None + # try: # # Send SIGUSR1 signal to the process # os.kill(pid, sig) From 09aef380f79be30220545e7ee3fbf3f2fff53524 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 May 2024 12:19:19 +0200 Subject: [PATCH 33/41] Pylint updates (W0102 warning) --- pilot/control/data.py | 64 +++++++----- pilot/copytool/rucio.py | 8 +- pilot/info/basedata.py | 6 +- pilot/info/dataloader.py | 6 +- pilot/info/extinfo.py | 24 +++-- pilot/info/infoservice.py | 21 ++-- pilot/info/jobinfo.py | 9 +- pilot/user/atlas/common.py | 5 +- pilot/user/atlas/cpu.py | 28 ++--- pilot/user/atlas/jobmetrics.py | 29 ++++-- pilot/user/generic/common.py | 170 ++++++++++++++----------------- pilot/user/generic/cpu.py | 21 ++-- pilot/user/generic/jobmetrics.py | 14 ++- pilot/user/rubin/common.py | 73 ++++++------- pilot/user/rubin/cpu.py | 26 ++--- 
pilot/user/rubin/jobmetrics.py | 16 ++- pilot/user/sphenix/common.py | 34 +++---- pilot/user/sphenix/cpu.py | 6 +- pilot/user/sphenix/jobmetrics.py | 6 +- pilot/user/sphenix/utilities.py | 10 +- pilot/util/filestate.py | 8 +- pilot/util/harvester.py | 8 +- pilot/util/https.py | 16 ++- pilot/util/jobmetrics.py | 8 +- 24 files changed, 346 insertions(+), 270 deletions(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index 9c8577ff..69a4561a 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -27,6 +27,7 @@ import os import time +import traceback import queue from typing import Any from pathlib import Path @@ -75,6 +76,10 @@ find_files_with_pattern, rename_xrdlog ) +from pilot.util.middleware import ( + containerise_middleware, + use_middleware_script +) from pilot.util.processes import threads_aborted from pilot.util.queuehandling import ( declare_failed_by_kill, @@ -102,7 +107,7 @@ def control(queues: Any, traces: Any, args: Any): threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 - [thread.start() for thread in threads] + _ = [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function try: @@ -114,7 +119,8 @@ def control(queues: Any, traces: Any, args: Any): except queue.Empty: pass else: - exc_type, exc_obj, exc_trace = exc + # exc_type, exc_obj, exc_trace = exc + _, exc_obj, _ = exc logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception @@ -260,18 +266,17 @@ def _stage_in(args: Any, job: Any) -> bool: label = 'stage-in' # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)? 
- use_container = pilot.util.middleware.use_middleware_script(job.infosys.queuedata.container_type.get("middleware")) + use_container = use_middleware_script(job.infosys.queuedata.container_type.get("middleware")) if use_container: logger.info('stage-in will be done in a container') try: eventtype, localsite, remotesite = get_trace_report_variables(job, label=label) - pilot.util.middleware.containerise_middleware(job, args, job.indata, eventtype, localsite, remotesite, - job.infosys.queuedata.container_options, label=label, - container_type=job.infosys.queuedata.container_type.get("middleware")) + containerise_middleware(job, args, job.indata, eventtype, localsite, remotesite, + job.infosys.queuedata.container_options, label=label, + container_type=job.infosys.queuedata.container_type.get("middleware")) except PilotException as error: logger.warning('stage-in containerisation threw a pilot exception: %s', error) except Exception as error: - import traceback logger.warning('stage-in containerisation threw an exception: %s', error) logger.error(traceback.format_exc()) else: @@ -298,7 +303,6 @@ def _stage_in(args: Any, job: Any) -> bool: client.prepare_sources(job.indata) client.transfer(job.indata, activity=activity, **kwargs) except PilotException as error: - import traceback error_msg = traceback.format_exc() logger.error(error_msg) msg = errors.format_diagnostics(error.get_error_code(), error_msg) @@ -322,7 +326,10 @@ def _stage_in(args: Any, job: Any) -> bool: add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args) remain_files = [infile for infile in job.indata if infile.status not in ['remote_io', 'transferred', 'no_transfer']] - logger.info("stage-in finished") if not remain_files else logger.info("stage-in failed") + if not remain_files: + logger.info("stage-in finished") + else: + logger.info("stage-in failed") os.environ['PILOT_JOB_STATE'] = 'stageincompleted' return not remain_files @@ -698,8 +705,22 @@ def get_input_file_dictionary(indata: list) -> dict: return ret -def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool, input_files: list = [], - output_files: list = [], piloterrors: list = [], debugmode: bool = False): +def remove_input_output_files(workdir: str, files: list): + """ + Remove input/output files. + + :param workdir: work directory (str) + :param files: list of files to remove (list). + """ + for fname in files: + path = os.path.join(workdir, fname) + if os.path.exists(path): + logger.info(f'removing file: {path}') + remove(path) + + +def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool, # noqa: C901 + input_files: list = None, output_files: list = None, piloterrors: list = None, debugmode: bool = False): # noqa: C901 """ Create the tarball for the job. @@ -713,8 +734,13 @@ def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool :param debugmode: True if debug mode has been switched on (bool) :raises LogFileCreationFailure: in case of log file creation problem. 
""" + if input_files is None: + input_files = [] + if output_files is None: + output_files = [] + if piloterrors is None: + piloterrors = [] logger.debug(f'preparing to create log file (debug mode={debugmode})') - logger.debug(f'workdir: {workdir}') # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir) pilot_home = os.environ.get('PILOT_HOME', os.getcwd()) @@ -722,8 +748,6 @@ def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool if pilot_home != current_dir: os.chdir(pilot_home) - logger.debug(f'current_dir: {current_dir}') - # copy special files if they exist (could be made experiment specific if there's a need for it) copy_special_files(workdir) @@ -734,26 +758,18 @@ def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool user.remove_redundant_files(workdir, piloterrors=piloterrors, debugmode=debugmode) # remove any present input/output files before tarring up workdir - for fname in input_files + output_files: - path = os.path.join(workdir, fname) - if os.path.exists(path): - logger.info(f'removing file: {path}') - remove(path) + remove_input_output_files(workdir, input_files + output_files) if logfile_name is None or len(logfile_name.strip('/ ')) == 0: logger.info('skipping tarball creation, since the logfile_name is empty') return - # rename the workdir for the tarball creation newworkdir = os.path.join(os.path.dirname(workdir), tarball_name) orgworkdir = workdir os.rename(workdir, newworkdir) workdir = newworkdir - - # get the size of the workdir dirsize = get_directory_size(workdir) timeout = get_tar_timeout(dirsize) - fullpath = os.path.join(current_dir, logfile_name) # /some/path/to/dirname/log.tgz logger.info(f'will create archive {fullpath} using timeout={timeout} s for directory size={dirsize} MB') @@ -900,13 +916,11 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) client.transfer(xdata, activity, **kwargs) except PilotException as error: - import traceback error_msg = traceback.format_exc() logger.error(error_msg) msg = errors.format_diagnostics(error.get_error_code(), error_msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=msg) except Exception: - import traceback logger.error(traceback.format_exc()) # do not raise the exception since that will prevent also the log from being staged out # error = PilotException("stageOut failed with error=%s" % e, code=ErrorCodes.STAGEOUTFAILED) diff --git a/pilot/copytool/rucio.py b/pilot/copytool/rucio.py index 8cd2dcdf..786da3db 100644 --- a/pilot/copytool/rucio.py +++ b/pilot/copytool/rucio.py @@ -27,8 +27,6 @@ """Rucio copy tool.""" -from __future__ import absolute_import # Python 2 (2to3 complains about this) - import json import logging import os @@ -573,7 +571,7 @@ def _stage_in_api(dst: str, fspec: Any, trace_report: dict, trace_report_out: li return ec, trace_report_out -def _stage_in_bulk(dst: str, files: list, trace_report_out: list = [], trace_common_fields: dict = {}, +def _stage_in_bulk(dst: str, files: list, trace_report_out: list = None, trace_common_fields: dict = None, rucio_host: str = ''): """ Stage-in files in bulk using the Rucio API. @@ -585,6 +583,10 @@ def _stage_in_bulk(dst: str, files: list, trace_report_out: list = [], trace_com :param rucio_host: optional rucio host (string). 
:raises Exception: download_client.download_pfns exception. """ + if trace_report_out is None: + trace_report_out = [] + if trace_common_fields is None: + trace_common_fields = {} # init. download client from rucio.client import Client from rucio.client.downloadclient import DownloadClient diff --git a/pilot/info/basedata.py b/pilot/info/basedata.py index 81676f4c..519b0a1e 100644 --- a/pilot/info/basedata.py +++ b/pilot/info/basedata.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """ Base data class. @@ -54,7 +54,7 @@ class BaseData: _keys = {} - def _load_data(self, data: dict, kmap: dict = {}, validators: dict = None): + def _load_data(self, data: dict, kmap: dict = None, validators: dict = None): """ Construct and initialize data from ext source. @@ -62,6 +62,8 @@ def _load_data(self, data: dict, kmap: dict = {}, validators: dict = None): :param kmap: the translation map of data attributes from external format to internal schema (dict) :param validators: map of validation handlers to be applied (dict). """ + if kmap is None: + kmap = {} # the translation map of the queue data attributes from external data to internal schema # 'internal_name':('ext_name1', 'extname2_if_any') # 'internal_name2':'ext_name3' diff --git a/pilot/info/dataloader.py b/pilot/info/dataloader.py index 9acfc753..530b8480 100644 --- a/pilot/info/dataloader.py +++ b/pilot/info/dataloader.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """ Base loader class to retrieve data from Ext sources (file, url). @@ -218,7 +218,7 @@ def jsonparser(c): return None -def merge_dict_data(dic1: dict, dic2: dict, keys: list = [], common: bool = True, left: bool = True, +def merge_dict_data(dic1: dict, dic2: dict, keys: list = None, common: bool = True, left: bool = True, right: bool = True, rec: bool = False) -> dict: """ Recursively merge two dictionary objects. @@ -234,6 +234,8 @@ def merge_dict_data(dic1: dict, dic2: dict, keys: list = [], common: bool = True :param rec: if True then merge recursively (bool) :return: merged dictionary (dict). """ + if keys is None: + keys = [] ### TODO: verify and configure logic later if not (isinstance(dic1, dict) and isinstance(dic2, dict)): diff --git a/pilot/info/extinfo.py b/pilot/info/extinfo.py index fda96cb6..beb4fc15 100644 --- a/pilot/info/extinfo.py +++ b/pilot/info/extinfo.py @@ -16,8 +16,8 @@ # under the License. # # Authors: -# - Alexey Anisenkov, anisyonk@cern.ch, 2018-21 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Alexey Anisenkov, anisyonk@cern.ch, 2018-2021 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """ Information provider from external source(s). @@ -59,7 +59,7 @@ def __init__(self, cache_time: int = 60): self.cache_time = cache_time @classmethod - def load_schedconfig_data(cls, pandaqueues: list = [], priority: list = [], cache_time: int = 60) -> dict: + def load_schedconfig_data(cls, pandaqueues: list = None, priority: list = None, cache_time: int = 60) -> dict: """ Download the (CRIC-extended) data associated to PandaQueue from various sources (prioritized). @@ -71,6 +71,10 @@ def load_schedconfig_data(cls, pandaqueues: list = [], priority: list = [], cach :param cache_time: default cache time in seconds (int). :return: dict of schedconfig settings by PandaQueue name as a key (dict). 
""" + if pandaqueues is None: + pandaqueues = [] + if priority is None: + priority = [] pandaqueues = sorted(set(pandaqueues)) cache_dir = config.Information.cache_dir @@ -121,7 +125,7 @@ def get_cvmfs_path(url: str, fname: str) -> str: return cvmfs_path @classmethod - def load_queuedata(cls, pandaqueue: str, priority: list = [], cache_time: int = 60) -> dict: + def load_queuedata(cls, pandaqueue: str, priority: list = None, cache_time: int = 60) -> dict: """ Download the queuedata from various sources (prioritized). @@ -135,6 +139,8 @@ def load_queuedata(cls, pandaqueue: str, priority: list = [], cache_time: int = :return: dict of queuedata settings by PandaQueue name as a key (dict) :raises PilotException: in case of error. """ + if priority is None: + priority = [] if not pandaqueue: raise PilotException('load_queuedata(): pandaqueue name is not specififed', code=ErrorCodes.QUEUEDATA) @@ -195,7 +201,7 @@ def jsonparser_panda(dat: Any) -> dict: return cls.load_data(sources, priority, cache_time) @classmethod - def load_storage_data(cls, ddmendpoints: list = [], priority: list = [], cache_time: int = 60) -> dict: + def load_storage_data(cls, ddmendpoints: list = None, priority: list = None, cache_time: int = 60) -> dict: """ Download DDM Storages details by given name (DDMEndpoint) from various sources (prioritized). @@ -206,6 +212,10 @@ def load_storage_data(cls, ddmendpoints: list = [], priority: list = [], cache_t :param cache_time: default cache time in seconds (int) :return: dictionary of DDMEndpoint settings by DDMendpoint name as a key (dict). """ + if ddmendpoints is None: + ddmendpoints = [] + if priority is None: + priority = [] ddmendpoints = sorted(set(ddmendpoints)) cache_dir = config.Information.cache_dir @@ -268,12 +278,14 @@ def resolve_queuedata(self, pandaqueue: str, schedconf_priority: list = None) -> # merge return merge_dict_data(r, master_data) - def resolve_storage_data(self, ddmendpoints: list = []) -> dict: + def resolve_storage_data(self, ddmendpoints: list = None) -> dict: """ Resolve final DDM Storages details by given names (DDMEndpoint). :param ddmendpoints: list of ddmendpoint names (list) :return: dictionary of settings for given DDMEndpoint as a key (dict). """ + if ddmendpoints is None: + ddmendpoints = [] # load ddmconf settings return self.load_storage_data(ddmendpoints, cache_time=self.cache_time) ## use default priority diff --git a/pilot/info/infoservice.py b/pilot/info/infoservice.py index b9c7dd72..5162f6fd 100644 --- a/pilot/info/infoservice.py +++ b/pilot/info/infoservice.py @@ -17,8 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 - +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """ Info Service module. @@ -141,7 +140,7 @@ def whoami(cls): return inspect.stack()[1][3] @classmethod - def _resolve_data(cls, fname: Any, providers: list = [], args: list = [], kwargs: dict = {}, merge: bool = False) -> Any: + def _resolve_data(cls, fname: Any, providers: Any = None, args: list = None, kwargs: dict = None, merge: bool = False) -> Any: """ Resolve data by calling function `fname` of passed provider objects. 
@@ -150,12 +149,18 @@ def _resolve_data(cls, fname: Any, providers: list = [], args: list = [], kwargs and resolve data by execution function `fname` with passed arguments `args` and `kwargs` :param fname: name of function to be called (Any) - :param providers: list of provider objects (list) + :param providers: list or tuple of provider objects (Any) :param args: list of arguments to be passed to function (list) :param kwargs: list of keyword arguments to be passed to function (dict) :param merge: if True then merge data from all providers (bool) :return: The result of first successful execution will be returned (Any). """ + if providers is None: + providers = [] + if args is None: + args = [] + if kwargs is None: + kwargs = {} ret = None if merge: providers = list(providers) @@ -196,7 +201,7 @@ def resolve_queuedata(self, pandaqueue: str) -> Any: ## high level API return cache.get(pandaqueue) #@require_init - def resolve_storage_data(self, ddmendpoints: list = []) -> dict: ## high level API + def resolve_storage_data(self, ddmendpoints: list = None) -> dict: ## high level API """ Resolve final full storage data details. @@ -204,6 +209,8 @@ def resolve_storage_data(self, ddmendpoints: list = []) -> dict: ## high level :return: dictionary of DDMEndpoint settings by DDMEndpoint name as a key (dict) :raises PilotException: in case of error. """ + if ddmendpoints is None: + ddmendpoints = [] if isinstance(ddmendpoints, str): ddmendpoints = [ddmendpoints] @@ -252,12 +259,14 @@ def resolve_schedconf_sources(self) -> Any: ## high level API # # look up priority order: either from job, local config, extinfo provider # return self._resolve_data(self.whoami(), providers=(self.confinfo, self.jobinfo, self.extinfo), args=[name]) - def resolve_ddmendpoint_storageid(self, ddmendpoint: list = []): + def resolve_ddmendpoint_storageid(self, ddmendpoint: list = None): """ Resolve the map between ddmendpoint and storage_id. :param ddmendpoint: ddmendpoint name (list). 
""" + if ddmendpoint is None: + ddmendpoint = [] if not ddmendpoint or ddmendpoint not in self.ddmendpoint2storage_id: storages = self.resolve_storage_data(ddmendpoint) for storage_name, storage in storages.items(): diff --git a/pilot/info/jobinfo.py b/pilot/info/jobinfo.py index 1f37a6c4..af9562c9 100644 --- a/pilot/info/jobinfo.py +++ b/pilot/info/jobinfo.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """ @@ -33,7 +33,7 @@ logger = logging.getLogger(__name__) -class JobInfoProvider(object): +class JobInfoProvider: """ Job info provider which is used to extract settings specific for given Job and overwrite general configuration used by Information Service @@ -85,12 +85,13 @@ def resolve_queuedata(self, pandaqueue, **kwargs): return {pandaqueue: data} - def resolve_storage_data(self, ddmendpoints=[], **kwargs): + def resolve_storage_data(self, ddmendpoints: list = None, **kwargs: dict) -> dict: """ Resolve Job specific settings for storage data (including data passed via --overwriteStorageData) :return: dict of settings for requested DDMEndpoints with ddmendpoin as a key """ - + if ddmendpoints is None: + ddmendpoints = [] data = {} ## use job.overwrite_storagedata as a master source diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 8f38d37c..a09fbfc9 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -2077,7 +2077,7 @@ def remove_special_files(workdir: str, dir_list: list): remove_dir_tree(item) -def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = [], debugmode: bool = False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. @@ -2090,7 +2090,8 @@ def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: """ if outputfiles is None: outputfiles = [] - + if piloterrors is None: + piloterrors = [] logger.debug("removing redundant files prior to log creation") workdir = os.path.abspath(workdir) diff --git a/pilot/user/atlas/cpu.py b/pilot/user/atlas/cpu.py index ac2e8c7f..25e7e6c7 100644 --- a/pilot/user/atlas/cpu.py +++ b/pilot/user/atlas/cpu.py @@ -17,26 +17,29 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 + +""" CPU related functionality.""" -import os import logging +import os +from typing import Any # from .utilities import get_memory_values #from pilot.util.container import execute from pilot.util.math import float_to_rounded_string from .utilities import get_memory_values + logger = logging.getLogger(__name__) -def get_core_count(job): +def get_core_count(job: Any) -> int: """ Return the core count from ATHENA_PROC_NUMBER. - :param job: job object. + :param job: job object (Any) :return: core count (int). """ - if "HPC_HPC" in job.infosys.queuedata.catchall: if job.corecount is None: job.corecount = 0 @@ -58,30 +61,27 @@ def get_core_count(job): return job.corecount -def add_core_count(corecount, core_counts=[]): +def add_core_count(corecount: int, core_counts: list = None) -> list: """ Add a core count measurement to the list of core counts. - :param corecount: current actual core count (int). - :param core_counts: list of core counts (list). 
+ :param corecount: current actual core count (int) + :param core_counts: list of core counts (list) :return: updated list of core counts (list). """ - - if core_counts is None: # protection + if core_counts is None: core_counts = [] core_counts.append(corecount) return core_counts -def set_core_counts(**kwargs): +def set_core_counts(**kwargs: dict): """ Set the number of used cores. - :param kwargs: kwargs (dictionary). - :return: + :param kwargs: kwargs (dict). """ - # something like this could be used if prmon also gave info about ncores # (change nprocs -> ncores and add ncores to list in utilities module, get_average_summary_dictionary_prmon()) diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index 6115ecdc..4b030183 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Functions for building job metrics.""" @@ -49,7 +49,7 @@ logger = logging.getLogger(__name__) -def get_job_metrics_string(job: Any, extra: dict = {}) -> str: +def get_job_metrics_string(job: Any, extra: dict = None) -> str: """ Get the job metrics string. @@ -57,6 +57,8 @@ def get_job_metrics_string(job: Any, extra: dict = {}) -> str: :param extra: any extra information to be added (dict) :return: job metrics (str). """ + if extra is None: + extra = {} job_metrics = "" # report core count (will also set corecount in job object) @@ -146,7 +148,7 @@ def get_trace_exit_code(workdir: str) -> str: return trace_exit_code -def add_features(job_metrics: str, corecount: int, add: list = []) -> str: +def add_features(job_metrics: str, corecount: int, add: list = None) -> str: """ Add job and machine feature data to the job metrics if available. @@ -157,13 +159,18 @@ def add_features(job_metrics: str, corecount: int, add: list = []) -> str: :param add: features to be added (list) :return: updated job metrics (str). """ + if add is None: + add = [] if job_metrics and not job_metrics.endswith(' '): job_metrics += ' ' - def add_sub_features(features_dic, add=[]): + def add_sub_features(features_dic: dict, _add: list = None): + """Helper function.""" + if _add is None: + _add = [] features_str = '' for key in features_dic.keys(): - if add and key not in add: + if _add and key not in _add: continue value = features_dic.get(key, None) if value: @@ -185,7 +192,7 @@ def add_sub_features(features_dic, add=[]): logger.warning(f'cannot process hs06 machine feature: {exc} (hs06={hs06}, total_cpu={total_cpu}, corecount={corecount})') features_list = [machinefeatures, jobfeatures] for feature_item in features_list: - features_str = add_sub_features(feature_item, add=add) + features_str = add_sub_features(feature_item, _add=add) if features_str: job_metrics += features_str @@ -241,9 +248,10 @@ def add_event_number(job_metrics: str, workdir: str) -> str: return job_metrics -def get_job_metrics(job: Any, extra: dict = {}) -> str: +def get_job_metrics(job: Any, extra: dict = None) -> str: """ Return a properly formatted job metrics string. + The format of the job metrics string is defined by the server. It will be reported to the server during updateJob. 
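The `add` to `_add` rename inside `add_features()` above removes a shadowing hazard: when the nested helper's parameter and the enclosing function's parameter share the name `add`, the inner name hides the outer one, and a forgotten forward silently falls back to the helper's default. A toy sketch (hypothetical names, not Pilot code) of the hazard and of the explicit forwarding the patch uses:

    def outer(add=None):
        def helper(add=None):   # inner "add" hides the outer argument
            return add
        return helper()         # forwarding forgotten: always returns None

    def outer_fixed(add=None):
        def helper(_add=None):  # distinct name, shadowing is impossible
            return _add
        return helper(_add=add)  # forwarding is explicit, as in add_features()

    print(outer([1]))        # None - the outer argument was silently dropped
    print(outer_fixed([1]))  # [1]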
Example of job metrics: @@ -251,11 +259,12 @@ def get_job_metrics(job: Any, extra: dict = {}) -> str: Format: nEvents= nEventsW= vmPeakMax= vmPeakMean= RSSMean= hs06= shutdownTime= cpuFactor= cpuLimit= diskLimit= jobStart= memLimit= runLimit= - :param job: job object + :param job: job object (Any) :param extra: any extra information to be added (dict) - :return: job metrics (string). + :return: job metrics (str). """ - + if extra is None: + extra = {} # get job metrics string job_metrics = get_job_metrics_string(job, extra=extra) diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index a76c9ee5..ec1d3212 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -17,10 +17,14 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +"""Generic user specific functionality.""" + +import logging import os from signal import SIGTERM +from typing import Any from pilot.common.exception import TrfDownloadFailure from pilot.util.config import config @@ -28,44 +32,42 @@ from pilot.util.filehandling import read_file from .setup import get_analysis_trf -import logging logger = logging.getLogger(__name__) -def sanity_check(): +def sanity_check() -> int: """ Perform an initial sanity check before doing anything else in a given workflow. + This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort the pilot if a problem is discovered early. - :return: exit code (0 if all is ok, otherwise non-zero exit code). + :return: exit code (0 if all is ok, otherwise non-zero exit code) (int). """ - return 0 -def validate(job): +def validate(job: Any) -> bool: """ Perform user specific payload/job validation. - :param job: job object. - :return: Boolean (True if validation is successful). + :param job: job object (Any) + :return: True if validation is successful (bool). """ - return True -def get_payload_command(job): +def get_payload_command(job: Any) -> str: """ - Return the full command for executing the payload, including the sourcing of all setup files and setting of - environment variables. + Return the full command for executing the payload + The returned command string includes the sourcing of all setup files and setting of + environment variables. By default, the full payload command is assumed to be in the job.jobparams. - :param job: job object - :return: command (string) + :param job: job object (Any) + :return: command (str). """ - # Try to download the trf # if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") @@ -79,17 +81,16 @@ def get_payload_command(job): return get_analysis_run_command(job, trf_name) -def get_analysis_run_command(job, trf_name): +def get_analysis_run_command(job: Any, trf_name: str) -> str: """ Return the proper run command for the user job. Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn - :param job: job object. - :param trf_name: name of the transform that will run the job (string). Used when containers are not used. - :return: command (string). + :param job: job object (Any) + :param trf_name: name of the transform that will run the job (string). Used when containers are not used (str) + :return: command (str). 
""" - cmd = "" # add the user proxy @@ -108,35 +109,36 @@ def get_analysis_run_command(job, trf_name): return cmd -def update_job_data(job): +def update_job_data(job: Any): """ This function can be used to update/add data to the job object. E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information is extracted from the metaData field and added to other job object fields. - :param job: job object - :return: + :param job: job object (Any) """ - pass -def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode=False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. - :param workdir: working directory (string). - :param outputfiles: list of output files. + :param workdir: working directory (str) + :param outputfiles: list of output files (list) :param piloterrors: list of Pilot assigned error codes (list). - :return: """ - + #if outputfiles is None: + # outputfiles = [] + #if piloterrors is None: + # piloterrors = [] pass -def get_utility_commands(order=None, job=None): +def get_utility_commands(order: int = None, job: Any = None) -> dict: """ Return a dictionary of utility commands and arguments to be executed in parallel with the payload. + This could e.g. be memory and network monitor commands. A separate function can be used to determine the corresponding command setups using the utility command name. If the optional order parameter is set, the function should return the list of corresponding commands. @@ -147,34 +149,31 @@ def get_utility_commands(order=None, job=None): FORMAT: {'command': , 'args': } - :param order: optional sorting order (see pilot.util.constants) - :param job: optional job object. - :return: dictionary of utilities to be executed in parallel with the payload. + :param order: optional sorting order (see pilot.util.constants) (int) + :param job: optional job object (Any) + :return: dictionary of utilities to be executed in parallel with the payload (dict). """ - return {} -def get_utility_command_setup(name, job, setup=None): +def get_utility_command_setup(name: str, job: Any, setup: str = None) -> str: """ Return the proper setup for the given utility command. If a payload setup is specified - :param name: - :param setup: - :return: + :param name: name of utility command (str) + :param setup: setup string (str) + :return: full setup string of the utility command (str). """ - - pass + return "" -def get_utility_command_execution_order(name): +def get_utility_command_execution_order(name: str) -> int: """ Should the given utility command be executed before or after the payload? - :param name: utility name (string). - :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD_STARTED) + :param name: utility name (str) + :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD_STARTED) (int). """ - # example implementation if name == 'monitor': return UTILITY_BEFORE_PAYLOAD @@ -182,139 +181,128 @@ def get_utility_command_execution_order(name): return UTILITY_AFTER_PAYLOAD_STARTED -def post_utility_command_action(name, job): +def post_utility_command_action(name: str, job: Any): """ Perform post action for given utility command. - :param name: name of utility command (string). - :param job: job object. - :return: + :param name: name of utility command (str) + :param job: job object (Any). 
""" - pass -def get_utility_command_kill_signal(name): +def get_utility_command_kill_signal(name: str) -> int: """ Return the proper kill signal used to stop the utility command. - :param name: - :return: kill signal + :param name: utility command name (str) + :return: kill signal (int). """ - return SIGTERM -def get_utility_command_output_filename(name, selector=None): +def get_utility_command_output_filename(name: str, selector: bool = None) -> str: """ Return the filename to the output of the utility command. - :param name: utility name (string). - :param selector: optional special conditions flag (boolean). - :return: filename (string). + :param name: utility name (str) + :param selector: optional special conditions flag (bool) + :return: filename (str). """ - return "" -def verify_job(job): +def verify_job(job: Any) -> bool: """ Verify job parameters for specific errors. + Note: in case of problem, the function should set the corresponding pilot error code using job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) - :param job: job object - :return: Boolean. + :param job: job object (Any) + :return: True if job is verified (bool). """ - return True -def update_stagein(job): +def update_stagein(job: Any): """ In case special files need to be skipped during stage-in, the job.indata list can be updated here. + See ATLAS code for an example. - :param job: job object. - :return: + :param job: job object (Any). """ - pass -def get_metadata(workdir): +def get_metadata(workdir: str) -> str: """ Return the metadata from file. - :param workdir: work directory (string) - :return: + :param workdir: work directory (str) + :return: metadata (str). """ - path = os.path.join(workdir, config.Payload.jobreport) metadata = read_file(path) if os.path.exists(path) else None return metadata -def update_server(job): +def update_server(job: Any): """ Perform any user specific server actions. E.g. this can be used to send special information to a logstash. - :param job: job object. - :return: + :param job: job object (Any) """ - pass -def post_prestagein_utility_command(**kwargs): +def post_prestagein_utility_command(**kwargs: dict): """ Execute any post pre-stage-in utility commands. - :param kwargs: kwargs (dictionary). - :return: + :param kwargs: kwargs (dict). """ - # label = kwargs.get('label', 'unknown_label') # stdout = kwargs.get('output', None) - pass -def process_debug_command(debug_command, pandaid): +def process_debug_command(debug_command: str, pandaid: str) -> str: """ In debug mode, the server can send a special debug command to the pilot via the updateJob backchannel. + This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown to the server). - :param debug_command: debug command (string), payload pid (int). - :param pandaid: PanDA id (string). - :return: updated debug command (string) + :param debug_command: debug command (str) + :param pandaid: PanDA id (str) + :return: updated debug command (str). """ - return debug_command -def allow_timefloor(submitmode): +def allow_timefloor(submitmode: str) -> bool: """ Should the timefloor mechanism (multi-jobs) be allowed for the given submit mode? - :param submitmode: submit mode (string). + :param submitmode: submit mode (str) + :return: True if timefloor is allowed (bool). """ - return True -def get_pilot_id(jobid): +def get_pilot_id(jobid: int) -> str: """ Get the pilot id from the environment variable GTAG. 
+    Update if necessary (do not use if you want the same pilot id for all multi-jobs).
 
-    :param jobid: PanDA job id - UNUSED (int).
-    :return: pilot id (string).
+    :param jobid: PanDA job id - UNUSED (int)
+    :return: pilot id (str).
     """
-
     return os.environ.get("GTAG", "unknown")
diff --git a/pilot/user/generic/cpu.py b/pilot/user/generic/cpu.py
index c6ed1b4b..03300de3 100644
--- a/pilot/user/generic/cpu.py
+++ b/pilot/user/generic/cpu.py
@@ -17,33 +17,38 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024
 
-from pilot.util.container import execute
+""" CPU related functionality."""
 
 import logging
+from typing import Any
+
+from pilot.util.container import execute
+
 
 logger = logging.getLogger(__name__)
 
 
-def get_core_count(job):
+def get_core_count(job: Any) -> int:
     """
     Return the core count.
 
-    :param job: job object.
+    :param job: job object (Any)
     :return: core count (int).
     """
-
     return 0
 
 
-def add_core_count(corecount, core_counts=[]):
+def add_core_count(corecount: int, core_counts: list = None):
     """
     Add a core count measurement to the list of core counts.
 
-    :param corecount: current actual core count (int).
-    :param core_counts: list of core counts (list).
+    :param corecount: current actual core count (int)
+    :param core_counts: list of core counts (list)
     :return: updated list of core counts (list).
     """
+    if core_counts is None:
+        core_counts = []
 
     return core_counts.append(corecount)
diff --git a/pilot/user/generic/jobmetrics.py b/pilot/user/generic/jobmetrics.py
index 864983d4..b24739ce 100644
--- a/pilot/user/generic/jobmetrics.py
+++ b/pilot/user/generic/jobmetrics.py
@@ -17,17 +17,20 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024
 
 # from pilot.util.jobmetrics import get_job_metrics_entry
 
 import logging
+from typing import Any
+
 logger = logging.getLogger(__name__)
 
 
-def get_job_metrics(job, extra={}):
+def get_job_metrics(job: Any, extra: dict = None) -> str:
     """
     Return a properly formatted job metrics string.
+
     The format of the job metrics string is defined by the server. It will be reported to the server during updateJob.
 
     Example of job metrics:
@@ -35,9 +38,10 @@ def get_job_metrics(job, extra={}):
     Format: nEvents=<int> nEventsW=<int> vmPeakMax=<int> vmPeakMean=<int> RSSMean=<int> hs06=<float> shutdownTime=<int>
             cpuFactor=<float> cpuLimit=<float> diskLimit=<float> jobStart=<int> memLimit=<int> runLimit=<float>
 
-    :param job: job object
+    :param job: job object (Any)
     :param extra: any extra information to be added (dict)
-    :return: job metrics (string).
+    :return: job metrics (str).
     """
-
+    #if extra is None:
+    #    extra = {}
     return ""
diff --git a/pilot/user/rubin/common.py b/pilot/user/rubin/common.py
index 8f736223..b68aa6c1 100644
--- a/pilot/user/rubin/common.py
+++ b/pilot/user/rubin/common.py
@@ -17,11 +17,15 @@
 # under the License.
 #
 # Authors:
-# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024
 
+"""Common functions for Rubin."""
+
+import logging
 import os
 from re import findall
 from signal import SIGTERM
+from typing import Any
 
 from pilot.common.exception import TrfDownloadFailure
 from pilot.util.config import config
@@ -29,44 +33,41 @@
 from pilot.util.filehandling import read_file
 from .setup import get_analysis_trf
 
-import logging
 logger = logging.getLogger(__name__)
 
 
-def sanity_check():
+def sanity_check() -> int:
     """
     Perform an initial sanity check before doing anything else in a given workflow.
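One behavioural wrinkle survives the cleanup in the generic `add_core_count()` stub above: `list.append()` mutates the list in place and returns `None`, so `return core_counts.append(corecount)` hands the caller `None` rather than the updated list (the ATLAS variant earlier in the series appends first and then returns the list). A short illustration:

    core_counts = []
    result = core_counts.append(4)
    print(core_counts)  # [4]  - the list was updated in place ...
    print(result)       # None - ... but append() itself returns None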
+ This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort the pilot if a problem is discovered early. - :return: exit code (0 if all is ok, otherwise non-zero exit code). + :return: exit code (0 if all is ok, otherwise non-zero exit code) (int). """ - return 0 -def validate(job): +def validate(job: Any) -> bool: """ Perform user specific payload/job validation. - :param job: job object. - :return: Boolean (True if validation is successful). + :param job: job object (Any) + :return: True if validation is successful (bool) """ - return True -def get_payload_command(job): +def get_payload_command(job: Any): """ - Return the full command for executing the payload, including the sourcing of all setup files and setting of - environment variables. + Return the full command for executing the payload. + The returned string includes the sourcing of all setup files and setting of environment variables. By default, the full payload command is assumed to be in the job.jobparams. - :param job: job object - :return: command (string) + :param job: job object (Any) + :return: command (str). """ - # Try to download the trf # if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") @@ -80,17 +81,16 @@ def get_payload_command(job): return get_analysis_run_command(job, trf_name) -def get_analysis_run_command(job, trf_name): +def get_analysis_run_command(job: Any, trf_name: str) -> str: """ Return the proper run command for the user job. Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn - :param job: job object. - :param trf_name: name of the transform that will run the job (string). Used when containers are not used. - :return: command (string). + :param job: job object (Any) + :param trf_name: name of the transform that will run the job (string). Used when containers are not used (str) + :return: command (str). """ - cmd = "" # add the user proxy @@ -109,36 +109,38 @@ def get_analysis_run_command(job, trf_name): return cmd -def update_job_data(job): +def update_job_data(job: Any): """ This function can be used to update/add data to the job object. + E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information is extracted from the metaData field and added to other job object fields. - :param job: job object - :return: + :param job: job object (Any) """ - pass -def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode=False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. - :param workdir: working directory (string). - :param outputfiles: list of output files. - :param piloterrors: list of Pilot assigned error codes (list). - :param debugmode: True if debug mode has been switched on (Boolean). - :return: + :param workdir: working directory (str) + :param outputfiles: list of output files (list) + :param piloterrors: list of Pilot assigned error codes (list) + :param debugmode: True if debug mode has been switched on (bool). 
""" - + #if outputfiles is None: + # outputfiles = [] + #if piloterrors is None: + # piloterrors = [] pass -def get_utility_commands(order=None, job=None): +def get_utility_commands(order: int = None, job: Any = None) -> dict: """ Return a dictionary of utility commands and arguments to be executed in parallel with the payload. + This could e.g. be memory and network monitor commands. A separate function can be used to determine the corresponding command setups using the utility command name. If the optional order parameter is set, the function should return the list of corresponding commands. @@ -149,11 +151,10 @@ def get_utility_commands(order=None, job=None): FORMAT: {'command': , 'args': } - :param order: optional sorting order (see pilot.util.constants) - :param job: optional job object. - :return: dictionary of utilities to be executed in parallel with the payload. + :param order: optional sorting order (see pilot.util.constants) (int) + :param job: optional job object (Any) + :return: dictionary of utilities to be executed in parallel with the payload (dict). """ - return {} diff --git a/pilot/user/rubin/cpu.py b/pilot/user/rubin/cpu.py index 4873673b..db2674cc 100644 --- a/pilot/user/rubin/cpu.py +++ b/pilot/user/rubin/cpu.py @@ -17,45 +17,47 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 -from pilot.util.container import execute +""" CPU related functionality.""" import logging +from typing import Any +from pilot.util.container import execute + logger = logging.getLogger(__name__) -def get_core_count(job): +def get_core_count(job: Any) -> int: """ Return the core count. - :param job: job object. + :param job: job object (Any) :return: core count (int). """ - return 0 -def add_core_count(corecount, core_counts=[]): +def add_core_count(corecount: int, core_counts: list = None): """ Add a core count measurement to the list of core counts. - :param corecount: current actual core count (int). - :param core_counts: list of core counts (list). + :param corecount: current actual core count (int) + :param core_counts: list of core counts (list) :return: updated list of core counts (list). """ + if core_counts is None: + core_counts = [] return core_counts.append(corecount) -def set_core_counts(**kwargs): +def set_core_counts(**kwargs: dict): """ Set the number of used cores. - :param kwargs: kwargs (dictionary). - :return: + :param kwargs: kwargs (dict). """ - job = kwargs.get('job', None) if job and job.pgrp: cmd = f"ps axo pgid,psr | sort | grep {job.pgrp} | uniq | awk '{{print $1}}' | grep -x {job.pgrp} | wc -l" diff --git a/pilot/user/rubin/jobmetrics.py b/pilot/user/rubin/jobmetrics.py index 864983d4..b517bbdc 100644 --- a/pilot/user/rubin/jobmetrics.py +++ b/pilot/user/rubin/jobmetrics.py @@ -17,17 +17,22 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 + +"""Functions for building job metrics.""" # from pilot.util.jobmetrics import get_job_metrics_entry import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_job_metrics(job, extra={}): +def get_job_metrics(job: Any, extra: dict = None) -> str: """ Return a properly formatted job metrics string. + The format of the job metrics string is defined by the server. It will be reported to the server during updateJob. 
Example of job metrics: @@ -35,9 +40,10 @@ def get_job_metrics(job, extra={}): Format: nEvents= nEventsW= vmPeakMax= vmPeakMean= RSSMean= hs06= shutdownTime= cpuFactor= cpuLimit= diskLimit= jobStart= memLimit= runLimit= - :param job: job object + :param job: job object (Any) :param extra: any extra information to be added (dict) - :return: job metrics (string). + :return: job metrics (str). """ - + #if extra is None: + # extra = {} return "" diff --git a/pilot/user/sphenix/common.py b/pilot/user/sphenix/common.py index 4d100aae..657a180d 100644 --- a/pilot/user/sphenix/common.py +++ b/pilot/user/sphenix/common.py @@ -17,11 +17,13 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +import logging import os import re from signal import SIGTERM +from typing import Any from pilot.common.exception import ( TrfDownloadFailure, @@ -46,11 +48,10 @@ from pilot.util.filehandling import read_file from .setup import get_analysis_trf -import logging logger = logging.getLogger(__name__) -def sanity_check(): +def sanity_check() -> int: """ Perform an initial sanity check before doing anything else in a given workflow. This function can be used to verify importing of modules that are otherwise used much later, but it is better to abort @@ -61,27 +62,26 @@ def sanity_check(): return 0 -def validate(job): +def validate(job: Any) -> bool: """ Perform user specific payload/job validation. - :param job: job object. - :return: Boolean (True if validation is successful). + :param job: job object (Any) + :return: True if validation is successful (bool). """ return True -def get_payload_command(job): +def get_payload_command(job: Any) -> str: """ Return the full command for executing the payload, including the sourcing of all setup files and setting of environment variables. By default, the full payload command is assumed to be in the job.jobparams. - :param job: job object - :return: command (string) + :param job: job object (Any) + :return: command (str). """ - # Try to download the trf # if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") @@ -95,7 +95,7 @@ def get_payload_command(job): return get_analysis_run_command(job, trf_name) -def get_analysis_run_command(job, trf_name): +def get_analysis_run_command(job: Any, trf_name: str) -> str: """ Return the proper run command for the user job. @@ -123,14 +123,13 @@ def get_analysis_run_command(job, trf_name): return cmd -def update_job_data(job): +def update_job_data(job: Any): """ This function can be used to update/add data to the job object. E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information is extracted from the metaData field and added to other job object fields. - :param job: job object - :return: + :param job: job object (Any). """ # in case the job was created with --outputs="regex|DST_.*\.root", we can now look for the corresponding # output files and add them to the output file list @@ -174,19 +173,18 @@ def update_job_data(job): logger.debug('no regex found in outdata file list') -def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode=False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = None, debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. :param workdir: working directory (string). 
:param outputfiles: list of output files. :param piloterrors: list of Pilot assigned error codes (list). - :return: None """ - return + pass -def get_utility_commands(order=None, job=None): +def get_utility_commands(order: int = None, job: Any = None) -> dict: """ Return a dictionary of utility commands and arguments to be executed in parallel with the payload. This could e.g. be memory and network diff --git a/pilot/user/sphenix/cpu.py b/pilot/user/sphenix/cpu.py index 7f7c967f..2126ab0b 100644 --- a/pilot/user/sphenix/cpu.py +++ b/pilot/user/sphenix/cpu.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 from typing import Any from pilot.util.container import execute @@ -36,7 +36,7 @@ def get_core_count(job: Any) -> int: return 0 -def add_core_count(corecount: int, core_counts: list = []) -> list: +def add_core_count(corecount: int, core_counts: list = None) -> list: """ Add a core count measurement to the list of core counts. @@ -44,6 +44,8 @@ def add_core_count(corecount: int, core_counts: list = []) -> list: :param core_counts: list of core counts (list). :return: updated list of core counts (list). """ + if core_counts is None: + core_counts = [] return core_counts.append(corecount) diff --git a/pilot/user/sphenix/jobmetrics.py b/pilot/user/sphenix/jobmetrics.py index 01255974..b24739ce 100644 --- a/pilot/user/sphenix/jobmetrics.py +++ b/pilot/user/sphenix/jobmetrics.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 # from pilot.util.jobmetrics import get_job_metrics_entry @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) -def get_job_metrics(job: Any, extra: dict = {}) -> str: +def get_job_metrics(job: Any, extra: dict = None) -> str: """ Return a properly formatted job metrics string. @@ -42,4 +42,6 @@ def get_job_metrics(job: Any, extra: dict = {}) -> str: :param extra: any extra information to be added (dict) :return: job metrics (str). """ + #if extra is None: + # extra = {} return "" diff --git a/pilot/user/sphenix/utilities.py b/pilot/user/sphenix/utilities.py index 4515190a..72a74218 100644 --- a/pilot/user/sphenix/utilities.py +++ b/pilot/user/sphenix/utilities.py @@ -17,8 +17,9 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 +import logging import os import time from re import search @@ -31,7 +32,6 @@ from pilot.util.parameters import convert_to_int from pilot.util.processes import is_process_running -import logging logger = logging.getLogger(__name__) @@ -60,7 +60,7 @@ def get_memory_monitor_output_filename(suffix: str = 'txt') -> str: def get_memory_monitor_setup(pid: int, pgrp: int, jobid: int, workdir: str, command: str, setup: str = "", - use_container: bool = True, transformation: str = "", outdata: list = [], + use_container: bool = True, transformation: str = "", outdata: list = None, dump_ps: bool = False) -> (str, int): """ Return the proper setup for the memory monitor. @@ -81,6 +81,10 @@ def get_memory_monitor_setup(pid: int, pgrp: int, jobid: int, workdir: str, comm :param dump_ps: should ps output be dumped when identifying prmon process? (bool) :return: job work directory (str), pid for process inside container (int). 
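The sPHENIX `update_job_data()` shown earlier in this diff resolves `--outputs="regex|DST_.*\.root"` style entries against the files actually present in the work directory. A simplified, hypothetical helper illustrating the idea (the name and exact semantics are assumptions, not the Pilot implementation):

    import os
    import re

    def expand_output_spec(workdir: str, spec: str) -> list:
        """Expand an --outputs entry; 'regex|PATTERN' entries match files in workdir."""
        if not spec.startswith("regex|"):
            return [spec]
        pattern = re.compile(spec.split("|", 1)[1])
        return [name for name in os.listdir(workdir) if pattern.fullmatch(name)]

    # expand_output_spec(".", r"regex|DST_.*\.root") would list the DST_*.root files present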
""" + if setup: # to get rid of pylint warning, setup is not used for this user + pass + if outdata is None: + outdata = [] # try to get the pid from a pid.txt file which might be created by a container_script pid = get_proper_pid(pid, pgrp, jobid, command=command, transformation=transformation, outdata=outdata, use_container=use_container, dump_ps=dump_ps) if pid == -1: diff --git a/pilot/util/filestate.py b/pilot/util/filestate.py index a2445f0d..75c0035d 100644 --- a/pilot/util/filestate.py +++ b/pilot/util/filestate.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2022-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2022-2024 """Handling of file states.""" @@ -39,12 +39,14 @@ class FileState(object): _lfns = [] _state_list = ['NOT_YET_TRANSFERRED', 'TRANSFER_IN_PROGRESS', 'TRANSFERRED', 'TRANSFER_FAILED'] - def __init__(self, file_states={}): + def __init__(self, file_states: dict = None): """ Initialize variables. :param file_states: file states (dict). """ + if file_states is None: + file_states = {} self._lfns = file_states.get('lfns', []) self.set_initial_list() @@ -61,7 +63,7 @@ def get_file_states(self) -> dict: """ return self._file_states - def update(self, lfn='', state=''): + def update(self, lfn: str = "", state: str = ""): """ Update the state for a given LFN. diff --git a/pilot/util/harvester.py b/pilot/util/harvester.py index efa2ee21..2ed9bef9 100644 --- a/pilot/util/harvester.py +++ b/pilot/util/harvester.py @@ -17,10 +17,11 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Functions for interactiving with Harvester.""" +import logging import os import os.path import socket @@ -31,7 +32,6 @@ from pilot.util.filehandling import write_json, touch, remove, read_json, get_checksum_value from pilot.util.timing import time_stamp -import logging logger = logging.getLogger(__name__) @@ -262,7 +262,7 @@ def publish_stageout_files(job: Any, event_status_file: str) -> bool: return False -def publish_work_report(work_report: dict = {}, worker_attributes_file: str = "worker_attributes.json") -> bool: +def publish_work_report(work_report: dict = None, worker_attributes_file: str = "worker_attributes.json") -> bool: """ Publish the work report. @@ -273,6 +273,8 @@ def publish_work_report(work_report: dict = {}, worker_attributes_file: str = "w :raises FileHandlingFailure: in case of IOError :return: True if successfully published, False otherwise (bool). """ + if work_report is None: + work_report = {} if work_report: work_report['timestamp'] = time_stamp() if "outputfiles" in work_report: diff --git a/pilot/util/https.py b/pilot/util/https.py index 13a55faf..5613043c 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -180,7 +180,7 @@ def https_setup(args: Any = None, version: str = ""): logger.warning(f'Failed to initialize SSL context .. skipped, error: {exc}') -def request(url: str, data: dict = {}, plain: bool = False, secure: bool = True, ipv: str = 'IPv6') -> Any: +def request(url: str, data: dict = None, plain: bool = False, secure: bool = True, ipv: str = 'IPv6') -> Any: """ Send a request using HTTPS. @@ -204,12 +204,14 @@ def request(url: str, data: dict = {}, plain: bool = False, secure: bool = True, :param data: data to send (dict) :param plain: if true, treats the response as a plain text (bool) :param secure: default: True, i.e. use certificates (bool) - :param ipv: internet protocol version (str). 
+ :param ipv: internet protocol version (str) :returns: - :keyword:`dict` -- if everything went OK - `str` -- if ``plain`` parameter is `True` - - `None` -- if something went wrong + - `None` -- if something went wrong. """ + if data is None: + data = {} _ctx.ssl_context = None # certificates are not available on the grid, use curl # note that X509_USER_PROXY might change during running (in the case of proxy downloads), so @@ -658,7 +660,7 @@ def get_server_command(url: str, port: str, cmd: str = 'getJob') -> str: return f'{url}/server/panda/{cmd}' -def request2(url: str = "", data: dict = {}, secure: bool = True, compressed: bool = True) -> str or dict: +def request2(url: str = "", data: dict = None, secure: bool = True, compressed: bool = True) -> str or dict: """ Send a request using HTTPS (using urllib module). @@ -668,6 +670,8 @@ def request2(url: str = "", data: dict = {}, secure: bool = True, compressed: bo :param compressed: compress data (bool) :return: server response (str or dict). """ + if data is None: + data = {} # https might not have been set up if running in a [middleware] container if not _ctx.cacert: logger.debug('setting up unset https') @@ -754,7 +758,7 @@ def request2(url: str = "", data: dict = {}, secure: bool = True, compressed: bo return ret -def request3(url: str, data: dict = {}) -> str: +def request3(url: str, data: dict = None) -> str: """ Send a request using HTTPS (using requests module). @@ -762,6 +766,8 @@ def request3(url: str, data: dict = {}) -> str: :param data: data to send (dict) :return: server response (str). """ + if data is None: + data = {} if not requests: logger.warning('cannot use requests module (not available)') return "" diff --git a/pilot/util/jobmetrics.py b/pilot/util/jobmetrics.py index 2eda49bf..12470345 100644 --- a/pilot/util/jobmetrics.py +++ b/pilot/util/jobmetrics.py @@ -17,14 +17,14 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Functions for building job metrics.""" +import logging from os import environ from typing import Any -import logging logger = logging.getLogger(__name__) @@ -45,7 +45,7 @@ def get_job_metrics_entry(name: str, value: str) -> str: return job_metrics_entry -def get_job_metrics(job: Any, extra: dict = {}) -> str: +def get_job_metrics(job: Any, extra: dict = None) -> str: """ Return a properly formatted job metrics string. @@ -62,6 +62,8 @@ def get_job_metrics(job: Any, extra: dict = {}) -> str: :param extra: any extra information to be added (dict) :return: job metrics (str). 
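One wrinkle worth flagging in the `request2()` signature above: `-> str or dict` is an ordinary expression evaluated when the function is defined, and since the class `str` is truthy it collapses to plain `str`, so the annotation advertises fewer return types than the function actually delivers. `typing.Union` expresses the intent (illustrative sketch only, toy body):

    from typing import Union

    print(str or dict)  # <class 'str'> - the "or" already picked one type

    def request2(url: str = "", data: dict = None) -> Union[str, dict]:
        """Same contract, but now visible to type checkers."""
        return {} if data else ""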
""" + if extra is None: + extra = {} user = environ.get('PILOT_USER', 'generic').lower() # TODO: replace with singleton try: job_metrics_module = __import__(f'pilot.user.{user}.jobmetrics', globals(), locals(), [user], 0) From a6a0c21cad488682459567c4bc1b41b524be8af2 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 May 2024 13:04:35 +0200 Subject: [PATCH 34/41] Pylint updates --- pilot/common/exception.py | 3 ++ pilot/control/data.py | 30 ++++++++-------- pilot/control/interceptor.py | 6 ++-- pilot/control/job.py | 14 ++++---- pilot/control/monitor.py | 67 +++++++++++++++++------------------- pilot/control/payload.py | 40 +++++++-------------- 6 files changed, 71 insertions(+), 89 deletions(-) diff --git a/pilot/common/exception.py b/pilot/common/exception.py index b7c5725d..c6501ad1 100644 --- a/pilot/common/exception.py +++ b/pilot/common/exception.py @@ -37,6 +37,9 @@ class PilotException(Exception): """Pilot exceptions class.""" + _errorCode = None + _message = None + def __init__(self, *args, **kwargs): """Set default or initial values.""" super().__init__(args, kwargs) diff --git a/pilot/control/data.py b/pilot/control/data.py index 69a4561a..12d6a33f 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -25,6 +25,7 @@ """Control interface to data API.""" +import logging import os import time import traceback @@ -87,9 +88,7 @@ ) from pilot.util.timing import add_to_pilot_timing from pilot.util.tracereport import TraceReport -import pilot.util.middleware -import logging logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -778,11 +777,11 @@ def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool cmd = f"pwd;tar cvfz {fullpath} {tarball_name} --dereference --one-file-system; echo $?" exit_code, stdout, stderr = execute(cmd, timeout=timeout) except Exception as error: - raise LogFileCreationFailure(error) - else: - if pilot_home != current_dir: - os.chdir(pilot_home) - logger.debug(f'stdout: {stdout}') + raise LogFileCreationFailure(error) from error + if pilot_home != current_dir: + os.chdir(pilot_home) + logger.debug(f'stdout: {stdout}') + try: os.rename(workdir, orgworkdir) except OSError as error: @@ -877,7 +876,7 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i label = 'stage-out' # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)? 
- use_container = pilot.util.middleware.use_middleware_script(job.infosys.queuedata.container_type.get("middleware")) + use_container = use_middleware_script(job.infosys.queuedata.container_type.get("middleware")) # switch the X509_USER_PROXY on unified dispatch queues (restore later in this function) x509_unified_dispatch = os.environ.get('X509_UNIFIED_DISPATCH', '') @@ -894,9 +893,9 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i logger.info('stage-out will be done in a container') try: eventtype, localsite, remotesite = get_trace_report_variables(job, label=label) - pilot.util.middleware.containerise_middleware(job, args, xdata, eventtype, localsite, remotesite, - job.infosys.queuedata.container_options, label=label, - container_type=job.infosys.queuedata.container_type.get("middleware")) + containerise_middleware(job, args, xdata, eventtype, localsite, remotesite, + job.infosys.queuedata.container_options, label=label, + container_type=job.infosys.queuedata.container_type.get("middleware")) except PilotException as error: logger.warning('stage-out containerisation threw a pilot exception: %s', error) except Exception as error: @@ -909,8 +908,9 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i trace_report = create_trace_report(job, label=label) client = StageOutClient(job.infosys, logger=logger, trace_report=trace_report, ipv=ipv, workdir=job.workdir) - kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, job=job, output_dir=args.output_dir, - catchall=job.infosys.queuedata.catchall, rucio_host=args.rucio_host) #, mode='stage-out') + kwargs = {'workdir': job.workdir, 'cwd': job.workdir, 'usecontainer': False, 'job': job, + 'output_dir': args.output_dir, 'catchall': job.infosys.queuedata.catchall, + 'rucio_host': args.rucio_host} #, mode='stage-out') # prod analy unification: use destination preferences from PanDA server for unified queues if job.infosys.queuedata.type != 'unified': client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) @@ -975,7 +975,7 @@ def _stage_out_new(job: Any, args: Any) -> bool: is_success = False logger.warning('transfer of output file(s) failed') - if job.stageout in ['log', 'all'] and job.logdata: ## do stage-out log files + if job.stageout in {'log', 'all'} and job.logdata: ## do stage-out log files # prepare log file, consider only 1st available log file status = job.get_status('LOG_TRANSFER') if status != LOG_TRANSFER_NOT_DONE: @@ -1058,7 +1058,7 @@ def generate_fileinfo(job: Any) -> dict: fileinfo = {} checksum_type = config.File.checksum_type if config.File.checksum_type == 'adler32' else 'md5sum' for iofile in job.outdata + job.logdata: - if iofile.status in ['transferred']: + if iofile.status in {'transferred'}: fileinfo[iofile.lfn] = {'guid': iofile.guid, 'fsize': iofile.filesize, f'{checksum_type}': iofile.checksum.get(config.File.checksum_type), diff --git a/pilot/control/interceptor.py b/pilot/control/interceptor.py index 401e942f..bf1ee766 100644 --- a/pilot/control/interceptor.py +++ b/pilot/control/interceptor.py @@ -17,7 +17,7 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 # Note: leave this module for now - the code might be useful for reuse @@ -46,7 +46,7 @@ def run(args: Any): threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 - [thread.start() for thread in threads] + _ = [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function while not args.graceful_stop.is_set(): @@ -57,7 +57,7 @@ def run(args: Any): except queue.Empty: pass else: - exc_type, exc_obj, exc_trace = exc + _, exc_obj, _ = exc logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) # deal with the exception diff --git a/pilot/control/job.py b/pilot/control/job.py index 483da499..c996fad5 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -17,15 +17,13 @@ # under the License. # # Authors: -# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17 +# - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2018 """Job module with functions for job handling.""" -from __future__ import print_function # Python 2 - import os import time import hashlib @@ -2896,7 +2894,7 @@ def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 # by turning on debug mode, ie we need to get the heartbeat period in case it has changed) try: update_time = send_heartbeat_if_time(jobs[i], args, update_time) - except Exception as exc: + except (ValueError, TypeError) as exc: logger.warning(f'exception caught during send_heartbeat_if_time: {exc}') # note: when sending a state change to the server, the server might respond with 'tobekilled' @@ -3000,7 +2998,7 @@ def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 # has expired in the meantime) try: _job = jobs[i] - except Exception: + except (IndexError, TypeError): logger.info('aborting job monitoring since job object (job id=%s) has expired', current_id) break @@ -3008,7 +3006,7 @@ def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 # by turning on debug mode, ie we need to get the heartbeat period in case it has changed) try: update_time = send_heartbeat_if_time(_job, args, update_time) - except Exception as error: + except (ValueError, TypeError) as error: logger.warning('exception caught: %s (job id=%s)', error, current_id) break else: @@ -3147,7 +3145,7 @@ def send_heartbeat_if_time(job: Any, args: Any, update_time: float) -> int: # check for state==running here, and send explicit 'running' in send_state, rather than sending job.state # since the job state can actually change in the meantime by another thread # job.completed will anyway be checked in https::send_update() - if job.serverstate != 'finished' and job.serverstate != 'failed': + if job.serverstate not in {'finished', 'failed'}: logger.info(f'will send heartbeat for job in \'{job.state}\' state') send_state(job, args, job.state) update_time = time.time() diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 4f4c89bb..66ab1840 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -18,7 +18,7 @@ # # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, 
paul.nilsson@cern.ch, 2017-2024 # NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is # a task for the job_monitor thread in the Job component. @@ -29,7 +29,7 @@ import threading import time import re -from os import environ, getpid, getuid +from os import environ, getuid from subprocess import Popen, PIPE from typing import Any @@ -64,8 +64,8 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 threadchecktime = int(config.Pilot.thread_check) # for CPU usage debugging - cpuchecktime = int(config.Pilot.cpu_check) - tcpu = t_0 + # cpuchecktime = int(config.Pilot.cpu_check) + # tcpu = t_0 last_minute_check = t_0 queuedata = get_queuedata_from_job(queues) @@ -110,9 +110,8 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 f'exceeded - time to abort pilot') reached_maxtime_abort(args) break - else: - if niter % 60 == 0: - logger.info(f'{time_since_start}s have passed since pilot start') + if niter % 60 == 0: + logger.info(f'{time_since_start}s have passed since pilot start') # every minute run the following check if is_pilot_check(check='machinefeatures'): @@ -127,15 +126,15 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 time.sleep(1) # time to check the CPU usage? - if is_pilot_check(check='cpu_usage'): - if int(time.time() - tcpu) > cpuchecktime and False: # for testing only - processes = get_process_info('python3 pilot3/pilot.py', pid=getpid()) - if processes: - logger.info(f'PID={getpid()} has CPU usage={processes[0]}% CMD={processes[2]}') - nproc = processes[3] - if nproc > 1: - logger.info(f'.. there are {nproc} such processes running') - tcpu = time.time() + # if is_pilot_check(check='cpu_usage'): + # if int(time.time() - tcpu) > cpuchecktime: + # processes = get_process_info('python3 pilot3/pilot.py', pid=getpid()) + # if processes: + # logger.info(f'PID={getpid()} has CPU usage={processes[0]}% CMD={processes[2]}') + # nproc = processes[3] + # if nproc > 1: + # logger.info(f'.. 
there are {nproc} such processes running') + # tcpu = time.time() # proceed with running the other checks run_checks(queues, args) @@ -154,7 +153,7 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 except Exception as error: print((f"monitor: exception caught: {error}")) - raise PilotException(error) + raise PilotException(error) from error logger.info('[monitor] control thread has ended') @@ -185,8 +184,7 @@ def run_shutdowntime_minute_check(time_since_start: int) -> bool: except (TypeError, ValueError): # as exc: #logger.debug(f'failed to convert shutdowntime: {exc}') return False # will be ignored - else: - logger.debug(f'machinefeatures shutdowntime={shutdowntime} - now={now}') + logger.debug(f'machinefeatures shutdowntime={shutdowntime} - now={now}') if not shutdowntime: logger.debug('ignoring shutdowntime since it is not set') return False # will be ignored @@ -268,19 +266,19 @@ def get_process_info(cmd: str, user: str = "", args: str = 'aufx', pid: int = 0) pattern = re.compile(r"\S+|[-+]?\d*\.\d+|\d+") arguments = ['ps', '-u', user, args, '--no-headers'] - process = Popen(arguments, stdout=PIPE, stderr=PIPE, encoding='utf-8') - stdout, _ = process.communicate() - for line in stdout.splitlines(): - found = re.findall(pattern, line) - if found is not None: - processid = found[1] - cpu = found[2] - mem = found[3] - command = ' '.join(found[10:]) - if cmd in command: - num += 1 - if processid == str(pid): - processes = [cpu, mem, command] + with Popen(arguments, stdout=PIPE, stderr=PIPE, encoding='utf-8') as process: + stdout, _ = process.communicate() + for line in stdout.splitlines(): + found = re.findall(pattern, line) + if found is not None: + processid = found[1] + cpu = found[2] + mem = found[3] + command = ' '.join(found[10:]) + if cmd in command: + num += 1 + if processid == str(pid): + processes = [cpu, mem, command] if processes: processes.append(num) @@ -327,7 +325,7 @@ def run_checks(queues: Any, args: Any) -> None: _pilot_heartbeat = get_proper_pilot_heartbeat() if last_heartbeat > _pilot_heartbeat: - detected_job_suspension = True if last_heartbeat > 10 * 60 else False + detected_job_suspension = last_heartbeat > 10 * 60 if detected_job_suspension: logger.warning(f'detected job suspension (last heartbeat was updated more than 10 minutes ago: {last_heartbeat} s)') else: @@ -427,8 +425,7 @@ def get_max_running_time(lifetime: int, queuedata: Any, queues: Any, push: bool, logger.warning(f'failed to convert maxtime from queuedata, will use default value for max running time ' f'({max_running_time}s)') else: - if max_running_time == 0: - max_running_time = lifetime # fallback to default value + max_running_time = lifetime if max_running_time == 0 else max_running_time # logger.debug(f'will use default value for max running time: {max_running_time}s') #else: # logger.debug(f'will use queuedata.maxtime value for max running time: {max_running_time}s') diff --git a/pilot/control/payload.py b/pilot/control/payload.py index ad39cbb8..f55c9f6d 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -20,7 +20,7 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2017-2018 """Functions for handling the payload.""" @@ -76,7 +76,7 @@ def control(queues: Any, traces: Any, args: Any): threads = [ExcThread(bucket=queue.Queue(), 
target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, name=name) for name, target in list(targets.items())] - [thread.start() for thread in threads] + _ = [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function try: @@ -88,7 +88,8 @@ def control(queues: Any, traces: Any, args: Any): except queue.Empty: pass else: - exc_type, exc_obj, exc_trace = exc + # exc_type, exc_obj, exc_trace = exc + _, exc_obj, _ = exc logger.warning(f"thread \'{thread.name}\' received an exception from bucket: {exc_obj}") # deal with the exception @@ -293,7 +294,7 @@ def execute_payloads(queues: Any, traces: Any, args: Any): # noqa: C901 user = __import__(f'pilot.user.{pilot_user}.diagnose', globals(), locals(), [pilot_user], 0) try: exit_code_interpret = user.interpret(job) - except Exception as error: + except (Exception, PilotException) as error: logger.warning(f'exception caught: {error}') if isinstance(error, PilotException): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error._errorCode, msg=error._message) @@ -439,11 +440,11 @@ def get_logging_info(job: Any, args: Any) -> dict: except IndexError as exc: logger.warning(f'exception caught: {exc}') return {} - else: - # find the log file to tail - path = find_log_to_tail(job.debug_command, job.workdir, args, job.is_analysis()) - logger.info(f'using {path} for real-time logging') - info_dic['logfiles'] = [path] + + # find the log file to tail + path = find_log_to_tail(job.debug_command, job.workdir, args, job.is_analysis()) + logger.info(f'using {path} for real-time logging') + info_dic['logfiles'] = [path] if 'logfiles' not in info_dic: # Rubin @@ -529,7 +530,7 @@ def run_realtimelog(queues: Any, traces: Any, args: Any): # noqa: C901 f'(job.debug={job.debug}, job.debug_command={job.debug_command})') first1 = False break - if job.state == 'stageout' or job.state == 'failed' or job.state == 'holding': + if job.state in {'stageout', 'failed', 'holding'}: if first2: logger.debug(f'job is in state {job.state}, continue to next job or abort (wait for graceful stop)') first1 = True @@ -578,9 +579,8 @@ def run_realtimelog(queues: Any, traces: Any, args: Any): # noqa: C901 if realtime_logger is None: logger.debug('realtime logger was not found, waiting ..') continue - else: - logger.debug('realtime logger was found') + logger.debug('realtime logger was found') realtime_logger.sending_logs(args, job) # proceed to set the job_aborted flag? 
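The state checks rewritten across this commit, such as `job.stageout in {'log', 'all'}` and `job.state in {'stageout', 'failed', 'holding'}`, collapse chains of `==` comparisons joined by `or` into a single membership test against a set literal of constants, which gives O(1) lookup and is the form pylint's consider-using-in check suggests. Side by side:

    state = "holding"

    # before: three comparisons, easy to miss one when a new state is added
    if state == "stageout" or state == "failed" or state == "holding":
        pass

    # after: one membership test against a constant set
    if state in {"stageout", "failed", "holding"}:
        pass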
@@ -661,22 +661,6 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int): msg = errors.format_diagnostics(exit_code, msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) - - ''' - if exit_code != 0: - if msg: - msg = errors.format_diagnostics(exit_code, msg) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=msg) - else: - if job.piloterrorcodes: - logger.warning('error code(s) already set: %s', str(job.piloterrorcodes)) - else: - # check if core dumps exist, if so remove them and return True - if remove_core_dumps(job.workdir) and not job.debug: - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) - else: - logger.warning('initial error analysis did not resolve the issue (and core dumps were not found)') - ''' else: logger.info('main payload execution returned zero exit code') From 5e62d2ab0f08df5b099a3f5e34fdb6e7038ad506 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 May 2024 16:09:27 +0200 Subject: [PATCH 35/41] Pylint updates --- pilot/copytool/gfal.py | 24 ++++++++---- pilot/copytool/gs.py | 25 ++++++------ pilot/copytool/lsm.py | 53 ++++++++++++++------------ pilot/copytool/mv.py | 37 ++++++++++-------- pilot/copytool/objectstore.py | 18 ++++++--- pilot/copytool/rucio.py | 67 ++++++++++++++++---------------- pilot/copytool/s3.py | 23 ++++++----- pilot/copytool/xrdcp.py | 72 +++++++++++++++++------------------ 8 files changed, 175 insertions(+), 144 deletions(-) diff --git a/pilot/copytool/gfal.py b/pilot/copytool/gfal.py index acf01a18..7fb34301 100644 --- a/pilot/copytool/gfal.py +++ b/pilot/copytool/gfal.py @@ -19,7 +19,7 @@ # Authors: # - Pavlo Svirin, pavlo.svirin@cern.ch, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 # - Alexey Anisenkov, anisyonk@cern.ch, 2018 """GFAL2 copy tool.""" @@ -29,10 +29,18 @@ import os from time import time -from .common import resolve_common_transfer_errors, get_timeout -from pilot.common.exception import PilotException, ErrorCodes, StageInFailure, StageOutFailure +from pilot.common.exception import ( + PilotException, + ErrorCodes, + StageInFailure, + StageOutFailure +) from pilot.util.container import execute #from pilot.util.timer import timeout +from .common import ( + resolve_common_transfer_errors, + get_timeout +) logger = logging.getLogger(__name__) @@ -148,8 +156,9 @@ def copy_out(files: list, **kwargs: dict) -> list: Upload given files using gfal command. :param files: Files to upload (files) - :raises: PilotException in case of errors - :return: updated files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files (list) + :raises: PilotException in case of errors. """ if not check_for_gfal(): raise StageOutFailure("No GFAL2 tools found") @@ -222,7 +231,7 @@ def move_all_files_in(files: list, nretries: int = 1) -> (int, str, str): ### exit_code, stdout, stderr = move(source, destination, entry.get('recursive', False)) if exit_code != 0: - if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: + if ((exit_code not in (errno.ETIMEDOUT, errno.ETIME)) or ((retry + 1) == nretries)): logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful @@ -236,6 +245,7 @@ def move_all_files_out(files: list, nretries: int = 1) -> (int, str, str): ### Move all output files. 
:param files: list of FileSpec objects (list) + :param nretries: number of retries; sometimes there can be a timeout copying, but the next attempt may succeed (int) :return: exit_code (int), stdout (str), stderr (str). """ exit_code = 0 @@ -253,7 +263,7 @@ def move_all_files_out(files: list, nretries: int = 1) -> (int, str, str): ### exit_code, stdout, stderr = move(source, destination) if exit_code != 0: - if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: + if ((exit_code not in (errno.ETIMEDOUT, errno.ETIME)) or ((retry + 1) == nretries)): logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 2d54a1b4..b0506468 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 # - Shuwei Ye, yesw@bnl.gov, 2021 """GS copy tool.""" @@ -30,19 +30,18 @@ from glob import glob from typing import Any -from pilot.info import infosys - try: from google.cloud import storage -except Exception: +except ImportError: storage_client = None else: storage_client = storage.Client() -from .common import resolve_common_transfer_errors from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException +from pilot.info import infosys from pilot.util.config import config +from .common import resolve_common_transfer_errors logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -98,7 +97,7 @@ def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> d """ try: pandaqueue = infosys.pandaqueue - except Exception: + except AttributeError: pandaqueue = "" if pandaqueue is None: pandaqueue = "" @@ -128,8 +127,9 @@ def copy_in(files: list, **kwargs: dict) -> list: Download given files from a GCS bucket. :param files: list of `FileSpec` objects (list) - :raise: PilotException in case of controlled error - :return: updated files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files (list) + :raise: PilotException in case of controlled error. """ for fspec in files: @@ -180,8 +180,9 @@ def copy_out(files: list, **kwargs: dict): Upload given files to GS storage. :param files: list of `FileSpec` objects (list) - :raise: PilotException in case of controlled error - :return: updated files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files (list) + :raise: PilotException in case of controlled error. 
""" workdir = kwargs.pop('workdir') @@ -215,7 +216,7 @@ def copy_out(files: list, **kwargs: dict): for path in logfiles: logfile = os.path.basename(path) if os.path.exists(path): - if logfile == config.Pilot.pilotlog or logfile == config.Payload.payloadstdout or logfile == config.Payload.payloadstderr: + if logfile in (config.Pilot.pilotlog, config.Payload.payloadstdout, config.Payload.payloadstderr): content_type = "text/plain" logger.debug(f'change the file {logfile} content-type to text/plain') else: @@ -225,7 +226,7 @@ def copy_out(files: list, **kwargs: dict): if not isinstance(result, str): result = result.decode('utf-8') if result.find(';') > 0: - content_type = result.split(';')[0] + content_type = result.split(';', maxsplit=1)[0] logger.debug(f'change the file {logfile} content-type to {content_type}') except Exception: pass diff --git a/pilot/copytool/lsm.py b/pilot/copytool/lsm.py index 9b182ea4..c8eca99f 100644 --- a/pilot/copytool/lsm.py +++ b/pilot/copytool/lsm.py @@ -19,7 +19,7 @@ # Authors: # - Pavlo Svirin, pavlo.svirin@cern.ch, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2024 """Local site mover copy tool.""" @@ -28,17 +28,17 @@ import os from time import time +from pilot.common.exception import ( + ErrorCodes, + PilotException +) +from pilot.util.container import execute +#from pilot.util.timer import timeout from .common import ( get_copysetup, verify_catalog_checksum, resolve_common_transfer_errors #, get_timeout ) -from pilot.common.exception import ( - PilotException, - ErrorCodes -) -from pilot.util.container import execute -#from pilot.util.timer import timeout logger = logging.getLogger(__name__) @@ -54,7 +54,7 @@ def is_valid_for_copy_in(files: list) -> bool: Placeholder. - :param files: list of FileSpec objects (list). + :param files: list of FileSpec objects (list) :return: always True (for now) (bool). """ # for f in files: @@ -69,7 +69,7 @@ def is_valid_for_copy_out(files: list) -> bool: Placeholder. - :param files: list of FileSpec objects (list). + :param files: list of FileSpec objects (list) :return: always True (for now) (bool). """ # for f in files: @@ -84,8 +84,8 @@ def copy_in(files: list, **kwargs: dict) -> list: :param files: list of `FileSpec` objects (list) :param kwargs: kwargs dictionary (dict) - :raises: PilotException in case of controlled error - :return: updated files (list). + :return: updated files (list) + :raises: PilotException in case of controlled error. """ copytools = kwargs.get('copytools') or [] copysetup = get_copysetup(copytools, 'lsm') @@ -157,8 +157,9 @@ def copy_out(files: list, **kwargs: dict) -> list: Upload given files using lsm copytool. :param files: list of `FileSpec` objects (list) - :raise: PilotException in case of controlled error - :return: updated files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files (list) + :raises: PilotException in case of controlled error. 
""" copytools = kwargs.get('copytools') or [] copysetup = get_copysetup(copytools, 'lsm') @@ -194,16 +195,16 @@ def copy_out(files: list, **kwargs: dict) -> list: checksum = f"adler32:{fspec.checksum.get('adler32')}" # define the command options - opts = {'--size': fspec.filesize, - '-t': token, - '--checksum': checksum, - '--guid': fspec.guid} - opts = " ".join([f"{k} {v}" for (k, v) in list(opts.items())]) + _opts = {'--size': fspec.filesize, + '-t': token, + '--checksum': checksum, + '--guid': fspec.guid} + opts = " ".join([f"{k} {v}" for (k, v) in list(_opts.items())]) logger.info(f"transferring file {fspec.lfn} from {source} to {destination}") nretries = 1 # input parameter to function? - for retry in range(nretries): + for _ in range(nretries): exit_code, stdout, stderr = move(source, destination, dst_in=False, copysetup=copysetup, options=opts) if exit_code != 0: @@ -217,9 +218,9 @@ def copy_out(files: list, **kwargs: dict) -> list: timeEnd=time()) trace_report.send() raise PilotException(error.get('error'), code=error.get('exit_code'), state=error.get('state')) - else: # all successful - logger.info('all successful') - break + + logger.info('all successful') + break fspec.status_code = 0 fspec.status = 'transferred' @@ -250,7 +251,7 @@ def move_all_files_in(files: list, nretries: int = 1) -> (int, str, str): exit_code, stdout, stderr = move(source, destination, dst_in=True) if exit_code != 0: - if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: + if ((exit_code not in (errno.ETIMEDOUT, errno.ETIME)) or ((retry + 1) == nretries)): logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful @@ -280,7 +281,7 @@ def move_all_files_out(files: list, nretries: int = 1) -> (int, str, str): exit_code, stdout, stderr = move(source, destination, dst_in=False) if exit_code != 0: - if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: + if ((exit_code not in (errno.ETIMEDOUT, errno.ETIME)) or ((retry + 1) == nretries)): logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful @@ -297,6 +298,8 @@ def move(source: str, destination: str, dst_in: bool = True, copysetup: str = "" :param source: path to source (str) :param destination: path to destination (str) :param dst_in: True for stage-in, False for stage-out (bool) + :param copysetup: path to copysetup (str) + :param options: additional options (str) :return: exit code (int), stdout (str), stderr (str). """ # copysetup = '/osg/mwt2/app/atlas_app/atlaswn/setup.sh' @@ -321,7 +324,7 @@ def move(source: str, destination: str, dst_in: bool = True, copysetup: str = "" exit_code = ErrorCodes.STAGEINFAILED else: exit_code = ErrorCodes.STAGEOUTFAILED - stdout = 'exception caught: e' % error + stdout = f'exception caught: {error}' stderr = '' logger.warning(stdout) diff --git a/pilot/copytool/mv.py b/pilot/copytool/mv.py index 62f04e73..7c31b6fc 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -45,6 +45,8 @@ def is_valid_for_copy_in(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). 
""" + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -60,6 +62,8 @@ def is_valid_for_copy_out(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -90,7 +94,7 @@ def create_output_list(files: list, init_dir: str): logger.info(f'adding to output.list: {fspec.lfn} {arcturl}') # Write output.list - with open(os.path.join(init_dir, 'output.list'), 'a') as f: + with open(os.path.join(init_dir, 'output.list'), 'a', encoding='utf-8') as f: f.write(f'{fspec.lfn} {arcturl}\n') @@ -128,8 +132,12 @@ def build_final_path(turl: str, prefix: str = 'file://localhost') -> (int, str, except FileExistsError: # ignore if sub dirs already exist pass - except (IsADirectoryError, OSError) as exc: - diagnostics = f'caught exception: {exc}' + except IsADirectoryError as exc: + diagnostics = f'caught IsADirectoryError exception: {exc}' + logger.warning(diagnostics) + return ErrorCodes.MKDIR, diagnostics, path + except OSError as exc: + diagnostics = f'caught OSError exception: {exc}' logger.warning(diagnostics) return ErrorCodes.MKDIR, diagnostics, path else: @@ -168,11 +176,11 @@ def copy_in(files: list, copy_type: str = "symlink", **kwargs: dict) -> list: logger.debug(f"workdir={kwargs.get('workdir')}") logger.debug(f"jobworkdir={kwargs.get('jobworkdir')}") - exit_code, stdout, stderr = move_all_files(files, - copy_type, - kwargs.get('workdir'), - kwargs.get('jobworkdir'), - mvfinaldest=kwargs.get('mvfinaldest', False)) + exit_code, stdout, _ = move_all_files(files, + copy_type, + kwargs.get('workdir'), + kwargs.get('jobworkdir'), + mvfinaldest=kwargs.get('mvfinaldest', False)) if exit_code != 0: # raise failure raise StageInFailure(stdout) @@ -187,8 +195,8 @@ def copy_out(files: list, copy_type: str = "mv", **kwargs: dict) -> list: :param files: list of `FileSpec` objects (list) :param copy_type: copy type (str) :param kwargs: kwargs dictionary (dict) - :raises PilotException: StageOutFailure, MKDirFailure - :return: updated files (list). + :return: updated files (list) + :raises PilotException: StageOutFailure, MKDirFailure. 
""" if copy_type not in ["cp", "mv"]: raise StageOutFailure("incorrect method for copy out") @@ -205,8 +213,7 @@ def copy_out(files: list, copy_type: str = "mv", **kwargs: dict) -> list: # raise failure if exit_code == ErrorCodes.MKDIR: raise MKDirFailure(stdout) - else: - raise StageOutFailure(stdout) + raise StageOutFailure(stdout) # Create output list for ARC CE if necessary logger.debug(f"init_dir for output.list={os.path.dirname(kwargs.get('workdir'))}") @@ -282,9 +289,9 @@ def move_all_files(files: list, copy_type: str, workdir: str, jobworkdir: str, else: fspec.status_code = ErrorCodes.STAGEOUTFAILED break - else: - fspec.status_code = 0 - fspec.status = 'transferred' + + fspec.status_code = 0 + fspec.status = 'transferred' return exit_code, stdout, stderr diff --git a/pilot/copytool/objectstore.py b/pilot/copytool/objectstore.py index 13589fd3..56849984 100644 --- a/pilot/copytool/objectstore.py +++ b/pilot/copytool/objectstore.py @@ -19,7 +19,7 @@ # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2019 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 """Objectstore copy tool.""" @@ -57,6 +57,8 @@ def is_valid_for_copy_in(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -72,6 +74,8 @@ def is_valid_for_copy_out(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -87,15 +91,18 @@ def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> d :param fspec: file spec data (Any) :param protocol: suggested protocol (dict) :param ddmconf: full ddm storage data (dict) + :param kwargs: kwargs dictionary (dict) :return: SURL dictionary {'surl': surl} (dict). """ + if kwargs: # to get rid of pylint warning + pass ddm = ddmconf.get(fspec.ddmendpoint) if not ddm: raise PilotException(f'failed to resolve ddmendpoint by name={fspec.ddmendpoint}') if ddm.is_deterministic: surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn)) - elif ddm.type in ['OS_ES', 'OS_LOGS']: + elif ddm.type in {'OS_ES', 'OS_LOGS'}: surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), fspec.lfn) fspec.protocol_id = protocol.get('id') else: @@ -109,8 +116,9 @@ def copy_in(files: list, **kwargs: dict) -> list: Download given files using rucio copytool. :param files: list of `FileSpec` objects (list) - :raise: PilotException in case of controlled error - :return: updated list of files (list). + :param kwargs: kwargs dictionary (dict) + :return: updated list of files (list) + :raises: PilotException in case of controlled error. """ # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' @@ -166,7 +174,7 @@ def is_new_rucio_version() -> bool: :return: True if new rucio version (bool). 
""" _, stdout, _ = execute('rucio download -h') - return True if '--rses RSES' in stdout else False + return '--rses RSES' in stdout def copy_out(files: list, **kwargs: dict) -> list: diff --git a/pilot/copytool/rucio.py b/pilot/copytool/rucio.py index 786da3db..e526c415 100644 --- a/pilot/copytool/rucio.py +++ b/pilot/copytool/rucio.py @@ -42,13 +42,13 @@ timeout, TimedThread, ) +from pilot.util.config import config +from pilot.util.filehandling import rename_xrdlog from .common import ( resolve_common_transfer_errors, verify_catalog_checksum, get_timeout ) -from pilot.util.config import config -from pilot.util.filehandling import rename_xrdlog logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -68,6 +68,8 @@ def is_valid_for_copy_in(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -83,6 +85,8 @@ def is_valid_for_copy_out(files: list) -> bool: :param files: list of FileSpec objects (list). :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -96,8 +100,8 @@ def copy_in(files: list, **kwargs: dict) -> list: :param files: list of `FileSpec` objects (list) :param kwargs: kwargs dictionary (dict) - :raises: PilotException in case of controlled error - :return: updated files (list). + :return: updated files (list) + :raises: PilotException in case of controlled error. """ ignore_errors = kwargs.get('ignore_errors') trace_report = kwargs.get('trace_report') @@ -149,7 +153,7 @@ def copy_in(files: list, **kwargs: dict) -> list: if not ignore_errors: trace_report.send() msg = f"{fspec.scope}:{fspec.lfn} from {fspec.ddmendpoint}, {error_details.get('error')}" - raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) + raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) from error else: protocol = get_protocol(trace_report_out) trace_report.update(protocol=protocol) @@ -251,10 +255,10 @@ def copy_in_bulk(files: list, **kwargs: dict) -> list: """ Download given files using rucio copytool. - :param files: list of `FileSpec` objects - :param ignore_errors: boolean, if specified then transfer failures will be ignored - :raise: PilotException in case of controlled error - :return: list of done files (list). + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :return: list of done files (list) + :raises: PilotException in case of controlled error. """ #allow_direct_access = kwargs.get('allow_direct_access') ignore_errors = kwargs.get('ignore_errors') @@ -280,6 +284,7 @@ def copy_in_bulk(files: list, **kwargs: dict) -> list: trace_report = deepcopy(trace_common_fields) localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) diagnostics = f'none of the traces received from rucio. 
response from rucio: {error_msg}' + code = 0 for fspec in files: localsite = localsite if localsite else fspec.ddmendpoint trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) @@ -287,8 +292,9 @@ def copy_in_bulk(files: list, **kwargs: dict) -> list: trace_report.update(scope=fspec.scope, dataset=fspec.dataset) trace_report.update('STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, timeEnd=time()) trace_report.send() + code = fspec.status_code # just get one of them for later logger.error(diagnostics) - raise PilotException(diagnostics, code=fspec.status_code, state='STAGEIN_ATTEMPT_FAILED') + raise PilotException(diagnostics, code=code, state='STAGEIN_ATTEMPT_FAILED') from error # VALIDATION AND TERMINATION files_done = [] @@ -358,17 +364,13 @@ def _get_trace(fspec: Any, traces: list) -> list: :return: trace_candidates that correspond to the given file (list). """ try: - try: - trace_candidates = list(filter(lambda t: t['filename'] == fspec.lfn and t['scope'] == fspec.scope, traces)) # Python 2 - except Exception: - trace_candidates = list([t for t in traces if t['filename'] == fspec.lfn and t['scope'] == fspec.scope]) # Python 3 + trace_candidates = list(t for t in traces if t['filename'] == fspec.lfn and t['scope'] == fspec.scope) if trace_candidates: return trace_candidates - else: - logger.warning(f'file does not match to any trace received from Rucio: {fspec.lfn} {fspec.scope}') + logger.warning(f'file does not match to any trace received from Rucio: {fspec.lfn} {fspec.scope}') except Exception as error: logger.warning(f'traces from pilot and rucio could not be merged: {error}') - return [] + return [] #@timeout(seconds=10800) @@ -377,9 +379,9 @@ def copy_out(files: list, **kwargs: dict) -> list: # noqa: C901 Upload given files using rucio copytool. :param files: list of `FileSpec` objects (list) - :param ignore_errors: boolean, if specified then transfer failures will be ignored (bool) - :raise: PilotException in case of controlled error - :return: updated files list (list). + :param kwargs: kwargs dictionary (dict) + :return: updated files list (list) + :raises: PilotException in case of controlled error. 
""" # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' @@ -418,23 +420,21 @@ def copy_out(files: list, **kwargs: dict) -> list: # noqa: C901 rucio_host) #_stage_out_api(fspec, summary_file_path, trace_report, trace_report_out) except PilotException as error: - error_msg = str(error) - error_details = handle_rucio_error(error_msg, trace_report, trace_report_out, fspec, stagein=False) + error_details = handle_rucio_error(str(error), trace_report, trace_report_out, fspec, stagein=False) protocol = get_protocol(trace_report_out) trace_report.update(protocol=protocol) if not ignore_errors: trace_report.send() msg = f" {fspec.scope}:{fspec.lfn} to {fspec.ddmendpoint}, {error_details.get('error')}" - raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) + raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) from error except Exception as error: - error_msg = str(error) - error_details = handle_rucio_error(error_msg, trace_report, trace_report_out, fspec, stagein=False) + error_details = handle_rucio_error(str(error), trace_report, trace_report_out, fspec, stagein=False) protocol = get_protocol(trace_report_out) trace_report.update(protocol=protocol) if not ignore_errors: trace_report.send() msg = f" {fspec.scope}:{fspec.lfn} to {fspec.ddmendpoint}, {error_details.get('error')}" - raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) + raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) from error else: protocol = get_protocol(trace_report_out) trace_report.update(protocol=protocol) @@ -475,12 +475,11 @@ def copy_out(files: list, **kwargs: dict) -> list: # noqa: C901 if not ignore_errors: raise PilotException("Failed to stageout: CRC mismatched", code=fspec.status_code, state=state) + elif local_checksum and checksum and local_checksum == checksum: + logger.info(f'local checksum ({local_checksum}) = remote checksum ({checksum})') else: - if local_checksum and checksum and local_checksum == checksum: - logger.info(f'local checksum ({local_checksum}) = remote checksum ({checksum})') - else: - logger.warning(f'checksum could not be verified: local checksum ({local_checksum}), ' - f'remote checksum ({checksum})') + logger.warning(f'checksum could not be verified: local checksum ({local_checksum}), ' + f'remote checksum ({checksum})') if not fspec.status_code: fspec.status_code = 0 fspec.status = 'transferred' @@ -709,6 +708,9 @@ def _stage_out_api(fspec: Any, summary_file_path: str, trace_report: dict, trace logger.debug(f'summary_file_path={summary_file_path}') logger.debug(f'trace_report_out={trace_report_out}') result = upload_client.upload([_file], summary_file_path=summary_file_path, traces_copy_out=trace_report_out, ignore_availability=True) + except UnboundLocalError: + logger.warning('*** rucio API upload client failed ***') + logger.warning('rucio still needs a bug fix of the summary in the uploadclient') except Exception as error: logger.warning('*** rucio API upload client failed ***') logger.warning(f'caught exception: {error}') @@ -720,9 +722,6 @@ def _stage_out_api(fspec: Any, summary_file_path: str, trace_report: dict, trace if not trace_report_out[0].get('stateReason'): raise error ec = -1 - except UnboundLocalError: - logger.warning('*** rucio API upload client failed ***') - logger.warning('rucio still needs a bug fix of the summary in the 
uploadclient') else: logger.warning('*** rucio API upload client finished ***') logger.debug(f'client returned {result}') diff --git a/pilot/copytool/s3.py b/pilot/copytool/s3.py index 8a7e908d..eaf4d835 100644 --- a/pilot/copytool/s3.py +++ b/pilot/copytool/s3.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 """S3 copy tool.""" @@ -33,12 +33,12 @@ except Exception: pass -from .common import resolve_common_transfer_errors from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException from pilot.info import infosys from pilot.util.config import config from pilot.util.ruciopath import get_rucio_path +from .common import resolve_common_transfer_errors logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -56,9 +56,11 @@ def is_valid_for_copy_in(files: list) -> bool: Placeholder. - :param files: list of FileSpec objects (list). + :param files: list of FileSpec objects (list) :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -71,9 +73,11 @@ def is_valid_for_copy_out(files: list) -> bool: Placeholder. - :param files: list of FileSpec objects (list). + :param files: list of FileSpec objects (list) :return: always True (for now) (bool). """ + if files: # to get rid of pylint warning + pass # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False @@ -130,6 +134,8 @@ def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> d :param kwargs: kwargs dictionary (dict) :return: SURL dictionary {'surl': surl} (dict). """ + if kwargs: # to get rid of pylint warning + pass try: pandaqueue = infosys.pandaqueue except Exception: @@ -143,10 +149,10 @@ def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> d if ddm.is_deterministic: surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn)) - elif ddm.type in ['OS_ES', 'OS_LOGS']: + elif ddm.type in {'OS_ES', 'OS_LOGS'}: try: pandaqueue = infosys.pandaqueue - except Exception: + except AttributeError: pandaqueue = "" if pandaqueue is None: pandaqueue = "" @@ -327,13 +333,12 @@ def copy_out(files: list, **kwargs: dict) -> list: return files -def upload_file(file_name: str, full_url: str, object_name: str = None) -> (bool, str): +def upload_file(file_name: str, full_url: str) -> (bool, str): """ Upload a file to an S3 bucket. :param file_name: file to upload (str) - :param turl: target url to upload to (str) - :param object_name: S3 object name. If not specified then file_name is used (str) + :param full_url: full URL to upload to (str) :return: True if file was uploaded - otherwise False (bool), diagnostics (str). 
""" # upload the file diff --git a/pilot/copytool/xrdcp.py b/pilot/copytool/xrdcp.py index 7a4a6c3b..14897421 100644 --- a/pilot/copytool/xrdcp.py +++ b/pilot/copytool/xrdcp.py @@ -18,7 +18,7 @@ # # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Alexey Anisenkov, anisyonk@cern.ch, 2017 """Xrdcp copy tool.""" @@ -28,10 +28,10 @@ import re from time import time -from .common import resolve_common_transfer_errors, verify_catalog_checksum #, get_timeout from pilot.util.container import execute from pilot.common.exception import PilotException, ErrorCodes #from pilot.util.timer import timeout +from .common import resolve_common_transfer_errors, verify_catalog_checksum #, get_timeout logger = logging.getLogger(__name__) @@ -53,6 +53,8 @@ def is_valid_for_copy_in(files: list) -> bool: # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False + if files: # to get rid of pylint warning + pass return True ## FIX ME LATER @@ -68,10 +70,12 @@ def is_valid_for_copy_out(files: list) -> bool: # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False + if files: # to get rid of pylint warning + pass return True ## FIX ME LATER -def _resolve_checksum_option(setup: str, **kwargs) -> str: +def _resolve_checksum_option(setup: str, **kwargs: dict) -> str: """ Resolve which checksum option to use. @@ -79,13 +83,6 @@ def _resolve_checksum_option(setup: str, **kwargs) -> str: :param kwargs: kwargs dictionary (dict) :return: option (str). """ - cmd = f"{copy_command} --version" - if setup: - cmd = f"source {setup}; {cmd}" - - logger.info(f"execute command ({cmd}) to check xrdcp client version") - - rcode, stdout, stderr = execute(cmd, **kwargs) cmd = f"{copy_command} -h" if setup: cmd = f"source {setup}; {cmd}" @@ -100,13 +97,12 @@ def _resolve_checksum_option(setup: str, **kwargs) -> str: if rcode: logger.error(f'FAILED to execute command={cmd}: {output}') - else: - if "--cksum" in output: - coption = f"--cksum {checksum_type}:print" - elif "-adler" in output and checksum_type == 'adler32': - coption = "-adler" - elif "-md5" in output and checksum_type == 'md5': - coption = "-md5" + elif "--cksum" in output: + coption = f"--cksum {checksum_type}:print" + elif "-adler" in output and checksum_type == 'adler32': + coption = "-adler" + elif "-md5" in output and checksum_type == 'md5': + coption = "-md5" if coption: logger.info(f"use {coption} option to get the checksum for {copy_command} command") @@ -130,6 +126,8 @@ def _stagefile(coption: str, source: str, destination: str, filesize: int, is_st :raises: PilotException in case of controlled error :return: destination file details - file size (int) checksum (str), checksum_type (str). """ + if filesize: # to get rid of pylint warning - could be useful + pass filesize_cmd, checksum_cmd, checksum_type = None, None, None cmd = f'{copy_command} -np -f {coption} {source} {destination}' @@ -173,8 +171,8 @@ def copy_in(files: list, **kwargs: dict) -> list: :param files: list of `FileSpec` objects (list) :param kwargs: kwargs dictionary (dict) - :raises: PilotException in case of controlled error - :return: updated list of files (list). + :return: updated list of files (list) + :raises: PilotException in case of controlled error. 
""" #allow_direct_access = kwargs.get('allow_direct_access') or False setup = kwargs.pop('copytools', {}).get('xrdcp', {}).get('setup') @@ -214,16 +212,16 @@ def copy_in(files: list, **kwargs: dict) -> list: state = 'STAGEIN_ATTEMPT_FAILED' trace_report.update(clientState=state, stateReason=diagnostics, timeEnd=time()) trace_report.send() + raise PilotException(diagnostics, code=fspec.status_code, state=state) from error + + # compare checksums + fspec.checksum[checksum_type] = checksum_cmd # remote checksum + state, diagnostics = verify_catalog_checksum(fspec, destination) + if diagnostics != "": + trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, + timeEnd=time()) + trace_report.send() raise PilotException(diagnostics, code=fspec.status_code, state=state) - else: - # compare checksums - fspec.checksum[checksum_type] = checksum_cmd # remote checksum - state, diagnostics = verify_catalog_checksum(fspec, destination) - if diagnostics != "": - trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, - timeEnd=time()) - trace_report.send() - raise PilotException(diagnostics, code=fspec.status_code, state=state) trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time()) trace_report.send() @@ -263,16 +261,16 @@ def copy_out(files: list, **kwargs: dict) -> list: diagnostics = error.get_detail() trace_report.update(clientState=state, stateReason=diagnostics, timeEnd=time()) trace_report.send() + raise PilotException(diagnostics, code=fspec.status_code, state=state) from error + + # compare checksums + fspec.checksum[checksum_type] = checksum_cmd # remote checksum + state, diagnostics = verify_catalog_checksum(fspec, fspec.surl) + if diagnostics != "": + trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, + timeEnd=time()) + trace_report.send() raise PilotException(diagnostics, code=fspec.status_code, state=state) - else: - # compare checksums - fspec.checksum[checksum_type] = checksum_cmd # remote checksum - state, diagnostics = verify_catalog_checksum(fspec, fspec.surl) - if diagnostics != "": - trace_report.update(clientState=state or 'STAGEIN_ATTEMPT_FAILED', stateReason=diagnostics, - timeEnd=time()) - trace_report.send() - raise PilotException(diagnostics, code=fspec.status_code, state=state) return files From d67cfb5186f42ccce2cf023dabd87d9eecbbec39 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 May 2024 16:32:12 +0200 Subject: [PATCH 36/41] Pylint updates --- .../communicationmanager/communicationmanager.py | 12 ++++++------ .../communicationmanager/plugins/basecommunicator.py | 4 +++- .../plugins/pandacommunicator.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pilot/eventservice/communicationmanager/communicationmanager.py b/pilot/eventservice/communicationmanager/communicationmanager.py index a7087ba9..61f47bb9 100644 --- a/pilot/eventservice/communicationmanager/communicationmanager.py +++ b/pilot/eventservice/communicationmanager/communicationmanager.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2023-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2023-2024 """Main classes to manage the messages between ES and harvester/ACT/Panda.""" @@ -320,8 +320,8 @@ def update_events(self, update_events: Any, post_hook: Any = None) -> Any: :param update_events: update events (Any) :param post_hook: post hook function (Any) - :raises: Exception caught when updating event 
ranges - :return: status of updating event ranges + :return: status of updating event ranges (Any) + :raises: Exception caught when updating event ranges. """ if self.is_stop(): return None @@ -333,7 +333,7 @@ def update_events(self, update_events: Any, post_hook: Any = None) -> Any: self.queues['update_events'].put(req) if req.post_hook: - return + return None while req.response is None: time.sleep(1) @@ -341,8 +341,8 @@ def update_events(self, update_events: Any, post_hook: Any = None) -> Any: raise req.response.exception if req.response.status is False: return None - else: - return req.response.content + + return req.response.content def get_plugin_confs(self) -> dict: """ diff --git a/pilot/eventservice/communicationmanager/plugins/basecommunicator.py b/pilot/eventservice/communicationmanager/plugins/basecommunicator.py index 49ce8f55..d668dc5b 100644 --- a/pilot/eventservice/communicationmanager/plugins/basecommunicator.py +++ b/pilot/eventservice/communicationmanager/plugins/basecommunicator.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 """Base communicator.""" @@ -53,6 +53,8 @@ def __init__(self, *args: Any, **kwargs: dict): :param args: args object (Any) :param kwargs: kwargs dictionary (dict) """ + if args: # to get rid of pylint warning + pass super().__init__() for key, value in kwargs.items(): setattr(self, key, value) diff --git a/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py b/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py index 6b7531f0..65f37d0a 100644 --- a/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py +++ b/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 """PanDA communicator.""" From 39e981e9b9e6a81201788a9b4979f3c886d14b26 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 May 2024 19:08:14 +0200 Subject: [PATCH 37/41] Version update --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 5b61602f..d366ecd9 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.6.5 \ No newline at end of file +3.7.6.6 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b41a4fb1..f2728005 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '5' # build number should be reset to '1' for every new development cycle +BUILD = '6' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 5fd9db3f360391926588611ae0cb59269d7df199 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 22 May 2024 15:53:18 +0200 Subject: [PATCH 38/41] Added error_report --- pilot/util/default.cfg | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index dd09b570..63f715f9 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -17,7 +17,7 @@ # # 
Authors:
 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017
-# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024
 
 ################################
 
@@ -227,6 +227,10 @@ payloadstderr: payload.stderr
 # looping = looping job check
 checks: looping
 
+# The payload may produce an error report with a specific error_code and error_diag.
+# If the file exists, the pilot will use it to report the error.
+error_report: payload_error_report.json
+
 ################################
 # Container parameters
 
From eaf3f187a03545ff4c2614282e96986fd45988bb Mon Sep 17 00:00:00 2001
From: Paul Nilsson
Date: Wed, 22 May 2024 16:21:44 +0200
Subject: [PATCH 39/41] Support for error report

---
 PILOTVERSION             |  2 +-
 pilot/control/payload.py | 45 ++++++++++++++++++++++++++++------------
 pilot/util/constants.py  |  2 +-
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index d366ecd9..3695e848 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.7.6.6
\ No newline at end of file
+3.7.6.7
\ No newline at end of file
diff --git a/pilot/control/payload.py b/pilot/control/payload.py
index f55c9f6d..af8839b9 100644
--- a/pilot/control/payload.py
+++ b/pilot/control/payload.py
@@ -33,6 +33,11 @@ from re import findall, split
 from typing import Any, TextIO
 
+from pilot.common.errorcodes import ErrorCodes
+from pilot.common.exception import (
+    ExcThread,
+    PilotException
+)
 from pilot.control.payloads import (
     generic,
     eventservice,
@@ -41,22 +46,20 @@ from pilot.control.job import send_state
 from pilot.util.auxiliary import set_pilot_state
 from pilot.util.container import execute
-from pilot.util.processes import get_cpu_consumption_time
 from pilot.util.config import config
 from pilot.util.filehandling import (
-    read_file,
-    remove_core_dumps,
-    get_guid,
     extract_lines_from_file,
-    find_file
+    find_file,
+    get_guid,
+    read_file,
+    read_json,
+    remove_core_dumps
 )
-from pilot.util.processes import threads_aborted
-from pilot.util.queuehandling import put_in_queue
-from pilot.common.errorcodes import ErrorCodes
-from pilot.common.exception import (
-    ExcThread,
-    PilotException
+from pilot.util.processes import (
+    get_cpu_consumption_time,
+    threads_aborted
 )
+from pilot.util.queuehandling import put_in_queue
 from pilot.util.realtimelogger import get_realtime_logger
 
 logger = logging.getLogger(__name__)
@@ -616,6 +619,23 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int):
     if exit_code != 0:
         logger.warning(f'main payload execution returned non-zero exit code: {exit_code}')
 
+        # check if the transform has produced an error report
+        path = os.path.join(job.workdir, config.Payload.error_report)
+        if os.path.exists(path):
+            error_report = read_json(path)
+            error_code = error_report.get('error_code')
+            error_diag = error_report.get('error_diag')
+            if error_code:
+                logger.warning(f'{config.Payload.error_report} contained error code: {error_code}')
+                logger.warning(f'{config.Payload.error_report} contained error diag: {error_diag}')
+                job.exeerrorcode = error_code
+                job.exeerrordiag = error_report.get('error_diag')
+                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONFAILURE, msg=error_diag)
+                return
+            logger.info(f'{config.Payload.error_report} exists but did not contain any non-zero error code')
+        else:
+            logger.debug(f'{config.Payload.error_report} does not exist')
+
     # look for singularity/apptainer errors (the exit code can be zero in this case)
     path = os.path.join(job.workdir,
config.Payload.payloadstderr) if os.path.exists(path): @@ -664,9 +684,8 @@ def perform_initial_payload_error_analysis(job: Any, exit_code: int): else: logger.info('main payload execution returned zero exit code') - # check if core dumps exist, if so remove them and return True + # check if core dumps exist, if so remove them if not job.debug: # do not shorten these if-statements - # only return True if found core dump belongs to payload if remove_core_dumps(job.workdir, pid=job.pid): # COREDUMP error will only be set if the core dump belongs to the payload (ie 'core.') logger.warning('setting COREDUMP error') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index f2728005..e00b5ea3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '6' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '6' # build number should be reset to '1' for every new development cycle +BUILD = '7' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 42ed3e86fd40bc73c94b578c4ac21f3e9eb79450 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 29 May 2024 10:36:44 +0200 Subject: [PATCH 40/41] Changed _errorCode to _error_code --- PILOTVERSION | 2 +- pilot.py | 2 +- pilot/common/exception.py | 146 +++++++++++++++++++------------------- pilot/control/payload.py | 2 +- pilot/util/auxiliary.py | 2 +- pilot/util/constants.py | 2 +- 6 files changed, 78 insertions(+), 78 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 3695e848..66566d18 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.6.7 \ No newline at end of file +3.7.6.8 \ No newline at end of file diff --git a/pilot.py b/pilot.py index 37575def..20a3c47d 100755 --- a/pilot.py +++ b/pilot.py @@ -657,7 +657,7 @@ def create_main_work_dir() -> (int, str): f"failed to create workdir at {_mainworkdir} -- aborting: {error}", file=sys.stderr, ) - exitcode = shell_exit_code(error._errorCode) + exitcode = shell_exit_code(error._error_code) else: _mainworkdir = getcwd() diff --git a/pilot/common/exception.py b/pilot/common/exception.py index c6501ad1..a731df73 100644 --- a/pilot/common/exception.py +++ b/pilot/common/exception.py @@ -37,7 +37,7 @@ class PilotException(Exception): """Pilot exceptions class.""" - _errorCode = None + _error_code = None _message = None def __init__(self, *args, **kwargs): @@ -47,10 +47,10 @@ def __init__(self, *args, **kwargs): self.kwargs = kwargs code = self.kwargs.get("code", None) if code: - self._errorCode = code + self._error_code = code else: - self._errorCode = errors.UNKNOWNEXCEPTION - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.UNKNOWNEXCEPTION + self._message = errors.get_error_message(self._error_code) self._error_string = None self._stack_trace = f"{traceback.format_exc()}" @@ -58,12 +58,12 @@ def __str__(self): """Set and return the error string for string representation of the class instance.""" try: self._error_string = ( - f"error code: {self._errorCode}, message: {self._message % self.kwargs}" + f"error code: {self._error_code}, message: {self._message % self.kwargs}" ) except TypeError: # at least get the core message out if something happened self._error_string = ( - f"error code: {self._errorCode}, message: {self._message}" + f"error code: 
{self._error_code}, message: {self._message}" ) if len(self.args) > 0: @@ -82,19 +82,19 @@ def get_detail(self): """Set and return the error string with the exception details.""" try: self._error_string = ( - f"error code: {self._errorCode}, message: {self._message % self.kwargs}" + f"error code: {self._error_code}, message: {self._message % self.kwargs}" ) except TypeError: # at least get the core message out if something happened self._error_string = ( - f"error code: {self._errorCode}, message: {self._message}" + f"error code: {self._error_code}, message: {self._message}" ) return self._error_string + f"\nstacktrace: {self._stack_trace}" def get_error_code(self): """Return the error code.""" - return self._errorCode + return self._error_code def get_last_error(self): """Return the last error message.""" @@ -109,8 +109,8 @@ def get_last_error(self): # """ # def __init__(self, *args, **kwargs): # super().__init__(args, kwargs) -# self._errorCode = errors.NOTIMPLEMENTED -# self._message = errors.get_error_message(self._errorCode) +# self._error_code = errors.NOTIMPLEMENTED +# self._message = errors.get_error_message(self._error_code) class UnknownException(PilotException): @@ -119,8 +119,8 @@ class UnknownException(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.UNKNOWNEXCEPTION - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.UNKNOWNEXCEPTION + self._message = errors.get_error_message(self._error_code) class NoLocalSpace(PilotException): @@ -129,8 +129,8 @@ class NoLocalSpace(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOLOCALSPACE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOLOCALSPACE + self._message = errors.get_error_message(self._error_code) class SizeTooLarge(PilotException): @@ -139,8 +139,8 @@ class SizeTooLarge(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.SIZETOOLARGE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.SIZETOOLARGE + self._message = errors.get_error_message(self._error_code) class StageInFailure(PilotException): @@ -149,8 +149,8 @@ class StageInFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.STAGEINFAILED - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.STAGEINFAILED + self._message = errors.get_error_message(self._error_code) class StageOutFailure(PilotException): @@ -159,8 +159,8 @@ class StageOutFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.STAGEOUTFAILED - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.STAGEOUTFAILED + self._message = errors.get_error_message(self._error_code) class SetupFailure(PilotException): @@ -169,8 +169,8 @@ class SetupFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.SETUPFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.SETUPFAILURE + self._message = 
errors.get_error_message(self._error_code) class RunPayloadFailure(PilotException): @@ -179,8 +179,8 @@ class RunPayloadFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.PAYLOADEXECUTIONFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.PAYLOADEXECUTIONFAILURE + self._message = errors.get_error_message(self._error_code) class MessageFailure(PilotException): @@ -189,8 +189,8 @@ class MessageFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.MESSAGEHANDLINGFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.MESSAGEHANDLINGFAILURE + self._message = errors.get_error_message(self._error_code) class CommunicationFailure(PilotException): @@ -199,8 +199,8 @@ class CommunicationFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.COMMUNICATIONFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.COMMUNICATIONFAILURE + self._message = errors.get_error_message(self._error_code) class FileHandlingFailure(PilotException): @@ -209,8 +209,8 @@ class FileHandlingFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.FILEHANDLINGFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.FILEHANDLINGFAILURE + self._message = errors.get_error_message(self._error_code) class NoSuchFile(PilotException): @@ -219,8 +219,8 @@ class NoSuchFile(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOSUCHFILE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOSUCHFILE + self._message = errors.get_error_message(self._error_code) class ConversionFailure(PilotException): @@ -229,8 +229,8 @@ class ConversionFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.CONVERSIONFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.CONVERSIONFAILURE + self._message = errors.get_error_message(self._error_code) class MKDirFailure(PilotException): @@ -239,8 +239,8 @@ class MKDirFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.MKDIR - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.MKDIR + self._message = errors.get_error_message(self._error_code) class NoGridProxy(PilotException): @@ -249,8 +249,8 @@ class NoGridProxy(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOPROXY - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOPROXY + self._message = errors.get_error_message(self._error_code) class NoVomsProxy(PilotException): @@ -259,8 +259,8 @@ class NoVomsProxy(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - 
self._errorCode = errors.NOVOMSPROXY - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOVOMSPROXY + self._message = errors.get_error_message(self._error_code) class TrfDownloadFailure(PilotException): @@ -269,8 +269,8 @@ class TrfDownloadFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.TRFDOWNLOADFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.TRFDOWNLOADFAILURE + self._message = errors.get_error_message(self._error_code) class NotDefined(PilotException): @@ -279,8 +279,8 @@ class NotDefined(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOTDEFINED - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOTDEFINED + self._message = errors.get_error_message(self._error_code) class NotSameLength(PilotException): @@ -289,8 +289,8 @@ class NotSameLength(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOTSAMELENGTH - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOTSAMELENGTH + self._message = errors.get_error_message(self._error_code) class ESRecoverable(PilotException): @@ -299,8 +299,8 @@ class ESRecoverable(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.ESRECOVERABLE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.ESRECOVERABLE + self._message = errors.get_error_message(self._error_code) class ESFatal(PilotException): @@ -309,8 +309,8 @@ class ESFatal(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.ESFATAL - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.ESFATAL + self._message = errors.get_error_message(self._error_code) class ExecutedCloneJob(PilotException): @@ -319,8 +319,8 @@ class ExecutedCloneJob(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.EXECUTEDCLONEJOB - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.EXECUTEDCLONEJOB + self._message = errors.get_error_message(self._error_code) class ESNoEvents(PilotException): @@ -329,8 +329,8 @@ class ESNoEvents(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.ESNOEVENTS - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.ESNOEVENTS + self._message = errors.get_error_message(self._error_code) class ExceededMaxWaitTime(PilotException): @@ -339,8 +339,8 @@ class ExceededMaxWaitTime(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.EXCEEDEDMAXWAITTIME - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.EXCEEDEDMAXWAITTIME + self._message = errors.get_error_message(self._error_code) class BadXML(PilotException): @@ -349,8 +349,8 @@ class BadXML(PilotException): def __init__(self, *args, **kwargs): 
"""Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.BADXML - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.BADXML + self._message = errors.get_error_message(self._error_code) class NoSoftwareDir(PilotException): @@ -359,8 +359,8 @@ class NoSoftwareDir(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOSOFTWAREDIR - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOSOFTWAREDIR + self._message = errors.get_error_message(self._error_code) class LogFileCreationFailure(PilotException): @@ -369,8 +369,8 @@ class LogFileCreationFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.LOGFILECREATIONFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.LOGFILECREATIONFAILURE + self._message = errors.get_error_message(self._error_code) class QueuedataFailure(PilotException): @@ -379,8 +379,8 @@ class QueuedataFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.QUEUEDATA - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.QUEUEDATA + self._message = errors.get_error_message(self._error_code) class QueuedataNotOK(PilotException): @@ -389,8 +389,8 @@ class QueuedataNotOK(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.QUEUEDATANOTOK - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.QUEUEDATANOTOK + self._message = errors.get_error_message(self._error_code) class ReplicasNotFound(PilotException): @@ -399,8 +399,8 @@ class ReplicasNotFound(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.NOREPLICAS - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.NOREPLICAS + self._message = errors.get_error_message(self._error_code) class MiddlewareImportFailure(PilotException): @@ -409,8 +409,8 @@ class MiddlewareImportFailure(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.MIDDLEWAREIMPORTFAILURE - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.MIDDLEWAREIMPORTFAILURE + self._message = errors.get_error_message(self._error_code) class JobAlreadyRunning(PilotException): @@ -419,8 +419,8 @@ class JobAlreadyRunning(PilotException): def __init__(self, *args, **kwargs): """Set default and initial values.""" super().__init__(args, kwargs) - self._errorCode = errors.JOBALREADYRUNNING - self._message = errors.get_error_message(self._errorCode) + self._error_code = errors.JOBALREADYRUNNING + self._message = errors.get_error_message(self._error_code) def __str__(self): """Return string for exception object.""" diff --git a/pilot/control/payload.py b/pilot/control/payload.py index af8839b9..8ef1c453 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -300,7 +300,7 @@ def execute_payloads(queues: Any, traces: Any, args: Any): # noqa: C901 except (Exception, PilotException) as error: 
                logger.warning(f'exception caught: {error}')
                 if isinstance(error, PilotException):
-                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error._errorCode, msg=error._message)
+                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error._error_code, msg=error._message)
                 elif isinstance(error, str):
                     if 'error code:' in error and 'message:' in error:
                         error_code, diagnostics = extract_error_info(error)
diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py
index 616c0964..a254baab 100644
--- a/pilot/util/auxiliary.py
+++ b/pilot/util/auxiliary.py
@@ -776,7 +776,7 @@ def __init__(self, message: str, timeout: int = None, *args: Any):
         """Initialize variables."""
         self.timeout = timeout
         self.message = message
-        self._errorCode = 1334
+        self._error_code = 1334
         super(TimeoutException, self).__init__(*args)
 
     def __str__(self):
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index e00b5ea3..528081f7 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3'  # released number should be fixed at 3 for Pilot 3
 VERSION = '7'  # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '6'  # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '7'  # build number should be reset to '1' for every new development cycle
+BUILD = '8'  # build number should be reset to '1' for every new development cycle
 
 SUCCESS = 0
 FAILURE = 1

From 94041bce023fd8a4c4c8089433e90639b154350c Mon Sep 17 00:00:00 2001
From: PalNilsson
Date: Wed, 29 May 2024 10:43:33 +0200
Subject: [PATCH 41/41] Dropped Python 3.8 support since unit tests now fail
 due to more modern type hints

---
 .github/workflows/flake8-workflow.yml | 2 +-
 .github/workflows/unit-tests.yml      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/flake8-workflow.yml b/.github/workflows/flake8-workflow.yml
index a4aa5278..979be048 100644
--- a/.github/workflows/flake8-workflow.yml
+++ b/.github/workflows/flake8-workflow.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11"]
     env:
       FLAKE8_VERSION: "==6.1.0"
       FLAKE8_CONFIG: ".flake8"
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 88d5d932..3bff6d38 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -11,7 +11,7 @@ jobs:
     continue-on-error: true
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.9', '3.10', '3.11']
    steps:
      - name: Checkout Pilot3 repo
        uses: actions/checkout@v3
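For reference, the error report mechanism added in PATCH 38/39 defines a simple contract: the payload drops a JSON file, named by config.Payload.error_report (payload_error_report.json), in the job work directory, with error_code and error_diag keys that perform_initial_payload_error_analysis() then picks up via read_json(). A minimal sketch of a transform producing such a report — the error code value and diagnostic message are hypothetical examples, not codes defined by the pilot:

    import json
    import os

    workdir = "."  # stand-in for the actual job work directory (job.workdir in the pilot)
    report = {
        "error_code": 65,  # hypothetical transform-defined code
        "error_diag": "user algorithm crashed (segmentation fault)",  # hypothetical diagnostics
    }
    # write the report where perform_initial_payload_error_analysis() will look for it
    with open(os.path.join(workdir, "payload_error_report.json"), "w", encoding="utf-8") as fh:
        json.dump(report, fh)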
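Similarly, PATCH 40 renames the private PilotException attribute _errorCode to _error_code throughout; internal call sites such as pilot.py and payload.py read it directly, but the class also exposes get_error_code(), so code outside the class need not depend on the private name. A short usage sketch (the triggering message is illustrative only):

    from pilot.common.exception import StageInFailure

    try:
        raise StageInFailure("copy tool could not transfer the input file")
    except StageInFailure as exc:
        # public accessor; equivalent to reading exc._error_code after the rename
        print(f"pilot error code: {exc.get_error_code()}")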