diff --git a/PILOTVERSION b/PILOTVERSION
index 8da3fc2b..f92be120 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.7.0.36
\ No newline at end of file
+3.7.1.50
\ No newline at end of file
diff --git a/pilot.py b/pilot.py
index da37e23d..6f710d06 100755
--- a/pilot.py
+++ b/pilot.py
@@ -19,13 +19,10 @@
 # Authors:
 # - Mario Lassnig, mario.lassnig@cern.ch, 2016-17
 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017
-# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23
+# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24

 """This is the entry point for the PanDA Pilot, executed with 'python3 pilot.py '."""

-from __future__ import print_function  # Python 2 (2to3 complains about this)
-from __future__ import absolute_import
-
 import argparse
 import logging
 import os
@@ -44,6 +41,7 @@
 from pilot.util.auxiliary import (
     pilot_version_banner,
     shell_exit_code,
+    convert_signal_to_exit_code
 )
 from pilot.util.constants import (
     get_pilot_version,
@@ -63,13 +61,15 @@
     is_harvester_mode,
     kill_worker,
 )
+from pilot.util.heartbeat import update_pilot_heartbeat
 from pilot.util.https import (
     get_panda_server,
     https_setup,
     send_update,
 )
-from pilot.util.processgroups import find_defunct_subprocesses
 from pilot.util.loggingsupport import establish_logging
+from pilot.util.networking import dump_ipv6_info
+from pilot.util.processgroups import find_defunct_subprocesses
 from pilot.util.timing import add_to_pilot_timing

 errors = ErrorCodes()
@@ -84,8 +84,9 @@ def main() -> int:
     # get the logger
     logger = logging.getLogger(__name__)

-    # print the pilot version
+    # print the pilot version and other information
     pilot_version_banner()
+    dump_ipv6_info()

     # define threading events
     args.graceful_stop = threading.Event()
@@ -154,6 +155,9 @@ def main() -> int:
         f"pilot.workflow.{args.workflow}", globals(), locals(), [args.workflow], 0
     )

+    # update the pilot heartbeat file
+    update_pilot_heartbeat(time.time())
+
     # execute workflow
     try:
         exitcode = workflow.run(args)
@@ -700,6 +704,19 @@ def wrap_up() -> int:
         if args.harvester:
             kill_worker()

+    exitcode, shellexitcode = get_proper_exit_code()
+    logging.info(f"pilot has finished (exit code={exitcode}, shell exit code={shellexitcode})")
+    logging.shutdown()
+
+    return shellexitcode
+
+
+def get_proper_exit_code() -> (int, int):
+    """
+    Return the proper exit code.
+
+    :return: exit code (int), shell exit code (int).
+ """ try: exitcode = trace.pilot["error_code"] except KeyError: @@ -734,11 +751,11 @@ def wrap_up() -> int: logging.warning(f"failed to convert exit code to int: {exitcode}, {exc}") exitcode = 1008 + if exitcode == 0 and args.signal: + exitcode = convert_signal_to_exit_code(args.signal) sec = shell_exit_code(exitcode) - logging.info(f"pilot has finished (exit code={exitcode}, shell exit code={sec})") - logging.shutdown() - return sec + return exitcode, sec def get_pilot_source_dir() -> str: @@ -837,7 +854,8 @@ def list_zombies(): if __name__ == "__main__": # get the args from the arg parser args = get_args() - args.last_heartbeat = time.time() + args.last_heartbeat = time.time() # keep track of server heartbeats + args.pilot_heartbeat = time.time() # keep track of pilot heartbeats # Define and set the main harvester control boolean args.harvester = is_harvester_mode(args) diff --git a/pilot/api/analytics.py b/pilot/api/analytics.py index 6be02719..d25a237c 100644 --- a/pilot/api/analytics.py +++ b/pilot/api/analytics.py @@ -21,12 +21,14 @@ """Functions for performing analytics including fitting of data.""" +import logging +from typing import Any + from .services import Services from pilot.common.exception import NotDefined, NotSameLength, UnknownException from pilot.util.filehandling import get_table_from_file from pilot.util.math import mean, sum_square_dev, sum_dev, chi2, float_to_rounded_string -import logging logger = logging.getLogger(__name__) @@ -35,25 +37,25 @@ class Analytics(Services): _fit = None - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any): """ - Init function. + Initialize variables. - :param kwargs: + :param kwargs: kwargs dictionary (dict). """ self._fit = None - def fit(self, x, y, model='linear'): + def fit(self, x: list, y: list, model: str = "linear") -> Any: """ - Fitting function. + Fit the given data according to the given model. For a linear model: y(x) = slope * x + intersect - :param x: list of input data (list of floats or ints). - :param y: list of input data (list of floats or ints). - :param model: model name (string). - :raises UnknownException: in case Fit() fails. - :return: fit. + :param x: list of input data (list of floats or ints) (list) + :param y: list of input data (list of floats or ints) (list) + :param model: model name (str) + :raises UnknownException: in case Fit() fails + :return: fit (Any). """ try: self._fit = Fit(x=x, y=y, model=model) @@ -62,7 +64,7 @@ def fit(self, x, y, model='linear'): return self._fit - def slope(self): + def slope(self) -> float: """ Return the slope of a linear fit, y(x) = slope * x + intersect. @@ -72,62 +74,69 @@ def slope(self): if self._fit: slope = self._fit.slope() else: - raise NotDefined('Fit has not been defined') + raise NotDefined("Fit has not been defined") return slope - def intersect(self): + def intersect(self) -> float: """ Return the intersect of a linear fit, y(x) = slope * x + intersect. - :raises NotDefined: exception thrown if fit is not defined. + :raises NotDefined: exception thrown if fit is not defined :return: intersect (float). """ if self._fit: intersect = self._fit.intersect() else: - raise NotDefined('Fit has not been defined') + raise NotDefined("Fit has not been defined") return intersect - def chi2(self): + def chi2(self) -> float: """ Return the chi2 of the fit. - :raises NotDefined: exception thrown if fit is not defined. + :raises NotDefined: exception thrown if fit is not defined :return: chi2 (float). 
""" if self._fit: x2 = self._fit.chi2() else: - raise NotDefined('Fit has not been defined') + raise NotDefined("Fit has not been defined") return x2 - def get_table(self, filename, header=None, separator="\t", convert_to_float=True): + def get_table(self, filename: str, header: str = "", separator: str = "\t", convert_to_float: bool = True) -> dict: """ Return a table from file. - :param filename: full path to input file (string). - :param header: header string. - :param separator: separator character (char). - :param convert_to_float: boolean, if True, all values will be converted to floats. - :return: dictionary. + :param filename: full path to input file (str) + :param header: header (str) + :param separator: separator character (str) + :param convert_to_float: boolean, if True, all values will be converted to floats (bool) + :return: table dictionary (dict). """ - return get_table_from_file(filename, header=header, separator=separator, convert_to_float=convert_to_float) + return get_table_from_file( + filename, + header=header, + separator=separator, + convert_to_float=convert_to_float, + ) - def get_fitted_data(self, filename, x_name='Time', y_name='pss+swap', precision=2, tails=True): + def get_fitted_data( + self, filename: str, x_name: str = "Time", y_name: str = "pss+swap", precision: int = 2, tails: bool = True + ) -> dict: """ Return a properly formatted job metrics string with analytics data. Currently, the function returns a fit for PSS+Swap vs time, whose slope measures memory leaks. - :param filename: full path to memory monitor output (string). - :param x_name: optional string, name selector for table column. - :param y_name: optional string, name selector for table column. - :param precision: optional precision for fitted slope parameter, default 2. - :param tails: should tails (first and last values) be used? (boolean). - :return: {"slope": slope, "chi2": chi2} (float strings with desired precision). + :param filename: full path to memory monitor output (str) + :param x_name: optional string, name selector for table column (str) + :param y_name: optional string, name selector for table column (str) + :param precision: optional precision for fitted slope parameter, default 2 (int) + :param tails: should tails (first and last values) be used? (bool) + :return: {"slope": slope, "chi2": chi2} (dict). 
""" slope = "" chi2 = "" @@ -143,21 +152,27 @@ def get_fitted_data(self, filename, x_name='Time', y_name='pss+swap', precision= itmet = False if len(x) >= 100: - logger.debug('tails will not be removed for large data sample - iterative method will be used instead') + logger.debug( + "tails will not be removed for large data sample - iterative method will be used instead" + ) tails = True itmet = True if not tails and len(x) > 7 and len(y) > 7: - logger.debug('removing tails from data to be fitted') + logger.debug("removing tails from data to be fitted") x = x[5:] x = x[:-2] y = y[5:] y = y[:-2] if not (len(x) > 7 and len(y) > 7) and len(x) == len(y): - logger.warning('wrong length of table data, x=%s, y=%s (must be same and length>=4)', str(x), str(y)) + logger.warning( + "wrong length of table data, x=%s, y=%s (must be same and length>=4)", + str(x), + str(y), + ) else: - logger.info('fitting %s vs %s', y_name, x_name) + logger.info("fitting %s vs %s", y_name, x_name) if itmet: norg = len(x) @@ -183,17 +198,28 @@ def get_fitted_data(self, filename, x_name='Time', y_name='pss+swap', precision= fit = self.fit(x, y) _slope = self.slope() except Exception as exc: - logger.warning('failed to fit data, x=%s, y=%s: %s', str(x), str(y), exc) + logger.warning( + "failed to fit data, x=%s, y=%s: %s", str(x), str(y), exc + ) else: if _slope: - slope = float_to_rounded_string(fit.slope(), precision=precision) + slope = float_to_rounded_string( + fit.slope(), precision=precision + ) chi2 = float_to_rounded_string(fit.chi2(), precision=precision) if slope != "": - logger.info('current memory leak: %s B/s (using %d data points, chi2=%s)', slope, len(x), chi2) + logger.info( + "current memory leak: %s B/s (using %d data points, chi2=%s)", + slope, + len(x), + chi2, + ) return {"slope": slope, "chi2": chi2} - def find_limit(self, _x, _y, _chi2_org, norg, change_limit=0.25, edge="right", steps=5): + def find_limit( + self, _x, _y, _chi2_org, norg, change_limit=0.25, edge="right", steps=5 + ): """Use an iterative approach to find the limits of the distributions that can be used for the final fit.""" _chi2_prev = _chi2_org found = False @@ -209,12 +235,12 @@ def find_limit(self, _x, _y, _chi2_org, norg, change_limit=0.25, edge="right", s try: fit = self.fit(_x, _y) except Exception as exc: - logger.warning(f'caught exception: {exc}') + logger.warning(f"caught exception: {exc}") break _chi2 = fit.chi2() change = (_chi2_prev - _chi2) / _chi2_prev - logger.info(f'current chi2={_chi2} (change={change * 100} %)') + logger.info(f"current chi2={_chi2} (change={change * 100} %)") if change < change_limit: found = True break @@ -224,17 +250,17 @@ def find_limit(self, _x, _y, _chi2_org, norg, change_limit=0.25, edge="right", s if edge == "right": if not found: limit = norg - 1 - logger.warning('right removable region not found') + logger.warning("right removable region not found") else: limit = len(_x) - 1 - logger.info(f'right removable region: {limit}') + logger.info(f"right removable region: {limit}") else: if not found: limit = 0 - logger.info('left removable region not found') + logger.info("left removable region not found") else: limit = iterations * 10 - logger.info(f'left removable region: {limit}') + logger.info(f"left removable region: {limit}") return limit @@ -248,16 +274,16 @@ def extract_from_table(self, table, x_name, y_name): :return: x (list), y (list). 
""" x = table.get(x_name, []) - if '+' not in y_name: + if "+" not in y_name: y = table.get(y_name, []) else: try: - y1_name = y_name.split('+')[0] - y2_name = y_name.split('+')[1] + y1_name = y_name.split("+")[0] + y2_name = y_name.split("+")[1] y1_value = table.get(y1_name, []) y2_value = table.get(y2_name, []) except Exception as error: - logger.warning('exception caught: %s', error) + logger.warning("exception caught: %s", error) x = [] y = [] else: @@ -270,7 +296,7 @@ def extract_from_table(self, table, x_name, y_name): class Fit(object): """Low-level fitting class.""" - _model = 'linear' # fitting model + _model = "linear" # fitting model _x = None # x values _y = None # y values _xm = None # x mean @@ -289,18 +315,18 @@ def __init__(self, **kwargs): :raises PilotException: NotImplementedError for unknown fitting model, NotDefined if input data not defined. """ # extract parameters - self._model = kwargs.get('model', 'linear') - self._x = kwargs.get('x', None) - self._y = kwargs.get('y', None) + self._model = kwargs.get("model", "linear") + self._x = kwargs.get("x", None) + self._y = kwargs.get("y", None) if not self._x or not self._y: - raise NotDefined('input data not defined') + raise NotDefined("input data not defined") if len(self._x) != len(self._y): - raise NotSameLength('input data (lists) have different lengths') + raise NotSameLength("input data (lists) have different lengths") # base calculations - if self._model == 'linear': + if self._model == "linear": self._ss = sum_square_dev(self._x) self._ss2 = sum_dev(self._x, self._y) self.set_slope() @@ -309,7 +335,7 @@ def __init__(self, **kwargs): self.set_intersect() self.set_chi2() else: - logger.warning("\'%s\' model is not implemented", self._model) + logger.warning("'%s' model is not implemented", self._model) raise NotImplementedError() def fit(self): @@ -336,11 +362,11 @@ def set_chi2(self): """ y_observed = self._y y_expected = [] - #i = 0 + # i = 0 for x in self._x: - #y_expected.append(self.value(x) - y_observed[i]) + # y_expected.append(self.value(x) - y_observed[i]) y_expected.append(self.value(x)) - #i += 1 + # i += 1 if y_observed and y_observed != [] and y_expected and y_expected != []: self._chi2 = chi2(y_observed, y_expected) else: diff --git a/pilot/api/data.py b/pilot/api/data.py index 80c1fa25..1905c125 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -120,7 +120,7 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ self.trace_report = trace_report if trace_report else TraceReport(pq=os.environ.get('PILOT_SITENAME', ''), ipv=self.ipv, workdir=self.workdir) if not self.acopytools: - msg = 'failed to initilize StagingClient: no acopytools options found, acopytools=%s' % self.acopytools + msg = f'failed to initilize StagingClient: no acopytools options found, acopytools={self.acopytools}' logger.error(msg) self.trace_report.update(clientState='BAD_COPYTOOL', stateReason=msg) self.trace_report.send() @@ -165,7 +165,7 @@ def get_preferred_replica(self, replicas, allowed_schemas): for replica in replicas: pfn = replica.get('pfn') for schema in allowed_schemas: - if pfn and (not schema or pfn.startswith('%s://' % schema)): + if pfn and (not schema or pfn.startswith(f'{schema}://')): return replica def prepare_sources(self, files, activities=None): @@ -522,12 +522,12 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 try: if name not in self.copytool_modules: - raise PilotException('passed unknown copytool with name=%s .. 
skipped' % name, + raise PilotException(f'passed unknown copytool with name={name} .. skipped', code=ErrorCodes.UNKNOWNCOPYTOOL) module = self.copytool_modules[name]['module_name'] self.logger.info('trying to use copytool=%s for activity=%s', name, activity) - copytool = __import__('pilot.copytool.%s' % module, globals(), locals(), [module], 0) + copytool = __import__(f'pilot.copytool.{module}', globals(), locals(), [module], 0) #self.trace_report.update(protocol=name) except PilotException as exc: @@ -575,7 +575,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 self.logger.warning('caught time-out exception: %s', caught_errors[0]) else: code = ErrorCodes.STAGEINFAILED if self.mode == 'stage-in' else ErrorCodes.STAGEOUTFAILED # is it stage-in/out? - details = str(caught_errors) + ":" + 'failed to transfer files using copytools=%s' % copytools + details = str(caught_errors) + ":" + f'failed to transfer files using copytools={copytools}' self.logger.fatal(details) raise PilotException(details, code=code) @@ -610,7 +610,7 @@ def require_protocols(self, files, copytool, activity, local_dir=''): protocols = self.resolve_protocol(fspec, allowed_schemas) if not protocols and 'mv' not in self.infosys.queuedata.copytools: # no protocols found - error = 'Failed to resolve protocol for file=%s, allowed_schemas=%s, fspec=%s' % (fspec.lfn, allowed_schemas, fspec) + error = f'Failed to resolve protocol for file={fspec.lfn}, allowed_schemas={allowed_schemas}, fspec={fspec}' self.logger.error("resolve_protocol: %s", error) raise PilotException(error, code=ErrorCodes.NOSTORAGEPROTOCOL) @@ -644,7 +644,7 @@ def resolve_protocols(self, files): for fdat in files: ddm = ddmconf.get(fdat.ddmendpoint) if not ddm: - error = 'Failed to resolve output ddmendpoint by name=%s (from PanDA), please check configuration.' % fdat.ddmendpoint + error = f'Failed to resolve output ddmendpoint by name={fdat.ddmendpoint} (from PanDA), please check configuration.' 
self.logger.error("resolve_protocols: %s, fspec=%s", error, fdat) raise PilotException(error, code=ErrorCodes.NOSTORAGE) @@ -677,7 +677,7 @@ def resolve_protocol(self, fspec, allowed_schemas=None): allowed_schemas = allowed_schemas or [None] for schema in allowed_schemas: for pdat in fspec.protocols: - if schema is None or pdat.get('endpoint', '').startswith("%s://" % schema): + if schema is None or pdat.get('endpoint', '').startswith(f"{schema}://"): protocols.append(pdat) return protocols @@ -731,7 +731,7 @@ def resolve_replica(self, fspec, primary_schemas=None, allowed_schemas=None, dom schemas = 'any' if not allowed_schemas[0] else ','.join(allowed_schemas) pschemas = 'any' if primary_schemas and not primary_schemas[0] else ','.join(primary_schemas or []) - error = 'Failed to find replica for file=%s, domain=%s, allowed_schemas=%s, pschemas=%s, fspec=%s' % (fspec.lfn, domain, schemas, pschemas, fspec) + error = f'Failed to find replica for file={fspec.lfn}, domain={domain}, allowed_schemas={schemas}, pschemas={pschemas}, fspec={fspec}' self.logger.info("resolve_replica: %s", error) return None @@ -778,7 +778,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 """ if getattr(copytool, 'require_replicas', False) and files: if files[0].replicas is None: # look up replicas only once - files = self.resolve_replicas(files, use_vp=kwargs['use_vp']) + files = self.resolve_replicas(files, use_vp=kwargs.get('use_vp', False)) allowed_schemas = getattr(copytool, 'allowed_schemas', None) @@ -787,7 +787,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 allowed_schemas = self.infosys.queuedata.resolve_allowed_schemas(activity, copytool_name) or allowed_schemas # overwrite allowed_schemas for VP jobs - if kwargs['use_vp']: + if kwargs.get('use_vp', False): allowed_schemas = ['root'] self.logger.debug('overwrote allowed_schemas for VP job: %s', str(allowed_schemas)) @@ -824,7 +824,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 self.logger.info("[stage-in] No WAN replica found for lfn=%s, primary_schemas=%s, allowed_schemas=%s", fspec.lfn, primary_schemas, allowed_schemas) if not replica: - raise ReplicasNotFound('No replica found for lfn=%s (allow_lan=%s, allow_wan=%s)' % (fspec.lfn, fspec.allow_lan, fspec.allow_wan)) + raise ReplicasNotFound(f'No replica found for lfn={fspec.lfn} (allow_lan={fspec.allow_lan}, allow_wan={fspec.allow_wan})') if replica.get('pfn'): fspec.turl = replica['pfn'] @@ -839,7 +839,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # prepare files (resolve protocol/transfer url) if getattr(copytool, 'require_input_protocols', False) and files: - self.require_protocols(files, copytool, activity, local_dir=kwargs['input_dir']) + self.require_protocols(files, copytool, activity, local_dir=kwargs.get('input_dir')) # mark direct access files with status=remote_io self.set_status_for_direct_access(files, kwargs.get('workdir', '')) @@ -851,7 +851,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 return files if not copytool.is_valid_for_copy_in(remain_files): - msg = 'input is not valid for transfers using copytool=%s' % copytool + msg = f'input is not valid for transfers using copytool={copytool}' self.logger.warning(msg) self.logger.debug('input: %s', remain_files) self.trace_report.update(clientState='NO_REPLICA', stateReason=msg) @@ -1058,7 +1058,7 @@ def get_path(self, scope, lfn, prefix='rucio'): :param lfn: 
repliva LFN (str) :param prefix: prefix (str). """ - s = '%s:%s' % (scope, lfn) + s = f'{scope}:{lfn}' hash_hex = hashlib.md5(s.encode('utf-8')).hexdigest() #paths = [prefix] + scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] @@ -1085,7 +1085,7 @@ def resolve_surl(self, fspec, protocol, ddmconf, **kwargs): # consider only deterministic sites (output destination) - unless local input/output ddm = ddmconf.get(fspec.ddmendpoint) if not ddm: - raise PilotException('Failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint) + raise PilotException(f'Failed to resolve ddmendpoint by name={fspec.ddmendpoint}') # path = protocol.get('path', '').rstrip('/') # if not (ddm.is_deterministic or (path and path.endswith('/rucio'))): diff --git a/pilot/api/memorymonitor.py b/pilot/api/memorymonitor.py index e46c900e..deed3282 100644 --- a/pilot/api/memorymonitor.py +++ b/pilot/api/memorymonitor.py @@ -49,7 +49,7 @@ def __init__(self, **kwargs): self.workdir = getcwd() if self.user: - user_utility = __import__('pilot.user.%s.utilities' % self.user, globals(), locals(), [self.user], 0) # Python 2/3 + user_utility = __import__(f'pilot.user.{self.user}.utilities', globals(), locals(), [self.user], 0) # Python 2/3 self._cmd = user_utility.get_memory_monitor_setup(self.pid, self.workdir) def get_command(self): diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 49a234f6..86801ce8 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -23,7 +23,7 @@ """Error codes set by the pilot.""" import re -from typing import Any +from typing import Any, Tuple, List class ErrorCodes: @@ -36,8 +36,8 @@ class ErrorCodes: """ # global variables shared by all modules/jobs - pilot_error_codes = [] - pilot_error_diags = [] + pilot_error_codes: List[int] = [] + pilot_error_diags: List[str] = [] # Error code constants (from Pilot 1) GENERALERROR = 1008 @@ -83,6 +83,7 @@ class ErrorCodes: USERKILL = 1205 # reserved error code, currently not used by pilot SIGBUS = 1206 SIGUSR1 = 1207 + SIGINT = 1208 MISSINGINSTALLATION = 1211 PAYLOADOUTOFMEMORY = 1212 REACHEDMAXTIME = 1213 @@ -220,6 +221,7 @@ class ErrorCodes: SIGXCPU: "Job killed by signal: SIGXCPU", SIGUSR1: "Job killed by signal: SIGUSR1", SIGBUS: "Job killed by signal: SIGBUS", + SIGINT: "Job killed by signal: SIGINT", USERKILL: "Job killed by user", MISSINGINSTALLATION: "Missing installation", PAYLOADOUTOFMEMORY: "Payload ran out of memory", @@ -349,7 +351,7 @@ def get_error_message(self, errorcode: int) -> str: """ return self._error_messages.get(errorcode, f"unknown error code: {errorcode}") - def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None) -> (list, list): + def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None) -> Tuple[list, list]: """ Add pilot error code to list of error codes. @@ -379,7 +381,7 @@ def add_error_code(self, errorcode: int, priority: bool = False, msg: Any = None return pilot_error_codes, pilot_error_diags - def remove_error_code(self, errorcode: int) -> (list, list): + def remove_error_code(self, errorcode: int) -> Tuple[list, list]: """ Silently remove an error code and its diagnostics from the internal error lists. @@ -554,11 +556,11 @@ def format_diagnostics(self, code: int, diag: str) -> str: return error_message @classmethod - def is_recoverable(self, code: int = 0) -> bool: + def is_recoverable(cls, code: int = 0) -> bool: """ Determine whether code is a recoverable error code or not. 
:param code: Pilot error code (int) :return: is recoverable error (bool). """ - return code in self.recoverable_error_codes + return code in cls.recoverable_error_codes diff --git a/pilot/common/exception.py b/pilot/common/exception.py index b5f967f4..373e4aa5 100644 --- a/pilot/common/exception.py +++ b/pilot/common/exception.py @@ -26,6 +26,7 @@ import threading import traceback from sys import exc_info +from typing import Callable, Any, Dict from .errorcodes import ErrorCodes errors = ErrorCodes() @@ -417,20 +418,23 @@ def __str__(self): class ExcThread(threading.Thread): """Support class that allows for catching exceptions in threads.""" - def __init__(self, bucket, target, kwargs, name): + def __init__(self, bucket: Any, target: Callable, kwargs: Dict[str, Any], name: str): """ Set data members. Init function with a bucket that can be used to communicate exceptions to the caller. The bucket is a Queue.queue() or queue.Queue() object that can hold an exception thrown by a thread. - :param bucket: queue based bucket. - :param target: target function to execute. - :param kwargs: target function options. + :param bucket: queue based bucket (Any) + :param target: target function to execute (Callable) + :param kwargs: target function options (dict) + :param name: name (str). """ threading.Thread.__init__(self, target=target, kwargs=kwargs, name=name) self.name = name self.bucket = bucket + self._target = target + self._kwargs = kwargs def run(self): """ @@ -443,7 +447,7 @@ def run(self): job.control(). """ try: - self._target(**self._kwargs) + self.target(**self.kwargs) except Exception: # logger object can't be used here for some reason: # IOError: [Errno 2] No such file or directory: '/state/partition1/scratch/PanDA_Pilot2_*/pilotlog.txt' @@ -459,6 +463,16 @@ def run(self): time.sleep(10) args.graceful_stop.set() + @property + def target(self) -> Callable: + """Help Pyright understand the type for self._target.""" + return self._target + + @property + def kwargs(self) -> Dict[str, Any]: + """Help Pyright understand the type for self._kwargs.""" + return self._kwargs + def get_bucket(self): """ Return the bucket object that holds any information about thrown exceptions. 
diff --git a/pilot/common/pluginfactory.py b/pilot/common/pluginfactory.py index 3030e6c7..934ad56a 100644 --- a/pilot/common/pluginfactory.py +++ b/pilot/common/pluginfactory.py @@ -22,7 +22,7 @@ """A factory to manage plugins.""" -from typing import Any +from typing import Any, Dict import logging logger = logging.getLogger(__name__) @@ -32,7 +32,7 @@ class PluginFactory: def __init__(self, *args: Any, **kwargs: Any): """Set initial values.""" - self.classMap = {} + self.classMap: Dict[str, Any] = {} def get_plugin(self, confs: dict) -> dict: """ @@ -44,7 +44,7 @@ def get_plugin(self, confs: dict) -> dict: class_name = confs['class'] if class_name is None: logger.error(f"class is not defined in confs: {confs}") - return None + return {} if class_name not in self.classMap: logger.info(f"trying to import {class_name}") diff --git a/pilot/control/data.py b/pilot/control/data.py index 473ca5a0..121e6263 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -23,9 +23,9 @@ # - Wen Guan, wen.guan@cern.ch, 2018 # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -import copy as objectcopy +"""Control interface to data API.""" + import os -import subprocess import time import queue from typing import Any @@ -90,8 +90,14 @@ errors = ErrorCodes() -def control(queues, traces, args): +def control(queues: Any, traces: Any, args: Any): + """ + Set up data control threads. + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). + """ targets = {'copytool_in': copytool_in, 'copytool_out': copytool_out, 'queue_monitoring': queue_monitoring} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 @@ -99,24 +105,29 @@ def control(queues, traces, args): [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function - while not args.graceful_stop.is_set(): - for thread in threads: - bucket = thread.get_bucket() - try: - exc = bucket.get(block=False) - except queue.Empty: - pass - else: - exc_type, exc_obj, exc_trace = exc - logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) + try: + while not args.graceful_stop.is_set(): + for thread in threads: + bucket = thread.get_bucket() + try: + exc = bucket.get(block=False) + except queue.Empty: + pass + else: + exc_type, exc_obj, exc_trace = exc + logger.warning("thread \'%s\' received an exception from bucket: %s", thread.name, exc_obj) - # deal with the exception - # .. + # deal with the exception + # .. - thread.join(0.1) - time.sleep(0.1) + thread.join(0.1) + time.sleep(0.1) - time.sleep(0.5) + time.sleep(0.5) + except Exception as exc: + logger.warning(f"exception caught while handling threads: {exc}") + finally: + logger.info('all data control threads have been joined') logger.debug('data control ending since graceful_stop has been set') if args.abort_job.is_set(): @@ -137,31 +148,28 @@ def control(queues, traces, args): logger.info('[data] control thread has finished') -def skip_special_files(job): +def skip_special_files(job: Any): """ Consult user defined code if any files should be skipped during stage-in. + ATLAS code will skip DBRelease files e.g. as they should already be available in CVMFS. - :param job: job object. - :return: + :param job: job object (Any). 
""" - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) try: user.update_stagein(job) except Exception as error: logger.warning('caught exception: %s', error) -def update_indata(job): +def update_indata(job: Any): """ - In case file were marked as no_transfer files, remove them from stage-in. + Remove files marked as no_transfer files from stage-in. - :param job: job object. - :return: + :param job: job object (Any). """ - toberemoved = [] for fspec in job.indata: if fspec.status == 'no_transfer': @@ -171,15 +179,14 @@ def update_indata(job): job.indata.remove(fspec) -def get_trace_report_variables(job, label='stage-in'): +def get_trace_report_variables(job: Any, label: str = 'stage-in') -> (str, str, str): """ Get some of the variables needed for creating the trace report. - :param job: job object - :param label: 'stage-[in|out]' (string). - :return: event_type (string), localsite (string), remotesite (string). + :param job: job object (Any) + :param label: 'stage-[in|out]' (str) + :return: event_type (str), localsite (str), remotesite (str). """ - event_type = "get_sm" if label == 'stage-in' else "put_sm" if job.is_analysis(): event_type += "_a" @@ -189,15 +196,14 @@ def get_trace_report_variables(job, label='stage-in'): return event_type, localsite, remotesite -def create_trace_report(job, label='stage-in'): +def create_trace_report(job: Any, label: str = 'stage-in') -> Any: """ Create the trace report object. - :param job: job object. - :param label: 'stage-[in|out]' (string). - :return: trace report object. + :param job: job object (Any) + :param label: 'stage-[in|out]' (str) + :return: trace report object (Any). """ - event_type, localsite, remotesite = get_trace_report_variables(job, label=label) trace_report = TraceReport(pq=os.environ.get('PILOT_SITENAME', ''), localSite=localsite, remoteSite=remotesite, dataset="", eventType=event_type, workdir=job.workdir) @@ -206,11 +212,14 @@ def create_trace_report(job, label='stage-in'): return trace_report -def _stage_in(args, job): - """ - :return: True in case of success +def _stage_in(args: Any, job: Any) -> bool: """ + Call the stage-in client. + :param args: pilot args object (Any) + :param job: job object (Any) + :return: True in case of success, False otherwise (bool). + """ # tested ok: #logger.info('testing sending SIGUSR1') #import signal @@ -296,18 +305,17 @@ def _stage_in(args, job): return not remain_files -def get_proper_input_destination(workdir, input_destination_dir): +def get_proper_input_destination(workdir: str, input_destination_dir: str) -> str: """ Return the proper input file destination. Normally this would be the job.workdir, unless an input file destination has been set with pilot option --input-file-destination (which should be set for stager workflow). - :param workdir: job work directory (string). - :param input_destination_dir: optional input file destination (string). - :return: input file destination (string). + :param workdir: job work directory (str) + :param input_destination_dir: optional input file destination (str) + :return: input file destination (str). 
""" - if input_destination_dir: if not os.path.exists(input_destination_dir): logger.warning(f'input file destination does not exist: {input_destination_dir} (defaulting to {workdir})') @@ -322,16 +330,16 @@ def get_proper_input_destination(workdir, input_destination_dir): return destination -def get_rse(data, lfn=""): +def get_rse(data: Any, lfn: str = "") -> str: """ Return the ddmEndPoint corresponding to the given lfn. + If lfn is not provided, the first ddmEndPoint will be returned. - :param data: FileSpec list object. - :param lfn: local file name (string). - :return: rse (string) + :param data: FileSpec list object (Any) + :param lfn: local file name (str) + :return: rse (str). """ - rse = "" if lfn == "": @@ -353,156 +361,13 @@ def get_rse(data, lfn=""): return rse -def stage_in_auto(files): - """ - Separate dummy implementation for automatic stage-in outside of pilot workflows. - Should be merged with regular stage-in functionality later, but we need to have - some operational experience with it first. - Many things to improve: - - separate file error handling in the merged case - - auto-merging of files with same destination into single copytool call - """ - - # don't spoil the output, we depend on stderr parsing - os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' - - executable = ['/usr/bin/env', - 'rucio', '-v', 'download', - '--no-subdir'] - - # quickly remove non-existing destinations - for _file in files: - if not os.path.exists(_file['destination']): - _file['status'] = 'failed' - _file['errmsg'] = 'Destination directory does not exist: %s' % _file['destination'] - _file['errno'] = 1 - else: - _file['status'] = 'running' - _file['errmsg'] = 'File not yet successfully downloaded.' - _file['errno'] = 2 - - for _file in files: - if _file['errno'] == 1: - continue - - tmp_executable = objectcopy.deepcopy(executable) - - tmp_executable += ['--dir', _file['destination']] - tmp_executable.append('%s:%s' % (_file['scope'], - _file['name'])) - process = subprocess.Popen(tmp_executable, - bufsize=-1, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - _file['errno'] = 2 - while True: - time.sleep(0.5) - exit_code = process.poll() - if exit_code is not None: - _, stderr = process.communicate() - if exit_code == 0: - _file['status'] = 'done' - _file['errno'] = 0 - _file['errmsg'] = 'File successfully downloaded.' - else: - _file['status'] = 'failed' - _file['errno'] = 3 - try: - # the Details: string is set in rucio: lib/rucio/common/exception.py in __str__() - _file['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] - except Exception as error: - _file['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % error - break - else: - continue - - return files - - -def stage_out_auto(files): - """ - Separate dummy implementation for automatic stage-out outside of pilot workflows. - Should be merged with regular stage-out functionality later, but we need to have - some operational experience with it first. 
- """ - - # don't spoil the output, we depend on stderr parsing - os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' - - executable = ['/usr/bin/env', - 'rucio', '-v', 'upload'] - - # quickly remove non-existing destinations - for _file in files: - if not os.path.exists(_file['file']): - _file['status'] = 'failed' - _file['errmsg'] = 'Source file does not exist: %s' % _file['file'] - _file['errno'] = 1 - else: - _file['status'] = 'running' - _file['errmsg'] = 'File not yet successfully uploaded.' - _file['errno'] = 2 - - for _file in files: - if _file['errno'] == 1: - continue - - tmp_executable = objectcopy.deepcopy(executable) - - tmp_executable += ['--rse', _file['rse']] - - if 'no_register' in list(_file.keys()) and _file['no_register']: # Python 2/3 - tmp_executable += ['--no-register'] - - if 'summary' in list(_file.keys()) and _file['summary']: # Python 2/3 - tmp_executable += ['--summary'] - - if 'lifetime' in list(_file.keys()): # Python 2/3 - tmp_executable += ['--lifetime', str(_file['lifetime'])] - - if 'guid' in list(_file.keys()): # Python 2/3 - tmp_executable += ['--guid', _file['guid']] - - if 'attach' in list(_file.keys()): # Python 2/3 - tmp_executable += ['--scope', _file['scope'], '%s:%s' % (_file['attach']['scope'], _file['attach']['name']), _file['file']] - else: - tmp_executable += ['--scope', _file['scope'], _file['file']] - - process = subprocess.Popen(tmp_executable, bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - _file['errno'] = 2 - while True: - time.sleep(0.5) - exit_code = process.poll() - if exit_code is not None: - _, stderr = process.communicate() - if exit_code == 0: - _file['status'] = 'done' - _file['errno'] = 0 - _file['errmsg'] = 'File successfully uploaded.' - else: - _file['status'] = 'failed' - _file['errno'] = 3 - try: - # the Details: string is set in rucio: lib/rucio/common/exception.py in __str__() - _file['errmsg'] = [detail for detail in stderr.split('\n') if detail.startswith('Details:')][0][9:-1] - except Exception as error: - _file['errmsg'] = 'Could not find rucio error message details - please check stderr directly: %s' % error - break - else: - continue - - return files - - -def write_output(filename, output): +def write_output(filename: str, output: str): """ Write command output to file. - :param filename: file name (string). - :param output: command stdout/stderr (string). - :return: + :param filename: file name (str) + :param output: command stdout/stderr (str). """ - try: write_file(filename, output, unique=True) except PilotException as error: @@ -511,34 +376,33 @@ def write_output(filename, output): logger.debug('wrote %s', filename) -def write_utility_output(workdir, step, stdout, stderr): +def write_utility_output(workdir: str, step: str, stdout: str, stderr: str): """ Write the utility command output to stdout, stderr files to the job.workdir for the current step. + -> _stdout.txt, _stderr.txt Example of step: xcache. - :param workdir: job workdir (string). - :param step: utility step (string). - :param stdout: command stdout (string). - :param stderr: command stderr (string). - :return: + :param workdir: job workdir (str) + :param step: utility step (str) + :param stdout: command stdout (str) + :param stderr: command stderr (str). 
""" - # dump to files write_output(os.path.join(workdir, step + '_stdout.txt'), stdout) write_output(os.path.join(workdir, step + '_stderr.txt'), stderr) -def copytool_in(queues, traces, args): # noqa: C901 +def copytool_in(queues: Any, traces: Any, args: Any): # noqa: C901 """ Call the stage-in function and put the job object in the proper queue. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: - """ + Main stage-in thread. + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). + """ abort = False while not args.graceful_stop.is_set() and not abort: time.sleep(0.5) @@ -554,7 +418,7 @@ def copytool_in(queues, traces, args): # noqa: C901 # does the user want to execute any special commands before stage-in? pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) # Python 2/3 cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN) if cmd: _, stdout, stderr = execute(cmd.get('command')) @@ -608,7 +472,7 @@ def copytool_in(queues, traces, args): # noqa: C901 if os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') == 'generic': pilot_user = os.environ.get('PILOT_USER', 'generic').lower() try: - user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.metadata', globals(), locals(), [pilot_user], 0) file_dictionary = get_input_file_dictionary(job.indata) xml = user.create_input_file_metadata(file_dictionary, job.workdir) logger.info('created input file metadata:\n%s', xml) @@ -676,17 +540,16 @@ def copytool_in(queues, traces, args): # noqa: C901 logger.info('[data] copytool_in thread has finished') -def copytool_out(queues, traces, args): # noqa: C901 +def copytool_out(queues: Any, traces: Any, args: Any): # noqa: C901 """ - Main stage-out thread. Perform stage-out as soon as a job object can be extracted from the data_out queue. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: - """ + Main stage-out thread. + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). + """ cont = True if args.graceful_stop.is_set(): logger.debug('graceful_stop already set - do not start copytool_out thread') @@ -760,17 +623,17 @@ def copytool_out(queues, traces, args): # noqa: C901 logger.info('[data] copytool_out thread has finished') -def is_already_processed(queues, processed_jobs): +def is_already_processed(queues: Any, processed_jobs: list) -> bool: """ Skip stage-out in case the job has already been processed. + This should not be necessary so this is a fail-safe but it seems there is a case when a job with multiple output files enters the stage-out more than once. - :param queues: queues object. - :param processed_jobs: list of already processed jobs. 
- :return: True if stage-out queues contain a job object that has already been processed. + :param queues: queues object (Any) + :param processed_jobs: list of already processed jobs (list) + :return: True if stage-out queues contain a job object that has already been processed, False otherwise (bool). """ - snapshots = list(queues.finished_data_out.queue) + list(queues.failed_data_out.queue) jobids = [obj.jobid for obj in snapshots] found = False @@ -786,17 +649,18 @@ def is_already_processed(queues, processed_jobs): return found -def get_input_file_dictionary(indata): +def get_input_file_dictionary(indata: list) -> dict: """ Return an input file dictionary. + Format: {'guid': 'pfn', ..} + Normally use_turl would be set to True if direct access is used. Note: any environment variables in the turls will be expanded - :param indata: list of FileSpec objects. - :return: file dictionary. + :param indata: list of FileSpec objects (list) + :return: file dictionary (dict). """ - ret = {} for fspec in indata: @@ -811,23 +675,23 @@ def get_input_file_dictionary(indata): return ret -def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], output_files=[], piloterrors=[], debugmode=False): +def create_log(workdir: str, logfile_name: str, tarball_name: str, cleanup: bool, input_files: list = [], + output_files: list = [], piloterrors: list = [], debugmode: bool = False): """ Create the tarball for the job. - :param workdir: work directory for the job (string). - :param logfile_name: log file name (string). - :param tarball_name: tarball name (string). - :param cleanup: perform cleanup (Boolean). - :param input_files: list of input files to remove (list). - :param output_files: list of output files to remove (list). - :param piloterrors: list of Pilot assigned error codes (list). - :param debugmode: True if debug mode has been switched on (Boolean). + :param workdir: work directory for the job (str) + :param logfile_name: log file name (str) + :param tarball_name: tarball name (str) + :param cleanup: perform cleanup (bool) + :param input_files: list of input files to remove (list) + :param output_files: list of output files to remove (list) + :param piloterrors: list of Pilot assigned error codes (list) + :param debugmode: True if debug mode has been switched on (bool) :raises LogFileCreationFailure: in case of log file creation problem. - :return: """ - logger.debug(f'preparing to create log file (debug mode={debugmode})') + logger.debug(f'workdir: {workdir}') # PILOT_HOME is the launch directory of the pilot (or the one specified in pilot options as pilot workdir) pilot_home = os.environ.get('PILOT_HOME', os.getcwd()) @@ -835,6 +699,8 @@ def create_log(workdir, logfile_name, tarball_name, cleanup, input_files=[], out if pilot_home != current_dir: os.chdir(pilot_home) + logger.debug(f'current_dir: {current_dir}') + # copy special files if they exist (could be made experiment specific if there's a need for it) copy_special_files(workdir) @@ -905,9 +771,10 @@ def copy_special_files(tardir: str): """ # general pattern, typically xrdlog.txt. 
The pilot might produce multiple files, xrdlog.txt-LFN1..N xrd_logfile = os.environ.get('XRD_LOGFILE', None) + pilot_home = os.environ.get('PILOT_HOME', None) + if xrd_logfile: # xrootd is then expected to have produced a corresponding log file - pilot_home = os.environ.get('PILOT_HOME', None) if pilot_home: #suffix = Path(xrd_logfile).suffix # .txt stem = Path(xrd_logfile).stem # xrdlog @@ -926,16 +793,25 @@ def copy_special_files(tardir: str): else: logger.warning(f'cannot look for {xrd_logfile} since PILOT_HOME was not set') + path = os.path.join(pilot_home, config.Pilot.pilot_heartbeat_file) + if os.path.exists(path): + try: + copy(path, tardir) + except (NoSuchFile, FileHandlingFailure) as exc: + logger.warning(f'caught exception when copying {path}: {exc}') + else: + logger.warning(f'cannot find pilot heartbeat file: {path}') + def get_tar_timeout(dirsize: float) -> int: """ Get a proper time-out limit based on the directory size. + It should also handle the case dirsize=None and return the max timeout. - :param dirsize: directory size (float). + :param dirsize: directory size (float) :return: time-out in seconds (int). """ - timeout_max = 3 * 3600 # 3 hours timeout_min = 30 timeout = timeout_min + int(60.0 + dirsize / 5.0) if dirsize else timeout_max @@ -943,24 +819,24 @@ def get_tar_timeout(dirsize: float) -> int: return min(timeout, timeout_max) -def _do_stageout(job, xdata, activity, queue, title, output_dir='', rucio_host='', ipv='IPv6'): +def _do_stageout(job: Any, xdata: list, activity: list, queue: str, title: str, output_dir: str = '', + rucio_host: str = '', ipv: str = 'IPv6') -> bool: """ Use the `StageOutClient` in the Data API to perform stage-out. The rucio host is internally set by Rucio via the client config file. This can be set directly as a pilot option --rucio-host. - :param job: job object. - :param xdata: list of FileSpec objects. - :param activity: copytool activity or preferred list of activities to resolve copytools - :param queue: PanDA queue (string). - :param title: type of stage-out (output, log) (string). - :param output_dir: optional output directory (string). - :param rucio_host: optional rucio host (string). - :param ipv: internet protocol version (string). - :return: True in case of success transfers + :param job: job object (Any) + :param xdata: list of FileSpec objects (list) + :param activity: copytool activity or preferred list of activities to resolve copytools (list) + :param queue: PanDA queue (str) + :param title: type of stage-out (output, log) (str) + :param output_dir: optional output directory (str) + :param rucio_host: optional rucio host (str) + :param ipv: internet protocol version (str) + :return: True in case of success transfers, False otherwise (bool). """ - logger.info('prepare to stage-out %d %s file(s)', len(xdata), title) label = 'stage-out' @@ -1040,14 +916,14 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir='', rucio_host=' def _stage_out_new(job: Any, args: Any) -> bool: """ - Stage-out of all output files. + Stage out all output files. + If job.stageout=log then only log files will be transferred. - :param job: job object - :param args: pilot args object + :param job: job object (Any) + :param args: pilot args object (Any) :return: True in case of success, False otherwise (bool). 
""" - #logger.info('testing sending SIGUSR1') #import signal #os.kill(os.getpid(), signal.SIGUSR1) @@ -1140,13 +1016,13 @@ def _stage_out_new(job: Any, args: Any) -> bool: return is_success -def generate_fileinfo(job): +def generate_fileinfo(job: Any) -> dict: """ Generate fileinfo details to be sent to Panda. - :param job: job object. + :param job: job object (Any) + :return: file info (dict). """ - fileinfo = {} checksum_type = config.File.checksum_type if config.File.checksum_type == 'adler32' else 'md5sum' for iofile in job.outdata + job.logdata: @@ -1159,16 +1035,16 @@ def generate_fileinfo(job): return fileinfo -def queue_monitoring(queues, traces, args): +def queue_monitoring(queues: Any, traces: Any, args: Any): """ - Monitoring of Data queues. + Monitor data queues. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: - """ + Thread. + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any) + """ while True: # will abort when graceful_stop has been set time.sleep(0.5) if traces.pilot['command'] == 'abort': diff --git a/pilot/control/interceptor.py b/pilot/control/interceptor.py index 09e2e6e9..401e942f 100644 --- a/pilot/control/interceptor.py +++ b/pilot/control/interceptor.py @@ -21,24 +21,27 @@ # Note: leave this module for now - the code might be useful for reuse +"""Interceptor module, currently unused.""" + import time import queue +import logging +from typing import Any from pilot.common.exception import ExcThread from pilot.util.processes import threads_aborted -import logging logger = logging.getLogger(__name__) -def run(args): +def run(args: Any): """ + Set up all interceptor threads. + Main execution function for the interceptor communication layer. - :param args: pilot arguments. - :returns: + :param args: pilot arguments (Any) """ - targets = {'receive': receive, 'send': send} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'args': args}, name=name) for name, target in list(targets.items())] # Python 2/3 @@ -75,14 +78,12 @@ def run(args): logger.debug('[interceptor] run thread has finished') -def receive(args): +def receive(args: Any): """ Look for interceptor messages. - :param args: Pilot args object. - :return: + :param args: Pilot args object (Any). """ - while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -96,14 +97,12 @@ def receive(args): logger.debug('[interceptor] receive thread has finished') -def send(args): +def send(args: Any): """ Send message to interceptor. - :param args: Pilot args object. - :return: + :param args: Pilot args object (Any). """ - while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -115,3 +114,45 @@ def send(args): logger.debug('will not set job_aborted yet') logger.debug('[interceptor] receive send has finished') + + +# implement if necessary +# def interceptor(queues: Any, traces: Any, args: Any): +# """ +# +# :param queues: internal queues for job handling. +# :param traces: tuple containing internal pilot states. +# :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). 
+# :return: +# """ +# +# # overall loop counter (ignoring the fact that more than one job may be running) +# counter = 0 +# while not args.graceful_stop.is_set(): +# time.sleep(0.1) +# +# # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) +# # (abort at the end of the loop) +# abort = should_abort(args, label='job:interceptor') +# +# # check for any abort_job requests +# abort_job = check_for_abort_job(args, caller='interceptor') +# if not abort_job: +# # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function +# jobs = queues.monitored_payloads.queue +# if jobs: +# for _ in range(len(jobs)): +# logger.info(f'interceptor loop {counter}: looking for communication file') +# time.sleep(30) +# +# counter += 1 +# +# if abort or abort_job: +# break +# +# # proceed to set the job_aborted flag? +# if threads_aborted(caller='interceptor'): +# logger.debug('will proceed to set job_aborted') +# args.job_aborted.set() +# +# logger.info('[job] interceptor thread has finished') diff --git a/pilot/control/job.py b/pilot/control/job.py index 103b6aac..b0cd32d0 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -19,9 +19,11 @@ # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 # - Wen Guan, wen.guan@cern.ch, 2018 +"""Job module with functions for job handling.""" + from __future__ import print_function # Python 2 import os @@ -30,23 +32,57 @@ import logging import queue from collections import namedtuple - from json import dumps from glob import glob +from typing import Any +from urllib.parse import parse_qsl from pilot.common.errorcodes import ErrorCodes -from pilot.common.exception import ExcThread, PilotException, FileHandlingFailure -from pilot.info import infosys, JobData, InfoService, JobInfoProvider +from pilot.common.exception import ( + ExcThread, + PilotException, + FileHandlingFailure +) +from pilot.info import ( + infosys, + JobData, + InfoService, + JobInfoProvider +) from pilot.util import https from pilot.util.activemq import ActiveMQ -from pilot.util.auxiliary import get_batchsystem_jobid, get_job_scheduler_id, \ - set_pilot_state, get_pilot_state, check_for_final_server_update, pilot_version_banner, is_virtual_machine, \ - has_instruction_sets, locate_core_file, get_display_info, encode_globaljobid +from pilot.util.auxiliary import ( + get_batchsystem_jobid, + get_job_scheduler_id, + set_pilot_state, + get_pilot_state, + check_for_final_server_update, + pilot_version_banner, + is_virtual_machine, + has_instruction_sets, + locate_core_file, + get_display_info, + encode_globaljobid +) from pilot.util.config import config -from pilot.util.common import should_abort, was_pilot_killed -from pilot.util.constants import PILOT_MULTIJOB_START_TIME, PILOT_PRE_GETJOB, PILOT_POST_GETJOB, PILOT_KILL_SIGNAL, LOG_TRANSFER_NOT_DONE, \ - LOG_TRANSFER_IN_PROGRESS, LOG_TRANSFER_DONE, LOG_TRANSFER_FAILED, SERVER_UPDATE_TROUBLE, SERVER_UPDATE_FINAL, \ - SERVER_UPDATE_UPDATING, SERVER_UPDATE_NOT_DONE +from pilot.util.common import ( + should_abort, + was_pilot_killed +) +from pilot.util.constants import ( + PILOT_MULTIJOB_START_TIME, + PILOT_PRE_GETJOB, + PILOT_POST_GETJOB, + PILOT_KILL_SIGNAL, + LOG_TRANSFER_NOT_DONE, + LOG_TRANSFER_IN_PROGRESS, + LOG_TRANSFER_DONE, + LOG_TRANSFER_FAILED, + SERVER_UPDATE_TROUBLE, + SERVER_UPDATE_FINAL, + 
SERVER_UPDATE_UPDATING, + SERVER_UPDATE_NOT_DONE +) from pilot.util.container import execute from pilot.util.filehandling import ( find_text_files, @@ -59,63 +95,101 @@ write_json, get_total_input_size ) -from pilot.util.harvester import request_new_jobs, remove_job_request_file, parse_job_definition_file, \ - is_harvester_mode, get_worker_attributes_file, publish_job_report, publish_work_report, get_event_status_file, \ +from pilot.util.harvester import ( + request_new_jobs, + remove_job_request_file, + parse_job_definition_file, + is_harvester_mode, + get_worker_attributes_file, + publish_job_report, + publish_work_report, + get_event_status_file, publish_stageout_files +) from pilot.util.jobmetrics import get_job_metrics from pilot.util.loggingsupport import establish_logging from pilot.util.math import mean, float_to_rounded_string from pilot.util.middleware import containerise_general_command -from pilot.util.monitoring import job_monitor_tasks, check_local_space +from pilot.util.monitoring import ( + job_monitor_tasks, + check_local_space +) from pilot.util.monitoringtime import MonitoringTime -from pilot.util.processes import cleanup, threads_aborted, kill_process, kill_processes, kill_defunct_children +from pilot.util.processes import ( + cleanup, + threads_aborted, + kill_process, + kill_processes, + kill_defunct_children +) from pilot.util.proxy import get_distinguished_name -from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue +from pilot.util.queuehandling import ( + scan_for_jobs, + put_in_queue, + queue_report, + purge_queue +) from pilot.util.realtimelogger import cleanup as rtcleanup -from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp -from pilot.util.workernode import get_disk_space, collect_workernode_info, get_node_name, get_cpu_model, get_cpu_cores, get_cpu_arch +from pilot.util.timing import ( + add_to_pilot_timing, + timing_report, + get_postgetjob_time, + get_time_since, + time_stamp +) +from pilot.util.workernode import ( + get_disk_space, + collect_workernode_info, + get_node_name, + get_cpu_model, + get_cpu_cores, + get_cpu_arch +) logger = logging.getLogger(__name__) errors = ErrorCodes() -def control(queues, traces, args): +def control(queues: Any, traces: Any, args: Any): """ - Main function of job control. + Set up job control threads. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments (e.g. 
containing queue name, queuedata dictionary, etc) (Any) """ - targets = {'validate': validate, 'retrieve': retrieve, 'create_data_payload': create_data_payload, 'queue_monitor': queue_monitor, 'job_monitor': job_monitor, 'fast_job_monitor': fast_job_monitor, 'message_listener': message_listener} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, name=name) for name, target in list(targets.items())] - [thread.start() for thread in threads] + _ = [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function - while not args.graceful_stop.is_set(): - for thread in threads: - bucket = thread.get_bucket() - try: - exc = bucket.get(block=False) - except queue.Empty: - pass - else: - _, exc_obj, _ = exc - logger.warning(f"thread \'{thread.name}\' received an exception from bucket: {exc_obj}") + try: + while not args.graceful_stop.is_set(): + for thread in threads: + bucket = thread.get_bucket() + try: + exc = bucket.get(block=False) + except queue.Empty: + pass + else: + _, exc_obj, _ = exc + logger.warning(f"thread \'{thread.name}\' received an exception from bucket: {exc_obj}") - # deal with the exception - # .. + # deal with the exception + # .. - thread.join(0.1) - time.sleep(0.1) + thread.join(0.1) + time.sleep(0.1) - time.sleep(0.5) + time.sleep(0.5) + except Exception as exc: + logger.warning(f"exception caught while handling threads: {exc}") + finally: + logger.info('all job control threads have been joined') logger.debug('job control ending since graceful_stop has been set') if args.abort_job.is_set(): @@ -139,14 +213,13 @@ def control(queues, traces, args): #os.kill(os.getpid(), signal.SIGBUS) -def _validate_job(job): +def _validate_job(job: Any) -> bool: """ Verify job parameters for specific problems. - :param job: job object. - :return: Boolean. + :param job: job object (Any) + :return: True if job has been verified, False otherwise (bool). """ - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) container = __import__(f'pilot.user.{pilot_user}.container', globals(), locals(), [user], 0) @@ -161,18 +234,17 @@ def _validate_job(job): return user.verify_job(job) -def verify_error_code(job): +def verify_error_code(job: Any): """ Make sure an error code is properly set. + This makes sure that job.piloterrorcode is always set for a failed/holding job, that not only job.piloterrorcodes are set but not job.piloterrorcode. This function also negates the sign of the error code and sets job state 'holding' (instead of 'failed') if the error is found to be recoverable by a later job (user jobs only). - :param job: job object. - :return: + :param job: job object (Any). """ - if job.piloterrorcode == 0 and len(job.piloterrorcodes) > 0: logger.warning(f'piloterrorcode set to first piloterrorcodes list entry: {job.piloterrorcodes}') job.piloterrorcode = job.piloterrorcodes[0] @@ -188,23 +260,23 @@ def verify_error_code(job): logger.info('verified error code') -def get_proper_state(job, state): +def get_proper_state(job: Any, state: str) -> str: """ Return a proper job state to send to server. + This function should only return 'starting', 'running', 'finished', 'holding' or 'failed'. If the internal job.serverstate is not yet set, it means it is the first server update, ie 'starting' should be sent. - :param job: job object. 
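The reworked control() above supervises its worker threads through per-thread exception "buckets". A minimal standalone sketch of that pattern (not the pilot's own ExcThread class; run_with_bucket and flaky_worker are made-up names for illustration):

import queue
import sys
import threading
import time

def run_with_bucket(target, bucket, **kwargs):
    # park any exception so the supervising loop can inspect it later
    try:
        target(**kwargs)
    except Exception:
        bucket.put(sys.exc_info())

def flaky_worker(graceful_stop):
    time.sleep(0.2)
    raise ValueError('simulated worker failure')

graceful_stop = threading.Event()
bucket = queue.Queue()
worker = threading.Thread(target=run_with_bucket, args=(flaky_worker, bucket),
                          kwargs={'graceful_stop': graceful_stop})
worker.start()

while not graceful_stop.is_set():
    try:
        _, exc_obj, _ = bucket.get(block=False)
    except queue.Empty:
        pass
    else:
        print(f'worker raised: {exc_obj}')
        graceful_stop.set()
    worker.join(0.1)
    time.sleep(0.1)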
- :param state: internal pilot state (string). - :return: valid server state (string). + :param job: job object (Any) + :param state: internal pilot state (str) + :return: valid server state (str). """ - - if job.serverstate in ('finished', 'failed'): + if job.serverstate in {'finished', 'failed'}: pass elif job.serverstate == "" and state != "finished" and state != "failed": job.serverstate = 'starting' - elif state in ('finished', 'failed', 'holding'): + elif state in {'finished', 'failed', 'holding'}: job.serverstate = state else: job.serverstate = 'running' @@ -212,18 +284,17 @@ def get_proper_state(job, state): return job.serverstate -def publish_harvester_reports(state, args, data, job, final): +def publish_harvester_reports(state: str, args: Any, data: dict, job: Any, final: bool) -> bool: """ Publish all reports needed by Harvester. - :param state: job state (string). - :param args: pilot args object. - :param data: data structure for server update (dictionary). - :param job: job object. - :param final: is this the final update? (Boolean). - :return: True if successful, False otherwise (Boolean). + :param state: job state (str) + :param args: pilot args object (Any) + :param data: data structure for server update (dict) + :param job: job object (Any) + :param final: is this the final update? (bool) + :return: True if successful, False otherwise (bool). """ - # write part of the heartbeat message to worker attributes files needed by Harvester path = get_worker_attributes_file(args) @@ -251,44 +322,43 @@ def publish_harvester_reports(state, args, data, job, final): if publish_job_report(job, args, config.Payload.jobreport): logger.debug('wrote job report file') return True - else: - logger.warning('failed to write job report file') - return False + + logger.warning('failed to write job report file') + return False else: logger.info('finished writing various report files in Harvester mode') return True -def write_heartbeat_to_file(data): +def write_heartbeat_to_file(data: dict) -> bool: """ Write heartbeat dictionary to file. + This is only done when server updates are not wanted. - :param data: server data (dictionary). - :return: True if successful, False otherwise (Boolean). + :param data: server data (dict) + :return: True if successful, False otherwise (bool). """ - path = os.path.join(os.environ.get('PILOT_HOME'), config.Pilot.heartbeat_message) if write_json(path, data): logger.debug(f'heartbeat dictionary: {data}') logger.debug(f'wrote heartbeat to file: {path}') return True - else: - return False + + return False -def is_final_update(job, state, tag='sending'): +def is_final_update(job: Any, state: str, tag: str = 'sending') -> bool: """ - Will it be the final server update? + Determine if it will be the final server update. - :param job: job object. - :param state: job state (Boolean). - :param tag: optional tag ('sending'/'writing') (string). - :return: final state (Boolean). + :param job: job object (Any) + :param state: job state (str) + :param tag: optional tag ('sending'/'writing') (str) + :return: final state (bool). 
""" - - if state in ('finished', 'failed', 'holding'): + if state in {'finished', 'failed', 'holding'}: final = True os.environ['SERVER_UPDATE'] = SERVER_UPDATE_UPDATING logger.info(f'job {job.jobid} has {state} - {tag} final server update') @@ -308,20 +378,21 @@ def is_final_update(job, state, tag='sending'): return final -def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False): +def send_state(job: Any, args: Any, state: str, xml: str = "", metadata: str = "", + test_tobekilled: bool = False) -> bool: """ Update the server (send heartbeat message). + Interpret and handle any server instructions arriving with the updateJob back channel. - :param job: job object. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :param state: job state (string). - :param xml: optional metadata xml (string). - :param metadata: job report metadata read as a string. - :param test_tobekilled: emulate a tobekilled command (boolean). - :return: boolean (True if successful, False otherwise). + :param job: job object (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param state: job state (str) + :param xml: optional metadata xml (str) + :param metadata: job report metadata read as a string (str) + :param test_tobekilled: emulate a tobekilled command (bool) + :return: True if successful, False otherwise (bool). """ - # insert out of batch time error code if MAXTIME has been reached if os.environ.get('REACHED_MAXTIME', None): msg = 'the max batch system time limit has been reached' @@ -331,7 +402,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) job.state = state state = get_proper_state(job, state) - if state == 'finished' or state == 'holding' or state == 'failed': + if state in {'finished', 'holding', 'failed'}: logger.info(f'this job has now completed (state={state})') # job.completed = True - do not set that here (only after the successful final server update) elif args.pod and args.workflow == 'stager': @@ -346,7 +417,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) final = is_final_update(job, state, tag='sending' if args.update_server else 'writing') # build the data structure needed for updateJob - data = get_data_structure(job, state, args, xml=xml, metadata=metadata, final=final) + data = get_data_structure(job, state, args, xml=xml, metadata=metadata) logger.debug(f'data={data}') # write the heartbeat message to file if the server is not to be updated by the pilot (Nordugrid mode) @@ -354,9 +425,9 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) # if in harvester mode write to files required by harvester if is_harvester_mode(args): return publish_harvester_reports(state, args, data, job, final) - else: - # store the file in the main workdir - return write_heartbeat_to_file(data) + + # store the file in the main workdir + return write_heartbeat_to_file(data) if config.Pilot.pandajob != 'real': logger.info('skipping job update for fake test job') @@ -377,7 +448,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) if final and os.path.exists(job.workdir): # ignore if workdir doesn't exist - might be a delayed jobUpdate os.environ['SERVER_UPDATE'] = SERVER_UPDATE_FINAL - if state == 'finished' or state == 'holding' or state == 'failed': + if state in {'finished', 'holding', 'failed'}: logger.info(f'setting job as completed (state={state})') job.completed 
= True @@ -389,9 +460,10 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) return False -def get_job_status_from_server(job_id, url, port): +def get_job_status_from_server(job_id: int, url: str, port: str) -> (str, int, int): """ Return the current status of job from the dispatcher. + typical dispatcher response: 'status=finished&StatusCode=0' StatusCode 0: succeeded 10: time-out 20: general error 30: failed In the case of time-out, the dispatcher will be asked one more time after 10 s. - :param job_id: PanDA job id (int). - :param url: PanDA server URL (string). - :param port: PanDA server port (int). - :return: status (string; e.g. holding), attempt_nr (int), status_code (int) + :param job_id: PanDA job id (int) + :param url: PanDA server URL (str) + :param port: PanDA server port (str) + :return: status (string; e.g. holding), attempt_nr (int), status_code (int). """ - status = 'unknown' attempt_nr = 0 status_code = 0 @@ -424,7 +495,7 @@ while trial <= max_trials: try: # open connection - ret = https.request('{pandaserver}/server/panda/getStatus'.format(pandaserver=pandaserver), data=data) + ret = https.request(f'{pandaserver}/server/panda/getStatus', data=data) response = ret[1] logger.info(f"response: {response}") if response: @@ -458,30 +529,30 @@ else: if status_code == 0: # success break - elif status_code == 10: # time-out + if status_code == 10: # time-out trial += 1 time.sleep(10) continue - elif status_code == 20: # other error + if status_code == 20: # other error if ret[0] == 13056 or ret[0] == '13056': logger.warning(f"wrong certificate used with curl operation? (encountered error {ret[0]})") break - else: # general error - break + + # general error + break return status, attempt_nr, status_code -def get_debug_command(cmd): +def get_debug_command(cmd: str) -> (bool, str): """ Identify and filter the given debug command. Note: only a single command will be allowed from a predefined list: tail, ls, gdb, ps, du. - :param cmd: raw debug command from job definition (string). - :return: debug_mode (Boolean, True if command is deemed ok), debug_command (string). + :param cmd: raw debug command from job definition (str) + :return: True if command is deemed ok, False otherwise (bool), debug_command (str). """ - debug_mode = False debug_command = "" @@ -506,20 +577,19 @@ else: debug_mode = True debug_command = cmd + return debug_mode, debug_command -def handle_backchannel_command(res, job, args, test_tobekilled=False): +def handle_backchannel_command(res: dict, job: Any, args: Any, test_tobekilled: bool = False) -> None: """ - Does the server update contain any backchannel information? if so, update the job object. + Check if the server update contains any backchannel information. If so, update the job object. - :param res: server response (dictionary). - :param job: job object. - :param args: pilot args object. - :param test_tobekilled: emulate a tobekilled command (boolean).
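get_job_status_from_server() above polls the getStatus endpoint and retries once when the dispatcher reports a time-out. A rough sketch of the same parse-and-retry idea, with the HTTPS call replaced by a stand-in callable and only the documented 'status=...&StatusCode=...' format assumed:

from urllib.parse import parse_qsl
import time

def parse_status(response: str) -> tuple:
    # 'status=finished&StatusCode=0' -> ('finished', 0)
    fields = dict(parse_qsl(response, keep_blank_values=True))
    return fields.get('status', 'unknown'), int(fields.get('StatusCode', 0))

def poll_status(fetch, max_trials: int = 2) -> tuple:
    status, status_code = 'unknown', 0
    for _ in range(max_trials):
        status, status_code = parse_status(fetch())
        if status_code != 10:   # 10 = time-out, ask once more after a pause
            break
        time.sleep(1)           # 10 s in the pilot, shortened here
    return status, status_code

# toy usage with a canned response instead of the real HTTPS call
print(poll_status(lambda: 'status=finished&StatusCode=0'))   # ('finished', 0)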
- :return: + :param res: server response (dict) + :param job: job object (Any) + :param args: pilot args object (Any) + :param test_tobekilled: emulate a tobekilled command (bool) """ - if test_tobekilled: logger.info('faking a \'tobekilled\' command') res['command'] = 'tobekilled' @@ -583,23 +653,22 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): # job.debug_command = 'gdb --pid % -ex \'generate-core-file\'' -def add_data_structure_ids(data, version_tag, job): +def add_data_structure_ids(data: dict, version_tag: str, job: Any) -> dict: """ Add pilot, batch and scheduler ids to the data structure for getJob, updateJob. - :param data: data structure (dict). - :param version_tag: Pilot version tag (string). - :param job: job object. - :return: updated data structure (dict), batchsystem_id (string|None). + :param data: data structure (dict) + :param version_tag: Pilot version tag (str) + :param job: job object (Any) + :return: updated data structure (dict). """ - schedulerid = get_job_scheduler_id() if schedulerid: data['schedulerID'] = schedulerid # update the jobid in the pilotid if necessary (not for ATLAS since there should be one batch log for all multi-jobs) pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) pilotid = user.get_pilot_id(data['jobId']) if pilotid: pilotversion = os.environ.get('PILOT_VERSION') @@ -607,29 +676,27 @@ def add_data_structure_ids(data, version_tag, job): if not job.batchid: job.batchtype, job.batchid = get_batchsystem_jobid() if job.batchtype and job.batchid: - data['pilotID'] = "%s|%s|%s|%s" % (pilotid, job.batchtype, version_tag, pilotversion) + data['pilotID'] = f"{pilotid}|{job.batchtype}|{version_tag}|{pilotversion}" data['batchID'] = job.batchid else: - data['pilotID'] = "%s|%s|%s" % (pilotid, version_tag, pilotversion) + data['pilotID'] = f"{pilotid}|{version_tag}|{pilotversion}" else: logger.warning('pilotid not available') return data -def get_data_structure(job, state, args, xml=None, metadata=None, final=False): # noqa: C901 +def get_data_structure(job: Any, state: str, args: Any, xml: str = "", metadata: str = "") -> dict: # noqa: C901 """ Build the data structure needed for updateJob. - :param job: job object. - :param state: state of the job (string). - :param args: Pilot args object. - :param xml: optional XML string. - :param metadata: job report metadata read as a string. - :param final: is this for the final server update? (Boolean). - :return: data structure (dictionary). + :param job: job object (Any) + :param state: state of the job (str) + :param args: Pilot args object (Any) + :param xml: optional XML string (str) + :param metadata: job report metadata read as a string (str) + :return: data structure (dict). """ - data = {'jobId': job.jobid, 'state': state, 'timestamp': time_stamp(), @@ -722,25 +789,24 @@ def get_data_structure(job, state, args, xml=None, metadata=None, final=False): data['jobMetrics'] = job_metrics # add timing info if finished or failed - if state == 'finished' or state == 'failed': + if state in {'finished', 'failed'}: add_timing_and_extracts(data, job, state, args) https.add_error_codes(data, job) return data -def process_debug_mode(job): +def process_debug_mode(job: Any) -> str: """ Handle debug mode - preprocess debug command, get the output and kill the payload in case of gdb. 
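The debug-command handling around get_debug_command() and process_debug_mode() boils down to an allow-list of single commands (tail, ls, gdb, ps, du). A simplified sketch of that idea, with the helper name and the exact rejection rules invented for illustration:

ALLOWED = {'tail', 'ls', 'gdb', 'ps', 'du'}

def filter_debug_command(cmd: str) -> tuple:
    # illustrative allow-list filter: one plain command, no chaining
    cmd = cmd.strip()
    if ';' in cmd or '&&' in cmd or '|' in cmd:
        return False, ""
    executable = cmd.split(' ', 1)[0]
    if executable == 'debug':        # keyword meaning 'tail the payload stdout'
        return True, cmd
    if executable in ALLOWED:
        return True, cmd
    return False, ""

print(filter_debug_command('tail -n 20 payload.stdout'))   # (True, ...)
print(filter_debug_command('rm -rf /'))                    # (False, '')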
- :param job: job object. - :return: stdout from debug command (string). + :param job: job object (Any) + :return: stdout from debug command (str). """ - # for gdb commands, use the proper gdb version (the system one may be too old) if job.debug_command.startswith('gdb '): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) user.preprocess_debug_command(job) if job.debug_command: @@ -758,43 +824,42 @@ def process_debug_mode(job): return stdout -def get_debug_stdout(job): +def get_debug_stdout(job: Any) -> str: """ Return the requested output from a given debug command. - :param job: job object. - :return: output (string). + :param job: job object (Any) + :return: output (str). """ - if job.debug_command == 'debug': return get_payload_log_tail(job.workdir, job.jobid) - elif 'tail ' in job.debug_command: + if 'tail ' in job.debug_command: return get_requested_log_tail(job.debug_command, job.workdir) - elif 'ls ' in job.debug_command: + if 'ls ' in job.debug_command: return get_ls(job.debug_command, job.workdir) - elif 'ps ' in job.debug_command or 'gdb ' in job.debug_command: + if 'ps ' in job.debug_command or 'gdb ' in job.debug_command: return get_general_command_stdout(job) - else: - # general command, execute and return output - _, stdout, _ = execute(job.debug_command) - logger.info(f'debug_command: {job.debug_command}:\n\n{stdout}\n') - return stdout + + # general command, execute and return output + _, stdout, _ = execute(job.debug_command) + logger.info(f'debug_command: {job.debug_command}:\n\n{stdout}\n') + + return stdout -def get_general_command_stdout(job): +def get_general_command_stdout(job: Any): """ Return the output from the requested debug command. - :param job: job object. - :return: output (string). + :param job: job object (Any) + :return: output (str). """ - stdout = '' # for gdb, we might have to process the debug command (e.g. to identify the proper pid to debug) if 'gdb ' in job.debug_command and '--pid %' in job.debug_command: pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) job.debug_command = user.process_debug_command(job.debug_command, job.jobid) if job.debug_command: @@ -825,19 +890,18 @@ def get_general_command_stdout(job): return stdout -def get_ls(debug_command, workdir): +def get_ls(debug_command: str, workdir: str) -> str: """ Return the requested ls debug command. - :param debug_command: full debug command (string). - :param workdir: job work directory (string). - :return: output (string). + :param debug_command: full debug command (str) + :param workdir: job work directory (str) + :return: output (str). """ - items = debug_command.split(' ') # cmd = items[0] options = ' '.join(items[1:]) - path = options.split(' ')[-1] if ' ' in options else options + path = options.rsplit(' ', maxsplit=1)[-1] if ' ' in options else options if path.startswith('-'): path = '.' finalpath = os.path.join(workdir, path) @@ -849,7 +913,7 @@ def get_ls(debug_command, workdir): return stdout -def get_requested_log_tail(debug_command, workdir): +def get_requested_log_tail(debug_command: str, workdir: str) -> str: """ Return the tail of the requested debug log. 
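get_requested_log_tail() and get_latest_log_tail() both rely on picking the most recently modified file that matches a pattern and tailing it. A small self-contained sketch of that mechanism, using plain glob/os calls instead of the pilot's file utilities and a hypothetical path pattern:

import glob
import os

def latest_file(pattern: str) -> str:
    # newest file matching the pattern, or "" when nothing matches
    matches = glob.glob(pattern)
    return max(matches, key=os.path.getmtime) if matches else ""

def tail_file(path: str, nlines: int = 20) -> str:
    with open(path, 'r', encoding='utf-8', errors='replace') as fh:
        return ''.join(fh.readlines()[-nlines:])

path = latest_file('/tmp/payload_workdir/log.*')   # hypothetical pattern
if path:
    print(tail_file(path))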
@@ -857,11 +921,10 @@ def get_requested_log_tail(debug_command, workdir): tail workdir/tmp.stdout* <- pilot finds the requested log file in the specified relative path tail log.RAWtoALL <- pilot finds the requested log file - :param debug_command: full debug command (string). - :param workdir: job work directory (string). - :return: output (string). + :param debug_command: full debug command (str) + :param workdir: job work directory (str) + :return: output (str). """ - _tail = "" items = debug_command.split(' ') cmd = items[0] @@ -870,7 +933,7 @@ def get_requested_log_tail(debug_command, workdir): logger.debug(f'debug options: {options}') # assume that the path is the last of the options; - path = options.split(' ')[-1] if ' ' in options else options + path = options.rsplit(' ', maxsplit=1)[-1] if ' ' in options else options fullpath = os.path.join(workdir, path) # find all files with the given pattern and pick the latest updated file (if several) @@ -887,21 +950,19 @@ def get_requested_log_tail(debug_command, workdir): return _tail -def get_cpu_consumption_time(cpuconsumptiontime): +def get_cpu_consumption_time(cpuconsumptiontime: int) -> int: """ Get the CPU consumption time. + The function makes sure that the value exists and is within allowed limits (< 10^9). - :param cpuconsumptiontime: CPU consumption time (int/None). - :return: properly set CPU consumption time (int/None). + :param cpuconsumptiontime: CPU consumption time (int) + :return: properly set CPU consumption time (int). """ - - constime = None - try: constime = int(cpuconsumptiontime) except Exception: - constime = None + constime = 0 if constime and constime > 10 ** 9: logger.warning(f"unrealistic cpuconsumptiontime: {constime} (reset to -1)") constime = -1 @@ -909,28 +970,26 @@ def get_cpu_consumption_time(cpuconsumptiontime): return constime -def add_timing_and_extracts(data, job, state, args): +def add_timing_and_extracts(data: dict, job: Any, state: str, args: Any): """ Add timing info and log extracts to data structure for a completed job (finished or failed) to be sent to server. + Note: this function updates the data dictionary. - :param data: data structure (dictionary). - :param job: job object. - :param state: state of the job (string). - :param args: pilot args. 
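add_timing_and_extracts() below packs six timing measurements into the pipe-separated pilotTiming field. A tiny sketch of how such a field can be built and split again; the parsing helper is purely illustrative:

def build_pilot_timing(getjob, stagein, payload, stageout, initial_setup, setup) -> str:
    # six pipe-separated durations in seconds, same order as in the pilot
    return f"{getjob}|{stagein}|{payload}|{stageout}|{initial_setup}|{setup}"

def split_pilot_timing(value: str) -> dict:
    names = ['getjob', 'stagein', 'payload', 'stageout', 'initial_setup', 'setup']
    return dict(zip(names, (int(field) for field in value.split('|'))))

timing = build_pilot_timing(5, 30, 1200, 40, 3, 10)
print(timing)                       # 5|30|1200|40|3|10
print(split_pilot_timing(timing))   # {'getjob': 5, ..., 'setup': 10}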
- :return: + :param data: data structure (dict) + :param job: job object (Any) + :param state: state of the job (str) + :param args: pilot args object (Any) """ - time_getjob, time_stagein, time_payload, time_stageout, time_initial_setup, time_setup, time_log_creation = timing_report(job.jobid, args) - data['pilotTiming'] = "%s|%s|%s|%s|%s|%s" % \ - (time_getjob, time_stagein, time_payload, time_stageout, time_initial_setup, time_setup) + data['pilotTiming'] = f"{time_getjob}|{time_stagein}|{time_payload}|{time_stageout}|{time_initial_setup}|{time_setup}" logger.debug(f'could have reported time_log_creation={time_log_creation} s') # add log extracts (for failed/holding jobs or for jobs with outbound connections) extracts = "" - if state == 'failed' or state == 'holding': + if state in {'failed', 'holding'}: pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.diagnose', globals(), locals(), [pilot_user], 0) extracts = user.get_log_extracts(job, state) if extracts != "": logger.warning(f'\n[begin log extracts]\n{extracts}\n[end log extracts]') @@ -938,19 +997,18 @@ def add_timing_and_extracts(data, job, state, args): data['endTime'] = time.time() -def add_memory_info(data, workdir, name=""): +def add_memory_info(data: dict, workdir: str, name: str = ""): """ - Add memory information (if available) to the data structure that will be sent to the server with job updates + Add memory information (if available) to the data structure that will be sent to the server with job updates. + Note: this function updates the data dictionary. - :param data: data structure (dictionary). - :param workdir: working directory of the job (string). - :param name: name of memory monitor (string). - :return: + :param data: data structure (dict) + :param workdir: working directory of the job (str) + :param name: name of memory monitor (str). """ - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) + utilities = __import__(f'pilot.user.{pilot_user}.utilities', globals(), locals(), [pilot_user], 0) try: utility_node = utilities.get_memory_monitor_info(workdir, name=name) data.update(utility_node) @@ -958,17 +1016,15 @@ def add_memory_info(data, workdir, name=""): logger.info(f'memory information not available: {error}') -def remove_pilot_logs_from_list(list_of_files, jobid): +def remove_pilot_logs_from_list(list_of_files: list, jobid: str) -> list: """ Remove any pilot logs from the list of last updated files. - :param list_of_files: list of last updated files (list). - :param jobid: PanDA job id (string). + :param list_of_files: list of last updated files (list) + :param jobid: PanDA job id (str) :return: list of files (list). 
""" - # note: better to move experiment specific files to user area - # ignore the pilot log files try: to_be_removed = [config.Pilot.pilotlog, config.Pilot.stageinlog, config.Pilot.stageoutlog, @@ -977,7 +1033,7 @@ def remove_pilot_logs_from_list(list_of_files, jobid): config.Container.container_script, config.Container.release_setup, config.Container.stagein_status_dictionary, config.Container.stagein_replica_dictionary, 'eventLoopHeartBeat.txt', 'memory_monitor_output.txt', 'memory_monitor_summary.json_snapshot', - f'curl_updateJob_{jobid}.config'] + f'curl_updateJob_{jobid}.config', config.Pilot.pilot_heartbeat_file] except Exception as error: logger.warning(f'exception caught: {error}') to_be_removed = [] @@ -990,15 +1046,14 @@ def remove_pilot_logs_from_list(list_of_files, jobid): return new_list_of_files -def get_payload_log_tail(workdir, jobid): +def get_payload_log_tail(workdir: str, jobid: str) -> str: """ Return the tail of the payload stdout or its latest updated log file. - :param workdir: job work directory (string). - :param jobid: PanDA job id (string). - :return: tail of stdout (string). + :param workdir: job work directory (str) + :param jobid: PanDA job id (str) + :return: tail of stdout (str). """ - # find the latest updated log file # list_of_files = get_list_of_log_files() # find the latest updated text file @@ -1012,13 +1067,13 @@ def get_payload_log_tail(workdir, jobid): return get_latest_log_tail(list_of_files) -def get_latest_log_tail(files): +def get_latest_log_tail(files: list) -> str: """ Get the tail of the latest updated file from the given file list. - :param files: files (list). + :param files: files (list) + :return: tail (str). """ - stdout_tail = "" try: @@ -1034,16 +1089,16 @@ def get_latest_log_tail(files): return stdout_tail -def validate(queues, traces, args): +def validate(queues: Any, traces: Any, args: Any): """ Perform validation of job. - :param queues: queues object. - :param traces: traces object. - :param args: args object. - :return: - """ + Thread. + :param queues: queues object (Any) + :param traces: traces object (Any) + :param args: args object (Any). 
+ """ while not args.graceful_stop.is_set(): time.sleep(0.5) try: @@ -1062,13 +1117,13 @@ def validate(queues, traces, args): # Define a new parent group os.setpgrp() - job_dir = os.path.join(args.mainworkdir, 'PanDA_Pilot-%s' % job.jobid) + job_dir = os.path.join(args.mainworkdir, f'PanDA_Pilot-{job.jobid}') logger.debug(f'creating job working directory: {job_dir}') try: os.mkdir(job_dir) os.chmod(job_dir, 0o770) job.workdir = job_dir - except (FileExistsError, OSError, PermissionError, FileNotFoundError) as error: + except (FileExistsError, PermissionError, FileNotFoundError) as error: logger.debug(f'cannot create working directory: {error}') traces.pilot['error_code'] = errors.MKDIR job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(traces.pilot['error_code']) @@ -1095,7 +1150,7 @@ def validate(queues, traces, args): # hide any secrets hide_secrets(job) - create_symlink(from_path='../%s' % config.Pilot.pilotlog, to_path=os.path.join(job_dir, config.Pilot.pilotlog)) + create_symlink(from_path=f'../{config.Pilot.pilotlog}', to_path=os.path.join(job_dir, config.Pilot.pilotlog)) # handle proxy in unified dispatch if args.verify_proxy: @@ -1107,7 +1162,7 @@ def validate(queues, traces, args): # pre-cleanup pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - utilities = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) + utilities = __import__(f'pilot.user.{pilot_user}.utilities', globals(), locals(), [pilot_user], 0) try: utilities.precleanup() except Exception as error: @@ -1118,7 +1173,7 @@ def validate(queues, traces, args): store_jobid(job.jobid, args.sourcedir) # make sure that ctypes is available (needed at the end by orphan killer) - verify_ctypes(queues, job) + verify_ctypes() # run the delayed space check now delayed_space_check(queues, traces, args, job) @@ -1135,7 +1190,7 @@ def validate(queues, traces, args): logger.info('[job] validate thread has finished') -def hide_secrets(job): +def hide_secrets(job: Any): """ Hide any user secrets. @@ -1143,10 +1198,8 @@ def hide_secrets(job): and updates the job.pandasecrets string to 'hidden'. The JSON file is removed before the job log is created. The contents of job.pandasecrets is not dumped to the log. - :param job: job object. - :return: + :param job: job object (Any). """ - if job.pandasecrets: try: path = os.path.join(job.workdir, config.Pilot.pandasecrets) @@ -1159,19 +1212,12 @@ def hide_secrets(job): logger.debug('no user secrets for this job') -def verify_ctypes(queues, job): - """ - Verify ctypes and make sure all subprocess are parented. - - :param queues: queues object. - :param job: job object. - :return: - """ - +def verify_ctypes(): + """Verify ctypes and make sure all subprocess are parented.""" try: import ctypes - except (ModuleNotFoundError, ImportError) as error: - diagnostics = 'ctypes python module could not be imported: %s' % error + except ImportError as error: + diagnostics = f'ctypes python module could not be imported: {error}' logger.warning(diagnostics) #job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOCTYPES, msg=diagnostics) #logger.debug('Failed to validate job=%s', job.jobid) @@ -1188,18 +1234,16 @@ def verify_ctypes(queues, job): logger.debug('all child subprocesses will be parented') -def delayed_space_check(queues, traces, args, job): +def delayed_space_check(queues: Any, traces: Any, args: Any, job: Any): """ Run the delayed space check if necessary. - :param queues: queues object. - :param traces: traces object. 
- :param args: args object. - :param job: job object. - :return: + :param queues: queues object (Any) + :param traces: traces object (Any) + :param args: args object (Any) + :param job: job object (Any). """ - - proceed_with_local_space_check = True if (args.harvester_submitmode.lower() == 'push' and args.update_server) else False + proceed_with_local_space_check = args.harvester_submitmode.lower() == 'push' and args.update_server if proceed_with_local_space_check: logger.debug('pilot will now perform delayed space check') exit_code, diagnostics = check_local_space() @@ -1215,13 +1259,12 @@ def delayed_space_check(queues, traces, args, job): put_in_queue(job, queues.validated_jobs) -def create_k8_link(job_dir): +def create_k8_link(job_dir: str): """ Create a soft link to the payload workdir on Kubernetes if SHARED_DIR exists. - :param job_dir: payload workdir (string). + :param job_dir: payload workdir (str). """ - shared_dir = os.environ.get('SHARED_DIR', None) if shared_dir: #create_symlink(from_path=os.path.join(shared_dir, 'payload_workdir'), to_path=job_dir) @@ -1230,15 +1273,13 @@ def create_k8_link(job_dir): logger.debug('will not create symlink in SHARED_DIR') -def store_jobid(jobid, init_dir): +def store_jobid(jobid: int, init_dir: str): """ Store the PanDA job id in a file that can be picked up by the wrapper for other reporting. - :param jobid: job id (int). - :param init_dir: pilot init dir (string). - :return: + :param jobid: job id (int) + :param init_dir: pilot init dir (str). """ - pilot_source_dir = os.environ.get('PANDA_PILOT_SOURCE', '') if pilot_source_dir: path = os.path.join(pilot_source_dir, config.Pilot.jobid_file) @@ -1248,12 +1289,12 @@ def store_jobid(jobid, init_dir): try: mode = 'a' if os.path.exists(path) else 'w' - write_file(path, "%s\n" % str(jobid), mode=mode, mute=False) + write_file(path, f"{jobid}\n", mode=mode, mute=False) except Exception as error: logger.warning(f'exception caught while trying to store job id: {error}') -def create_data_payload(queues, traces, args): +def create_data_payload(queues: Any, traces: Any, args: Any): """ Get a Job object from the "validated_jobs" queue. @@ -1262,12 +1303,10 @@ def create_data_payload(queues, traces, args): the thread also places the Job object in the "payloads" queue (another thread will retrieve it and wait for any stage-in to finish). - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) """ - while not args.graceful_stop.is_set(): time.sleep(0.5) try: @@ -1306,14 +1345,14 @@ def create_data_payload(queues, traces, args): logger.info('[job] create_data_payload thread has finished') -def get_task_id(): +def get_task_id() -> str: """ Return the task id for the current job. + Note: currently the implementation uses an environmental variable to store this number (PanDA_TaskID). - :return: task id (string). Returns empty string in case of error. + :return: task id. Returns empty string in case of error (str) """ - if "PanDA_TaskID" in os.environ: taskid = os.environ["PanDA_TaskID"] else: @@ -1323,18 +1362,18 @@ def get_task_id(): return taskid -def get_job_label(args): +def get_job_label(args: Any) -> str: """ Return a proper job label. 
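create_data_payload() above hands one validated job to both the stage-in and the payload threads by putting it on two queues. A stripped-down sketch of that fan-out, with SimpleNamespace standing in for the real queues container and JobData:

import queue
from types import SimpleNamespace

queues = SimpleNamespace(validated_jobs=queue.Queue(),
                         data_in=queue.Queue(),
                         payloads=queue.Queue())
job = SimpleNamespace(jobid='1234567890')

queues.validated_jobs.put(job)                  # done by validate()

job = queues.validated_jobs.get(timeout=1)      # done by create_data_payload()
queues.data_in.put(job)                         # picked up by stage-in
queues.payloads.put(job)                        # picked up by the payload thread
print(queues.data_in.get().jobid, queues.payloads.get().jobid)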
+ The function returns a job label that corresponds to the actual pilot version, ie if the pilot is a development version (ptest or rc_test2) or production version (managed or user). Example: -i RC -> job_label = rc_test2. NOTE: it should be enough to only use the job label, -j rc_test2 (and not specify -i RC at all). - :param args: pilot args object. - :return: job_label (string). + :param args: pilot args object (Any) + :return: job_label (str). """ - # PQ status status = infosys.queuedata.status @@ -1355,7 +1394,7 @@ return job_label -def get_dispatcher_dictionary(args, taskid=None): +def get_dispatcher_dictionary(args: Any, taskid: str = "") -> dict: """ Return a dictionary with required fields for the dispatcher getJob operation. @@ -1370,11 +1409,10 @@ this maintains the behavior relied on by current users of the countryGroup mechanism -- to NOT allow the resource to be used outside the privileged group under any circumstances. - :param args: arguments (e.g. containing queue name, queuedata dictionary, etc). - :param taskid: task id from message broker, if any (None or string). - :returns: dictionary prepared for the dispatcher getJob operation. + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param taskid: task id from message broker, if any (str) + :returns: dictionary prepared for the dispatcher getJob operation (dict). """ - _diskspace = get_disk_space(infosys.queuedata) _mem, _cpu, _ = collect_workernode_info(os.getcwd()) _nodename = get_node_name() @@ -1431,24 +1469,26 @@ return data -def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_getjob_requests, update_server, submitmode, harvester, verify_proxy, traces): +def proceed_with_getjob(timefloor: int, starttime: int, jobnumber: int, getjob_requests: int, max_getjob_requests: int, + should_update_server: bool, submitmode: str, harvester: bool, verify_proxy: bool, traces: Any) -> bool: """ - Can we proceed with getJob? + Check if we can proceed with getJob. + We may not proceed if we have run out of time (timefloor limit), if the proxy is too short, if disk space is too small or if we have already processed enough jobs. - :param timefloor: timefloor limit (s) (int). - :param starttime: start time of retrieve() (s) (int). - :param jobnumber: number of downloaded jobs (int). - :param getjob_requests: number of getjob requests (int). - :param update_server: should pilot update server? (Boolean). - :param submitmode: Harvester submit mode, PULL or PUSH (string). - :param harvester: True if Harvester is used, False otherwise. Affects the max number of getjob reads (from file) (Boolean). - :param verify_proxy: True if the proxy should be verified. False otherwise (Boolean). - :param traces: traces object (to be able to propagate a proxy error all the way back to the wrapper). - :return: True if pilot should proceed with getJob (Boolean). + :param timefloor: timefloor limit (s) (int) + :param starttime: start time of retrieve() (s) (int) + :param jobnumber: number of downloaded jobs (int) + :param getjob_requests: number of getjob requests (int) + :param max_getjob_requests: max getjob requests (int) + :param should_update_server: should pilot update server? (bool) + :param submitmode: Harvester submit mode, PULL or PUSH (str) + :param harvester: True if Harvester is used, False otherwise.
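proceed_with_getjob() (continued below) decides, among other checks, whether the timefloor still allows fetching another job. A rough sketch of just that timefloor decision, with made-up helper names and the limit expressed in seconds for simplicity:

import time

def may_fetch_more(timefloor: int, starttime: float, jobnumber: int) -> bool:
    # illustrative only: after the first job, keep asking for work while the
    # elapsed time is still below the timefloor (0 disables multi-job mode)
    if jobnumber == 0:
        return True
    if timefloor == 0:
        return False
    return (time.time() - starttime) < timefloor

start = time.time() - 3000
print(may_fetch_more(timefloor=3600, starttime=start, jobnumber=1))   # True
print(may_fetch_more(timefloor=0, starttime=start, jobnumber=1))      # False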
Affects the max number of getjob reads from file (bool) + :param verify_proxy: True if the proxy should be verified. False otherwise (bool) + :param traces: traces object (to be able to propagate a proxy error all the way back to the wrapper) (Any) + :return: True if pilot should proceed with getJob (bool). """ - # use for testing thread exceptions. the exception will be picked up by ExcThread run() and caught in job.control() # raise NoLocalSpace('testing exception from proceed_with_getjob') @@ -1456,19 +1496,19 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge currenttime = time.time() pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - common = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + common = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) if not common.allow_timefloor(submitmode): timefloor = 0 # should the proxy be verified? if verify_proxy: - userproxy = __import__('pilot.user.%s.proxy' % pilot_user, globals(), locals(), [pilot_user], 0) + userproxy = __import__(f'pilot.user.{pilot_user}.proxy', globals(), locals(), [pilot_user], 0) # is the proxy still valid? exit_code, diagnostics = userproxy.verify_proxy(test=False) if traces.pilot['error_code'] == 0: # careful so we don't overwrite another error code traces.pilot['error_code'] = exit_code - if exit_code == errors.NOPROXY or exit_code == errors.NOVOMSPROXY or exit_code == errors.CERTIFICATEHASEXPIRED: + if exit_code in {errors.NOPROXY, errors.NOVOMSPROXY, errors.CERTIFICATEHASEXPIRED}: logger.warning(diagnostics) return False @@ -1476,7 +1516,7 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge # note: do not run this test at this point if submit mode=PUSH and we are in truePilot mode on ARC # (available local space will in this case be checked after the job definition has been read from file, so the # pilot can report the error with a server update) - proceed_with_local_space_check = False if (submitmode.lower() == 'push' and update_server) else True + proceed_with_local_space_check = not (submitmode.lower() == 'push' and should_update_server) if proceed_with_local_space_check: exit_code, diagnostics = check_local_space() if exit_code != 0: @@ -1525,17 +1565,17 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge return True -def get_job_definition_from_file(path, harvester, pod): +def get_job_definition_from_file(path: str, harvester: bool, pod: bool) -> dict: """ Get a job definition from a pre-placed file. + In Harvester mode, also remove any existing job request files since it is no longer needed/wanted. - :param path: path to job definition file - :param harvester: True if Harvester is being used (determined from args.harvester), otherwise False - :param pod: True if pilot is running in a pod, otherwise False - :return: job definition dictionary. + :param path: path to job definition file (str) + :param harvester: True if Harvester is being used (determined from args.harvester), otherwise False (bool) + :param pod: True if pilot is running in a pod, otherwise False (bool) + :return: job definition (dict). 
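For the 'old style' job definition file handled just below, the payload is a URL-encoded key=value string that parse_qsl turns into a dictionary. A tiny sketch with a made-up sample payload:

from urllib.parse import parse_qsl

# made-up sample payload in the URL-encoded format of a pre-placed job definition
raw = 'PandaID=1234567890&jobName=abc&swRelease=Atlas-23.0.1&logFile=abc.job.log.tgz'
job_definition = dict(parse_qsl(raw, keep_blank_values=True))
print(job_definition['PandaID'], job_definition['logFile'])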
""" - # remove any existing Harvester job request files (silent in non-Harvester mode) and read the JSON if harvester or pod: if harvester: @@ -1545,25 +1585,24 @@ def get_job_definition_from_file(path, harvester, pod): if not job_definition_list: logger.warning(f'no jobs were found in Harvester job definitions file: {path}') return {} - else: - # remove the job definition file from the original location, place a renamed copy in the pilot dir - new_path = os.path.join(os.environ.get('PILOT_HOME'), 'job_definition.json') - copy(path, new_path) - remove(path) - # note: the pilot can only handle one job at the time from Harvester - return job_definition_list[0] + # remove the job definition file from the original location, place a renamed copy in the pilot dir + new_path = os.path.join(os.environ.get('PILOT_HOME'), 'job_definition.json') + copy(path, new_path) + remove(path) + + # note: the pilot can only handle one job at the time from Harvester + return job_definition_list[0] # old style res = {} - with open(path, 'r') as jobdatafile: + with open(path, 'r', encoding='utf-8') as jobdatafile: response = jobdatafile.read() if len(response) == 0: logger.fatal(f'encountered empty job definition file: {path}') res = None # this is a fatal error, no point in continuing as the file will not be replaced else: # parse response message - from urllib.parse import parse_qsl datalist = parse_qsl(response, keep_blank_values=True) # convert to dictionary @@ -1576,15 +1615,14 @@ def get_job_definition_from_file(path, harvester, pod): return res -def get_job_definition_from_server(args, taskid=None): +def get_job_definition_from_server(args: Any, taskid: str = "") -> str: """ Get a job definition from a server. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :param taskid: task id from message broker, if any (None or string) - :return: job definition dictionary. + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param taskid: task id from message broker, if any (str) + :return: job definition (dict). """ - res = {} # get the job dispatcher dictionary @@ -1599,18 +1637,17 @@ def get_job_definition_from_server(args, taskid=None): return res -def locate_job_definition(args): +def locate_job_definition(args: Any) -> str: """ Locate the job definition file among standard locations. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: path (string). + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :return: path (str). """ - if args.harvester_datadir: paths = [os.path.join(args.harvester_datadir, config.Pilot.pandajobdata)] else: - paths = [os.path.join("%s/.." % args.sourcedir, config.Pilot.pandajobdata), + paths = [os.path.join(f"{args.sourcedir}/..", config.Pilot.pandajobdata), os.path.join(args.sourcedir, config.Pilot.pandajobdata), os.path.join(os.environ['PILOT_WORK_DIR'], config.Pilot.pandajobdata)] @@ -1631,14 +1668,14 @@ def locate_job_definition(args): return path -def get_job_definition(queues, args): +def get_job_definition(queues: Any, args: Any) -> dict: """ Get a job definition from a source (server or pre-placed local file). - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: job definition dictionary. + :param queues: queues object (Any) + :param args: Pilot arguments object (e.g. 
containing queue name, queuedata dictionary, etc) (Any) + :return: job definition (dict). """ - res = {} path = locate_job_definition(args) @@ -1649,62 +1686,61 @@ def get_job_definition(queues, args): elif os.path.exists(path): logger.info(f'will read job definition from file: {path}') res = get_job_definition_from_file(path, args.harvester, args.pod) + elif args.harvester and args.harvester_submitmode.lower() == 'push': + pass # local job definition file not found (go to sleep) else: - if args.harvester and args.harvester_submitmode.lower() == 'push': - pass # local job definition file not found (go to sleep) - else: - # get the task id from a message broker if requested - taskid = None - abort = False - if args.subscribe_to_msgsvc: - message = None - while not args.graceful_stop.is_set(): - try: # look for graceful stop every ten seconds, otherwise block the queue - message = queues.messages.get(block=True, timeout=10) - except queue.Empty: - continue - else: - break + # get the task id from a message broker if requested + taskid = None + abort = False + if args.subscribe_to_msgsvc: + message = None + while not args.graceful_stop.is_set(): + try: # look for graceful stop every ten seconds, otherwise block the queue + message = queues.messages.get(block=True, timeout=10) + except queue.Empty: + continue + else: + break -# message = get_message_from_mb(args) - if message and message['msg_type'] == 'get_job': - taskid = message['taskid'] - elif message and message['msg_type'] == 'kill_task': - # abort immediately - logger.warning('received instruction to kill task (abort pilot)') - abort = True - elif message and message['msg_type'] == 'finish_task': - # abort gracefully - let job finish, but no job is downloaded so ignore this? - logger.warning('received instruction to finish task (abort pilot)') - abort = True - elif args.graceful_stop.is_set(): - logger.warning('graceful_stop is set, will abort getJob') - abort = True - if taskid: - logger.info(f'will download job definition from server using taskid={taskid}') - else: - logger.info('will download job definition from server') - if abort: - res = None # None will trigger 'fatal' error and will finish the pilot - else: - res = get_job_definition_from_server(args, taskid=taskid) +# message = get_message_from_mb(args) + if message and message['msg_type'] == 'get_job': + taskid = message['taskid'] + elif message and message['msg_type'] == 'kill_task': + # abort immediately + logger.warning('received instruction to kill task (abort pilot)') + abort = True + elif message and message['msg_type'] == 'finish_task': + # abort gracefully - let job finish, but no job is downloaded so ignore this? + logger.warning('received instruction to finish task (abort pilot)') + abort = True + elif args.graceful_stop.is_set(): + logger.warning('graceful_stop is set, will abort getJob') + abort = True + if taskid: + logger.info(f'will download job definition from server using taskid={taskid}') + else: + logger.info('will download job definition from server') + if abort: + res = None # None will trigger 'fatal' error and will finish the pilot + else: + res = get_job_definition_from_server(args, taskid=taskid) return res -def get_message_from_mb(args): +def get_message_from_mb(args: Any) -> dict: """ - Try and get the task id from a message broker. + Get a message from a message broker. + Wait maximum args.lifetime s, then abort. Note that this might also be interrupted by args.graceful_stop (checked for each ten seconds). - :param args: pilot args object. 
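get_message_from_mb() below runs the broker listener in a separate process and polls it with short join() calls so graceful_stop and the allowed lifetime can interrupt the wait. A self-contained sketch of that pattern, with the ActiveMQ subscriber replaced by a dummy listener and shortened timings:

import multiprocessing
import threading
import time

def listener(message_queue):
    # stand-in for the ActiveMQ subscriber; it delivers one fake message
    time.sleep(1)
    message_queue.put({'msg_type': 'get_job', 'taskid': '12345'})

if __name__ == '__main__':
    graceful_stop = threading.Event()
    message_queue = multiprocessing.Queue()
    proc = multiprocessing.Process(target=listener, args=(message_queue,))
    proc.start()

    lifetime, start = 30, time.time()        # seconds, shortened for the sketch
    while not graceful_stop.is_set() and time.time() - start < lifetime:
        proc.join(2)                         # short join, then re-check the flags
        if not proc.is_alive():
            break
    if proc.is_alive():
        proc.terminate()                     # listener ran out of time

    print(message_queue.get(timeout=1))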
- :return: task id (string). + :param args: Pilot arguments object (Any) + :return: message (dict). """ - if args.graceful_stop.is_set(): logger.debug('will not start ActiveMQ since graceful_stop is set') - return None + return {} # do not put this import at the top since it can possibly interfere with some modules (esp. Google Cloud Logging modules) import multiprocessing @@ -1722,8 +1758,8 @@ proc.join(10) # wait for ten seconds, then check graceful_stop and that we are within the allowed running time if proc.is_alive(): continue - else: - break # ie abort 'infinite' loop when the process has finished + + break # ie abort 'infinite' loop when the process has finished if proc.is_alive(): # still running after max time/graceful_stop: kill it @@ -1732,18 +1768,20 @@ try: message = message_queue.get(timeout=1) except Exception: - message = None + message = {} if not message: logger.debug('not returning any messages') return message -def get_message(args, message_queue): +def get_message(args: Any, message_queue: Any): """ + Get a message from ActiveMQ and put it in the given message queue. + :param args: Pilot arguments object (Any) + :param message_queue: message queue (Any). """ - queues = namedtuple('queues', ['mbmessages']) queues.mbmessages = queue.Queue() kwargs = get_kwargs_for_mb(queues, args.url, args.port, args.allow_same_user, args.debug) @@ -1772,11 +1810,17 @@ message_queue.put(message) -def get_kwargs_for_mb(queues, url, port, allow_same_user, debug): +def get_kwargs_for_mb(queues: Any, url: str, port: str, allow_same_user: bool, debug: bool) -> dict: """ + Get the kwargs dictionary for the message broker. + :param queues: queues object (Any) + :param url: PanDA server URL (str) + :param port: PanDA server port (str) + :param allow_same_user: allow the same user or not (bool) + :param debug: True for pilot debug mode, False otherwise (bool) + :return: kwargs dictionary (dict). """ - topic = f'/{"topic" if allow_same_user else "queue"}/panda.pilot' kwargs = { 'broker': config.Message_broker.url, # 'atlas-test-mb.cern.ch', @@ -1795,33 +1839,34 @@ return kwargs -def now(): +def now() -> bytes: """ Return the current epoch as a UTF-8 encoded string. - :return: current time as encoded string + + :return: current time as encoded string (bytes). """ return str(time.time()).encode('utf-8') -def get_fake_job(input=True): +def get_fake_job(inpt: bool = True) -> dict: """ Return a job definition for internal pilot testing. + Note: this function is only used for testing purposes. The job definitions below are ATLAS specific. - :param input: Boolean, set to False if no input files are wanted - :return: job definition (dictionary). + :param inpt: True when there are input files, set to False if no input files are wanted (bool) + :return: job definition (dict).
""" - - res = None + res = {} # create hashes - hash = hashlib.md5() - hash.update(now()) - log_guid = hash.hexdigest() - hash.update(now()) - guid = hash.hexdigest() - hash.update(now()) - job_name = hash.hexdigest() + _hash = hashlib.md5() + _hash.update(now()) + log_guid = _hash.hexdigest() + _hash.update(now()) + guid = _hash.hexdigest() + _hash.update(now()) + job_name = _hash.hexdigest() if config.Pilot.testjobtype == 'production': logger.info('creating fake test production job definition') @@ -1857,12 +1902,12 @@ def get_fake_job(input=True): 'transferType': 'NULL', 'destinationDblock': job_name, 'dispatchDBlockToken': 'NULL', - 'jobPars': '--maxEvents=1 --inputHITSFile HITS.06828093._000096.pool.root.1 --outputRDOFile RDO_%s.root' % job_name, + 'jobPars': f'--maxEvents=1 --inputHITSFile HITS.06828093._000096.pool.root.1 --outputRDOFile RDO_{job_name}.root', 'attemptNr': 0, 'swRelease': 'Atlas-20.1.4', 'nucleus': 'NULL', 'maxCpuCount': 0, - 'outFiles': 'RDO_%s.root,%s.job.log.tgz' % (job_name, job_name), + 'outFiles': f'RDO_{job_name}.root,{job_name}.job.log.tgz', 'currentPriority': 1000, 'scopeIn': 'mc15_13TeV', 'PandaID': '0', @@ -1873,7 +1918,7 @@ def get_fake_job(input=True): 'jobName': job_name, 'ddmEndPointIn': 'UTA_SWT2_DATADISK', 'taskID': 'NULL', - 'logFile': '%s.job.log.tgz' % job_name} + 'logFile': f'{job_name}.job.log.tgz'} elif config.Pilot.testjobtype == 'user': logger.info('creating fake test user job definition') res = {'jobsetID': 'NULL', @@ -1918,7 +1963,7 @@ def get_fake_job(input=True): 'swRelease': 'Atlas-20.7.6', 'nucleus': 'NULL', 'maxCpuCount': '0', - 'outFiles': '%s.root,%s.job.log.tgz' % (job_name, job_name), + 'outFiles': f'{job_name}.root,{job_name}.job.log.tgz', 'currentPriority': '1000', 'scopeIn': 'data15_13TeV', 'PandaID': '0', @@ -1929,12 +1974,12 @@ def get_fake_job(input=True): 'jobName': job_name, 'ddmEndPointIn': 'SWT2_CPB_SCRATCHDISK', 'taskID': 'NULL', - 'logFile': '%s.job.log.tgz' % job_name} + 'logFile': f'{job_name}.job.log.tgz'} else: logger.warning(f'unknown test job type: {config.Pilot.testjobtype}') if res: - if not input: + if not inpt: res['inFiles'] = 'NULL' res['GUID'] = 'NULL' res['scopeIn'] = 'NULL' @@ -1942,7 +1987,7 @@ def get_fake_job(input=True): res['realDatasetsIn'] = 'NULL' res['checksum'] = 'NULL' - if config.Pilot.testtransfertype == "NULL" or config.Pilot.testtransfertype == 'direct': + if config.Pilot.testtransfertype in {'NULL', 'direct'}: res['transferType'] = config.Pilot.testtransfertype else: logger.warning(f'unknown test transfer type: {config.Pilot.testtransfertype} (ignored)') @@ -1956,21 +2001,23 @@ def get_fake_job(input=True): return res -def get_job_retrieval_delay(harvester): +def get_job_retrieval_delay(harvester: bool) -> int: """ Return the proper delay between job retrieval attempts. + In Harvester mode, the pilot will look once per second for a job definition file. - :param harvester: True if Harvester is being used (determined from args.harvester), otherwise False - :return: sleep (s) + :param harvester: True if Harvester is being used (determined from args.harvester), otherwise False (bool) + :return: sleep (s) (int) """ - return 1 if harvester else 60 -def retrieve(queues, traces, args): # noqa: C901 +def retrieve(queues: Any, traces: Any, args: Any): # noqa: C901 """ - Retrieve all jobs from a source. + Retrieve all jobs from the proper source. + + Thread. 
The job definition is a json dictionary that is either present in the launch directory (preplaced) or downloaded from a server specified by `args.url`. @@ -1980,13 +2027,11 @@ def retrieve(queues, traces, args): # noqa: C901 WARNING: this function is nearly too complex. Be careful with adding more lines as flake8 will fail it. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) :raises PilotException: if create_job fails (e.g. because queuedata could not be downloaded). - :return: """ - timefloor = infosys.queuedata.timefloor starttime = time.time() @@ -2018,7 +2063,12 @@ def retrieve(queues, traces, args): # noqa: C901 #res['debug'] = True if res: dump_job_definition(res) - if res is None: + + # only ATLAS wants to abort immediately in this case + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + jobdata = __import__(f'pilot.user.{pilot_user}.jobdata', globals(), locals(), [pilot_user], 0) + fail_at_none = jobdata.fail_at_getjob_none() + if res is None and fail_at_none: logger.fatal('fatal error in job download loop - cannot continue') # do not set graceful stop if pilot has not finished sending the final job update # i.e. wait until SERVER_UPDATE is DONE_FINAL @@ -2041,80 +2091,79 @@ def retrieve(queues, traces, args): # noqa: C901 if args.graceful_stop.is_set(): break time.sleep(1) - else: + elif 'StatusCode' in res and res['StatusCode'] != '0' and res['StatusCode'] != 0: # it seems the PanDA server returns StatusCode as an int, but the aCT returns it as a string # note: StatusCode keyword is not available in job definition files from Harvester (not needed) - if 'StatusCode' in res and res['StatusCode'] != '0' and res['StatusCode'] != 0: - getjob_failures += 1 - if getjob_failures >= args.getjob_failures: - logger.warning(f'did not get a job -- max number of job request failures reached: {getjob_failures}') - args.graceful_stop.set() - break + getjob_failures += 1 + if getjob_failures >= args.getjob_failures: + logger.warning(f'did not get a job -- max number of job request failures reached: {getjob_failures}') + args.graceful_stop.set() + break - logger.warning(f"did not get a job -- sleep 60s and repeat -- status: {res['StatusCode']}") - for i in range(60): - if args.graceful_stop.is_set(): - break - time.sleep(1) - else: - # create the job object out of the raw dispatcher job dictionary - try: - job = create_job(res, args.queue) - except PilotException as error: - raise error - else: - logger.info('resetting any existing errors') - job.reset_errors() - - #else: - # verify the job status on the server - #try: - # job_status, job_attempt_nr, job_status_code = get_job_status_from_server(job.jobid, args.url, args.port) - # if job_status == "running": - # pilot_error_diag = "job %s is already running elsewhere - aborting" % job.jobid - # logger.warning(pilot_error_diag) - # raise JobAlreadyRunning(pilot_error_diag) - #except Exception as error: - # logger.warning(f"{error}") - # write time stamps to pilot timing file - # note: PILOT_POST_GETJOB corresponds to START_TIME in Pilot 1 - add_to_pilot_timing(job.jobid, PILOT_PRE_GETJOB, time_pre_getjob, args) - add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), 
args) - - # for debugging on HTCondor purposes, set special env var - # (only proceed if there is a condor class ad) - if os.environ.get('_CONDOR_JOB_AD', None): - htcondor_envvar(job.jobid) - - # add the job definition to the jobs queue and increase the job counter, - # and wait until the job has finished - put_in_queue(job, queues.jobs) - - jobnumber += 1 - while not args.graceful_stop.is_set(): - if has_job_completed(queues, args): - # make sure there are no lingering defunct subprocesses - kill_defunct_children(job.pid) - - # purge queue(s) that retains job object - set_pilot_state(state='') - purge_queue(queues.finished_data_in) - - args.job_aborted.clear() - args.abort_job.clear() - logger.info('ready for new job') - - # re-establish logging - logging.info('pilot has finished with previous job - re-establishing logging') - logging.handlers = [] - logging.shutdown() - establish_logging(debug=args.debug, nopilotlog=args.nopilotlog) - pilot_version_banner() - getjob_requests = 0 - add_to_pilot_timing('1', PILOT_MULTIJOB_START_TIME, time.time(), args) - args.signal = None - break - time.sleep(0.5) + logger.warning(f"did not get a job -- sleep 60s and repeat -- status: {res['StatusCode']}") + for _ in range(60): + if args.graceful_stop.is_set(): + break + time.sleep(1) + else: + # create the job object out of the raw dispatcher job dictionary + try: + job = create_job(res, queuename=args.queue) + except PilotException as error: + raise error + + logger.info('resetting any existing errors') + job.reset_errors() + + #else: + # verify the job status on the server + #try: + # job_status, job_attempt_nr, job_status_code = get_job_status_from_server(job.jobid, args.url, args.port) + # if job_status == "running": + # pilot_error_diag = "job %s is already running elsewhere - aborting" % job.jobid + # logger.warning(pilot_error_diag) + # raise JobAlreadyRunning(pilot_error_diag) + #except Exception as error: + # logger.warning(f"{error}") + # write time stamps to pilot timing file + # note: PILOT_POST_GETJOB corresponds to START_TIME in Pilot 1 + add_to_pilot_timing(job.jobid, PILOT_PRE_GETJOB, time_pre_getjob, args) + add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args) + + # for debugging on HTCondor purposes, set special env var + # (only proceed if there is a condor class ad) + if os.environ.get('_CONDOR_JOB_AD', None): + htcondor_envvar(job.jobid) + + # add the job definition to the jobs queue and increase the job counter, + # and wait until the job has finished + put_in_queue(job, queues.jobs) + + jobnumber += 1 + while not args.graceful_stop.is_set(): + if has_job_completed(queues, args): + # make sure there are no lingering defunct subprocesses + kill_defunct_children(job.pid) + + # purge queue(s) that retains job object + set_pilot_state(state='') + purge_queue(queues.finished_data_in) + + args.job_aborted.clear() + args.abort_job.clear() + logger.info('ready for new job') + + # re-establish logging + logging.info('pilot has finished with previous job - re-establishing logging') + logging.handlers = [] + logging.shutdown() + establish_logging(debug=args.debug, nopilotlog=args.nopilotlog) + pilot_version_banner() + getjob_requests = 0 + add_to_pilot_timing('1', PILOT_MULTIJOB_START_TIME, time.time(), args) + args.signal = None + break + time.sleep(0.5) # proceed to set the job_aborted flag? 
if threads_aborted(caller='retrieve'): @@ -2124,14 +2173,12 @@ def retrieve(queues, traces, args): # noqa: C901 logger.info('[job] retrieve thread has finished') -def htcondor_envvar(jobid): +def htcondor_envvar(jobid: str): """ On HTCondor nodes, set special env var (HTCondor_PANDA) for debugging Lustre. - :param jobid: PanDA job id (string) - :return: + :param jobid: PanDA job id (str). """ - try: globaljobid = encode_globaljobid(jobid) if globaljobid: @@ -2141,35 +2188,33 @@ def htcondor_envvar(jobid): logger.warning(f'caught exception: {exc}') -def handle_proxy(job): +def handle_proxy(job: Any): """ Handle the proxy in unified dispatch. In unified dispatch, the pilot is started with the production proxy, but in case the job is a user job, the production proxy is too powerful. A user proxy is then downloaded instead. - :param job: job object. - :return: + :param job: job object (Any). """ - if job.is_analysis() and job.infosys.queuedata.type == 'unified' and not job.prodproxy: logger.info('the production proxy will be replaced by a user proxy (to be downloaded)') ec = download_new_proxy(role='user', proxy_type='unified', workdir=job.workdir) if ec: logger.warning(f'failed to download proxy for unified dispatch - will continue with X509_USER_PROXY={os.environ.get("X509_USER_PROXY")}') + if ec == errors.CERTIFICATEHASEXPIRED: + logger.warning('the certificate has expired - cannot fail right now, should be picked up by the job later on') else: logger.debug(f'will not download a new proxy since job.is_analysis()={job.is_analysis()}, ' f'job.infosys.queuedata.type={job.infosys.queuedata.type}, job.prodproxy={job.prodproxy}') -def dump_job_definition(res): +def dump_job_definition(res: dict): """ Dump the job definition to the log, but hide any sensitive information. - :param res: raw job definition (dictionary). - :return: + :param res: raw job definition (dict). """ - if 'secrets' in res: _pandasecrets = res['secrets'] res['secrets'] = '********' @@ -2188,31 +2233,25 @@ def dump_job_definition(res): def print_node_info(): - """ - Print information about the local node to the log. - - :return: - """ - + """Print information about the local node to the log.""" if is_virtual_machine(): logger.info("pilot is running in a virtual machine") else: logger.info("pilot is not running in a virtual machine") -def create_job(dispatcher_response, queue): +def create_job(dispatcher_response: dict, queuename: str) -> Any: """ Create a job object out of the dispatcher response. - :param dispatcher_response: raw job dictionary from the dispatcher. - :param queue: queue name (string). - :return: job object + :param dispatcher_response: raw job dictionary from the dispatcher (dict) + :param queuename: queue name (str) + :return: job object (Any) """ - # initialize (job specific) InfoService instance job = JobData(dispatcher_response) jobinfosys = InfoService() - jobinfosys.init(queue, infosys.confinfo, infosys.extinfo, JobInfoProvider(job)) + jobinfosys.init(queuename, infosys.confinfo, infosys.extinfo, JobInfoProvider(job)) job.init(infosys) logger.info(f'received job: {job.jobid} (sleep until the job has finished)') @@ -2226,15 +2265,16 @@ def create_job(dispatcher_response, queue): return job -def has_job_completed(queues, args): +def has_job_completed(queues: Any, args: Any) -> bool: """ - Has the current job completed (finished or failed)? + Check if the current job has completed (finished or failed). + Note: the job object was extracted from monitored_payloads queue before this function was called. 
- :param queues: Pilot queues object. - :return: True is the payload has finished or failed + :param queues: Pilot queues object (Any) + :param args: Pilot arguments object (Any) + :return: True if the payload has finished or failed, False otherwise (bool). """ - # check if the job has finished try: job = queues.completed_jobs.get(block=True, timeout=1) @@ -2243,7 +2283,7 @@ def has_job_completed(queues, args): pass else: make_job_report(job) - cmd = 'ls -lF %s' % os.environ.get('PILOT_HOME') + cmd = f"ls -lF {os.environ.get('PILOT_HOME')}" logger.debug(f'{cmd}:\n') _, stdout, _ = execute(cmd) logger.debug(stdout) @@ -2285,13 +2325,13 @@ def has_job_completed(queues, args): return False -def get_job_from_queue(queues, state): +def get_job_from_queue(queues: Any, state: str) -> Any: """ Check if the job has finished or failed and if so return it. - :param queues: pilot queues. - :param state: job state (e.g. finished/failed) (string). - :return: job object. + :param queues: Pilot queues object (Any) + :param state: job state (e.g. finished/failed) (str) + :return: job object (Any). """ try: if state == "finished": @@ -2310,39 +2350,36 @@ def get_job_from_queue(queues, state): return job -def is_queue_empty(queues, queue): +def is_queue_empty(queues: Any, queuename: str) -> bool: """ Check if the given queue is empty (without pulling). - :param queues: pilot queues object. - :param queue: queue name (string). - :return: True if queue is empty, False otherwise + :param queues: Pilot queues object (Any) + :param queuename: queue name (str) + :return: True if queue is empty, False otherwise (bool) """ - status = False - if queue in queues._fields: - _queue = getattr(queues, queue) + if queuename in queues._fields: + _queue = getattr(queues, queuename) jobs = list(_queue.queue) if len(jobs) > 0: - logger.info('queue %s not empty: found %d job(s)', queue, len(jobs)) + logger.info(f'queue {queuename} not empty: found {len(jobs)} job(s)') else: - logger.info('queue %s is empty', queue) + logger.info(f'queue {queuename} is empty') status = True else: - logger.warning('queue %s not present in %s', queue, queues._fields) + logger.warning(f'queue {queuename} not present in {queues._fields}') return status -def order_log_transfer(queues, job): +def order_log_transfer(queues: Any, job: Any): """ Order a log transfer for a failed job. - :param queues: pilot queues object. - :param job: job object. - :return:
""" - # add the job object to the data_out queue to have it staged out job.stageout = 'log' # only stage-out log file #set_pilot_state(job=job, state='stageout') @@ -2351,95 +2388,93 @@ def order_log_transfer(queues, job): logger.debug('job added to data_out queue') # wait for the log transfer to finish - n = 0 + counter = 0 nmax = 60 - while n < nmax: + while counter < nmax: # refresh the log_transfer since it might have changed log_transfer = job.get_status('LOG_TRANSFER') - logger.info('waiting for log transfer to finish (#%d/#%d): %s', n + 1, nmax, log_transfer) - if is_queue_empty(queues, 'data_out') and \ - (log_transfer == LOG_TRANSFER_DONE or log_transfer == LOG_TRANSFER_FAILED): # set in data component + logger.info(f'waiting for log transfer to finish (#{counter + 1}/#{nmax}): {log_transfer}') + if is_queue_empty(queues, 'data_out') and log_transfer in {LOG_TRANSFER_DONE, LOG_TRANSFER_FAILED}: logger.info('stage-out of log has completed') break - else: - if log_transfer == LOG_TRANSFER_IN_PROGRESS: # set in data component, job object is singleton - logger.info('log transfer is in progress') - time.sleep(2) - n += 1 - logger.info('proceeding with server update (n=%d)', n) + if log_transfer == LOG_TRANSFER_IN_PROGRESS: # set in data component, job object is singleton + logger.info('log transfer is in progress') + time.sleep(2) + counter += 1 + logger.info('proceeding with server update') -def wait_for_aborted_job_stageout(args, queues, job): + +def wait_for_aborted_job_stageout(args: Any, queues: Any, job: Any): """ Wait for stage-out to finish for aborted job. - :param args: pilot args object. - :param queues: pilot queues object. - :param job: job object. - :return: + :param args: Pilot arguments object (Any) + :param queues: Pilot queues object (Any) + :param job: job object (Any). """ - # if the pilot received a kill signal, how much time has passed since the signal was intercepted? try: time_since_kill = get_time_since('1', PILOT_KILL_SIGNAL, args) was_killed = was_pilot_killed(args.timing) if was_killed: - logger.info('%d s passed since kill signal was intercepted - make sure that stage-out has finished', time_since_kill) + logger.info(f'{time_since_kill} s passed since kill signal was intercepted - make sure that stage-out has finished') except Exception as error: logger.warning('exception caught: %s', error) time_since_kill = 60 - else: - if time_since_kill > 60 or time_since_kill < 0: # fail-safe - logger.warning('reset time_since_kill to 60 since value is out of allowed limits') - time_since_kill = 60 + + if time_since_kill > 60 or time_since_kill < 0: # fail-safe + logger.warning('reset time_since_kill to 60 since value is out of allowed limits') + time_since_kill = 60 # if stage-out has not finished, we need to wait (less than two minutes or the batch system will issue # a hard SIGKILL) max_wait_time = 2 * 60 - time_since_kill - 5 - logger.debug('using max_wait_time = %d s', max_wait_time) + logger.debug(f'using max_wait_time = {max_wait_time} s') t0 = time.time() while time.time() - t0 < max_wait_time: if job in queues.finished_data_out.queue or job in queues.failed_data_out.queue: logger.info('stage-out has finished, proceed with final server update') break - else: - time.sleep(0.5) + + time.sleep(0.5) logger.info('proceeding with final server update') -def get_job_status(job, key): +def get_job_status(job: Any, key: str) -> str: """ + Return the job status corresponding to the given key. + Wrapper function around job.get_status(). 
If key = 'LOG_TRANSFER' but job object is not defined, the function will return value = LOG_TRANSFER_NOT_DONE. - :param job: job object. - :param key: key name (string). - :return: value (string). + :param job: job object (Any) + :param key: key name (str) + :return: value (str). """ - value = "" if job: value = job.get_status(key) - else: - if key == 'LOG_TRANSFER': - value = LOG_TRANSFER_NOT_DONE + elif key == 'LOG_TRANSFER': + value = LOG_TRANSFER_NOT_DONE return value -def queue_monitor(queues, traces, args): # noqa: C901 +def queue_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 """ - Monitoring of queues. + Monitor queue activity. + + Thread. + This function monitors queue activity, specifically if a job has finished or failed and then reports to the server. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). """ - # scan queues until at least one queue has a job object. abort if it takes too long time if not scan_for_jobs(queues): logger.warning('queues are still empty of jobs - will begin queue monitoring anyway') @@ -2473,7 +2508,10 @@ def queue_monitor(queues, traces, args): # noqa: C901 if state != 'stage-out': # logger.info("no need to wait since job state=\'%s\'", state) break - pause_queue_monitor(1) if not abort_thread else pause_queue_monitor(10) + if not abort_thread: + pause_queue_monitor(1) + else: + pause_queue_monitor(10) # job has not been defined if it's still running if not job and not abort_thread: @@ -2515,22 +2553,20 @@ def queue_monitor(queues, traces, args): # noqa: C901 logger.info('[job] queue monitor thread has finished') -def update_server(job, args): +def update_server(job: Any, args: Any) -> None: """ Update the server (wrapper for send_state() that also prepares the metadata). - :param job: job object. - :param args: pilot args object. - :return: + :param job: job object (Any) + :param args: Pilot arguments object (Any). """ - if job.completed: logger.warning('job has already completed - cannot send another final update') return # user specific actions pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) metadata = user.get_metadata(job.workdir) try: user.update_server(job) @@ -2542,28 +2578,28 @@ def update_server(job, args): send_state(job, args, job.state, metadata=metadata) -def pause_queue_monitor(delay): +def pause_queue_monitor(delay: int): """ Pause the queue monitor to let log transfer complete. + Note: this function should use globally available object. Use sleep for now. + :param delay: sleep time in seconds (int). - :return: """ - - logger.warning('since job:queue_monitor is responsible for sending job updates, we sleep for %d s', delay) + logger.warning(f'since job:queue_monitor is responsible for sending job updates, we sleep for {delay} s') time.sleep(delay) -def get_finished_or_failed_job(args, queues): +def get_finished_or_failed_job(args: Any, queues: Any) -> Any: """ Check if the job has either finished or failed and if so return it. 
+ If failed, order a log transfer. If the job is in state 'failed' and abort_job is set, set job_aborted. - :param args: pilot args object. - :param queues: pilot queues object. - :return: job object. + :param args: Pilot arguments object (Any) + :param queues: Pilot queues object (Any) + :return: job object (Any). """ - job = get_job_from_queue(queues, "finished") if job: # logger.debug('get_finished_or_failed_job: job has finished') @@ -2595,16 +2631,16 @@ def get_finished_or_failed_job(args, queues): return job -def get_heartbeat_period(debug=False): +def get_heartbeat_period(debug: bool = False) -> int: """ Return the proper heartbeat period, as determined by normal or debug mode. + In normal mode, the heartbeat period is 30*60 s, while in debug mode it is 5*60 s. Both values are defined in the config file. - :param debug: Boolean, True for debug mode. False otherwise. + :param debug: Boolean, True for debug mode. False otherwise (bool) :return: heartbeat period (int). """ - try: return int(config.Pilot.heartbeat if not debug else config.Pilot.debug_heartbeat) except Exception as error: @@ -2612,13 +2648,13 @@ def get_heartbeat_period(debug=False): return 1800 -def check_for_abort_job(args, caller=''): +def check_for_abort_job(args: Any, caller: str = '') -> bool: """ Check if args.abort_job.is_set(). - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :param caller: function name of caller (string). - :return: Boolean, True if args_job.is_set() + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param caller: function name of caller (str) + :return: True if args.abort_job.is_set(), False otherwise (bool). """ abort_job = False if args.abort_job.is_set(): @@ -2628,60 +2664,17 @@ def check_for_abort_job(args, caller=''): return abort_job -def interceptor(queues, traces, args): - """ - MOVE THIS TO INTERCEPTOR.PY; TEMPLATE FOR THREADS - - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: - """ - - # overall loop counter (ignoring the fact that more than one job may be running) - n = 0 - while not args.graceful_stop.is_set(): - time.sleep(0.1) - - # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) - # (abort at the end of the loop) - abort = should_abort(args, label='job:interceptor') - - # check for any abort_job requests - abort_job = check_for_abort_job(args, caller='interceptor') - if not abort_job: - # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function - jobs = queues.monitored_payloads.queue - if jobs: - for _ in range(len(jobs)): - logger.info('interceptor loop %d: looking for communication file', n) - time.sleep(30) - - n += 1 - - if abort or abort_job: - break - - # proceed to set the job_aborted flag? - if threads_aborted(caller='interceptor'): - logger.debug('will proceed to set job_aborted') - args.job_aborted.set() - - logger.info('[job] interceptor thread has finished') - - -def fast_monitor_tasks(job): +def fast_monitor_tasks(job: Any) -> int: """ Perform user specific fast monitoring tasks. - :param job: job object. + :param job: job object (Any) :return: exit code (int).
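As an illustration of the heartbeat period selection above (30*60 s in normal mode, 5*60 s in debug mode, both read from the config file), here is a minimal standalone sketch; the module-level constants and the print call are stand-ins for config.Pilot.heartbeat/debug_heartbeat and the actual server update, not pilot code:

    import time

    HEARTBEAT = 30 * 60        # stand-in for config.Pilot.heartbeat
    DEBUG_HEARTBEAT = 5 * 60   # stand-in for config.Pilot.debug_heartbeat

    def heartbeat_period(debug: bool = False) -> int:
        """Return the heartbeat period in seconds (30 minute fallback on any error)."""
        try:
            return int(DEBUG_HEARTBEAT if debug else HEARTBEAT)
        except (TypeError, ValueError):
            return 1800

    def maybe_send_heartbeat(last_update: float, debug: bool = False) -> float:
        """Send a heartbeat only if the period has elapsed; return the new update time."""
        if int(time.time()) - last_update >= heartbeat_period(debug):
            print('sending heartbeat')  # stand-in for send_state(job, args, 'running')
            return time.time()
        return last_update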
""" - exit_code = 0 pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.monitoring' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.monitoring', globals(), locals(), [pilot_user], 0) try: exit_code = user.fast_monitor_tasks(job) except Exception as exc: @@ -2690,11 +2683,16 @@ def fast_monitor_tasks(job): return exit_code -def message_listener(queues, traces, args): +def message_listener(queues: Any, traces: Any, args: Any): """ + Listen for messages from ActiveMQ. - """ + Thread. + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + """ while not args.graceful_stop.is_set() and args.subscribe_to_msgsvc: # listen for a message and add it to the messages queue @@ -2709,7 +2707,7 @@ def message_listener(queues, traces, args): args.graceful_stop.set() # kill running job? break - elif message and message['msg_type'] == 'get_job': + if message and message['msg_type'] == 'get_job': put_in_queue(message, queues.messages) # will only be put in the queue if not there already continue # wait for the next message @@ -2737,18 +2735,18 @@ def message_listener(queues, traces, args): logger.info('[job] message listener thread has finished') -def fast_job_monitor(queues, traces, args): +def fast_job_monitor(queues: Any, traces: Any, args: Any) -> None: """ Fast monitoring of job parameters. + Thread. + This function can be used for monitoring processes below the one minute threshold of the normal job_monitor thread. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. 
containing queue name, queuedata dictionary, etc) (Any) """ - # peeking and current time; peeking_time gets updated if and when jobs are being monitored, update_time is only # used for sending the heartbeat and is updated after a server update #peeking_time = int(time.time()) @@ -2780,20 +2778,20 @@ def fast_job_monitor(queues, traces, args): abort_job = check_for_abort_job(args, caller='fast job monitor') if abort_job: break - else: - # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function - jobs = queues.monitored_payloads.queue - if jobs: - for i in range(len(jobs)): - #current_id = jobs[i].jobid - if jobs[i].state == 'finished' or jobs[i].state == 'failed': - logger.info('will abort fast job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) - break - # perform the monitoring tasks - exit_code = fast_monitor_tasks(jobs[i]) - if exit_code: - logger.debug('fast monitoring reported an error: %d', exit_code) + # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function + jobs = queues.monitored_payloads.queue + if jobs: + for i in range(len(jobs)): + #current_id = jobs[i].jobid + if jobs[i].state in {'finished', 'failed'}: + logger.info('will abort fast job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) + break + + # perform the monitoring tasks + exit_code = fast_monitor_tasks(jobs[i]) + if exit_code: + logger.debug(f'fast monitoring reported an error: {exit_code}') # proceed to set the job_aborted flag? if threads_aborted(caller='fast_job_monitor'): @@ -2803,20 +2801,21 @@ def fast_job_monitor(queues, traces, args): logger.info('[job] fast job monitor thread has finished') -def job_monitor(queues, traces, args): # noqa: C901 +def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 """ - Monitoring of job parameters. + Monitor job parameters. + + Thread. + This function monitors certain job parameters, such as job looping, at various time intervals. The main loop is executed once a minute, while individual verifications may be executed at any time interval (>= 1 minute). E.g. looping jobs are checked once every ten minutes (default) and the heartbeat is sent once every 30 minutes. Memory usage is checked once a minute. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) """ - # initialize the monitoring time object mt = MonitoringTime() @@ -2856,7 +2855,10 @@ def job_monitor(queues, traces, args): # noqa: C901 for i in range(len(jobs)): # send heartbeat if it is time (note that the heartbeat function might update the job object, e.g. 
# by turning on debug mode, ie we need to get the heartbeat period in case it has changed) - update_time = send_heartbeat_if_time(jobs[i], args, update_time) + try: + update_time = send_heartbeat_if_time(jobs[i], args, update_time) + except Exception as exc: + logger.warning(f'exception caught during send_heartbeat_if_time: {exc}') # note: when sending a state change to the server, the server might respond with 'tobekilled' try: @@ -2912,8 +2914,8 @@ def job_monitor(queues, traces, args): # noqa: C901 logger.debug('killing payload processes') kill_processes(jobs[i].pid) - logger.info('monitor loop #%d: job %d:%s is in state \'%s\'', n, i, current_id, jobs[i].state) - if jobs[i].state == 'finished' or jobs[i].state == 'failed': + logger.info(f"monitor loop #{n}: job {i}:{current_id} is in state \'{jobs[i].state}\'") + if jobs[i].state in {'finished', 'failed'}: logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) if args.workflow == 'stager': # abort interactive stager pilot, this will trigger an abort of all threads set_pilot_state(job=jobs[i], state="finished") @@ -2934,12 +2936,12 @@ def job_monitor(queues, traces, args): # noqa: C901 # attempt to download a new proxy since it is about to expire ec = download_new_proxy(role='production') exit_code = ec if ec != 0 else 0 # reset the exit_code if success - if exit_code == errors.KILLPAYLOAD or exit_code == errors.NOVOMSPROXY or exit_code == errors.CERTIFICATEHASEXPIRED: + if exit_code in {errors.KILLPAYLOAD, errors.NOVOMSPROXY, errors.CERTIFICATEHASEXPIRED}: jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(exit_code) logger.debug('killing payload process') kill_process(jobs[i].pid) break - elif exit_code == errors.LEASETIME: # stager mode, order log stage-out + if exit_code == errors.LEASETIME: # stager mode, order log stage-out set_pilot_state(job=jobs[i], state="finished") logger.info('ordering log transfer') jobs[i].stageout = 'log' # only stage-out log file @@ -2989,10 +2991,6 @@ def job_monitor(queues, traces, args): # noqa: C901 elif os.environ.get('PILOT_JOB_STATE') == 'stagein': logger.info('job monitoring is waiting for stage-in to finish') - #else: - # # check the waiting time in the job monitor. set global graceful_stop if necessary - # if args.workflow != 'stager': - # check_job_monitor_waiting_time(args, peeking_time, abort_override=abort_job) n += 1 @@ -3014,15 +3012,14 @@ def job_monitor(queues, traces, args): # noqa: C901 logger.info('[job] job monitor thread has finished') -def preliminary_server_update(job, args, diagnostics): +def preliminary_server_update(job: Any, args: Any, diagnostics: str): """ Send a quick job update to the server (do not send any error code yet) for a failed job. - :param job: job object - :param args: args object - :param diagnostics: error diagnostics (string). + :param job: job object (Any) + :param args: Pilot arguments object (Any) + :param diagnostics: error diagnostics (str). """ - logger.warning(f'will send preliminary diagnostics (and pretend job is still running)={diagnostics}') piloterrorcode = job.piloterrorcode piloterrorcodes = job.piloterrorcodes @@ -3036,130 +3033,101 @@ def preliminary_server_update(job, args, diagnostics): job.piloterrordiags = piloterrordiags -def get_signal_error(sig): +def get_signal_error(sig: Any) -> int: """ Return a corresponding pilot error code for the given signal. - :param sig: signal. + :param sig: signal (Any) :return: pilot error code (int). 
""" + try: + _sig = str(sig) # e.g. 'SIGTERM' + except ValueError: + ret = errors.KILLSIGNAL + else: + codes = {'SIGBUS': errors.SIGBUS, + 'SIGQUIT': errors.SIGQUIT, + 'SIGSEGV': errors.SIGSEGV, + 'SIGTERM': errors.SIGTERM, + 'SIGXCPU': errors.SIGXCPU, + 'SIGUSR1': errors.SIGUSR1, + 'USERKILL': errors.USERKILL} + ret = codes.get(_sig) if _sig in codes else errors.KILLSIGNAL - _sig = str(sig) # e.g. 'SIGTERM' - codes = {'SIGBUS': errors.SIGBUS, - 'SIGQUIT': errors.SIGQUIT, - 'SIGSEGV': errors.SIGSEGV, - 'SIGTERM': errors.SIGTERM, - 'SIGXCPU': errors.SIGXCPU, - 'SIGUSR1': errors.SIGUSR1, - 'USERKILL': errors.USERKILL} - ret = codes.get(_sig) if _sig in codes else errors.KILLSIGNAL return ret -def download_new_proxy(role='production', proxy_type='', workdir=''): +def download_new_proxy(role: str = 'production', proxy_type: str = '', workdir: str = '') -> int: """ - The production proxy has expired, try to download a new one. + Download a new production proxy, since it has expired. If it fails to download and verify a new proxy, return the NOVOMSPROXY error. - :param role: role, 'production' or 'user' (string). - :param proxy_type: proxy type, e.g. unified (string). - :param workdir: payload work directory (string). + :param role: role, 'production' or 'user' (str) + :param proxy_type: proxy type, e.g. unified (str) + :param workdir: payload work directory (str) :return: exit code (int). """ - exit_code = 0 x509 = os.environ.get('X509_USER_PROXY', '') logger.info(f'attempt to download a new proxy (proxy_type={proxy_type})') pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.proxy' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.proxy', globals(), locals(), [pilot_user], 0) voms_role = user.get_voms_role(role=role) - ec, diagnostics, new_x509 = user.get_and_verify_proxy(x509, voms_role=voms_role, proxy_type=proxy_type, workdir=workdir) + ec, _, new_x509 = user.get_and_verify_proxy(x509, voms_role=voms_role, proxy_type=proxy_type, workdir=workdir) if ec != 0: # do not return non-zero exit code if only download fails logger.warning('failed to download/verify new proxy') - exit_code == errors.NOVOMSPROXY + exit_code = errors.CERTIFICATEHASEXPIRED if ec == errors.CERTIFICATEHASEXPIRED else errors.NOVOMSPROXY + elif new_x509 and new_x509 != x509 and 'unified' in new_x509 and os.path.exists(new_x509): + os.environ['X509_UNIFIED_DISPATCH'] = new_x509 + logger.debug(f'set X509_UNIFIED_DISPATCH to {new_x509}') + # already dumped right after proxy download: + #cmd = f'export X509_USER_PROXY={os.environ.get("X509_UNIFIED_DISPATCH")};echo $X509_USER_PROXY; voms-proxy-info -all' + #_, stdout, _ = execute(cmd) + #logger.debug(f'cmd={cmd}:\n{stdout}') else: - if new_x509 and new_x509 != x509 and 'unified' in new_x509 and os.path.exists(new_x509): - os.environ['X509_UNIFIED_DISPATCH'] = new_x509 - logger.debug(f'set X509_UNIFIED_DISPATCH to {new_x509}') - # already dumped right after proxy download: - #cmd = f'export X509_USER_PROXY={os.environ.get("X509_UNIFIED_DISPATCH")};echo $X509_USER_PROXY; voms-proxy-info -all' - #_, stdout, _ = execute(cmd) - #logger.debug(f'cmd={cmd}:\n{stdout}') - else: - logger.debug(f'will not set X509_UNIFIED_DISPATCH since new_x509={new_x509}, x509={x509}, os.path.exists(new_x509)={os.path.exists(new_x509)}') + logger.debug(f'will not set X509_UNIFIED_DISPATCH since new_x509={new_x509}, x509={x509}, os.path.exists(new_x509)={os.path.exists(new_x509)}') return exit_code -def 
send_heartbeat_if_time(job, args, update_time): +def send_heartbeat_if_time(job: Any, args: Any, update_time: float) -> int: """ Send a heartbeat to the server if it is time to do so. - :param job: job object. - :param args: args object. - :param update_time: last update time (from time.time()). - :return: possibly updated update_time (from time.time()). + :param job: job object (Any) + :param args: Pilot arguments object (Any) + :param update_time: last update time (from time.time()) (float) + :return: possibly updated update_time, converted to int (from time.time()) (int). """ - if job.completed: logger.info('job already completed - will not send any further updates') - return update_time - - if int(time.time()) - update_time >= get_heartbeat_period(job.debug and job.debug_command): + elif int(time.time()) - update_time >= get_heartbeat_period(job.debug and job.debug_command): # check for state==running here, and send explicit 'running' in send_state, rather than sending job.state # since the job state can actually change in the meantime by another thread # job.completed will anyway be checked in https::send_update() if job.serverstate != 'finished' and job.serverstate != 'failed' and job.state == 'running': logger.info('will send heartbeat for job in \'running\' state') send_state(job, args, 'running') - update_time = int(time.time()) - - return update_time - - -def check_job_monitor_waiting_time(args, peeking_time, abort_override=False): - """ - Check the waiting time in the job monitor. - Set global graceful_stop if necessary. - - :param args: args object. - :param peeking_time: time when monitored_payloads queue was peeked into (int). - :return: - """ - - waiting_time = int(time.time()) - peeking_time - msg = 'no jobs in monitored_payloads queue (waited for %d s)' % waiting_time - if waiting_time > 60 * 60: - msg += ' - aborting' - # abort = True - #else: - # abort = False - if logger: - logger.warning(msg) + update_time = time.time() else: - print(msg) - #if abort or abort_override: - # # do not set graceful stop if pilot has not finished sending the final job update - # # i.e. wait until SERVER_UPDATE is DONE_FINAL - # check_for_final_server_update(args.update_server) - # args.graceful_stop.set() + logger.info('will not send any job update') + + return int(update_time) -def fail_monitored_job(job, exit_code, diagnostics, queues, traces): +def fail_monitored_job(job: Any, exit_code: int, diagnostics: str, queues: Any, traces: Any): """ Fail a monitored job. - :param job: job object - :param exit_code: exit code from job_monitor_tasks (int). - :param diagnostics: pilot error diagnostics (string). - :param queues: queues object. - :param traces: traces object. - :return: + :param job: job object (Any) + :param exit_code: exit code from job_monitor_tasks (int) + :param diagnostics: pilot error diagnostics (str) + :param queues: queues object (Any) + :param traces: traces object (Any). """ - set_pilot_state(job=job, state="failed") job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=diagnostics) job.piloterrordiag = diagnostics @@ -3168,49 +3136,48 @@ def fail_monitored_job(job, exit_code, diagnostics, queues, traces): logger.info('aborting job monitoring since job state=%s', job.state) -def make_job_report(job): +def make_job_report(job: Any): """ Make a summary report for the given job. + This function is called when the job has completed. - :param job: job object. - :return: + :param job: job object (Any). 
""" - logger.info('') logger.info('job summary report') logger.info('--------------------------------------------------') - logger.info('PanDA job id: %s', job.jobid) - logger.info('task id: %s', job.taskid) + logger.info(f'PanDA job id: {job.jobid}') + logger.info(f'task id: {job.taskid}') n = len(job.piloterrorcodes) if n > 0: for i in range(n): - logger.info('error %d/%d: %s: %s', i + 1, n, job.piloterrorcodes[i], job.piloterrordiags[i]) + logger.info(f'error {i + 1}/{n}: {job.piloterrorcodes[i]}: {job.piloterrordiags[i]}') else: logger.info('errors: (none)') if job.piloterrorcode != 0: - logger.info('pilot error code: %d', job.piloterrorcode) - logger.info('pilot error diag: %s', job.piloterrordiag) + logger.info(f'pilot error code: {job.piloterrorcode}') + logger.info(f'pilot error diag: {job.piloterrordiag}') info = "" for key in job.status: info += key + " = " + job.status[key] + " " - logger.info('status: %s', info) + logger.info(f'status: {info}') s = "" if job.is_analysis() and job.state != 'finished': s = '(user job is recoverable)' if errors.is_recoverable(code=job.piloterrorcode) else '(user job is not recoverable)' - logger.info('pilot state: %s %s', job.state, s) - logger.info('transexitcode: %d', job.transexitcode) - logger.info('exeerrorcode: %d', job.exeerrorcode) - logger.info('exeerrordiag: %s', job.exeerrordiag) - logger.info('exitcode: %d', job.exitcode) - logger.info('exitmsg: %s', job.exitmsg) - logger.info('cpuconsumptiontime: %d %s', job.cpuconsumptiontime, job.cpuconsumptionunit) - logger.info('nevents: %d', job.nevents) - logger.info('neventsw: %d', job.neventsw) - logger.info('pid: %s', job.pid) - logger.info('pgrp: %s', str(job.pgrp)) - logger.info('corecount: %d', job.corecount) - logger.info('event service: %s', str(job.is_eventservice)) - logger.info('sizes: %s', str(job.sizes)) + logger.info(f'pilot state: {job.state} {s}') + logger.info(f'transexitcode: {job.transexitcode}') + logger.info(f'exeerrorcode: {job.exeerrorcode}') + logger.info(f'exeerrordiag: {job.exeerrordiag}') + logger.info(f'exitcode: {job.exitcode}') + logger.info(f'exitmsg: {job.exitmsg}') + logger.info(f'cpuconsumptiontime: {job.cpuconsumptiontime} {job.cpuconsumptionunit}') + logger.info(f'nevents: {job.nevents}') + logger.info(f'neventsw: {job.neventsw}') + logger.info(f'pid: {job.pid}') + logger.info(f'pgrp: {job.pgrp}') + logger.info(f'corecount: {job.corecount}') + logger.info(f'event service: {job.is_eventservice}') + logger.info(f'sizes: {job.sizes}') logger.info('--------------------------------------------------') logger.info('') diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index de33c0df..4f4c89bb 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -23,12 +23,15 @@ # NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is # a task for the job_monitor thread in the Job component. 
+"""Functions for monitoring of threads.""" + import logging import threading import time import re from os import environ, getpid, getuid from subprocess import Popen, PIPE +from typing import Any from pilot.common.exception import PilotException, ExceededMaxWaitTime from pilot.util.auxiliary import check_for_final_server_update, set_pilot_state @@ -37,24 +40,23 @@ from pilot.util.constants import MAX_KILL_WAIT_TIME # from pilot.util.container import execute from pilot.util.features import MachineFeatures +from pilot.util.heartbeat import update_pilot_heartbeat from pilot.util.queuehandling import get_queuedata_from_job, get_maxwalltime_from_job, abort_jobs_in_queues from pilot.util.timing import get_time_since_start logger = logging.getLogger(__name__) -# Monitoring of threads functions - -def control(queues, traces, args): # noqa: C901 +def control(queues: Any, traces: Any, args: Any): # noqa: C901 """ + Monitor threads. + Main control function, run from the relevant workflow module. - :param queues: - :param traces: - :param args: - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any) """ - t_0 = time.time() traces.pilot['lifetime_start'] = t_0 # ie referring to when pilot monitoring began traces.pilot['lifetime_max'] = t_0 @@ -157,14 +159,13 @@ def control(queues, traces, args): # noqa: C901 logger.info('[monitor] control thread has ended') -def run_shutdowntime_minute_check(time_since_start): +def run_shutdowntime_minute_check(time_since_start: int) -> bool: """ Run checks on machine features shutdowntime once a minute. - :param time_since_start: how many seconds have lapsed since the pilot started (int). - :return: True if reached max time, False it not (or if shutdowntime not known) (Boolean). + :param time_since_start: how many seconds have lapsed since the pilot started (int) + :return: True if reached max time, False otherwise (also if shutdowntime not known) (bool). """ - # check machine features if present for shutdowntime machinefeatures = MachineFeatures().get() if machinefeatures: @@ -205,15 +206,15 @@ def run_shutdowntime_minute_check(time_since_start): return False -def reached_maxtime_abort(args): +def reached_maxtime_abort(args: Any): """ - Max time has been reached, set REACHED_MAXTIME and graceful_stop, close any ActiveMQ connections. + Set REACHED_MAXTIME and graceful_stop, since max time has been reached. + + Also close any ActiveMQ connections Wait for final server update before setting graceful_stop. - :param args: pilot args. - :return: + :param args: Pilot arguments object (Any). """ - logger.info('setting REACHED_MAXTIME and graceful stop') environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME' # TODO: use singleton instead if args.amq: @@ -237,9 +238,10 @@ def reached_maxtime_abort(args): # logger.info('lifetime: %i used, %i maximum', int(time.time() - traces.pilot['lifetime_start']), traces.pilot['lifetime_max']) -def get_process_info(cmd, user=None, args='aufx', pid=None): +def get_process_info(cmd: str, user: str = "", args: str = 'aufx', pid: int = 0) -> list: """ Return process info for given command. + The function returns a list with format [cpu, mem, command, number of commands] as returned by 'ps -u user args' for a given command (e.g. python3 pilot3/pilot.py). 
@@ -253,13 +255,12 @@ def get_process_info(cmd, user=None, args='aufx', pid=None): -> ['0.0', '0.0', 'sshd: nilspal@pts/28', 1] - :param cmd: command (string). - :param user: user (string). - :param args: ps arguments (string). - :param pid: process id (int). - :return: list with process info (l[0]=cpu usage(%), l[1]=mem usage(%), l[2]=command(string)). + :param cmd: command (str) + :param user: user (str) + :param args: ps arguments (str) + :param pid: process id (int) + :return: list with process info (l[0]=cpu usage(%), l[1]=mem usage(%), l[2]=command(string)) (list). """ - processes = [] num = 0 if not user: @@ -286,15 +287,28 @@ def get_process_info(cmd, user=None, args='aufx', pid=None): return processes -def run_checks(queues, args): +def get_proper_pilot_heartbeat() -> int: """ - Perform non-job related monitoring checks. + Return the proper pilot heartbeat time limit from config. - :param queues: - :param args: - :return: + :return: pilot heartbeat time limit (int). """ + try: + return int(config.Pilot.pilot_heartbeat) + except Exception as exc: + logger.warning(f'detected outdated config file: please update default.cfg: {exc}') + return 60 + + +def run_checks(queues: Any, args: Any) -> None: + """ + Perform non-job related monitoring checks. + + :param queues: queues object (Any) + :param args: Pilot arguments object (Any) + :raises: ExceededMaxWaitTime. + """ # check how long time has passed since last successful heartbeat if is_pilot_check(check='last_heartbeat'): last_heartbeat = time.time() - args.last_heartbeat @@ -307,6 +321,23 @@ def run_checks(queues, args): args.abort_job.clear() raise ExceededMaxWaitTime(diagnostics) + # note: active update rather than a check (every ten minutes) + if is_pilot_check(check='pilot_heartbeat'): + last_heartbeat = time.time() - args.pilot_heartbeat + _pilot_heartbeat = get_proper_pilot_heartbeat() + + if last_heartbeat > _pilot_heartbeat: + detected_job_suspension = True if last_heartbeat > 10 * 60 else False + if detected_job_suspension: + logger.warning(f'detected job suspension (last heartbeat was updated more than 10 minutes ago: {last_heartbeat} s)') + else: + logger.debug(f'pilot heartbeat file was last updated {last_heartbeat} s ago (time to update)') + + # if the pilot heartbeat file can be updated, update the args object + _time = time.time() + if update_pilot_heartbeat(_time, detected_job_suspension=detected_job_suspension, time_since_detection=last_heartbeat): + args.pilot_heartbeat = _time + if args.graceful_stop.is_set(): # find all running jobs and stop them, find all jobs in queues relevant to this module abort_jobs_in_queues(queues, args.signal) @@ -352,20 +383,20 @@ def run_checks(queues, args): # raise ExceededMaxWaitTime(diagnostics) -def get_max_running_time(lifetime, queuedata, queues, push, pod): +def get_max_running_time(lifetime: int, queuedata: Any, queues: Any, push: bool, pod: bool) -> int: """ Return the maximum allowed running time for the pilot. + The max time is set either as a pilot option or via the schedconfig.maxtime for the PQ in question. If running in a Kubernetes pod, always use the args.lifetime as maxtime (it will be determined by the harvester submitter). - :param lifetime: optional pilot option time in seconds (int).
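The pilot-heartbeat check added to run_checks() above can also be viewed in isolation: if the heartbeat file has not been refreshed within the configured limit the pilot updates it, and a gap of more than ten minutes is taken as a sign that the job (or node) was suspended. The constants and the print call below are illustrative stand-ins, not the behaviour of update_pilot_heartbeat():

    import time

    PILOT_HEARTBEAT_LIMIT = 60      # illustrative fallback, cf. config.Pilot.pilot_heartbeat
    SUSPENSION_THRESHOLD = 10 * 60  # gaps above ten minutes suggest a suspension

    def check_pilot_heartbeat(last_update: float) -> float:
        """Return a possibly refreshed heartbeat time stamp, warning on suspected suspension."""
        gap = time.time() - last_update
        if gap <= PILOT_HEARTBEAT_LIMIT:
            return last_update
        if gap > SUSPENSION_THRESHOLD:
            print(f'detected possible job suspension (heartbeat gap: {gap:.0f} s)')
        return time.time()  # stand-in for a successful update_pilot_heartbeat() call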
- :param queuedata: queuedata object - :param queues: - :param push: push mode (boolean) - :param pod: pod mode (boolean) - :return: max running time in seconds (int) + :param lifetime: optional pilot option time in seconds (int) + :param queuedata: queuedata object (Any) + :param queues: queues object (Any) + :param push: push mode (bool) + :param pod: pod mode (bool) + :return: max running time in seconds (int). """ - if pod: return lifetime diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 73a440dd..ad39cbb8 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -23,41 +23,54 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 # - Wen Guan, wen.guan@cern.ch, 2017-2018 +"""Functions for handling the payload.""" + +import logging import os import time import traceback import queue from re import findall, split +from typing import Any, TextIO -from pilot.control.payloads import generic, eventservice, eventservicemerge +from pilot.control.payloads import ( + generic, + eventservice, + eventservicemerge +) from pilot.control.job import send_state from pilot.util.auxiliary import set_pilot_state from pilot.util.container import execute from pilot.util.processes import get_cpu_consumption_time from pilot.util.config import config -from pilot.util.filehandling import read_file, remove_core_dumps, get_guid, extract_lines_from_file, find_file +from pilot.util.filehandling import ( + read_file, + remove_core_dumps, + get_guid, + extract_lines_from_file, + find_file +) from pilot.util.processes import threads_aborted from pilot.util.queuehandling import put_in_queue from pilot.common.errorcodes import ErrorCodes -from pilot.common.exception import ExcThread, PilotException +from pilot.common.exception import ( + ExcThread, + PilotException +) from pilot.util.realtimelogger import get_realtime_logger -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() -def control(queues, traces, args): +def control(queues: Any, traces: Any, args: Any): """ - (add description) + Set up payload threads. - :param queues: - :param traces: - :param args: - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). """ - targets = {'validate_pre': validate_pre, 'execute_payloads': execute_payloads, 'validate_post': validate_post, 'failed_post': failed_post, 'run_realtimelog': run_realtimelog} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, @@ -66,24 +79,29 @@ def control(queues, traces, args): [thread.start() for thread in threads] # if an exception is thrown, the graceful_stop will be set by the ExcThread class run() function - while not args.graceful_stop.is_set(): - for thread in threads: - bucket = thread.get_bucket() - try: - exc = bucket.get(block=False) - except queue.Empty: - pass - else: - exc_type, exc_obj, exc_trace = exc - logger.warning(f"thread \'{thread.name}\' received an exception from bucket: {exc_obj}") + try: + while not args.graceful_stop.is_set(): + for thread in threads: + bucket = thread.get_bucket() + try: + exc = bucket.get(block=False) + except queue.Empty: + pass + else: + exc_type, exc_obj, exc_trace = exc + logger.warning(f"thread \'{thread.name}\' received an exception from bucket: {exc_obj}") - # deal with the exception - # .. + # deal with the exception + # .. 
- thread.join(0.1) - time.sleep(0.1) + thread.join(0.1) + time.sleep(0.1) - time.sleep(0.5) + time.sleep(0.5) + except Exception as exc: + logger.warning(f"exception caught while handling threads: {exc}") + finally: + logger.info('all payload control threads have been joined') logger.debug('payload control ending since graceful_stop has been set') if args.abort_job.is_set(): @@ -104,17 +122,18 @@ def control(queues, traces, args): logger.info('[payload] control thread has finished') -def validate_pre(queues, traces, args): +def validate_pre(queues: Any, traces: Any, args: Any): """ Get a Job object from the "payloads" queue and validate it. + Thread. + If the payload is successfully validated (user defined), the Job object is placed in the "validated_payloads" queue, otherwise it is placed in the "failed_payloads" queue. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). """ while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -137,19 +156,18 @@ def validate_pre(queues, traces, args): logger.info('[payload] validate_pre thread has finished') -def _validate_payload(job): +def _validate_payload(job: Any) -> bool: """ - Perform validation tests for the payload. + Perform user validation tests for the payload. - :param job: job object. - :return: boolean. + :param job: job object (Any) + :return: boolean (bool). """ - status = True # perform user specific validation pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) try: status = user.validate(job) except Exception as error: @@ -159,16 +177,16 @@ def _validate_payload(job): return status -def get_payload_executor(args, job, out, err, traces): +def get_payload_executor(args: Any, job: Any, out: TextIO, err: TextIO, traces: Any) -> Any: """ Get payload executor function for different payload. - :param args: args object - :param job: job object - :param out: stdout file object - :param err: stderr file object - :param traces: traces object - :return: instance of a payload executor. + :param args: Pilot arguments object (Any) + :param job: job object (Any) + :param out: stdout file object (TextIO) + :param err: stderr file object (TextIO) + :param traces: traces object (Any) + :return: instance of a payload executor (Any). """ if job.is_eventservice: # True for native HPO workflow as well payload_executor = eventservice.Executor(args, job, out, err, traces) @@ -176,10 +194,11 @@ def get_payload_executor(args, job, out, err, traces): payload_executor = eventservicemerge.Executor(args, job, out, err, traces) else: payload_executor = generic.Executor(args, job, out, err, traces) + return payload_executor -def execute_payloads(queues, traces, args): # noqa: C901 +def execute_payloads(queues: Any, traces: Any, args: Any): # noqa: C901 """ Execute queued payloads. @@ -189,12 +208,10 @@ def execute_payloads(queues, traces, args): # noqa: C901 is started, the thread will wait for it to finish and then check for any failures. 
A successfully completed job is placed in the "finished_payloads" queue, and a failed job will be placed in the "failed_payloads" queue. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). """ - job = None while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -216,8 +233,12 @@ def execute_payloads(queues, traces, args): # noqa: C901 logger.debug(f'job {job.jobid} added to monitored payloads queue') try: - out = open(os.path.join(job.workdir, config.Payload.payloadstdout), 'wb') - err = open(os.path.join(job.workdir, config.Payload.payloadstderr), 'wb') + if job.is_eventservice or job.is_eventservicemerge: + out = open(os.path.join(job.workdir, config.Payload.payloadstdout), 'ab') + err = open(os.path.join(job.workdir, config.Payload.payloadstderr), 'ab') + else: + out = open(os.path.join(job.workdir, config.Payload.payloadstdout), 'wb') + err = open(os.path.join(job.workdir, config.Payload.payloadstderr), 'wb') except Exception as error: logger.warning(f'failed to open payload stdout/err: {error}') out = None @@ -236,7 +257,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 # run the payload and measure the execution time job.t0 = os.times() exit_code, diagnostics = payload_executor.run() - if exit_code > 1000: # pilot error code, add to list + if exit_code and exit_code > 1000: # pilot error code, add to list logger.debug(f'pilot error code received (code={exit_code}, diagnostics=\n{diagnostics})') job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code, msg=diagnostics) @@ -249,7 +270,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 # some HPO jobs will produce new output files (following lfn name pattern), discover those and replace the job.outdata list pilot_user = os.environ.get('PILOT_USER', 'generic').lower() if job.is_hpo: - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) try: user.update_output_for_hpo(job) except Exception as error: @@ -269,7 +290,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 perform_initial_payload_error_analysis(job, exit_code) # was an error already found? - user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.diagnose', globals(), locals(), [pilot_user], 0) try: exit_code_interpret = user.interpret(job) except Exception as error: @@ -320,14 +341,13 @@ def execute_payloads(queues, traces, args): # noqa: C901 logger.info('[payload] execute_payloads thread has finished') -def extract_error_info(error): +def extract_error_info(error: str) -> (int, str): """ Extract the error code and diagnostics from an error exception. - :param error: exception string. - :return: error code (int), diagnostics (string). + :param error: exception string (str) + :return: error code (int), diagnostics (str). 
""" - error_code = errors.INTERNALPILOTPROBLEM diagnostics = f'full exception: {error}' @@ -343,30 +363,16 @@ def extract_error_info(error): return error_code, diagnostics -def get_transport(catchall): - """ - Extract the transport/protocol from catchall if present. - - :param catchall: PQ.catchall field (string). - :return: transport (string). - """ - - transport = '' - - return transport - - -def get_rtlogging(): +def get_rtlogging() -> str: """ Return the proper rtlogging value from the experiment specific plug-in or the config file. :return: rtlogging (str). """ - rtlogging = None pilot_user = os.environ.get('PILOT_USER', 'generic').lower() try: - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) rtlogging = user.get_rtlogging() except Exception as exc: rtlogging = config.Pilot.rtlogging @@ -375,9 +381,10 @@ def get_rtlogging(): return rtlogging -def get_logging_info(job, args): +def get_logging_info(job: Any, args: Any) -> dict: """ Extract the logging type/protocol/url/port from catchall if present, or from args fields. + Returns a dictionary with the format: {'logging_type': .., 'protocol': .., 'url': .., 'port': .., 'logname': ..} If the provided debug_command contains a tail instruction ('tail log_file_name'), the pilot will locate @@ -385,11 +392,10 @@ def get_logging_info(job, args): Note: the returned dictionary can be built with either args (has priority) or catchall info. - :param job: job object. - :param args: args object. - :return: info dictionary (logging_type (string), protocol (string), url (string), port (int)). + :param job: job object (Any) + :param args: Pilot arguments object (Any) + :return: info dictionary (logging_type (string), protocol (string), url (string), port (int)) (dict). """ - info_dic = {} if not job.realtimelogging: @@ -448,17 +454,16 @@ def get_logging_info(job, args): return info_dic -def find_log_to_tail(debug_command, workdir, args, is_analysis): +def find_log_to_tail(debug_command: str, workdir: str, args: Any, is_analysis: bool) -> str: """ Find the log file to tail in the RT logging. - :param debug_command: requested debug command (string). - :param workdir: job working directory (string). - :param args: pilot args object. - :param is_analysis: True for user jobs (Bool). - :return: path to log file (string). + :param debug_command: requested debug command (str) + :param workdir: job working directory (str) + :param args: Pilot arguments object (Any) + :param is_analysis: True for user jobs, False otherwise (bool) + :return: path to log file (str). """ - path = "" filename = "" counter = 0 @@ -490,18 +495,17 @@ def find_log_to_tail(debug_command, workdir, args, is_analysis): return logf -def run_realtimelog(queues, traces, args): # noqa: C901 +def run_realtimelog(queues: Any, traces: Any, args: Any): # noqa: C901 """ Validate finished payloads. + If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as well but only for log stage-out (in failed_post() below). - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. 
containing queue name, queuedata dictionary, etc) (Any). """ - info_dic = None while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -514,23 +518,31 @@ def run_realtimelog(queues, traces, args): # noqa: C901 abort_loops = False first1 = True first2 = True + gotonextjob = False while not args.graceful_stop.is_set(): # note: in multi-job mode, the real-time logging will be switched off at the end of the job while not args.graceful_stop.is_set(): if job.state == 'running': if first1: - logger.debug('job is running, check if real-time logger is needed') + logger.debug(f'job is running, check if real-time logger is needed ' + f'(job.debug={job.debug}, job.debug_command={job.debug_command})') first1 = False break if job.state == 'stageout' or job.state == 'failed' or job.state == 'holding': if first2: logger.debug(f'job is in state {job.state}, continue to next job or abort (wait for graceful stop)') - first2 = False + first1 = True + break time.sleep(10) continue time.sleep(1) + if first1 and first2: + logger.debug('continue to next job (1)') + gotonextjob = True + break + if args.use_realtime_logging: # always do real-time logging job.realtimelogging = True @@ -547,6 +559,11 @@ def run_realtimelog(queues, traces, args): # noqa: C901 break time.sleep(10) + if gotonextjob: + logger.debug('continue to next job (2)') + gotonextjob = False + continue + # only set info_dic once per job (the info will not change) info_dic = get_logging_info(job, args) if info_dic: @@ -574,13 +591,12 @@ def run_realtimelog(queues, traces, args): # noqa: C901 logger.info('[payload] run_realtimelog thread has finished') -def set_cpu_consumption_time(job): +def set_cpu_consumption_time(job: Any): """ Set the CPU consumption time. - :param job: job object. - :return: - """ + :param job: job object (Any). + """ cpuconsumptiontime = get_cpu_consumption_time(job.t0) job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconsumptionunit = "s" @@ -588,16 +604,15 @@ def set_cpu_consumption_time(job): logger.info(f'CPU consumption time: {cpuconsumptiontime} {job.cpuconsumptionunit} (rounded to {job.cpuconsumptiontime} {job.cpuconsumptionunit})') -def perform_initial_payload_error_analysis(job, exit_code): +def perform_initial_payload_error_analysis(job: Any, exit_code: int): """ Perform an initial analysis of the payload. + Singularity/apptainer errors are caught here. - :param job: job object. - :param exit_code: exit code from payload execution. - :return: + :param job: job object (Any) + :param exit_code: exit code from payload execution (int). """ - if exit_code != 0: logger.warning(f'main payload execution returned non-zero exit code: {exit_code}') @@ -674,14 +689,13 @@ def perform_initial_payload_error_analysis(job, exit_code): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COREDUMP) -def scan_for_memory_errors(subprocesses): +def scan_for_memory_errors(subprocesses: list) -> str: """ Scan for memory errors in dmesg messages. - :param subprocesses: list of payload subprocesses. - :return: error diagnostics (string). + :param subprocesses: list of payload subprocesses (list) + :return: error diagnostics (str). """ - diagnostics = "" search_str = 'Memory cgroup out of memory' for pid in subprocesses: @@ -701,16 +715,16 @@ def scan_for_memory_errors(subprocesses): return diagnostics -def set_error_code_from_stderr(msg, fatal): +def set_error_code_from_stderr(msg: str, fatal: bool) -> int: """ Identify specific errors in stderr and set the corresponding error code. 
+ The function returns 0 if no error is recognized. - :param msg: stderr (string). - :param fatal: boolean flag if fatal error among warning messages in stderr. + :param msg: stderr (str) + :param fatal: boolean flag if fatal error among warning messages in stderr (bool) :return: error code (int). """ - exit_code = 0 error_map = {errors.SINGULARITYNEWUSERNAMESPACE: "Failed invoking the NEWUSER namespace runtime", errors.SINGULARITYFAILEDUSERNAMESPACE: "Failed to create user namespace", @@ -731,18 +745,19 @@ def set_error_code_from_stderr(msg, fatal): return exit_code -def validate_post(queues, traces, args): +def validate_post(queues: Any, traces: Any, args: Any): """ Validate finished payloads. + + Thread. + If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as well but only for log stage-out (in failed_post() below). - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). """ - while not args.graceful_stop.is_set(): time.sleep(0.5) # finished payloads @@ -767,17 +782,19 @@ def validate_post(queues, traces, args): logger.info('[payload] validate_post thread has finished') -def failed_post(queues, traces, args): +def failed_post(queues: Any, traces: Any, args: Any): """ - Get a Job object from the "failed_payloads" queue. Set the pilot state to "stakeout" and the stageout field to + Handle failed jobs. + + Thread. + + Get a Job object from the "failed_payloads" queue. Set the pilot state to "stageout" and the stageout field to "log", and add the Job object to the "data_out" queue. - :param queues: internal queues for job handling. - :param traces: tuple containing internal pilot states. - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). - :return: + :param queues: internal queues for job handling (Any) + :param traces: tuple containing internal pilot states (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). """ - while not args.graceful_stop.is_set(): time.sleep(0.5) # finished payloads diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 8d6cd068..c46ef868 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -597,8 +597,8 @@ def wait_graceful(self, args: Any, proc: Any) -> int: break time.sleep(1) if breaker: - logger.info(f'breaking -- sleep 3s before sending SIGKILL pid={proc.pid}') - time.sleep(3) + logger.info(f'breaking -- sleep 10 s before sending SIGKILL pid={proc.pid}') + time.sleep(10) proc.kill() break @@ -687,14 +687,14 @@ def should_verify_setup(self): user = __import__(f'pilot.user.{pilot_user}.setup', globals(), locals(), [pilot_user], 0) return user.should_verify_setup(self.__job) - def run(self) -> int: # noqa: C901 + def run(self) -> (int, str): # noqa: C901 """ Run all payload processes (including pre- and post-processes, and utilities). In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special exit code. - :return: exit code (int). + :return: exit code (int), diagnostics (str). 
""" diagnostics = '' @@ -745,7 +745,7 @@ def run(self) -> int: # noqa: C901 except Exception as error: diagnostics = f'could not execute: {error}' logger.error(diagnostics) - return None, diagnostics + return errors.PAYLOADEXECUTIONEXCEPTION, diagnostics self.post_setup(self.__job) diff --git a/pilot/copytool/common.py b/pilot/copytool/common.py index 3646a5f6..9233a7b8 100644 --- a/pilot/copytool/common.py +++ b/pilot/copytool/common.py @@ -21,6 +21,8 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 # - Mario Lassnig, mario.lassnig@cern.ch, 2020 +"""Commnon functions for copytools.""" + import logging import os import re @@ -40,11 +42,10 @@ def get_timeout(filesize: int, add: int = 0) -> int: """ Get a proper time-out limit based on the file size. - :param filesize: file size (int). + :param filesize: file size (int) :param add: optional additional time to be added [s] (int) :return: time-out in seconds (int). """ - timeout_max = 3 * 3600 # 3 hours timeout_min = 300 # self.timeout @@ -56,13 +57,13 @@ def get_timeout(filesize: int, add: int = 0) -> int: def verify_catalog_checksum(fspec: Any, path: str) -> (str, str): """ Verify that the local and remote (fspec) checksum values are the same. + The function will update the fspec object. - :param fspec: FileSpec object for a given file. - :param path: path to local file (string). - :return: state (string), diagnostics (string). + :param fspec: FileSpec object for a given file (Any) + :param path: path to local file (str) + :return: state (str), diagnostics (str). """ - diagnostics = "" state = "" @@ -104,10 +105,10 @@ def verify_catalog_checksum(fspec: Any, path: str) -> (str, str): def merge_destinations(files: list) -> dict: """ - Converts the file-with-destination dict to a destination-with-files dict + Convert the file-with-destination dict to a destination-with-files dictionary. :param files: files to merge (list) - :return: destination-with-files dictionary. + :return: destination-with-files dictionary (dict). """ destinations = {} # ensure type(files) == list @@ -132,9 +133,9 @@ def get_copysetup(copytools: list, copytool_name: str) -> str: """ Return the copysetup for the given copytool. - :param copytools: copytools list from infosys. - :param copytool_name: name of copytool (string). - :return: copysetup (string). + :param copytools: copytools list from infosys (list) + :param copytool_name: name of copytool (str) + :return: copysetup (str). """ copysetup = "" @@ -152,27 +153,27 @@ def get_copysetup(copytools: list, copytool_name: str) -> str: def get_error_info(rcode: int, state: str, error_msg: str) -> dict: """ Return an error info dictionary specific to transfer errors. + Helper function to resolve_common_transfer_errors(). - :param rcode: return code (int). - :param state: state string used in Rucio traces. - :param error_msg: transfer command stdout (string). - :return: dictionary with format {'rcode': rcode, 'state': state, 'error': error_msg}. + :param rcode: return code (int) + :param state: state string used in Rucio traces (str) + :param error_msg: transfer command stdout (str) + :return: dictionary with format {'rcode': rcode, 'state': state, 'error': error_msg} (dict). """ - return {'rcode': rcode, 'state': state, 'error': error_msg} def output_line_scan(ret: dict, output: str) -> dict: """ Do some reg exp on the transfer command output to search for special errors. + Helper function to resolve_common_transfer_errors(). 
- :param ret: pre-filled error info dictionary with format {'rcode': rcode, 'state': state, 'error': error_msg} - :param output: transfer command stdout (string). - :return: updated error info dictionary. + :param ret: pre-filled error info dictionary with format {'rcode': rcode, 'state': state, 'error': error_msg} (dict) + :param output: transfer command stdout (str) + :return: updated error info dictionary (dict). """ - for line in output.split('\n'): match = re.search(r"[Dd]etails\s*:\s*(?P.*)", line) # Python 3 (added r) if match: @@ -188,11 +189,10 @@ def resolve_common_transfer_errors(output: str, is_stagein: bool = True) -> dict """ Resolve any common transfer related errors. - :param output: stdout from transfer command (string). - :param is_stagein: optional (boolean). - :return: dict {'rcode': rcode, 'state': state, 'error': error_msg}. + :param output: stdout from transfer command (str) + :param is_stagein: optional (bool) + :return: dict {'rcode': rcode, 'state': state, 'error': error_msg} (dict). """ - # default to make sure dictionary exists and all fields are populated (some of which might be overwritten below) ret = get_error_info(ErrorCodes.STAGEINFAILED if is_stagein else ErrorCodes.STAGEOUTFAILED, 'COPY_ERROR', output) if not output: diff --git a/pilot/copytool/gfal.py b/pilot/copytool/gfal.py index 4570753b..acf01a18 100644 --- a/pilot/copytool/gfal.py +++ b/pilot/copytool/gfal.py @@ -22,9 +22,11 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -import os -import logging +"""GFAL2 copy tool.""" + import errno +import logging +import os from time import time from .common import resolve_common_transfer_errors, get_timeout @@ -39,30 +41,45 @@ allowed_schemas = ['srm', 'gsiftp', 'https', 'davs', 'root'] # prioritized list of supported schemas for transfers by given copytool -def is_valid_for_copy_in(files): - return True ## FIX ME LATER +def is_valid_for_copy_in(files: list) -> bool: + """ + Determine if this copytool is valid for input for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ #for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False - #return True + return True ## FIX ME LATER -def is_valid_for_copy_out(files): - return True ## FIX ME LATER - #for f in files: +def is_valid_for_copy_out(files: list) -> bool: + """ + Determine if this copytool is valid for output for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False - #return True + return True ## FIX ME LATER -def copy_in(files, **kwargs): +def copy_in(files: list, **kwargs: dict) -> list: """ - Download given files using gfal-copy command. + Download given files using gfal-copy command. - :param files: list of `FileSpec` objects - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :raises: PilotException in case of controlled error + :return: updated files (list). 
""" - #allow_direct_access = kwargs.get('allow_direct_access') or False trace_report = kwargs.get('trace_report') @@ -92,12 +109,12 @@ def copy_in(files, **kwargs): timeout = get_timeout(fspec.filesize) source = fspec.turl - destination = "file://%s" % os.path.abspath(os.path.join(dst, fspec.lfn)) + destination = f"file://{os.path.abspath(os.path.join(dst, fspec.lfn))}" - cmd = ['gfal-copy --verbose -f', ' -t %s' % timeout] + cmd = ['gfal-copy --verbose -f', f' -t {timeout}'] if fspec.checksum: - cmd += ['-K', '%s:%s' % list(fspec.checksum.items())[0]] # Python 2/3 + cmd += ['-K', '%s:%s' % list(fspec.checksum.items())[0]] cmd += [source, destination] @@ -107,7 +124,7 @@ def copy_in(files, **kwargs): if rcode in [errno.ETIMEDOUT, errno.ETIME]: error = {'rcode': ErrorCodes.STAGEINTIMEOUT, 'state': 'CP_TIMEOUT', - 'error': 'Copy command timed out: %s' % stderr} + 'error': f'Copy command timed out: {stderr}'} else: error = resolve_common_transfer_errors(stdout + stderr, is_stagein=True) fspec.status = 'failed' @@ -126,14 +143,14 @@ def copy_in(files, **kwargs): return files -def copy_out(files, **kwargs): +def copy_out(files: list, **kwargs: dict) -> list: """ Upload given files using gfal command. - :param files: Files to upload + :param files: Files to upload (files) :raises: PilotException in case of errors + :return: updated files (list). """ - if not check_for_gfal(): raise StageOutFailure("No GFAL2 tools found") @@ -147,10 +164,10 @@ def copy_out(files, **kwargs): timeout = get_timeout(fspec.filesize) - source = "file://%s" % os.path.abspath(fspec.surl or os.path.join(src, fspec.lfn)) + source = f"file://{os.path.abspath(fspec.surl or os.path.join(src, fspec.lfn))}" destination = fspec.turl - cmd = ['gfal-copy --verbose -f', ' -t %s' % timeout] + cmd = ['gfal-copy --verbose -f', f' -t {timeout}'] if fspec.checksum: cmd += ['-K', '%s:%s' % list(fspec.checksum.items())[0]] # Python 2/3 @@ -163,7 +180,7 @@ def copy_out(files, **kwargs): if rcode in [errno.ETIMEDOUT, errno.ETIME]: error = {'rcode': ErrorCodes.STAGEOUTTIMEOUT, 'state': 'CP_TIMEOUT', - 'error': 'Copy command timed out: %s' % stderr} + 'error': f'Copy command timed out: {stderr}'} else: error = resolve_common_transfer_errors(stdout + stderr, is_stagein=False) fspec.status = 'failed' @@ -182,21 +199,20 @@ def copy_out(files, **kwargs): return files -def move_all_files_in(files, nretries=1): ### NOT USED -- TO BE DEPRECATED +def move_all_files_in(files: list, nretries: int = 1) -> (int, str, str): ### NOT USED -- TO BE DEPRECATED """ - Move all files. + Move all input files. - :param files: - :param nretries: number of retries; sometimes there can be a timeout copying, but the next attempt may succeed - :return: exit_code, stdout, stderr + :param files: list of FileSpec objects (list) + :param nretries: number of retries; sometimes there can be a timeout copying, but the next attempt may succeed (int) + :return: exit_code (int), stdout (str), stderr (str). """ - exit_code = 0 stdout = "" stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s" % (entry['name'], entry['source'], entry['destination'])) + logger.info(f"transferring file {entry['name']} from {entry['source']} to {entry['destination']}") source = entry['source'] + '/' + entry['name'] # why /*4 ? Because sometimes gfal-copy complains about file:// protocol (anyone knows why?) 
@@ -207,7 +223,7 @@ def move_all_files_in(files, nretries=1): ### NOT USED -- TO BE DEPRECATED if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful break @@ -215,20 +231,19 @@ def move_all_files_in(files, nretries=1): ### NOT USED -- TO BE DEPRECATED return exit_code, stdout, stderr -def move_all_files_out(files, nretries=1): ### NOT USED -- TO BE DEPRECATED +def move_all_files_out(files: list, nretries: int = 1) -> (int, str, str): ### NOT USED -- TO BE DEPRECATED """ - Move all files. + Move all output files. - :param files: - :return: exit_code, stdout, stderr + :param files: list of FileSpec objects (list) + :return: exit_code (int), stdout (str), stderr (str). """ - exit_code = 0 stdout = "" stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s" % (entry['name'], entry['source'], entry['destination'])) + logger.info(f"transferring file {entry['name']} from {entry['source']} to {entry['destination']}") destination = entry['destination'] + '/' + entry['name'] # why /*4 ? Because sometimes gfal-copy complains about file:// protocol (anyone knows why?) @@ -239,7 +254,7 @@ def move_all_files_out(files, nretries=1): ### NOT USED -- TO BE DEPRECATED if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s" % (exit_code, stdout, stderr)) + logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful break @@ -248,12 +263,19 @@ def move_all_files_out(files, nretries=1): ### NOT USED -- TO BE DEPRECATED #@timeout(seconds=10800) -def move(source, destination, recursive=False): - cmd = None +def move(source: str, destination: str, recursive: bool = False) -> (int, str, str): + """ + Perform gfal-copy from the given source location to the given destination. + + :param source: file source path (str) + :param destination: destination path (str) + :param recursive: True if -r option is desired, False otherwise (bool) + :return: exit code (int), stdout (str), stderr (str). + """ if recursive: - cmd = "gfal-copy -r %s %s" % (source, destination) + cmd = f"gfal-copy -r {source} {destination}" else: - cmd = "gfal-copy %s %s" % (source, destination) + cmd = f"gfal-copy {source} {destination}" print(cmd) exit_code, stdout, stderr = execute(cmd) @@ -261,5 +283,10 @@ def move(source, destination, recursive=False): def check_for_gfal(): - exit_code, gfal_path, _ = execute('which gfal-copy') + """ + Check if gfal-copy is locally available. + + :return: True if gfal-copy is available, False otherwise (bool). 
+ """ + exit_code, _, _ = execute('which gfal-copy') return exit_code == 0 diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 33b99a4f..2d54a1b4 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -18,14 +18,19 @@ # # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 -# - Shuwei +# - Shuwei Ye, yesw@bnl.gov, 2021 + +"""GS copy tool.""" -import os import logging -from pilot.info import infosys -import subprocess +import os +import pathlib import re +import subprocess from glob import glob +from typing import Any + +from pilot.info import infosys try: from google.cloud import storage @@ -34,11 +39,6 @@ else: storage_client = storage.Client() -try: - import pathlib # Python 3 -except Exception: - pathlib = None - from .common import resolve_common_transfer_errors from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException @@ -54,25 +54,48 @@ allowed_schemas = ['gs', 'srm', 'gsiftp', 'https', 'davs', 'root'] -def is_valid_for_copy_in(files): - return True ## FIX ME LATER +def is_valid_for_copy_in(files: list) -> bool: + """ + Determine if this copytool is valid for input for the given file list. + Placeholder. -def is_valid_for_copy_out(files): + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False return True ## FIX ME LATER -def resolve_surl(fspec, protocol, ddmconf, **kwargs): +def is_valid_for_copy_out(files: list) -> bool: """ - Get final destination SURL for file to be transferred to Objectstore - Can be customized at the level of specific copytool + Determine if this copytool is valid for output for the given file list. + + Placeholder. - :param protocol: suggested protocol - :param ddmconf: full ddm storage data - :param fspec: file spec data - :return: dictionary {'surl': surl} + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False + return True ## FIX ME LATER + + +def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: + """ + Get final destination SURL for file to be transferred to Objectstore. + + Can be customized at the level of specific copytool. + :param fspec: file spec data (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddm storage data (dict) + :param kwargs: kwargs dictionary (dict) + :return: SURL dictionary {'surl': surl} (dict). 
+ """ try: pandaqueue = infosys.pandaqueue except Exception: @@ -82,7 +105,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): ddm = ddmconf.get(fspec.ddmendpoint) if not ddm: - raise PilotException('failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint) + raise PilotException(f'failed to resolve ddmendpoint by name={fspec.ddmendpoint}') dataset = fspec.dataset if dataset: @@ -92,7 +115,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): remote_path = os.path.join(protocol.get('path', ''), pandaqueue, dataset) surl = protocol.get('endpoint', '') + remote_path - logger.info('For GCS bucket, set surl=%s', surl) + logger.info(f'for GCS bucket, set surl={surl}') # example: # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175} @@ -100,19 +123,19 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): return {'surl': surl} -def copy_in(files, **kwargs): +def copy_in(files: list, **kwargs: dict) -> list: """ Download given files from a GCS bucket. - :param files: list of `FileSpec` objects + :param files: list of `FileSpec` objects (list) :raise: PilotException in case of controlled error + :return: updated files (list). """ - for fspec in files: dst = fspec.workdir or kwargs.get('workdir') or '.' path = os.path.join(dst, fspec.lfn) - logger.info('downloading surl=%s to local file %s', fspec.surl, path) + logger.info(f'downloading surl={fspec.surl} to local file {path}') status, diagnostics = download_file(path, fspec.surl, object_name=fspec.lfn) if not status: ## an error occurred @@ -127,16 +150,15 @@ def copy_in(files, **kwargs): return files -def download_file(path, surl, object_name=None): +def download_file(path: str, surl: str, object_name: str = None) -> (bool, str): """ Download a file from a GS bucket. - :param path: Path to local file after download (string). - :param surl: remote path (string). - :param object_name: GCS object name. If not specified then file_name from path is used. - :return: True if file was uploaded (else False), diagnostics (string). + :param path: Path to local file after download (str) + :param surl: remote path (str) + :param object_name: GCS object name. If not specified then file_name from path is used (str) + :return: True if file was uploaded - otherwise False (bool), diagnostics (str). """ - # if object_name was not specified, use file name from path if object_name is None: object_name = os.path.basename(path) @@ -146,21 +168,21 @@ def download_file(path, surl, object_name=None): with target.open(mode="wb") as downloaded_file: storage_client.download_blob_to_file(surl, downloaded_file) except Exception as error: - diagnostics = 'exception caught in gs client: %s' % error + diagnostics = f'exception caught in gs client: {error}' logger.critical(diagnostics) return False, diagnostics return True, "" -def copy_out(files, **kwargs): +def copy_out(files: list, **kwargs: dict): """ Upload given files to GS storage. - :param files: list of `FileSpec` objects + :param files: list of `FileSpec` objects (list) :raise: PilotException in case of controlled error + :return: updated files (list). 
""" - workdir = kwargs.pop('workdir') # if len(files) > 0: @@ -170,7 +192,7 @@ def copy_out(files, **kwargs): # (bucket, remote_path) = reobj.groups() for fspec in files: - logger.info('Going to process fspec.turl=%s', fspec.turl) + logger.info(f'processing fspec.turl={fspec.turl}') fspec.status = None reobj = re.match(r'gs://([^/]*)/(.*)', fspec.turl) @@ -195,7 +217,7 @@ def copy_out(files, **kwargs): if os.path.exists(path): if logfile == config.Pilot.pilotlog or logfile == config.Payload.payloadstdout or logfile == config.Payload.payloadstderr: content_type = "text/plain" - logger.info('Change the file=%s content-type to text/plain', logfile) + logger.debug(f'change the file {logfile} content-type to text/plain') else: content_type = None try: @@ -204,12 +226,12 @@ def copy_out(files, **kwargs): result = result.decode('utf-8') if result.find(';') > 0: content_type = result.split(';')[0] - logger.info('Change the file=%s content-type to %s', logfile, content_type) + logger.debug(f'change the file {logfile} content-type to {content_type}') except Exception: pass object_name = os.path.join(remote_path, logfile) - logger.info('uploading %s to bucket=%s using object name=%s', path, bucket, object_name) + logger.info(f'uploading {path} to bucket {bucket} using object name {object_name}') status, diagnostics = upload_file(path, bucket, object_name=object_name, content_type=content_type) if not status: ## an error occurred @@ -219,7 +241,7 @@ def copy_out(files, **kwargs): fspec.status_code = error.get('rcode') raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state')) else: - diagnostics = 'local output file does not exist: %s' % path + diagnostics = f'local output file does not exist: {path}' logger.warning(diagnostics) fspec.status = 'failed' fspec.status_code = errors.STAGEOUTFAILED @@ -232,16 +254,16 @@ def copy_out(files, **kwargs): return files -def upload_file(file_name, bucket, object_name=None, content_type=None): +def upload_file(file_name: str, bucket: str, object_name: str = None, content_type: str = None) -> (bool, str): """ Upload a file to a GCS bucket. - :param file_name: File to upload. - :param bucket: Bucket to upload to (string). - :param object_name: GCS object name. If not specified then file_name is used. + :param file_name: file to upload (str) + :param bucket: bucket to upload to (str) + :param object_name: GCS object name. If not specified then file_name is used (str) + :param content_type: content type (str) :return: True if file was uploaded (else False), diagnostics (string). 
""" - # if GCS object_name was not specified, use file_name if object_name is None: object_name = file_name @@ -251,15 +273,15 @@ def upload_file(file_name, bucket, object_name=None, content_type=None): gs_bucket = storage_client.get_bucket(bucket) # remove any leading slash(es) in object_name object_name = object_name.lstrip('/') - logger.info('uploading a file to bucket=%s in full path=%s in content_type=%s', bucket, object_name, content_type) + logger.info(f'uploading a file to bucket {bucket} in full path {object_name} in content_type {content_type}') blob = gs_bucket.blob(object_name) blob.upload_from_filename(filename=file_name, content_type=content_type) if file_name.endswith(config.Pilot.pilotlog): url_pilotlog = blob.public_url os.environ['GTAG'] = url_pilotlog - logger.debug("Set envvar GTAG with the pilotLot URL=%s", url_pilotlog) + logger.debug(f"set env var GTAG with the pilotLot URL={url_pilotlog}") except Exception as error: - diagnostics = 'exception caught in gs client: %s' % error + diagnostics = f'exception caught in gs client: {error}' logger.critical(diagnostics) return False, diagnostics diff --git a/pilot/copytool/lsm.py b/pilot/copytool/lsm.py index 0b6ca87c..9b182ea4 100644 --- a/pilot/copytool/lsm.py +++ b/pilot/copytool/lsm.py @@ -21,13 +21,22 @@ # - Tobias Wegner, tobias.wegner@cern.ch, 2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 -import os +"""Local site mover copy tool.""" + import logging import errno +import os from time import time -from .common import get_copysetup, verify_catalog_checksum, resolve_common_transfer_errors #, get_timeout -from pilot.common.exception import StageInFailure, StageOutFailure, PilotException, ErrorCodes +from .common import ( + get_copysetup, + verify_catalog_checksum, + resolve_common_transfer_errors #, get_timeout +) +from pilot.common.exception import ( + PilotException, + ErrorCodes +) from pilot.util.container import execute #from pilot.util.timer import timeout @@ -39,50 +48,45 @@ allowed_schemas = ['srm', 'gsiftp', 'root'] # prioritized list of supported schemas for transfers by given copytool -def is_valid_for_copy_in(files): - return True ## FIX ME LATER - #for f in files: - # if not all(key in f for key in ('name', 'source', 'destination')): - # return False - #return True +def is_valid_for_copy_in(files: list) -> bool: + """ + Determine if this copytool is valid for input for the given file list. + Placeholder. -def is_valid_for_copy_out(files): - #for f in files: + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: # if not all(key in f for key in ('name', 'source', 'destination')): # return False - return True + return True ## FIX ME LATER -def copy_in_old(files): +def is_valid_for_copy_out(files: list) -> bool: """ - Tries to download the given files using lsm-get directly. + Determine if this copytool is valid for output for the given file list. - :param files: Files to download - :raises PilotException: StageInFailure - """ + Placeholder. - if not check_for_lsm(dst_in=True): - raise StageInFailure("No LSM tools found") - exit_code, stdout, stderr = move_all_files_in(files) - if exit_code != 0: - # raise failure - raise StageInFailure(stdout) + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). 
+ """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False + return True ## FIX ME LATER -def copy_in(files, **kwargs): +def copy_in(files: list, **kwargs: dict) -> list: """ Download given files using the lsm-get command. - :param files: list of `FileSpec` objects. - :raise: PilotException in case of controlled error. - :return: files `FileSpec` object. + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :raises: PilotException in case of controlled error + :return: updated files (list). """ - - exit_code = 0 - stdout = "" - stderr = "" - copytools = kwargs.get('copytools') or [] copysetup = get_copysetup(copytools, 'lsm') trace_report = kwargs.get('trace_report') @@ -113,12 +117,12 @@ def copy_in(files, **kwargs): source = fspec.turl destination = os.path.join(dst, fspec.lfn) - logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination) + logger.info(f"transferring file {fspec.lfn} from {source} to {destination}") exit_code, stdout, stderr = move(source, destination, dst_in=True, copysetup=copysetup) if exit_code != 0: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) + logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") error = resolve_common_transfer_errors(stderr, is_stagein=True) fspec.status = 'failed' @@ -148,14 +152,14 @@ def copy_in(files, **kwargs): return files -def copy_out(files, **kwargs): +def copy_out(files: list, **kwargs: dict) -> list: """ Upload given files using lsm copytool. - :param files: list of `FileSpec` objects. - :raise: PilotException in case of controlled error. + :param files: list of `FileSpec` objects (list) + :raise: PilotException in case of controlled error + :return: updated files (list). """ - copytools = kwargs.get('copytools') or [] copysetup = get_copysetup(copytools, 'lsm') trace_report = kwargs.get('trace_report') @@ -173,7 +177,7 @@ def copy_out(files, **kwargs): ddm = ddmconf.get(fspec.ddmendpoint) token = ddm.token if not token: - diagnostics = "copy_out() failed to resolve token value for ddmendpoint=%s" % (fspec.ddmendpoint) + diagnostics = f"copy_out() failed to resolve token value for ddmendpoint={fspec.ddmendpoint}" trace_report.update(clientState='STAGEOUT_ATTEMPT_FAILED', stateReason=diagnostics, timeEnd=time()) @@ -187,19 +191,16 @@ def copy_out(files, **kwargs): # checksum has been calculated in the previous step - transfer_files() in api/data # note: pilot is handing over checksum to the command - which will/should verify it after the transfer - checksum = "adler32:%s" % fspec.checksum.get('adler32') + checksum = f"adler32:{fspec.checksum.get('adler32')}" # define the command options opts = {'--size': fspec.filesize, '-t': token, '--checksum': checksum, '--guid': fspec.guid} - try: - opts = " ".join(["%s %s" % (k, v) for (k, v) in opts.iteritems()]) # Python 2 - except Exception: - opts = " ".join(["%s %s" % (k, v) for (k, v) in list(opts.items())]) # Python 3 + opts = " ".join([f"{k} {v}" for (k, v) in list(opts.items())]) - logger.info("transferring file %s from %s to %s", fspec.lfn, source, destination) + logger.info(f"transferring file {fspec.lfn} from {source} to {destination}") nretries = 1 # input parameter to function? for retry in range(nretries): @@ -228,38 +229,20 @@ def copy_out(files, **kwargs): return files -def copy_out_old(files): - """ - Tries to upload the given files using lsm-put directly. 
- - :param files: Files to upload - :raises PilotException: StageOutFailure - """ - - if not check_for_lsm(dst_in=False): - raise StageOutFailure("No LSM tools found") - - exit_code, stdout, stderr = move_all_files_out(files) - if exit_code != 0: - # raise failure - raise StageOutFailure(stdout) - - -def move_all_files_in(files, nretries=1): +def move_all_files_in(files: list, nretries: int = 1) -> (int, str, str): """ - Move all files. + Move all input files. - :param files: - :param nretries: number of retries; sometimes there can be a timeout copying, but the next attempt may succeed - :return: exit_code, stdout, stderr + :param files: list of FileSpec objects (list) + :param nretries: number of retries; sometimes there can be a timeout copying, but the next attempt may succeed (int) + :return: exit code (int), stdout (str), stderr (str). """ - exit_code = 0 stdout = "" stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s", entry['name'], entry['source'], entry['destination']) + logger.info(f"transferring file {entry['name']} from {entry['source']} to {entry['destination']}") source = entry['source'] + '/' + entry['name'] destination = os.path.join(entry['destination'], entry['name']) @@ -268,7 +251,7 @@ def move_all_files_in(files, nretries=1): if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) + logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful break @@ -276,20 +259,20 @@ def move_all_files_in(files, nretries=1): return exit_code, stdout, stderr -def move_all_files_out(files, nretries=1): +def move_all_files_out(files: list, nretries: int = 1) -> (int, str, str): """ - Move all files. + Move all output files. - :param files: - :return: exit_code, stdout, stderr + :param files: list of FileSpec objects (list) + :param nretries: number of retries; sometimes there can be a timeout copying, but the next attempt may succeed (int) + :return: exit code (int), stdout (str), stderr (str). """ - exit_code = 0 stdout = "" stderr = "" for entry in files: # entry = {'name':, 'source':, 'destination':} - logger.info("transferring file %s from %s to %s", entry['name'], entry['source'], entry['destination']) + logger.info(f"transferring file {entry['name']} from {entry['source']} to {entry['destination']}") destination = entry['destination'] + '/' + entry['name'] source = os.path.join(entry['source'], entry['name']) @@ -298,7 +281,7 @@ def move_all_files_out(files, nretries=1): if exit_code != 0: if ((exit_code != errno.ETIMEDOUT) and (exit_code != errno.ETIME)) or (retry + 1) == nretries: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) + logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") return exit_code, stdout, stderr else: # all successful break @@ -307,30 +290,29 @@ def move_all_files_out(files, nretries=1): #@timeout(seconds=10800) -def move(source, destination, dst_in=True, copysetup="", options=None): +def move(source: str, destination: str, dst_in: bool = True, copysetup: str = "", options: str = "") -> (int, str, str): """ Use lsm-get or lsm-put to transfer the file. - :param source: path to source (string).
- :param destination: path to destination (string). - :param dst_in: True for stage-in, False for stage-out (boolean). - :return: exit code, stdout, stderr + :param source: path to source (str) + :param destination: path to destination (str) + :param dst_in: True for stage-in, False for stage-out (bool) + :return: exit code (int), stdout (str), stderr (str). """ - # copysetup = '/osg/mwt2/app/atlas_app/atlaswn/setup.sh' if copysetup != "": - cmd = 'source %s;' % copysetup + cmd = f'source {copysetup};' else: cmd = '' - args = "%s %s" % (source, destination) + args = f"{source} {destination}" if options: - args = "%s %s" % (options, args) + args = f"{options} {args}" if dst_in: - cmd += "lsm-get %s" % args + cmd += f"lsm-get {args}" else: - cmd += "lsm-put %s" % args + cmd += f"lsm-put {args}" try: exit_code, stdout, stderr = execute(cmd, usecontainer=False, copytool=True) #, timeout=get_timeout(fspec.filesize)) @@ -343,15 +325,20 @@ def move(source, destination, dst_in=True, copysetup="", options=None): stderr = '' logger.warning(stdout) - logger.info('exit_code=%d, stdout=%s, stderr=%s', exit_code, stdout, stderr) + logger.info(f'exit_code={exit_code}, stdout={stdout}, stderr={stderr}') return exit_code, stdout, stderr -def check_for_lsm(dst_in=True): - cmd = None +def check_for_lsm(dst_in: bool = True) -> bool: + """ + Check if lsm-get / lsm-put are locally available. + + :param dst_in: True for stage-in, False for stage-out (bool) + :return: True if command is available (bool). + """ if dst_in: cmd = 'which lsm-get' else: cmd = 'which lsm-put' - exit_code, gfal_path, _ = execute(cmd) + exit_code, _, _ = execute(cmd) return exit_code == 0 diff --git a/pilot/copytool/mv.py b/pilot/copytool/mv.py index deef5b1f..62f04e73 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -21,24 +21,58 @@ # - Tobias Wegner, tobias.wegner@cern.ch, 2018 # - David Cameron, david.cameron@cern.ch, 2018-2022 +"""mv/cp/ln copy tool.""" + +import logging import os import re from pilot.common.exception import StageInFailure, StageOutFailure, MKDirFailure, ErrorCodes from pilot.util.container import execute -import logging logger = logging.getLogger(__name__) require_replicas = False # indicate if given copytool requires input replicas to be resolved check_availablespace = False # indicate whether space check should be applied before stage-in transfers using given copytool -def create_output_list(files, init_dir): +def is_valid_for_copy_in(files: list) -> bool: """ - Add files to the output list which tells ARC CE which files to upload + Determine if this copytool is valid for input for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False + return True ## FIX ME LATER + +def is_valid_for_copy_out(files: list) -> bool: + """ + Determine if this copytool is valid for input for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False + return True ## FIX ME LATER + + +def create_output_list(files: list, init_dir: str): + """ + Add files to the output list which tells ARC CE which files to upload. + + :param files: list of FileSpec objects (list) + :param init_dir: start directory (str). 
+ """ for fspec in files: arcturl = fspec.turl if arcturl.startswith('s3://'): @@ -52,48 +86,37 @@ def create_output_list(files, init_dir): else: # Add ARC options to TURL checksumtype, checksum = list(fspec.checksum.items())[0] - arcturl += ':checksumtype=%s:checksumvalue=%s' % (checksumtype, checksum) + arcturl += f':checksumtype={checksumtype}:checksumvalue={checksum}' - logger.info('Adding to output.list: %s %s', fspec.lfn, arcturl) + logger.info(f'adding to output.list: {fspec.lfn} {arcturl}') # Write output.list with open(os.path.join(init_dir, 'output.list'), 'a') as f: - f.write('%s %s\n' % (fspec.lfn, arcturl)) - - -def is_valid_for_copy_in(files): - return True # FIX ME LATER - #for f in files: - # if not all(key in f for key in ('name', 'source', 'destination')): - # return False - #return True - - -def is_valid_for_copy_out(files): - return True # FIX ME LATER - #for f in files: - # if not all(key in f for key in ('name', 'source', 'destination')): - # return False - #return True + f.write(f'{fspec.lfn} {arcturl}\n') -def get_dir_path(turl, prefix='file://localhost'): +def get_dir_path(turl: str, prefix: str = 'file://localhost') -> str: """ - Extract the directory path from the turl that has a given prefix + Extract the directory path from the turl that has a given prefix. + E.g. turl = 'file://localhost/sphenix/lustre01/sphnxpro/rucio/user/jwebb2/01/9f/user.jwebb2.66999._000001.top1outDS.tar' -> '/sphenix/lustre01/sphnxpro/rucio/user/jwebb2/01/9f' (some of these directories will typically have to be created in the next step). - :param turl: TURL (string). - :param prefix: file prefix (string). - :return: directory path (string). + :param turl: TURL (str) + :param prefix: file prefix (str) + :return: directory path (str). """ - - path = turl.replace(prefix, '') - return os.path.dirname(path) + return os.path.dirname(turl.replace(prefix, '')) -def build_final_path(turl, prefix='file://localhost'): +def build_final_path(turl: str, prefix: str = 'file://localhost') -> (int, str, str): + """ + Build the final path for the storage. + :param turl: TURL (str) + :param prefix: file prefix (str) + :return: error code (int), diagnostics (str), path (str). + """ path = '' # first get the directory path to be created @@ -115,14 +138,16 @@ def build_final_path(turl, prefix='file://localhost'): return 0, '', os.path.join(dirname, os.path.basename(turl)) -def copy_in(files, copy_type="symlink", **kwargs): +def copy_in(files: list, copy_type: str = "symlink", **kwargs: dict) -> list: """ - Tries to download the given files using mv directly. + Download the given files using mv directly. - :param files: list of `FileSpec` objects + :param files: list of `FileSpec` objects (list) + :param copy_type: copy type (str) + :param kwargs: kwargs dictionary (dict) :raises PilotException: StageInFailure + :return: updated files (list). """ - # make sure direct access is not attempted (wrong queue configuration - pilot should fail job) allow_direct_access = kwargs.get('allow_direct_access') for fspec in files: @@ -155,25 +180,27 @@ def copy_in(files, copy_type="symlink", **kwargs): return files -def copy_out(files, copy_type="mv", **kwargs): +def copy_out(files: list, copy_type: str = "mv", **kwargs: dict) -> list: """ - Tries to upload the given files using mv directly. + Upload the given files using mv directly. 
- :param files: list of `FileSpec` objects - :raises PilotException: StageOutFailure + :param files: list of `FileSpec` objects (list) + :param copy_type: copy type (str) + :param kwargs: kwargs dictionary (dict) + :raises PilotException: StageOutFailure, MKDirFailure + :return: updated files (list). """ - if copy_type not in ["cp", "mv"]: raise StageOutFailure("incorrect method for copy out") if not kwargs.get('workdir'): raise StageOutFailure("Workdir is not specified") - exit_code, stdout, stderr = move_all_files(files, - copy_type, - kwargs.get('workdir'), - '', - mvfinaldest=kwargs.get('mvfinaldest', False)) + exit_code, stdout, _ = move_all_files(files, + copy_type, + kwargs.get('workdir'), + '', + mvfinaldest=kwargs.get('mvfinaldest', False)) if exit_code != 0: # raise failure if exit_code == ErrorCodes.MKDIR: @@ -182,7 +209,7 @@ def copy_out(files, copy_type="mv", **kwargs): raise StageOutFailure(stdout) # Create output list for ARC CE if necessary - logger.debug('init_dir for output.list=%s', os.path.dirname(kwargs.get('workdir'))) + logger.debug(f"init_dir for output.list={os.path.dirname(kwargs.get('workdir'))}") output_dir = kwargs.get('output_dir', '') if not output_dir: create_output_list(files, os.path.dirname(kwargs.get('workdir'))) @@ -190,14 +217,18 @@ def copy_out(files, copy_type="mv", **kwargs): return files -def move_all_files(files, copy_type, workdir, jobworkdir, mvfinaldest=False): +def move_all_files(files: list, copy_type: str, workdir: str, jobworkdir: str, + mvfinaldest: bool = False) -> (int, str, str): """ Move all files. - :param files: list of `FileSpec` objects - :return: exit_code, stdout, stderr + :param files: list of `FileSpec` objects (list) + :param copy_type: copy type (str) + :param workdir: work directory (str) + :param jobworkdir: work directory for job (str) + :param mvfinaldest: True if we can transfer to final SE destination, False otherwise (bool) + :return: exit code (int), stdout (str), stderr (str). """ - exit_code = 0 stdout = "" stderr = "" @@ -240,11 +271,11 @@ def move_all_files(files, copy_type, workdir, jobworkdir, mvfinaldest=False): # resolve canonical path source = os.path.realpath(source) - logger.info("transferring file %s from %s to %s", name, source, destination) + logger.info(f"transferring file {name} from {source} to {destination}") exit_code, stdout, stderr = copy_method(source, destination) if exit_code != 0: - logger.warning("transfer failed: exit code = %d, stdout = %s, stderr = %s", exit_code, stdout, stderr) + logger.warning(f"transfer failed: exit code = {exit_code}, stdout = {stdout}, stderr = {stderr}") fspec.status = 'failed' if fspec.filetype == 'input': fspec.status_code = ErrorCodes.STAGEINFAILED @@ -258,52 +289,37 @@ def move_all_files(files, copy_type, workdir, jobworkdir, mvfinaldest=False): return exit_code, stdout, stderr -def move(source, destination): +def move(source: str, destination: str) -> (int, str, str): """ - Tries to upload the given files using mv directly. + Upload the given files using mv directly. - :param source: - :param destination: - - :return: exit_code, stdout, stderr + :param source: file source path (str) + :param destination: file destination path (str) + :return: exit code (int), stdout (str), stderr (str). 
""" - executable = ['/usr/bin/env', 'mv', source, destination] - cmd = ' '.join(executable) - exit_code, stdout, stderr = execute(cmd) - - return exit_code, stdout, stderr + return execute(' '.join(executable)) -def copy(source, destination): +def copy(source: str, destination: str) -> (int, str, str): """ - Tries to upload the given files using xrdcp directly. - - :param source: - :param destination: + Upload the given files using cp directly. - :return: exit_code, stdout, stderr + :param source: file source path (str) + :param destination: file destination path (str) + :return: exit code (int), stdout (str), stderr (str). """ - executable = ['/usr/bin/env', 'cp', source, destination] - cmd = ' '.join(executable) - exit_code, stdout, stderr = execute(cmd) - - return exit_code, stdout, stderr + return execute(' '.join(executable)) -def symlink(source, destination): +def symlink(source: str, destination: str) -> (int, str, str): """ - Tries to ln the given files. - - :param source: - :param destination: + Create symbolic link ln the given file. - :return: exit_code, stdout, stderr + :param source: file source path (str) + :param destination: file destination path (str) + :return: exit code (int), stdout (str), stderr (str). """ - executable = ['/usr/bin/env', 'ln', '-s', source, destination] - cmd = ' '.join(executable) - exit_code, stdout, stderr = execute(cmd) - - return exit_code, stdout, stderr + return execute(' '.join(executable)) diff --git a/pilot/copytool/objectstore.py b/pilot/copytool/objectstore.py index cabfb308..13589fd3 100644 --- a/pilot/copytool/objectstore.py +++ b/pilot/copytool/objectstore.py @@ -21,9 +21,12 @@ # - Alexey Anisenkov, anisyonk@cern.ch, 2019 # - Paul Nilsson, paul.nilsson@cern.ch, 2019-2023 -import os +"""Objectstore copy tool.""" + import json import logging +import os +from typing import Any from pilot.util.container import execute from pilot.common.exception import ( @@ -46,22 +49,45 @@ def is_valid_for_copy_in(files: list) -> bool: + """ + Determine if this copytool is valid for input for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False return True ## FIX ME LATER def is_valid_for_copy_out(files: list) -> bool: + """ + Determine if this copytool is valid for output for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False return True ## FIX ME LATER -def resolve_surl(fspec, protocol, ddmconf, **kwargs): +def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: """ - Get final destination SURL for file to be transferred to Objectstore - Can be customized at the level of specific copytool + Get final destination SURL for file to be transferred to Objectstore. + + Can be customized at the level of specific copytool. - :param protocol: suggested protocol - :param ddmconf: full ddm storage data - :param fspec: file spec data - :return: dictionary {'surl': surl} + :param fspec: file spec data (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddm storage data (dict) + :return: SURL dictionary {'surl': surl} (dict). 
""" ddm = ddmconf.get(fspec.ddmendpoint) if not ddm: @@ -78,14 +104,14 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): return {'surl': surl} -def copy_in(files, **kwargs): +def copy_in(files: list, **kwargs: dict) -> list: """ - Download given files using rucio copytool. + Download given files using rucio copytool. - :param files: list of `FileSpec` objects - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :raise: PilotException in case of controlled error + :return: updated list of files (list). """ - # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' @@ -94,7 +120,7 @@ def copy_in(files, **kwargs): for fspec in files: cmd = [] - logger.info("To transfer file: %s", fspec) + logger.info(f"transfer file: {fspec}") if fspec.protocol_id: ddm = ddmconf.get(fspec.ddmendpoint) if ddm: @@ -139,19 +165,19 @@ def is_new_rucio_version() -> bool: :return: True if new rucio version (bool). """ - _, stdout, _ = execute('rucio download -h') return True if '--rses RSES' in stdout else False -def copy_out(files, **kwargs): +def copy_out(files: list, **kwargs: dict) -> list: """ - Upload given files using rucio copytool. + Upload the given files using rucio copytool. - :param files: list of `FileSpec` objects - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :raises: PilotException in case of controlled error + :return: updated list of files (list). """ - # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' diff --git a/pilot/copytool/rucio.py b/pilot/copytool/rucio.py index 08c88558..8bc62c8c 100644 --- a/pilot/copytool/rucio.py +++ b/pilot/copytool/rucio.py @@ -14,6 +14,7 @@ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations +# specific language governing permissions and limitations # under the License. # # Authors: @@ -24,13 +25,16 @@ # - Tomas Javurek, tomas.javurek@cern.ch, 2019 # - David Cameron, david.cameron@cern.ch, 2019 +"""Rucio copy tool.""" + from __future__ import absolute_import # Python 2 (2to3 complains about this) -import os import json import logging -from time import time +import os from copy import deepcopy +from time import time +from typing import Any from pilot.common.exception import ( PilotException, @@ -57,24 +61,46 @@ tracing_rucio = False ## should Rucio send the trace? -def is_valid_for_copy_in(files): +def is_valid_for_copy_in(files: list) -> bool: + """ + Determine if this copytool is valid for input for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False return True ## FIX ME LATER -def is_valid_for_copy_out(files): +def is_valid_for_copy_out(files: list) -> bool: + """ + Determine if this copytool is valid for output for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). 
+ """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False return True ## FIX ME LATER #@timeout(seconds=10800) -def copy_in(files, **kwargs): +def copy_in(files: list, **kwargs: dict) -> list: """ - Download given files using rucio copytool. + Download given files using rucio copytool. - :param files: list of `FileSpec` objects - :param ignore_errors: boolean, if specified then transfer failures will be ignored - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :raises: PilotException in case of controlled error + :return: updated files (list). """ - ignore_errors = kwargs.get('ignore_errors') trace_report = kwargs.get('trace_report') use_pcache = kwargs.get('use_pcache') @@ -163,14 +189,13 @@ def copy_in(files, **kwargs): return files -def get_protocol(trace_report_out): +def get_protocol(trace_report_out: dict) -> str: """ Extract the protocol used for the transfer from the dictionary returned by rucio. - :param trace_report_out: returned rucio transfer dictionary (dictionary). - :return: protocol (string). + :param trace_report_out: returned rucio transfer dictionary (dict) + :return: protocol (str). """ - try: protocol = trace_report_out[0].get('protocol') except Exception as error: @@ -180,16 +205,18 @@ def get_protocol(trace_report_out): return protocol -def handle_rucio_error(error_msg, trace_report, trace_report_out, fspec, stagein=True): +def handle_rucio_error(error_msg: str, trace_report: dict, trace_report_out: list, fspec: Any, + stagein: bool = True) -> dict: """ - - :param error_msg: - :param trace_report: - :param trace_report_out: - :param fspec: - :return: + Handle any error from Rucio. + + :param error_msg: error message (str) + :param trace_report: trace report dictionary (dict) + :param trace_report_out: trace report from Rucio (list) + :param fspec: FileSpec object (Any) + :param stagein: True for stage-in, False for stage-out (bool) + :return: error details dictionary, {'rcode': rcode, 'state': state, 'error': error_msg} (dict). """ - # try to get a better error message from the traces error_msg_org = error_msg if trace_report_out: @@ -213,15 +240,15 @@ def handle_rucio_error(error_msg, trace_report, trace_report_out, fspec, stagein return error_details -def copy_in_bulk(files, **kwargs): +def copy_in_bulk(files: list, **kwargs: dict) -> list: """ - Download given files using rucio copytool. + Download given files using rucio copytool. - :param files: list of `FileSpec` objects - :param ignore_errors: boolean, if specified then transfer failures will be ignored - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects + :param ignore_errors: boolean, if specified then transfer failures will be ignored + :raise: PilotException in case of controlled error + :return: list of done files (list). """ - #allow_direct_access = kwargs.get('allow_direct_access') ignore_errors = kwargs.get('ignore_errors') trace_common_fields = kwargs.get('trace_report') @@ -312,15 +339,16 @@ def copy_in_bulk(files, **kwargs): return files_done -def _get_trace(fspec, traces): +def _get_trace(fspec: Any, traces: list) -> list: """ + Get the trace candidates corresponding to the given file. + Traces returned by Rucio are not orderred the same as input files from pilot. This method finds the proper trace.
- :param: fspec: the file that is seeked - :param: traces: all traces that are received by Rucio - - :return: trace_candiates that correspond to the given file + :param: fspec: the file that is wanted (Any) + :param: traces: list of all traces that are received by Rucio (list) + :return: trace_candidates that correspond to the given file (list). """ try: try: @@ -337,15 +365,15 @@ def _get_trace(fspec, traces): #@timeout(seconds=10800) -def copy_out(files, **kwargs): # noqa: C901 +def copy_out(files: list, **kwargs: dict) -> list: # noqa: C901 """ - Upload given files using rucio copytool. + Upload given files using rucio copytool. - :param files: list of `FileSpec` objects - :param ignore_errors: boolean, if specified then transfer failures will be ignored - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :param ignore_errors: boolean, if specified then transfer failures will be ignored (bool) + :raise: PilotException in case of controlled error + :return: updated files list (list). """ - # don't spoil the output, we depend on stderr parsing os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' logger.info(f'rucio stage-out: X509_USER_PROXY={os.environ.get("X509_USER_PROXY", "")}') @@ -357,7 +385,7 @@ def copy_out(files, **kwargs): # noqa: C901 localsite = os.environ.get('RUCIO_LOCAL_SITE_ID', None) for fspec in files: - logger.info('rucio copytool, uploading file with scope: %s and lfn: %s' % (str(fspec.scope), str(fspec.lfn))) + logger.info(f'rucio copytool, uploading file with scope: {fspec.scope} and lfn: {fspec.lfn}') localsite = localsite if localsite else fspec.ddmendpoint trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint) trace_report.update(scope=fspec.scope, dataset=fspec.dataset, url=fspec.surl, filesize=fspec.filesize) @@ -369,11 +397,11 @@ def copy_out(files, **kwargs): # noqa: C901 if summary: summary_file_path = os.path.join(cwd, 'rucio_upload.json') - logger.info('the file will be uploaded to %s' % str(fspec.ddmendpoint)) + logger.info(f'the file will be uploaded to {fspec.ddmendpoint}') trace_report_out = [] transfer_timeout = get_timeout(fspec.filesize) ctimeout = transfer_timeout + 10 # give the API a chance to do the time-out first - logger.info('overall transfer timeout=%s' % ctimeout) + logger.info(f'overall transfer timeout={ctimeout}') error_msg = "" ec = 0 @@ -389,7 +417,7 @@ def copy_out(files, **kwargs): # noqa: C901 trace_report.update(protocol=protocol) if not ignore_errors: trace_report.send() - msg = ' %s:%s to %s, %s' % (fspec.scope, fspec.lfn, fspec.ddmendpoint, error_details.get('error')) + msg = f" {fspec.scope}:{fspec.lfn} to {fspec.ddmendpoint}, {error_details.get('error')}" raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) except Exception as error: error_msg = str(error) @@ -398,7 +426,7 @@ def copy_out(files, **kwargs): # noqa: C901 trace_report.update(protocol=protocol) if not ignore_errors: trace_report.send() - msg = ' %s:%s to %s, %s' % (fspec.scope, fspec.lfn, fspec.ddmendpoint, error_details.get('error')) + msg = f" {fspec.scope}:{fspec.lfn} to {fspec.ddmendpoint}, {error_details.get('error')}" raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) else: protocol = get_protocol(trace_report_out) @@ -414,12 +442,12 @@ def copy_out(files, **kwargs): # noqa: C901 if not ignore_errors: trace_report.send() - msg = ' %s:%s from %s, %s' % (fspec.scope, fspec.lfn, 
fspec.ddmendpoint, error_details.get('error')) + msg = f" {fspec.scope}:{fspec.lfn} from {fspec.ddmendpoint}, {error_details.get('error')}" raise PilotException(msg, code=error_details.get('rcode'), state=error_details.get('state')) if summary: # resolve final pfn (turl) from the summary JSON if not os.path.exists(summary_file_path): - logger.error('Failed to resolve Rucio summary JSON, wrong path? file=%s' % summary_file_path) + logger.error(f'Failed to resolve Rucio summary JSON, wrong path? file={summary_file_path}') else: with open(summary_file_path, 'rb') as f: summary_json = json.load(f) @@ -456,9 +484,19 @@ def copy_out(files, **kwargs): # noqa: C901 return files -def _stage_in_api(dst, fspec, trace_report, trace_report_out, transfer_timeout, use_pcache, rucio_host): +def _stage_in_api(dst: str, fspec: Any, trace_report: dict, trace_report_out: list, transfer_timeout: int, + use_pcache: bool, rucio_host: str) -> (int, list): """ Stage-in files using the Rucio API. + + :param dst: destination directory (str) + :param fspec: FileSpec object (Any) + :param trace_report: trace report (dict) + :param trace_report_out: list of trace reports from Rucio (list) + :param transfer_timeout: transfer timeout in seconds (int) + :param use_pcache: True if pcache should be used, False otherwise (bool) + :param rucio_host: Rucio host URL (str) + :return: exit code (int), trace report from Rucio (list). """ ec = 0 @@ -526,16 +564,17 @@ def _stage_in_api(dst, fspec, trace_report, trace_report_out, transfer_timeout, return ec, trace_report_out -def _stage_in_bulk(dst, files, trace_report_out=None, trace_common_fields=None, rucio_host=''): +def _stage_in_bulk(dst: str, files: list, trace_report_out: list = [], trace_common_fields: dict = {}, + rucio_host: str = ''): """ Stage-in files in bulk using the Rucio API. :param dst: destination (string). :param files: list of fspec objects. :param trace_report_out: - :param trace_common_fields: + :param trace_common_fields: trace report (dict) :param rucio_host: optional rucio host (string). - :return: + :raises Exception: download_client.download_pfns exception. """ # init. download client from rucio.client import Client @@ -562,7 +601,7 @@ def _stage_in_bulk(dst, files, trace_report_out=None, trace_common_fields=None, _file = {} _file['did_scope'] = fspec.scope _file['did_name'] = fspec.lfn - _file['did'] = '%s:%s' % (fspec.scope, fspec.lfn) + _file['did'] = f'{fspec.scope}:{fspec.lfn}' _file['rse'] = fspec.ddmendpoint _file['base_dir'] = fspec.workdir or dst _file['no_subdir'] = True @@ -600,11 +639,19 @@ def _stage_in_bulk(dst, files, trace_report_out=None, trace_common_fields=None, logger.debug(f'client returned {result}') -def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, transfer_timeout, rucio_host): +def _stage_out_api(fspec: Any, summary_file_path: str, trace_report: dict, trace_report_out: list, + transfer_timeout: int, rucio_host: str) -> (int, list): """ Stage-out files using the Rucio API. - """ + :param fspec: FileSpec object (Any) + :param summary_file_path: path to summary file (str) + :param trace_report: trace report (dict) + :param trace_report_out: trace report from Rucio (list) + :param transfer_timeout: transfer time-out in seconds (int) + :param rucio_host: Rucio host URL (str) + :return: exit code (int), trace report from Rucio (list). + """ ec = 0 # init. 
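Note that the new `_stage_in_bulk` signature uses mutable default arguments (`trace_report_out: list = []`, `trace_common_fields: dict = {}`); default values are evaluated once in Python and are therefore shared between calls, which the previous `None` defaults avoided. The per-file dictionaries it hands to the Rucio download client use the keys shown in this hunk; a stand-alone sketch of that conversion (the actual download call is outside the visible context and is deliberately omitted):

```python
def build_download_items(files: list, dst: str) -> list:
    """Build the list of per-file dictionaries passed to the Rucio download client."""
    items = []
    for fspec in files:
        items.append({
            'did_scope': fspec.scope,
            'did_name': fspec.lfn,
            'did': f'{fspec.scope}:{fspec.lfn}',
            'rse': fspec.ddmendpoint,
            'base_dir': fspec.workdir or dst,   # download next to the job work directory if set
            'no_subdir': True,
        })
    return items
```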
download client diff --git a/pilot/copytool/s3.py b/pilot/copytool/s3.py index 05001170..8a7e908d 100644 --- a/pilot/copytool/s3.py +++ b/pilot/copytool/s3.py @@ -19,8 +19,13 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2021-2023 -import os +"""S3 copy tool.""" + import logging +import os +from glob import glob +from typing import Any +from urllib.parse import urlparse try: import boto3 @@ -28,9 +33,6 @@ except Exception: pass -from glob import glob -from urllib.parse import urlparse - from .common import resolve_common_transfer_errors from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException @@ -48,23 +50,61 @@ allowed_schemas = ['srm', 'gsiftp', 'https', 'davs', 'root', 's3', 's3+rucio'] -def is_valid_for_copy_in(files): - return True # FIX ME LATER +def is_valid_for_copy_in(files: list) -> bool: + """ + Determine if this copytool is valid for input for the given file list. + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False + return True ## FIX ME LATER + + +def is_valid_for_copy_out(files: list) -> bool: + """ + Determine if this copytool is valid for output for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False + return True ## FIX ME LATER -def is_valid_for_copy_out(files): - return True # FIX ME LATER +def get_pilot_s3_profile() -> str: + """ + Get the PANDA_PILOT_AWS_PROFILE environment variable. -def get_pilot_s3_profile(): + :return: PANDA_PILOT_AWS_PROFILE (str). + """ return os.environ.get("PANDA_PILOT_AWS_PROFILE", None) -def get_copy_out_extend(): +def get_copy_out_extend() -> str: + """ + Get the PANDA_PILOT_COPY_OUT_EXTEND environment variable. + + :return: PANDA_PILOT_COPY_OUT_EXTEND (str). + """ return os.environ.get("PANDA_PILOT_COPY_OUT_EXTEND", None) -def get_endpoint_bucket_key(surl): +def get_endpoint_bucket_key(surl: str) -> (str, str, str): + """ + Get the endpoint, bucket and key from the given SURL. + + :param surl: SURL (str) + :return: endpoint (str), bucket (str), key (str). + """ parsed = urlparse(surl) endpoint = parsed.scheme + '://' + parsed.netloc full_path = parsed.path @@ -74,18 +114,21 @@ def get_endpoint_bucket_key(surl): parts = full_path.split('/') bucket = parts[1] key = '/'.join(parts[2:]) + return endpoint, bucket, key -def resolve_surl(fspec, protocol, ddmconf, **kwargs): +def resolve_surl(fspec: Any, protocol: dict, ddmconf: dict, **kwargs: dict) -> dict: """ - Get final destination SURL for file to be transferred to Objectstore - Can be customized at the level of specific copytool + Get final destination SURL for file to be transferred to Objectstore. - :param protocol: suggested protocol - :param ddmconf: full ddm storage data - :param fspec: file spec data - :return: dictionary {'surl': surl} + Can be customized at the level of specific copytool. + + :param fspec: FileSpec object (Any) + :param protocol: suggested protocol (dict) + :param ddmconf: full ddm storage data (dict) + :param kwargs: kwargs dictionary (dict) + :return: SURL dictionary {'surl': surl} (dict). 
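`get_endpoint_bucket_key` splits an object-store SURL with `urlparse` into endpoint, bucket and object key. A simplified stand-alone sketch (not the exact helper, which also normalises leading slashes), using a shortened form of the sample SURL quoted in the `resolve_surl` comment:

```python
from urllib.parse import urlparse

def split_surl(surl: str) -> tuple:
    """Split an S3 SURL into endpoint, bucket and object key (simplified)."""
    parsed = urlparse(surl)
    endpoint = f"{parsed.scheme}://{parsed.netloc}"
    parts = parsed.path.lstrip('/').split('/')
    return endpoint, parts[0], '/'.join(parts[1:])

# split_surl('s3://s3.cern.ch:443//atlas-eventservice/file.tar')
# -> ('s3://s3.cern.ch:443', 'atlas-eventservice', 'file.tar')
```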
""" try: pandaqueue = infosys.pandaqueue @@ -96,7 +139,7 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): ddm = ddmconf.get(fspec.ddmendpoint) if not ddm: - raise PilotException('failed to resolve ddmendpoint by name=%s' % fspec.ddmendpoint) + raise PilotException(f'failed to resolve ddmendpoint by name={fspec.ddmendpoint}') if ddm.is_deterministic: surl = protocol.get('endpoint', '') + os.path.join(protocol.get('path', ''), get_rucio_path(fspec.scope, fspec.lfn)) @@ -119,30 +162,31 @@ def resolve_surl(fspec, protocol, ddmconf, **kwargs): fspec.protocol_id = protocol.get('id') else: - raise PilotException('resolve_surl(): Failed to construct SURL for non deterministic ddm=%s: NOT IMPLEMENTED', fspec.ddmendpoint) + raise PilotException(f'resolve_surl(): Failed to construct SURL for non deterministic ddm={fspec.ddmendpoint}: NOT IMPLEMENTED') - logger.info('resolve_surl, surl: %s', surl) + logger.info(f'resolve_surl, surl: {surl}') # example: # protocol = {u'path': u'/atlas-eventservice', u'endpoint': u's3://s3.cern.ch:443/', u'flavour': u'AWS-S3-SSL', u'id': 175} # surl = 's3://s3.cern.ch:443//atlas-eventservice/EventService_premerge_24706191-5013009653-24039149400-322-5.tar' return {'surl': surl} -def copy_in(files, **kwargs): +def copy_in(files: list, **kwargs: dict) -> list: """ Download given files from an S3 bucket. - :param files: list of `FileSpec` objects - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :raises: PilotException in case of controlled error + :return: updated list of files (list). """ - for fspec in files: dst = fspec.workdir or kwargs.get('workdir') or '.' # bucket = 'bucket' # UPDATE ME path = os.path.join(dst, fspec.lfn) - logger.info('downloading surl %s to local file %s', fspec.surl, path) + logger.info(f'downloading surl {fspec.surl} to local file {path}') status, diagnostics = download_file(path, fspec.surl) if not status: # an error occurred @@ -157,16 +201,15 @@ def copy_in(files, **kwargs): return files -def download_file(path, surl, object_name=None): +def download_file(path: str, surl: str, object_name: str = None) -> (bool, str): """ Download a file from an S3 bucket. - :param path: Path to local file after download (string). - :param surl: Source url to download from. - :param object_name: S3 object name. If not specified then file_name from path is used. - :return: True if file was uploaded (else False), diagnostics (string). + :param path: path to local file after download (str) + :param surl: source url to download from (str) + :param object_name: S3 object name. If not specified then file_name from path is used (str) + :return: True if file was uploaded - otherwise False (bool), diagnostics (str). 
""" - try: endpoint, bucket, object_name = get_endpoint_bucket_key(surl) session = boto3.Session(profile_name=get_pilot_s3_profile()) @@ -174,31 +217,32 @@ def download_file(path, surl, object_name=None): s3 = session.client('s3', endpoint_url=endpoint) s3.download_file(bucket, object_name, path) except ClientError as error: - diagnostics = 'S3 ClientError: %s' % error + diagnostics = f'S3 ClientError: {error}' logger.critical(diagnostics) return False, diagnostics except Exception as error: - diagnostics = 'exception caught in s3_client: %s' % error + diagnostics = f'exception caught in s3_client: {error}' logger.critical(diagnostics) return False, diagnostics return True, "" -def copy_out_extend(files, **kwargs): +def copy_out_extend(files: list, **kwargs: dict) -> list: """ Upload given files to S3 storage. - :param files: list of `FileSpec` objects - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :raises: PilotException in case of controlled error + :return: updated list of files (list). """ - workdir = kwargs.pop('workdir') for fspec in files: # path = os.path.join(workdir, fspec.lfn) - logger.info('uploading %s to fspec.turl %s', workdir, fspec.turl) + logger.info(f'uploading {workdir} to {fspec.turl}') logfiles = [] lfn = fspec.lfn.strip() @@ -218,7 +262,7 @@ def copy_out_extend(files, **kwargs): logfile = os.path.basename(path) if os.path.exists(path): full_url = os.path.join(fspec.turl, logfile) - logger.info('uploading %s to%s', path, full_url) + logger.info(f'uploading {path} to {full_url}') status, diagnostics = upload_file(path, full_url) if not status: # an error occurred @@ -228,7 +272,7 @@ def copy_out_extend(files, **kwargs): fspec.status_code = error.get('rcode') raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state')) else: - diagnostics = 'local output file does not exist: %s' % path + diagnostics = f'local output file does not exist: {path}' logger.warning(diagnostics) fspec.status = 'failed' fspec.status_code = errors.STAGEOUTFAILED @@ -241,14 +285,15 @@ def copy_out_extend(files, **kwargs): return files -def copy_out(files, **kwargs): +def copy_out(files: list, **kwargs: dict) -> list: """ Upload given files to S3 storage. - :param files: list of `FileSpec` objects + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) :raise: PilotException in case of controlled error + :return: updated list of files (list). 
""" - if get_copy_out_extend(): return copy_out_extend(files, **kwargs) @@ -259,7 +304,7 @@ def copy_out(files, **kwargs): path = os.path.join(workdir, fspec.lfn) if os.path.exists(path): # bucket = 'bucket' # UPDATE ME - logger.info('uploading %s to fspec.turl %s', path, fspec.turl) + logger.info(f'uploading {path} to {fspec.turl}') full_url = os.path.join(fspec.turl, fspec.lfn) status, diagnostics = upload_file(path, full_url) @@ -270,7 +315,7 @@ def copy_out(files, **kwargs): fspec.status_code = error.get('rcode') raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state')) else: - diagnostics = 'local output file does not exist: %s' % path + diagnostics = f'local output file does not exist: {path}' logger.warning(diagnostics) fspec.status = 'failed' fspec.status_code = errors.STAGEOUTFAILED @@ -282,16 +327,15 @@ def copy_out(files, **kwargs): return files -def upload_file(file_name, full_url, object_name=None): +def upload_file(file_name: str, full_url: str, object_name: str = None) -> (bool, str): """ Upload a file to an S3 bucket. - :param file_name: File to upload. - :param turl: Target url to upload to. - :param object_name: S3 object name. If not specified then file_name is used. - :return: True if file was uploaded (else False), diagnostics (string). + :param file_name: file to upload (str) + :param turl: target url to upload to (str) + :param object_name: S3 object name. If not specified then file_name is used (str) + :return: True if file was uploaded - otherwise False (bool), diagnostics (str). """ - # upload the file try: # s3_client = boto3.client('s3') @@ -302,13 +346,13 @@ def upload_file(file_name, full_url, object_name=None): s3_client.upload_file(file_name, bucket, object_name) if object_name.endswith(config.Pilot.pilotlog): os.environ['GTAG'] = full_url - logger.debug("Set envvar GTAG with the pilotLot URL=%s", full_url) + logger.debug(f"Set envvar GTAG with the pilotLot URL={full_url}") except ClientError as error: - diagnostics = 'S3 ClientError: %s' % error + diagnostics = f'S3 ClientError: {error}' logger.critical(diagnostics) return False, diagnostics except Exception as error: - diagnostics = 'exception caught in s3_client: %s' % error + diagnostics = f'exception caught in s3_client: {error}' logger.critical(diagnostics) return False, diagnostics diff --git a/pilot/copytool/xrdcp.py b/pilot/copytool/xrdcp.py index b0a9185f..7a4a6c3b 100644 --- a/pilot/copytool/xrdcp.py +++ b/pilot/copytool/xrdcp.py @@ -19,11 +19,12 @@ # Authors: # - Tobias Wegner, tobias.wegner@cern.ch, 2017-2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 +# - Alexey Anisenkov, anisyonk@cern.ch, 2017 -# Reimplemented by Alexey Anisenkov +"""Xrdcp copy tool.""" -import os import logging +import os import re from time import time @@ -40,75 +41,104 @@ copy_command = 'xrdcp' -def is_valid_for_copy_in(files): +def is_valid_for_copy_in(files: list) -> bool: + """ + Determine if this copytool is valid for input for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). + """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False return True ## FIX ME LATER -def is_valid_for_copy_out(files): +def is_valid_for_copy_out(files: list) -> bool: + """ + Determine if this copytool is valid for output for the given file list. + + Placeholder. + + :param files: list of FileSpec objects (list). + :return: always True (for now) (bool). 
+ """ + # for f in files: + # if not all(key in f for key in ('name', 'source', 'destination')): + # return False return True ## FIX ME LATER -def _resolve_checksum_option(setup, **kwargs): +def _resolve_checksum_option(setup: str, **kwargs) -> str: + """ + Resolve which checksum option to use. - cmd = "%s --version" % copy_command + :param setup: setup (str) + :param kwargs: kwargs dictionary (dict) + :return: option (str). + """ + cmd = f"{copy_command} --version" if setup: - cmd = "source %s; %s" % (setup, cmd) + cmd = f"source {setup}; {cmd}" - logger.info("Execute command (%s) to check xrdcp client version", cmd) + logger.info(f"execute command ({cmd}) to check xrdcp client version") rcode, stdout, stderr = execute(cmd, **kwargs) - logger.info("return code: %s", rcode) - logger.info("return output: %s", stdout + stderr) - - cmd = "%s -h" % copy_command + cmd = f"{copy_command} -h" if setup: - cmd = "source %s; %s" % (setup, cmd) + cmd = f"source {setup}; {cmd}" - logger.info("Execute command (%s) to decide which option should be used to calc/verify file checksum..", cmd) + logger.info(f"execute command ({cmd}) to decide which option should be used to calc/verify file checksum..") rcode, stdout, stderr = execute(cmd, **kwargs) output = stdout + stderr - logger.info("return code: %s", rcode) - logger.debug("return output: %s", output) coption = "" checksum_type = 'adler32' ## consider only adler32 for now if rcode: - logger.error('FAILED to execute command=%s: %s', cmd, output) + logger.error(f'FAILED to execute command={cmd}: {output}') else: if "--cksum" in output: - coption = "--cksum %s:print" % checksum_type + coption = f"--cksum {checksum_type}:print" elif "-adler" in output and checksum_type == 'adler32': coption = "-adler" elif "-md5" in output and checksum_type == 'md5': coption = "-md5" if coption: - logger.info("Use %s option to get the checksum for %s command", coption, copy_command) + logger.info(f"use {coption} option to get the checksum for {copy_command} command") return coption #@timeout(seconds=10800) -def _stagefile(coption, source, destination, filesize, is_stagein, setup=None, **kwargs): +def _stagefile(coption: str, source: str, destination: str, filesize: int, is_stagein: bool, setup: str = None, + **kwargs: dict) -> (int, str, str): """ - Stage the file (stagein or stageout) - :return: destination file details (checksum, checksum_type) in case of success, throw exception in case of failure - :raise: PilotException in case of controlled error + Stage the given file (stagein or stageout). + + :param coption: checksum option (str) + :param source: file source path (str) + :param destination: file destination path (str) + :param filesize: file size (int) + :param is_stagein: True for stage-in, False for stage-out (bool) + :param setup: setup (str) + :param kwargs: kwargs dictionary (dict) + :raises: PilotException in case of controlled error + :return: destination file details - file size (int) checksum (str), checksum_type (str). 
""" - filesize_cmd, checksum_cmd, checksum_type = None, None, None - cmd = '%s -np -f %s %s %s' % (copy_command, coption, source, destination) + cmd = f'{copy_command} -np -f {coption} {source} {destination}' if setup: - cmd = "source %s; %s" % (setup, cmd) + cmd = f"source {setup}; {cmd}" #timeout = get_timeout(filesize) - #logger.info("Executing command: %s, timeout=%s" % (cmd, timeout)) - rcode, stdout, stderr = execute(cmd, **kwargs) - logger.info('rcode=%d, stdout=%s, stderr=%s', rcode, stdout, stderr) + logger.info(f'rcode={rcode}, stdout={stdout}, stderr={stderr}') if rcode: ## error occurred error = resolve_common_transfer_errors(stdout + stderr, is_stagein=is_stagein) @@ -116,7 +146,7 @@ def _stagefile(coption, source, destination, filesize, is_stagein, setup=None, * #rcode = error.get('rcode') ## TO BE IMPLEMENTED #if not is_stagein and rcode == PilotErrors.ERR_CHKSUMNOTSUP: ## stage-out, on fly checksum verification is not supported .. ignore # logger.info('stage-out: ignore ERR_CHKSUMNOTSUP error .. will explicitly verify uploaded file') - # return None, None + # return None, None, None raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state')) @@ -137,14 +167,15 @@ def _stagefile(coption, source, destination, filesize, is_stagein, setup=None, * # @timeout(seconds=10800) -def copy_in(files, **kwargs): +def copy_in(files: list, **kwargs: dict) -> list: """ - Download given files using xrdcp command. + Download given files using xrdcp command. - :param files: list of `FileSpec` objects - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :raises: PilotException in case of controlled error + :return: updated list of files (list). """ - #allow_direct_access = kwargs.get('allow_direct_access') or False setup = kwargs.pop('copytools', {}).get('xrdcp', {}).get('setup') coption = _resolve_checksum_option(setup, **kwargs) @@ -172,8 +203,8 @@ def copy_in(files, **kwargs): dst = fspec.workdir or kwargs.get('workdir') or '.' destination = os.path.join(dst, fspec.lfn) try: - filesize_cmd, checksum_cmd, checksum_type = _stagefile(coption, fspec.turl, destination, fspec.filesize, - is_stagein=True, setup=setup, **kwargs) + _, checksum_cmd, checksum_type = _stagefile(coption, fspec.turl, destination, fspec.filesize, + is_stagein=True, setup=setup, **kwargs) fspec.status_code = 0 fspec.status = 'transferred' except PilotException as error: @@ -201,14 +232,15 @@ def copy_in(files, **kwargs): # @timeout(seconds=10800) -def copy_out(files, **kwargs): +def copy_out(files: list, **kwargs: dict) -> list: """ - Upload given files using xrdcp command. + Upload given files using xrdcp command. - :param files: list of `FileSpec` objects - :raise: PilotException in case of controlled error + :param files: list of `FileSpec` objects (list) + :param kwargs: kwargs dictionary (dict) + :raise: PilotException in case of controlled error + :return: updated list of files (list). 
""" - setup = kwargs.pop('copytools', {}).get('xrdcp', {}).get('setup') coption = _resolve_checksum_option(setup, **kwargs) trace_report = kwargs.get('trace_report') @@ -218,8 +250,8 @@ def copy_out(files, **kwargs): trace_report.update(catStart=time(), filename=fspec.lfn, guid=fspec.guid.replace('-', '')) try: - filesize_cmd, checksum_cmd, checksum_type = _stagefile(coption, fspec.surl, fspec.turl, fspec.filesize, - is_stagein=False, setup=setup, **kwargs) + _, checksum_cmd, checksum_type = _stagefile(coption, fspec.surl, fspec.turl, fspec.filesize, + is_stagein=False, setup=setup, **kwargs) fspec.status_code = 0 fspec.status = 'transferred' trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time()) @@ -245,18 +277,17 @@ def copy_out(files, **kwargs): return files -def get_file_info_from_output(output): +def get_file_info_from_output(output: str) -> (int, str, str): """ - Extract file size, checksum value from xrdcp --chksum command output + Extract file size, checksum value from the xrdcp --chksum command output. - :return: (filesize [int/None], checksum, checksum_type) or (None, None, None) in case of failure + :return: file size (int), checksum (str), checksum_type (str). """ - if not output: return None, None, None if not ("xrootd" in output or "XRootD" in output or "adler32" in output): - logger.warning("WARNING: Failed to extract checksum: Unexpected output: %s", output) + logger.warning(f"WARNING: Failed to extract checksum: Unexpected output: {output}") return None, None, None pattern = r"(?Pmd5|adler32):\ (?P[a-zA-Z0-9]+)\ \S+\ (?P[0-9]+)" # Python 3 (added r) @@ -272,9 +303,10 @@ def get_file_info_from_output(output): try: filesize = int(filesize) except ValueError as error: - logger.warning('failed to convert filesize to int: %s', error) + logger.warning(f'failed to convert filesize to int: {error}') filesize = None else: - logger.warning("WARNING: Checksum/file size info not found in output: failed to match pattern=%s in output=%s", pattern, output) + logger.warning(f"WARNING: Checksum/file size info not found in output: " + f"failed to match pattern={pattern} in output={output}") return filesize, checksum, checksum_type diff --git a/pilot/eventservice/communicationmanager/communicationmanager.py b/pilot/eventservice/communicationmanager/communicationmanager.py index 6cc161e0..c2c3332c 100644 --- a/pilot/eventservice/communicationmanager/communicationmanager.py +++ b/pilot/eventservice/communicationmanager/communicationmanager.py @@ -20,34 +20,31 @@ # - Wen Guan, wen.guan@cern.ch, 2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 -""" -Main classes to manage the messages between ES and harvester/ACT/Panda. -""" +"""Main classes to manage the messages between ES and harvester/ACT/Panda.""" import json import logging import os import threading import time -try: - import Queue as queue # noqa: N813 -except Exception: - import queue # Python 3 +import queue +from typing import Any from pilot.common import exception from pilot.common.pluginfactory import PluginFactory - logger = logging.getLogger(__name__) -""" -Communication response -""" +class CommunicationResponse: + """Communication response class.""" + def __init__(self, attrs: dict = None): + """ + Initialize variables. -class CommunicationResponse(object): - def __init__(self, attrs=None): + :param attrs: attributes dictionary (dict). 
+ """ if not attrs: attrs = {} if not isinstance(attrs, dict): @@ -62,7 +59,12 @@ def __init__(self, attrs=None): for key in attrs: setattr(self, key, attrs[key]) - def __str__(self): + def __str__(self) -> str: + """ + Return string representation. + + :return: string representation (str). + """ json_str = {} for key, value in list(self.__dict__.items()): # Python 2/3 if value and type(value) is list: @@ -76,19 +78,23 @@ def __str__(self): return json.dumps(json_str) -""" -Communication request -""" +class CommunicationRequest(): + """Communication request class.""" + class RequestType(): + """Request type class.""" -class CommunicationRequest(object): - class RequestType(object): RequestJobs = 'request_jobs' UpdateJobs = 'update_jobs' RequestEvents = 'request_events' UpdateEvents = 'update_events' - def __init__(self, attrs=None): + def __init__(self, attrs: dict = None): + """ + Initialize variables. + + :param attrs: attributes dictionary (dict). + """ if not attrs: attrs = {} if not isinstance(attrs, dict): @@ -112,6 +118,11 @@ def __init__(self, attrs=None): self.abort = False def __str__(self): + """ + Return string representation. + + :return: string representation (str). + """ json_str = {} for key, value in list(self.__dict__.items()): # Python 2/3 if value and type(value) is list: @@ -122,17 +133,20 @@ def __str__(self): json_str[key] = str(value) else: json_str[key] = value - return json.dumps(json_str) - -""" -Communication manager thread -""" + return json.dumps(json_str) class CommunicationManager(threading.Thread, PluginFactory): + """Communication manager class.""" def __init__(self, *args, **kwargs): + """ + Initialize variables. + + :param args: args object (Any) + :param kwargs: kwargs dictionary (dict). + """ super(CommunicationManager, self).__init__() PluginFactory.__init__(self, *args, **kwargs) self.setName("CommunicationManager") @@ -159,29 +173,28 @@ def __init__(self, *args, **kwargs): self.kwargs = kwargs def stop(self): - """ - Set stop signal(main run process will clean queued requests to release waiting clients and then quit) - """ + """Set stop signal (main run process will clean queued requests to release waiting clients and then quit).""" if not self.is_stop(): - logger.info("Stopping Communication Manager.") + logger.info("stopping Communication Manager.") self.stop_event.set() - def is_stop(self): + def is_stop(self) -> bool: """ - check whether the stop signal is set + Check whether the stop signal is set. - :returns: True if the stop signal is set, otherwise False + :returns: True if the stop signal is set, otherwise False (bool) """ return self.stop_event.is_set() - def get_jobs(self, njobs=1, post_hook=None, args=None): + def get_jobs(self, njobs: int = 1, post_hook: Any = None, args: Any = None) -> Any: """ + Get jobs. + Function can be called by client to send a get_job request and get a response with jobs. - :returns: jobs(got from jobs servers) - :raise: Exception catched when getting jobs + :raises: Exception caught when getting jobs from server + :return: jobs (from server) (Any). """ - if self.is_stop(): return None @@ -213,14 +226,17 @@ def get_jobs(self, njobs=1, post_hook=None, args=None): else: return req.response.content - def update_jobs(self, jobs, post_hook=None): + def update_jobs(self, jobs: Any, post_hook: Any = None) -> Any: """ + Update jobs. + Function can be called by client to update jobs' status to server. 
- :returns: status of updating jobs - :raise: Exception catched when updating jobs + :param jobs: jobs to be updated (Any) + :param post_hook: post hook function (Any) + :raises: Exception caught when updating jobs + :return: status of updating jobs (Any). """ - if self.is_stop(): return None @@ -243,21 +259,25 @@ def update_jobs(self, jobs, post_hook=None): else: return req.response.content - def get_event_ranges(self, num_event_ranges=1, post_hook=None, job=None): + def get_event_ranges(self, num_event_ranges: int = 1, post_hook: Any = None, job: Any = None) -> Any: """ + Get event ranges. + Function can be called by client to send a get_event_ranges request and get a response with event ranges. - :returns: event ranges (got from jobs servers) + :param num_event_ranges: number of event ranges to get (int) + :param post_hook: post hook function (Any) + :param job: job info (Any) :raise: Exception caught when getting event ranges + :return: event ranges (from server) (Any). """ - if self.is_stop(): return None if not job: resp_attrs = {'status': -1, 'content': None, - 'exception': exception.CommunicationFailure("Get events failed because job info missing(job: %s)" % job)} + 'exception': exception.CommunicationFailure(f"Get events failed because job info missing(job: {job})")} resp = CommunicationResponse(resp_attrs) raise resp.exception @@ -284,14 +304,17 @@ def get_event_ranges(self, num_event_ranges=1, post_hook=None, job=None): else: return req.response.content - def update_events(self, update_events, post_hook=None): + def update_events(self, update_events: Any, post_hook: Any = None) -> Any: """ + Update events. + Function can be called by client to send a update_events request. - :returns: status of updating event ranges - :raise: Exception catched when updating event ranges + :param update_events: update events (Any) + :param post_hook: post hook function (Any) + :raises: Exception caught when updating event ranges + :return: status of updating event ranges """ - if self.is_stop(): return None @@ -313,13 +336,12 @@ def update_events(self, update_events, post_hook=None): else: return req.response.content - def get_plugin_confs(self): + def get_plugin_confs(self) -> dict: """ - Get different plugin for different communicator + Get different plug-in for different communicator. - :returns: dict with {'class': } and other items + :returns: dict with {'class': } and other items (dict). """ - plugin = os.environ.get('COMMUNICATOR_PLUGIN', None) if not plugin: plugin_confs = {'class': 'pilot.eventservice.communicationmanager.plugins.pandacommunicator.PandaCommunicator'} @@ -333,16 +355,20 @@ def get_plugin_confs(self): if self.args: for key, value in list(vars(self.args).items()): # Python 2/3 plugin_confs[key] = value + return plugin_confs - def can_process_request(self, processor, process_type): + def can_process_request(self, processor: dict, process_type: str) -> bool: """ - To check whether it is ready to process request in a type. - For request such as HarvesterShareFileCommunicator, it should check whether there are processing requests to avoid overwriting files. + Check whether it is ready to process request in a type. - :returns: True or False - """ + For request such as HarvesterShareFileCommunicator, it should check whether there are processing requests to + avoid overwriting files. + :param processor: processor dictionary (dict) + :param process_type: process type (str) + :return: True or False (bool). 
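The manager runs as a thread and serialises the client-facing calls (`get_jobs`, `update_jobs`, `get_event_ranges`, `update_events`) through its internal request queues. A hedged usage sketch; the pilot `args` object and the shape of the job-update dictionaries are assumptions here, not shown in this diff:

```python
from pilot.eventservice.communicationmanager.communicationmanager import CommunicationManager

args = None  # in the pilot this is the parsed argparse namespace (placeholder here)

manager = CommunicationManager()            # plug-in resolved from COMMUNICATOR_PLUGIN (default: PanDA)
manager.start()                             # CommunicationManager is a threading.Thread

jobs = manager.get_jobs(njobs=1, args=args)  # blocks until the queued request has been served
# manager.update_jobs(jobs=[...])            # later: push PanDA-style job status dictionaries back

manager.stop()                              # queued requests are aborted/drained before the thread exits
```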
+ """ if self.queues[process_type].empty(): return False @@ -356,14 +382,11 @@ def can_process_request(self, processor, process_type): return False def run(self): - """ - Main loop to handle communication requests - """ - + """Handle communication requests.""" confs = self.get_plugin_confs() - logger.info("Communication plugin confs: %s" % confs) + logger.info(f"communication plugin confs: {confs}") communicator = self.get_plugin(confs) - logger.info("Communication: %s" % communicator) + logger.info(f"communicator: {communicator}") processor = {'request_get_jobs': {'pre_check': communicator.pre_check_get_jobs, 'handler': communicator.request_get_jobs, @@ -397,7 +420,7 @@ def run(self): if self.is_stop(): while not self.queues[process_type].empty(): req = self.queues[process_type].get() - logger.info("Is going to stop, aborting request: %s" % req) + logger.info(f"Is going to stop, aborting request: {req}") req.abort = True resp_attrs = {'status': None, 'content': None, @@ -408,14 +431,14 @@ def run(self): if not pre_check_resp.status == 0: continue - logger.info("Processing %s" % process_type) + logger.info(f"processing {process_type}") has_req = True req = self.queues[process_type].get() - logger.info("Processing %s request: %s" % (process_type, req)) + logger.info(f"processing {process_type} request: {req}") res = processor[process_type]['handler'](req) - logger.info("Processing %s respone: %s" % (process_type, res)) + logger.info(f"processing {process_type} respone: {res}") if res.status is False: req.response = res @@ -432,4 +455,5 @@ def run(self): if self.is_stop(): break time.sleep(1) - logger.info("Communication manager stopped.") + + logger.info("communication manager finished") diff --git a/pilot/eventservice/communicationmanager/plugins/__init__.py b/pilot/eventservice/communicationmanager/plugins/__init__.py index 02cf1dd8..afe6e4f7 100644 --- a/pilot/eventservice/communicationmanager/plugins/__init__.py +++ b/pilot/eventservice/communicationmanager/plugins/__init__.py @@ -19,3 +19,5 @@ # Authors: # - Wen Guan, wen.guan@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 + +"""Default init.""" diff --git a/pilot/eventservice/communicationmanager/plugins/basecommunicator.py b/pilot/eventservice/communicationmanager/plugins/basecommunicator.py index e2fcdc82..2e458501 100644 --- a/pilot/eventservice/communicationmanager/plugins/basecommunicator.py +++ b/pilot/eventservice/communicationmanager/plugins/basecommunicator.py @@ -20,96 +20,147 @@ # - Wen Guan, wen.guan@cern.ch, 2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +"""Base communicator.""" + import logging +from typing import Any + logger = logging.getLogger(__name__) -""" -Base communicator -""" +class BaseCommunicator: + """Base communicator class.""" -class BaseCommunicator(object): _instance = None - def __new__(class_, *args, **kwargs): + def __new__(class_, *args: Any, **kwargs: dict) -> Any: + """ + Create new instance of class. + + :param args: args object (Any) + :param kwargs: kwargs dictionary (dict) + :return: new class instance (Any). + """ if not isinstance(class_._instance, class_): class_._instance = object.__new__(class_, *args, **kwargs) + return class_._instance - def __init__(self, *args, **kwargs): + def __init__(self, *args: Any, **kwargs: dict): + """ + Initialize variables. 
+ + :param args: args object (Any) + :param kwargs: kwargs dictionary (dict) + """ super(BaseCommunicator, self).__init__() for key in kwargs: setattr(self, key, kwargs[key]) - def pre_check_get_jobs(self, req): + def pre_check_get_jobs(self, req: Any): """ - Precheck whether it's ok to send a requst to get jobs. + Check whether it's ok to send a request to get jobs. + + :param req: request (Any) + :raises: NotImplementedError. """ - #raise exception.NotImplementedError() raise NotImplementedError() - def request_get_jobs(self, req): + def request_get_jobs(self, req: Any): """ - Send a requst to get jobs. + Send a request to get jobs. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def check_get_jobs_status(self, req): + def check_get_jobs_status(self, req: Any): """ - Check whether jobs are prepared + Check whether jobs are prepared. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def get_jobs(self, req): + def get_jobs(self, req: Any): """ - Get the job + Get the jobs. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def update_jobs(self, req): + def update_jobs(self, req: Any): """ - Update jobs status. + Update job statuses. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def pre_check_get_events(self, req): + def pre_check_get_events(self, req: Any): """ - Precheck whether it's ok to send a request to get events. + Check whether it's ok to send a request to get events. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def request_get_events(self, req): + def request_get_events(self, req: Any): """ - Send a requst to get events. + Send a request to get events. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def check_get_events_status(self, req): + def check_get_events_status(self, req: Any): """ - Check whether events prepared + Check whether events prepared. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def get_events(self, req): + def get_events(self, req: Any): """ - Get events + Get events. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def pre_check_update_events(self, req): + def pre_check_update_events(self, req: Any): """ - Precheck whether it's ok to update events. + Check whether it's ok to update events. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def update_events(self, req): + def update_events(self, req: Any): """ Update events. + + :param req: request (Any) + :raises: NotImplementedError. """ raise NotImplementedError() - def pre_check_update_jobs(self, req): + def pre_check_update_jobs(self, req: Any): """ - Precheck whether it's ok to update event ranges. + Check whether it's ok to update event ranges. + + :param req: request (Any) + :raises: NotImplementedError. 
""" raise NotImplementedError() diff --git a/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py b/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py index 3f4c681c..c1c34697 100644 --- a/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py +++ b/pilot/eventservice/communicationmanager/plugins/pandacommunicator.py @@ -20,10 +20,13 @@ # - Wen Guan, wen.guan@cern.ch, 2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 -import json +"""PanDA communicator.""" + +import logging import threading import traceback from os import environ +from typing import Any from pilot.common import exception from pilot.util import https @@ -31,47 +34,59 @@ from ..communicationmanager import CommunicationResponse from .basecommunicator import BaseCommunicator -import logging logger = logging.getLogger(__name__) -""" -Panda Communicator -""" - class PandaCommunicator(BaseCommunicator): - def __init__(self, *args, **kwargs): + """PanDA communicator class.""" + + def __init__(self, *args: Any, **kwargs: dict): + """ + Initialize variables. + + :param args: args object (Any) + :param kwargs: kwargs dictionary (dict) + """ super(PandaCommunicator, self).__init__(args, kwargs) self.get_jobs_lock = threading.Lock() self.get_events_lock = threading.Lock() self.update_events_lock = threading.Lock() self.update_jobs_lock = threading.Lock() - def pre_check_get_jobs(self, req=None): + def pre_check_get_jobs(self, req=None) -> Any: """ - Precheck whether it's ok to send a requst to get jobs. + Check whether it's ok to send a request to get jobs. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). """ return CommunicationResponse({'status': 0}) - def request_get_jobs(self, req): + def request_get_jobs(self, req: Any) -> Any: """ - Send a requst to get jobs. + Send a request to get jobs. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). """ return CommunicationResponse({'status': 0}) def check_get_jobs_status(self, req=None): """ - Check whether jobs are prepared + Check whether jobs are prepared. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). """ return CommunicationResponse({'status': 0}) - def get_jobs(self, req): + def get_jobs(self, req: Any) -> dict: """ Get the job definition from panda server. - :return: job definiton dictionary. + :param req: request (Any) + :return: job definition dictionary (dict). 
""" - self.get_jobs_lock.acquire() try: @@ -86,10 +101,10 @@ def get_jobs(self, req): data[key] = getattr(req, value) for i in range(req.num_jobs): - logger.info("Getting jobs: %s" % data) + logger.info(f"Getting jobs: {data}") url = environ.get('PANDA_SERVER_URL', config.Pilot.pandaserver) - res = https.request('{pandaserver}/server/panda/getJob'.format(pandaserver=url), data=data) - logger.info("Got jobs returns: %s" % res) + res = https.request(f'{url}/server/panda/getJob', data=data) + logger.info(f"Got jobs returns: {res}") if res is None: resp_attrs = {'status': None, 'content': None, 'exception': exception.CommunicationFailure("Get job failed to get response from Panda.")} @@ -101,7 +116,7 @@ def get_jobs(self, req): elif res['StatusCode'] != 0: resp_attrs = {'status': res['StatusCode'], 'content': None, - 'exception': exception.CommunicationFailure("Get job from Panda returns a non-zero value: %s" % res['StatusCode'])} + 'exception': exception.CommunicationFailure(f"Get job from Panda returns a non-zero value: {res['StatusCode']}")} break else: jobs.append(res) @@ -113,39 +128,50 @@ def get_jobs(self, req): resp = CommunicationResponse(resp_attrs) except Exception as e: # Python 2/3 - logger.error("Failed to get jobs: %s, %s" % (e, traceback.format_exc())) - resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException("Failed to get jobs: %s" % (traceback.format_exc()))} + logger.error(f"Failed to get jobs: {e}, {traceback.format_exc()}") + resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException(f"Failed to get jobs: {traceback.format_exc()}")} resp = CommunicationResponse(resp_attrs) self.get_jobs_lock.release() return resp - def pre_check_get_events(self, req=None): + def pre_check_get_events(self, req: Any = None) -> Any: """ Precheck whether it's ok to send a request to get events. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). """ return CommunicationResponse({'status': 0}) - def request_get_events(self, req): + def request_get_events(self, req: Any) -> Any: """ - Send a requst to get events. + Send a request to get events. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). """ return CommunicationResponse({'status': 0}) - def check_get_events_status(self, req=None): + def check_get_events_status(self, req: Any = None) -> Any: """ - Check whether events prepared + Check whether events prepared. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). """ return CommunicationResponse({'status': 0}) - def get_events(self, req): + def get_events(self, req: Any) -> Any: """ - Get events + Get events. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). 
""" self.get_events_lock.acquire() - resp = None try: if not req.num_ranges: # ToBeFix num_ranges with corecount @@ -156,10 +182,10 @@ def get_events(self, req): 'taskID': req.taskid, 'nRanges': req.num_ranges} - logger.info("Downloading new event ranges: %s" % data) + logger.info(f"Downloading new event ranges: {data}") url = environ.get('PANDA_SERVER_URL', config.Pilot.pandaserver) - res = https.request('{pandaserver}/server/panda/getEventRanges'.format(pandaserver=url), data=data) - logger.info("Downloaded event ranges: %s" % res) + res = https.request(f'{url}/server/panda/getEventRanges', data=data) + logger.info(f"Downloaded event ranges: {res}") if res is None: resp_attrs = {'status': -1, @@ -170,90 +196,104 @@ def get_events(self, req): else: resp_attrs = {'status': res['StatusCode'], 'content': None, - 'exception': exception.CommunicationFailure("Get events from panda returns non-zero value: %s" % res['StatusCode'])} + 'exception': exception.CommunicationFailure(f"Get events from panda returns non-zero value: {res['StatusCode']}")} resp = CommunicationResponse(resp_attrs) except Exception as e: # Python 2/3 - logger.error("Failed to download event ranges: %s, %s" % (e, traceback.format_exc())) - resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException("Failed to get events: %s" % (traceback.format_exc()))} + logger.error(f"Failed to download event ranges: {e}, {traceback.format_exc()}") + resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException(f"Failed to get events: {traceback.format_exc()}")} resp = CommunicationResponse(resp_attrs) self.get_events_lock.release() return resp - def pre_check_update_events(self, req=None): + def pre_check_update_events(self, req: Any = None) -> Any: """ Precheck whether it's ok to update events. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). """ self.update_events_lock.acquire() try: pass except Exception as e: # Python 2/3 - logger.error("Failed to pre_check_update_events: %s, %s" % (e, traceback.format_exc())) + logger.error(f"Failed to pre_check_update_events: {e}, {traceback.format_exc()}") self.update_events_lock.release() + return CommunicationResponse({'status': 0}) - def update_events(self, req): + def update_events(self, req: Any) -> Any: """ Update events. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). 
""" self.update_events_lock.acquire() - resp = None try: - logger.info("Updating events: %s" % req) + logger.info(f"Updating events: {req}") url = environ.get('PANDA_SERVER_URL', config.Pilot.pandaserver) - res = https.request('{pandaserver}/server/panda/updateEventRanges'.format(pandaserver=url), data=req.update_events) + res = https.request(f'{url}/server/panda/updateEventRanges', data=req.update_events) - logger.info("Updated event ranges status: %s" % res) + logger.info(f"Updated event ranges status: {res}") resp_attrs = {'status': 0, 'content': res, 'exception': None} resp = CommunicationResponse(resp_attrs) except Exception as e: # Python 2/3 - logger.error("Failed to update event ranges: %s, %s" % (e, traceback.format_exc())) - resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException("Failed to update events: %s" % (traceback.format_exc()))} + logger.error(f"Failed to update event ranges: {e}, {traceback.format_exc()}") + resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException(f"Failed to update events: {traceback.format_exc()}")} resp = CommunicationResponse(resp_attrs) self.update_events_lock.release() + return resp - def pre_check_update_jobs(self, req=None): + def pre_check_update_jobs(self, req: Any = None) -> Any: """ - Precheck whether it's ok to update jobs. + Check whether it's ok to update jobs. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). """ - self.update_jobs_lock.acquire() try: - pass - except Exception as e: # Python 2/3 - logger.error("Failed to pre_check_update_jobs: %s, %s" % (e, traceback.format_exc())) - self.update_jobs_lock.release() + self.update_jobs_lock.acquire() + + self.update_jobs_lock.release() + except Exception as exc: + logger.error(f"failed in pre_check_update_jobs: {exc}, {traceback.format_exc()}") return CommunicationResponse({'status': 0}) - def update_job(self, job): + def update_job(self, job: Any) -> int: """ Update job. - """ + :param job: job definition (Any) + :return: status code (int). + """ try: - logger.info("Updating job: %s" % job) + logger.info(f"Updating job: {job}") url = environ.get('PANDA_SERVER_URL', config.Pilot.pandaserver) - res = https.request('{pandaserver}/server/panda/updateJob'.format(pandaserver=url), data=job) + res = https.request(f'{url}/server/panda/updateJob', data=job) - logger.info("Updated jobs status: %s" % res) + logger.info(f"Updated jobs status: {res}") return res - except Exception as e: # Python 2/3 - logger.error("Failed to update jobs: %s, %s" % (e, traceback.format_exc())) + except Exception as exc: + logger.error(f"failed to update jobs: {exc}, {traceback.format_exc()}") return -1 - def update_jobs(self, req): + def update_jobs(self, req: Any) -> Any: """ Update jobs. + + :param req: request (Any) + :return: CommunicationResponse({'status': 0}) (Any). 
""" self.update_jobs_lock.acquire() - resp = None try: - logger.info("Updating jobs: %s" % req) + logger.info(f"Updating jobs: {req}") res_list = [] for job in req.jobs: res = self.update_job(job) @@ -261,32 +301,10 @@ def update_jobs(self, req): resp_attrs = {'status': 0, 'content': res_list, 'exception': None} resp = CommunicationResponse(resp_attrs) except Exception as e: # Python 2/3 - logger.error("Failed to update jobs: %s, %s" % (e, traceback.format_exc())) - resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException("Failed to update jobs: %s" % (traceback.format_exc()))} + logger.error(f"Failed to update jobs: {e}, {traceback.format_exc()}") + resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException(f"Failed to update jobs: {traceback.format_exc()}")} resp = CommunicationResponse(resp_attrs) self.update_jobs_lock.release() - return resp - - def update_jobs_old(self, req): - """ - Update jobs. - """ - self.update_jobs_lock.acquire() - - try: - logger.info("Updating jobs: %s" % req) - data = {'jobList': json.dumps(req.jobs)} - url = environ.get('PANDA_SERVER_URL', config.Pilot.pandaserver) - res = https.request('{pandaserver}/server/panda/updateJobsInBulk'.format(pandaserver=url), data=data) - logger.info("Updated jobs status: %s" % res) - resp_attrs = {'status': 0, 'content': res, 'exception': None} - resp = CommunicationResponse(resp_attrs) - except Exception as e: # Python 2/3 - logger.error("Failed to update jobs: %s, %s" % (e, traceback.format_exc())) - resp_attrs = {'status': -1, 'content': None, 'exception': exception.UnknownException("Failed to update jobs: %s" % (traceback.format_exc()))} - resp = CommunicationResponse(resp_attrs) - - self.update_jobs_lock.release() return resp diff --git a/pilot/eventservice/esprocess/eshook.py b/pilot/eventservice/esprocess/eshook.py index fe5ec6c0..6da9a28e 100644 --- a/pilot/eventservice/esprocess/eshook.py +++ b/pilot/eventservice/esprocess/eshook.py @@ -20,37 +20,38 @@ # - Wen Guan, wen.guan@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 -""" -Hooks for EventService. -""" +"""Hooks for EventService.""" class ESHook: - def get_payload(self): + """Event Service Hook class.""" + + def get_payload(self) -> dict: """ Get payload to execute. - :returns: dict {'payload': , 'output_file': , 'error_file': } + :return: {'payload': , 'output_file': , 'error_file': } (dict). """ raise Exception("Not Implemented") - def get_event_ranges(self, num_ranges=1): + def get_event_ranges(self, num_ranges: int = 1) -> dict: """ Get event ranges. - :param num_ranges: Number of event ranges to download, default is 1. - :returns: dict of event ranges. - None if no available events. + :param num_ranges: Number of event ranges to download, default is 1 (int) + :returns: dictionary of event ranges (dict). """ raise Exception("Not Implemented") - def handle_out_message(self, message): + def handle_out_message(self, message: dict): """ Handle ES output or error message. - :param message: a dict of parsed message. - For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , - 'wall': , 'message': }. - Fro 'failed' event ranges, it's {'id': , 'status': 'finished', 'message': }. + Example + For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , + 'wall': , 'message': }. + For 'failed' event ranges, it's {'id': , 'status': 'finished', 'message': }. + + :param message: dictionary of a parsed message (dict). 
""" raise Exception("Not Implemented") diff --git a/pilot/eventservice/esprocess/esmanager.py b/pilot/eventservice/esprocess/esmanager.py index 81e541b5..2bcabb35 100644 --- a/pilot/eventservice/esprocess/esmanager.py +++ b/pilot/eventservice/esprocess/esmanager.py @@ -20,40 +20,39 @@ # - Wen Guan, wen.guan@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 +"""Event Service manager to set up and run ESProcess.""" + import logging +from typing import Any from pilot.eventservice.esprocess.esprocess import ESProcess from pilot.eventservice.esprocess.eshook import ESHook logger = logging.getLogger(__name__) -""" -ES manager to setup and run ESProcess. -""" - class ESManager: - def __init__(self, hook): + """Event Service manager class.""" + + def __init__(self, hook: Any): """ - Initialization: setup ES hooks. + Set up ES hooks. - :param hook: an instance of ESHook. + :param hook: an instance of ESHook (Any) + :raises Exception: if hook is not an instance of ESHook. """ logger.info('initializing hooks') if not isinstance(hook, ESHook): - raise Exception("hook(%s) is not instance of %s" % (hook, ESHook)) + raise Exception(f"hook({hook}) is not instance of {ESHook}") self.__hook = hook logger.info('initialized hooks') def run(self): - """ - Initialize and run ESProcess. - """ - + """Initialize and run ESProcess.""" logger.debug('gettting payload') payload = self.__hook.get_payload() - logger.debug('got payload: %s' % payload) + logger.debug(f'got payload: {payload}') logger.info('init ESProcess') process = ESProcess(payload) diff --git a/pilot/eventservice/esprocess/esmessage.py b/pilot/eventservice/esprocess/esmessage.py index a7439274..05c667e4 100644 --- a/pilot/eventservice/esprocess/esmessage.py +++ b/pilot/eventservice/esprocess/esmessage.py @@ -19,106 +19,99 @@ # - Wen Guan, wen.guan@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +"""Event Service message class.""" + import logging import os import threading import time import traceback +from typing import Any from pilot.common.exception import PilotException, MessageFailure - logger = logging.getLogger(__name__) class MessageThread(threading.Thread): - """ - A thread to receive messages from payload and put recevied messages to the out queues. - """ + """A thread to receive messages from payload and put recevied messages to the out queues.""" - def __init__(self, message_queue, socket_name=None, context='local', **kwds): + def __init__(self, message_queue: Any, socket_name: str = None, context: str = 'local', **kwds: dict): """ Initialize yampl server socket. - :param message_queue: a queue to transfer messages between current instance and ESProcess. - :param socket_name: name of the socket between current process and payload. - :param context: name of the context between current process and payload, default is 'local'. - :param **kwds: other parameters. - - :raises MessageFailure: when failed to setup message socket. + :param message_queue: a queue to transfer messages between current instance and ESProcess (Any) + :param socket_name: name of the socket between current process and payload (str) + :param context: name of the context between current process and payload, default is 'local' (str) + :param **kwds: other parameters (dict) + :raises MessageFailure: when failed to set up message socket. 
""" - threading.Thread.__init__(self, **kwds) self.setName("MessageThread") self.__message_queue = message_queue self._socket_name = socket_name self.__stop = threading.Event() - logger.info('try to import yampl') try: import yampl - except Exception as e: - raise MessageFailure("Failed to import yampl: %s" % e) - logger.info('finished to import yampl') + except Exception as exc: + raise MessageFailure(f"Failed to import yampl: {exc}") - logger.info('start to setup yampl server socket.') + logger.info('setup yampl server socket') try: if self._socket_name is None or len(self._socket_name) == 0: - self._socket_name = 'EventService_EventRanges_' + str(os.getpid()) + self._socket_name = f'EventService_EventRanges_{os.getpid()}' self.__message_server = yampl.ServerSocket(self._socket_name, context) - except Exception as e: - raise MessageFailure("Failed to setup yampl server socket: %s %s" % (e, traceback.print_exc())) - logger.info('finished to setup yampl server socket(socket_name: %s, context:%s).' % (self._socket_name, context)) + except Exception as exc: + raise MessageFailure(f"failed to set up yampl server socket: {exc} {traceback.print_exc()}") + logger.info(f'finished setting up yampl server socket (socket_name: {self._socket_name}, context:{context}).') - def get_yampl_socket_name(self): + def get_yampl_socket_name(self) -> str: + """ + Get yampl socket name. + + :return: yampl socket name (str). + """ return self._socket_name - def send(self, message): + def send(self, message: str): """ Send messages to payload through yampl server socket. - :param message: String of the message. - - :raises MessageFailure: When failed to send a message to the payload. + :param message: message (str). + :raises MessageFailure: when failed to send a message to the payload. """ - logger.debug('Send a message to yampl: %s' % message) + logger.debug(f'will send message to yampl: {message}') try: if not self.__message_server: raise MessageFailure("No message server.") - self.__message_server.send_raw(message.encode('utf8')) # Python 2 and 3 - except Exception as e: - raise MessageFailure(e) + self.__message_server.send_raw(message.encode('utf8')) + except Exception as exc: + raise MessageFailure(exc) def stop(self): - """ - Set stop event. - """ + """Set stop event.""" logger.debug('set stop event') self.__stop.set() - def is_stopped(self): + def is_stopped(self) -> bool: """ Get status whether stop event is set. - :returns: True if stop event is set, otherwise False. + :return: True if stop event is set, otherwise False (bool). """ return self.__stop.is_set() def terminate(self): - """ - Terminate message server. - """ + """Terminate message server.""" if self.__message_server: - logger.info("Terminating message server.") + logger.info("terminating message server.") del self.__message_server self.__message_server = None def run(self): - """ - Main thread loop to poll messages from payload and - put received into message queue for other processes to fetch. 
- """ - logger.info('Message thread starts to run.') + """Poll messages from payload and put received into message queue for other processes to fetch.""" + logger.info('message thread starts to run') try: while True: if self.is_stopped(): @@ -132,13 +125,13 @@ def run(self): time.sleep(0.01) else: self.__message_queue.put(buf.decode('utf8')) # Python 2 and 3 - except PilotException as e: + except PilotException as exc: self.terminate() - logger.error("Pilot Exception: Message thread got an exception, will finish: %s, %s" % (e.get_detail(), traceback.format_exc())) - # raise e - except Exception as e: + logger.error(f"Pilot Exception: message thread got an exception, will finish: {exc.get_detail()}, {traceback.format_exc()}") + # raise exc + except Exception as exc: self.terminate() - logger.error("Message thread got an exception, will finish: %s" % str(e)) - # raise MessageFailure(e) + logger.error(f"message thread got an exception, will finish: {exc}") + # raise MessageFailure(exc) self.terminate() - logger.info('Message thread finished.') + logger.info('message thread finished.') diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index b911d4a8..b62743b2 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -19,44 +19,52 @@ # - Wen Guan, wen.guan@cern.ch, 2017-2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +""" +Event Service process. + +Main process to handle event service. +It makes use of two hooks get_event_ranges_hook and handle_out_message_hook to communicate with other processes when +it's running. The process will handle the logic of Event service independently. +""" + import io import json import logging import os +import queue import re import subprocess import time import threading import traceback - -try: - import Queue as queue # noqa: N813 -except Exception: - import queue # Python 3 - -from pilot.common.exception import PilotException, MessageFailure, SetupFailure, RunPayloadFailure, UnknownException +from typing import ( + Any, + TextIO +) + +from pilot.common.exception import ( + PilotException, + MessageFailure, + SetupFailure, + RunPayloadFailure, + UnknownException +) from pilot.eventservice.esprocess.esmessage import MessageThread from pilot.util.container import containerise_executable from pilot.util.processes import kill_child_processes logger = logging.getLogger(__name__) -""" -Main process to handle event service. -It makes use of two hooks get_event_ranges_hook and handle_out_message_hook to communicate with other processes when -it's running. The process will handle the logic of Event service independently. -""" - class ESProcess(threading.Thread): - """ - Main EventService Process. - """ + """Main EventService Process.""" + def __init__(self, payload, waiting_time=30 * 60): """ - Init ESProcess. + Initialize ESProcess. - :param payload: a dict of {'executable': , 'output_file': , 'error_file': } + :param payload: {'executable': , 'output_file': , 'error_file': } (dict) + :param waiting_time: waiting time for no more events (int). """ threading.Thread.__init__(self, name='esprocess') @@ -85,13 +93,24 @@ def __init__(self, payload, waiting_time=30 * 60): self.event_ranges_cache = [] def __del__(self): + """Handle destruction.""" if self.__message_thread: self.__message_thread.stop() def is_payload_started(self): + """ + Check whether the payload has started. + + :return: True if the payload has started, otherwise False (bool). 
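# Hypothetical usage sketch (not part of this patch): a minimal implementation of the
# ESHook contract and how ESManager shown above consumes it. The payload command, file
# names and the empty event-range list are invented placeholders; a real hook talks to
# Harvester or the PanDA server.
from pilot.eventservice.esprocess.eshook import ESHook
from pilot.eventservice.esprocess.esmanager import ESManager


class DemoHook(ESHook):
    def get_payload(self) -> dict:
        # shape described by ESHook.get_payload()
        return {'payload': 'athena.py ...', 'output_file': 'out.txt', 'error_file': 'err.txt'}

    def get_event_ranges(self, num_ranges: int = 1) -> list:
        return []  # an empty answer eventually leads to "No more events"

    def handle_out_message(self, message: dict):
        print('payload reported:', message)


manager = ESManager(DemoHook())  # passes the isinstance(hook, ESHook) check above
# manager.run() would fetch the payload through the hook and drive it via ESProcess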
+ """ return self.__is_payload_started def stop(self, delay=1800): + """ + Stop the process. + + :param delay: waiting time to stop the process (int). + """ if not self.__stop.is_set(): self.__stop.set() self.__stop_set_time = time.time() @@ -99,33 +118,29 @@ def stop(self, delay=1800): event_ranges = "No more events" self.send_event_ranges_to_payload(event_ranges) - def init_message_thread(self, socketname=None, context='local'): + def init_message_thread(self, socketname: str = None, context: str = 'local'): """ - init message thread. - - :param socket_name: name of the socket between current process and payload. - :param context: name of the context between current process and payload, default is 'local'. + Initialize message thread. + :param socket_name: name of the socket between current process and payload (str) + :param context: name of the context between current process and payload, default is 'local' (str) :raises MessageFailure: when failed to init message thread. """ - - logger.info("start to init message thread") + logger.info("start to initialize message thread") try: self.__message_thread = MessageThread(self.__message_queue, socketname, context) self.__message_thread.start() - except PilotException as e: - logger.error("Failed to start message thread: %s" % e.get_detail()) + except PilotException as exc: + logger.error(f"failed to start message thread: {exc.get_detail()}") self.__ret_code = -1 - except Exception as e: - logger.error("Failed to start message thread: %s" % str(e)) + except Exception as exc: + logger.error(f"failed to start message thread: {exc}") self.__ret_code = -1 - raise MessageFailure(e) - logger.info("finished to init message thread") + raise MessageFailure(exc) + logger.info("finished initializing message thread") def stop_message_thread(self): - """ - Stop message thread - """ + """Stop message thread.""" logger.info("Stopping message thread") if self.__message_thread: while self.__message_thread.is_alive(): @@ -133,18 +148,24 @@ def stop_message_thread(self): self.__message_thread.stop() logger.info("Message thread stopped") - def init_yampl_socket(self, executable): + def init_yampl_socket(self, executable: str) -> str: + """ + Initialize yampl socket. + + :param executable: executable string. + :return: executable string with yampl socket name (str). 
+ """ socket_name = self.__message_thread.get_yampl_socket_name() is_ca = "--CA" in executable if is_ca: - preexec_socket_config = " --preExec \'ConfigFlags.MP.EventRangeChannel=\"%s\"\' " % (socket_name) + preexec_socket_config = f" --preExec 'ConfigFlags.MP.EventRangeChannel=\"{socket_name}\"' " else: preexec_socket_config = \ - " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\' " % (socket_name) + f" --preExec 'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"{socket_name}\"' " if "PILOT_EVENTRANGECHANNEL" in executable: - executable = "export PILOT_EVENTRANGECHANNEL=\"%s\"; " % (socket_name) + executable + executable = f"export PILOT_EVENTRANGECHANNEL=\"{socket_name}\"; " + executable elif "--preExec" not in executable: executable = executable.strip() if executable.endswith(";"): @@ -153,30 +174,29 @@ def init_yampl_socket(self, executable): else: if "import jobproperties as jps" in executable: executable = executable.replace("import jobproperties as jps;", - "import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\";" % (socket_name)) + f"import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"{socket_name}\";") if is_ca: logger.warning("Found jobproperties config in CA job") else: if "--preExec " in executable: executable = executable.replace("--preExec ", preexec_socket_config) else: - logger.warn("--preExec has an unknown format - expected \'--preExec \"\' or \"--preExec \'\", got: %s" % (executable)) + logger.warn(f"--preExec has an unknown format - expected '--preExec \"' or \"--preExec '\", got: {executable}") return executable def init_payload_process(self): """ - init payload process. + Initialize payload process. :raise SetupFailure: when failed to init payload process. 
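# Hypothetical illustration (not part of this patch) of the socket-name injection that
# init_yampl_socket() above performs; the command and the pid-based socket name are
# invented examples.
socket_name = 'EventService_EventRanges_12345'  # the real name uses os.getpid()

# payloads that read PILOT_EVENTRANGECHANNEL simply get the variable exported up front:
cmd = 'run_payload.sh --usePILOT_EVENTRANGECHANNEL'
cmd = f'export PILOT_EVENTRANGECHANNEL="{socket_name}"; ' + cmd
print(cmd)

# for AthenaMP-style payloads, a --preExec fragment setting
# jps.AthenaMPFlags.EventRangeChannel (or ConfigFlags.MP.EventRangeChannel for CA jobs)
# is spliced into the existing command instead, as in the method above.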
""" - - logger.info("start to init payload process") + logger.info("initializing payload process") try: try: workdir = self.get_workdir() - except Exception as e: - raise e + except Exception as exc: + raise exc executable = self.get_executable(workdir) output_file_fd = self.get_file(workdir, file_label='output_file', file_name='ES_payload_output.txt') @@ -187,11 +207,11 @@ def init_payload_process(self): try: executable, diagnostics = containerise_executable(executable, job=self.__payload['job'], workdir=workdir) if diagnostics: - msg = 'containerisation of executable failed: %s' % diagnostics + msg = f'containerisation of executable failed: {diagnostics}' logger.warning(msg) raise SetupFailure(msg) except Exception as e: - msg = 'exception caught while preparing container command: %s' % e + msg = f'exception caught while preparing container command: {e}' logger.warning(msg) raise SetupFailure(msg) else: @@ -202,34 +222,30 @@ def init_payload_process(self): self.pid = self.__process.pid self.__payload['job'].pid = self.pid self.__is_payload_started = True - logger.debug("Started new processs (executable: %s, stdout: %s, stderr: %s, pid: %s)" % (executable, - output_file_fd, - error_file_fd, - self.__process.pid)) + logger.debug(f"started new processs (executable: {executable}, stdout: {output_file_fd}, " + f"stderr: {error_file_fd}, pid: {self.__process.pid})") if 'job' in self.__payload and self.__payload['job'] and self.__payload['job'].corecount: self.corecount = int(self.__payload['job'].corecount) - except PilotException as e: - logger.error("Failed to start payload process: %s, %s" % (e.get_detail(), traceback.format_exc())) + except PilotException as exc: + logger.error(f"failed to start payload process: {exc.get_detail()}, {traceback.format_exc()}") self.__ret_code = -1 - except Exception as e: - logger.error("Failed to start payload process: %s, %s" % (str(e), traceback.format_exc())) + except Exception as exc: + logger.error(f"failed to start payload process: {exc}, {traceback.format_exc()}") self.__ret_code = -1 - raise SetupFailure(e) + raise SetupFailure(exc) logger.info("finished initializing payload process") - def get_file(self, workdir, file_label='output_file', file_name='ES_payload_output.txt'): + def get_file(self, workdir: str, file_label: str = 'output_file', + file_name: str = 'ES_payload_output.txt') -> TextIO: """ Return the requested file. - :param file_label: - :param workdir: - :return: + :param file_label: label of the file (str) + :param workdir: work directory (str) + :param file_name: name of the file (str) + :return: file descriptor (TextIO). """ - - try: - file_type = file # Python 2 - except NameError: - file_type = io.IOBase # Python 3 + file_type = io.IOBase if file_label in self.__payload: if isinstance(self.__payload[file_label], file_type): @@ -243,84 +259,83 @@ def get_file(self, workdir, file_label='output_file', file_name='ES_payload_outp return _file_fd - def get_workdir(self): + def get_workdir(self) -> str: """ Return the workdir. + If the workdir is set but is not a directory, return None. - :return: workdir (string or None). + :return: work directory (str) :raises SetupFailure: in case workdir is not a directory. 
""" - workdir = '' if 'workdir' in self.__payload: workdir = self.__payload['workdir'] if not os.path.exists(workdir): - os.makedirs(workdir) + try: + os.makedirs(workdir) + except OSError as exc: + raise SetupFailure(f"failed to create workdir: {exc}") elif not os.path.isdir(workdir): raise SetupFailure('workdir exists but is not a directory') + return workdir - def get_executable(self, workdir): + def get_executable(self, workdir: str) -> str: """ Return the executable string. - :param workdir: work directory (string). - :return: executable (string). + :param workdir: work directory (str) + :return: executable (str). """ - executable = self.__payload['executable'] - executable = self.init_yampl_socket(executable) - return 'cd %s; %s' % (workdir, executable) + executable = self.init_yampl_socket(self.__payload['executable']) + return f'cd {workdir}; {executable}' - def set_get_event_ranges_hook(self, hook): + def set_get_event_ranges_hook(self, hook: Any): """ - set get_event_ranges hook. + Set get_event_ranges hook. - :param hook: a hook method to get event ranges. + :param hook: a hook method to get event ranges (Any). """ - self.get_event_ranges_hook = hook - def get_get_event_ranges_hook(self): + def get_get_event_ranges_hook(self) -> Any: """ - get get_event_ranges hook. + Get get_event_ranges hook. - :returns: The hook method to get event ranges. + :return: the hook method to get event ranges (Any). """ - return self.get_event_ranges_hook - def set_handle_out_message_hook(self, hook): + def set_handle_out_message_hook(self, hook: Any): """ - set handle_out_message hook. + Set handle_out_message hook. - :param hook: a hook method to handle payload output and error messages. + :param hook: a hook method to handle payload output and error messages (Any). """ - self.handle_out_message_hook = hook - def get_handle_out_message_hook(self): + def get_handle_out_message_hook(self) -> Any: """ - get handle_out_message hook. + Get handle_out_message hook. - :returns: The hook method to handle payload output and error messages. + :return: The hook method to handle payload output and error messages (Any). """ - return self.handle_out_message_hook def init(self): """ - initialize message thread and payload process. - """ + Initialize message thread and payload process. + :raises: SetupFailure, MessageFailure. + """ try: self.init_message_thread() self.init_payload_process() - except Exception as e: - # TODO: raise exceptions + except Exception as exc: self.__ret_code = -1 self.stop() - raise e + raise exc def monitor(self): """ @@ -328,12 +343,12 @@ def monitor(self): raises: MessageFailure: when the message thread is dead or exited. RunPayloadFailure: when the payload process is dead or exited. + Exception: when too long time since "No more events" is injected. 
""" - if self.__no_more_event_time and time.time() - self.__no_more_event_time > self.__waiting_time: self.__ret_code = -1 - raise Exception('Too long time (%s seconds) since "No more events" is injected' % - (time.time() - self.__no_more_event_time)) + raise Exception(f'Too long time ({time.time() - self.__no_more_event_time} seconds) ' + f'since \"No more events\" is injected') if self.__monitor_log_time is None or self.__monitor_log_time < time.time() - 10 * 60: self.__monitor_log_time = time.time() @@ -351,39 +366,43 @@ def monitor(self): logger.info("Payload finished with no more events") else: self.__ret_code = self.__process.poll() - raise RunPayloadFailure("Payload process is not alive: %s" % self.__process.poll()) + raise RunPayloadFailure(f"Payload process is not alive: {self.__process.poll()}") if self.__stop.is_set() and time.time() > self.__stop_set_time + self.__stop_delay: - logger.info("Stop has been set for %s seconds, which is more than the stop wait time. Will terminate" % self.__stop_delay) + logger.info(f"Stop has been set for {self.__stop_delay} seconds, which is more than the stop wait time. Will terminate") self.terminate() - def has_running_children(self): + def has_running_children(self) -> bool: """ - Check whether it has running children + Check whether there are running children. - :return: True if there are alive children, otherwise False + :return: True if there are alive children, otherwise False (bool). """ if self.__message_thread and self.__message_thread.is_alive(): return True if self.__process and self.__process.poll() is None: return True + return False - def is_payload_running(self): + def is_payload_running(self) -> bool: """ - Check whether the payload is still running + Check whether the payload is still running. - :return: True if the payload is running, otherwise False + :return: True if the payload is running, otherwise False (bool). """ if self.__process and self.__process.poll() is None: return True + return False - def get_event_range_to_payload(self): + def get_event_range_to_payload(self) -> list: """ - Get one event range to be sent to payload + Get one event range to be sent to payload. + + :return: list of event ranges (list). """ - logger.debug("Number of cached event ranges: %s" % len(self.event_ranges_cache)) + logger.debug(f"number of cached event ranges: {len(self.event_ranges_cache)}") if not self.event_ranges_cache: event_ranges = self.get_event_ranges() if event_ranges: @@ -395,38 +414,36 @@ def get_event_range_to_payload(self): else: return [] - def get_event_ranges(self, num_ranges=None): + def get_event_ranges(self, num_ranges: int = None) -> list: """ - Calling get_event_ranges hook to get event ranges. - - :param num_ranges: number of event ranges to get. + Call get_event_ranges hook to get event ranges. + :param num_ranges: number of event ranges to get (int) + :return: list of event ranges (list) :raises: SetupFailure: If get_event_ranges_hook is not set. MessageFailure: when failed to get event ranges. """ if not num_ranges: num_ranges = self.corecount - logger.debug('getting event ranges(num_ranges=%s)' % num_ranges) + logger.debug(f'getting event ranges(num_ranges={num_ranges})') if not self.get_event_ranges_hook: raise SetupFailure("get_event_ranges_hook is not set") try: - logger.debug('calling get_event_ranges hook(%s) to get event ranges.' 
% self.get_event_ranges_hook) + logger.debug(f'calling get_event_ranges hook({self.get_event_ranges_hook}) to get event ranges.') event_ranges = self.get_event_ranges_hook(num_ranges) - logger.debug('got event ranges: %s' % event_ranges) + logger.debug(f'got event ranges: {event_ranges}') return event_ranges except Exception as e: - raise MessageFailure("Failed to get event ranges: %s" % e) + raise MessageFailure(f"Failed to get event ranges: {e}") - def send_event_ranges_to_payload(self, event_ranges): + def send_event_ranges_to_payload(self, event_ranges: list): """ Send event ranges to payload through message thread. - :param event_ranges: list of event ranges. + :param event_ranges: list of event ranges (list). """ - - msg = None if "No more events" in event_ranges: msg = event_ranges self.is_no_more_events = True @@ -435,21 +452,19 @@ def send_event_ranges_to_payload(self, event_ranges): if type(event_ranges) is not list: event_ranges = [event_ranges] msg = json.dumps(event_ranges) - logger.debug('send event ranges to payload: %s' % msg) + logger.debug(f'send event ranges to payload: {msg}') self.__message_thread.send(msg) - def parse_out_message(self, message): + def parse_out_message(self, message: str) -> dict: """ Parse output or error messages from payload. - :param message: The message string received from payload. - - :returns: a dict {'id': , 'status': , 'output': , 'cpu': , 'wall': , 'message': } + :param message: The message string received from payload (str) + :return: {'id': , 'status': , 'output': , 'cpu': , 'wall': , 'message': } :raises: PilotExecption: when a PilotException is caught. UnknownException: when other unknown exception is caught. """ - - logger.debug('parsing message: %s' % message) + logger.debug(f'parsing message: {message}') try: if message.startswith("/"): parts = message.split(",") @@ -473,7 +488,7 @@ def parse_out_message(self, message): ret = {'id': event_range_id, 'status': 'failed', 'message': message} return ret else: - raise Exception("Failed to parse %s" % message) + raise Exception(f"Failed to parse {message}") else: pattern = re.compile(r"(ERR\_[A-Z\_]+)\ ([0-9A-Za-z._\-]+)\:\ ?(.+)") found = re.findall(pattern, message) @@ -481,46 +496,42 @@ def parse_out_message(self, message): ret = {'id': event_range_id, 'status': 'failed', 'message': message} return ret else: - raise UnknownException("Unknown message %s" % message) + raise UnknownException(f"Unknown message {message}") except PilotException as e: raise e except Exception as e: raise UnknownException(e) - def handle_out_message(self, message): + def handle_out_message(self, message: str): """ Handle output or error messages from payload. - Messages from payload will be parsed and the handle_out_message hook is called. - :param message: The message string received from payload. + Messages from payload will be parsed and the handle_out_message hook is called. + :param message: message string received from payload (str) :raises: SetupFailure: when handle_out_message_hook is not set. RunPayloadFailure: when failed to handle an output or error message. """ - - logger.debug('handling out message: %s' % message) + logger.debug(f'handling out message: {message}') if not self.handle_out_message_hook: raise SetupFailure("handle_out_message_hook is not set") try: message_status = self.parse_out_message(message) - logger.debug('parsed out message: %s' % message_status) - logger.debug('calling handle_out_message hook(%s) to handle parsed message.' 
% self.handle_out_message_hook) + logger.debug(f'parsed out message: {message_status}') + logger.debug(f'calling handle_out_message hook({self.handle_out_message_hook}) to handle parsed message.') self.handle_out_message_hook(message_status) except Exception as e: - raise RunPayloadFailure("Failed to handle out message: %s" % e) + raise RunPayloadFailure(f"Failed to handle out message: {e}") def handle_messages(self): - """ - Monitor the message queue to get output or error messages from payload and response to different messages. - """ - + """Monitor the message queue to get output or error messages from payload and response to different messages.""" try: message = self.__message_queue.get(False) except queue.Empty: pass else: - logger.debug('received message from payload: %s' % message) + logger.debug(f'received message from payload: {message}') if "Ready for events" in message: event_ranges = self.get_event_range_to_payload() if not event_ranges: @@ -529,9 +540,9 @@ def handle_messages(self): else: self.handle_out_message(message) - def poll(self): + def poll(self) -> int: """ - poll whether the process is still running. + Poll whether the process is still running. :returns: None: still running. 0: finished successfully. @@ -539,12 +550,11 @@ def poll(self): """ return self.__ret_code - def terminate(self, time_to_wait=1): + def terminate(self, time_to_wait: int = 1): """ Terminate running threads and processes. - :param time_to_wait: integer, seconds to wait to force kill the payload process. - + :param time_to_wait: integer, seconds to wait to force kill the payload process (int) :raises: PilotExecption: when a PilotException is caught. UnknownException: when other unknown exception is caught. """ @@ -556,7 +566,7 @@ def terminate(self, time_to_wait=1): if self.__process.poll() == 0: logger.info("payload finished successfully.") else: - logger.error("payload finished with error code: %s" % self.__process.poll()) + logger.error(f"payload finished with error code: {self.__process.poll()}") else: for i in range(time_to_wait * 10): if not self.__process.poll() is None: @@ -567,20 +577,18 @@ def terminate(self, time_to_wait=1): if self.__process.poll() == 0: logger.info("payload finished successfully.") else: - logger.error("payload finished with error code: %s" % self.__process.poll()) + logger.error(f"payload finished with error code: {self.__process.poll()}") else: logger.info('terminating payload process.') pgid = os.getpgid(self.__process.pid) - logger.info('got process group id for pid %s: %s' % (self.__process.pid, pgid)) - # logger.info('send SIGTERM to process group: %s' % pgid) - # os.killpg(pgid, signal.SIGTERM) - logger.info('send SIGTERM to process: %s' % self.__process.pid) + logger.info(f'got process group id for pid {self.__process.pid}: {pgid}') + logger.info(f'send SIGTERM to process: {self.__process.pid}') kill_child_processes(self.__process.pid) self.__ret_code = self.__process.poll() else: self.__ret_code = -1 except Exception as e: - logger.error('Exception caught when terminating ESProcess: %s' % e) + logger.error(f'Exception caught when terminating ESProcess: {e}') self.__ret_code = -1 self.stop() raise UnknownException(e) @@ -589,8 +597,6 @@ def kill(self): """ Terminate running threads and processes. - :param time_to_wait: integer, seconds to wait to force kill the payload process. - :raises: PilotException: when a PilotException is caught. UnknownException: when other unknown exception is caught. 
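# Hypothetical wiring sketch (not part of this patch) showing how an executor typically
# drives ESProcess; the payload dictionary and hook functions are simplified placeholders
# (the real payload also carries the job object used for containerisation).
from pilot.eventservice.esprocess.esprocess import ESProcess

payload = {
    'executable': 'athena.py ...',            # placeholder transform command
    'output_file': 'ES_payload_output.txt',
    'error_file': 'ES_payload_error.txt',
    'workdir': '/tmp/es_demo',                # created by get_workdir() if missing
}


def get_event_ranges(num_ranges=1):
    return []  # a real hook asks the communication manager / PanDA server


def handle_out_message(message):
    print('parsed payload message:', message)


process = ESProcess(payload)
process.set_get_event_ranges_hook(get_event_ranges)
process.set_handle_out_message_hook(handle_out_message)
# process.start() would run init() and the monitor/handle_messages loop shown above;
# process.poll() and process.stop() inspect and end it.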
""" @@ -602,56 +608,53 @@ def kill(self): if self.__process.poll() == 0: logger.info("payload finished successfully.") else: - logger.error("payload finished with error code: %s" % self.__process.poll()) + logger.error(f"payload finished with error code: {self.__process.poll()}") else: logger.info('killing payload process.') pgid = os.getpgid(self.__process.pid) - logger.info('got process group id for pid %s: %s' % (self.__process.pid, pgid)) - # logger.info('send SIGKILL to process group: %s' % pgid) - # os.killpg(pgid, signal.SIGKILL) - logger.info('send SIGKILL to process: %s' % self.__process.pid) + logger.info(f'got process group id for pid {self.__process.pid}: {pgid}') + logger.info(f'send SIGKILL to process: {self.__process.pid}') kill_child_processes(self.__process.pid) - except Exception as e: - logger.error('Exception caught when terminating ESProcess: %s' % e) + except Exception as exc: + logger.error(f'exception caught when terminating ESProcess: {exc}') self.stop() - raise UnknownException(e) + raise UnknownException(exc) def clean(self): - """ - Clean left resources - """ + """Clean left resources.""" self.terminate() def run(self): """ - Main run loops: monitor message thread and payload process. - handle messages from payload and response messages with injecting new event ranges or process outputs. + Run main loops. + + Monitor message thread and payload process. + Handle messages from payload and response messages with injecting new event ranges or process outputs. :raises: PilotExecption: when a PilotException is caught. UnknownException: when other unknown exception is caught. """ - - logger.info('start esprocess with thread ident: %s' % (self.ident)) - logger.debug('initializing') + logger.info(f'start esprocess with thread ident: {self.ident}') self.init() logger.debug('initialization finished.') - logger.info('starts to main loop') + logger.info('start main loop') while self.is_payload_running(): try: self.monitor() self.handle_messages() time.sleep(0.01) except PilotException as e: - logger.error('PilotException caught in the main loop: %s, %s' % (e.get_detail(), traceback.format_exc())) + logger.error(f'PilotException caught in the main loop: {e.get_detail()}, {traceback.format_exc()}') # TODO: define output message exception. If caught 3 output message exception, terminate self.stop() - except Exception as e: - logger.error('Exception caught in the main loop: %s, %s' % (e, traceback.format_exc())) + except Exception as exc: + logger.error(f'exception caught in the main loop: {exc}, {traceback.format_exc()}') # TODO: catch and raise exceptions # if catching dead process exception, terminate. self.stop() break + self.clean() self.stop_message_thread() logger.debug('main loop finished') diff --git a/pilot/eventservice/esprocess/esprocessfinegrainedproc.py b/pilot/eventservice/esprocess/esprocessfinegrainedproc.py new file mode 100644 index 00000000..641902fc --- /dev/null +++ b/pilot/eventservice/esprocess/esprocessfinegrainedproc.py @@ -0,0 +1,363 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Authors: +# - Wen Guan, wen.guan@cern.ch, 2023 + +import io +import logging +import os +import time +import threading +import traceback + +from pilot.common.exception import PilotException, MessageFailure, SetupFailure, RunPayloadFailure, UnknownException + + +logger = logging.getLogger(__name__) + +""" +Main process to handle event service. +It makes use of two hooks get_event_ranges_hook and handle_out_message_hook to communicate with other processes when +it's running. The process will handle the logic of Event service independently. +""" + + +class ESProcessFineGrainedProc(threading.Thread): + """ + Main EventService Process. + """ + def __init__(self, payload, waiting_time=30 * 60): + """ + Init ESProcessFineGrainedProc. + + :param payload: a dict of {'executable': , 'output_file': , 'error_file': } + """ + threading.Thread.__init__(self, name='esprocessFineGrainedProc') + + self.__payload = payload + + self.__process = None + + self.get_event_ranges_hook = None + self.handle_out_message_hook = None + + self.__monitor_log_time = None + self.is_no_more_events = False + self.__no_more_event_time = None + self.__waiting_time = waiting_time + self.__stop = threading.Event() + self.__stop_time = 180 + self.pid = None + self.__is_payload_started = False + + self.__ret_code = None + self.setName("ESProcessFineGrainedProc") + self.corecount = 1 + + self.event_ranges_cache = [] + + def is_payload_started(self): + return self.__is_payload_started + + def stop(self, delay=1800): + if not self.__stop.is_set(): + self.__stop.set() + self.__stop_set_time = time.time() + self.__stop_delay = delay + + def get_job_id(self): + if 'job' in self.__payload and self.__payload['job'] and self.__payload['job'].jobid: + return self.__payload['job'].jobid + return '' + + def get_corecount(self): + if 'job' in self.__payload and self.__payload['job'] and self.__payload['job'].corecount: + core_count = int(self.__payload['job'].corecount) + return core_count + return 1 + + def get_file(self, workdir, file_label='output_file', file_name='ES_payload_output.txt'): + """ + Return the requested file. + + :param file_label: + :param workdir: + :return: + """ + + try: + file_type = file # Python 2 + except NameError: + file_type = io.IOBase # Python 3 + + if file_label in self.__payload: + if isinstance(self.__payload[file_label], file_type): + _file_fd = self.__payload[file_label] + else: + _file = self.__payload[file_label] if '/' in self.__payload[file_label] else os.path.join(workdir, self.__payload[file_label]) + _file_fd = open(_file, 'w') + else: + _file = os.path.join(workdir, file_name) + _file_fd = open(_file, 'w') + + return _file_fd + + def get_workdir(self): + """ + Return the workdir. + If the workdir is set but is not a directory, return None. + + :return: workdir (string or None). + :raises SetupFailure: in case workdir is not a directory. 
+ """ + + workdir = '' + if 'workdir' in self.__payload: + workdir = self.__payload['workdir'] + if not os.path.exists(workdir): + os.makedirs(workdir) + elif not os.path.isdir(workdir): + raise SetupFailure('workdir exists but is not a directory') + return workdir + + def get_executable(self, workdir): + """ + Return the executable string. + + :param workdir: work directory (string). + :return: executable (string). + """ + executable = self.__payload['executable'] + executable = self.get_payload_executable(executable) + return 'cd %s; %s' % (workdir, executable) + + def set_get_event_ranges_hook(self, hook): + """ + set get_event_ranges hook. + + :param hook: a hook method to get event ranges. + """ + + self.get_event_ranges_hook = hook + + def get_get_event_ranges_hook(self): + """ + get get_event_ranges hook. + + :returns: The hook method to get event ranges. + """ + + return self.get_event_ranges_hook + + def set_handle_out_message_hook(self, hook): + """ + set handle_out_message hook. + + :param hook: a hook method to handle payload output and error messages. + """ + + self.handle_out_message_hook = hook + + def get_handle_out_message_hook(self): + """ + get handle_out_message hook. + + :returns: The hook method to handle payload output and error messages. + """ + + return self.handle_out_message_hook + + def init(self): + """ + initialize message thread and payload process. + """ + + try: + pass + except Exception as e: + # TODO: raise exceptions + self.__ret_code = -1 + self.stop() + raise e + + def monitor(self): + """ + Monitor whether a process is dead. + + raises: MessageFailure: when the message thread is dead or exited. + RunPayloadFailure: when the payload process is dead or exited. + """ + pass + + def has_running_children(self): + """ + Check whether it has running children + + :return: True if there are alive children, otherwise False + """ + return False + + def is_payload_running(self): + """ + Check whether the payload is still running + + :return: True if the payload is running, otherwise False + """ + return False + + def get_event_ranges(self, num_ranges=None, queue_factor=1): + """ + Calling get_event_ranges hook to get event ranges. + + :param num_ranges: number of event ranges to get. + + :raises: SetupFailure: If get_event_ranges_hook is not set. + MessageFailure: when failed to get event ranges. + """ + if not num_ranges: + num_ranges = self.corecount + + logger.debug('getting event ranges(num_ranges=%s)' % num_ranges) + if not self.get_event_ranges_hook: + raise SetupFailure("get_event_ranges_hook is not set") + + try: + logger.debug('calling get_event_ranges hook(%s) to get event ranges.' % self.get_event_ranges_hook) + event_ranges = self.get_event_ranges_hook(num_ranges, queue_factor=queue_factor) + logger.debug('got event ranges: %s' % event_ranges) + return event_ranges + except Exception as e: + raise MessageFailure("Failed to get event ranges: %s" % e) + + def parse_out_message(self, message): + """ + Parse output or error messages from payload. + + :param message: The message string received from payload. + + :returns: a dict {'id': , 'status': , 'output': , 'cpu': , 'wall': , 'message': } + :raises: PilotExecption: when a PilotException is caught. + UnknownException: when other unknown exception is caught. + """ + + logger.debug('parsing message: %s' % message) + return message + + def handle_out_message(self, message): + """ + Handle output or error messages from payload. 
+        Messages from payload will be parsed and the handle_out_message hook is called.
+
+        :param message: The message string received from payload.
+
+        :raises: SetupFailure: when handle_out_message_hook is not set.
+                 RunPayloadFailure: when failed to handle an output or error message.
+        """
+
+        logger.debug('handling out message: %s' % message)
+        if not self.handle_out_message_hook:
+            raise SetupFailure("handle_out_message_hook is not set")
+
+        try:
+            message_status = self.parse_out_message(message)
+            logger.debug('parsed out message: %s' % message_status)
+            logger.debug('calling handle_out_message hook(%s) to handle parsed message.' % self.handle_out_message_hook)
+            self.handle_out_message_hook(message_status)
+        except Exception as e:
+            raise RunPayloadFailure("Failed to handle out message: %s" % e)
+
+    def poll(self):
+        """
+        Poll whether the process is still running.
+
+        :returns: None: still running.
+                  0: finished successfully.
+                  others: failed.
+        """
+        return self.__ret_code
+
+    def terminate(self, time_to_wait=1):
+        """
+        Terminate running threads and processes.
+
+        :param time_to_wait: integer, seconds to wait to force kill the payload process.
+
+        :raises: PilotException: when a PilotException is caught.
+                 UnknownException: when other unknown exception is caught.
+        """
+        logger.info('terminate running threads and processes.')
+        try:
+            self.stop()
+        except Exception as e:
+            logger.error('Exception caught when terminating ESProcessFineGrainedProc: %s' % e)
+            self.__ret_code = -1
+            raise UnknownException(e)
+
+    def kill(self):
+        """
+        Terminate running threads and processes.
+
+        :param time_to_wait: integer, seconds to wait to force kill the payload process.
+
+        :raises: PilotException: when a PilotException is caught.
+                 UnknownException: when other unknown exception is caught.
+        """
+        logger.info('terminate running threads and processes.')
+        try:
+            self.stop()
+        except Exception as e:
+            logger.error('Exception caught when terminating ESProcessFineGrainedProc: %s' % e)
+            raise UnknownException(e)
+
+    def clean(self):
+        """
+        Clean left resources.
+        """
+        self.stop()
+
+    def run(self):
+        """
+        Run the main loop: monitor the message thread and payload process.
+        Handle messages from the payload and respond with new event ranges or process outputs.
+
+        :raises: PilotException: when a PilotException is caught.
+                 UnknownException: when other unknown exception is caught.
+        """
+
+        logger.info('start esprocess with thread ident: %s' % (self.ident))
+        logger.debug('initializing')
+        self.init()
+        logger.debug('initialization finished.')
+
+        logger.info('start main loop')
+        while self.is_payload_running():
+            try:
+                self.monitor()
+                time.sleep(0.01)
+            except PilotException as e:
+                logger.error('PilotException caught in the main loop: %s, %s' % (e.get_detail(), traceback.format_exc()))
+                # TODO: define output message exception. If caught 3 output message exception, terminate
+                self.stop()
+            except Exception as e:
+                logger.error('Exception caught in the main loop: %s, %s' % (e, traceback.format_exc()))
+                # TODO: catch and raise exceptions
+                # if catching dead process exception, terminate.
+ self.stop() + break + self.clean() + logger.debug('main loop finished') diff --git a/pilot/eventservice/esprocess/hooks/acthook.py b/pilot/eventservice/esprocess/hooks/acthook.py index 91988b08..bdb0b700 100644 --- a/pilot/eventservice/esprocess/hooks/acthook.py +++ b/pilot/eventservice/esprocess/hooks/acthook.py @@ -20,28 +20,28 @@ # - Wen Guan, wen.guan@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 -""" -Hooks for ARC-ControlTower EventService. -""" +"""Hooks for ARC-ControlTower EventService.""" from pilot.eventservice.eshook import ESHook class ACTESHook(ESHook): - def get_payload(self): + """ACT EventService hook class.""" + + def get_payload(self) -> dict: """ Get payload to execute. - :returns: dict {'payload': , 'output_file': , 'error_file': } + :return: {'payload': , 'output_file': , 'error_file': } (dict) """ raise Exception("Not Implemented") - def get_event_ranges(self, num_ranges=1): + def get_event_ranges(self, num_ranges: int = 1) -> dict: """ Get event ranges. - :returns: dict of event ranges. - None if no available events. + :param num_ranges: number of event ranges to get (int) + :return: dictionary of event ranges (dict). """ raise Exception("Not Implemented") @@ -49,9 +49,12 @@ def handle_out_message(self, message): """ Handle ES output or error messages. - :param message: a dict of parsed message. - For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , - 'wall': , 'message': }. - Fro 'failed' event ranges, it's {'id': , 'status': 'finished', 'message': }. + Example + For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , + 'wall': , 'message': }. + For 'failed' event ranges, it's {'id': , 'status': 'finished', 'message': }. + + :param message: dictionary of a parsed message (dict). + :raises Exception: if anything goes wrong. """ raise Exception("Not Implemented") diff --git a/pilot/eventservice/esprocess/hooks/harvesterhook.py b/pilot/eventservice/esprocess/hooks/harvesterhook.py index 2b76343e..61c4ad10 100644 --- a/pilot/eventservice/esprocess/hooks/harvesterhook.py +++ b/pilot/eventservice/esprocess/hooks/harvesterhook.py @@ -20,40 +20,40 @@ # - Wen Guan, wen.guan@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 -""" -Hooks for Harvester EventService. -""" +"""Hooks for Harvester EventService.""" from pilot.eventservice.eshook import ESHook class HarvesterESHook(ESHook): - def get_payload(self): + """Harvester EventService hook.""" + + def get_payload(self) -> dict: """ Get payload to execute. - :returns: dict {'payload': , 'output_file': , 'error_file': } + :return: {'payload': , 'output_file': , 'error_file': } (dict). """ raise Exception("Not Implemented") - def get_event_ranges(self, num_ranges=1): + def get_event_ranges(self, num_ranges: int = 1) -> dict: """ Get event ranges. - :param num_ranges: Number of event ranges to download, default is 1. - - :returns: dict of event ranges. - None if no available events. + :param num_ranges: Number of event ranges to download, default is 1 (int) + :return: dictionary of event ranges (dict). """ raise Exception("Not Implemented") - def handle_out_message(self, message): + def handle_out_message(self, message: dict): """ Handle ES output or error messages. - :param message: a dict of parsed message. - For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , + Example + For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , 'wall': , 'message': }. 
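# Hypothetical examples of the parsed message dictionaries handed to handle_out_message()
# in the hooks above; every value below is invented for illustration.
finished_message = {
    'id': 'Range-001',            # event range id
    'status': 'finished',
    'output': '/srv/workdir/HITS.pool.root_000.Range-001',
    'cpu': 123,
    'wall': 140,
    'message': 'raw yampl message text',
}
failed_message = {
    'id': 'Range-002',
    'status': 'failed',
    'message': 'ERR_ATHENAMP_PROCESS Range-002: Failed to process event range',
}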
- Fro 'failed' event ranges, it's {'id': , 'status': 'finished', 'message': }. + For 'failed' event ranges, it's {'id': , 'status': 'finished', 'message': }. + + :param message: dictionary of parsed message (dict). """ raise Exception("Not Implemented") diff --git a/pilot/eventservice/workexecutor/plugins/baseexecutor.py b/pilot/eventservice/workexecutor/plugins/baseexecutor.py index 9279fd94..892d9635 100644 --- a/pilot/eventservice/workexecutor/plugins/baseexecutor.py +++ b/pilot/eventservice/workexecutor/plugins/baseexecutor.py @@ -56,6 +56,8 @@ def __init__(self, **kwargs): self.proc = None + self.current_dir = os.getcwd() + def get_pid(self): return self.proc.pid if self.proc else None @@ -75,6 +77,10 @@ def start(self): def stop(self): if not self.is_stop(): self.__stop.set() + if self.communication_manager: + self.communication_manager.stop() + os.chdir(self.current_dir) + logger.info("change current dir from %s to %s" % (os.getcwd(), self.current_dir)) def is_stop(self): return self.__stop.is_set() @@ -92,7 +98,9 @@ def set_payload(self, payload): self.__is_set_payload = True job = self.get_job() if job and job.workdir: + current_dir = os.getcwd() os.chdir(job.workdir) + logger.info("change current dir from %s to %s" % (current_dir, job.workdir)) def is_set_payload(self): return self.__is_set_payload @@ -108,7 +116,7 @@ def retrieve_payload(self): jobs = self.communication_manager.get_jobs(njobs=1, args=self.args) logger.info("Received jobs: %s" % jobs) if jobs: - job = create_job(jobs[0], queue=self.queue) + job = create_job(jobs[0], queuename=self.queue) # get the payload command from the user specific code pilot_user = os.environ.get('PILOT_USER', 'atlas').lower() diff --git a/pilot/eventservice/workexecutor/plugins/finegrainedprocexecutor.py b/pilot/eventservice/workexecutor/plugins/finegrainedprocexecutor.py new file mode 100644 index 00000000..b78b904a --- /dev/null +++ b/pilot/eventservice/workexecutor/plugins/finegrainedprocexecutor.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# Authors: +# - Wen Guan, wen.guan@cern.ch, 2023 - 2024 + +import json +import os +import time +import traceback + +from pilot.common.errorcodes import ErrorCodes + +from .baseexecutor import BaseExecutor + +import logging +logger = logging.getLogger(__name__) + +errors = ErrorCodes() + +""" +FineGrainedProc Executor with one process to manage EventService +""" + + +class FineGrainedProcExecutor(BaseExecutor): + def __init__(self, **kwargs): + super(FineGrainedProcExecutor, self).__init__(**kwargs) + self.setName("FineGrainedProcExecutor") + + self.__queued_out_messages = [] + self.__stageout_failures = 0 + self.__max_allowed_stageout_failures = 20 + self.__last_stageout_time = None + self.__all_out_messages = [] + + self.proc = None + self.exit_code = None + + def is_payload_started(self): + return self.proc.is_payload_started() if self.proc else False + + def get_pid(self): + return self.proc.pid if self.proc else None + + def get_exit_code(self): + return self.exit_code + + def update_finished_event_ranges(self, out_messagess, output_file, fsize, checksum, storage_id): + """ + Update finished event ranges + + :param out_messages: messages from AthenaMP. + :param output_file: output file name. + :param fsize: file size. + :param adler32: checksum (adler32) of the file. + :param storage_id: the id of the storage. + """ + + if len(out_messagess) == 0: + return + + event_ranges = [] + for out_msg in out_messagess: + event_ranges.append({"eventRangeID": out_msg['id'], "eventStatus": 'finished'}) + event_range_status = {"zipFile": {"numEvents": len(event_ranges), + "objstoreID": storage_id, + "lfn": os.path.basename(output_file), + "fsize": fsize, + "pathConvention": 1000}, + "eventRanges": event_ranges} + for checksum_key in checksum: + event_range_status["zipFile"][checksum_key] = checksum[checksum_key] + event_range_message = {'version': 1, 'eventRanges': json.dumps([event_range_status])} + self.update_events(event_range_message) + + job = self.get_job() + job.nevents += len(event_ranges) + + def update_failed_event_ranges(self, out_messagess): + """ + Update failed event ranges + + :param out_messages: messages from AthenaMP. + """ + + if len(out_messagess) == 0: + return + + event_ranges = [] + for message in out_messagess: + status = message['status'] if message['status'] in ['failed', 'fatal'] else 'failed' + # ToBeFixed errorCode + event_ranges.append({"errorCode": errors.UNKNOWNPAYLOADFAILURE, "eventRangeID": message['id'], "eventStatus": status}) + event_range_message = {'version': 0, 'eventRanges': json.dumps(event_ranges)} + self.update_events(event_range_message) + + def update_terminated_event_ranges(self, out_messagess): + """ + Update terminated event ranges + + :param out_messages: messages from AthenaMP. 
+        """
+
+        if len(out_messagess) == 0:
+            return
+
+        event_ranges = []
+        finished_events = 0
+        for message in out_messagess:
+            if message['status'] in ['failed', 'fatal', 'finished', 'running', 'transferring']:
+                status = message['status']
+                if message['status'] in ['finished']:
+                    finished_events += 1
+            else:
+                logger.warning("status is unknown for message, setting it to running: %s" % str(message))
+                status = 'running'
+            error_code = message.get("error_code", None)
+            if status in ["failed", "fatal"] and error_code is None:
+                error_code = errors.UNKNOWNPAYLOADFAILURE
+            error_diag = message.get("error_diag")
+
+            event_range = {"eventRangeID": message['id'], "eventStatus": status, "errorCode": error_code, "errorDiag": error_diag}
+            event_ranges.append(event_range)
+        event_range_message = {'version': 0, 'eventRanges': json.dumps(event_ranges)}
+        self.update_events(event_range_message)
+
+        job = self.get_job()
+        job.nevents += finished_events
+
+    def handle_out_message(self, message):
+        """
+        Handle ES output or error messages hook function for tests.
+
+        :param message: a dict of parsed message.
+                        For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': ,
+                        'wall': , 'message': }.
+                        For 'failed' event ranges, it's {'id': , 'status': 'failed', 'message': }.
+        """
+
+        logger.info(f"handling out message: {message}")
+
+        self.__all_out_messages.append(message)
+
+        self.__queued_out_messages.append(message)
+
+    def stageout_es(self, force=False):
+        """
+        Stage out event service outputs.
+        When pilot fails to stage out a file, the file will be added back to the queue for staging out next period.
+        """
+
+        job = self.get_job()
+        if len(self.__queued_out_messages):
+            if force or self.__last_stageout_time is None or (time.time() > self.__last_stageout_time + job.infosys.queuedata.es_stageout_gap):
+
+                out_messages = []
+                while len(self.__queued_out_messages) > 0:
+                    out_messages.append(self.__queued_out_messages.pop())
+
+                if out_messages:
+                    self.__last_stageout_time = time.time()
+                    self.update_terminated_event_ranges(out_messages)
+
+    def clean(self):
+        """
+        Clean temp produced files.
+        """
+
+        for msg in self.__all_out_messages:
+            if msg['status'] in ['failed', 'fatal']:
+                pass
+            elif 'output' in msg:
+                try:
+                    logger.info(f"removing ES pre-merge file: {msg['output']}")
+                    os.remove(msg['output'])
+                except Exception as exc:
+                    logger.error(f"failed to remove file({msg['output']}): {exc}")
+        self.__queued_out_messages = []
+        self.__stageout_failures = 0
+        self.__last_stageout_time = None
+        self.__all_out_messages = []
+
+        if self.proc:
+            self.proc.stop()
+            while self.proc.is_alive():
+                time.sleep(0.1)
+
+        self.stop_communicator()
+        self.stop()
+
+    def get_esprocess_finegrainedproc(self, payload):
+        # get the payload command from the user specific code
+        try:
+            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
+            esprocessfinegrainedproc = __import__(f'pilot.user.{pilot_user}.esprocessfinegrainedproc',
+                                                  globals(), locals(), [pilot_user], 0)
+            proc = esprocessfinegrainedproc.ESProcessFineGrainedProc(payload)
+            return proc
+        except Exception as ex:
+            logger.warning("user specific ESProcessFineGrainedProc does not exist. Using the pilot.eventservice.esprocess.esprocessfinegrainedproc: " + str(ex))
+            from pilot.eventservice.esprocess.esprocessfinegrainedproc import ESProcessFineGrainedProc
+            proc = ESProcessFineGrainedProc(payload)
+            return proc
+
+    def run(self):
+        """
+        Initialize and run ESProcess.
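# Hypothetical shape of the update built by update_terminated_event_ranges() above and
# passed to update_events(); the range ids and diagnostics are invented placeholders.
import json

from pilot.common.errorcodes import ErrorCodes

errors = ErrorCodes()
event_ranges = [
    {"eventRangeID": "Range-001", "eventStatus": "finished", "errorCode": None, "errorDiag": None},
    {"eventRangeID": "Range-002", "eventStatus": "failed",
     "errorCode": errors.UNKNOWNPAYLOADFAILURE, "errorDiag": "payload failed"},
]
event_range_message = {'version': 0, 'eventRanges': json.dumps(event_ranges)}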
+ """ + + try: + logger.info("starting ES FineGrainedProcExecutor with thread identifier: %s" % (self.ident)) + if self.is_set_payload(): + payload = self.get_payload() + elif self.is_retrieve_payload(): + payload = self.retrieve_payload() + else: + logger.error("payload is not set, is_retrieve_payload is also not set - no payloads") + self.exit_code = -1 + return + + logger.info(f"payload: {payload}") + logger.info("starting ESProcessFineGrainedProc") + proc = self.get_esprocess_finegrainedproc(payload) + self.proc = proc + logger.info("ESProcessFineGrainedProc initialized") + + proc.set_get_event_ranges_hook(self.get_event_ranges) + proc.set_handle_out_message_hook(self.handle_out_message) + + logger.info('ESProcessFineGrainedProc starts to run') + proc.start() + logger.info('ESProcessFineGrainedProc started to run') + + iteration = 0 + while proc.is_alive(): + iteration += 1 + if self.is_stop(): + logger.info(f'stop is set -- stopping process pid={proc.pid}') + proc.stop() + break + self.stageout_es() + + # have we passed the threshold for failed stage-outs? + if self.__stageout_failures >= self.__max_allowed_stageout_failures: + logger.warning(f'too many stage-out failures ({self.__max_allowed_stageout_failures})') + logger.info(f'stopping process pid={proc.pid}') + proc.stop() + break + + exit_code = proc.poll() + if iteration % 60 == 0: + logger.info(f'running: iteration={iteration} pid={proc.pid} exit_code={exit_code}') + time.sleep(5) + + while proc.is_alive(): + time.sleep(1) + logger.info("ESProcess finished") + + self.stageout_es(force=True) + self.clean() + self.exit_code = proc.poll() + logger.info("ESProcess exit_code: %s" % self.exit_code) + + except Exception as exc: + logger.error(f'execute payload failed: {exc}, {traceback.format_exc()}') + self.clean() + self.exit_code = -1 + + logger.info('ES fine grained proc executor finished') diff --git a/pilot/eventservice/workexecutor/plugins/genericexecutor.py b/pilot/eventservice/workexecutor/plugins/genericexecutor.py index 8a0f5cba..17aa1528 100644 --- a/pilot/eventservice/workexecutor/plugins/genericexecutor.py +++ b/pilot/eventservice/workexecutor/plugins/genericexecutor.py @@ -21,14 +21,20 @@ # - Alexey Anisenkov, anisyonk@cern.ch, 2019 # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +"""Generic executor.""" + import json +import logging import os import time import traceback +from typing import Any from pilot.api.es_data import StageOutESClient -from pilot.common.exception import PilotException, StageOutFailure - +from pilot.common.exception import ( + PilotException, + StageOutFailure +) from pilot.common.errorcodes import ErrorCodes from pilot.eventservice.esprocess.esprocess import ESProcess from pilot.info.filespec import FileSpec @@ -37,50 +43,64 @@ from .baseexecutor import BaseExecutor -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() -""" -Generic Executor with one process to manage EventService -""" - class GenericExecutor(BaseExecutor): + """Generic executor class.""" + def __init__(self, **kwargs): + """ + Initialize generic executor. + + :param kwargs: kwargs dictionary (dict). + """ super(GenericExecutor, self).__init__(**kwargs) self.setName("GenericExecutor") - self.__queued_out_messages = [] self.__stageout_failures = 0 self.__max_allowed_stageout_failures = 20 self.__last_stageout_time = None self.__all_out_messages = [] - self.proc = None self.exit_code = None - def is_payload_started(self): + def is_payload_started(self) -> bool: + """ + Check if payload is started. 
+ + :return: True if payload is started, False if not (bool). + """ return self.proc.is_payload_started() if self.proc else False - def get_pid(self): + def get_pid(self) -> int: + """ + Get the process id of the payload process. + + :return: process id (int). + """ return self.proc.pid if self.proc else None def get_exit_code(self): - return self.exit_code + """ + Get exit code of the payload process. - def update_finished_event_ranges(self, out_messagess, output_file, fsize, checksum, storage_id): + :return: exit code (int). """ - Update finished event ranges + return self.exit_code - :param out_messages: messages from AthenaMP. - :param output_file: output file name. - :param fsize: file size. - :param adler32: checksum (adler32) of the file. - :param storage_id: the id of the storage. + def update_finished_event_ranges(self, out_messagess: Any, output_file: str, fsize: int, checksum: str, + storage_id: Any) -> None: """ + Update finished event ranges. + :param out_messages: messages from AthenaMP (Any) + :param output_file: output file name (str) + :param fsize: file size (int) + :param adler32: checksum (adler32) of the file (str) + :param storage_id: the id of the storage (Any). + """ if len(out_messagess) == 0: return @@ -101,34 +121,34 @@ def update_finished_event_ranges(self, out_messagess, output_file, fsize, checks job = self.get_job() job.nevents += len(event_ranges) - def update_failed_event_ranges(self, out_messagess): + def update_failed_event_ranges(self, out_messages: Any) -> None: """ - Update failed event ranges + Update failed event ranges. - :param out_messages: messages from AthenaMP. + :param out_messages: messages from AthenaMP (Any). """ - - if len(out_messagess) == 0: + if len(out_messages) == 0: return event_ranges = [] - for message in out_messagess: + for message in out_messages: status = message['status'] if message['status'] in ['failed', 'fatal'] else 'failed' # ToBeFixed errorCode event_ranges.append({"errorCode": errors.UNKNOWNPAYLOADFAILURE, "eventRangeID": message['id'], "eventStatus": status}) event_range_message = {'version': 0, 'eventRanges': json.dumps(event_ranges)} self.update_events(event_range_message) - def handle_out_message(self, message): + def handle_out_message(self, message: Any): """ Handle ES output or error messages hook function for tests. - :param message: a dict of parsed message. - For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , + Example + For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , 'wall': , 'message': }. - Fro 'failed' event ranges, it's {'id': , 'status': 'failed', 'message': }. - """ + For 'failed' event ranges, it's {'id': , 'status': 'failed', 'message': }. + :param message: a dict of parsed message (Any). + """ logger.info(f"handling out message: {message}") self.__all_out_messages.append(message) @@ -138,23 +158,22 @@ def handle_out_message(self, message): else: self.__queued_out_messages.append(message) - def tarzip_output_es(self): + def tarzip_output_es(self) -> (Any, str): """ - Tar/zip eventservice outputs. + Tar/zip event service outputs. - :return: out_messages, output_file + :return: out_messages (Any), output_file (str). 
""" - out_messages = [] while len(self.__queued_out_messages) > 0: out_messages.append(self.__queued_out_messages.pop()) - output_file = "EventService_premerge_%s.tar" % out_messages[0]['id'] + output_file = f"EventService_premerge_{out_messages[0]['id']}.tar" ret_messages = [] try: for out_msg in out_messages: - command = "tar -rf " + output_file + " --directory=%s %s" % (os.path.dirname(out_msg['output']), os.path.basename(out_msg['output'])) + command = "tar -rf " + output_file + f" --directory={os.path.dirname(out_msg['output'])} {os.path.basename(out_msg['output'])}" exit_code, stdout, stderr = execute(command) if exit_code == 0: ret_messages.append(out_msg) @@ -177,13 +196,14 @@ def tarzip_output_es(self): return ret_messages, output_file - def stageout_es_real(self, output_file): # noqa: C901 + def stageout_es_real(self, output_file: str) -> (str, Any, int, str): # noqa: C901 """ Stage out event service output file. - :param output_file: output file name. + :param output_file: output file name (str) + :return: storage (str), storage_id (Any), fsize (int), checksum (str) + :raises StageOutFailure: when stage-out failed. """ - job = self.get_job() logger.info('prepare to stage-out event service files') @@ -264,12 +284,14 @@ def stageout_es_real(self, output_file): # noqa: C901 return file_spec.ddmendpoint, storage_id, file_spec.filesize, file_spec.checksum - def stageout_es(self, force=False): + def stageout_es(self, force: bool = False): """ Stage out event service outputs. + When pilot fails to stage out a file, the file will be added back to the queue for staging out next period. - """ + :param force: force to stage out (bool). + """ job = self.get_job() if len(self.__queued_out_messages): if force or self.__last_stageout_time is None or (time.time() > self.__last_stageout_time + job.infosys.queuedata.es_stageout_gap): @@ -296,10 +318,7 @@ def stageout_es(self, force=False): self.__stageout_failures += 1 def clean(self): - """ - Clean temp produced files - """ - + """Clean temp produced files.""" for msg in self.__all_out_messages: if msg['status'] in ['failed', 'fatal']: pass @@ -321,13 +340,10 @@ def clean(self): self.stop_communicator() - def run(self): - """ - Initialize and run ESProcess. 
- """ - + def run(self) -> None: + """Initialize and run ESProcess.""" try: - logger.info("starting ES GenericExecutor with thread identifier: %s" % (self.ident)) + logger.info(f"starting ES GenericExecutor with thread identifier: {self.ident}") if self.is_set_payload(): payload = self.get_payload() elif self.is_retrieve_payload(): diff --git a/pilot/eventservice/workexecutor/plugins/hpoexecutor.py b/pilot/eventservice/workexecutor/plugins/hpoexecutor.py index 57f88f44..534de1ae 100644 --- a/pilot/eventservice/workexecutor/plugins/hpoexecutor.py +++ b/pilot/eventservice/workexecutor/plugins/hpoexecutor.py @@ -19,10 +19,14 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +"""HPO executor.""" + import json +import logging import os import time import traceback +from typing import Any from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import FileHandlingFailure @@ -30,45 +34,64 @@ from pilot.info.filespec import FileSpec from pilot.util.config import config from pilot.util.filehandling import calculate_checksum - from .baseexecutor import BaseExecutor -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() -""" -HPO Executor -""" - class HPOExecutor(BaseExecutor): + """HPO executor class.""" + def __init__(self, **kwargs): + """ + Initialize HPO executor. + + :param kwargs: kwargs dictionary (dict). + """ super(HPOExecutor, self).__init__(**kwargs) self.setName("HPOExecutor") - self.__queued_out_messages = [] self.__last_stageout_time = None self.__all_out_messages = [] - self.proc = None self.exit_code = None - def is_payload_started(self): + def is_payload_started(self) -> bool: + """ + Check if payload is started. + + :return: True if payload is started, False otherwise (bool). + """ return self.proc.is_payload_started() if self.proc else False - def get_pid(self): + def get_pid(self) -> int: + """ + Get the process id of the payload process. + + :return: the process id of the payload process (int). + """ return self.proc.pid if self.proc else None - def get_exit_code(self): + def get_exit_code(self) -> int: + """ + Get the exit code of the payload process. + + :return: the exit code of the payload process (int). + """ return self.exit_code - def create_file_spec(self, pfn): + def create_file_spec(self, pfn: str) -> FileSpec: + """ + Create a file spec from a pfn. + + :param pfn: physical file name (str) + :return: a file spec (FileSpec). + """ try: checksum = calculate_checksum(pfn, algorithm=config.File.checksum_type) except (FileHandlingFailure, NotImplementedError, Exception) as exc: - logger.warning('caught exception: %s', exc) + logger.warning(f'caught exception: {exc}') checksum = '' # fail later filesize = os.path.getsize(pfn) file_data = {'scope': 'transient', @@ -79,20 +102,19 @@ def create_file_spec(self, pfn): file_spec = FileSpec(filetype='output', **file_data) return file_spec - def update_finished_event_ranges(self, out_messagess): + def update_finished_event_ranges(self, out_messages: Any) -> None: """ - Update finished event ranges + Update finished event ranges. - :param out_messages: messages from AthenaMP. + :param out_messages: messages from AthenaMP (Any). 
""" - logger.info("update_finished_event_ranges:") - if len(out_messagess) == 0: + if len(out_messages) == 0: return event_ranges = [] - for out_msg in out_messagess: + for out_msg in out_messages: fspec = self.create_file_spec(out_msg['output']) event_range_status = {"eventRangeID": out_msg['id'], "eventStatus": 'finished', "pfn": out_msg['output'], "fsize": fspec.filesize} for checksum_key in fspec.checksum: @@ -105,34 +127,34 @@ def update_finished_event_ranges(self, out_messagess): job = self.get_job() job.nevents += len(event_ranges) - def update_failed_event_ranges(self, out_messagess): + def update_failed_event_ranges(self, out_messages: Any) -> None: """ - Update failed event ranges + Update failed event ranges. - :param out_messages: messages from AthenaMP. + :param out_messages: messages from AthenaMP (Any). """ - if len(out_messagess) == 0: + if len(out_messages) == 0: return event_ranges = [] - for message in out_messagess: + for message in out_messages: status = message['status'] if message['status'] in ['failed', 'fatal'] else 'failed' # ToBeFixed errorCode event_ranges.append({"errorCode": errors.UNKNOWNPAYLOADFAILURE, "eventRangeID": message['id'], "eventStatus": status}) event_range_message = {'version': 0, 'eventRanges': json.dumps(event_ranges)} self.update_events(event_range_message) - def handle_out_message(self, message): + def handle_out_message(self, message: dict): """ Handle ES output or error messages hook function for tests. - :param message: a dict of parsed message. - For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , + For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , 'wall': , 'message': }. - Fro 'failed' event ranges, it's {'id': , 'status': 'failed', 'message': }. - """ + For 'failed' event ranges, it's {'id': , 'status': 'failed', 'message': }. - logger.info("Handling out message: %s" % message) + :param message: a dict of parsed message (dict). + """ + logger.info(f"Handling out message: {message}") self.__all_out_messages.append(message) @@ -141,14 +163,13 @@ def handle_out_message(self, message): else: self.__queued_out_messages.append(message) - def stageout_es(self, force=False): + def stageout_es(self, force: bool = False): """ Stage out event service outputs. + :param force: force stage out (bool). """ - job = self.get_job() - # logger.info("job.infosys.queuedata.es_stageout_gap: %s" % job.infosys.queuedata.es_stageout_gap) if len(self.__queued_out_messages): if force or self.__last_stageout_time is None or (time.time() > self.__last_stageout_time + job.infosys.queuedata.es_stageout_gap): out_messages = [] @@ -157,10 +178,7 @@ def stageout_es(self, force=False): self.update_finished_event_ranges(out_messages) def clean(self): - """ - Clean temp produced files - """ - + """Clean temp produced files.""" logger.info("shutting down...") self.__queued_out_messages = [] @@ -175,21 +193,19 @@ def clean(self): self.stop_communicator() def run(self): - """ - Initialize and run ESProcess. - """ + """Initialize and run ESProcess.""" try: - logger.info("starting ES HPOExecutor with thread ident: %s" % self.ident) + logger.info(f"starting ES HPOExecutor with thread ident: {self.ident}") if self.is_set_payload(): payload = self.get_payload() elif self.is_retrieve_payload(): payload = self.retrieve_payload() else: - logger.error("Payload is not set but is_retrieve_payload is also not set. No payloads.") + logger.error("payload is not set but is_retrieve_payload is also not set. 
No payloads.") - logger.info("payload: %s" % payload) + logger.info(f"payload: {payload}") - logger.info("Starting ESProcess") + logger.info("starting ESProcess") proc = ESProcess(payload, waiting_time=999999) self.proc = proc logger.info("ESProcess initialized") @@ -202,21 +218,18 @@ def run(self): logger.info('ESProcess started to run') exit_code = None - try: - iteration = long(0) # Python 2 # noqa: F821 - except Exception: - iteration = 0 # Python 3 + iteration = 0 while proc.is_alive(): iteration += 1 if self.is_stop(): - logger.info('Stop is set. breaking -- stop process pid=%s' % proc.pid) + logger.info(f'stop is set. breaking -- stop process pid={proc.pid}') proc.stop() break self.stageout_es() exit_code = proc.poll() if iteration % 60 == 0: - logger.info('running: iteration=%d pid=%s exit_code=%s' % (iteration, proc.pid, exit_code)) + logger.info(f'running: iteration={iteration} pid={proc.pid} exit_code={exit_code}') time.sleep(5) while proc.is_alive(): @@ -228,8 +241,8 @@ def run(self): self.exit_code = proc.poll() - except Exception as e: - logger.error('Execute payload failed: %s, %s' % (e, traceback.format_exc())) + except Exception as exc: + logger.error(f'execute payload failed: {exc}, {traceback.format_exc()}') self.clean() self.exit_code = -1 logger.info('ES HPO executor finished') diff --git a/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py b/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py index 5ae2ae4b..47e639c3 100644 --- a/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py +++ b/pilot/eventservice/workexecutor/plugins/raythenaexecutor.py @@ -20,10 +20,14 @@ # - Miha Muskinja, miha.muskinja@cern.ch, 2020 # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +"""Raythena executor.""" + import json +import logging import os import time import traceback +from typing import Any from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import FileHandlingFailure @@ -31,45 +35,64 @@ from pilot.info.filespec import FileSpec from pilot.util.config import config from pilot.util.filehandling import calculate_checksum, move - from .baseexecutor import BaseExecutor -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() -""" -Raythena Executor with one process to manage EventService -""" - class RaythenaExecutor(BaseExecutor): + """Raythena executor class.""" + def __init__(self, **kwargs): + """ + Initialize Raythena executor. + + :param kwargs: kwargs dictionary (dict). + """ super(RaythenaExecutor, self).__init__(**kwargs) self.setName("RaythenaExecutor") - self.__queued_out_messages = [] self.__last_stageout_time = None self.__all_out_messages = [] - self.proc = None self.exit_code = None - def is_payload_started(self): + def is_payload_started(self) -> bool: + """ + Check if payload is started. + + :return: True if payload is started, False otherwise (bool). + """ return self.proc.is_payload_started() if self.proc else False - def get_pid(self): + def get_pid(self) -> int: + """ + Get the process id of the payload process. + + :return: the process id of the payload process (int). + """ return self.proc.pid if self.proc else None - def get_exit_code(self): + def get_exit_code(self) -> int: + """ + Get the exit code of the payload process. + + :return: the exit code of the payload process (int). + """ return self.exit_code - def create_file_spec(self, pfn): + def create_file_spec(self, pfn: str) -> FileSpec: + """ + Create a FileSpec object from a given PFN. + + :param pfn: physical file name (string). 
+ :return: a FileSpec object (FileSpec). + """ try: checksum = calculate_checksum(pfn, algorithm=config.File.checksum_type) except (FileHandlingFailure, NotImplementedError, Exception) as exc: - logger.warning('caught exception: %s', exc) + logger.warning(f'caught exception: {exc}') checksum = '' # fail later filesize = os.path.getsize(pfn) @@ -79,39 +102,38 @@ def create_file_spec(self, pfn): 'filesize': filesize, } file_spec = FileSpec(filetype='output', **file_data) + return file_spec - def move_output(self, pfn): + def move_output(self, pfn: str): """ Move output file from given PFN path to PILOT_OUTPUT_DIR if set. - :param pfn: physical file name (string). - :return: + :param pfn: physical file name (str). """ - outputdir = os.environ.get('PILOT_OUTPUT_DIR', None) if outputdir: try: move(pfn, outputdir) - except Exception as e: - logger.warning('failed to move output: %s' % e) + except Exception as exc: + logger.warning(f'failed to move output: {exc}') - def update_finished_event_ranges(self, out_messagess): + def update_finished_event_ranges(self, out_messages: Any) -> None: """ - Update finished event ranges + Update finished event ranges. - :param out_messages: messages from AthenaMP. + :param out_messages: messages from AthenaMP (Any). """ - logger.info("update_finished_event_ranges:") - if len(out_messagess) == 0: + if len(out_messages) == 0: return event_ranges = [] - for out_msg in out_messagess: + for out_msg in out_messages: fspec = self.create_file_spec(out_msg['output']) - event_range_status = {"eventRangeID": out_msg['id'], "eventStatus": 'finished', "pfn": out_msg['output'], "fsize": fspec.filesize} + event_range_status = {"eventRangeID": out_msg['id'], "eventStatus": 'finished', "pfn": out_msg['output'], + "fsize": fspec.filesize} for checksum_key in fspec.checksum: event_range_status[checksum_key] = fspec.checksum[checksum_key] event_ranges.append(event_range_status) @@ -126,34 +148,35 @@ def update_finished_event_ranges(self, out_messagess): job = self.get_job() job.nevents += len(event_ranges) - def update_failed_event_ranges(self, out_messagess): + def update_failed_event_ranges(self, out_messages: Any) -> None: """ - Update failed event ranges + Update failed event ranges. - :param out_messages: messages from AthenaMP. + :param out_messages: messages from AthenaMP (Any). """ - if len(out_messagess) == 0: + if len(out_messages) == 0: return event_ranges = [] - for message in out_messagess: + for message in out_messages: status = message['status'] if message['status'] in ['failed', 'fatal'] else 'failed' # ToBeFixed errorCode event_ranges.append({"errorCode": errors.UNKNOWNPAYLOADFAILURE, "eventRangeID": message['id'], "eventStatus": status}) event_range_message = {'version': 0, 'eventRanges': json.dumps(event_ranges)} self.update_events(event_range_message) - def handle_out_message(self, message): + def handle_out_message(self, message: dict): """ Handle ES output or error messages hook function for tests. - :param message: a dict of parsed message. - For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , + Example: + For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , 'wall': , 'message': }. - Fro 'failed' event ranges, it's {'id': , 'status': 'failed', 'message': }. - """ + For 'failed' event ranges, it's {'id': , 'status': 'failed', 'message': }. - logger.info("Handling out message: %s" % message) + :param message: dictionary of parsed message (dict). 
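# Editor's sketch (not part of the patch): constructing an output FileSpec the way
# create_file_spec() above does; the lfn, checksum and size values are purely illustrative.
from pilot.info.filespec import FileSpec

file_data = {'scope': 'transient',
             'lfn': 'HITS.4015567-3.pool.root',
             'checksum': 'ad:b1018336',
             'filesize': 13669}
file_spec = FileSpec(filetype='output', **file_data)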
+ """ + logger.info(f"Handling out message: {message}") self.__all_out_messages.append(message) @@ -162,14 +185,13 @@ def handle_out_message(self, message): else: self.__queued_out_messages.append(message) - def stageout_es(self, force=False): + def stageout_es(self, force: bool = False): """ Stage out event service outputs. + :param force: force stage out (bool). """ - job = self.get_job() - # logger.info("job.infosys.queuedata.es_stageout_gap: %s" % job.infosys.queuedata.es_stageout_gap) if len(self.__queued_out_messages): if force or self.__last_stageout_time is None or (time.time() > self.__last_stageout_time + job.infosys.queuedata.es_stageout_gap): out_messages = [] @@ -178,10 +200,7 @@ def stageout_es(self, force=False): self.update_finished_event_ranges(out_messages) def clean(self): - """ - Clean temp produced files - """ - + """Clean temp produced files.""" logger.info("shutting down...") self.__queued_out_messages = [] @@ -196,21 +215,19 @@ def clean(self): self.stop_communicator() def run(self): - """ - Initialize and run ESProcess. - """ + """Initialize and run ESProcess.""" try: - logger.info("starting ES RaythenaExecutor with thread ident: %s" % self.ident) + logger.info(f"starting ES RaythenaExecutor with thread ident: {self.ident}") if self.is_set_payload(): payload = self.get_payload() elif self.is_retrieve_payload(): payload = self.retrieve_payload() else: - logger.error("Payload is not set but is_retrieve_payload is also not set. No payloads.") + logger.error("payload is not set but is_retrieve_payload is also not set. No payloads.") - logger.info("payload: %s" % payload) + logger.info(f"payload: {payload}") - logger.info("Starting ESProcess") + logger.info("starting ESProcess") proc = ESProcess(payload, waiting_time=999999) self.proc = proc logger.info("ESProcess initialized") @@ -222,22 +239,18 @@ def run(self): proc.start() logger.info('ESProcess started to run') - exit_code = None - try: - iteration = long(0) # Python 2 # noqa: F821 - except Exception: - iteration = 0 # Python 3 + iteration = 0 while proc.is_alive(): iteration += 1 if self.is_stop(): - logger.info('Stop is set. breaking -- stop process pid=%s' % proc.pid) + logger.info(f'Stop is set. 
breaking -- stop process pid={proc.pid}') proc.stop() break self.stageout_es() exit_code = proc.poll() if iteration % 60 == 0: - logger.info('running: iteration=%d pid=%s exit_code=%s' % (iteration, proc.pid, exit_code)) + logger.info(f'running: iteration={iteration} pid={proc.pid} exit_code={exit_code}') time.sleep(5) while proc.is_alive(): @@ -249,8 +262,8 @@ def run(self): self.exit_code = proc.poll() - except Exception as e: - logger.error('Execute payload failed: %s, %s' % (e, traceback.format_exc())) + except Exception as exc: + logger.error(f'execute payload failed: {exc}, {traceback.format_exc()}') self.clean() self.exit_code = -1 logger.info('ES raythena executor finished') diff --git a/pilot/eventservice/workexecutor/workexecutor.py b/pilot/eventservice/workexecutor/workexecutor.py index f9c27182..945bf21f 100644 --- a/pilot/eventservice/workexecutor/workexecutor.py +++ b/pilot/eventservice/workexecutor/workexecutor.py @@ -20,23 +20,27 @@ # - Wen Guan, wen.guan@cern.ch, 2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +"""Base executor - Main class to manage the event service work.""" +import logging import time +from typing import Any from pilot.common import exception from pilot.common.pluginfactory import PluginFactory -import logging logger = logging.getLogger(__name__) -""" -Main class to manage the event service work. -""" - class WorkExecutor(PluginFactory): + """Work executor class.""" - def __init__(self, args=None): + def __init__(self, args: Any = None): + """ + Initialize work executor. + + :param args: args dictionary (Any). + """ super(WorkExecutor, self).__init__() self.payload = None self.plugin = None @@ -44,19 +48,40 @@ def __init__(self, args=None): self.args = args self.pid = None - def get_pid(self): + def get_pid(self) -> int: + """ + Return the pid of the payload process. + + :return: pid (int). + """ return self.plugin.get_pid() if self.plugin else None - def set_payload(self, payload): + def set_payload(self, payload: Any): + """ + Set the payload. + + :param payload: payload (Any). + """ self.payload = payload - def set_retrieve_paylaod(self): + def set_retrieve_payload(self): + """Set the payload to be retrieved.""" self.is_retrieve_payload = True - def get_payload(self): + def get_payload(self) -> Any: + """ + Return the payload. + + :return: payload (Any). + """ return self.payload - def get_plugin_confs(self): + def get_plugin_confs(self) -> dict: + """ + Return the plugin configurations. + + :return: plugin configurations (dict). + """ plugin_confs = {} if self.args and 'executor_type' in list(self.args.keys()): # Python 2/3 if self.args['executor_type'] == 'hpo': @@ -75,17 +100,25 @@ def get_plugin_confs(self): plugin_confs = {'class': 'pilot.eventservice.workexecutor.plugins.hammercloudexecutor.HammerCloudExecutor'} elif self.args['executor_type'] == 'mpi': # network-less plugin_confs = {'class': 'pilot.eventservice.workexecutor.plugins.mpiexecutor.MPIExecutor'} + elif self.args['executor_type'] == 'fineGrainedProc': + plugin_confs = {'class': 'pilot.eventservice.workexecutor.plugins.finegrainedprocexecutor.FineGrainedProcExecutor'} else: plugin_confs = {'class': 'pilot.eventservice.workexecutor.plugins.genericexecutor.GenericExecutor'} plugin_confs['args'] = self.args + return plugin_confs def start(self): + """ + Start the work executor. + + :raises SetupFailure: if no available executor plugin. 
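# Editor's sketch (not part of the patch): selecting the new 'fineGrainedProc' plugin through
# the executor_type mapping above; the args content and payload description are illustrative only.
import time

from pilot.eventservice.workexecutor.workexecutor import WorkExecutor

executor = WorkExecutor(args={'executor_type': 'fineGrainedProc'})
executor.set_payload({'executable': 'runGen.sh'})  # hypothetical payload description
executor.start()
while executor.is_alive():
    time.sleep(1)
exit_code = executor.get_exit_code()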
+ """ plugin_confs = self.get_plugin_confs() - logger.info("Plugin confs: %s" % plugin_confs) + logger.info(f"Plugin confs: {plugin_confs}") self.plugin = self.get_plugin(plugin_confs) - logger.info("WorkExecutor started with plugin: %s" % self.plugin) + logger.info(f"WorkExecutor started with plugin: {self.plugin}") if not self.plugin: raise exception.SetupFailure("No available executor plugin.") @@ -97,36 +130,72 @@ def start(self): else: self.plugin.set_payload(self.get_payload()) - logger.info("Starting plugin: %s" % self.plugin) + logger.info(f"Starting plugin: {self.plugin}") self.plugin.start() logger.info("Waiting for payload to start") while self.plugin.is_alive(): if self.plugin.is_payload_started(): - logger.info("Payload started with pid: %s" % self.get_pid()) + logger.info(f"Payload started with pid: {self.get_pid()}") break time.sleep(1) - def stop(self): + def stop(self) -> int: + """ + Stop the work executor. + + :return: exit code (int) + :raises SetupFailure: if no available executor plugin. + """ if not self.plugin: raise exception.SetupFailure("No available executor plugin.") + return self.plugin.stop() - def is_alive(self): + def is_alive(self) -> bool: + """ + Check if the work executor is alive. + + :return: True if alive, otherwise False (bool) + :raises SetupFailure: if no available executor plugin. + """ if not self.plugin: raise exception.SetupFailure("No available executor plugin.") + return self.plugin.is_alive() - def get_exit_code(self): + def get_exit_code(self) -> int: + """ + Return the exit code. + + :return: exit code (int) + :raises SetupFailure: if no available executor plugin. + """ if not self.plugin: raise exception.SetupFailure("No available executor plugin.") + return self.plugin.get_exit_code() - def get_event_ranges(self): + def get_event_ranges(self) -> list: + """ + Get event ranges. + + :return: event ranges (list) + :raises SetupFailure: if no available executor plugin. + """ if not self.plugin: raise exception.SetupFailure("No available executor plugin.") + return self.plugin.get_event_ranges() - def update_events(self, messages): + def update_events(self, messages: Any) -> bool: + """ + Update events. + + :param messages: messages (Any) + :return: True if events are updated, otherwise False (bool) + :raises SetupFailure: if no available executor plugin. + """ if not self.plugin: raise exception.SetupFailure("No available executor plugin.") + return self.plugin.update_events(messages) diff --git a/pilot/info/__init__.py b/pilot/info/__init__.py index 5ea8a159..661993e1 100644 --- a/pilot/info/__init__.py +++ b/pilot/info/__init__.py @@ -32,31 +32,30 @@ """ +import logging +from collections import namedtuple +from typing import Any + from .infoservice import InfoService from .jobinfo import JobInfoProvider # noqa from .jobdata import JobData # noqa from .filespec import FileSpec # noqa - from pilot.common.exception import PilotException -from collections import namedtuple -import logging logger = logging.getLogger(__name__) -def set_info(args): ## should be DEPRECATED: use `infosys.init(queuename)` +def set_info(args: Any): ## should be DEPRECATED: use `infosys.init(queuename)` """ Set up all necessary site information for given PandaQueue name. Resolve everything from the specified queue name (passed via `args.queue`) and fill extra lookup structure (Populate `args.info`). - raise PilotException in case of errors. 
- - :param args: input (shared) arguments - :return: None + :param args: input (shared) arguments (Any) + :raises PilotException: in case of errors. """ - # ## initialize info service + # initialize info service infosys.init(args.queue) args.info = namedtuple('info', ['queue', 'infoservice', @@ -65,7 +64,7 @@ def set_info(args): ## should be DEPRECATED: use `infosys.init(queuename)` # 'site_info', 'storages_info']) args.info.queue = args.queue - args.info.infoservice = infosys # ## THIS is actually for tests and redundant - the pilot.info.infosys should be used + args.info.infoservice = infosys # THIS is actually for tests and redundant - the pilot.info.infosys should be used # args.infoservice = infosys # ?? # check if queue is ACTIVE @@ -90,11 +89,6 @@ def set_info(args): ## should be DEPRECATED: use `infosys.init(queuename)` #args.info.sites_info = infosys.sites_info - logger.info('queue: %s' % args.info.queue) - #logger.info('site: %s' % args.info.site) - #logger.info('storages: %s' % args.info.storages) - #logger.info('queuedata: %s' % args.info.infoservice.queuedata) - # global InfoService Instance without Job specific settings applied (singleton shared object) # normally we should create such instance for each job to properly consider overwrites coming from JonInfoProvider diff --git a/pilot/info/basedata.py b/pilot/info/basedata.py index 292428be..81676f4c 100644 --- a/pilot/info/basedata.py +++ b/pilot/info/basedata.py @@ -20,6 +20,8 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 """ +Base data class. + The implementation of base data structure to host various settings collected from external source with built-in validation and schema translation support. @@ -28,7 +30,6 @@ - introduce internal information schema (names of attribues) to remove dependency with data structrure, formats, names from external sources (e.g. AGIS/CRIC) - :author: Alexey Anisenkov :contact: anisyonk@cern.ch :date: January 2018 @@ -37,27 +38,30 @@ import ast import copy import logging +from typing import Any + logger = logging.getLogger(__name__) -class BaseData(object): +class BaseData: """ - High-level object to host structured data collected from external source - It's considered to be like a bridge (connector) in order to remove direct dependency to - external schema (format) implementation + Base data class. + + High-level object to host structured data collected from external source + It's considered to be like a bridge (connector) in order to remove direct dependency to + external schema (format) implementation """ _keys = {} - def _load_data(self, data, kmap={}, validators=None): + def _load_data(self, data: dict, kmap: dict = {}, validators: dict = None): """ - Construct and initialize data from ext source. + Construct and initialize data from ext source. - :param data: input dictionary of raw data settings - :param kmap: the translation map of data attributes from external format to internal schema - :param validators: map of validation handlers to be applied + :param data: input dictionary of raw data settings (dict) + :param kmap: the translation map of data attributes from external format to internal schema (dict) + :param validators: map of validation handlers to be applied (dict). 
""" - # the translation map of the queue data attributes from external data to internal schema # 'internal_name':('ext_name1', 'extname2_if_any') # 'internal_name2':'ext_name3' @@ -100,7 +104,7 @@ def _load_data(self, data, kmap={}, validators=None): if callable(hvalidator): value = hvalidator(raw, ktype, kname, defval=copy.deepcopy(getattr(self, kname, None))) ## apply custom validation if defined - hvalidator = getattr(self, 'clean__%s' % kname, None) + hvalidator = getattr(self, f'clean__{kname}', None) if callable(hvalidator): value = hvalidator(raw, value) @@ -108,26 +112,25 @@ def _load_data(self, data, kmap={}, validators=None): self.clean() - def clean(self): - """ - Validate and finally clean up required data values (required object properties) if need - Executed once all fields have already passed field-specific validation checks - Could be customized by child object - :return: None + def clean(self) -> None: """ - pass + Validate and finally clean up required data values (required object properties if needed. - ## - ## default validators - ## - def clean_numeric(self, raw, ktype, kname=None, defval=0): + Executed once all fields have already passed field-specific validation checks. + Could be customized by child object. """ - Clean and convert input value to requested numeric type - :param raw: raw input data - :param ktype: variable type to which result should be casted - :param defval: default value to be used in case of cast error + return + + def clean_numeric(self, raw: Any, ktype: Any, kname: Any = None, defval: int = 0) -> Any: """ + Clean and convert input value to requested numeric type. + :param raw: raw input data (Any) + :param ktype: variable type to which result should be cast (Any) + :param kname: name of the variable (Any) + :param defval: default value to be used in case of cast error (int). + :return: cleaned value (Any). + """ if isinstance(raw, ktype): return raw @@ -141,36 +144,40 @@ def clean_numeric(self, raw, ktype, kname=None, defval=0): logger.warning(f'failed to convert data for key={kname}, raw={raw} to type={ktype}, defval={defval}') return defval - def clean_string(self, raw, ktype, kname=None, defval=""): - """ - Clean and convert input value to requested string type - :param raw: raw input data - :param ktype: variable type to which result should be casted - :param defval: default value to be used in case of cast error + def clean_string(self, raw: Any, ktype: Any, kname: Any = None, defval: str = "") -> Any: """ + Clean and convert input value to requested string type. + :param raw: raw input data (Any) + :param ktype: variable type to which result should be cast (Any) + :param kname: name of the variable (Any) + :param defval: default value to be used in case of cast error (str). + :return: cleaned value (Any). 
+ """ if isinstance(raw, ktype): return raw if raw is None: return defval - else: - if isinstance(raw, str): - raw = raw.strip() + + if isinstance(raw, str): + raw = raw.strip() try: return ktype(raw) except Exception: logger.warning(f'failed to convert data for key={kname}, raw={raw} to type={ktype}') return defval - def clean_boolean(self, raw, ktype, kname=None, defval=None): - """ - Clean and convert input value to requested boolean type - :param raw: raw input data - :param ktype: variable type to which result should be casted - :param defval: default value to be used in case of cast error + def clean_boolean(self, raw: Any, ktype: Any, kname: Any = None, defval: Any = None) -> Any: """ + Clean and convert input value to requested boolean type. + :param raw: raw input data (Any) + :param ktype: variable type to which result should be cast (Any) + :param kname: name of the variable (Any) + :param defval: default value to be used in case of cast error (Any) + :return: cleaned value (Any). + """ if isinstance(raw, ktype): return raw @@ -184,23 +191,25 @@ def clean_boolean(self, raw, ktype, kname=None, defval=None): logger.warning(f'failed to convert data for key={kname}, raw={raw} to type={ktype}') return defval - return val.lower() in ['1', 'true', 'yes'] + return val.lower() in {'1', 'true', 'yes'} - def clean_dictdata(self, raw, ktype, kname=None, defval=None): - """ - Clean and convert input value to requested dict type - :param raw: raw input data - :param ktype: variable type to which result should be casted - :param defval: default value to be used in case of cast error + def clean_dictdata(self, raw: Any, ktype: Any, kname: Any = None, defval: Any = None) -> Any: """ + Clean and convert input value to requested dict type. + :param raw: raw input data (Any) + :param ktype: variable type to which result should be cast (Any) + :param kname: name of the variable (Any) + :param defval: default value to be used in case of cast error (Any) + :return: cleaned value (Any). + """ if isinstance(raw, str): raw = ast.literal_eval(raw) if isinstance(raw, ktype): return raw - elif raw is None: + if raw is None: return defval try: return ktype(raw) @@ -208,22 +217,24 @@ def clean_dictdata(self, raw, ktype, kname=None, defval=None): logger.warning(f'failed to convert data for key={kname}, raw={raw} to type={ktype}') return defval - def clean_listdata(self, raw, ktype, kname=None, defval=None): - """ - Clean and convert input value to requested list type - :param raw: raw input data - :param ktype: variable type to which result should be casted - :param defval: default value to be used in case of cast error + def clean_listdata(self, raw: Any, ktype: Any, kname: Any = None, defval: Any = None) -> Any: """ + Clean and convert input value to requested list type. + :param raw: raw input data (Any) + :param ktype: variable type to which result should be cast (Any) + :param kname: name of the variable (Any) + :param defval: default value to be used in case of cast error (Any) + :return: cleaned value (Any). + """ if isinstance(raw, ktype): return raw - elif raw is None: + if raw is None: return defval - else: - if isinstance(raw, str): - raw = raw.split(',') + + if isinstance(raw, str): + raw = raw.split(',') try: return ktype(raw) except Exception: @@ -237,14 +248,16 @@ def clean_listdata(self, raw, ktype, kname=None, defval=None): # # return value - def __repr__(self): - """ - Default representation of an object + def __repr__(self) -> str: """ + Represent data as string. 
+ :return: representation (str). + """ ret = [] attrs = [key for key in dir(self) if not callable(getattr(self, key)) and not key.startswith('_')] for key in sorted(attrs): - ret.append(" %s=%s" % (key, getattr(self, key))) + ret.append(f" {key}={getattr(self, key)}") ret.append('') + return '\n'.join(ret) diff --git a/pilot/info/configinfo.py b/pilot/info/configinfo.py index 7842916b..13255411 100644 --- a/pilot/info/configinfo.py +++ b/pilot/info/configinfo.py @@ -20,52 +20,63 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2023 """ -Pilot Config specific info provider mainly used to customize Queue, Site, etc data of Information Service -with details fetched directly from local Pilot instance configuration +Pilot Config specific info provider. + +Mainly used to customize Queue, Site, etc data of Information Service with details fetched directly from local +Pilot instance configuration. :author: Alexey Anisenkov :contact: anisyonk@cern.ch :date: January 2018 """ +import ast +import logging +from typing import Any + from ..util.config import config -import logging logger = logging.getLogger(__name__) -class PilotConfigProvider(object): +class PilotConfigProvider: """ - Info provider which is used to extract settings specific for local Pilot instance - and overwrite general configuration used by Information Service + Pilot Config provider class. + + Info provider which is used to extract settings specific for local Pilot instance + and overwrite general configuration used by Information Service. """ config = None # Pilot Config instance - def __init__(self, conf=None): - self.config = conf or config + def __init__(self, conf: Any = None): + """ + Init class instance. - def resolve_schedconf_sources(self): + :param conf: Pilot Config instance (Any). """ - Resolve prioritized list of source names to be used for SchedConfig data load - :return: prioritized list of source names + self.config = conf or config + + def resolve_schedconf_sources(self) -> None: """ + Resolve prioritized list of source names to be used for SchedConfig data load. + Could return a prioritized list of source names (list). + """ # ## FIX ME LATER # an example of return data: # return ['AGIS', 'LOCAL', 'CVMFS'] return None # ## Not implemented yet - def resolve_queuedata(self, pandaqueue, **kwargs): + def resolve_queuedata(self, pandaqueue: str, **kwargs: dict) -> dict: """ - Resolve queue data details + Resolve queue data details. - :param pandaqueue: name of PandaQueue - :return: dict of settings for given PandaQueue as a key + :param pandaqueue: name of PandaQueue (str) + :param kwargs: other parameters (dict) + :return: dictionary of settings for given PandaQueue as a key (dict). 
""" - - import ast data = { 'maxwdir_broken': self.config.Pilot.maximum_input_file_sizes, # ## Config API is broken -- FIXME LATER #'container_type': 'singularity:pilot;docker:wrapper', # ## for testing @@ -77,6 +88,6 @@ def resolve_queuedata(self, pandaqueue, **kwargs): if hasattr(self.config.Information, 'acopytools'): ## FIX ME LATER: Config API should reimplemented/fixed later data['acopytools'] = ast.literal_eval(self.config.Information.acopytools) - logger.info('queuedata: following keys will be overwritten by config values: %s' % data) + logger.info(f'queuedata: following keys will be overwritten by config values: {data}') return {pandaqueue: data} diff --git a/pilot/info/dataloader.py b/pilot/info/dataloader.py index 0636c384..9bea57e0 100644 --- a/pilot/info/dataloader.py +++ b/pilot/info/dataloader.py @@ -20,120 +20,157 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 """ -Base loader class to retrive data from Ext sources (file, url) +Base loader class to retrieve data from Ext sources (file, url). :author: Alexey Anisenkov :contact: anisyonk@cern.ch :date: January 2018 """ +import json +import logging import os import time -import json import urllib.request import urllib.error import urllib.parse +from datetime import ( + datetime, + timedelta +) +from typing import Any -from datetime import datetime, timedelta from pilot.util.timer import timeout from pilot.util.https import ctx -import logging logger = logging.getLogger(__name__) -class DataLoader(object): - """ - Base data loader - """ +class DataLoader: + """Base data loader.""" @classmethod - def is_file_expired(self, fname, cache_time=0): + def is_file_expired(cls, fname: str, cache_time: int = 0) -> bool: """ Check if file fname is older than cache_time seconds from its last_update_time. - :param fname: File name. - :param cache_time: Cache time in seconds. - :return: Boolean. + :param fname: file name (str) + :param cache_time: cache time in seconds (int) + :return: True if file is expired, False otherwise (bool). """ - if cache_time: - lastupdate = self.get_file_last_update_time(fname) + lastupdate = cls.get_file_last_update_time(fname) return not (lastupdate and datetime.now() - lastupdate < timedelta(seconds=cache_time)) return True @classmethod - def get_file_last_update_time(self, fname): + def get_file_last_update_time(cls, fname: str) -> datetime or None: """ Return the last update time of the given file. - :param fname: File name. - :return: Last update time in seconds or None if file does not exist. + :param fname: file name (str) + :return: last update time in seconds or None if file does not exist (datetime or None). """ - try: lastupdate = datetime.fromtimestamp(os.stat(fname).st_mtime) - except Exception: + except OSError: lastupdate = None return lastupdate @classmethod # noqa: C901 - def load_url_data(self, url, fname=None, cache_time=0, nretry=3, sleep_time=60): # noqa: C901 + def load_url_data(cls, url: str, fname: str = None, cache_time: int = 0, nretry: int = 3, sleep_time: int = 60) -> Any: # noqa: C901 """ Download data from url or file resource and optionally save it into cache file fname. + The file will not be (re-)loaded again if cache age from last file modification does not exceed cache_time seconds. - If url is None then data will be read from cache file fname (if any) + If url is None then data will be read from cache file fname (if any). - :param url: Source of data - :param fname: Cache file name. If given then loaded data will be saved into it. 
- :param cache_time: Cache time in seconds. - :param nretry: Number of retries (default is 3). - :param sleep_time: Sleep time (default is 60 s) between retry attempts. - :return: data loaded from the url or file content if url passed is a filename. + :param url: URL to source of data (str) + :param fname: cache file name. If given then loaded data will be saved into it (str) + :param cache_time: cache time in seconds (int) + :param nretry: number of retries (default is 3) (int) + :param sleep_time: sleep time (default is 60 s) between retry attempts (int) + :return: data loaded from the url or file content if url passed is a filename (Any). """ - @timeout(seconds=20) - def _readfile(url): + def _readfile(url: str) -> str: + """ + Read file content. + + :param url: file name (str) + :return: file content (str). + """ if os.path.isfile(url): - with open(url, "r") as f: - content = f.read() + try: + with open(url, "r", encoding='utf-8') as f: + content = f.read() + except (OSError, UnicodeDecodeError) as exc: + logger.warning(f"failed to read file {url}: {exc}") + content = "" + return content + return "" + + def _readurl(url: str, _timeout: int = 20) -> str: + """ + Read url content. + + :param url: url (str) + :return: url content (str). + """ + req = urllib.request.Request(url) + req.add_header('User-Agent', ctx.user_agent) + try: + with urllib.request.urlopen(req, context=ctx.ssl_context, timeout=_timeout) as response: + content = response.read() + except urllib.error.URLError as exc: + logger.warning(f"error occurred with urlopen: {exc.reason}") + # Handle the error, set content to None or handle as needed + content = "" + + return content + content = None - if url and self.is_file_expired(fname, cache_time): # load data into temporary cache file + if url and cls.is_file_expired(fname, cache_time): # load data into temporary cache file for trial in range(1, nretry + 1): if content: break try: native_access = '://' not in url ## trival check for file access, non accurate.. FIXME later if need if native_access: - logger.info('[attempt=%s/%s] loading data from file=%s' % (trial, nretry, url)) + logger.info(f'[attempt={trial}/{nretry}] loading data from file {url}') content = _readfile(url) else: - logger.info('[attempt=%s/%s] loading data from url=%s' % (trial, nretry, url)) + logger.info(f'[attempt={trial}/{nretry}] loading data from url {url}') req = urllib.request.Request(url) req.add_header('User-Agent', ctx.user_agent) - content = urllib.request.urlopen(req, context=ctx.ssl_context, timeout=20).read() + content = _readurl(url) if fname: # save to cache - with open(fname, "w+") as f: + with open(fname, "w+", encoding='utf-8') as _file: if isinstance(content, bytes): # if-statement will always be needed for python 3 content = content.decode("utf-8") - f.write(content) - logger.info('saved data from "%s" resource into file=%s, length=%.1fKb' % - (url, fname, len(content) / 1024.)) + + if content: + _file.write(content) + logger.info(f'saved data from \"{url}\" resource into file {fname}, ' + f'length={len(content) / 1024.:.1f} kB') + else: + logger.warning('no data to save into cache file') + continue + return content - except Exception as e: # ignore errors, try to use old cache if any - logger.warning('failed to load data from url=%s, error: %s .. trying to use data from cache=%s' % - (url, e, fname)) + except Exception as exc: # ignore errors, try to use old cache if any + logger.warning(f"failed to load data from url {url}, error: {exc} .. 
trying to use data from cache={fname}") # will try to use old cache below if trial < nretry: xsleep_time = sleep_time() if callable(sleep_time) else sleep_time - logger.info("will try again after %ss.." % xsleep_time) + logger.info(f"will try again after {xsleep_time} s..") time.sleep(xsleep_time) if content is not None: # just loaded data @@ -141,30 +178,31 @@ def _readfile(url): # read data from old cache fname try: - with open(fname, 'r') as f: + with open(fname, 'r', encoding='utf-8') as f: content = f.read() - except Exception as e: - logger.warning("cache file=%s is not available: %s .. skipped" % (fname, e)) + except (OSError, UnicodeDecodeError) as exc: + logger.warning(f"cache file={fname} is not available: {exc} .. skipped") return None return content @classmethod - def load_data(self, sources, priority, cache_time=60, parser=None): + def load_data(cls, sources: dict, priority: list, cache_time: int = 60, parser: Any = None) -> Any: """ Download data from various sources (prioritized). + Try to get data from sources according to priority values passed Expected format of source entry: - sources = {'NAME':{'url':"source url", 'nretry':int, 'fname':'cache file (optional)', 'cache_time':int (optional), 'sleep_time':opt}} - - :param sources: Dict of source configuration - :param priority: Ordered list of source names - :param cache_time: Default cache time in seconds. Can be overwritten by cache_time value passed in sources dict - :param parser: Callback function to interpret/validate data which takes read data from source as input. Default is json.loads - :return: Data loaded and processed by parser callback + sources = {'NAME':{'url':"source url", 'nretry':int, 'fname':'cache file (optional)', + 'cache_time':int (optional), 'sleep_time':opt}} + + :param sources: dict of source configuration (dict) + :param priority: ordered list of source names (list) + :param cache_time: default cache time in seconds. Can be overwritten by cache_time value passed in sources (dict) + :param parser: callback function to interpret/validate data which takes read data from source as input. Default is json.loads (Any) + :return: data loaded and processed by parser callback (Any) """ - if not priority: # no priority set ## randomly order if need (FIX ME LATER) priority = list(sources.keys()) @@ -177,7 +215,7 @@ def load_data(self, sources, priority, cache_time=60, parser=None): idat = dict([k, dat.get(k)] for k in accepted_keys if k in dat) idat.setdefault('cache_time', cache_time) - content = self.load_url_data(**idat) + content = cls.load_url_data(**idat) if isinstance(content, bytes): content = content.decode("utf-8") logger.debug('converted content to utf-8') @@ -189,13 +227,14 @@ def load_data(self, sources, priority, cache_time=60, parser=None): def jsonparser(c): dat = json.loads(c) if dat and isinstance(dat, dict) and 'error' in dat: - raise Exception('response contains error, data=%s' % dat) + raise Exception(f'response contains error, data={dat}') return dat parser = jsonparser try: data = parser(content) - except Exception as e: - logger.fatal("failed to parse data from source=%s (resource=%s, cache=%s).. skipped, error=%s" % (dat.get('url'), key, dat.get('fname'), e)) + except Exception as exc: + logger.fatal(f"failed to parse data from source={dat.get('url')} " + f"(resource={key}, cache={dat.get('fname')}).. 
skipped, error={exc}") data = None if data: return data @@ -203,37 +242,44 @@ def jsonparser(c): return None -def merge_dict_data(d1, d2, keys=[], common=True, left=True, right=True, rec=False): +def merge_dict_data(dic1: dict, dic2: dict, keys: list = [], common: bool = True, left: bool = True, + right: bool = True, rec: bool = False) -> dict: """ - Recursively merge two dict objects - Merge content of d2 dict into copy of d1 - :param common: if True then do merge keys exist in both dicts - :param left: if True then preseve keys exist only in d1 - :param right: if True then preserve keys exist only in d2 + Recursively merge two dictionary objects. + + Merge content of dic2 dict into copy of dic1. + + :param dic1: dictionary to merge into (dict) + :param dic2: dictionary to merge from (dict) + :param keys: list of keys to merge (list) + :param common: if True then merge keys exist in both dictionaries (bool) + :param left: if True then preserve keys exist only in dic1 (bool) + :param right: if True then preserve keys exist only in dic2 (bool) + :param rec: if True then merge recursively (bool) + :return: merged dictionary (dict). """ - ### TODO: verify and configure logic later - if not (isinstance(d1, dict) and isinstance(d2, dict)): - return d2 + if not (isinstance(dic1, dict) and isinstance(dic2, dict)): + return dic2 - ret = d1.copy() + ret = dic1.copy() if keys and rec: - for k in set(keys) & set(d2): - ret[k] = d2[k] + for k in set(keys) & set(dic2): + ret[k] = dic2[k] return ret if common: # common - for k in set(d1) & set(d2): - ret[k] = merge_dict_data(d1[k], d2[k], keys, rec=True) + for k in set(dic1) & set(dic2): + ret[k] = merge_dict_data(dic1[k], dic2[k], keys, rec=True) if not left: # left - for k in set(d1) - set(d2): + for k in set(dic1) - set(dic2): ret.pop(k) if right: # right - for k in set(d2) - set(d1): - ret[k] = d2[k] + for k in set(dic2) - set(dic1): + ret[k] = dic2[k] return ret diff --git a/pilot/info/extinfo.py b/pilot/info/extinfo.py index b11214ed..fda96cb6 100644 --- a/pilot/info/extinfo.py +++ b/pilot/info/extinfo.py @@ -20,8 +20,9 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 """ -Information provider from external source(s) -which is mainly used to retrive Queue, Site, etc data required for Information Service +Information provider from external source(s). + +Mainly used to retrieve Queue, Site, etc data required for the Information Service. :author: Alexey Anisenkov :contact: anisyonk@cern.ch @@ -31,40 +32,45 @@ import os import json import random +import logging +from typing import Any +from pilot.common.errorcodes import ErrorCodes +from pilot.common.exception import PilotException from pilot.util.config import config from .dataloader import DataLoader, merge_dict_data -import logging logger = logging.getLogger(__name__) class ExtInfoProvider(DataLoader): """ - Information provider to retrive data from external source(s) - (e.g. AGIS, PanDA, CVMFS) + Information provider to retrive data from external source(s). + + E.g. CRIC, PanDA, CVMFS. """ - def __init__(self, cache_time=60): - """ - :param cache_time: Default cache time in seconds + def __init__(self, cache_time: int = 60): """ + Initialize class instance. + :param cache_time: default cache time in seconds (int). 
+ """ self.cache_time = cache_time @classmethod - def load_schedconfig_data(self, pandaqueues=[], priority=[], cache_time=60): + def load_schedconfig_data(cls, pandaqueues: list = [], priority: list = [], cache_time: int = 60) -> dict: """ - Download the (AGIS-extended) data associated to PandaQueue from various sources (prioritized). - Try to get data from CVMFS first, then AGIS or from Panda JSON sources (not implemented). + Download the (CRIC-extended) data associated to PandaQueue from various sources (prioritized). - For the moment PanDA source does not provide the full schedconfig description + Try to get data from CVMFS first, then CRIC or from Panda JSON sources (not implemented). + At the moment PanDA source does not provide the full schedconfig description. - :param pandaqueues: list of PandaQueues to be loaded - :param cache_time: Default cache time in seconds. - :return: + :param pandaqueues: list of PandaQueues to be loaded (list) + :param priority: list of sources to be used for data load (list) + :param cache_time: default cache time in seconds (int). + :return: dict of schedconfig settings by PandaQueue name as a key (dict). """ - pandaqueues = sorted(set(pandaqueues)) cache_dir = config.Information.cache_dir @@ -73,7 +79,7 @@ def load_schedconfig_data(self, pandaqueues=[], priority=[], cache_time=60): cric_url = getattr(config.Information, 'queues_url', None) or 'https://atlas-cric.cern.ch/cache/schedconfig/{pandaqueue}.json' cric_url = cric_url.format(pandaqueue=pandaqueues[0] if len(pandaqueues) == 1 else 'pandaqueues') - cvmfs_path = self.get_cvmfs_path(config.Information.queues_cvmfs, 'cric_pandaqueues.json') + cvmfs_path = cls.get_cvmfs_path(config.Information.queues_cvmfs, 'cric_pandaqueues.json') sources = {'CVMFS': {'url': cvmfs_path, 'nretry': 1, @@ -82,7 +88,7 @@ def load_schedconfig_data(self, pandaqueues=[], priority=[], cache_time=60): 'nretry': 3, 'sleep_time': lambda: 15 + random.randint(0, 30), ## max sleep time 45 seconds between retries 'cache_time': 3 * 60 * 60, # 3 hours - 'fname': os.path.join(cache_dir, 'agis_schedconf.agis.%s.json' % (pandaqueues[0] if len(pandaqueues) == 1 else 'pandaqueues'))}, + 'fname': os.path.join(cache_dir, f"agis_schedconf.agis.{pandaqueues[0] if len(pandaqueues) == 1 else 'pandaqueues'}.json")}, 'LOCAL': {'url': os.environ.get('LOCAL_AGIS_SCHEDCONF'), 'nretry': 1, 'cache_time': 3 * 60 * 60, # 3 hours @@ -91,15 +97,15 @@ def load_schedconfig_data(self, pandaqueues=[], priority=[], cache_time=60): } pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.setup' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.setup', globals(), locals(), [pilot_user], 0) queuedata_source_priority = user.get_schedconfig_priority() priority = priority or queuedata_source_priority logger.debug(f'schedconfig priority={priority}') - return self.load_data(sources, priority, cache_time) + return cls.load_data(sources, priority, cache_time) @staticmethod - def get_cvmfs_path(url, fname): + def get_cvmfs_path(url: str, fname: str) -> str: """ Return a proper path for cvmfs. @@ -107,29 +113,30 @@ def get_cvmfs_path(url, fname): :param fname: file name for CRIC JSON (string). :return: cvmfs path (string). 
""" - if url: cvmfs_path = url.replace('CVMFS_PATH', os.environ.get('ATLAS_SW_BASE', '/cvmfs')) else: - cvmfs_path = '%s/atlas.cern.ch/repo/sw/local/etc/%s' % (os.environ.get('ATLAS_SW_BASE', '/cvmfs'), fname) + cvmfs_path = f"{os.environ.get('ATLAS_SW_BASE', '/cvmfs')}/atlas.cern.ch/repo/sw/local/etc/{fname}" return cvmfs_path @classmethod - def load_queuedata(self, pandaqueue, priority=[], cache_time=60): + def load_queuedata(cls, pandaqueue: str, priority: list = [], cache_time: int = 60) -> dict: """ Download the queuedata from various sources (prioritized). - Try to get data from PanDA, CVMFS first, then AGIS + + Try to get data from PanDA, CVMFS first, then CRIC. This function retrieves only min information of queuedata provided by PanDA cache for the moment. - :param pandaqueue: PandaQueue name - :param cache_time: Default cache time in seconds. - :return: + :param pandaqueue: PandaQueue name (str) + :param priority: list of sources to be used for data load (list) + :param cache_time: default cache time in seconds (str) + :return: dict of queuedata settings by PandaQueue name as a key (dict) + :raises PilotException: in case of error. """ - if not pandaqueue: - raise Exception('load_queuedata(): pandaqueue name is not specififed') + raise PilotException('load_queuedata(): pandaqueue name is not specififed', code=ErrorCodes.QUEUEDATA) pandaqueues = [pandaqueue] @@ -137,16 +144,24 @@ def load_queuedata(self, pandaqueue, priority=[], cache_time=60): if not cache_dir: cache_dir = os.environ.get('PILOT_HOME', '.') - def jsonparser_panda(c): - dat = json.loads(c) - if dat and isinstance(dat, dict) and 'error' in dat: - raise Exception('response contains error, data=%s' % dat) - return {pandaqueue: dat} + def jsonparser_panda(dat: Any) -> dict: + """ + Parse json data from PanDA source. + + :param dat: data (Any) + :return: parsed data (dict) + :raises Exception: in case of error. 
+ """ + _dat = json.loads(dat) + if _dat and isinstance(_dat, dict) and 'error' in _dat: + raise PilotException(f'response contains error, data={_dat}', code=ErrorCodes.QUEUEDATA) + + return {pandaqueue: _dat} queuedata_url = (os.environ.get('QUEUEDATA_SERVER_URL') or getattr(config.Information, 'queuedata_url', '')).format(**{'pandaqueue': pandaqueues[0]}) cric_url = getattr(config.Information, 'queues_url', None) cric_url = cric_url.format(pandaqueue=pandaqueues[0] if len(pandaqueues) == 1 else 'pandaqueues') - cvmfs_path = self.get_cvmfs_path(getattr(config.Information, 'queuedata_cvmfs', None), 'cric_pandaqueues.json') + cvmfs_path = cls.get_cvmfs_path(getattr(config.Information, 'queuedata_cvmfs', None), 'cric_pandaqueues.json') sources = {'CVMFS': {'url': cvmfs_path, 'nretry': 1, @@ -155,7 +170,7 @@ def jsonparser_panda(c): 'nretry': 3, 'sleep_time': lambda: 15 + random.randint(0, 30), # max sleep time 45 seconds between retries 'cache_time': 3 * 60 * 60, # 3 hours - 'fname': os.path.join(cache_dir, 'agis_schedconf.agis.%s.json' % (pandaqueues[0] if len(pandaqueues) == 1 else 'pandaqueues'))}, + 'fname': os.path.join(cache_dir, f"agis_schedconf.agis.{pandaqueues[0] if len(pandaqueues) == 1 else 'pandaqueues'}.json")}, 'LOCAL': {'url': None, 'nretry': 1, 'cache_time': 3 * 60 * 60, # 3 hours @@ -172,24 +187,25 @@ def jsonparser_panda(c): } pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.setup' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.setup', globals(), locals(), [pilot_user], 0) queuedata_source_priority = user.get_queuedata_priority() priority = priority or queuedata_source_priority logger.debug(f'queuedata priority={priority}') - return self.load_data(sources, priority, cache_time) + return cls.load_data(sources, priority, cache_time) @classmethod - def load_storage_data(self, ddmendpoints=[], priority=[], cache_time=60): + def load_storage_data(cls, ddmendpoints: list = [], priority: list = [], cache_time: int = 60) -> dict: """ Download DDM Storages details by given name (DDMEndpoint) from various sources (prioritized). + Unless specified as an argument in the function call, the prioritized list will be read from the user plug-in. - :param pandaqueues: list of PandaQueues to be loaded - :param cache_time: Default cache time in seconds. - :return: dict of DDMEndpoint settings by DDMendpoint name as a key + :param ddmendpoints: list of ddmendpoint names (list) + :param priority: list of sources to be used for data load (list) + :param cache_time: default cache time in seconds (int) + :return: dictionary of DDMEndpoint settings by DDMendpoint name as a key (dict). 
""" - ddmendpoints = sorted(set(ddmendpoints)) cache_dir = config.Information.cache_dir @@ -199,13 +215,12 @@ def load_storage_data(self, ddmendpoints=[], priority=[], cache_time=60): # list of sources to fetch ddmconf data from _storagedata_url = os.environ.get('STORAGEDATA_SERVER_URL', '') storagedata_url = _storagedata_url if _storagedata_url else getattr(config.Information, 'storages_url', None) - cvmfs_path = self.get_cvmfs_path(config.Information.storages_cvmfs, 'cric_ddmendpoints.json') + cvmfs_path = cls.get_cvmfs_path(config.Information.storages_cvmfs, 'cric_ddmendpoints.json') sources = {'USER': {'url': storagedata_url, 'nretry': 3, 'sleep_time': lambda: 15 + random.randint(0, 30), ## max sleep time 45 seconds between retries 'cache_time': 3 * 60 * 60, # 3 hours - 'fname': os.path.join(cache_dir, 'agis_ddmendpoints.agis.%s.json' % - ('_'.join(ddmendpoints) or 'ALL'))}, + 'fname': os.path.join(cache_dir, f"agis_ddmendpoints.agis.{'_'.join(ddmendpoints) or 'ALL'}.json")}, 'CVMFS': {'url': cvmfs_path, 'nretry': 1, 'fname': os.path.join(cache_dir, getattr(config.Information, 'storages_cache', None) or 'agis_ddmendpoints.json')}, @@ -215,8 +230,7 @@ def load_storage_data(self, ddmendpoints=[], priority=[], cache_time=60): ## max sleep time 45 seconds between retries 'cache_time': 3 * 60 * 60, # 3 hours - 'fname': os.path.join(cache_dir, 'agis_ddmendpoints.agis.%s.json' % - ('_'.join(ddmendpoints) or 'ALL'))}, + 'fname': os.path.join(cache_dir, f"agis_ddmendpoints.agis.{'_'.join(ddmendpoints) or 'ALL'}.json")}, 'LOCAL': {'url': None, 'nretry': 1, 'cache_time': 3 * 60 * 60, # 3 hours @@ -225,7 +239,7 @@ def load_storage_data(self, ddmendpoints=[], priority=[], cache_time=60): } pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.setup' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.setup', globals(), locals(), [pilot_user], 0) ddm_source_priority = user.get_ddm_source_priority() if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': priority = ['LOCAL'] @@ -233,17 +247,18 @@ def load_storage_data(self, ddmendpoints=[], priority=[], cache_time=60): priority = priority or ddm_source_priority logger.debug(f'storage data priority={priority}') - return self.load_data(sources, priority, cache_time) + return cls.load_data(sources, priority, cache_time) - def resolve_queuedata(self, pandaqueue, schedconf_priority=None): + def resolve_queuedata(self, pandaqueue: str, schedconf_priority: list = None) -> dict: """ - Resolve final full queue data details - (primary data provided by PanDA merged with overall queue details from AGIS) + Resolve final full queue data details. - :param pandaqueue: name of PandaQueue - :return: dict of settings for given PandaQueue as a key - """ + (primary data provided by PanDA merged with overall queue details from AGIS) + :param pandaqueue: name of PandaQueue + :param schedconf_priority: list of sources to be used for schedconfig data load + :return: dictionary of settings for given PandaQueue as a key (dict). 
+ """ # load queuedata (min schedconfig settings) master_data = self.load_queuedata(pandaqueue, cache_time=self.cache_time) ## use default priority @@ -253,13 +268,12 @@ def resolve_queuedata(self, pandaqueue, schedconf_priority=None): # merge return merge_dict_data(r, master_data) - def resolve_storage_data(self, ddmendpoints=[]): + def resolve_storage_data(self, ddmendpoints: list = []) -> dict: """ - Resolve final DDM Storages details by given names (DDMEndpoint) + Resolve final DDM Storages details by given names (DDMEndpoint). - :param ddmendpoints: list of ddmendpoint names - :return: dict of settings for given DDMEndpoint as a key + :param ddmendpoints: list of ddmendpoint names (list) + :return: dictionary of settings for given DDMEndpoint as a key (dict). """ - # load ddmconf settings return self.load_storage_data(ddmendpoints, cache_time=self.cache_time) ## use default priority diff --git a/pilot/info/filespec.py b/pilot/info/filespec.py index 045e3537..0f10973c 100644 --- a/pilot/info/filespec.py +++ b/pilot/info/filespec.py @@ -22,30 +22,27 @@ """ The implementation of data structure to host File related data description. -The main reasons for such incapsulation are to +The main reasons for such encapsulation are to - apply in one place all data validation actions (for attributes and values) - - introduce internal information schema (names of attribues) to remove direct dependency to ext storage/structures + - introduce internal information schema (names of attributes) to remove direct dependency to ext storage/structures :author: Alexey Anisenkov :date: April 2018 """ + +import logging import os.path +from typing import Any from .basedata import BaseData -import logging logger = logging.getLogger(__name__) class FileSpec(BaseData): - """ - High-level object to host File Specification (meta data like lfn, checksum, replica details, etc.) - """ + """High-level object to host File Specification (meta data like lfn, checksum, replica details, etc.).""" ## put explicit list of all the attributes with comments for better inline-documentation by sphinx - ## FIX ME LATER: use proper doc format - - ## incomplete list of attributes .. to be extended once becomes used lfn = "" guid = "" @@ -89,21 +86,24 @@ class FileSpec(BaseData): bool: ['allow_lan', 'allow_wan', 'direct_access_lan', 'direct_access_wan', 'checkinputsize'] } - def __init__(self, filetype='input', **data): ## FileSpec can be split into FileSpecInput + FileSpecOuput classes in case of significant logic changes - """ - :param kwargs: input dictionary of object description - :param type: type of File: either input, output or log + def __init__(self, filetype: str = 'input', **data: dict): """ + Init class instance. + FileSpec can be split into FileSpecInput + FileSpecOuput classes in case of significant logic changes. + + :param filetype: type of File: either input, output or log + :param data: input dictionary with object description (dict) + """ self.filetype = filetype self.load(data) - def load(self, data): - """ - Construct and initialize data from ext source for Input `FileSpec` - :param data: input dictionary of object description + def load(self, data: dict): """ + Construct and initialize data from ext source for input `FileSpec`. + :param data: input dictionary of object description. 
+ """ # the translation map of the key attributes from external data to internal schema # if key is not explicitly specified then ext name will be used as is @@ -116,16 +116,20 @@ def load(self, data): ## custom function pattern to apply extra validation to the key values ##def clean__keyname(self, raw, value): ## :param raw: raw value passed from ext source as input - ## :param value: preliminary cleaned and casted to proper type value + ## :param value: preliminary cleaned and cast to proper type value ## ## return value - def clean__checksum(self, raw, value): - """ - Validate value for the checksum key - Expected raw format is 'ad:value' or 'md:value' + def clean__checksum(self, raw: Any, value: Any) -> dict: """ + Validate given value for the checksum key. + Expected raw format is 'ad:value' or 'md:value'. + + :param raw: raw value passed from ext source as input (Any) + :param value: preliminary cleaned and cast to proper type value (Any) + :return: dictionary with checksum values (dict). + """ if isinstance(value, dict): return value @@ -141,12 +145,11 @@ def clean__checksum(self, raw, value): def clean(self): """ - Validate and finally clean up required data values (required object properties) if need - Executed once all fields have already passed field-specific validation checks - Could be customized by child object - :return: None - """ + Validate and finally clean up required data values (required object properties) if needed. + Executed once all fields have already passed field-specific validation checks. + Could be customized by child object. + """ if self.lfn.startswith("zip://"): self.lfn = self.lfn.replace("zip://", "") self.is_tar = True @@ -154,20 +157,21 @@ def clean(self): self.surl = self.lfn self.lfn = os.path.basename(self.lfn) - def is_directaccess(self, ensure_replica=True, allowed_replica_schemas=None): - """ - Check if given (input) file can be used for direct access mode by Job transformation script - :param ensure_replica: boolean, if True then check by allowed schemas of file replica turl will be considered as well - :return: boolean + def is_directaccess(self, ensure_replica: bool = True, allowed_replica_schemas: list = None) -> bool: """ + Check if given (input) file can be used for direct access mode by job transformation script. + :param ensure_replica: if True then check by allowed schemas of file replica turl will be considered as well (bool) + :param allowed_replica_schemas: list of allowed replica schemas (list) + :return: True if file can be used for direct access mode (bool). + """ # check by filename pattern filename = self.lfn.lower() is_rootfile = True exclude_pattern = ['.tar.gz', '.lib.tgz', '.raw.'] - for e in exclude_pattern: - if e in filename or filename.startswith('raw.'): + for exclude in exclude_pattern: + if exclude in filename or filename.startswith('raw.'): is_rootfile = False break @@ -183,16 +187,19 @@ def is_directaccess(self, ensure_replica=True, allowed_replica_schemas=None): if ensure_replica: allowed_replica_schemas = allowed_replica_schemas or ['root', 'dcache', 'dcap', 'file', 'https'] - if not self.turl or not any([self.turl.startswith('%s://' % e) for e in allowed_replica_schemas]): + if not self.turl or not any([self.turl.startswith(f'{allowed}://') for allowed in allowed_replica_schemas]): _is_directaccess = False return _is_directaccess - def get_storage_id_and_path_convention(self): + def get_storage_id_and_path_convention(self) -> (str, str): """ Parse storage_token to get storage_id and path_convention. 
- :param storage_token: string, expected format is '', '', - :returns: storage_id, path_convention + + Format for storage token: expected format is '', '', + . + + :returns: storage_id (str), path_convention (str). """ storage_id = None path_convention = None @@ -204,7 +211,8 @@ def get_storage_id_and_path_convention(self): path_convention = int(path_convention) elif self.storage_token.isdigit(): storage_id = int(self.storage_token) - except Exception as ex: - logger.warning("Failed to parse storage_token(%s): %s" % (self.storage_token, ex)) - logger.info('storage_id: %s, path_convention: %s' % (storage_id, path_convention)) + except (ValueError, AttributeError, TypeError) as exc: + logger.warning(f"failed to parse storage_token({self.storage_token}): {exc}") + logger.info(f'storage_id: {storage_id}, path_convention: {path_convention}') + return storage_id, path_convention diff --git a/pilot/info/infoservice.py b/pilot/info/infoservice.py index f1c7c678..b9c7dd72 100644 --- a/pilot/info/infoservice.py +++ b/pilot/info/infoservice.py @@ -21,10 +21,12 @@ """ +Info Service module. + The implmemtation of high-level Info Service module, which includes a set of low-level information providers to aggregate, prioritize (overwrite), hide dependency to external storages and expose (queue, site, storage, etc) details -in a unified structured way via provided high-level API +in a unified structured way via provided high-level API. :author: Alexey Anisenkov :contact: anisyonk@cern.ch @@ -32,63 +34,84 @@ """ import inspect +import logging +import traceback +from typing import Any from pilot.common.exception import PilotException, NotDefined, QueuedataFailure - from .configinfo import PilotConfigProvider from .extinfo import ExtInfoProvider # from .jobinfo import JobInfoProvider - from .dataloader import merge_dict_data from .queuedata import QueueData from .storagedata import StorageData -import logging logger = logging.getLogger(__name__) -class InfoService(object): - """ - High-level Information Service - """ +class InfoService: + """High-level Information Service.""" cache_time = 60 # default cache time in seconds - def require_init(func): # noqa + # add instruction to pyling to prevent it from giving the wrong error message. require_init() is a method decorator + # and not a class method. pylint will otherwise suggest to add self as first argument to the method. + # pylint: disable=no-self-argument + def require_init(func: Any) -> Any: # noqa """ - Method decorator to check if object is initialized + Check if object is initialized. + + Method decorator. + + :param func: function to decorate (Any) + :return: decorated function (Any). """ key = 'pandaqueue' - def inner(self, *args, **kwargs): + # pylint: disable=not-callable + def inner(self, *args: Any, **kwargs: dict) -> Any: + """ + Inner function. + + :param args: arguments (Any) + :param kwargs: keyword arguments (dict). + :return: decorated function (Any) + :raises PilotException: in case of error. + """ if getattr(self, key, None) is None: - raise PilotException(f"failed to call {func.__name__}(): InfoService instance is not initialized. Call init() first!") + raise PilotException(f"failed to call {func.__name__}(): InfoService instance is not initialized. 
" + f"Call init() first!") + return func(self, *args, **kwargs) return inner def __init__(self): - + """Init class instance.""" self.pandaqueue = None self.queuedata = None ## cache instance of QueueData for PandaQueue settings - self.queues_info = {} ## cache of QueueData objects for PandaQueue settings self.storages_info = {} ## cache of QueueData objects for DDMEndpoint settings #self.sites_info = {} ## cache for Site settings - self.confinfo = None ## by default (when non initalized) ignore overwrites/settings from Config self.jobinfo = None ## by default (when non initalized) ignore overwrites/settings from Job self.extinfo = ExtInfoProvider(cache_time=self.cache_time) - self.storage_id2ddmendpoint = {} self.ddmendpoint2storage_id = {} - def init(self, pandaqueue, confinfo=None, extinfo=None, jobinfo=None): + def init(self, pandaqueue: str, confinfo: Any = None, extinfo: Any = None, jobinfo: Any = None): + """ + Initialize InfoService instance. + :param pandaqueue: name of PandaQueue (str) + :param confinfo: PilotConfigProvider instance (Any) + :param extinfo: ExtInfoProvider instance (Any) + :param jobinfo: JobInfoProvider instance (Any) + :raises PilotException: in case of error. + """ self.confinfo = confinfo or PilotConfigProvider() self.jobinfo = jobinfo # or JobInfoProvider() self.extinfo = extinfo or ExtInfoProvider(cache_time=self.cache_time) - self.pandaqueue = pandaqueue if not self.pandaqueue: @@ -98,32 +121,41 @@ def init(self, pandaqueue, confinfo=None, extinfo=None, jobinfo=None): self.storages_info = {} ## reset cache data #self.sites_info = {} ## reset cache data - self.queuedata = self.resolve_queuedata(self.pandaqueue) - + try: + self.queuedata = self.resolve_queuedata(self.pandaqueue) + except PilotException as exc: + logger.warning(f"failed to resolve queuedata for queue={self.pandaqueue}, error={exc}") + raise exc if not self.queuedata or not self.queuedata.name: raise QueuedataFailure(f"failed to resolve queuedata for queue={self.pandaqueue}, wrong PandaQueue name?") self.resolve_storage_data() ## prefetch details for all storages @classmethod - def whoami(self): + def whoami(cls): """ - :return: Current function name being executed + Return current function name being executed. + + :return: Current function name (str). """ return inspect.stack()[1][3] @classmethod - def _resolve_data(self, fname, providers=[], args=[], kwargs={}, merge=False): + def _resolve_data(cls, fname: Any, providers: list = [], args: list = [], kwargs: dict = {}, merge: bool = False) -> Any: """ - Resolve data by calling function `fname` of passed provider objects. - - Iterate over `providers`, merge data from all providers if merge is True, - (consider 1st success result from prioritized list if `merge` mode is False) - and resolve data by execution function `fname` with passed arguments `args` and `kwargs` - - :return: The result of first successfull execution will be returned + Resolve data by calling function `fname` of passed provider objects. 
+ + Iterate over `providers`, merge data from all providers if merge is True, + (consider 1st success result from prioritized list if `merge` mode is False) + and resolve data by execution function `fname` with passed arguments `args` and `kwargs` + + :param fname: name of function to be called (Any) + :param providers: list of provider objects (list) + :param args: list of arguments to be passed to function (list) + :param kwargs: list of keyword arguments to be passed to function (dict) + :param merge: if True then merge data from all providers (bool) + :return: The result of first successful execution will be returned (Any). """ - ret = None if merge: providers = list(providers) @@ -138,24 +170,21 @@ def _resolve_data(self, fname, providers=[], args=[], kwargs={}, merge=False): ret = merge_dict_data(ret or {}, r or {}) except Exception as exc: logger.warning(f"failed to resolve data ({fcall.__name__}) from provider={provider} .. skipped, error={exc}") - import traceback logger.warning(traceback.format_exc()) return ret @require_init - def resolve_queuedata(self, pandaqueue): ## high level API + def resolve_queuedata(self, pandaqueue: str) -> Any: ## high level API """ - Resolve final full queue data details + Resolve final full queue data details. - :param pandaqueue: name of PandaQueue - :return: `QueueData` object or None if not exist + :param pandaqueue: name of PandaQueue (str) + :return: `QueueData` object or None if it does not exist (Any). """ - cache = self.queues_info if pandaqueue not in cache: # not found in cache: do load and initialize data - # the order of providers makes the priority r = self._resolve_data(self.whoami(), providers=(self.confinfo, self.jobinfo, self.extinfo), args=[pandaqueue], kwargs={'schedconf_priority': self.resolve_schedconf_sources()}, @@ -167,11 +196,14 @@ def resolve_queuedata(self, pandaqueue): ## high level API return cache.get(pandaqueue) #@require_init - def resolve_storage_data(self, ddmendpoints=[]): ## high level API - """ - :return: dict of DDMEndpoint settings by DDMEndpoint name as a key + def resolve_storage_data(self, ddmendpoints: list = []) -> dict: ## high level API """ + Resolve final full storage data details. + :param ddmendpoints: list of DDMEndpoint names (list) + :return: dictionary of DDMEndpoint settings by DDMEndpoint name as a key (dict) + :raises PilotException: in case of error. + """ if isinstance(ddmendpoints, str): ddmendpoints = [ddmendpoints] @@ -185,21 +217,23 @@ def resolve_storage_data(self, ddmendpoints=[]): ## high level API if ddmendpoints: not_resolved = set(ddmendpoints) - set(r) if not_resolved: - raise PilotException("internal error: Failed to load storage details for ddms=%s" % sorted(not_resolved)) + raise PilotException(f"internal error: Failed to load storage details for ddms={sorted(not_resolved)}") for ddm in r: cache[ddm] = StorageData(r[ddm]) return cache @require_init - def resolve_schedconf_sources(self): ## high level API - """ - Resolve prioritized list of source names for Schedconfig data load - Consider first the config settings of pilot instance (via `confinfo`) - and then Job specific settings (via `jobinfo` instance), - and failover to default value (LOCAL, CVMFS, AGIS, PANDA) + def resolve_schedconf_sources(self) -> Any: ## high level API """ + Resolve prioritized list of source names for Schedconfig data load. 
+ Consider first the config settings of pilot instance (via `confinfo`) + and then Job specific settings (via `jobinfo` instance), + and failover to default value (LOCAL, CVMFS, AGIS, PANDA). + + :return: list of source names (list). + """ defval = ['LOCAL', 'CVMFS', 'CRIC', 'PANDA'] # look up priority order: either from job, local config or hardcoded in the logic @@ -218,14 +252,15 @@ def resolve_schedconf_sources(self): ## high level API # # look up priority order: either from job, local config, extinfo provider # return self._resolve_data(self.whoami(), providers=(self.confinfo, self.jobinfo, self.extinfo), args=[name]) - def resolve_ddmendpoint_storageid(self, ddmendpoint=[]): + def resolve_ddmendpoint_storageid(self, ddmendpoint: list = []): """ - Resolve the map between ddmendpoint and storage_id + Resolve the map between ddmendpoint and storage_id. + + :param ddmendpoint: ddmendpoint name (list). """ if not ddmendpoint or ddmendpoint not in self.ddmendpoint2storage_id: storages = self.resolve_storage_data(ddmendpoint) - for storage_name in storages: - storage = storages[storage_name] + for storage_name, storage in storages.items(): storage_id = storage.pk self.ddmendpoint2storage_id[storage_name] = storage_id self.storage_id2ddmendpoint[storage_id] = storage_name @@ -234,13 +269,13 @@ def resolve_ddmendpoint_storageid(self, ddmendpoint=[]): if bucket_id: self.storage_id2ddmendpoint[bucket_id] = storage_name - def get_storage_id(self, ddmendpoint): + def get_storage_id(self, ddmendpoint: str) -> int: """ Return the storage_id of a ddmendpoint. - :param ddmendpoint: ddmendpoint name. - :returns storage_id: storage_id of the ddmendpoint. - :raises NotDefined: + :param ddmendpoint: ddmendpoint name (str) + :returns storage_id: storage_id of the ddmendpoint (int) + :raises NotDefined: when storage_id is not defined. """ if ddmendpoint not in self.ddmendpoint2storage_id: self.resolve_ddmendpoint_storageid(ddmendpoint) @@ -249,16 +284,16 @@ def get_storage_id(self, ddmendpoint): storage_id = self.ddmendpoint2storage_id[ddmendpoint] logger.info(f"found storage id for ddmendpoint({ddmendpoint}): {storage_id}") return storage_id - else: - raise NotDefined(f"cannot find the storage id for ddmendpoint: {ddmendpoint}") - def get_ddmendpoint(self, storage_id): + raise NotDefined(f"cannot find the storage id for ddmendpoint: {ddmendpoint}") + + def get_ddmendpoint(self, storage_id: int) -> str: """ Return the ddmendpoint name from a storage id. - :param storage_id: storage_id as an int. - :returns ddmendpoint: ddmendpoint name. - :raises NotDefined: + :param storage_id: storage_id (int) + :returns ddmendpoint: ddmendpoint name (str) + :raises NotDefined: when ddmendpoint is not defined for the given storage id. 
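+
+        Example (an illustrative sketch only; assumes an initialized InfoService instance
+        and a hypothetical endpoint name):
+
+            storage_id = infoservice.get_storage_id('CERN-PROD_DATADISK')
+            name = infoservice.get_ddmendpoint(storage_id)  # 'CERN-PROD_DATADISK' again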
""" storage_id = int(storage_id) if storage_id not in self.storage_id2ddmendpoint: @@ -268,6 +303,6 @@ def get_ddmendpoint(self, storage_id): ddmendpoint = self.storage_id2ddmendpoint[storage_id] logger.info(f"found ddmendpoint for storage id({storage_id}): {ddmendpoint}") return ddmendpoint - else: - self.resolve_storage_data() - raise NotDefined(f"cannot find ddmendpoint for storage id: {storage_id}") + + self.resolve_storage_data() + raise NotDefined(f"cannot find ddmendpoint for storage id: {storage_id}") diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index d3b7fbe4..b61c09bb 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -308,7 +308,7 @@ def show_access_settings(access_keys): :return: """ dat = dict([item, getattr(FileSpec, item, None)] for item in access_keys) - msg = ', '.join(["%s=%s" % (item, value) for item, value in sorted(dat.items())]) + msg = ', '.join([f"{item}={value}" for item, value in sorted(dat.items())]) logger.info(f'job.infosys.queuedata is not initialized: the following access settings will be used by default: {msg}') @staticmethod @@ -623,7 +623,7 @@ def clean__jobparams(self, raw, value): # (return list of strings not to be filtered, which will be put back in the post-filtering below) pilot_user = os.environ.get('PILOT_USER', 'generic').lower() try: - user = __import__('pilot.user.%s.jobdata' % pilot_user, globals(), locals(), [pilot_user], 0) + user = __import__(f'pilot.user.{pilot_user}.jobdata', globals(), locals(), [pilot_user], 0) exclusions, value = user.jobparams_prefiltering(value) except Exception as exc: logger.warning(f'caught exception in user code: {exc}') @@ -660,7 +660,7 @@ def clean__jobparams(self, raw, value): except Exception as exc: logger.warning(f'caught exception in user code: {exc}') - logger.info('cleaned jobparams: %s' % ret) + logger.info(f'cleaned jobparams: {ret}') return ret @@ -692,7 +692,7 @@ def extract_container_image(self, jobparams): else: logger.info(f"extracted image from jobparams: {imagename}") else: - logger.warning("image could not be extract from %s" % jobparams) + logger.warning(f"image could not be extract from {jobparams}") # remove the option from the job parameters jobparams = re.sub(_pattern, "", jobparams) @@ -893,7 +893,7 @@ def get_job_option_for_input_name(self, input_name): :returns: job_option such as --inputHitsFile """ job_options = self.jobparams.split(' ') - input_name_option = '=@%s' % input_name + input_name_option = f'=@{input_name}' for job_option in job_options: if input_name_option in job_option: return job_option.split("=")[0] @@ -933,7 +933,7 @@ def process_writetofile(self): self.jobparams = self.jobparams.replace(input_name, input_name_new) if job_option: - self.jobparams = self.jobparams.replace('%s=' % job_option, '') + self.jobparams = self.jobparams.replace(f'{job_option}=', '') self.jobparams = self.jobparams.replace('--autoConfiguration=everything', '') logger.info(f"jobparams after processing writeToFile: {self.jobparams}") diff --git a/pilot/info/jobinfo.py b/pilot/info/jobinfo.py index 5e163c2f..1f37a6c4 100644 --- a/pilot/info/jobinfo.py +++ b/pilot/info/jobinfo.py @@ -81,7 +81,7 @@ def resolve_queuedata(self, pandaqueue, **kwargs): data.update(self.job.overwrite_queuedata) ## use job.overwrite_queuedata as a master source - logger.info('queuedata: following keys will be overwritten by Job values: %s' % data) + logger.info(f'queuedata: following keys will be overwritten by Job values: {data}') return {pandaqueue: data} @@ -101,6 +101,6 @@ def 
resolve_storage_data(self, ddmendpoints=[], **kwargs): data.update((k, v) for k, v in list(master_data.items()) if k in set(ddmendpoints or master_data) & set(master_data)) # Python 3 if data: - logger.info('storagedata: following data extracted from Job definition will be used: %s' % data) + logger.info(f'storagedata: following data extracted from Job definition will be used: {data}') return data diff --git a/pilot/info/queuedata.py b/pilot/info/queuedata.py index eafa022b..5e89075c 100644 --- a/pilot/info/queuedata.py +++ b/pilot/info/queuedata.py @@ -114,15 +114,16 @@ class QueueData(BaseData): def __init__(self, data): """ - :param data: input dictionary of queue data settings - """ + Init class instance. + :param data: input dictionary of queue data settings (dict). + """ self.load(data) # DEBUG #import pprint - #logger.debug('initialize QueueData from raw:\n%s' % pprint.pformat(data)) - logger.debug('Final parsed QueueData content:\n%s' % self) + #logger.debug(f'initialize QueueData from raw:\n{pprint.pformat(data)}') + logger.debug(f'final parsed QueueData content:\n{self}') def load(self, data): """ @@ -157,13 +158,8 @@ def resolve_allowed_schemas(self, activity, copytool=None): if not activity: activity = 'default' - try: - if isinstance(activity, basestring): # Python 2 # noqa: F821 - activity = [activity] - except Exception: - if isinstance(activity, str): # Python 3 - activity = [activity] - + if isinstance(activity, str): + activity = [activity] if 'default' not in activity: activity = activity + ['default'] @@ -205,13 +201,13 @@ def clean(self): found = re.findall(pattern, self.catchall) if found: self.container_options = found[0] - logger.info('container_options extracted from catchall: %s' % self.container_options) + logger.info(f'container_options extracted from catchall: {self.container_options}') # verify container_options: add the workdir if missing if self.container_options: if "${workdir}" not in self.container_options and " --contain" in self.container_options: ## reimplement with shlex later self.container_options = self.container_options.replace(" --contain", ",${workdir} --contain") - logger.info("Note: added missing ${workdir} to container_options: %s" % self.container_options) + logger.info(f"note: added missing $workdir to container_options: {self.container_options}") pass diff --git a/pilot/info/storagedata.py b/pilot/info/storagedata.py index b4fc5092..ea5bab8b 100644 --- a/pilot/info/storagedata.py +++ b/pilot/info/storagedata.py @@ -82,9 +82,9 @@ def __init__(self, data): self.load(data) # DEBUG - #import pprint - #logger.debug('initialize StorageData from raw:\n%s' % pprint.pformat(data)) - #logger.debug('Final parsed StorageData content:\n%s' % self) + # import pprint + # logger.debug(f'initialize StorageData from raw:\n{pprint.pformat(data)}') + # logger.debug(f'final parsed StorageData content:\n{self}') def load(self, data): """ @@ -122,15 +122,15 @@ def get_security_key(self, secret_key, access_key): """ try: data = {'privateKeyName': secret_key, 'publicKeyName': access_key} - logger.info("Getting key pair: %s" % data) + logger.info(f"Getting key pair: {data}") url = environ.get('PANDA_SERVER_URL', config.Pilot.pandaserver) - res = https.request('{pandaserver}/server/panda/getKeyPair'.format(pandaserver=url), data=data) + res = https.request(f'{url}/server/panda/getKeyPair', data=data) if res and res['StatusCode'] == 0: return {"publicKey": res["publicKey"], "privateKey": res["privateKey"]} else: - logger.info("Got key pair returns wrong value: %s" % 
res) - except Exception as ex: - logger.error("Failed to get key pair(%s,%s): %s, %s" % (access_key, secret_key, ex, traceback.format_exc())) + logger.info(f"got unexpected response for key pair request: {res}") + except Exception as exc: + logger.error(f"Failed to get key pair({access_key},{secret_key}): {exc}, {traceback.format_exc()}") return {} def get_special_setup(self, protocol_id=None): @@ -140,7 +140,7 @@ def get_special_setup(self, protocol_id=None): :return: setup as a string """ - logger.info("Get special setup for protocol id(%s)" % (protocol_id)) + logger.info(f"get special setup for protocol id({protocol_id})") if protocol_id in self.special_setup and self.special_setup[protocol_id]: return self.special_setup[protocol_id] @@ -161,12 +161,11 @@ def get_special_setup(self, protocol_id=None): if access_key and secret_key and is_secure: key_pair = self.get_security_key(secret_key, access_key) if "privateKey" not in key_pair or key_pair["privateKey"] is None: - logger.error("Failed to get the key pair for S3 objectstore from panda") + logger.error("failed to get the key pair for S3 objectstore from panda") else: - setup = "export S3_ACCESS_KEY=%s; export S3_SECRET_KEY=%s; export S3_IS_SECURE=%s;" % (key_pair["publicKey"], - key_pair["privateKey"], - is_secure) - self.special_setup[protocol_id] = setup - logger.info("Return key pair with public key: %s" % key_pair["publicKey"]) + self.special_setup[protocol_id] = f"export S3_ACCESS_KEY={key_pair['publicKey']}; " \ + f"export S3_SECRET_KEY={key_pair['privateKey']}; " \ + f"export S3_IS_SECURE={is_secure};" + logger.info(f"return key pair with public key: {key_pair['publicKey']}") return self.special_setup[protocol_id] return None diff --git a/pilot/resource/alcf.py b/pilot/resource/alcf.py index 577ee61b..7f14a838 100644 --- a/pilot/resource/alcf.py +++ b/pilot/resource/alcf.py @@ -19,16 +19,22 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Functions for ALCF.""" + import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_setup(job=None): +def get_setup(job: Any = None) -> list: """ Return the resource specific setup. - :param job: optional job object. + :param job: optional job object (Any) :return: setup commands (list). """ + if not job: + logger.warning('job object not sent to get_setup') return [] diff --git a/pilot/resource/bnl.py b/pilot/resource/bnl.py index 577ee61b..f65fc8a5 100644 --- a/pilot/resource/bnl.py +++ b/pilot/resource/bnl.py @@ -19,16 +19,22 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Functions for BNL.""" + import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_setup(job=None): +def get_setup(job: Any = None) -> list: """ Return the resource specific setup. - :param job: optional job object. + :param job: optional job object (Any) :return: setup commands (list). """ + if not job: + logger.warning('job object not sent to get_setup') return [] diff --git a/pilot/resource/generic.py b/pilot/resource/generic.py index 577ee61b..9cc8d18e 100644 --- a/pilot/resource/generic.py +++ b/pilot/resource/generic.py @@ -19,16 +19,22 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Functions for generic resources.""" + import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_setup(job=None): +def get_setup(job: Any = None) -> list: """ Return the resource specific setup. - :param job: optional job object. + :param job: optional job object (Any) :return: setup commands (list). 
""" + if not job: + logger.warning('job object not sent to get_setup') return [] diff --git a/pilot/resource/jobdescription.py b/pilot/resource/jobdescription.py index fc0a060c..7fc7ad3c 100755 --- a/pilot/resource/jobdescription.py +++ b/pilot/resource/jobdescription.py @@ -17,34 +17,44 @@ # under the License. # # Authors: +# - Danila Oleynik, 2018-2021 # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +"""Function library for Titan.""" + import re import json +import logging import numbers +import sys import traceback import threading -import logging +from typing import Any + logger = logging.getLogger(__name__) -def camel_to_snake(name): +def camel_to_snake(name: str) -> str: """ - Changes CamelCase to snake_case, used by python. + Change CamelCase to snake_case. + + Used by Python. - :param name: name to change - :return: name in snake_case + :param name: name to change (str) + :return: name in snake_case (str). """ s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() -def snake_to_camel(snake_str): +def snake_to_camel(snake_str: str) -> str: """ - Changes snake_case to firstLowCamelCase, used by server. + Change snake_case to firstLowCamelCase. + + Used by server. - :param snake_str: name to change - :return: name in camelCase + :param snake_str: name to change (str) + :return: name in camelCase (str). """ components = snake_str.split('_') # We capitalize the first letter of each component except the first one @@ -52,16 +62,16 @@ def snake_to_camel(snake_str): return components[0] + "".join(x.title() for x in components[1:]) -def split(val, separator=",", min_len=0, fill_last=False): +def split(val: str, separator: str = ",", min_len: int = 0, fill_last: bool = False) -> list: """ - Splits comma separated values and parses them. - - :param val: values to split - :param separator: comma or whatever - :param min_len: minimum needed length of array, array is filled up to this value - :param fill_last: Flag stating the array filler, if min_value is greater then extracted array length. - If true, array is filled with last value, else, with Nones. - :return: parsed array + Split comma separated values and parse them. + + :param val: values to split (str) + :param separator: comma or whatever (str) + :param min_len: minimum needed length of array, array is filled up to this value (int) + :param fill_last: flag stating the array filler, if min_value is greater then extracted array length. + If true, array is filled with last value, else, with Nones (bool) + :return: parsed array (list). """ if val is None: return [None for _ in range(min_len)] @@ -78,9 +88,9 @@ def split(val, separator=",", min_len=0, fill_last=False): return v_arr -def get_nulls(val): +def get_nulls(val: str) -> str or None: """ - Converts every "NULL" string to python's None. + Convert every "NULL" string to None. :param val: string or whatever :return: val or None if val is "NULL" @@ -88,43 +98,46 @@ def get_nulls(val): return val if val != "NULL" else None -def is_float(val): +def is_float(val: Any) -> bool: """ Test floatliness of the string value. - :param val: string or whatever - :return: True if the value may be converted to Float + :param val: string or whatever (Any) + :return: True if the value may be converted to Float (bool). """ try: float(val) - return True except ValueError: return False + return True + -def is_int(val): +def is_int(val: Any) -> bool: """ - Test int of the string value. + Test if the given string is an integer. 
- :param val: string or whatever - :return: True if the value may be converted to int + :param val: string or whatever (Any) + :return: True if the value may be converted to int (bool). """ try: int(val) - return True except ValueError: return False + return True -def parse_value(value): + +def parse_value(value: Any) -> Any: """ - Tries to parse value as number or None. If some of this can be done, parsed value is returned. Otherwise returns + Try to parse value as number or None. + + If some of this can be done, parsed value is returned. Otherwise returns value unparsed. - :param value: - :return: mixed + :param value: value to be tested (Any) + :return: mixed (Any). """ - if not isinstance(value, str): return value @@ -137,38 +150,39 @@ def parse_value(value): return get_nulls(value) -def stringify_weird(arg): +def stringify_weird(arg: Any) -> str: """ - Converts None to "NULL" + Convert None to "NULL". - :param arg: - :return: arg or "NULL" + :param arg: value to stringify (Any) + :return: arg or "NULL" if arg is None (str). """ if arg is None: return "NULL" if isinstance(arg, numbers.Number): return arg + return str(arg) -def join(arr): +def join(arr: list) -> str: """ - Joins arrays, converting contents to strings. + Join arrays, converting contents to strings. - :param arr: - :return: joined array + :param arr: array (list) + :return: joined array (str). """ return ",".join(str(stringify_weird(x)) for x in arr) -def get_input_files(description): +def get_input_files(description: dict) -> dict: """ - Extracts input files from the description. + Extract input files from the description. - :param description: - :return: file list + :param description: job description (dict) + :return: file dictionary (dict). """ - logger.info("Extracting input files from job description") + logger.info("extracting input files from job description") files = {} if description['inFiles'] and description['inFiles'] != "NULL": in_files = split(description["inFiles"]) @@ -185,31 +199,32 @@ def get_input_files(description): scope = split(description.get("scopeIn"), min_len=length, fill_last=True) guids = split(description.get("GUID"), min_len=length, fill_last=True) - for i, f in enumerate(in_files): - if f is not None: - files[f] = { - "ddm_endpoint": ddm_endpoint[i], - "storage_element": destination_se[i], - "dispatch_dblock": dispatch_dblock[i], - "dispatch_dblock_token": dispatch_dblock_token[i], - "dataset": datasets[i], - "dblock": dblocks[i], - "dblock_token": dblock_tokens[i], - "size": size[i], - "checksum": c_sum[i], - 'scope': scope[i], - "guid": guids[i] + for counter, _file in enumerate(in_files): + if _file is not None: + files[_file] = { + "ddm_endpoint": ddm_endpoint[counter], + "storage_element": destination_se[counter], + "dispatch_dblock": dispatch_dblock[counter], + "dispatch_dblock_token": dispatch_dblock_token[counter], + "dataset": datasets[counter], + "dblock": dblocks[counter], + "dblock_token": dblock_tokens[counter], + "size": size[counter], + "checksum": c_sum[counter], + 'scope': scope[counter], + "guid": guids[counter] } + return files -def fix_log(description, files): +def fix_log(description: dict, files: dict) -> dict: """ - Fixes log file description in output files (changes GUID and scope). + Fix log file description in output files (change GUID and scope). - :param description: - :param files: output files - :return: fixed output files + :param description: job description (dict) + :param files: output files (dict) + :return: fixed output files (dict). 
""" logger.info("modifying log-specific values in a log file description") if description["logFile"] and description["logFile"] != "NULL": @@ -221,14 +236,14 @@ def fix_log(description, files): return files -def get_output_files(description): +def get_output_files(description: dict) -> dict: """ - Extracts output files from the description. + Extract output files from the description. - :param description: - :return: output files + :param description: job description (dict) + :return: output files (dict). """ - logger.info("Extracting output files in description") + logger.info("extracting output files in description") files = {} if description['outFiles'] and description['outFiles'] != "NULL": out_files = split(description["outFiles"]) @@ -242,23 +257,29 @@ def get_output_files(description): destination_dblock_token = split(description.get("destinationDBlockToken"), min_len=length) scope = split(description.get("scopeOut"), min_len=length, fill_last=True) - for i, f in enumerate(out_files): - if f is not None: - files[f] = { - "ddm_endpoint": ddm_endpoint[i], - "storage_element": destination_se[i], - "dispatch_dblock_token": dblock_token[i], - "destination_dblock_token": destination_dblock_token[i], - "dblock_token": dblock_tokens[i], - "dataset": datasets[i], - "dblock": dblocks[i], - "scope": scope[i] + for counter, _file in enumerate(out_files): + if _file is not None: + files[_file] = { + "ddm_endpoint": ddm_endpoint[counter], + "storage_element": destination_se[counter], + "dispatch_dblock_token": dblock_token[counter], + "destination_dblock_token": destination_dblock_token[counter], + "dblock_token": dblock_tokens[counter], + "dataset": datasets[counter], + "dblock": dblocks[counter], + "scope": scope[counter] } return fix_log(description, files) -def one_or_set(array): +def one_or_set(array: list) -> str: + """ + Return the only element of array if it's the only one. + + :param array: array (list) + :return: array[0] or array (str). + """ if len(array) < 1: return join(array) @@ -271,7 +292,9 @@ def one_or_set(array): return stringify_weird(zero) -class JobDescription(object): +class JobDescription(): + """Job description class.""" + __holder = None __key_aliases = { 'PandaID': 'jobid', # it is job id, not PanDA @@ -336,7 +359,8 @@ class JobDescription(object): output_files = None def __init__(self): - super(JobDescription, self).__init__() + """Job description constructor.""" + super().__init__() self.__key_back_aliases_from_forward = self.__key_back_aliases.copy() self.__key_reverse_aliases = {} @@ -344,24 +368,35 @@ def __init__(self): self.input_files = {} self.output_files = {} - for key in self.__key_aliases: - alias = self.__key_aliases[key] + for key, alias in self.__key_aliases.items(): self.__key_back_aliases_from_forward[alias] = key self.__key_aliases_snake[camel_to_snake(key)] = alias - def get_input_file_prop(self, key): + def get_input_file_prop(self, key: str) -> str: + """ + Get input file property. + + :param key: property name (str) + :return: property value (str). 
+ """ corresponding_key = self.__input_file_keys[key] ret = [] - for f in self.input_files: - ret.append(f if corresponding_key == '' else self.input_files[f][corresponding_key]) + for _file in self.input_files: + ret.append(_file if corresponding_key == '' else self.input_files[_file][corresponding_key]) if corresponding_key in self.__may_be_united: return one_or_set(ret) return join(ret) - def get_output_file_prop(self, key): + def get_output_file_prop(self, key: str) -> str: + """ + Get output file property. + + :param key: property name (str) + :return: property value (str). + """ log_file = self.log_file if key == 'logGUID': @@ -372,16 +407,21 @@ def get_output_file_prop(self, key): corresponding_key = self.__output_file_keys[key] ret = [] - for f in self.output_files: - if key != 'scopeOut' or f != log_file: - ret.append(f if corresponding_key == '' else self.output_files[f][corresponding_key]) + for _file in self.output_files: + if key != 'scopeOut' or _file != log_file: + ret.append(_file if corresponding_key == '' else self.output_files[_file][corresponding_key]) if corresponding_key in self.__may_be_united: return one_or_set(ret) return join(ret) - def load(self, new_desc): + def load(self, new_desc: Any): + """ + Load job description. + + :param new_desc: job description (Any). + """ if isinstance(new_desc, str): new_desc = json.loads(new_desc) @@ -417,7 +457,14 @@ def load(self, new_desc): self.__holder = new_desc - def to_json(self, decompose=False, **kwargs): + def to_json(self, decompose: bool = False, **kwargs: dict) -> str: + """ + Convert description to JSON. + + :param decompose: flag stating if the description should be decomposed (bool) + :param kwargs: additional arguments for json.dumps (dict) + :return: JSON representation of the description (str). + """ if decompose: prep = {} @@ -441,24 +488,34 @@ def to_json(self, decompose=False, **kwargs): return json.dumps(prep, **kwargs) - def get_description_parameter(self, key): + def get_description_parameter(self, key: str) -> str: + """ + Get description parameter. + + :param key: parameter name (str) + :return: parameter value (str) + :raises: AttributeError if parameter not found. + """ if self.__holder is not None: if key in self.__holder: return self.__holder[key] if key in self.__input_file_keys: - logger.warning(("Old key JobDescription.%s is used. Better to use JobDescription.input_files[][%s] to " - "access and manipulate this value.\n" % (key, self.__input_file_keys[key])) + self.get_traceback()) + logger.warning((f"Old key JobDescription.{key} is used. " + f"Better to use JobDescription.input_files[][{self.__input_file_keys[key]}] to " + "access and manipulate this value.\n") + self.get_traceback()) return self.get_input_file_prop(key) if key in self.__output_file_keys: - logger.warning(("Old key JobDescription.%s is used. Better to use JobDescription.output_files[][%s] to" - " access and manipulate this value.\n" % (key, self.__output_file_keys[key])) + self.get_traceback()) + logger.warning((f"Old key JobDescription.{key} is used. " + f"Better to use JobDescription.output_files[][{self.__output_file_keys[key]}] to " + "access and manipulate this value.\n") + self.get_traceback()) return self.get_output_file_prop(key) snake_key = camel_to_snake(key) if snake_key in self.__key_aliases_snake: - logger.warning(("Old key JobDescription.%s is used. 
Better to use JobDescription.%s to access and " - "manipulate this value.\n" % (key, self.__key_aliases_snake[snake_key])) + self.get_traceback()) + logger.warning((f"Old key JobDescription.{key} is used. " + f"Better to use JobDescription.{self.__key_aliases_snake[snake_key]} to access and " + "manipulate this value.\n") + self.get_traceback()) return stringify_weird(self.__holder[self.__key_aliases_snake[snake_key]]) if key in self.__soft_key_aliases: @@ -466,34 +523,43 @@ def get_description_parameter(self, key): raise AttributeError("Description parameter not found") - def set_description_parameter(self, key, value): + def set_description_parameter(self, key: str, value: Any) -> bool: + """ + Set description parameter. + + :param key: parameter name (str) + :param value: parameter value (Any) + :return: True if parameter was set, False otherwise (bool) + :raises: AttributeError if parameter is read-only. + """ if self.__holder is not None: if key in self.__holder: self.__holder[key] = value return True if key in self.__input_file_keys: - err = "Key JobDescription.%s is read-only\n" % key + err = f"Key JobDescription.{key} is read-only\n" if key == 'inFiles': err += "Use JobDescription.input_files to manipulate input files" else: - err += "Use JobDescription.input_files[][%s] to set up this parameter in files description" %\ - self.__input_file_keys[key] + err += f"Use JobDescription.input_files[][{self.__input_file_keys[key]}] to " \ + f"set up this parameter in files description" raise AttributeError(err) if key in self.__output_file_keys: - err = "Key JobDescription.%s is read-only\n" % key + err = f"Key JobDescription.{key} is read-only\n" if key == 'outFiles': err += "Use JobDescription.output_files to manipulate output files" else: - err += "Use JobDescription.output_files[][%s] to set up this parameter in files description" %\ - self.__output_file_keys[key] + err += f"Use JobDescription.output_files[][{self.__output_file_keys[key]}] to " \ + f"set up this parameter in files description" raise AttributeError(err) snake_key = camel_to_snake(key) if snake_key in self.__key_aliases_snake: - logger.warning(("Old key JobDescription.%s is used. Better to use JobDescription.%s to access and" - "manipulate this value.\n" % (key, self.__key_aliases_snake[snake_key])) + self.get_traceback()) + logger.warning((f"Old key JobDescription.{key} is used. Better to use " + f"JobDescription.{self.__key_aliases_snake[snake_key]} to access and " + "manipulate this value.\n") + self.get_traceback()) self.__holder[self.__key_aliases_snake[snake_key]] = parse_value(value) if key in self.__soft_key_aliases: @@ -501,7 +567,12 @@ def set_description_parameter(self, key, value): return False - def get_traceback(self): + def get_traceback(self) -> str: + """ + Get traceback. + + :return: traceback (str). 
+ """ tb = list(reversed(traceback.extract_stack())) tb_str = '\n' @@ -509,46 +580,48 @@ def get_traceback(self): if ii[0] < 3: continue # we don't need inner scopes of this and subsequent calls i = ii[1] - tb_str += '{file}:{line} (in {module}): {call}\n'.format(file=i[0], - line=i[1], - module=i[2], - call=i[3]) + tb_str += f'{i[0]}:{i[1]} (in {i[2]}): {i[3]}\n' thread = threading.currentThread() - return 'Traceback: (latest call first)' + tb_str + 'Thread: %s(%d)' % (thread.getName(), thread.ident) - def __getattr__(self, key): + return 'Traceback: (latest call first)' + tb_str + f'Thread: {thread.getName()}({thread.ident})' + + def __getattr__(self, key: str) -> str: """ - Reflection of description values into Job instance properties if they are not shadowed. - If there is no own property with corresponding name, the value of Description is used. - Params and return described in __getattr__ interface. + Return attribute value. + + :param key: attribute name (str) + :return: attribute value (str). """ try: return object.__getattribute__(self, key) except AttributeError: return self.get_description_parameter(key) - def __setattr__(self, key, value): + def __setattr__(self, key: str, value: Any) -> Any: """ - Reflection of description values into Job instance properties if they are not shadowed. - If there is no own property with corresponding name, the value of Description is set. - Params and return described in __setattr__ interface. + Set attribute value. + + :param key: attribute name (str) + :param value: attribute value (Any). + :return: attribute value (Any). """ try: object.__getattribute__(self, key) - return object.__setattr__(self, key, value) except AttributeError: if not self.set_description_parameter(key, value): return object.__setattr__(self, key, value) + return None + + return object.__setattr__(self, key, value) if __name__ == "__main__": - import sys logging.basicConfig() logger.setLevel(logging.DEBUG) jd = JobDescription() - with open(sys.argv[1], "r") as f: - contents = f.read() + with open(sys.argv[1], "r", encoding='utf-8') as _fil: + contents = _fil.read() jd.load(contents) diff --git a/pilot/resource/nersc.py b/pilot/resource/nersc.py index 577ee61b..852082c6 100644 --- a/pilot/resource/nersc.py +++ b/pilot/resource/nersc.py @@ -19,16 +19,22 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Functions for NERSC.""" + import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_setup(job=None): +def get_setup(job: Any = None) -> list: """ Return the resource specific setup. - :param job: optional job object. + :param job: optional job object (Any) :return: setup commands (list). """ + if not job: + logger.warning('job object not sent to get_setup') return [] diff --git a/pilot/resource/summit.py b/pilot/resource/summit.py index f8c391df..bceccc60 100644 --- a/pilot/resource/summit.py +++ b/pilot/resource/summit.py @@ -19,16 +19,22 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +"""Functions for Summit.""" + import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_setup(job=None): +def get_setup(job: Any = None) -> list: """ Return the resource specific setup. - :param job: optional job object. + :param job: optional job object (Any) :return: setup commands (list). 
""" + if not job: + logger.warning('job object not sent to get_setup') return [] diff --git a/pilot/resource/titan.py b/pilot/resource/titan.py index cd98648f..043bd9f0 100644 --- a/pilot/resource/titan.py +++ b/pilot/resource/titan.py @@ -20,49 +20,57 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 # - Danila Oleynik danila.oleynik@cern.ch, 2018 +"""Functions for Titan.""" + import logging import os import shutil import sys import time +from typing import Any -from .jobdescription import JobDescription # Python 2/3 from pilot.common.exception import FileHandlingFailure from pilot.util.config import config -from pilot.util.constants import PILOT_PRE_STAGEIN, PILOT_POST_STAGEIN -from pilot.util.filehandling import read_json, write_json, remove -#from pilot.util.mpi import get_ranks_info +from pilot.util.constants import ( + PILOT_PRE_STAGEIN, + PILOT_POST_STAGEIN +) +from pilot.util.filehandling import ( + read_json, + write_json, + remove +) from pilot.util.timing import add_to_pilot_timing +from .jobdescription import JobDescription logger = logging.getLogger(__name__) -def get_job(harvesterpath): +def get_job(harvesterpath: str) -> (JobDescription, int): """ - Return job description in dictionary and MPI rank (if applicable) + Return job description in dictionary form and MPI rank (if applicable). :param harvesterpath: path to config.Harvester.jobs_list_file (string). - :return: job object, rank (int). + :return: job object (JobDescription), rank (int). """ - rank = 0 job = None - logger.info("Going to read job definition from file") + logger.info("going to read job definition from file") pandaids_list_filename = os.path.join(harvesterpath, config.Harvester.jobs_list_file) if not os.path.isfile(pandaids_list_filename): - logger.info("File with PanDA IDs are missing. Nothing to execute.") + logger.info("file with PanDA IDs are missing. nothing to execute.") return job, rank harvesterpath = os.path.abspath(harvesterpath) #rank, max_ranks = get_ranks_info() pandaids = read_json(pandaids_list_filename) - logger.info('Got {0} job ids'.format(len(pandaids))) + logger.info(f'Got {len(pandaids)} job ids') pandaid = pandaids[rank] job_workdir = os.path.join(harvesterpath, str(pandaid)) - logger.info('Rank: {2} with job {0} will have work directory {1}'.format(pandaid, job_workdir, rank)) + logger.info(f'rank: {rank} with job {pandaid} will have work directory {job_workdir}') job_def_filename = os.path.join(job_workdir, config.Harvester.pandajob_file) jobs_dict = read_json(job_def_filename) @@ -73,14 +81,15 @@ def get_job(harvesterpath): return job, rank -def get_setup(job=None): +def get_setup(job: Any = None) -> list: """ Return the resource specific setup. - :param job: optional job object. + :param job: optional job object (Any) :return: setup commands (list). """ - + if not job: + logger.warning('job object not sent to get_setup') setup_commands = ['source /ccs/proj/csc108/athena_grid_env/setup.sh', 'source $MODULESHOME/init/bash', 'tmp_dirname=/tmp/scratch', @@ -104,13 +113,13 @@ def get_setup(job=None): return setup_commands -def set_job_workdir(job, path): +def set_job_workdir(job: Any, path: str) -> str: """ Point pilot to job working directory (job id). - :param job: job object. - :param path: local path to Harvester access point (string). - :return: job working directory (string). + :param job: job object (Any) + :param path: local path to Harvester access point (str) + :return: job working directory (str). 
""" work_dir = os.path.join(path, str(job.jobid)) os.chdir(work_dir) @@ -118,21 +127,21 @@ def set_job_workdir(job, path): return work_dir -def set_scratch_workdir(job, work_dir, args): +def set_scratch_workdir(job: Any, work_dir: str, args: dict) -> str: """ Copy input files and some db files to RAM disk. - :param job: job object. - :param work_dir: job working directory (permanent FS) (string). - :param args: args dictionary to collect timing metrics. - :return: job working directory in scratch (string). + :param job: job object (Any) + :param work_dir: job working directory (permanent FS) (str) + :param args: args dictionary to collect timing metrics (dict) + :return: job working directory in scratch (str) + :raises FileHandlingFailure: in case of IOError. """ - scratch_path = config.HPC.scratch job_scratch_dir = os.path.join(scratch_path, str(job.jobid)) for inp_file in job.input_files: job.input_files[inp_file]["scratch_path"] = job_scratch_dir - logger.debug("Job scratch path: {0}".format(job_scratch_dir)) + logger.debug(f"Job scratch path: {job_scratch_dir}") # special data, that should be preplaced in RAM disk dst_db_path = 'sqlite200/' dst_db_filename = 'ALLP200.db' @@ -155,38 +164,36 @@ def set_scratch_workdir(job, work_dir, args): shutil.copyfile(src_file, scratch_path + dst_db_path + dst_db_filename) logger.debug("") sql_cp_time = time.time() - t0 - logger.debug("Copy of sqlite files took: {0}".format(sql_cp_time)) + logger.debug(f"Copy of sqlite files took: {sql_cp_time}") logger.debug("Prepare dst and copy geomDB files") t0 = time.time() if not os.path.exists(scratch_path + dst_db_path_2): os.makedirs(scratch_path + dst_db_path_2) shutil.copyfile(src_file_2, scratch_path + dst_db_path_2 + dst_db_filename_2) geomdb_cp_time = time.time() - t0 - logger.debug("Copy of geomDB files took: {0} s".format(geomdb_cp_time)) - logger.debug("Prepare job scratch dir") + logger.debug(f"copy of geomDB files took: {geomdb_cp_time} s") + logger.debug("prepare job scratch dir") t0 = time.time() if not os.path.exists(job_scratch_dir): os.makedirs(job_scratch_dir) - logger.debug("Copy input file") + logger.debug("copy input file") for inp_file in job.input_files: - logger.debug("Copy: {0} to {1}".format(os.path.join(work_dir, inp_file), - job.input_files[inp_file]["scratch_path"])) shutil.copyfile(os.path.join(work_dir, inp_file), os.path.join(job.input_files[inp_file]["scratch_path"], inp_file)) input_cp_time = time.time() - t0 - logger.debug("Copy of input files took: {0} s".format(input_cp_time)) - except IOError as e: - logger.error("I/O error({0}): {1}".format(e.errno, e.strerror)) - logger.error("Copy to scratch failed, execution terminated': \n %s " % (sys.exc_info()[1])) + logger.debug(f"copy of input files took: {input_cp_time} s") + except IOError as exc: + logger.error(f"i/o error({exc.errno}): {exc.strerror}") + logger.error(f"copy to scratch failed, execution terminated': \n {sys.exc_info()[1]} ") raise FileHandlingFailure("Copy to RAM disk failed") finally: add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args) else: - logger.info('Scratch directory (%s) dos not exist' % scratch_path) + logger.info(f'Scratch directory ({scratch_path}) dos not exist') return work_dir os.chdir(job_scratch_dir) - logger.debug("Current directory: {0}".format(os.getcwd())) + logger.debug(f"Current directory: {os.getcwd()}") true_dir = '/ccs/proj/csc108/AtlasReleases/21.0.15/nfs_db_files' pseudo_dir = "./poolcond" os.symlink(true_dir, pseudo_dir) @@ -194,22 +201,21 @@ def 
set_scratch_workdir(job, work_dir, args): return job_scratch_dir -def process_jobreport(payload_report_file, job_scratch_path, job_communication_point): +def process_jobreport(payload_report_file: str, job_scratch_path: str, job_communication_point: str): """ Copy job report file to make it accessible by Harvester. Shrink job report file. - :param payload_report_file: name of job report (string). - :param job_scratch_path: path to scratch directory (string). - :param job_communication_point: path to updated job report accessible by Harvester (string). + :param payload_report_file: name of job report (str) + :param job_scratch_path: path to scratch directory (str) + :param job_communication_point: path to updated job report accessible by Harvester (str) :raises FileHandlingFailure: in case of IOError. """ - src_file = os.path.join(job_scratch_path, payload_report_file) dst_file = os.path.join(job_communication_point, payload_report_file) try: logger.info( - "Copy of payload report [{0}] to access point: {1}".format(payload_report_file, job_communication_point)) + f"copy of payload report [{payload_report_file}] to access point: {job_communication_point}") # shrink jobReport job_report = read_json(src_file) if 'executor' in job_report: @@ -220,18 +226,17 @@ def process_jobreport(payload_report_file, job_scratch_path, job_communication_p write_json(dst_file, job_report) except IOError: - logger.error("Job report copy failed, execution terminated': \n %s " % (sys.exc_info()[1])) - raise FileHandlingFailure("Job report copy from RAM failed") + logger.error(f"job report copy failed, execution terminated': \n {sys.exc_info()[1]} ") + raise FileHandlingFailure("job report copy from RAM failed") -def postprocess_workdir(workdir): +def postprocess_workdir(workdir: str): """ Post-processing of working directory. Unlink paths. - :param workdir: path to directory to be processed (string). + :param workdir: path to directory to be processed (str) :raises FileHandlingFailure: in case of IOError. """ - pseudo_dir = "poolcond" try: if os.path.exists(pseudo_dir): @@ -240,15 +245,14 @@ def postprocess_workdir(workdir): raise FileHandlingFailure("Post processing of working directory failed") -def command_fix(command, job_scratch_dir): +def command_fix(command: str, job_scratch_dir: str) -> str: """ Modification of payload parameters, to be executed on Titan on RAM disk. Some cleanup. - :param command: payload command (string). - :param job_scratch_dir: local path to input files (string). - :return: updated/fixed payload command (string). + :param command: payload command (str) + :param job_scratch_dir: local path to input files (str) + :return: updated/fixed payload command (str). """ - subs_a = command.split() for i in range(len(subs_a)): if i > 0: diff --git a/pilot/scripts/cpu_arch.py b/pilot/scripts/cpu_arch.py index 48d4d93b..144e11e5 100755 --- a/pilot/scripts/cpu_arch.py +++ b/pilot/scripts/cpu_arch.py @@ -20,6 +20,8 @@ # - Alaettin Serhan Mete, alaettin.serhan.mete@cern.ch, 2023 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 +"""Script for reporting CPU architecture.""" + import argparse import logging import re @@ -32,12 +34,14 @@ must_not_v2 = [] -def get_flags_cpuinfo(): +def get_flags_cpuinfo() -> dict: """ - Get the CPU (model) name, number of cores of the corresponding CPU and the CPU flags from the /proc/cpuinfo + Get the CPU (model) name, number of cores of the corresponding CPU and the CPU flags from the /proc/cpuinfo. 
+ + :return: dictionary containing the CPU (model) name, number of cores of the corresponding CPU and the CPU flags (dict). """ cpu, cpu_core, flags = None, None, None - with open('/proc/cpuinfo', 'r') as fiile: + with open('/proc/cpuinfo', 'r', encoding='utf-8') as fiile: for line in fiile.readlines(): if 'model name' in line: cpu = line.split(':')[-1].strip() @@ -48,13 +52,18 @@ def get_flags_cpuinfo(): if all([cpu, cpu_core, flags]): return {"cpu": cpu, "cpu_core": cpu_core, "flags": flags} + return {} + -def get_flags_pilotlog(pilotlogname): +def get_flags_pilotlog(pilotlogname: str) -> dict: """ - Get the site/queue name, the CPU (model) name, number of cores of the corresponding CPU and the CPU flags from the downloaded pilotlog + Get the site/queue name, the CPU (model) name, number of cores of the corresponding CPU and the CPU flags from the downloaded pilotlog. + + :param pilotlogname: full path to the pilotlog (str) + :return: dictionary containing the site/queue name, the CPU (model) name, number of cores of the corresponding CPU and the CPU flags (dict). """ site, cpu, cpu_core, flags = None, None, None, None - with open(pilotlogname, 'r') as fiile: + with open(pilotlogname, 'r', encoding='utf-8') as fiile: for line in fiile.readlines(): if 'PANDA_RESOURCE' in line: site = line.split('=')[-1].strip() @@ -67,10 +76,13 @@ def get_flags_pilotlog(pilotlogname): if all([site, cpu, cpu_core, flags]): return {"site": site, "cpu": cpu, "cpu_core": cpu_core, "flags": flags} + return {} + def set_naive(): """ - Make a decision on the CPU architecture based on the simplified lists (must_'s) of flags + Make a decision on the CPU architecture based on the simplified lists (must_'s) of flags. + The must_not_'s have been left blank; these could be filled in if need be. """ global must_v4 @@ -92,13 +104,16 @@ def set_naive(): def set_gcc(): """ - Make a decision on the CPU architecture based on the modified lists (must_'s) of flags from gcc: LAHF_SAHF --> LAHF_LM; LZCNT --> ABM; removal of SSE3 + Make a decision on the CPU architecture based on the modified lists (must_'s) of flags from gcc. + + LAHF_SAHF --> LAHF_LM; LZCNT --> ABM; removal of SSE3. + References: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/testsuite/gcc.target/i386/x86-64-v4.c;hb=324bec558e95584e8c1997575ae9d75978af59f1 https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/testsuite/gcc.target/i386/x86-64-v3.c;hb=324bec558e95584e8c1997575ae9d75978af59f1 https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/testsuite/gcc.target/i386/x86-64-v2.c;hb=324bec558e95584e8c1997575ae9d75978af59f1 - The must_not_'s have been left blank, these could be filled if need be + The must_not_'s have been left blank; these could be filled in if need be. """ global must_v4 global must_not_v4 @@ -119,49 +134,61 @@ def set_gcc(): must_not_v2 = [] -def check_flags(must, must_not, flags): +def check_flags(must: list, must_not: list, flags: list) -> bool: """ - Matching of the actual CPU flags w.r.t. the lists of flags defined for deciding on architecture + Match the actual CPU flags w.r.t. the lists of flags defined for deciding on architecture. + + :param must: list of flags that must be present (list) + :param must_not: list of flags that must not be present (list) + :param flags: list of actual flags (list) + :return: True if the check failed, i.e. the actual flags do not satisfy the must/must_not lists, False otherwise (bool).
""" failed = False for flag in must: - if not any([re.match(flag, test_flag, re.IGNORECASE) for test_flag in flags]): - logging.debug("Missing must-have: {0}".format(flag)) + if not any(re.match(flag, test_flag, re.IGNORECASE) for test_flag in flags): + logging.debug(f"Missing must-have: {flag}") failed = True + for flag in must_not: - if not any([re.match(flag, test_flag, re.IGNORECASE) for test_flag in flags]): - logging.debug("Present must-not-have: {0}".format(flag)) + if not any(re.match(flag, test_flag, re.IGNORECASE) for test_flag in flags): + logging.debug(f"Present must-not-have: {flag}") failed = True + return failed -def all_version_checks(flag_string, name): +def all_version_checks(flag_string: str, name: str) -> str: """ - Architecture is assigned to the CPU based on the check_flags() function + Check the CPU flags against the lists of flags for all versions of the CPU architecture. + + Architecture is assigned to the CPU based on the check_flags() function. + + :param flag_string: string containing the CPU flags (str) + :param name: name of the CPU (str) + :return: architecture of the CPU (str). """ flag_list = flag_string.split() - logging.debug("-------Checking V4 for {0}--------".format(name)) + logging.debug(f"-------Checking V4 for {name}--------") failed_v4 = check_flags(must_v4, must_not_v4, flag_list) if not failed_v4: return "x86-64-v4" - else: - pass - logging.debug("-------Checking V3 for {0}--------".format(name)) + + logging.debug(f"-------Checking V3 for {name}--------") failed_v3 = check_flags(must_v3, must_not_v3, flag_list) if not failed_v3: return "x86-64-v3" - else: - pass - logging.debug("-------Checking V2 for {0}--------".format(name)) + + logging.debug(f"-------Checking V2 for {name}--------") failed_v2 = check_flags(must_v2, must_not_v2, flag_list) if not failed_v2: return "x86-64-v2" - else: - pass - logging.debug("-------Defaulting {0} to V1--------".format(name)) + + logging.debug(f"-------Defaulting {name} to V1--------") if failed_v2 and failed_v3 and failed_v4: return "x86-64-v1" + return "" + if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/pilot/scripts/data_api_stagein.py b/pilot/scripts/data_api_stagein.py index 87cc0906..c5fda1de 100644 --- a/pilot/scripts/data_api_stagein.py +++ b/pilot/scripts/data_api_stagein.py @@ -19,7 +19,7 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 -# This script shows how to use the Data API stage-in client to download a file from storage +"""This script shows how to use the Data API stage-in client to download a file from storage.""" from pilot.api import data from pilot.info import InfoService, FileSpec, infosys diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 542e5c2b..b6f20ad1 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -18,17 +18,22 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +"""Script for remote file open verification.""" + import argparse import functools import os import logging import queue -import ROOT import signal import subprocess +import sys import threading import traceback from collections import namedtuple +from typing import Any + +import ROOT from pilot.util.config import config from pilot.util.filehandling import ( @@ -43,13 +48,12 @@ logger = logging.getLogger(__name__) -def get_args(): +def get_args() -> argparse.Namespace: """ Return the args from the arg parser. :return: args (arg parser object). 
""" - arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', @@ -81,15 +85,12 @@ def get_args(): return arg_parser.parse_args() -def message(msg): +def message(msg: str): """ Print message to stdout or to log. - Note: not using lazy formatting. - :param msg: message (string). - :return: + :param msg: message (str). """ - if logger: logger.info(msg) # make sure that stdout buffer gets flushed - in case of time-out exceptions @@ -98,129 +99,125 @@ def message(msg): print(msg, flush=True) # always write message to instant log file (message might otherwise get lost in case of time-outs) - with open(config.Pilot.remotefileverification_instant, 'a') as _file: + with open(config.Pilot.remotefileverification_instant, 'a', encoding='utf-8') as _file: _file.write(msg + '\n') -def get_file_lists(turls): +def get_file_lists(turls_string: str) -> dict: """ Return a dictionary with the turls. + Format: {'turls': } - :param turls: comma separated turls (string) - :return: turls dictionary. + :param turls_string: comma separated turls (str) + :return: turls dictionary (dict). """ - _turls = [] try: - _turls = turls.split(',') - except Exception as error: - message("exception caught: %s" % error) + _turls = turls_string.split(',') + except Exception as _error: + message(f"exception caught: {_error}") return {'turls': _turls} -def try_open_file(turl, queues): +# pylint: disable=useless-param-doc +def try_open_file(turl_str: str, _queues: namedtuple): """ Attempt to open a remote file. + Successfully opened turls will be put in the queues.opened queue. Unsuccessful turls will be placed in the queues.unopened queue. - :param turl: turl (string). - :param queues: queues collection. - :return: + :param turl_str: turl (str) + :param _queues: Namedtuple containing queues for opened and unopened turls. + Should have 'opened' and 'unopened' attributes to store respective turls. """ - turl_opened = False _timeout = 30 * 1000 # 30 s per file try: _ = ROOT.TFile.SetOpenTimeout(_timeout) - # message("internal TFile.Open() time-out set to %d ms" % _timeout) - message('opening %s' % turl) - in_file = ROOT.TFile.Open(turl) + # message(f"internal TFile.Open() time-out set to {_timeout} ms") + message(f'opening {turl_str}') + in_file = ROOT.TFile.Open(turl_str) except Exception as exc: - message('caught exception: %s' % exc) + message(f'caught exception: {exc}') else: if in_file and in_file.IsOpen(): in_file.Close() turl_opened = True - message('closed %s' % turl) - queues.opened.put(turl) if turl_opened else queues.unopened.put(turl) - queues.result.put(turl) + message(f'closed {turl_str}') + if turl_opened: + _queues.opened.put(turl_str) + else: + _queues.unopened.put(turl_str) + _queues.result.put(turl_str) -def spawn_file_open_thread(queues, file_list): +# pylint: disable=useless-param-doc +def spawn_file_open_thread(_queues: Any, file_list: list) -> threading.Thread: """ - Spawn a thread for the try_open_file(). + Spawn a thread for the try_open_file().. - :param queues: queue collection. - :param file_list: files to open (list). - :return: thread. + :param _queues: queue collection (Any) + :param file_list: files to open (list) + :return: thread (threading.Thread). 
""" - - thread = None + _thread = None try: - turl = file_list.pop(0) + _turl = file_list.pop(0) except IndexError: pass else: # create and start thread for the current turl - thread = threading.Thread(target=try_open_file, args=(turl, queues)) - thread.daemon = True - thread.start() + _thread = threading.Thread(target=try_open_file, args=(_turl, _queues)) + _thread.daemon = True + _thread.start() - return thread + return _thread -def register_signals(signals, args): +def register_signals(signals: list, _args: Any): """ Register kill signals for intercept function. - :param signals: list of signals. - :param args: pilot args. - :return: + :param signals: list of signals (list) + :param _args: pilot arguments object (Any). """ - for sig in signals: - signal.signal(sig, functools.partial(interrupt, args)) + signal.signal(sig, functools.partial(interrupt, _args)) -def interrupt(args, signum, frame): +def interrupt(_args: Any, signum: Any, frame: Any): """ + Receive and handle kill signals. + Interrupt function on the receiving end of kill signals. This function is forwarded any incoming signals (SIGINT, SIGTERM, etc) and will set abort_job which instructs the threads to abort the job. - :param args: pilot arguments. + :param _args: pilot arguments object (Any). :param signum: signal. :param frame: stack/execution frame pointing to the frame that was interrupted by the signal. - :return: """ - - if args.signal: + if _args.signal: logger.warning('process already being killed') return - try: - sig = [v for v, k in signal.__dict__.iteritems() if k == signum][0] - except Exception: - sig = [v for v, k in list(signal.__dict__.items()) if k == signum][0] - logger.warning(f'caught signal: {sig} in FRAME=\n%s', '\n'.join(traceback.format_stack(frame))) + sig = [v for v, k in list(signal.__dict__.items()) if k == signum][0] + tmp = '\n'.join(traceback.format_stack(frame)) + logger.warning(f'caught signal: {sig} in FRAME=\n{tmp}') cmd = f'ps aux | grep {os.getpid()}' out = subprocess.getoutput(cmd) logger.info(f'{cmd}:\n{out}') logger.warning(f'will terminate pid={os.getpid()}') logging.shutdown() - args.signal = sig + _args.signal = sig kill_processes(os.getpid()) -if __name__ == '__main__': - """ - Main function of the remote file open script. 
- """ - +if __name__ == '__main__': # noqa: C901 # get the args from the arg parser args = get_args() args.debug = True @@ -230,12 +227,12 @@ def interrupt(args, signum, frame): try: logname = config.Pilot.remotefileverification_log except Exception as error: - print("caught exception: %s (skipping remote file open verification)" % error) - exit(1) + print(f"caught exception: {error} (skipping remote file open verification)") + sys.exit(1) else: if not logname: print("remote file open verification not desired") - exit(0) + sys.exit(0) establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=logname) logger = logging.getLogger(__name__) @@ -254,7 +251,7 @@ def interrupt(args, signum, frame): queues.unopened = queue.Queue() threads = [] - message('will attempt to open %d file(s) using %d thread(s)' % (len(turls), args.nthreads)) + message(f'will attempt to open {len(turls)} file(s) using {args.nthreads} thread(s)') if turls: # make N calls to begin with @@ -271,14 +268,20 @@ def interrupt(args, signum, frame): message("reached time-out") break except Exception as error: - message("caught exception: %s" % error) + message(f"caught exception: {error}") thread = spawn_file_open_thread(queues, turls) if thread: threads.append(thread) # wait until all threads have finished - [_thread.join() for _thread in threads] + try: + for thread in threads: + thread.join() + except Exception as exc: + logger.warning(f"exception caught while handling threads: {exc}") + finally: + logger.info('all remote file open threads have been joined') opened_turls = list(queues.opened.queue) opened_turls.sort() @@ -296,4 +299,4 @@ def interrupt(args, signum, frame): message('no TURLs to verify') message('file remote open script has finished') - exit(0) + sys.exit(0) diff --git a/pilot/scripts/rucio_api_download.py b/pilot/scripts/rucio_api_download.py index 2c3f64fc..7473b73e 100644 --- a/pilot/scripts/rucio_api_download.py +++ b/pilot/scripts/rucio_api_download.py @@ -19,12 +19,13 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 -# This script demonstrates how to download a file using the Rucio download client. +"""This script demonstrates how to download a file using the Rucio download client.""" + # Note: Rucio needs to be setup with 'lsetup rucio'. try: from rucio.client.downloadclient import DownloadClient -except Exception: +except ImportError: print("Rucio client has not been setup, please run \'lsetup rucio\' first") else: f_ific = {'did_scope': 'mc16_13TeV', 'did': 'mc16_13TeV:EVNT.16337107._000147.pool.root.1', diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index ce958993..6fc6f1fc 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -19,9 +19,13 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +"""This script is executed by the pilot in a container to perform stage-in of input files.""" + import argparse +import logging import os import re +import sys from pilot.api.data import StageInClient from pilot.api.es_data import StageInESClient @@ -39,8 +43,6 @@ from pilot.util.loggingsupport import establish_logging from pilot.util.tracereport import TraceReport -import logging - # error codes GENERAL_ERROR = 1 NO_QUEUENAME = 2 @@ -56,13 +58,12 @@ TRANSFER_ERROR = 12 -def get_args(): +def get_args() -> argparse.Namespace: """ Return the args from the arg parser. :return: args (arg parser object). 
""" - arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', @@ -198,79 +199,43 @@ def get_args(): return arg_parser.parse_args() -def str2bool(_str): - """ Helper function to convert string to bool """ +# pylint: disable=useless-param-doc +def str2bool(_str: str) -> bool: + """ + Convert string to bool. + :param _str: string to be converted (str) + :return: boolean (bool) + :raise: argparse.ArgumentTypeError. + """ if isinstance(_str, bool): return _str - if _str.lower() in ('yes', 'true', 't', 'y', '1'): + if _str.lower() in {'yes', 'true', 't', 'y', '1'}: return True - elif _str.lower() in ('no', 'false', 'f', 'n', '0'): + if _str.lower() in {'no', 'false', 'f', 'n', '0'}: return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') + raise argparse.ArgumentTypeError('Boolean value expected.') -def verify_args(): - """ - Make sure required arguments are set, and if they are not then set them. - (deprecated) - :return: internal error code (int). +# logger is set in the main function +# pylint: disable=used-before-assignment +def message(msg: str): """ + Print message to stdout or to log. - ret = 0 - if not args.workdir: - args.workdir = os.getcwd() - - elif not args.queuename: - message('queue name not set, cannot initialize InfoService') - ret = NO_QUEUENAME - - elif not args.scopes: - message('scopes not set') - ret = NO_SCOPES - - elif not args.lfns: - message('LFNs not set') - ret = NO_LFNS - - elif not args.eventtype: - message('No event type provided') - ret = NO_EVENTTYPE - - elif not args.localsite: - message('No local site provided') - ret = NO_LOCALSITE - - elif not args.remotesite: - message('No remote site provided') - ret = NO_REMOTESITE - - elif not args.produserid: - message('No produserid provided') - ret = NO_PRODUSERID - - elif not args.jobid: - message('No jobid provided') - ret = NO_JOBID - - elif not args.taskid: - message('No taskid provided') - ret = NO_TASKID - - elif not args.jobdefinitionid: - message('No jobdefinitionid provided') - ret = NO_JOBDEFINITIONID - - return ret - - -def message(msg): + :param msg: message (str). + """ print(msg) if not logger else logger.info(msg) -def str_to_int_list(_list): +def str_to_int_list(_list: list) -> list: + """ + Convert list of strings to list of integers. + + :param _list: list of strings (list) + :return: list of integers (list). + """ _new_list = [] for val in _list: try: @@ -278,16 +243,47 @@ def str_to_int_list(_list): except (ValueError, TypeError): _val = None _new_list.append(_val) + return _new_list -def str_to_bool_list(_list): +def str_to_bool_list(_list: list) -> list: + """ + Convert list of strings to list of booleans. + + :param _list: list of strings (list) + :return: list of booleans (list). + """ changes = {"True": True, "False": False, "None": None, "NULL": None} + return [changes.get(x, x) for x in _list] -def get_file_lists(lfns, scopes, filesizes, checksums, allowlans, allowwans, directaccesslans, directaccesswans, istars, - accessmodes, storagetokens, guids): +def get_file_lists(lfns: str, scopes: str, filesizes: str, checksums: str, allowlans: str, allowwans: str, + directaccesslans: str, directaccesswans: str, istars: str, accessmodes: str, + storagetokens: str, guids: str) -> dict: + """ + Return a dictionary with the file lists. 
+ + Format: {'lfns': , 'scopes': , 'filesizes': , 'checksums': , + 'allowlans': , 'allowwans': , 'directaccesslans': , + 'directaccesswans': , 'istars': , 'accessmodes': , + 'storagetokens': , 'guids': } + + :param lfns: comma separated lfns (str) + :param scopes: comma separated scopes (str) + :param filesizes: comma separated filesizes (str) + :param checksums: comma separated checksums (str) + :param allowlans: comma separated allowlans (str) + :param allowwans: comma separated allowwans (str) + :param directaccesslans: comma separated directaccesslans (str) + :param directaccesswans: comma separated directaccesswans (str) + :param istars: comma separated istars (str) + :param accessmodes: comma separated accessmodes (str) + :param storagetokens: comma separated storagetokens (str) + :param guids: comma separated guids (str) + :return: file lists dictionary (dict). + """ _lfns = [] _scopes = [] _filesizes = [] @@ -314,52 +310,65 @@ def get_file_lists(lfns, scopes, filesizes, checksums, allowlans, allowwans, dir _storagetokens = storagetokens.split(',') _guids = guids.split(',') except (NameError, TypeError, ValueError) as error: - message("exception caught: %s" % error) + message(f"exception caught: {error}") file_list_dictionary = {'lfns': _lfns, 'scopes': _scopes, 'filesizes': _filesizes, 'checksums': _checksums, 'allowlans': _allowlans, 'allowwans': _allowwans, 'directaccesslans': _directaccesslans, 'directaccesswans': _directaccesswans, 'istars': _istars, 'accessmodes': _accessmodes, 'storagetokens': _storagetokens, 'guids': _guids} + return file_list_dictionary class Job: - """ - A minimal implementation of the Pilot Job class with data members necessary for the trace report only. - """ + """A minimal implementation of the Pilot Job class with data members necessary for the trace report only.""" produserid = "" jobid = "" taskid = "" jobdefinitionid = "" - def __init__(self, produserid="", jobid="", taskid="", jobdefinitionid=""): + def __init__(self, produserid: str = "", jobid: str = "", taskid: str = "", jobdefinitionid: str = ""): + """ + Initialize the Job class. + + :param produserid: produserid (str) + :param jobid: jobid (str) + :param taskid: taskid (str) + :param jobdefinitionid: jobdefinitionid (str). + """ self.produserid = produserid.replace('%20', ' ') self.jobid = jobid self.taskid = taskid self.jobdefinitionid = jobdefinitionid -def add_to_dictionary(dictionary, key, value1, value2, value3, value4): +def add_to_dictionary(dictionary: dict, key: str, value1: str, value2: str, value3: str, value4: str) -> dict: """ Add key: [value1, value2, ..] to dictionary. + In practice; lfn: [status, status_code, turl, DDM endpoint]. - :param dictionary: dictionary to be updated. - :param key: lfn key to be added (string). - :param value1: status to be added to list belonging to key (string). - :param value2: status_code to be added to list belonging to key (string). - :param value3: turl (string). - :param value4: DDM endpoint (string). - :return: updated dictionary. + :param dictionary: dictionary to be updated (dict) + :param key: lfn key to be added (str) + :param value1: status to be added to list belonging to key (str) + :param value2: status_code to be added to list belonging to key (str) + :param value3: turl (str) + :param value4: DDM endpoint (str) + :return: updated dictionary (dict). 
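The per-LFN bookkeeping that add_to_dictionary() supports ends up as a small status dictionary written to JSON for the pilot to read back. Roughly, with illustrative values and file name (the script itself writes via write_json() to a config-defined path):

    import json

    file_dictionary = {}

    def add_to_dictionary(dictionary, key, value1, value2, value3, value4):
        # lfn -> [status, status_code, turl, DDM endpoint]
        dictionary[key] = [value1, value2, value3, value4]
        return dictionary

    add_to_dictionary(file_dictionary, 'EVNT.12345._000001.pool.root.1',
                      'transferred', 0, 'root://some.se//path/file', 'SOME_DATADISK')
    with open('stagein_status.json', 'w', encoding='utf-8') as fh:  # illustrative file name
        json.dump(file_dictionary, fh)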
""" - dictionary[key] = [value1, value2, value3, value4] + return dictionary -def extract_error_info(errc): +def extract_error_info(errc: str) -> (int, str): + """ + Extract error code and message from the error string. + :param errc: error string (str) + :return: error code (int), error message (str). + """ error_code = 0 error_message = "" @@ -376,10 +385,6 @@ def extract_error_info(errc): if __name__ == '__main__': - """ - Main function of the stage-in script. - """ - # get the args from the arg parser args = get_args() args.debug = True @@ -388,16 +393,12 @@ def extract_error_info(errc): establish_logging(debug=args.debug, nopilotlog=args.nopilotlog, filename=config.Pilot.stageinlog) logger = logging.getLogger(__name__) - #ret = verify_args() - #if ret: - # exit(ret) - # get the file info try: replica_dictionary = read_json(os.path.join(args.workdir, args.replicadictionary)) except ConversionFailure as exc: - message('exception caught reading json: %s' % exc) - exit(1) + message(f'exception caught reading json: {exc}') + sys.exit(1) # file_list_dictionary = get_file_lists(args.lfns, args.scopes, args.filesizes, args.checksums, args.allowlans, # args.allowwans, args.directaccesslans, args.directaccesswans, args.istars, @@ -437,8 +438,9 @@ def extract_error_info(errc): else: client = StageInClient(infoservice, logger=logger, trace_report=trace_report, workdir=args.workdir) activity = 'pr' - kwargs = dict(workdir=args.workdir, cwd=args.workdir, usecontainer=False, use_pcache=args.usepcache, use_bulk=False, - use_vp=args.usevp, input_dir=args.inputdir, catchall=args.catchall, rucio_host=args.rucio_host) + kwargs = {"workdir": args.workdir, "cwd": args.workdir, "usecontainer": False, "use_pcache": args.usepcache, + "use_bulk": False, "use_vp": args.usevp, "input_dir": args.inputdir, "catchall": args.catchall, + "rucio_host": args.rucio_host} xfiles = [] for lfn in replica_dictionary: files = [{'scope': replica_dictionary[lfn]['scope'], @@ -475,7 +477,7 @@ def extract_error_info(errc): for fspec in xfiles: add_to_dictionary(file_dictionary, fspec.lfn, fspec.status, fspec.status_code, fspec.turl, fspec.ddmendpoint) status = fspec.status if fspec.status else "(not transferred)" - message(" -- lfn=%s, ddmendpoint=%s, status_code=%s, status=%s" % (fspec.lfn, fspec.ddmendpoint, fspec.status_code, status)) + message(f" -- lfn={fspec.lfn}, ddmendpoint={fspec.ddmendpoint}, status_code={fspec.status_code}, status={status}") # add error info, if any if err: @@ -483,8 +485,8 @@ def extract_error_info(errc): add_to_dictionary(file_dictionary, 'error', err, errcode, None, None) write_json(os.path.join(args.workdir, config.Container.stagein_status_dictionary), file_dictionary) if err: - message("containerised file transfers failed: %s" % err) - exit(TRANSFER_ERROR) + message(f"containerised file transfers failed: {err}") + sys.exit(TRANSFER_ERROR) message("containerised file transfers finished") - exit(0) + sys.exit(0) diff --git a/pilot/scripts/stageout.py b/pilot/scripts/stageout.py index 3e31520c..e04b8f3e 100644 --- a/pilot/scripts/stageout.py +++ b/pilot/scripts/stageout.py @@ -19,9 +19,13 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +"""This script is executed by the pilot in a container to perform stage-out of output files.""" + import argparse +import logging import os import re +import sys from pilot.api.data import StageOutClient from pilot.common.errorcodes import ErrorCodes @@ -36,8 +40,6 @@ from pilot.util.loggingsupport import establish_logging from pilot.util.tracereport 
import TraceReport -import logging - errors = ErrorCodes() # error codes @@ -58,13 +60,12 @@ TRANSFER_ERROR = 15 -def get_args(): +def get_args() -> argparse.Namespace: """ Return the args from the arg parser. - :return: args (arg parser object). + :return: args (argparse.Namespace). """ - arg_parser = argparse.ArgumentParser() arg_parser.add_argument('-d', @@ -163,138 +164,110 @@ def get_args(): return arg_parser.parse_args() -def str2bool(v): - """ Helper function to convert string to bool """ +# pylint: disable=useless-param-doc +def str2bool(_str: str) -> bool: + """ + Convert string to bool. - if isinstance(v, bool): - return v - if v.lower() in ('yes', 'true', 't', 'y', '1'): + :param _str: string to be converted (str) + :return: boolean (bool) + :raise: argparse.ArgumentTypeError. + """ + if isinstance(_str, bool): + return _str + if _str.lower() in {'yes', 'true', 't', 'y', '1'}: return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): + if _str.lower() in {'no', 'false', 'f', 'n', '0'}: return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - - -def verify_args(): - """ - Make sure required arguments are set, and if they are not then set them. - (deprecated) - :return: - """ - if not args.workdir: - args.workdir = os.getcwd() - - if not args.queuename: - message('queue name not set, cannot initialize InfoService') - return NO_QUEUENAME - - if not args.scopes: - message('scopes not set') - return NO_SCOPES - - if not args.lfns: - message('LFNs not set') - return NO_LFNS - - if not args.eventtype: - message('No event type provided') - return NO_EVENTTYPE - - if not args.localsite: - message('No local site provided') - return NO_LOCALSITE - - if not args.remotesite: - message('No remote site provided') - return NO_REMOTESITE - - if not args.produserid: - message('No produserid provided') - return NO_PRODUSERID - - if not args.jobid: - message('No jobid provided') - return NO_JOBID - - if not args.ddmendpoints: - message('No ddmendpoint provided') - return NO_DDMENDPOINTS - if not args.datasets: - message('No dataset provided') - return NO_DATASETS + raise argparse.ArgumentTypeError('Boolean value expected.') - if not args.guids: - message('No GUIDs provided') - return NO_GUIDS - - if not args.taskid: - message('No taskid provided') - return NO_TASKID - - if not args.jobdefinitionid: - message('No jobdefinitionid provided') - return NO_JOBDEFINITIONID - - return 0 +# logger is set in the main function +# pylint: disable=used-before-assignment +def message(msg: str): + """ + Print message to stdout or to log. -def message(msg): + :param msg: message (str). + """ print(msg) if not logger else logger.info(msg) -def get_file_lists(lfns, scopes, ddmendpoints, datasets, guids): - return lfns.split(','), scopes.split(','), ddmendpoints.split(','), datasets.split(','), guids.split(',') +def get_file_lists(_lfns: str, _scopes: str, _ddmendpoints: str, _datasets: str, _guids: str) -> tuple: + """ + Return the file lists. + + :param _lfns: comma separated list of lfns (str) + :param _scopes: comma separated list of scopes (str) + :param _ddmendpoints: comma separated list of ddmendpoints (str) + :param _datasets: comma separated list of datasets (str) + :param _guids: comma separated list of guids (str) + :return: tuple of lists (lfns, scopes, ddmendpoints, datasets, guids). 
+ """ + return _lfns.split(','), _scopes.split(','), _ddmendpoints.split(','), _datasets.split(','), _guids.split(',') class Job: - """ - A minimal implementation of the Pilot Job class with data members necessary for the trace report only. - """ + """A minimal implementation of the Pilot Job class with data members necessary for the trace report only.""" produserid = "" jobid = "" taskid = "" jobdefinitionid = "" - def __init__(self, produserid="", jobid="", taskid="", jobdefinitionid=""): + def __init__(self, produserid: str = "", jobid: str = "", taskid: str = "", jobdefinitionid: str = ""): + """ + Initialize the Job object. + + :param produserid: produserid (str) + :param jobid: jobid (str) + :param taskid: taskid (str) + :param jobdefinitionid: jobdefinitionid (str) + """ self.produserid = produserid.replace('%20', ' ') self.jobid = jobid self.taskid = taskid self.jobdefinitionid = jobdefinitionid -def add_to_dictionary(dictionary, key, value1, value2, value3, value4, value5, value6): +def add_to_dictionary(dictionary: dict, key: str, value1: str, value2: str, value3: str, value4: str, value5: str, + value6: str) -> dict: """ Add key: [value1, value2, value3, value4, value5, value6] to dictionary. + In practice; lfn: [status, status_code, surl, turl, checksum, fsize]. - :param dictionary: dictionary to be updated. - :param key: lfn key to be added (string). - :param value1: status to be added to list belonging to key (string). - :param value2: status_code to be added to list belonging to key (string). - :param value3: surl to be added to list belonging to key (string). - :param value4: turl to be added to list belonging to key (string). - :param value5: checksum to be added to list belonging to key (string). - :param value6: fsize to be added to list belonging to key (string). - :return: updated dictionary. + :param dictionary: dictionary to be updated (dict) + :param key: lfn key to be added (str) + :param value1: status to be added to list belonging to key (str) + :param value2: status_code to be added to list belonging to key (str) + :param value3: surl to be added to list belonging to key (str) + :param value4: turl to be added to list belonging to key (str) + :param value5: checksum to be added to list belonging to key (str) + :param value6: fsize to be added to list belonging to key (str) + :return: updated dictionary (dict). """ - dictionary[key] = [value1, value2, value3, value4, value5, value6] + return dictionary -def extract_error_info(err): +def extract_error_info(_err: str) -> tuple: + """ + Extract error code and error message from the given error string. + :param _err: error string (str) + :return: tuple of error code and error message (int, str). + """ error_code = 0 error_message = "" - _code = re.search(r'error code: (\d+)', err) + _code = re.search(r'error code: (\d+)', _err) if _code: error_code = _code.group(1) - _msg = re.search('details: (.+)', err) + _msg = re.search('details: (.+)', _err) if _msg: error_message = _msg.group(1) error_message = error_message.replace('[PilotException(', '').strip() @@ -303,10 +276,6 @@ def extract_error_info(err): if __name__ == '__main__': # noqa: C901 - """ - Main function of the stage-in script. 
- """ - # get the args from the arg parser args = get_args() args.debug = True @@ -318,8 +287,8 @@ def extract_error_info(err): # get the file info lfns, scopes, ddmendpoints, datasets, guids = get_file_lists(args.lfns, args.scopes, args.ddmendpoints, args.datasets, args.guids) if len(lfns) != len(scopes) or len(lfns) != len(ddmendpoints) or len(lfns) != len(datasets) or len(lfns) != len(guids): - message('file lists not same length: len(lfns)=%d, len(scopes)=%d, len(ddmendpoints)=%d, len(datasets)=%d, len(guids)=%d' % - (len(lfns), len(scopes), len(ddmendpoints), len(datasets), len(guids))) + message(f'file lists not same length: len(lfns)={len(lfns)}, len(scopes)={len(scopes)}, ' + f'len(ddmendpoints)={len(ddmendpoints)}, len(datasets)={len(datasets)}, len(guids)={len(guids)}') # generate the trace report trace_report = TraceReport(pq=os.environ.get('PILOT_SITENAME', ''), localSite=args.localsite, @@ -341,10 +310,10 @@ def extract_error_info(err): activity = 'pw' client = StageOutClient(infoservice, logger=logger, trace_report=trace_report, workdir=args.workdir) - kwargs = dict(workdir=args.workdir, cwd=args.workdir, usecontainer=False, job=job, output_dir=args.outputdir, - catchall=args.catchall, rucio_host=args.rucio_host) # , mode='stage-out') - + kwargs = {"workdir": args.workdir, "cwd": args.workdir, "usecontainer": False, "job": job, + "output_dir": args.outputdir, "catchall": args.catchall, "rucio_host": args.rucio_host} # , "mode"='stage-out'} xfiles = [] + for lfn, scope, dataset, ddmendpoint, guid in list(zip(lfns, scopes, datasets, ddmendpoints, guids)): if 'job.log' in lfn: @@ -380,8 +349,8 @@ def extract_error_info(err): add_to_dictionary(file_dictionary, fspec.lfn, fspec.status, fspec.status_code, fspec.surl, fspec.turl, fspec.checksum.get('adler32'), fspec.filesize) status = fspec.status if fspec.status else "(not transferred)" - message(" -- lfn=%s, status_code=%s, status=%s, surl=%s, turl=%s, checksum=%s, filesize=%s" % - (fspec.lfn, fspec.status_code, status, fspec.surl, fspec.turl, fspec.checksum.get('adler32'), fspec.filesize)) + message(f" -- lfn={fspec.lfn}, status_code={fspec.status_code}, status={status}, surl={fspec.surl}, " + f"turl={fspec.turl}, checksum={fspec.checksum.get('adler32')}, filesize={fspec.filesize}") # add error info, if any if err: @@ -392,9 +361,9 @@ def extract_error_info(err): path += '.log' write_json(path, file_dictionary) if err: - message("containerised file transfers failed: %s" % err) - exit(TRANSFER_ERROR) + message(f"containerised file transfers failed: {err}") + sys.exit(TRANSFER_ERROR) - message("wrote %s" % path) + message(f"wrote {path}") message("containerised file transfers finished") - exit(0) + sys.exit(0) diff --git a/pilot/test/test_analytics.py b/pilot/test/test_analytics.py index 23769ccc..ad0b8fcb 100644 --- a/pilot/test/test_analytics.py +++ b/pilot/test/test_analytics.py @@ -19,6 +19,8 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Unit test functions for the Analytics package.""" + import unittest import os @@ -26,21 +28,14 @@ class TestAnalytics(unittest.TestCase): - """ - Unit tests for the Analytics package. - """ + """Unit tests for the Analytics package.""" def setUp(self): - + """Set up test fixtures.""" self.client = analytics.Analytics() def test_linear_fit(self): - """ - Make sure that a linear fit works. - - :return: (assertion). 
- """ - + """Make sure that a linear fit works.""" self.assertIsInstance(self.client, analytics.Analytics) # python 2.7 x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] @@ -63,12 +58,7 @@ def test_linear_fit(self): self.assertEqual(slope, -1.0) def test_parsing_memory_monitor_data(self): - """ - Read and fit PSS vs Time from memory monitor output file. - - :return: (assertion). - """ - + """Read and fit PSS vs Time from memory monitor output file.""" # old MemoryMonitor format filename = 'pilot/test/resource/memory_monitor_output.txt' self.assertEqual(os.path.exists(filename), True) diff --git a/pilot/test/test_copytools_mv.py b/pilot/test/test_copytools_mv.py index 1a537e17..b085c998 100644 --- a/pilot/test/test_copytools_mv.py +++ b/pilot/test/test_copytools_mv.py @@ -20,35 +20,40 @@ # - Pavlo Svirin, pavlo.svirin@gmail.com, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 -import unittest +"""Unit test functions for the copytool mv.""" + +import os.path +import random +import shutil import string import tempfile -import shutil -import random -import os.path - -from pilot.copytool.mv import copy_in, copy_out -from pilot.common.exception import StageInFailure, StageOutFailure -from pilot.util.container import execute +import unittest -from pilot.control.job import get_fake_job from pilot.info import JobData +from pilot.common.exception import ( + StageInFailure, + StageOutFailure +) +from pilot.control.job import get_fake_job +from pilot.copytool.mv import ( + copy_in, + copy_out +) +from pilot.util.container import execute class TestCopytoolMv(unittest.TestCase): - """ - Unit tests for mv copytool. - """ + """Unit tests for mv copytool.""" #filelist = [] numfiles = 10 maxfilesize = 100 * 1024 def setUp(self): - - """ Create temp source directory """ + """Set up test fixtures.""" + # Create temp source directory self.tmp_src_dir = tempfile.mkdtemp() - """ Create temp destination directory """ + # Create temp destination directory self.tmp_dst_dir = os.path.join(self.tmp_src_dir, 'dest') os.mkdir(self.tmp_dst_dir) @@ -56,7 +61,7 @@ def setUp(self): #self.filelist = [] # need a job data object, but we will overwrite some of its info - res = get_fake_job(input=False) + res = get_fake_job(inpt=False) jdata = JobData(res) infiles = "" @@ -69,11 +74,10 @@ def setUp(self): scope = "" ddmendpointin = "" ddmendpointout = "" - turl = "" - """ Create temp files in source dir """ - for i in range(0, self.numfiles): + # Create temp files in source directory + for _ in range(0, self.numfiles): # generate random name - fname = ''.join(random.choice(string.ascii_lowercase) for x in range(20)) + fname = ''.join(random.choice(string.ascii_lowercase) for _ in range(20)) if infiles == "": infiles = fname else: @@ -112,15 +116,11 @@ def setUp(self): dispatchdblocktokenforout = "NULL" else: dispatchdblocktokenforout += ",NULL" - _data = [random.randint(0, 255) for x in range(0, filesize)] + _data = [random.randint(0, 255) for _ in range(0, filesize)] fname = os.path.join(self.tmp_src_dir, fname) - if turl == "": - turl = fname - else: - turl = "," + fname - new_file = open(fname, "wb") - new_file.write(str(_data).encode('utf-8')) - new_file.close() + with open(fname, "wb") as new_file: + new_file.write(str(_data).encode('utf-8')) + # add to list #self.filelist.append({'name': fname, 'source': self.tmp_src_dir, 'destination': self.tmp_dst_dir}) @@ -138,8 +138,7 @@ def setUp(self): # 'fsize': fsize, 'checksum': checksum, 'scopeIn': scope, # 'ddmEndPointIn': ddmendpointin} data = {'inFiles': infiles, 'realDatasetsIn': 
realdatasetsin, 'GUID': guid, - 'fsize': fsize, 'checksum': checksum, 'scopeIn': scope, - 'ddmEndPointIn': ddmendpointin} + 'fsize': fsize, 'checksum': checksum, 'scopeIn': scope, 'ddmEndPointIn': ddmendpointin} self.indata = jdata.prepare_infiles(data) for _file in self.indata: _file.workdir = self.tmp_dst_dir @@ -159,24 +158,28 @@ def setUp(self): _file.fsize = 'abcdef' def test_copy_in_mv(self): - _, stdout1, stderr1 = execute(' '.join(['ls', self.tmp_src_dir, '|', 'grep', '-v', 'dest'])) + """Make sure that a mv copy works.""" + _, stdout1, _ = execute(' '.join(['ls', self.tmp_src_dir, '|', 'grep', '-v', 'dest'])) copy_in(self.indata, copy_type='mv', workdir=self.tmp_dst_dir) # here check files moved self.assertEqual(self.__dirs_content_valid(self.tmp_src_dir, self.tmp_dst_dir, dir2_expected_content=stdout1), 0) def test_copy_in_cp(self): + """Make sure that a cp copy works.""" copy_in(self.indata, copy_type='cp', workdir=self.tmp_dst_dir) self.assertEqual(self.__dirs_content_equal(self.tmp_src_dir, self.tmp_dst_dir), 0) def test_copy_in_symlink(self): + """Make sure that a symlink copy works.""" copy_in(self.indata, copy_type='symlink', workdir=self.tmp_dst_dir) # here check files linked self.assertEqual(self.__dirs_content_equal(self.tmp_src_dir, self.tmp_dst_dir), 0) # check dst files are links - _, stdout, _ = execute(r'find %s -type l -exec echo -n l \;' % self.tmp_dst_dir) + _, stdout, _ = execute(rf'find {self.tmp_dst_dir} -type l -exec echo -n l \;') self.assertEqual(stdout, ''.join('l' for i in range(self.numfiles))) def test_copy_in_invalid(self): + """Make sure that an invalid copy type fails.""" self.assertRaises(StageInFailure, copy_in, self.indata, **{'copy_type': ''}) self.assertRaises(StageInFailure, copy_in, self.indata, **{'copy_type': None}) @@ -188,39 +191,45 @@ def test_copy_in_invalid(self): # self.assertEqual(self.__dirs_content_valid(self.tmp_src_dir, os.path.join(self.tmp_dst_dir, 'abc/def'), dir2_expected_content=stdout1), 0) def test_copy_out_cp(self): + """Make sure that a cp copy works.""" pass # copy_out(self.outdata, copy_type='cp') # self.assertEqual(self.__dirs_content_equal(self.tmp_src_dir, self.tmp_dst_dir), 0) def test_copy_out_invalid(self): + """Make sure that an invalid copy type fails.""" self.assertRaises(StageOutFailure, copy_out, self.outdata, **{'copy_type': ''}) self.assertRaises(StageOutFailure, copy_out, self.outdata, **{'copy_type': 'symlink'}) self.assertRaises(StageOutFailure, copy_out, self.outdata, **{'copy_type': None}) def tearDown(self): - """ Drop temp directories """ + """Remove temp directories.""" shutil.rmtree(self.tmp_dst_dir) shutil.rmtree(self.tmp_src_dir) def __dirs_content_equal(self, dir1, dir2): + """Compare the content of two directories.""" if dir1 == '' or dir2 == '' or dir1 is None or dir2 is None: return -1 - _, stdout1, stderr1 = execute(' '.join(['ls', dir1, '|', 'grep', '-v', 'dest'])) - _, stdout2, stderr2 = execute(' '.join(['ls', dir2, '|', 'grep', '-v', 'dest'])) + _, stdout1, _ = execute(' '.join(['ls', dir1, '|', 'grep', '-v', 'dest'])) + _, stdout2, _ = execute(' '.join(['ls', dir2, '|', 'grep', '-v', 'dest'])) if stdout1 != stdout2: return -2 + return 0 def __dirs_content_valid(self, dir1, dir2, dir1_expected_content=None, dir2_expected_content=None): + """Compare the content of two directories.""" # currently this fails: need to fix if dir1 == '' or dir2 == '' or dir1 is None or dir2 is None: return -1 - _, stdout1, stderr1 = execute(' '.join(['ls', dir1, '|', 'grep', '-v', 'dest'])) + _, stdout1, _ = 
execute(' '.join(['ls', dir1, '|', 'grep', '-v', 'dest'])) if dir1_expected_content is not None and stdout1 != dir1_expected_content: return -3 - _, stdout2, stderr2 = execute(' '.join(['ls', dir2, '|', 'grep', '-v', 'dest'])) + _, stdout2, _ = execute(' '.join(['ls', dir2, '|', 'grep', '-v', 'dest'])) if dir2_expected_content is not None and stdout2 != dir2_expected_content: return -4 + return 0 diff --git a/pilot/test/test_copytools_rucio.py b/pilot/test/test_copytools_rucio.py index 3671913b..ed782d9d 100644 --- a/pilot/test/test_copytools_rucio.py +++ b/pilot/test/test_copytools_rucio.py @@ -20,20 +20,29 @@ # - Pavlo Svirin pavlo.svirin@gmail.com, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 -import unittest +"""Unit test functions for the copytool rucio.""" + import os +import unittest # from pilot.control.job import get_fake_job # from pilot.info import JobData from pilot.info.filespec import FileSpec from pilot.util.tracereport import TraceReport +try: + from pilot.copytool.rucio import copy_out +except ImportError: + pass -def check_env(): + +def check_env() -> bool: """ - Function to check whether rucio copytool is loaded correctly. + Check whether rucio copytool is loaded correctly. + To be used to decide whether to skip some test functions. - :returns True: if rucio copytool is available. Otherwise False. + + :return: True if rucio copytool is available, otherwise False (bool). """ aval = False return aval @@ -41,27 +50,29 @@ def check_env(): @unittest.skipIf(not check_env(), "No Rucio copytool") class TestCopytoolRucio(unittest.TestCase): - """ - Unit tests for rucio copytool. - """ + """Unit tests for rucio copytool.""" def setUp(self): - test_file = open('test.txt', 'w') - test_file.write('For test purposes only.') - test_file.close() + """Set up test fixtures.""" + with open('test.txt', 'w', encoding='utf-8') as test_file: + test_file.write('For test purposes only.') + fspec_out = FileSpec() fspec_out.lfn = 'test.txt' - fspec_out.scope = 'user.tjavurek' + fspec_out.scope = 'user.pnilsson' fspec_out.checksum = {'adler32': '682c08b9'} fspec_out.pfn = os.getcwd() + '/' + 'test.txt' fspec_out.ddmendpoint = 'UNI-FREIBURG_SCRATCHDISK' self.outdata = [fspec_out] def test_copy_out_rucio(self): - from pilot.copytool.rucio import copy_out + """Test copy_out function.""" trace_report = TraceReport() trace_report.update(eventType='unit test') - copy_out(self.outdata, trace_report=trace_report) + try: + copy_out(self.outdata, trace_report=trace_report) + except NameError: + pass os.remove(self.outdata[0].pfn) diff --git a/pilot/test/test_escommunicator.py b/pilot/test/test_escommunicator.py index 2181e0ee..b483914e 100644 --- a/pilot/test/test_escommunicator.py +++ b/pilot/test/test_escommunicator.py @@ -19,48 +19,42 @@ # - Wen Guan, wen.guan@cern.ch, 2018 # - Paul Nilsson, paul.nilsson@cern.ch , 2023 +"""Unit tests for the ES communication module.""" + import json import logging import os import socket import sys import time +import unittest from pilot.eventservice.communicationmanager.communicationmanager import CommunicationRequest, CommunicationResponse, CommunicationManager from pilot.util.https import https_setup from pilot.util.timing import time_stamp -if sys.version_info < (2, 7): - import unittest2 as unittest -else: - import unittest - logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) logger = logging.getLogger(__name__) - https_setup(None, None) -def check_env(): +def check_env() -> bool: """ - Function to check whether cvmfs is available.
+ Check whether cvmfs is available. + To be used to decide whether to skip some test functions. - :returns True: if cvmfs is available. Otherwise False. + :return: True if cvmfs is available, otherwise False (bool). """ - return os.path.exists('/cvmfs/atlas.cern.ch/repo/') + return os.path.exists('/cvmfs/') class TestESCommunicationRequestResponse(unittest.TestCase): - """ - Unit tests for event service communicator Request and Response. - """ + """Unit tests for event service communicator Request and Response.""" def test_communicator_request(self): - """ - Make sure that es message thread works as expected. - """ + """Make sure that es message thread works as expected.""" req_attrs = {'request_type': CommunicationRequest.RequestType.RequestJobs, 'num_jobs': 1, 'post_hook': None, 'response': None} req_job = CommunicationRequest(req_attrs) @@ -82,15 +76,11 @@ def test_communicator_request(self): class TestESCommunicationManagerPanda(unittest.TestCase): - """ - Unit tests for event service communicator manager. - """ + """Unit tests for event service communicator manager.""" @unittest.skipIf(not check_env(), "No CVMFS") def test_communicator_manager(self): - """ - Make sure that es communicator manager thread works as expected. - """ + """Make sure that es communicator manager thread works as expected.""" communicator_manager = None try: args = {'workflow': 'eventservice_hpc', @@ -154,22 +144,22 @@ def test_communicator_manager(self): for event in events: event_range = {"eventRangeID": event['eventRangeID'], "eventStatus": 'finished'} update_events.append(event_range) - event_range_status = [{"zipFile": {"numEvents": len(update_events), - "objstoreID": 1318, - "adler32": '000000', - "lfn": 'test_file', - "fsize": 100, - "pathConvention": 1000}, - "eventRanges": update_events}] - - event_range_message = {'version': 1, 'eventRanges': json.dumps(event_range_status)} + event_range_status_list = [{"zipFile": {"numEvents": len(update_events), + "objstoreID": 1318, + "adler32": '000000', + "lfn": 'test_file', + "fsize": 100, + "pathConvention": 1000}, + "eventRanges": update_events}] + + event_range_message = {'version': 1, 'eventRanges': json.dumps(event_range_status_list)} res = communicator_manager.update_events(update_events=event_range_message) self.assertEqual(res['StatusCode'], 0) communicator_manager.stop() time.sleep(2) self.assertFalse(communicator_manager.is_alive()) - except Exception as ex: + except Exception as exc: if communicator_manager: communicator_manager.stop() - raise ex + raise exc diff --git a/pilot/test/test_esprocess.py b/pilot/test/test_esprocess.py index b96623b2..f042d376 100644 --- a/pilot/test/test_esprocess.py +++ b/pilot/test/test_esprocess.py @@ -19,83 +19,76 @@ # - Wen Guan, wen.guan@cern.ch, 2017-18 # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +"""Unit tests for the esprocess package.""" + import json import logging import os +import queue import subprocess import sys import threading import time - -try: - import Queue as queue # noqa: N813 -except Exception: - import queue # Python 3 +import unittest from pilot.eventservice.esprocess.eshook import ESHook from pilot.eventservice.esprocess.esmanager import ESManager from pilot.eventservice.esprocess.esmessage import MessageThread from pilot.eventservice.esprocess.esprocess import ESProcess -if sys.version_info < (2, 7): - import unittest2 as unittest -else: - import unittest - logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) -def check_env(): +def check_env() -> bool: """ - Function to check whether 
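The event-range update message assembled in test_communicator_manager() above is a JSON-encoded list of zip-file records; a minimal sketch with illustrative values:

    import json

    update_events = [{"eventRangeID": "1-2-3-4-5", "eventStatus": "finished"}]
    event_range_status = [{"zipFile": {"numEvents": len(update_events),
                                       "objstoreID": 1318,  # illustrative object store id
                                       "adler32": "000000",
                                       "lfn": "test_file",
                                       "fsize": 100,
                                       "pathConvention": 1000},
                           "eventRanges": update_events}]
    event_range_message = {"version": 1, "eventRanges": json.dumps(event_range_status)}
    print(event_range_message)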
cvmfs is available. + Check whether cvmfs is available. + To be used to decide whether to skip some test functions. - :returns True: if cvmfs is available. Otherwise False. + :return: True if cvmfs is available, otherwise False (bool). """ - return os.path.exists('/cvmfs/atlas.cern.ch/repo/') + return os.path.exists('/cvmfs/') class TestESHook(ESHook): - """ - A class implemented ESHook, to be used to test eventservice. - """ + """A class implemented ESHook, to be used to test eventservice.""" def __init__(self): """ - Init the hook class for tests: Read payload and event ranges from a file. - Download evgen files which are needed to run payload. + Initialize the hook class for tests. + + Read payload and event ranges from a file. + Download evgen files which are needed to run payload. """ - with open('pilot/test/resource/eventservice_job.txt') as job_file: + with open('pilot/test/resource/eventservice_job.txt', 'r', encoding='utf-8') as job_file: job = json.load(job_file) self.__payload = job['payload'] self.__event_ranges = job['event_ranges'] # doesn't exit if check_env(): - process = subprocess.Popen('pilot/test/resource/download_test_es_evgen.sh', shell=True, stdout=subprocess.PIPE) - process.wait() - if process.returncode != 0: - raise Exception('failed to download input files for es test: %s %s' % (process.communicate())) + with subprocess.Popen('pilot/test/resource/download_test_es_evgen.sh', shell=True, stdout=subprocess.PIPE) as process: + process.wait() + if process.returncode != 0: + raise Exception(f'failed to download input files for es test: {process.communicate()}') else: - logging.info("No CVMFS. skip downloading files.") + logging.info("no CVMFS. skip downloading files.") self.__injected_event_ranges = [] self.__outputs = [] - def get_payload(self): + def get_payload(self) -> dict: """ Get payload hook function for tests. :returns: dict {'executable': , 'output_file': , 'error_file': } """ - return self.__payload - def get_event_ranges(self, num_ranges=1): + def get_event_ranges(self, num_ranges=1) -> list: """ Get event ranges hook function for tests. - :returns: dict of event ranges. - None if no available events. + :returns: list of event ranges (list). """ ret = [] for _ in range(num_ranges): @@ -103,13 +96,14 @@ def get_event_ranges(self, num_ranges=1): event_range = self.__event_ranges.pop(0) ret.append(event_range) self.__injected_event_ranges.append(event_range) + return ret - def handle_out_message(self, message): + def handle_out_message(self, message: dict): """ Handle ES output or error messages hook function for tests. - :param message: a dict of parsed message. + :param message: a dict of parsed message (dict). For 'finished' event ranges, it's {'id': , 'status': 'finished', 'output': , 'cpu': , 'wall': , 'message': }. Fro 'failed' event ranges, it's {'id': , 'status': 'failed', 'message': }. @@ -118,15 +112,15 @@ def handle_out_message(self, message): print(message) self.__outputs.append(message) - def get_injected_event_ranges(self): + def get_injected_event_ranges(self) -> list: """ Get event ranges injected to payload for test assertion. - :returns: List of injected event ranges. + :returns: List of injected event ranges (list). """ return self.__injected_event_ranges - def get_outputs(self): + def get_outputs(self) -> list: """ Get outputs for test assertion. 
@@ -142,10 +136,8 @@ class TestESMessageThread(unittest.TestCase): @unittest.skipIf(not check_env(), "No CVMFS") def test_msg_thread(self): - """ - Make sure that es message thread works as expected. - """ - _queue = queue.Queue() # Python 2/3 + """Make sure that the es message thread works as expected.""" + _queue = queue.Queue() msg_thread = MessageThread(_queue, socket_name='test', context='local') self.assertIsInstance(msg_thread, threading.Thread) @@ -162,36 +154,26 @@ def test_msg_thread(self): @unittest.skipIf(not check_env(), "No CVMFS") class TestESProcess(unittest.TestCase): - """ - Unit tests for event service process functions - """ + """Unit tests for event service process functions.""" @classmethod def setUpClass(cls): + """Set up test fixtures.""" cls._test_hook = TestESHook() cls._esProcess = ESProcess(cls._test_hook.get_payload()) def test_set_get_event_ranges_hook(self): - """ - Make sure that no exceptions to set get_event_ranges hook. - """ - + """Make sure that no exceptions to set get_event_ranges hook.""" self._esProcess.set_get_event_ranges_hook(self._test_hook.get_event_ranges) self.assertEqual(self._test_hook.get_event_ranges, self._esProcess.get_get_event_ranges_hook()) def test_set_handle_out_message_hook(self): - """ - Make sure that no exceptions to set handle_out_message hook. - """ - + """Make sure that no exceptions to set handle_out_message hook.""" self._esProcess.set_handle_out_message_hook(self._test_hook.handle_out_message) self.assertEqual(self._test_hook.handle_out_message, self._esProcess.get_handle_out_message_hook()) def test_parse_out_message(self): - """ - Make sure to parse messages from payload correctly. - """ - + """Make sure to parse messages from payload correctly.""" output_msg = '/tmp/HITS.12164365._000300.pool.root.1.12164365-3616045203-10980024041-4138-8,ID:12164365-3616045203-10980024041-4138-8,CPU:288,WALL:303' ret = self._esProcess.parse_out_message(output_msg) self.assertEqual(ret['status'], 'finished') @@ -209,24 +191,18 @@ def test_parse_out_message(self): class TestEventService(unittest.TestCase): - """ - Unit tests for event service functions. - """ + """Unit tests for event service functions.""" @unittest.skipIf(not check_env(), "No CVMFS") def test_init_esmanager(self): - """ - Make sure that no exceptions to init ESManager - """ + """Make sure that no exceptions to init ESManager.""" test_hook = TestESHook() es_manager = ESManager(test_hook) self.assertIsInstance(es_manager, ESManager) @unittest.skipIf(not check_env(), "No CVMFS") def test_run_es(self): - """ - Make sure that ES produced all events that injected. 
- """ + """Make sure that ES produced all events that injected.""" test_hook = TestESHook() es_manager = ESManager(test_hook) es_manager.run() diff --git a/pilot/test/test_esstager.py b/pilot/test/test_esstager.py index 649bfe23..bc2bd32f 100644 --- a/pilot/test/test_esstager.py +++ b/pilot/test/test_esstager.py @@ -19,54 +19,58 @@ # - Wen Guan, wen.guan@cern.ch, 2017-18 # - Paul Nilsson, paul.nilsson@cern.ch, 2023 +"""Unit tests for the esstager package.""" + import logging import os import shutil import sys import traceback +import unittest import uuid -from pilot.api.es_data import StageOutESClient, StageInESClient +from pilot.api.es_data import ( + StageOutESClient, + StageInESClient +) from pilot.common import exception +from pilot.info import ( + infosys, + InfoService +) from pilot.info.filespec import FileSpec from pilot.util.https import https_setup - -if sys.version_info < (2, 7): - import unittest2 as unittest -else: - import unittest - logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) logger = logging.getLogger(__name__) https_setup(None, None) -def check_env(): +def check_env() -> bool: """ - Function to check whether cvmfs is available. + Check whether cvmfs is available. + To be used to decide whether to skip some test functions. - :returns True: if cvmfs is available. Otherwise False. + :returns: True if cvmfs is available, otherwise False (bool). """ return os.path.exists('/cvmfs/atlas.cern.ch/repo/') @unittest.skipIf(not check_env(), "No CVMFS") class TestStager(unittest.TestCase): - """ - Unit tests for event service Grid work executor - """ + """Unit tests for event service Grid work executor.""" @unittest.skipIf(not check_env(), "No CVMFS") def test_stageout_es_events(self): """ - Make sure that no exceptions to stage out file. + Make sure there are no exceptions to stage out file. + + :raises: StageOutFailure in case of failure. 
""" error = None try: - from pilot.info import infosys, InfoService infoservice = InfoService() infoservice.init('BNL_CLOUD_MCORE', infosys.confinfo, infosys.extinfo) @@ -85,31 +89,32 @@ def test_stageout_es_events(self): xdata = [file_spec] workdir = os.path.dirname(output_file) client = StageOutESClient(infoservice) - kwargs = dict(workdir=workdir, cwd=workdir, usecontainer=False) + kwargs = {'workdir': workdir, 'cwd': workdir, 'usecontainer': False} client.prepare_destinations(xdata, activity='es_events') client.transfer(xdata, activity='es_events', **kwargs) - except exception.PilotException as error: # Python 2/3 - logger.error("Pilot Exception: %s, %s" % (error.get_detail(), traceback.format_exc())) - except Exception as e: # Python 2/3 + except exception.PilotException as error: + logger.error(f"Pilot Exception: {error.get_detail()}, {traceback.format_exc()}") + except Exception as exc: logger.error(traceback.format_exc()) - error = exception.StageOutFailure("stageOut failed with error=%s" % e) + error = exception.StageOutFailure(f"stageOut failed with error={exc}") else: logger.info('Summary of transferred files:') for e in xdata: - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, e.status)) + logger.info(f" -- lfn={e.lfn}, status_code={e.status_code}, status={e.status}") if error: - logger.error('Failed to stage-out eventservice file(%s): error=%s' % (output_file, error.get_detail())) + logger.error(f'Failed to stage-out eventservice file({output_file}): error={error.get_detail()}') raise error @unittest.skipIf(not check_env(), "No CVMFS") def test_stageout_es_events_pw(self): """ - Make sure that no exceptions to stage out file. + Make sure there are no exceptions to stage out file. + + :raises: StageOutFailure in case of failure. 
""" error = None try: - from pilot.info import infosys, InfoService infoservice = InfoService() infoservice.init('BNL_CLOUD_MCORE', infosys.confinfo, infosys.extinfo) @@ -128,31 +133,32 @@ def test_stageout_es_events_pw(self): xdata = [file_spec] workdir = os.path.dirname(output_file) client = StageOutESClient(infoservice) - kwargs = dict(workdir=workdir, cwd=workdir, usecontainer=False) + kwargs = {'workdir': workdir, 'cwd': workdir, 'usecontainer': False} client.prepare_destinations(xdata, activity=['es_events', 'pw']) # allow to write to `es_events` and `pw` astorages client.transfer(xdata, activity=['es_events', 'pw'], **kwargs) - except exception.PilotException as error: # Python 2/3 - logger.error("Pilot Exeception: %s, %s" % (error.get_detail(), traceback.format_exc())) - except Exception as e: # Python 2/3 + except exception.PilotException as error: + logger.error(f"Pilot Exeception: {error.get_detail()}, {traceback.format_exc()}") + except Exception as exc: logger.error(traceback.format_exc()) - error = exception.StageOutFailure("stageOut failed with error=%s" % e) + error = exception.StageOutFailure(f"stageOut failed with error={exc}") else: logger.info('Summary of transferred files:') - for e in xdata: - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, e.status)) + for fil in xdata: + logger.info(f" -- lfn={fil.lfn}, status_code={fil.status_code}, status={fil.status}") if error: - logger.error('Failed to stage-out eventservice file(%s): error=%s' % (output_file, error.get_detail())) + logger.error(f'Failed to stage-out eventservice file({output_file}): error={error.get_detail()}') raise error @unittest.skipIf(not check_env(), "No CVMFS") def test_stageout_es_events_non_exist_pw(self): """ - Make sure that no exceptions to stage out file. + Make sure there are no exceptions to stage out file. + + :raises: StageOutFailure in case of failure. 
""" error = None try: - from pilot.info import infosys, InfoService infoservice = InfoService() infoservice.init('BNL_CLOUD_MCORE', infosys.confinfo, infosys.extinfo) @@ -171,31 +177,32 @@ def test_stageout_es_events_non_exist_pw(self): xdata = [file_spec] workdir = os.path.dirname(output_file) client = StageOutESClient(infoservice) - kwargs = dict(workdir=workdir, cwd=workdir, usecontainer=False) + kwargs = {'workdir': workdir, 'cwd': workdir, 'usecontainer': False} client.prepare_destinations(xdata, activity=['es_events_non_exist', 'pw']) # allow to write to `es_events_non_exist` and `pw` astorages client.transfer(xdata, activity=['es_events_non_exist', 'pw'], **kwargs) - except exception.PilotException as error: # Python 2/3 - logger.error("Pilot Exeception: %s, %s" % (error.get_detail(), traceback.format_exc())) - except Exception as e: # Python 2/3 + except exception.PilotException as error: + logger.error(f"Pilot Exeception: {error.get_detail()}, {traceback.format_exc()}") + except Exception as exc: logger.error(traceback.format_exc()) - error = exception.StageOutFailure("stageOut failed with error=%s" % e) + error = exception.StageOutFailure(f"stageOut failed with error={exc}") else: logger.info('Summary of transferred files:') - for e in xdata: - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, e.status)) + for fil in xdata: + logger.info(f" -- lfn={fil.lfn}, status_code={fil.status_code}, status={fil.status}") if error: - logger.error('Failed to stage-out eventservice file(%s): error=%s' % (output_file, error.get_detail())) + logger.error(f'Failed to stage-out eventservice file({output_file}): error={error.get_detail()}') raise error @unittest.skipIf(not check_env(), "No CVMFS") def test_stageout_stagein(self): """ - Make sure that no exceptions to stage out file. + Make sure there are no exceptions to stage out file. + + :raises: StageOutFailure in case of failure. 
""" error = None try: - from pilot.info import infosys, InfoService infoservice = InfoService() infoservice.init('BNL_CLOUD_MCORE', infosys.confinfo, infosys.extinfo) @@ -214,60 +221,61 @@ def test_stageout_stagein(self): xdata = [file_spec] workdir = os.path.dirname(output_file) client = StageOutESClient(infoservice) - kwargs = dict(workdir=workdir, cwd=workdir, usecontainer=False) + kwargs = {'workdir': workdir, 'cwd': workdir, 'usecontainer': False} client.prepare_destinations(xdata, activity=['es_events', 'pw']) # allow to write to `es_events` and `pw` astorages client.transfer(xdata, activity=['es_events', 'pw'], **kwargs) - except exception.PilotException as error: # Python 2/3 - logger.error("Pilot Exeception: %s, %s" % (error.get_detail(), traceback.format_exc())) - except Exception as e: # Python 2/3 + except exception.PilotException as error: + logger.error(f"Pilot Exeception: {error.get_detail()}, {traceback.format_exc()}") + except Exception as exc: logger.error(traceback.format_exc()) - error = exception.StageOutFailure("stageOut failed with error=%s" % e) + error = exception.StageOutFailure(f"stageOut failed with error={exc}") else: logger.info('Summary of transferred files:') - for e in xdata: - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, e.status)) + for fil in xdata: + logger.info(f" -- lfn={fil.lfn}, status_code={fil.status_code}, status={fil.status}") if error: - logger.error('Failed to stage-out eventservice file(%s): error=%s' % (output_file, error.get_detail())) + logger.error(f'Failed to stage-out eventservice file({output_file}): error={error.get_detail()}') raise error storage_id = infosys.get_storage_id(file_spec.ddmendpoint) - logger.info('File %s staged out to %s(id: %s)' % (file_spec.lfn, file_spec.ddmendpoint, storage_id)) + logger.info(f'File {file_spec.lfn} staged out to {file_spec.ddmendpoint}(id: {storage_id})') new_file_data = {'scope': 'test', 'lfn': file_spec.lfn, - 'storage_token': '%s/1000' % storage_id} + 'storage_token': f'{storage_id}/1000'} try: new_file_spec = FileSpec(filetype='input', **new_file_data) xdata = [new_file_spec] workdir = os.path.dirname(output_file) client = StageInESClient(infoservice) - kwargs = dict(workdir=workdir, cwd=workdir, usecontainer=False) + kwargs = {'workdir': workdir, 'cwd': workdir, 'usecontainer': False} client.prepare_sources(xdata) client.transfer(xdata, activity=['es_events_read'], **kwargs) - except exception.PilotException as error: # Python 2/3 - logger.error("Pilot Exeception: %s, %s" % (error.get_detail(), traceback.format_exc())) - except Exception as e: # Python 2/3 + except exception.PilotException as error: + logger.error(f"Pilot Exeception: {error.get_detail()}, {traceback.format_exc()}") + except Exception as exc: logger.error(traceback.format_exc()) - error = exception.StageInFailure("stagein failed with error=%s" % e) + error = exception.StageInFailure(f"stagein failed with error={exc}") else: logger.info('Summary of transferred files:') - for e in xdata: - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, e.status)) + for fil in xdata: + logger.info(f" -- lfn={fil.lfn}, status_code={fil.status_code}, status={fil.status}") if error: - logger.error('Failed to stage-in eventservice file(%s): error=%s' % (output_file, error.get_detail())) + logger.error(f'Failed to stage-in eventservice file({output_file}): error={error.get_detail()}') raise error @unittest.skipIf(not check_env(), "No CVMFS") def test_stageout_noexist_activity_stagein(self): 
""" - Make sure that no exceptions to stage out file. + Make sure there are no exceptions to stage out file. + + :raises: StageOutFailure in case of failure. """ error = None try: - from pilot.info import infosys, InfoService infoservice = InfoService() infoservice.init('BNL_CLOUD_MCORE', infosys.confinfo, infosys.extinfo) @@ -286,48 +294,48 @@ def test_stageout_noexist_activity_stagein(self): xdata = [file_spec] workdir = os.path.dirname(output_file) client = StageOutESClient(infoservice) - kwargs = dict(workdir=workdir, cwd=workdir, usecontainer=False) + kwargs = {'workdir': workdir, 'cwd': workdir, 'usecontainer': False} client.prepare_destinations(xdata, activity=['es_events_no_exist', 'pw']) # allow to write to `es_events_no_exist` and `pw` astorages client.transfer(xdata, activity=['es_events_no_exist', 'pw'], **kwargs) - except exception.PilotException as error: # Python 2/3 - logger.error("Pilot Exeception: %s, %s" % (error.get_detail(), traceback.format_exc())) - except Exception as e: # Python 2/3 + except exception.PilotException as error: + logger.error(f"Pilot Exeception: {error.get_detail()}, {traceback.format_exc()}") + except Exception as exc: logger.error(traceback.format_exc()) - error = exception.StageOutFailure("stageOut failed with error=%s" % e) + error = exception.StageOutFailure(f"stageOut failed with error={exc}") else: logger.info('Summary of transferred files:') - for e in xdata: - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, e.status)) + for fil in xdata: + logger.info(f" -- lfn={fil.lfn}, status_code={fil.status_code}, status={fil.status}") if error: - logger.error('Failed to stage-out eventservice file(%s): error=%s' % (output_file, error.get_detail())) + logger.error(f'Failed to stage-out eventservice file({output_file}): error={error.get_detail()}') raise error storage_id = infosys.get_storage_id(file_spec.ddmendpoint) - logger.info('File %s staged out to %s(id: %s)' % (file_spec.lfn, file_spec.ddmendpoint, storage_id)) + logger.info(f'File {file_spec.lfn} staged out to {file_spec.ddmendpoint}(id: {storage_id})') new_file_data = {'scope': 'test', 'lfn': file_spec.lfn, - 'storage_token': '%s/1000' % storage_id} + 'storage_token': f'{storage_id}/1000'} try: new_file_spec = FileSpec(filetype='input', **new_file_data) xdata = [new_file_spec] workdir = os.path.dirname(output_file) client = StageInESClient(infoservice) - kwargs = dict(workdir=workdir, cwd=workdir, usecontainer=False) + kwargs = {'workdir': workdir, 'cwd': workdir, 'usecontainer': False} client.prepare_sources(xdata) client.transfer(xdata, activity=['es_events_read'], **kwargs) - except exception.PilotException as error: # Python 2/3 - logger.error("Pilot Exeception: %s, %s" % (error.get_detail(), traceback.format_exc())) - except Exception as e: # Python 2/3 + except exception.PilotException as error: + logger.error(f"Pilot Exeception: {error.get_detail()}, {traceback.format_exc()}") + except Exception as exc: logger.error(traceback.format_exc()) - error = exception.StageInFailure("stagein failed with error=%s" % e) + error = exception.StageInFailure(f"stagein failed with error={exc}") else: logger.info('Summary of transferred files:') - for e in xdata: - logger.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, e.status)) + for fil in xdata: + logger.info(f" -- lfn={fil.lfn}, status_code={fil.status_code}, status={fil.status}") if error: - logger.error('Failed to stage-in eventservice file(%s): error=%s' % (output_file, error.get_detail())) + 
logger.error('Failed to stage-in eventservice file(%s): error=%s' % (output_file, error.get_detail()))
+            logger.error(f'Failed to stage-in eventservice file({output_file}): error={error.get_detail()}')
            raise error
diff --git a/pilot/test/test_esworkexecutor.py b/pilot/test/test_esworkexecutor.py
index 30e7977b..c88d6a0a 100644
--- a/pilot/test/test_esworkexecutor.py
+++ b/pilot/test/test_esworkexecutor.py
@@ -19,48 +19,51 @@
# - Wen Guan, wen.guan@cern.ch, 2017-18
# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23

+"""Unit tests for the event service work executor."""
+
import logging
+import json
import os
import sys
import socket
import time
import traceback
+import unittest

from pilot.api.es_data import StageInESClient
+from pilot.control.job import create_job
from pilot.eventservice.communicationmanager.communicationmanager import CommunicationManager
from pilot.eventservice.workexecutor.workexecutor import WorkExecutor
-from pilot.control.job import create_job
from pilot.util.https import https_setup

-if sys.version_info < (2, 7):
-    import unittest2 as unittest
-else:
-    import unittest
-
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
logger = logging.getLogger(__name__)

https_setup(None, None)


-def check_env():
+def check_env() -> bool:
    """
-    Function to check whether cvmfs is available.
+    Check whether cvmfs is available.
+
+    To be used to decide whether to skip some test functions.

-    :returns True: if cvmfs is available. Otherwise False.
+    :returns: True if cvmfs is available, otherwise False (bool).
    """
    return os.path.exists('/cvmfs/atlas.cern.ch/repo/')


@unittest.skipIf(not check_env(), "No CVMFS")
class TestESWorkExecutorGrid(unittest.TestCase):
-    """
-    Unit tests for event service Grid work executor
-    """
+    """Unit tests for event service Grid work executor."""

    @classmethod
    def setUpClass(cls):
+        """
+        Set up test fixtures.
+
+        :raises Exception: in case of failure.
+ """ try: args = {'workflow': 'eventservice_hpc', 'queue': 'BNL_CLOUD_MCORE', @@ -81,7 +84,7 @@ def setUpClass(cls): communicator_manager.start() jobs = communicator_manager.get_jobs(njobs=1, args=args) - job = create_job(jobs[0], 'BNL_CLOUD_MCORE') + job = create_job(jobs[0], queuename='BNL_CLOUD_MCORE') job.workdir = '/tmp/test_esworkexecutor' job.corecount = 1 if not os.path.exists(job.workdir): @@ -95,42 +98,45 @@ def setUpClass(cls): job_data['node'] = 'pilot3_test' job_data['schedulerID'] = 'pilot3_test' job_data['coreCount'] = 1 - status = communicator_manager.update_jobs(jobs=[job_data]) + _ = communicator_manager.update_jobs(jobs=[job_data]) job_data['state'] = 'running' - status = communicator_manager.update_jobs(jobs=[job_data]) + _ = communicator_manager.update_jobs(jobs=[job_data]) communicator_manager.stop() # download input files client = StageInESClient(job.infosys, logger=logger) - kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, job=job) + kwargs = {'workdir': job.workdir, 'cwd': job.workdir, 'usecontainer': False, 'job': job} client.prepare_sources(job.indata) client.transfer(job.indata, activity='pr', **kwargs) # get the payload command from the user specific code pilot_user = os.environ.get('PILOT_USER', 'atlas').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 + user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) cmd = user.get_payload_command(job) - logger.info("payload execution command: %s" % cmd) + logger.info(f"payload execution command: {cmd}") payload = {'executable': cmd, 'workdir': job.workdir, - 'output_file': 'pilot_test_%s_stdout.txt' % job['PandaID'], - 'error_file': 'pilot_test_%s_stderr.txt' % job['PandaID'], + 'output_file': f"pilot_test_{job['PandaID']}_stdout.txt", + 'error_file': f"pilot_test_{job['PandaID']}_stderr.txt", 'job': job} cls._payload = payload - except Exception as ex: + except Exception as exc: if cls._communicator_manager: cls._communicator_manager.stop() - raise ex + raise exc @classmethod def tearDownClass(cls): + """Remove test fixtures.""" cls._communicator_manager.stop() def setup(self): + """Set up test fixtures.""" self.executor = None def tearDown(self): + """Remove test fixtures.""" if self._communicator_manager: self._communicator_manager.stop() if self.executor: @@ -138,9 +144,10 @@ def tearDown(self): def test_workexecutor_generic(self): """ - Make sure that no exceptions to run work executor. - """ + Make sure there are no exceptions when running work executor. + :raises Exception: in case of failure. + """ try: executor = WorkExecutor() self.executor = executor @@ -161,20 +168,21 @@ def test_workexecutor_generic(self): time.sleep(0.1) exit_code = executor.get_exit_code() self.assertEqual(exit_code, 0) - except Exception as ex: - logger.debug("Exception: %s, %s" % (ex, traceback.format_exc())) + except Exception as exc: + logger.debug(f"Exception: {exc}, {traceback.format_exc()}") if self.executor: self.executor.stop() while self.executor.is_alive(): time.sleep(0.1) - raise ex + raise exc @unittest.skipIf(True, "skip it") def test_workexecutor_update_events(self): """ - Make sure that no exceptions to run work executor. - """ + Make sure there are no exceptions when running work executor. + :raises Exception: in case of failure. 
+        """
        try:
            executor = WorkExecutor()
            self.executor = executor
@@ -194,13 +202,12 @@ def test_workexecutor_update_events(self):
                              "fsize": 100,
                              "pathConvention": 1000},
                  "eventRanges": update_events}]
-            import json
            event_range_message = {'version': 1, 'eventRanges': json.dumps(event_range_status)}
            ret = executor.update_events(event_range_message)
            logger.debug(ret)
            executor.stop()
-        except Exception as ex:
+        except Exception as exc:
            if self.executor:
                self.executor.stop()
-            raise ex
+            raise exc
diff --git a/pilot/test/test_exception.py b/pilot/test/test_exception.py
index e2468900..e7964815 100644
--- a/pilot/test/test_exception.py
+++ b/pilot/test/test_exception.py
@@ -20,46 +20,32 @@
# - Wen Guan, wen.guan@cern.ch, 2017
# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23

+"""Unit tests for pilot exceptions."""

import logging
import sys
+import unittest

from pilot.common.exception import RunPayloadFailure, PilotException

-if sys.version_info < (2, 7):
-    import unittest2 as unittest
-else:
-    import unittest
-
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)


class TestException(unittest.TestCase):
-    """
-    Unit tests for exceptions.
-    """
+    """Unit tests for exceptions."""

    def test_run_payload_failure(self):
-        """
-        Make sure that es message thread works as expected.
-        """
-
+        """Make sure that the RunPayloadFailure exception works as expected."""
        try:
-            pass
            raise RunPayloadFailure(a='message a', b='message b')
-        except PilotException as ex:
-            self.assertIsInstance(ex, PilotException)
-            self.assertEqual(ex.get_error_code(), 1305)
-            logging.info("\nException: error code: %s\n\nMain message: %s\n\nFullStack: %s" % (ex.get_error_code(),
-                                                                                               str(ex),
-                                                                                               ex.get_detail()))
+        except PilotException as exc:
+            self.assertIsInstance(exc, PilotException)
+            self.assertEqual(exc.get_error_code(), 1305)
+            logging.info(f"\nException: error code: {exc.get_error_code()}\n\nMain message: {exc}\n\nFullStack: {exc.get_detail()}")

        try:
-            pass
            raise RunPayloadFailure("Test message")
-        except PilotException as ex:
-            self.assertIsInstance(ex, PilotException)
-            self.assertEqual(ex.get_error_code(), 1305)
-            logging.info("\nException: error code: %s\n\nMain message: %s\n\nFullStack: %s" % (ex.get_error_code(),
-                                                                                               str(ex),
-                                                                                               ex.get_detail()))
+        except PilotException as exc:
+            self.assertIsInstance(exc, PilotException)
+            self.assertEqual(exc.get_error_code(), 1305)
+            logging.info(f"\nException: error code: {exc.get_error_code()}\n\nMain message: {exc}\n\nFullStack: {exc.get_detail()}")
diff --git a/pilot/test/test_harvester.py b/pilot/test/test_harvester.py
index 0043f06b..3d6017da 100644
--- a/pilot/test/test_harvester.py
+++ b/pilot/test/test_harvester.py
@@ -20,6 +20,8 @@
# - Tobias Wegner, tobias.wegner@cern.ch, 2017
# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23

+"""Unit tests for Harvester functions."""
+
import os
import random
import shutil
@@ -30,14 +32,15 @@

from pilot.api import data


-def check_env():
+def check_env() -> bool:
    """
-    Function to check whether cvmfs is available.
+    Check whether cvmfs is available.
+
+    To be used to decide whether to skip some test functions.

-    :returns True: if unit test should run (currently broken)
+    :returns: True if cvmfs is available, otherwise False (bool).
""" - return False + return os.path.exists('/cvmfs/atlas.cern.ch/repo/') @unittest.skipIf(not check_env(), "This unit test is broken") @@ -69,8 +72,7 @@ class TestHarvesterStageIn(unittest.TestCase): """ def setUp(self): - # skip tests if running through Travis -- github does not have working rucio - self.travis = os.environ.get('TRAVIS') == 'true' + """Set up test fixtures.""" # setup pilot data client @@ -85,12 +87,7 @@ def setUp(self): self.data_client = data.StageInClient(acopytools='rucio') ## use rucio everywhere def test_stagein_sync_fail_nodirectory(self): - ''' - Test error message propagation. - ''' - if self.travis: - return True - + """Test error message propagation.""" result = self.data_client.transfer(files=[{'scope': 'does_not_matter', 'name': 'does_not_matter', 'destination': '/i_do_not_exist'}, @@ -106,12 +103,7 @@ def test_stagein_sync_fail_nodirectory(self): # 'Destination directory does not exist: /neither_do_i']) def test_stagein_sync_fail_noexist(self): - ''' - Test error message propagation. - ''' - if self.travis: - return True - + """Test error message propagation.""" result = self.data_client.transfer(files=[{'scope': 'no_scope1', 'name': 'no_name1', 'destination': '/tmp'}, @@ -127,12 +119,7 @@ def test_stagein_sync_fail_noexist(self): # 'Data identifier \'no_scope2:no_name2\' not found']) def test_stagein_sync_fail_mix(self): - ''' - Test error message propagation - ''' - if self.travis: - return True - + """Test error message propagation.""" ## if infosys was not passed to StageInClient in constructor ## then it's mandatory to specify allowed `inputddms` that can be used as source for replica lookup tmp_dir1, tmp_dir2 = tempfile.mkdtemp(), tempfile.mkdtemp() @@ -157,7 +144,7 @@ def test_stagein_sync_fail_mix(self): self.assertIsNotNone(result) for _file in result: - if _file['name'] in ['no_name1', 'no_name2']: + if _file['name'] in {'no_name1', 'no_name2'}: self.assertEqual(_file['errno'], 3) self.assertEqual(_file['status'], 'failed') #self.assertIn(_file['errmsg'], ['Data identifier \'no_scope1:no_name1\' not found', @@ -167,12 +154,7 @@ def test_stagein_sync_fail_mix(self): self.assertEqual(_file['status'], 'done') def test_stagein_sync_simple(self): - ''' - Single file going to a destination directory. - ''' - if self.travis: - return True - + """Test single file going to a destination directory.""" result = self.data_client.transfer(files=[{'scope': 'mc15_13TeV', 'name': 'HITS.06828093._000096.pool.root.1', 'destination': '/tmp'}]) @@ -184,12 +166,7 @@ def test_stagein_sync_simple(self): self.assertEqual(_file['errno'], 0) def test_stagein_sync_merged_same(self): - ''' - Multiple files going to the same destination directory. - ''' - if self.travis: - return True - + """Test multiple files going to the same destination directory.""" result = self.data_client.transfer(files=[{'scope': 'mc15_14TeV', 'name': 'HITS.10075481._000432.pool.root.1', 'destination': '/tmp'}, @@ -205,12 +182,7 @@ def test_stagein_sync_merged_same(self): self.assertEqual(_file['errno'], 0) def test_stagein_sync_merged_diff(self): - ''' - Multiple files going to different destination directories. 
- ''' - if self.travis: - return True - + """Test multiple files going to different destination directories.""" tmp_dir1, tmp_dir2 = tempfile.mkdtemp(), tempfile.mkdtemp() result = self.data_client.transfer(files=[{'scope': 'mc15_14TeV', 'name': 'HITS.10075481._000432.pool.root.1', @@ -233,29 +205,21 @@ def test_stagein_sync_merged_diff(self): @unittest.skipIf(not check_env(), "This unit test is broken") class TestHarvesterStageOut(unittest.TestCase): - ''' + """ Automatic stage-out tests for Harvester. from pilot.api import data data_client = data.StageOutClient(site) result = data_client.transfer(files=[{scope, name, ...}, ...]) - ''' + """ def setUp(self): - # skip tests if running through Travis -- github does not have working rucio - - self.travis = os.environ.get('TRAVIS') == 'true' - + """Set up test fixtures.""" # setup pilot data client self.data_client = data.StageOutClient(acopytools=['rucio']) def test_stageout_fail_notfound(self): - ''' - Test error message propagation. - ''' - if self.travis: - return True - + """Test error message propagation.""" result = self.data_client.transfer(files=[{'scope': 'tests', 'file': 'i_do_not_exist', 'rse': 'CERN-PROD_SCRATCHDISK'}, @@ -267,12 +231,7 @@ def test_stageout_fail_notfound(self): self.assertEqual(_file['errno'], 1) def test_stageout_file(self): - ''' - Single file upload with various combinations of parameters. - ''' - if self.travis: - return True - + """Test single file upload with various combinations of parameters.""" tmp_fd, tmp_file1 = tempfile.mkstemp() tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) @@ -311,12 +270,7 @@ def test_stageout_file(self): self.assertEqual(_file['errno'], 0) def test_stageout_file_and_attach(self): - ''' - Single file upload and attach to dataset. - ''' - if self.travis: - return True - + """Test single file upload and attach to dataset.""" tmp_fd, tmp_file1 = tempfile.mkstemp() tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) @@ -342,12 +296,7 @@ def test_stageout_file_and_attach(self): self.assertEqual(_file['errno'], 0) def test_stageout_file_noregister(self): - ''' - Single file upload without registering. - ''' - if self.travis: - return True - + """Single file upload without registering.""" tmp_fd, tmp_file1 = tempfile.mkstemp() tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) @@ -370,22 +319,17 @@ def test_stageout_file_noregister(self): self.assertEqual(_file['errno'], 0) def test_stageout_dir(self): - ''' - Single file upload. - ''' - if self.travis: - return True - + """Test single file upload.""" tmp_dir = tempfile.mkdtemp() - tmp_fd, tmp_file1 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() - tmp_fd, tmp_file2 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() - tmp_fd, tmp_file3 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() @@ -398,22 +342,17 @@ def test_stageout_dir(self): self.assertEqual(_file['errno'], 0) def test_stageout_dir_and_attach(self): - ''' - Single file upload and attach to dataset. 
- ''' - if self.travis: - return True - + """Test single file upload and attach to dataset.""" tmp_dir = tempfile.mkdtemp() - tmp_fd, tmp_file1 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() - tmp_fd, tmp_file2 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() - tmp_fd, tmp_file3 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() @@ -429,22 +368,17 @@ def test_stageout_dir_and_attach(self): self.assertEqual(_file['errno'], 0) def test_stageout_dir_noregister(self): - ''' - Single file upload without registering. - ''' - if self.travis: - return True - + """Test single file upload without registering.""" tmp_dir = tempfile.mkdtemp() - tmp_fd, tmp_file1 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() - tmp_fd, tmp_file2 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() - tmp_fd, tmp_file3 = tempfile.mkstemp(dir=tmp_dir) + tmp_fd, _ = tempfile.mkstemp(dir=tmp_dir) tmp_fdo = os.fdopen(tmp_fd, 'wb') tmp_fdo.write(str(random.randint(1, 2**2048))) tmp_fdo.close() diff --git a/pilot/test/test_jobreport_parser.py b/pilot/test/test_jobreport_parser.py index 397a192e..e291719e 100644 --- a/pilot/test/test_jobreport_parser.py +++ b/pilot/test/test_jobreport_parser.py @@ -19,34 +19,19 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 -import unittest +"""NOT IMPLEMENTED: Unit tests for the job report parser.""" + import json +import unittest from pilot.user.atlas.common import parse_jobreport_data class TestUtils(unittest.TestCase): - """ - Unit tests for utils functions. - """ - - def setUp(self): - # skip tests if running on a Mac -- Macs don't have /proc - #self.mac = False - # if os.environ.get('MACOSX') == 'true': - # self.mac = True - - #from pilot.info import infosys - # infosys.init("AGLT2_TEST-condor") - pass + """Unit tests for utils functions.""" def test_failed_jobreport(self): - """ - .. - - :return: (assertion) - """ - + """Test failed job report.""" report = """ { "cmdLine": "'/ccs/proj/csc108/AtlasReleases/21.0.15/AtlasOffline/21.0.15/InstallArea/x86_64-slc6-gcc49-opt/share/Sim_tf.py'""" \ @@ -156,12 +141,7 @@ def test_failed_jobreport(self): print((json.dumps(parse_jobreport_data(report_data), sort_keys=True, indent=2))) def test_successful_jobreport(self): - """ - .. 
- - :return: (assertion) - """ - + """Test successful job report.""" report = """ { "cmdLine": "'/cvmfs/atlas.cern.ch/repo/sw/software/21.0/AtlasOffline/21.0.15/InstallArea/x86_64-slc6-gcc49-opt/share/Sim_tf.py'""" \ diff --git a/pilot/test/test_utils.py b/pilot/test/test_utils.py index c965febd..b15d0c00 100644 --- a/pilot/test/test_utils.py +++ b/pilot/test/test_utils.py @@ -19,36 +19,41 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 -import unittest +"""Unit tests for pilot utils.""" + import os +import unittest -from pilot.util.workernode import collect_workernode_info, get_disk_space +from pilot.info import infosys +from pilot.util.workernode import ( + collect_workernode_info, + get_disk_space +) -class TestUtils(unittest.TestCase): - """ - Unit tests for utils functions. +def check_env() -> bool: """ + Check whether cvmfs is available. - def setUp(self): - # skip tests if running on a Mac -- Macs don't have /proc - self.mac = False - if os.environ.get('MACOSX') == 'true' or not os.path.exists('/proc/meminfo'): - self.mac = True + To be used to decide whether to skip some test functions. - from pilot.info import infosys - infosys.init("CERN") + :returns: True if not a Mac, otherwise False (bool). + """ + is_mac = os.environ.get('MACOSX') == 'true' or not os.path.exists('/proc/meminfo') + return not is_mac + # return os.path.exists('/cvmfs/atlas.cern.ch/repo/') - def test_collect_workernode_info(self): - """ - Make sure that collect_workernode_info() returns the proper types (float, float, float). - :return: (assertion) - """ +@unittest.skipIf(not check_env(), "This unit test is broken") +class TestUtils(unittest.TestCase): + """Unit tests for utils functions.""" - if self.mac: - return True + def setUp(self): + """Set up test fixtures.""" + infosys.init("CERN") + def test_collect_workernode_info(self): + """Make sure that collect_workernode_info() returns the proper types (float, float, float).""" mem, cpu, disk = collect_workernode_info(path=os.getcwd()) self.assertEqual(type(mem), float) @@ -60,17 +65,8 @@ def test_collect_workernode_info(self): self.assertNotEqual(disk, 0.0) def test_get_disk_space(self): - """ - Verify that get_disk_space() returns the proper type (int). - - :return: (assertion) - """ - - if self.mac: - return True - + """Verify that get_disk_space() returns the proper type (int).""" #queuedata = {'maxwdir': 123456789} - from pilot.info import infosys diskspace = get_disk_space(infosys.queuedata) ## FIX ME LATER diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index c7db5ff1..35fb77bd 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -17,57 +17,35 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 # - Wen Guan, wen.guan@cern.ch, 2018 -from collections import defaultdict +"""Common functions for ATLAS.""" + import fnmatch -from glob import glob import logging import os import re + +from collections import defaultdict +from functools import reduce +from glob import glob from random import randint from signal import SIGTERM, SIGUSR1 +from typing import Any # from tarfile import ExFileObject -try: - from functools import reduce # Python 3 -except ImportError: - pass - -from .container import ( - create_root_container_command, - verify_container_script -) #, create_middleware_container_command -from .dbrelease import get_dbrelease_version, create_dbrelease -from .setup import ( - should_pilot_prepare_setup, - is_standard_atlas_job, - get_asetup, - set_inds, - get_analysis_trf, - get_payload_environment_variables, - replace_lfns_with_turls, -) -from .utilities import ( - get_memory_monitor_setup, - get_network_monitor_setup, - post_memory_monitor_action, - get_memory_monitor_summary_filename, - get_prefetcher_setup, - get_benchmark_setup, - get_memory_monitor_output_filename, - get_metadata_dict_from_txt, -) - from pilot.util.auxiliary import ( get_resource_name, get_key_value, ) - from pilot.common.errorcodes import ErrorCodes -from pilot.common.exception import TrfDownloadFailure, PilotException, FileHandlingFailure +from pilot.common.exception import ( + TrfDownloadFailure, + PilotException, + FileHandlingFailure +) from pilot.util.config import config from pilot.util.constants import ( UTILITY_BEFORE_PAYLOAD, @@ -94,53 +72,73 @@ update_extension, write_file, ) +from pilot.info.filespec import FileSpec from pilot.util.processes import ( convert_ps_to_dict, find_pid, find_cmd_pids, get_trimmed_dictionary, is_child ) - from pilot.util.tracereport import TraceReport -logger = logging.getLogger(__name__) +from .container import create_root_container_command +from .dbrelease import get_dbrelease_version, create_dbrelease +from .setup import ( + should_pilot_prepare_setup, + is_standard_atlas_job, + get_asetup, + set_inds, + get_analysis_trf, + get_payload_environment_variables, + replace_lfns_with_turls, +) +from .utilities import ( + get_memory_monitor_setup, + get_network_monitor_setup, + post_memory_monitor_action, + get_memory_monitor_summary_filename, + get_prefetcher_setup, + get_benchmark_setup, + get_memory_monitor_output_filename, + get_metadata_dict_from_txt, +) +logger = logging.getLogger(__name__) errors = ErrorCodes() -def sanity_check(): +def sanity_check() -> int: """ - Perform an initial sanity check before doing anything else in a - given workflow. This function can be used to verify importing of - modules that are otherwise used much later, but it is better to abort - the pilot if a problem is discovered early. + Perform an initial sanity check before doing anything else in a given workflow. - :return: exit code (0 if all is ok, otherwise non-zero exit code). - """ + This function can be used to verify importing of modules that are otherwise used much later, but it is better to + abort the pilot if a problem is discovered early. - exit_code = 0 + Note: currently this function does not do anything. + :return: exit code (0 if all is ok, otherwise non-zero exit code) (int). 
+ """ #try: # from rucio.client.downloadclient import DownloadClient # from rucio.client.uploadclient import UploadClient # # note: must do something with Download/UploadClients or flake8 # will complain - but do not instantiate - #except Exception as e: - # logger.warning('sanity check failed: %s' % e) + #except Exception as exc: + # logger.warning(f'sanity check failed: {exc}') # exit_code = errors.MIDDLEWAREIMPORTFAILURE - return exit_code + return 0 -def validate(job): +def validate(job: Any) -> bool: """ Perform user specific payload/job validation. + This function will produce a local DBRelease file if necessary (old releases). - :param job: job object. - :return: Boolean (True if validation is successful). + :param job: job object (Any) + :return: True if validation is successful, False otherwise (bool). """ - status = True if 'DBRelease' in job.jobparams: @@ -159,15 +157,15 @@ def validate(job): if status: if job.imagename and job.imagename.startswith('/'): if os.path.exists(job.imagename): - logger.info('verified that image exists: %s', job.imagename) + logger.info(f'verified that image exists: {job.imagename}') else: status = False - logger.warning('image does not exist: %s', job.imagename) + logger.warning(f'image does not exist: {job.imagename}') job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.IMAGENOTFOUND) # cleanup job parameters if only copy-to-scratch #if job.only_copy_to_scratch(): - # logger.debug('job.params=%s' % job.jobparams) + # logger.debug(f'job.params={job.jobparams}') # if ' --usePFCTurl' in job.jobparams: # logger.debug('cleaning up --usePFCTurl from job parameters # since all input is copy-to-scratch') @@ -180,16 +178,16 @@ def validate(job): return status -def open_remote_files(indata, workdir, nthreads): +def open_remote_files(indata: list, workdir: str, nthreads: int) -> (int, str, list): """ Verify that direct i/o files can be opened. - :param indata: list of FileSpec. - :param workdir: working directory (string). - :param nthreads: number of concurrent file open threads (int). - :return: exit code (int), diagnostics (string). + :param indata: list of FileSpec (list) + :param workdir: working directory (str) + :param nthreads: number of concurrent file open threads (int) + :return: exit code (int), diagnostics (str), not opened files (list) + :raises PilotException: in case of pilot error. 
""" - exitcode = 0 diagnostics = "" not_opened = [] @@ -214,85 +212,85 @@ def open_remote_files(indata, workdir, nthreads): if not os.path.exists(full_script_path): # do not set ec since this will be a pilot issue rather than site issue diagnostics = ( - 'cannot perform file open test - script path does ' - 'not exist: %s' % full_script_path + f'cannot perform file open test - script path does not exist: {full_script_path}' ) logger.warning(diagnostics) - logger.warning('tested both path=%s and path=%s (none exists)', dir1, dir2) + logger.warning(f'tested both path={dir1} and path={dir2} (none exists)') return exitcode, diagnostics, not_opened try: copy(full_script_path, final_script_path) except PilotException as exc: # do not set ec since this will be a pilot issue rather than site issue - diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % exc + diagnostics = f'cannot perform file open test - pilot source copy failed: {exc}' logger.warning(diagnostics) return exitcode, diagnostics, not_opened + + # correct the path when containers have been used + final_script_path = os.path.join('.', script) + + _cmd = get_file_open_command(final_script_path, turls, nthreads) + cmd = create_root_container_command(workdir, _cmd) + + timeout = get_timeout_for_remoteio(indata) + logger.info(f'executing file open verification script (timeout={timeout}):\n\n\'{cmd}\'\n\n') + + exitcode, stdout, stderr = execute(cmd, usecontainer=False, timeout=timeout) + if config.Pilot.remotefileverification_log: + fpath = os.path.join(workdir, config.Pilot.remotefileverification_log) + write_file(fpath, stdout + stderr, mute=False) + logger.info(f'remote file open finished with ec={exitcode}') + + # error handling + if exitcode: + # first check for apptainer errors + _exitcode = errors.resolve_transform_error(exitcode, stdout + stderr) + if _exitcode != exitcode: # a better error code was found (COMMANDTIMEDOUT error will be passed through) + return _exitcode, stderr, not_opened + + # note: if the remote files could still be opened the reported error should not be REMOTEFILEOPENTIMEDOUT + _exitcode, diagnostics, not_opened = parse_remotefileverification_dictionary(workdir) + if not _exitcode: + logger.info('remote file could still be opened in spite of previous error') + elif _exitcode: + if exitcode == errors.COMMANDTIMEDOUT and _exitcode == errors.REMOTEFILECOULDNOTBEOPENED: + exitcode = errors.REMOTEFILEOPENTIMEDOUT + elif exitcode == errors.COMMANDTIMEDOUT and _exitcode == errors.REMOTEFILEDICTDOESNOTEXIST: + exitcode = errors.REMOTEFILEOPENTIMEDOUT + diagnostics = f'remote file open command was timed-out and: {diagnostics}' # cannot give further info + else: # REMOTEFILECOULDNOTBEOPENED + exitcode = _exitcode else: - # correct the path when containers have been used - final_script_path = os.path.join('.', script) - - _cmd = get_file_open_command(final_script_path, turls, nthreads) - cmd = create_root_container_command(workdir, _cmd) - - timeout = get_timeout_for_remoteio(indata) - logger.info('executing file open verification script (timeout=%d):\n\n\'%s\'\n\n', timeout, cmd) - - exitcode, stdout, stderr = execute(cmd, usecontainer=False, timeout=timeout) - if config.Pilot.remotefileverification_log: - fpath = os.path.join(workdir, config.Pilot.remotefileverification_log) - write_file(fpath, stdout + stderr, mute=False) - logger.info('remote file open finished with ec=%d', exitcode) - - # error handling - if exitcode: - # first check for apptainer errors - _exitcode = 
errors.resolve_transform_error(exitcode, stdout + stderr) - if _exitcode != exitcode: # a better error code was found (COMMANDTIMEDOUT error will be passed through) - return _exitcode, stderr, not_opened - - # note: if the remote files could still be opened the reported error should not be REMOTEFILEOPENTIMEDOUT - _exitcode, diagnostics, not_opened = parse_remotefileverification_dictionary(workdir) - if not _exitcode: - logger.info('remote file could still be opened in spite of previous error') - elif _exitcode: - if exitcode == errors.COMMANDTIMEDOUT and _exitcode == errors.REMOTEFILECOULDNOTBEOPENED: - exitcode = errors.REMOTEFILEOPENTIMEDOUT - elif exitcode == errors.COMMANDTIMEDOUT and _exitcode == errors.REMOTEFILEDICTDOESNOTEXIST: - exitcode = errors.REMOTEFILEOPENTIMEDOUT - diagnostics = f'remote file open command was timed-out and: {diagnostics}' # cannot give further info - else: # REMOTEFILECOULDNOTBEOPENED - exitcode = _exitcode - else: - exitcode, diagnostics, not_opened = parse_remotefileverification_dictionary(workdir) + exitcode, diagnostics, not_opened = parse_remotefileverification_dictionary(workdir) else: logger.info('nothing to verify (for remote files)') if exitcode: logger.warning(f'remote file open exit code: {exitcode}') + return exitcode, diagnostics, not_opened -def get_timeout_for_remoteio(indata): +def get_timeout_for_remoteio(indata: list) -> int: """ Calculate a proper timeout to be used for remote i/o files. - :param indata: indata object. + :param indata: list of FileSpec objects (list) :return: timeout in seconds (int). """ - remote_io = [fspec.status == 'remote_io' for fspec in indata] + return len(remote_io) * 30 + 600 -def parse_remotefileverification_dictionary(workdir): +def parse_remotefileverification_dictionary(workdir: str) -> (int, str, list): """ Verify that all files could be remotely opened. + Note: currently ignoring if remote file dictionary doesn't exist. - :param workdir: work directory needed for opening remote file dictionary (string). - :return: exit code (int), diagnostics (string). + :param workdir: work directory needed for opening remote file dictionary (str) + :return: exit code (int), diagnostics (str), not opened files (list). """ - exitcode = 0 diagnostics = "" not_opened = [] @@ -309,16 +307,16 @@ def parse_remotefileverification_dictionary(workdir): file_dictionary = read_json(dictionary_path) if not file_dictionary: - diagnostics = 'could not read dictionary from %s' % dictionary_path + diagnostics = f'could not read dictionary from {dictionary_path}' logger.warning(diagnostics) else: for turl in file_dictionary: opened = file_dictionary[turl] if not opened: - logger.info('turl could not be opened: %s', turl) + logger.info(f'turl could not be opened: {turl}') not_opened.append(turl) else: - logger.info('turl could be opened: %s', turl) + logger.info(f'turl could be opened: {turl}') if not_opened: exitcode = errors.REMOTEFILECOULDNOTBEOPENED @@ -327,33 +325,36 @@ def parse_remotefileverification_dictionary(workdir): return exitcode, diagnostics, not_opened -def get_file_open_command(script_path, turls, nthreads, stdout='remote_open.stdout', stderr='remote_open.stderr'): +def get_file_open_command(script_path: str, turls: str, nthreads: int, + stdout: str = 'remote_open.stdout', stderr: str = 'remote_open.stderr') -> str: """ + Return the command for opening remote files. - :param script_path: path to script (string). - :param turls: comma-separated turls (string). - :param nthreads: number of concurrent file open threads (int). 
- :return: comma-separated list of turls (string). + :param script_path: path to script (str) + :param turls: comma-separated turls (str) + :param nthreads: number of concurrent file open threads (int) + :param stdout: stdout file name (str) + :param stderr: stderr file name (str) + :return: comma-separated list of turls (str). """ - cmd = f"{script_path} --turls=\'{turls}\' -w {os.path.dirname(script_path)} -t {nthreads}" if stdout and stderr: cmd += f' 1>{stdout} 2>{stderr}' + return cmd -def extract_turls(indata): +def extract_turls(indata: list) -> str: """ Extract TURLs from indata for direct i/o files. - :param indata: list of FileSpec. - :return: comma-separated list of turls (string). + :param indata: list of FileSpec (list) + :return: comma-separated list of turls (str). """ - # turls = "" # for filespc in indata: # if filespc.status == 'remote_io': - # turls += filespc.turl if not turls else ",%s" % filespc.turl + # turls += filespc.turl if not turls else f",{filespc.turl}" # return turls return ",".join( @@ -361,22 +362,21 @@ def extract_turls(indata): ) -def process_remote_file_traces(path, job, not_opened_turls): +def process_remote_file_traces(path: str, job: Any, not_opened_turls: list): """ Report traces for remote files. + The function reads back the base trace report (common part of all traces) and updates it per file before reporting it to the Rucio server. - :param path: path to base trace report (string). - :param job: job object. - :param not_opened_turls: list of turls that could not be opened (list). - :return: + :param path: path to base trace report (str) + :param job: job object (Any) + :param not_opened_turls: list of turls that could not be opened (list) """ - try: base_trace_report = read_json(path) except PilotException as exc: - logger.warning('failed to open base trace report (cannot send trace reports): %s', exc) + logger.warning(f'failed to open base trace report (cannot send trace reports): {exc}') else: if not base_trace_report: logger.warning('failed to read back base trace report (cannot send trace reports)') @@ -405,17 +405,17 @@ def process_remote_file_traces(path, job, not_opened_turls): if trace_report: trace_report.send() else: - logger.warning('failed to create trace report for turl=%s', fspec.turl) + logger.warning(f'failed to create trace report for turl={fspec.turl}') -def get_protocol(surl, event_type): +def get_protocol(surl: str, event_type: str) -> str: """ Extract the protocol from the surl for event type get_sm_a. - :param surl: SURL (string). - :return: protocol (string). + :param surl: SURL (str) + :param event_type: event type (str) + :return: protocol (str). """ - protocol = '' if event_type != 'get_sm_a': return '' @@ -427,29 +427,27 @@ def get_protocol(surl, event_type): return protocol -def get_nthreads(catchall): +def get_nthreads(catchall: str) -> int: """ Extract number of concurrent file open threads from catchall. + Return nthreads=1 if nopenfiles=.. is not present in catchall. - :param catchall: queuedata catchall (string). + :param catchall: queuedata catchall (str) :return: number of threads (int). """ - _nthreads = get_key_value(catchall, key='nopenfiles') return _nthreads if _nthreads else 1 -def get_payload_command(job): +def get_payload_command(job: Any) -> str: """ - Return the full command for executing the payload, including the - sourcing of all setup files and setting of environment variables. 
+ Return the full command for executing the payload, including the sourcing of all setup files and setting of environment variables. - :param job: job object. - :raises PilotException: TrfDownloadFailure. + :param job: job object (Any) :return: command (string). + :raises TrfDownloadFailure: in case of download failure. """ - # Should the pilot do the setup or does jobPars already contain the information? preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) @@ -458,13 +456,11 @@ def get_payload_command(job): # Is it a user job or not? userjob = job.is_analysis() - logger.info('pilot is running a %s job', 'user analysis' if userjob else 'production') + tmp = 'user analysis' if userjob else 'production' + logger.info(f'pilot is running a {tmp} job') resource_name = get_resource_name() # 'grid' if no hpc_resource is set - - # Python 3, level -1 -> 0 - modname = 'pilot.user.atlas.resource.%s' % resource_name - resource = __import__(modname, globals(), locals(), [resource_name], 0) + resource = __import__(f'pilot.user.atlas.resource.{resource_name}', globals(), locals(), [resource_name], 0) # make sure that remote file can be opened before executing payload catchall = job.infosys.queuedata.catchall.lower() if job.infosys.queuedata.catchall else '' @@ -476,14 +472,13 @@ def get_payload_command(job): logger.debug('executing open_remote_files()') exitcode, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir, get_nthreads(catchall)) except Exception as exc: - logger.warning('caught std exception: %s', exc) + logger.warning(f'caught std exception: {exc}') else: # read back the base trace report path = os.path.join(job.workdir, config.Pilot.base_trace_report) if not os.path.exists(path): - logger.warning(( - 'base trace report does not exist (%s) - input file ' - 'traces should already have been sent'), path) + logger.warning(f'base trace report does not exist ({path}) - ' + f'input file traces should already have been sent') else: process_remote_file_traces(path, job, not_opened_turls) @@ -536,16 +531,10 @@ def get_payload_command(job): # prepend PanDA job id in case it is not there already (e.g. runcontainer jobs) if 'export PandaID' not in cmd: - cmd = "export PandaID=%s;" % job.jobid + cmd + cmd = f"export PandaID={job.jobid};" + cmd cmd = cmd.replace(';;', ';') - # For direct access in prod jobs, we need to substitute the input file names - # with the corresponding TURLs - # get relevant file transfer info - #use_copy_tool, use_direct_access, use_pfc_turl = get_file_transfer_info(job) - #if not userjob and use_direct_access and job.transfertype == 'direct': - ## ported from old logic if not userjob and not job.is_build_job() and job.has_remoteio(): ## ported from old logic but still it looks strange (anisyonk) @@ -566,27 +555,22 @@ def get_payload_command(job): # Explicitly add the ATHENA_PROC_NUMBER (or JOB value) cmd = add_athena_proc_number(cmd) - - #if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': - # cmd = 'export PYTHONPATH=/usr/lib64/python3.6:/usr/local/lib/python3.6/site-packages/dask:$PYTHONPATH' + cmd - if job.dask_scheduler_ip: cmd += f'export DASK_SCHEDULER_IP={job.dask_scheduler_ip}; ' + cmd - logger.info('payload run command: %s', cmd) + logger.info(f'payload run command: {cmd}') return cmd -def prepend_env_vars(environ, cmd): +def prepend_env_vars(environ: str, cmd: str) -> str: """ Prepend the payload command with environmental variables from PQ.environ if set. - :param environ: PQ.environ (string). 
- :param cmd: payload command (string). - :return: updated payload command (string). + :param environ: PQ.environ (str) + :param cmd: payload command (str) + :return: updated payload command (str). """ - exports = get_exports(environ) exports_to_add = '' for _cmd in exports: @@ -598,26 +582,25 @@ def prepend_env_vars(environ, cmd): return cmd -def get_key_values(from_string): +def get_key_values(from_string: str) -> list: """ Return a list of key value tuples from given string. + Example: from_string = 'KEY1=VALUE1 KEY2=VALUE2' -> [('KEY1','VALUEE1'), ('KEY2', 'VALUE2')] - :param from_string: string containing key-value pairs (string). + :param from_string: string containing key-value pairs (str) :return: list of key-pair tuples (list). """ - return re.findall(re.compile(r"\b(\w+)=(.*?)(?=\s\w+=\s*|$)"), from_string) -def get_exports(from_string): +def get_exports(from_string: str) -> list: """ Return list of exports from given string. - :param from_string: string containing key-value pairs (string). + :param from_string: string containing key-value pairs (str) :return: list of export commands (list). """ - exports = [] key_values = get_key_values(from_string) logger.debug(f'extracted key-values: {key_values}') @@ -636,18 +619,16 @@ def get_exports(from_string): return exports -def get_normal_payload_command(cmd, job, preparesetup, userjob): +def get_normal_payload_command(cmd: str, job: Any, preparesetup: bool, userjob: bool) -> str: """ Return the payload command for a normal production/analysis job. - :param cmd: any preliminary command setup (string). - :param job: job object. - :param userjob: True for user analysis jobs, False otherwise (bool). - :param preparesetup: True if the pilot should prepare the setup, - False if already in the job parameters. - :return: normal payload command (string). + :param cmd: any preliminary command setup (str) + :param job: job object (Any) + :param userjob: True for user analysis jobs, False otherwise (bool) + :param preparesetup: True if the pilot should prepare the setup, False if already in the job parameters (bool) + :return: normal payload command (str). """ - # set the INDS env variable # (used by runAthena but also for EventIndex production jobs) set_inds(job.datasetin) # realDatasetsIn @@ -658,7 +639,7 @@ def get_normal_payload_command(cmd, job, preparesetup, userjob): if exitcode != 0: raise TrfDownloadFailure(diagnostics) - logger.debug('user analysis trf: %s', trf_name) + logger.debug(f'user analysis trf: {trf_name}') if preparesetup: _cmd = get_analysis_run_command(job, trf_name) @@ -674,41 +655,41 @@ def get_normal_payload_command(cmd, job, preparesetup, userjob): if job.is_eventservice: if job.corecount: - cmd += '; export ATHENA_PROC_NUMBER=%s' % job.corecount - cmd += '; export ATHENA_CORE_NUMBER=%s' % job.corecount + cmd += f'; export ATHENA_PROC_NUMBER={job.corecount}' + cmd += f'; export ATHENA_CORE_NUMBER={job.corecount}' else: cmd += '; export ATHENA_PROC_NUMBER=1' cmd += '; export ATHENA_CORE_NUMBER=1' # Add the transform and the job parameters (production jobs) if preparesetup: - cmd += "; %s %s" % (job.transformation, job.jobparams) + cmd += f"; {job.transformation} {job.jobparams}" else: cmd += "; " + job.jobparams return cmd -def get_generic_payload_command(cmd, job, preparesetup, userjob): +def get_generic_payload_command(cmd: str, job: Any, preparesetup: bool, userjob: bool) -> str: """ + Return the payload command for a generic job. - :param cmd: - :param job: job object. 
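The key-value extraction used by get_key_values() and get_exports() can be exercised on its own; a minimal sketch using the same regular expression as above (the sample PQ.environ string is made up).

import re

KEY_VALUE_PATTERN = re.compile(r"\b(\w+)=(.*?)(?=\s\w+=\s*|$)")


def key_values(from_string: str) -> list:
    # return key-value tuples, e.g. 'KEY1=VALUE1 KEY2=VALUE2' -> [('KEY1', 'VALUE1'), ('KEY2', 'VALUE2')]
    return re.findall(KEY_VALUE_PATTERN, from_string)


if __name__ == "__main__":
    pairs = key_values("FRONTIER_SERVER=frontier.example.org RUCIO_ACCOUNT=pilot")
    # [('FRONTIER_SERVER', 'frontier.example.org'), ('RUCIO_ACCOUNT', 'pilot')]
    exports = [f"export {key}={value};" for key, value in pairs]
    print(exports)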
- :param preparesetup: - :param userjob: True for user analysis jobs, False otherwise (bool). - :return: generic job command (string). + :param cmd: any preliminary command setup (str) + :param job: job object (Any) + :param preparesetup: True if the pilot should prepare the setup, False if already in the job parameters (bool) + :param userjob: True for user analysis jobs, False otherwise (bool) + :return: generic job command (str). """ - if userjob: # Try to download the trf #if job.imagename != "" or "--containerImage" in job.jobparams: # job.transformation = os.path.join(os.path.dirname(job.transformation), "runcontainer") - # logger.warning('overwrote job.transformation, now set to: %s' % job.transformation) + # logger.warning(f'overwrote job.transformation, now set to: {job.transformation}') exitcode, diagnostics, trf_name = get_analysis_trf(job.transformation, job.workdir) if exitcode != 0: raise TrfDownloadFailure(diagnostics) - logger.debug('user analysis trf: %s', trf_name) + logger.debug(f'user analysis trf: {trf_name}') if preparesetup: _cmd = get_analysis_run_command(job, trf_name) @@ -724,49 +705,44 @@ def get_generic_payload_command(cmd, job, preparesetup, userjob): elif verify_release_string(job.homepackage) != 'NULL' and job.homepackage != ' ': if preparesetup: - cmd = "python %s/%s %s" % (job.homepackage, job.transformation, job.jobparams) + cmd = f"python {job.homepackage}/{job.transformation} {job.jobparams}" else: cmd = job.jobparams + elif preparesetup: + cmd = f"python {job.transformation} {job.jobparams}" else: - if preparesetup: - cmd = "python %s %s" % (job.transformation, job.jobparams) - else: - cmd = job.jobparams + cmd = job.jobparams return cmd -def add_athena_proc_number(cmd): +def add_athena_proc_number(cmd: str) -> str: """ - Add the ATHENA_PROC_NUMBER and ATHENA_CORE_NUMBER to - the payload command if necessary. + Add the ATHENA_PROC_NUMBER and ATHENA_CORE_NUMBER to the payload command if necessary. - :param cmd: payload execution command (string). - :return: updated payload execution command (string). + :param cmd: payload execution command (str) + :return: updated payload execution command (str). 
""" - # get the values if they exist try: value1 = int(os.environ['ATHENA_PROC_NUMBER_JOB']) except (TypeError, KeyError, ValueError) as exc: - logger.warning('failed to convert ATHENA_PROC_NUMBER_JOB to int: %s', exc) + logger.warning(f'failed to convert ATHENA_PROC_NUMBER_JOB to int: {exc}') value1 = None try: value2 = int(os.environ['ATHENA_CORE_NUMBER']) except (TypeError, KeyError, ValueError) as exc: - logger.warning('failed to convert ATHENA_CORE_NUMBER to int: %s', exc) + logger.warning(f'failed to convert ATHENA_CORE_NUMBER to int:{exc}') value2 = None if "ATHENA_PROC_NUMBER" not in cmd: if "ATHENA_PROC_NUMBER" in os.environ: - cmd = 'export ATHENA_PROC_NUMBER=%s;' % os.environ['ATHENA_PROC_NUMBER'] + cmd + cmd = f"export ATHENA_PROC_NUMBER={os.environ['ATHENA_PROC_NUMBER']};" + cmd elif "ATHENA_PROC_NUMBER_JOB" in os.environ and value1: if value1 > 1: - cmd = 'export ATHENA_PROC_NUMBER=%d;' % value1 + cmd + cmd = f'export ATHENA_PROC_NUMBER={value1};' + cmd else: - logger.info(( - "will not add ATHENA_PROC_NUMBER to cmd " - "since the value is %s"), str(value1)) + logger.info(f"will not add ATHENA_PROC_NUMBER to cmd since the value is {value1}") else: logger.warning(( "don't know how to set ATHENA_PROC_NUMBER " @@ -776,9 +752,9 @@ def add_athena_proc_number(cmd): if 'ATHENA_CORE_NUMBER' in os.environ and value2: if value2 > 1: - cmd = 'export ATHENA_CORE_NUMBER=%d;' % value2 + cmd + cmd = f'export ATHENA_CORE_NUMBER={value2};' + cmd else: - logger.info("will not add ATHENA_CORE_NUMBER to cmd since the value is %s", str(value2)) + logger.info(f"will not add ATHENA_CORE_NUMBER to cmd since the value is {value2}") else: logger.warning(( 'there is no ATHENA_CORE_NUMBER in os.environ ' @@ -787,14 +763,13 @@ def add_athena_proc_number(cmd): return cmd -def verify_release_string(release): +def verify_release_string(release: str or None) -> str: """ Verify that the release (or homepackage) string is set. - :param release: release or homepackage string that might or might not be set. - :return: release (set string). + :param release: release or homepackage string that might or might not be set (str or None) + :return: release (str). """ - if release is None: release = "" release = release.upper() @@ -806,16 +781,14 @@ def verify_release_string(release): return release -def add_makeflags(job_core_count, cmd): +def add_makeflags(job_core_count: int, cmd: str) -> str: """ - Correct for multi-core if necessary (especially important in - case coreCount=1 to limit parallel make). + Correct for multicore if necessary (especially important in case coreCount=1 to limit parallel make). - :param job_core_count: core count from the job definition (int). - :param cmd: payload execution command (string). - :return: updated payload execution command (string). + :param job_core_count: core count from the job definition (int) + :param cmd: payload execution command (str) + :return: updated payload execution command (str). 
""" - # ATHENA_PROC_NUMBER is set in Node.py using the schedconfig value try: core_count = int(os.environ.get('ATHENA_PROC_NUMBER')) @@ -831,7 +804,7 @@ def add_makeflags(job_core_count, cmd): if core_count >= 1: # Note: the original request (AF) was to use j%d # and not -j%d, now using the latter - cmd += "export MAKEFLAGS=\'-j%d QUICK=1 -l1\';" % (core_count) + cmd += f"export MAKEFLAGS=\'-j{core_count} QUICK=1 -l1\';" # make sure that MAKEFLAGS is always set if "MAKEFLAGS=" not in cmd: @@ -840,30 +813,19 @@ def add_makeflags(job_core_count, cmd): return cmd -def get_analysis_run_command(job, trf_name): # noqa: C901 +def get_analysis_run_command(job: Any, trf_name: str) -> str: # noqa: C901 """ Return the proper run command for the user job. Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn - :param job: job object. - :param trf_name: name of the transform that will run the job (string). - Used when containers are not used. - :return: command (string). + :param job: job object (Any) + :param trf_name: name of the transform that will run the job (str) + :return: command (str). """ - cmd = "" - # get relevant file transfer info - #use_copy_tool, use_direct_access, use_pfc_turl = get_file_transfer_info(job) - # check if the input files are to be accessed locally (ie if prodDBlockToken is set to local) - ## useless since stage-in phase has already passed (DEPRECATE ME, anisyonk) - #if job.is_local(): - # logger.debug('switched off direct access for local prodDBlockToken') - # use_direct_access = False - # use_pfc_turl = False - # add the user proxy if 'X509_USER_PROXY' in os.environ and not job.imagename: logger.debug(f'X509_UNIFIED_DISPATCH={os.environ.get("X509_UNIFIED_DISPATCH")}') @@ -880,28 +842,28 @@ def get_analysis_run_command(job, trf_name): # noqa: C901 # set up trfs if job.imagename == "": # user jobs with no imagename defined - cmd += './%s %s' % (trf_name, job.jobparams) + cmd += f'./{trf_name} {job.jobparams}' else: if job.is_analysis() and job.imagename: - cmd += './%s %s' % (trf_name, job.jobparams) + cmd += f'./{trf_name} {job.jobparams}' else: - cmd += 'python %s %s' % (trf_name, job.jobparams) + cmd += f'python {trf_name} {job.jobparams}' imagename = job.imagename # check if image is on disk as defined by envar PAYLOAD_CONTAINER_LOCATION payload_container_location = os.environ.get('PAYLOAD_CONTAINER_LOCATION') if payload_container_location is not None: - logger.debug("$PAYLOAD_CONTAINER_LOCATION = %s", payload_container_location) + logger.debug(f"$PAYLOAD_CONTAINER_LOCATION = {payload_container_location}") # get container name containername = imagename.rsplit('/')[-1] image_location = os.path.join(payload_container_location, containername) if os.path.exists(image_location): - logger.debug("image exists at %s", image_location) + logger.debug(f"image exists at {image_location}") imagename = image_location # restore the image name if necessary if 'containerImage' not in cmd and 'runcontainer' in trf_name: - cmd += ' --containerImage=%s' % imagename + cmd += f' --containerImage={imagename}' # add control options for PFC turl and direct access #if job.indata: ## DEPRECATE ME (anisyonk) @@ -931,14 +893,15 @@ def get_analysis_run_command(job, trf_name): # noqa: C901 lfns, guids = job.get_lfns_and_guids() _guids = get_guids_from_jobparams(job.jobparams, lfns, guids) if _guids: - cmd += ' --inputGUIDs \"%s\"' % (str(_guids)) + cmd += f' --inputGUIDs "{str(_guids)}"' return cmd -def get_guids_from_jobparams(jobparams, infiles, infilesguids): +def 
get_guids_from_jobparams(jobparams: str, infiles: list, infilesguids: list) -> list: """ Extract the correct guid from the input file list. + The guids list is used for direct reading. 1. extract input file list for direct reading from job parameters 2. for each input file in this list, find the corresponding guid from @@ -946,12 +909,11 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): Since the job parameters string is entered by a human, the order of the input files might not be the same. - :param jobparams: job parameters. - :param infiles: input file list. - :param infilesguids: input file guids list. - :return: guids list. + :param jobparams: job parameters (str) + :param infiles: input file list (list) + :param infilesguids: input file guids list (list) + :return: guids (list). """ - guidlist = [] jobparams = jobparams.replace("'", "") jobparams = jobparams.replace(", ", ",") @@ -962,10 +924,10 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): if directreadinginputfiles != []: _infiles = directreadinginputfiles[0].split(",") else: - match = re.search(r"-i ([A-Za-z0-9.\[\],_-]+) ", jobparams) # Python 3 (added r) + match = re.search(r"-i ([A-Za-z0-9.\[\],_-]+) ", jobparams) if match is not None: compactinfiles = match.group(1) - match = re.search(r'(.*)\[(.+)\](.*)\[(.+)\]', compactinfiles) # Python 3 (added r) + match = re.search(r'(.*)\[(.+)\](.*)\[(.+)\]', compactinfiles) if match is not None: infiles = [] head = match.group(1) @@ -974,7 +936,7 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): attr = match.group(4).split(',') for idx, item in enumerate(body): - lfn = '%s%s%s%s' % (head, item, tail, attr[idx]) + lfn = f'{head}{item}{tail}{attr[idx]}' infiles.append(lfn) else: infiles = [compactinfiles] @@ -985,7 +947,7 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): try: index = infiles.index(infile) except ValueError as exc: - logger.warning("exception caught: %s (direct reading will fail)", exc) + logger.warning(f"exception caught: {exc} (direct reading will fail)") else: # add the corresponding guid to the list guidlist.append(infilesguids[index]) @@ -993,49 +955,14 @@ def get_guids_from_jobparams(jobparams, infiles, infilesguids): return guidlist -def get_file_transfer_info(job): ## TO BE DEPRECATED, NOT USED (anisyonk) - """ - Return information about desired file transfer. - - :param job: job object - :return: use copy tool (boolean), use direct access (boolean), - use PFC Turl (boolean). - """ - - use_copy_tool = True - use_direct_access = False - use_pfc_turl = False - - # check with schedconfig - is_lan = job.infosys.queuedata.direct_access_lan - is_wan = job.infosys.queuedata.direct_access_wan - if not job.is_build_job() and (is_lan or is_wan or job.transfertype == 'direct'): - # override if all input files are copy-to-scratch - if job.only_copy_to_scratch(): - logger.info(( - 'all input files are copy-to-scratch ' - '(--usePFCTurl and --directIn will not be set)')) - else: - logger.debug('--usePFCTurl and --directIn will be set') - use_copy_tool = False - use_direct_access = True - use_pfc_turl = True - - return use_copy_tool, use_direct_access, use_pfc_turl - - -def test_job_data(job): +def test_job_data(job: Any): """ - REMOVE THIS + Test function to verify that the job object contains the expected data. 
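The compact input-file expansion in get_guids_from_jobparams() relies on the head[body]tail[attr] regular expression above; a self-contained sketch (the example LFNs are made up).

import re


def expand_compact_infiles(compact: str) -> list:
    # expand e.g. 'EVNT.01416937._[000003,000004].pool.root.[1,1]' into full LFNs,
    # using the same head[body]tail[attr] regex as in the code above
    match = re.search(r'(.*)\[(.+)\](.*)\[(.+)\]', compact)
    if match is None:
        return [compact]
    head, tail = match.group(1), match.group(3)
    body = match.group(2).split(',')
    attr = match.group(4).split(',')
    return [f'{head}{item}{tail}{attr[idx]}' for idx, item in enumerate(body)]


if __name__ == "__main__":
    print(expand_compact_infiles('EVNT.01416937._[000003,000004].pool.root.[1,1]'))
    # ['EVNT.01416937._000003.pool.root.1', 'EVNT.01416937._000004.pool.root.1']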
- :param job: job object - :return: + :param job: job object (Any) """ - # in case the job was created with --outputs="regex|DST_.*\.root", we can now look for the corresponding # output files and add them to the output file list - from pilot.info.filespec import FileSpec - # add a couple of files to replace current output filesizeinbytes = 1024 outputfiles = ['DST_.random1.root', 'DST_.random2.root', 'DST_.random3.root'] @@ -1086,17 +1013,17 @@ def test_job_data(job): logger.debug('no regex found in outdata file list') -def update_job_data(job): +def update_job_data(job: Any): """ + Update the job object. + This function can be used to update/add data to the job object. E.g. user specific information can be extracted from other job object fields. In the case of ATLAS, information is extracted from the metadata field and added to other job object fields. - :param job: job object - :return: + :param job: job object (Any). """ - ## comment from Alexey: ## it would be better to reallocate this logic (as well as parse ## metadata values)directly to Job object since in general it's Job @@ -1111,16 +1038,15 @@ def update_job_data(job): if 'exeErrorDiag' in job.metadata: job.exeerrordiag = job.metadata['exeErrorDiag'] if job.exeerrordiag: - logger.warning('payload failed: exeErrorDiag=%s', job.exeerrordiag) + logger.warning(f'payload failed: exeErrorDiag={job.exeerrordiag}') # determine what should be staged out job.stageout = stageout # output and log file or only log file - work_attributes = None try: work_attributes = parse_jobreport_data(job.metadata) except Exception as exc: - logger.warning('failed to parse job report (cannot set job.nevents): %s', exc) + logger.warning(f'failed to parse job report (cannot set job.nevents): {exc}') else: # note: the number of events can be set already at this point # if the value was extracted from the job report (a more thorough @@ -1133,18 +1059,15 @@ def update_job_data(job): # has created additional (overflow) files. Also make sure all guids are # assigned (use job report value if present, otherwise generate the guid) is_raythena = os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') == 'raythena' - if is_raythena: - return - - if job.metadata and not job.is_eventservice: - # keep this for now, complicated to merge with verify_output_files? - extract_output_file_guids(job) - try: - verify_output_files(job) - except Exception as exc: - logger.warning('exception caught while trying verify output files: %s', exc) - else: - if not job.allownooutput: # i.e. if it's an empty list/string, do nothing + if not is_raythena: + if job.metadata and not job.is_eventservice: + # keep this for now, complicated to merge with verify_output_files? + extract_output_file_guids(job) + try: + verify_output_files(job) + except Exception as exc: + logger.warning(f'exception caught while trying verify output files: {exc}') + elif not job.allownooutput: # i.e. if it's an empty list/string, do nothing logger.debug(( "will not try to extract output files from jobReport " "for user job (and allowNoOut list is empty)")) @@ -1152,19 +1075,18 @@ def update_job_data(job): # remove the files listed in allowNoOutput if they don't exist remove_no_output_files(job) - validate_output_data(job) + validate_output_data(job) -def validate_output_data(job): +def validate_output_data(job: Any): """ Validate output data. + Set any missing GUIDs and make sure the output file names follow the ATLAS naming convention - if not, set the error code. - :param job: job object. 
- :return: + :param job: job object (Any). """ - ## validate output data (to be moved into the JobData) ## warning: do no execute this code unless guid lookup in job report # has failed - pilot should only generate guids @@ -1175,11 +1097,7 @@ def validate_output_data(job): for dat in job.outdata: if not dat.guid: dat.guid = get_guid() - logger.warning( - 'guid not set: generated guid=%s for lfn=%s', - dat.guid, - dat.lfn - ) + logger.warning(f'guid not set: generated guid={dat.guid} for lfn={dat.lfn}') # is the output file following the naming convention? found = re.findall(pattern, dat.lfn) if found: @@ -1203,76 +1121,69 @@ def validate_output_data(job): logger.debug('verified that all output files follow the ATLAS naming convention') -def naming_convention_pattern(): +def naming_convention_pattern() -> str: """ Return a regular expression pattern in case the output file name should be verified. - pattern=re.compile(r'^[A-Za-z0-9][A-Za-z0-9\\.\\-\\_]{1,250}$') + Pattern as below in the return statement will match the following file names: re.findall(pattern, 'AOD.29466419._001462.pool.root.1') ['AOD.29466419._001462.pool.root.1'] - :return: raw string. + :return: raw string (str). """ - max_filename_size = 250 - return r'^[A-Za-z0-9][A-Za-z0-9\\.\\-\\_]{1,%s}$' % max_filename_size + # pydocstyle does not like the backslash in the following line, but it is needed + return fr"^[A-Za-z0-9][A-Za-z0-9.\-_]{{1,{max_filename_size}}}$" -def get_stageout_label(job): + +def get_stageout_label(job: Any): """ Get a proper stage-out label. - :param job: job object. - :return: "all"/"log" depending on stage-out type (string). + :param job: job object (Any) + :return: "all"/"log" depending on stage-out type (str). """ - stageout = "all" if job.is_eventservice: logger.info('event service payload, will only stage-out log') stageout = "log" - else: + elif 'exeErrorCode' in job.metadata: # handle any error codes - if 'exeErrorCode' in job.metadata: - job.exeerrorcode = job.metadata['exeErrorCode'] - if job.exeerrorcode == 0: - stageout = "all" - else: - logger.info('payload failed: exeErrorCode=%d', job.exeerrorcode) - stageout = "log" + job.exeerrorcode = job.metadata['exeErrorCode'] + if job.exeerrorcode == 0: + stageout = "all" + else: + logger.info(f'payload failed: exeErrorCode={job.exeerrorcode}') + stageout = "log" return stageout -def update_output_for_hpo(job): +def update_output_for_hpo(job: Any): """ Update the output (outdata) for HPO jobs. - :param job: job object. - :return: + :param job: job object (Any). """ - try: new_outdata = discover_new_outdata(job) except Exception as exc: - logger.warning('exception caught while discovering new outdata: %s', exc) + logger.warning(f'exception caught while discovering new outdata: {exc}') else: if new_outdata: - logger.info(( - 'replacing job outdata with discovered output ' - '(%d file(s))'), len(new_outdata)) + logger.info(f'replacing job outdata with discovered output ({len(new_outdata)} file(s))') job.outdata = new_outdata -def discover_new_outdata(job): +def discover_new_outdata(job: Any): """ Discover new outdata created by HPO job. - :param job: job object. - :return: new_outdata (list of FileSpec objects) + :param job: job object (Any) + :return: new_outdata (list of FileSpec objects). 
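The naming-convention check can be tried in isolation; this sketch uses the same pattern returned by naming_convention_pattern().

import re

MAX_FILENAME_SIZE = 250
# same pattern as naming_convention_pattern() above
PATTERN = re.compile(fr"^[A-Za-z0-9][A-Za-z0-9.\-_]{{1,{MAX_FILENAME_SIZE}}}$")


def follows_naming_convention(lfn: str) -> bool:
    # True if the output file name matches the ATLAS naming convention
    return bool(re.findall(PATTERN, lfn))


if __name__ == "__main__":
    print(follows_naming_convention('AOD.29466419._001462.pool.root.1'))  # True
    print(follows_naming_convention('bad/name with spaces'))              # False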
""" - - from pilot.info.filespec import FileSpec new_outdata = [] for outdata_file in job.outdata: @@ -1302,9 +1213,9 @@ def discover_new_outdata(job): return new_outdata -def discover_new_output(name_pattern, workdir): +def discover_new_output(name_pattern: str, workdir: str) -> dict: """ - Discover new output created by HPO job in the given work dir. + Discover new output created by HPO job in the given work directory. name_pattern for known 'filename' is 'filename_N' (N = 0, 1, 2, ..). Example: name_pattern = 23578835.metrics.000001.tgz @@ -1312,13 +1223,12 @@ def discover_new_output(name_pattern, workdir): new_output = { lfn: {'path': path, 'size': size, 'checksum': checksum}, .. } - :param name_pattern: assumed name pattern for file to discover (string). - :param workdir: work directory (string). - :return: new_output (dictionary). + :param name_pattern: assumed name pattern for file to discover (str) + :param workdir: work directory (str) + :return: new_output (dict). """ - new_output = {} - outputs = glob("%s/%s_*" % (workdir, name_pattern)) + outputs = glob(f"{workdir}/{name_pattern}_*") if outputs: lfns = [os.path.basename(path) for path in outputs] for lfn, path in list(zip(lfns, outputs)): @@ -1327,7 +1237,7 @@ def discover_new_output(name_pattern, workdir): # get checksum try: checksum = calculate_checksum(path, algorithm=config.File.checksum_type) - except (FileHandlingFailure, NotImplementedError, Exception) as exc: + except (FileHandlingFailure, NotImplementedError) as exc: logger.warning(f'failed to create file info (filesize={filesize}) for lfn={lfn}: {exc}') else: if filesize and checksum: @@ -1338,18 +1248,18 @@ def discover_new_output(name_pattern, workdir): return new_output -def extract_output_file_guids(job): +def extract_output_file_guids(job: Any) -> None: """ - Extract output file info from the job report and make sure all guids\ - are assigned (use job report value if present, otherwise generate the guid.\ + Extract output file info from the job report and make sure all guids are assigned. + + Use job report value if present, otherwise generate the guid. Note: guid generation is done later, not in this function since this function might not be called if metadata info is not found prior - to the call). + to the call. - :param job: job object. - :return: + :param job: job object (Any) + :return: None. 
""" - # make sure there is a defined output file list in the job report - # unless it is allowed by task parameter allowNoOutput if not job.allownooutput: @@ -1375,13 +1285,9 @@ def extract_output_file_guids(job): # job definition (March 18 change, v 2.5.2) if lfn in data: data[lfn].guid = fdat['file_guid'] - logger.info(( - 'set guid=%s for lfn=%s ' - '(value taken from job report)'), data[lfn].guid, lfn) + logger.info(f'set guid={data[lfn].guid} for lfn={lfn} (value taken from job report)') else: # found new entry - logger.warning(( - 'pilot no longer considers output files not mentioned ' - 'in job definition (lfn=%s)'), lfn) + logger.warning(f'pilot no longer considers output files not mentioned in job definition (lfn={lfn})') continue #if job.outdata: @@ -1400,22 +1306,25 @@ def extract_output_file_guids(job): for fspec in job.outdata: if fspec.guid != data[fspec.lfn].guid: fspec.guid = data[fspec.lfn].guid - logger.debug('reset guid=%s for lfn=%s', fspec.guid, fspec.lfn) + logger.debug(f'reset guid={fspec.guid} for lfn={fspec.lfn}') + elif fspec.guid: + logger.debug(f'verified guid={fspec.guid} for lfn={fspec.lfn}') else: - if fspec.guid: - logger.debug('verified guid=%s for lfn=%s', fspec.guid, fspec.lfn) - else: - logger.warning('guid not set for lfn=%s', fspec.lfn) + logger.warning(f'guid not set for lfn={fspec.lfn}') #if extra: #logger.info('found extra output files in job report, # will overwrite output file list: extra=%s' % extra) #job.outdata = extra + return -def verify_output_files(job): + +def verify_output_files(job: Any) -> bool: """ - Make sure that the known output files from the job definition are listed - in the job report and number of processed events is greater than zero. + Verify that the output files from the job definition are listed in the job report. + + Also make sure that the number of processed events is greater than zero. + If the output file is not listed in the job report, then if the file is listed in allowNoOutput remove it from stage-out, otherwise fail the job. @@ -1423,10 +1332,9 @@ def verify_output_files(job): there with zero events. Then if allownooutput is not set - fail the job. If it is set, then do not store the output, and finish ok. - :param job: job object. - :return: Boolean (and potentially updated job.outdata list) + :param job: job object (Any) + :return: True if output files were validated correctly, False otherwise (bool). """ - failed = False # get list of output files from the job definition @@ -1443,43 +1351,32 @@ def verify_output_files(job): output = job.metadata.get('files', {}).get('output', None) if not output and output is not None: # ie empty list, output=[] - are all known output files in allowNoOutput? 
- logger.warning(( - 'encountered an empty output file list in job report, ' - 'consulting allowNoOutput list')) + logger.warning('encountered an empty output file list in job report, consulting allowNoOutput list') failed = False for lfn in lfns_jobdef: if lfn not in job.allownooutput: if job.is_analysis(): - logger.warning(( - 'lfn %s is not in allowNoOutput list - ' - 'ignore for user job'), - lfn - ) + logger.warning(f'lfn {lfn} is not in allowNoOutput list - ignore for user job') else: failed = True - logger.warning( - 'lfn %s is not in allowNoOutput list - job will fail', - lfn - ) + logger.warning(f'lfn {lfn} is not in allowNoOutput list - job will fail') job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) break else: - logger.info('lfn %s listed in allowNoOutput - will be removed from stage-out', lfn) + logger.info(f'lfn {lfn} listed in allowNoOutput - will be removed from stage-out') remove_from_stageout(lfn, job) elif output is None: # ie job report is ancient / output could not be extracted - logger.warning(( - 'output file list could not be extracted from job report ' - '(nothing to verify)')) + logger.warning('output file list could not be extracted from job report (nothing to verify)') else: verified, nevents = verify_extracted_output_files(output, lfns_jobdef, job) failed = (not verified) if nevents > 0 and not failed and job.nevents == 0: job.nevents = nevents - logger.info('number of events from summed up output files: %d', nevents) + logger.info(f'number of events from summed up output files: {nevents}') else: - logger.info('number of events previously set to %d', job.nevents) + logger.info(f'number of events previously set to {job.nevents}') status = (not failed) @@ -1491,17 +1388,17 @@ def verify_output_files(job): return status -def verify_extracted_output_files(output, lfns_jobdef, job): +def verify_extracted_output_files(output: list, lfns_jobdef: list, job: Any) -> (bool, int): """ Make sure all output files extracted from the job report are listed. + Grab the number of events if possible. - :param output: list of FileSpecs (list). - :param lfns_jobdef: list of lfns strings from job definition (list). - :param job: job object. - :return: True if successful|False if failed, number of events (Boolean, int) + :param output: list of FileSpecs (list) + :param lfns_jobdef: list of lfns strings from job definition (list) + :param job: job object (Any) + :return: True if successful, False if failed (bool), number of events (int). 
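A condensed sketch of the allowNoOutput decision applied by verify_output_files()/verify_extracted_output_files(): a job-definition LFN missing from the job report either fails a production job or, if listed in allowNoOutput, is dropped from stage-out.

def check_outputs(lfns_jobdef: list, output_jobrep: dict, allownooutput: list, is_analysis: bool) -> (bool, list):
    # return (failed, lfns to drop from stage-out); logging and error codes omitted
    failed, drop_from_stageout = False, []
    for lfn in lfns_jobdef:
        if lfn in output_jobrep:
            continue
        if lfn in allownooutput:
            drop_from_stageout.append(lfn)
        elif not is_analysis:
            failed = True
    return failed, drop_from_stageout


if __name__ == "__main__":
    print(check_outputs(['AOD.1', 'HIST.1'], {'AOD.1': 100}, ['HIST.1'], False))  # (False, ['HIST.1'])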
""" - failed = False nevents = 0 output_jobrep = {} # {lfn: nentries, ..} @@ -1522,98 +1419,74 @@ def verify_extracted_output_files(output, lfns_jobdef, job): for lfn in lfns_jobdef: if lfn not in output_jobrep and lfn not in job.allownooutput: if job.is_analysis(): - logger.warning(( - 'output file %s from job definition is not present ' - 'in job report and is not listed in allowNoOutput'), lfn) + logger.warning(f'output file {lfn} from job definition is not present in job report and ' + f'is not listed in allowNoOutput') else: - logger.warning(( - 'output file %s from job definition is not present ' - 'in job report and is not listed in allowNoOutput - ' - 'job will fail'), lfn) + logger.warning(f'output file {lfn} from job definition is not present in job report and ' + f'is not listed in allowNoOutput - job will fail') job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.MISSINGOUTPUTFILE) failed = True break - if lfn not in output_jobrep and lfn in job.allownooutput: - logger.warning(( - 'output file %s from job definition is not present ' - 'in job report but is listed in allowNoOutput - ' - 'remove from stage-out'), lfn) + logger.warning(f'output file {lfn} from job definition is not present in job report but ' + f'is listed in allowNoOutput - remove from stage-out') remove_from_stageout(lfn, job) else: nentries = output_jobrep[lfn] if nentries == "UNDEFINED": - logger.warning(( - 'encountered file with nentries=UNDEFINED - ' - 'will ignore %s'), lfn) + logger.warning(f'encountered file with nentries=UNDEFINED - will ignore {lfn}') elif nentries is None: - if lfn not in job.allownooutput: - logger.warning(( - 'output file %s is listed in job report, ' - 'but has no events and is not listed in ' - 'allowNoOutput - will ignore'), lfn) + logger.warning(f'output file {lfn} is listed in job report, but has no events and ' + f'is not listed in allowNoOutput - will ignore') else: - logger.warning(( - 'output file %s is listed in job report, ' - 'nentries is None and is listed in allowNoOutput - ' - 'remove from stage-out'), lfn) + logger.warning(f'output file {lfn} is listed in job report, nentries is None and is listed in ' + f'allowNoOutput - remove from stage-out') remove_from_stageout(lfn, job) elif nentries == 0: - if lfn not in job.allownooutput: - logger.warning(( - 'output file %s is listed in job report, ' - 'has zero events and is not listed in ' - 'allowNoOutput - will ignore'), lfn) + logger.warning(f'output file {lfn} is listed in job report, has zero events and ' + f'is not listed in allowNoOutput - will ignore') else: - logger.warning(( - 'output file %s is listed in job report, ' - 'has zero events and is listed in allowNoOutput - ' - 'remove from stage-out'), lfn) + logger.warning(f'output file {lfn} is listed in job report, has zero events and is listed in ' + f'allowNoOutput - remove from stage-out') remove_from_stageout(lfn, job) - elif type(nentries) is int and nentries: - logger.info('output file %s has %d event(s)', lfn, nentries) + elif isinstance(nentries, int) and nentries: + logger.info(f'output file {lfn} has {nentries} event(s)') nevents += nentries else: # should not reach this step - logger.warning(( - 'case not handled for output file %s with %s event(s) ' - '(ignore)'), lfn, str(nentries)) + logger.warning(f'case not handled for output file {lfn} with {nentries} event(s) (ignore)') status = (not failed) + return status, nevents -def remove_from_stageout(lfn, job): +def remove_from_stageout(lfn: str, job: Any): """ - From the given lfn 
from the stage-out list. + Remove the given lfn from the stage-out list. - :param lfn: local file name (string). - :param job: job object - :return: [updated job object] + :param lfn: local file name (str) + :param job: job object (Any). """ - outdata = [] for fspec in job.outdata: if fspec.lfn == lfn: - logger.info('removing %s from stage-out list', lfn) + logger.info(f'removing {lfn} from stage-out list') else: outdata.append(fspec) job.outdata = outdata -def remove_no_output_files(job): +def remove_no_output_files(job: Any): """ - Remove files from output file list if they are listed in - allowNoOutput and do not exist. + Remove files from output file list if they are listed in allowNoOutput and do not exist. - :param job: job object. - :return: + :param job: job object (Any). """ - # first identify the files to keep _outfiles = [] for fspec in job.outdata: @@ -1622,22 +1495,16 @@ def remove_no_output_files(job): if filename in job.allownooutput: if os.path.exists(path): - logger.info(( - "file %s is listed in allowNoOutput but exists " - "(will not be removed from list of files to be " - "staged-out)"), filename) + logger.info(f"file {filename} is listed in allowNoOutput but exists (will not be removed from " + f"list of files to be staged-out)") _outfiles.append(filename) else: - logger.info(( - "file %s is listed in allowNoOutput and does not exist " - "(will be removed from list of files to be staged-out)"), filename) + logger.info(f"file {filename} is listed in allowNoOutput and does not exist (will be removed from list of files to be staged-out)") else: if os.path.exists(path): - logger.info("file %s is not listed in allowNoOutput (will be staged-out)", filename) + logger.info(f"file {filename} is not listed in allowNoOutput (will be staged-out)") else: - logger.warning(( - "file %s is not listed in allowNoOutput and " - "does not exist (job will fail)"), filename) + logger.warning(f"file {filename} is not listed in allowNoOutput and does not exist (job will fail)") _outfiles.append(filename) # now remove the unwanted fspecs @@ -1649,14 +1516,13 @@ def remove_no_output_files(job): job.outdata = outdata -def get_outfiles_records(subfiles): +def get_outfiles_records(subfiles: list) -> dict: """ Extract file info from job report JSON subfiles entry. - :param subfiles: list of subfiles. - :return: file info dictionary with format { 'guid': .., 'size': .., 'nentries': .. (optional)} + :param subfiles: list of subfiles (list) + :return: file info dictionary with format { 'guid': .., 'size': .., 'nentries': .. (optional)} (dict). """ - res = {} for subfile in subfiles: res[subfile['name']] = { @@ -1674,11 +1540,19 @@ def get_outfiles_records(subfiles): class DictQuery(dict): - """ - Helper class for parsing job report. - """ + """Helper class for parsing the job report.""" + + def get(self, path: str, dst_dict: dict, dst_key: str): + """ + Get value from dictionary. - def get(self, path, dst_dict, dst_key): + Updates dst_dict[dst_key] with the value from the dictionary. + + :param path: path to the value (str) + :param dst_dict: destination dictionary (dict) + :param dst_key: destination key (str) + :return: None. + """ keys = path.split("/") if len(keys) == 0: return @@ -1693,13 +1567,15 @@ def get(self, path, dst_dict, dst_key): if last_key in me_: dst_dict[dst_key] = me_[last_key] + return + -def parse_jobreport_data(job_report): # noqa: C901 +def parse_jobreport_data(job_report: dict) -> dict: # noqa: C901 """ Parse a job report and extract relevant fields. 
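A minimal re-implementation of the DictQuery helper above, showing how a '/'-separated path is used to copy a nested job-report value into the work_attributes dictionary. The body shown in the diff is abbreviated, so the traversal details here are an approximation.

class DictQuery(dict):
    """Sketch of the job-report path lookup helper."""

    def get(self, path: str, dst_dict: dict, dst_key: str):
        # walk the '/'-separated path and, if found, store the value under dst_dict[dst_key]
        keys = path.split("/")
        if not keys:
            return
        last_key = keys.pop()
        me_ = self
        for key in keys:
            if not isinstance(me_, dict) or key not in me_:
                return
            me_ = me_[key]
        if isinstance(me_, dict) and last_key in me_:
            dst_dict[dst_key] = me_[last_key]


if __name__ == "__main__":
    report = DictQuery({'resource': {'transform': {'processedEvents': 500}}})
    work_attributes = {}
    report.get("resource/transform/processedEvents", work_attributes, "nEvents")
    print(work_attributes)  # {'nEvents': 500}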
- :param job_report: - :return: + :param job_report: job report dictionary (dict) + :return: work_attributes (dict). """ work_attributes = {} if job_report is None or not any(job_report): @@ -1714,9 +1590,9 @@ def parse_jobreport_data(job_report): # noqa: C901 work_attributes["outputfiles"] = [] if "ATHENA_PROC_NUMBER" in os.environ: - logger.debug("ATHENA_PROC_NUMBER: %s", os.environ["ATHENA_PROC_NUMBER"]) - work_attributes['core_count'] = int(os.environ["ATHENA_PROC_NUMBER"]) - core_count = int(os.environ["ATHENA_PROC_NUMBER"]) + logger.debug(f"ATHENA_PROC_NUMBER: {os.environ['ATHENA_PROC_NUMBER']}") + work_attributes['core_count'] = int(os.environ['ATHENA_PROC_NUMBER']) + core_count = os.environ['ATHENA_PROC_NUMBER'] dictq = DictQuery(job_report) dictq.get("resource/transform/processedEvents", work_attributes, "nEvents") @@ -1751,26 +1627,24 @@ def parse_jobreport_data(job_report): # noqa: C901 work_attributes.update(fin_report) workdir_size = get_disk_usage('.') - work_attributes['jobMetrics'] = 'coreCount=%s nEvents=%s dbTime=%s dbData=%s workDirSize=%s' % \ - (core_count, - work_attributes["nEvents"], - work_attributes["dbTime"], - work_attributes["dbData"], - workdir_size) + work_attributes['jobMetrics'] = f"coreCount={core_count} " \ + f"nEvents={work_attributes['nEvents']} " \ + f"dbTime={work_attributes['dbTime']} " \ + f"dbData={work_attributes['dbData']} " \ + f"workDirSize={workdir_size}" del work_attributes["dbData"] del work_attributes["dbTime"] return work_attributes -def get_executor_dictionary(jobreport_dictionary): +def get_executor_dictionary(jobreport_dictionary: dict) -> dict: """ Extract the 'executor' dictionary from with a job report. - :param jobreport_dictionary: - :return: executor_dictionary + :param jobreport_dictionary: job report dictionary (dict) + :return: executor_dictionary (dict). """ - executor_dictionary = {} if jobreport_dictionary != {}: @@ -1786,20 +1660,20 @@ def get_executor_dictionary(jobreport_dictionary): return executor_dictionary -def get_resimevents(jobreport_dictionary): +def get_resimevents(jobreport_dictionary: dict) -> int or None: """ Extract and add up the resimevents from the job report. + This information is reported with the jobMetrics. - :param jobreport_dictionary: job report dictionary. - :return: resimevents (int or None) + :param jobreport_dictionary: job report dictionary (dict) + :return: resimevents (int or None). """ - resimevents = None executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for fmt in list(executor_dictionary.keys()): # "ReSim", Python 2/3 + for fmt in list(executor_dictionary.keys()): # "ReSim" if 'resimevents' in executor_dictionary[fmt]: try: resimevents = int(executor_dictionary[fmt]['resimevents']) @@ -1811,95 +1685,85 @@ def get_resimevents(jobreport_dictionary): return resimevents -def get_db_info(jobreport_dictionary): +def get_db_info(jobreport_dictionary) -> (int, int): """ Extract and add up the DB info from the job report. + This information is reported with the jobMetrics. Note: this function adds up the different dbData and dbTime's in the different executor steps. In modern job reports this might have been done already by the transform and stored in dbDataTotal and dbTimeTotal. - :param jobreport_dictionary: job report dictionary. - :return: db_time (int), db_data (long) + :param jobreport_dictionary: job report dictionary (dict) + :return: db_time (int), db_data (int). 
""" - db_time = 0 - try: - db_data = long(0) # Python 2 # noqa: F821 - except NameError: - db_data = 0 # Python 3 + db_data = 0 executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 + for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., if 'dbData' in executor_dictionary[fmt]: try: db_data += executor_dictionary[fmt]['dbData'] except Exception: pass else: - logger.warning("format %s has no such key: dbData", fmt) + logger.warning(f"format {fmt} has no such key: dbData") if 'dbTime' in executor_dictionary[fmt]: try: db_time += executor_dictionary[fmt]['dbTime'] except Exception: pass else: - logger.warning("format %s has no such key: dbTime", fmt) + logger.warning(f"format {fmt} has no such key: dbTime") return db_time, db_data -def get_db_info_str(db_time, db_data): +def get_db_info_str(db_time: int, db_data: int) -> (str, str): """ Convert db_time, db_data to strings. + E.g. dbData="105077960", dbTime="251.42". - :param db_time: time (s) - :param db_data: long integer - :return: db_time_s, db_data_s (strings) + :param db_time: time in seconds (int) + :param db_data: long integer (int) + :return: db_time_s (str), db_data_s (str). """ - - try: - zero = long(0) # Python 2 # noqa: F821 - except NameError: - zero = 0 # Python 3 + zero = 0 db_data_s = "" if db_data != zero: - db_data_s = "%s" % (db_data) + db_data_s = f"{db_data}" db_time_s = "" if db_time != 0: - db_time_s = "%.2f" % (db_time) + db_time_s = f"{db_time:.2f}" return db_time_s, db_data_s -def get_cpu_times(jobreport_dictionary): +def get_cpu_times(jobreport_dictionary: dict) -> (str, int, float): """ Extract and add up the total CPU times from the job report. + E.g. ('s', 5790L, 1.0). Note: this function is used with Event Service jobs - :param jobreport_dictionary: - :return: cpu_conversion_unit (unit), total_cpu_time, - conversion_factor (output consistent with set_time_consumed()) + :param jobreport_dictionary: job report dictionary (dict) + :return: cpu_conversion_unit (str), total_cpu_time (int), conversion_factor (output consistent with set_time_consumed()) (float). """ - - try: - total_cpu_time = long(0) # Python 2 # noqa: F821 - except NameError: - total_cpu_time = 0 # Python 3 + total_cpu_time = 0 executor_dictionary = get_executor_dictionary(jobreport_dictionary) if executor_dictionary != {}: - for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., Python 2/3 + for fmt in list(executor_dictionary.keys()): # "RAWtoESD", .., try: total_cpu_time += executor_dictionary[fmt]['cpuTime'] except KeyError: - logger.warning("format %s has no such key: cpuTime", fmt) + logger.warning(f"format {fmt} has no such key: cpuTime") except Exception: pass @@ -1909,27 +1773,26 @@ def get_cpu_times(jobreport_dictionary): return cpu_conversion_unit, total_cpu_time, conversion_factor -def get_exit_info(jobreport_dictionary): +def get_exit_info(jobreport_dictionary: dict) -> (int, str): """ Return the exit code (exitCode) and exit message (exitMsg). + E.g. (0, 'OK'). :param jobreport_dictionary: - :return: exit_code, exit_message + :return: exit_code (int), exit_message (str). """ + return jobreport_dictionary.get('exitCode'), jobreport_dictionary.get('exitMsg') - return jobreport_dictionary['exitCode'], jobreport_dictionary['exitMsg'] - -def cleanup_looping_payload(workdir): +def cleanup_looping_payload(workdir: str): """ Run a special cleanup for looping payloads. + Remove any root and tmp files. 
- :param workdir: working directory (string) - :return: + :param workdir: working directory (str). """ - for (root, _, files) in os.walk(workdir): for filename in files: if 'pool.root' in filename: @@ -1938,24 +1801,23 @@ def cleanup_looping_payload(workdir): remove(path) -def cleanup_payload(workdir, outputfiles=None, removecores=True): +def cleanup_payload(workdir: str, outputfiles: list = None, removecores: bool = True): """ - Cleanup of payload (specifically AthenaMP) sub directories prior to log file creation. + Clean up payload (specifically AthenaMP) sub-directories prior to log file creation. + Also remove core dumps. - :param workdir: working directory (string). - :param outputfiles: list of output files. - :param removecores: remove core files if True (Boolean). - :return: + :param workdir: working directory (str) + :param outputfiles: list of output files (list) + :param removecores: remove core files if True (bool). """ - if outputfiles is None: outputfiles = [] if removecores: remove_core_dumps(workdir) - for ampdir in glob('%s/athenaMP-workers-*' % workdir): + for ampdir in glob(f'{workdir}/athenaMP-workers-*'): for (root, _, files) in os.walk(ampdir): for filename in files: path = os.path.abspath(os.path.join(root, filename)) @@ -1972,14 +1834,12 @@ def cleanup_payload(workdir, outputfiles=None, removecores=True): remove(path) -def get_redundant_path(): +def get_redundant_path() -> str: """ - Return the path to the file containing the redundant files - and directories to be removed prior to log file creation. + Return the path to the file containing the redundant files and directories to be removed prior to log file creation. - :return: file path (string). + :return: file path (str). """ - filename = config.Pilot.redundant # correct /cvmfs if necessary @@ -1989,16 +1849,16 @@ def get_redundant_path(): return filename -def get_redundants(): +def get_redundants() -> list: """ Get list of redundant files and directories (to be removed). + The function will return the content of an external file. It that can't be read, then a list defined in this function will be returned instead. Any updates to the external file must be propagated to this function. - :return: files and directories list + :return: files and directories (list). """ - # try to read the list from the external file filename = get_redundant_path() @@ -2009,9 +1869,7 @@ def get_redundants(): # if dir_list: # return dir_list - logger.debug(( - 'list of redundant files could not be read from external file: %s ' - '(will use internal list)'), filename) + logger.debug(f'list of redundant files could not be read from external file: {filename} (will use internal list)') # else return the following dir_list = [".asetup.save", @@ -2089,16 +1947,14 @@ def get_redundants(): return dir_list -def remove_archives(workdir): +def remove_archives(workdir: str): """ - Explicitly remove any soft linked archives (.a files) since - they will be dereferenced by the tar command + Explicitly remove any soft linked archives (.a files) since they will be dereferenced by the tar command. + (--dereference option). - :param workdir: working directory (string) - :return: + :param workdir: working directory (str). """ - matches = [] for root, _, filenames in os.walk(workdir): for filename in fnmatch.filter(filenames, '*.a'): @@ -2111,14 +1967,12 @@ def remove_archives(workdir): remove(match) -def cleanup_broken_links(workdir): +def cleanup_broken_links(workdir: str): """ Run a second pass to clean up any broken links prior to log file creation. 
- :param workdir: working directory (string) - :return: + :param workdir: working directory (str). """ - broken = [] for root, _, files in os.walk(workdir): for filename in files: @@ -2137,28 +1991,24 @@ def cleanup_broken_links(workdir): remove(brok) -def list_work_dir(workdir): +def list_work_dir(workdir: str): """ Execute ls -lF for the given directory and dump to log. - :param workdir: directory name (string). + :param workdir: directory name (str). """ - - cmd = 'ls -lF %s' % workdir + cmd = f'ls -lF {workdir}' _, stdout, stderr = execute(cmd) - logger.debug('%s:\n' % stdout + stderr) + logger.debug(f'{stdout}:\n' + stderr) -def remove_special_files(workdir, dir_list, outputfiles): +def remove_special_files(workdir: str, dir_list: list): """ Remove list of special files from the workdir. - :param workdir: work directory (string). - :param dir_list: list of special files (list). - :param outputfiles: output files (list). - :return: + :param workdir: work directory (str) + :param dir_list: list of special files (list) """ - # note: these should be partial file/dir names, not containing any wildcards exceptions_list = ["runargs", "runwrapper", "jobReport", "log.", "xrdlog"] @@ -2177,31 +2027,24 @@ def remove_special_files(workdir, dir_list, outputfiles): _files = [os.path.abspath(item) for item in files if item not in exclude] to_delete += _files - exclude_files = [] - for opf in outputfiles: - exclude_files.append(os.path.join(workdir, opf)) - for item in to_delete: - if item not in exclude_files: - if os.path.isfile(item): - remove(item) - else: - remove_dir_tree(item) + if os.path.isfile(item): + remove(item) + else: + remove_dir_tree(item) -def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode=False): +def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: list = [], debugmode: bool = False): """ Remove redundant files and directories prior to creating the log file. Note: in debug mode, any core files should not be removed before creating the log. - :param workdir: working directory (string). - :param outputfiles: list of protected output files (list). - :param errors: list of Pilot assigned error codes (list). - :param debugmode: True if debug mode has been switched on (Boolean). - :return: + :param workdir: working directory (str) + :param outputfiles: list of protected output files (list) + :param errors: list of Pilot assigned error codes (list) + :param debugmode: True if debug mode has been switched on (bool). 
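The broken-link scan in cleanup_broken_links() boils down to the following check (sketch only; the removal step is omitted).

import os


def find_broken_links(workdir: str) -> list:
    # collect broken symbolic links under workdir: the link exists but its target does not
    broken = []
    for root, _, files in os.walk(workdir):
        for filename in files:
            path = os.path.join(root, filename)
            if os.path.islink(path) and not os.path.exists(path):
                broken.append(path)
    return broken


if __name__ == "__main__":
    print(find_broken_links('.'))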
""" - if outputfiles is None: outputfiles = [] @@ -2218,7 +2061,7 @@ def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode= try: cleanup_payload(workdir, outputfiles, removecores=not debugmode) except OSError as exc: - logger.warning("failed to execute cleanup_payload(): %s", exc) + logger.warning(f"failed to execute cleanup_payload(): {exc}") # explicitly remove any soft linked archives (.a files) # since they will be dereferenced by the tar command (--dereference option) @@ -2226,10 +2069,9 @@ def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode= remove_archives(workdir) # remove special files - remove_special_files(workdir, dir_list, outputfiles) + remove_special_files(workdir, dir_list) - # remove container_script.sh if it contains token info - verify_container_script(os.path.join(workdir, config.Container.container_script)) + # verify_container_script(os.path.join(workdir, config.Container.container_script)) # run a second pass to clean up any broken links logger.debug('cleaning up broken links') @@ -2243,7 +2085,7 @@ def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode= islooping = errors.LOOPINGJOB in piloterrors ismemerror = errors.PAYLOADEXCEEDMAXMEM in piloterrors if not islooping and not ismemerror: - logger.debug('removing \'workDir\' from workdir=%s', workdir) + logger.debug(f'removing \'workDir\' from workdir={workdir}') remove_dir_tree(path) # remove additional dirs @@ -2251,23 +2093,22 @@ def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode= for additional in additionals: path = os.path.join(workdir, additional) if os.path.exists(path): - logger.debug('removing \'%s\' from workdir=%s', additional, workdir) + logger.debug(f"removing \'{additional}\' from workdir={workdir}") remove_dir_tree(path) list_work_dir(workdir) -def download_command(process, workdir): +def download_command(process: dict, workdir: str) -> dict: """ Download the pre/postprocess commands if necessary. Process FORMAT: {'command': , 'args': , 'label': } - :param process: pre/postprocess dictionary. - :param workdir: job workdir (string). - :return: updated pre/postprocess dictionary. + :param process: pre/postprocess dictionary (dict) + :param workdir: job workdir (str) + :return: updated pre/postprocess dictionary (dict). """ - cmd = process.get('command', '') # download the command if necessary @@ -2275,7 +2116,7 @@ def download_command(process, workdir): # Try to download the trf (skip when user container is to be used) exitcode, _, cmd = get_analysis_trf(cmd, workdir) if exitcode != 0: - logger.warning('cannot execute command due to previous error: %s', cmd) + logger.warning(f'cannot execute command due to previous error: {cmd}') return {} # update the preprocess command (the URL should be stripped) @@ -2284,14 +2125,13 @@ def download_command(process, workdir): return process -def get_utility_commands(order=None, job=None): +def get_utility_commands(order: int = None, job: Any = None) -> dict or None: """ - Return a dictionary of utility commands and arguments to be executed - in parallel with the payload. This could e.g. be memory and network - monitor commands. A separate function can be used to determine the - corresponding command setups using the utility command name. If the - optional order parameter is set, the function should return the list - of corresponding commands. + Return a dictionary of utility commands and arguments to be executed in parallel with the payload. + + This could e.g. 
be memory and network monitor commands. A separate function can be used to determine the + corresponding command setups using the utility command name. If the optional order parameter is set, the + function should return the list of corresponding commands. For example: @@ -2313,7 +2153,6 @@ def get_utility_commands(order=None, job=None): :param job: optional job object. :return: dictionary of utilities to be executed in parallel with the payload. """ - if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: return get_precopostprocess_command(job.preprocess, job.workdir, 'preprocess') @@ -2350,36 +2189,36 @@ def get_utility_commands(order=None, job=None): return None -def get_precopostprocess_command(process, workdir, label): +def get_precopostprocess_command(process: dict, workdir: str, label: str) -> dict: """ Return the pre/co/post-process command dictionary. Command FORMAT: {'command': , 'args': , 'label': } The returned command has the structure: { 'command': , } - :param process: pre/co/post-process (dictionary). - :param workdir: working directory (string). - :param label: label (string). - :return: command (dictionary). - """ + :param process: pre/co/post-process (dict) + :param workdir: working directory (str) + :param label: label (str) + :return: command (dict). + """ com = {} if process.get('command', ''): com = download_command(process, workdir) com['label'] = label com['ignore_failure'] = False + return com -def get_utility_after_payload_started(): +def get_utility_after_payload_started() -> dict: """ Return the command dictionary for the utility after the payload has started. Command FORMAT: {'command': , 'args': , 'label': } - :return: command (dictionary). + :return: command (dict). """ - com = {} try: cmd = config.Pilot.utility_after_payload_started @@ -2388,69 +2227,66 @@ def get_utility_after_payload_started(): else: if cmd: com = {'command': cmd, 'args': '', 'label': cmd.lower(), 'ignore_failure': True} + return com -def get_xcache_command(catchall, workdir, jobid, label, xcache_function): +def get_xcache_command(catchall: str, workdir: str, jobid: str, label: str, xcache_function: Any) -> dict: """ Return the proper xcache command for either activation or deactivation. Command FORMAT: {'command': , 'args': , 'label': } - :param catchall: queuedata catchall field (string). - :param workdir: job working directory (string). - :param jobid: PanDA job id (string). - :param label: label (string). - :param xcache_function: activation/deactivation function name (function). - :return: command (dictionary). + :param catchall: queuedata catchall field (str) + :param workdir: job working directory (str) + :param jobid: PanDA job id (str) + :param label: label (str) + :param xcache_function: activation/deactivation function name (Any) + :return: command (dict). """ - com = {} if 'pilotXcache' in catchall: com = xcache_function(jobid=jobid, workdir=workdir) com['label'] = label com['ignore_failure'] = True + return com -def post_prestagein_utility_command(**kwargs): +def post_prestagein_utility_command(**kwargs: dict): """ Execute any post pre-stage-in utility commands. - :param kwargs: kwargs (dictionary). - :return: + :param kwargs: kwargs (dict). 
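All utility commands returned by the helpers above share the same plain-dictionary shape; a small illustration (the 'prmon' command and its arguments are placeholders, not what the pilot actually configures).

def utility_command(cmd: str, args: str, label: str, ignore_failure: bool) -> dict:
    # utility commands are passed around as dictionaries with this structure
    return {'command': cmd, 'args': args, 'label': label, 'ignore_failure': ignore_failure}


if __name__ == "__main__":
    com = utility_command('prmon', '--interval 60', 'memory_monitor', True)
    print(com)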
""" - label = kwargs.get('label', 'unknown_label') stdout = kwargs.get('output', None) if stdout: - logger.debug('processing stdout for label=%s', label) + logger.debug(f'processing stdout for label={label}') xcache_proxy(stdout) else: - logger.warning('no output for label=%s', label) + logger.warning(f'no output for label={label}') alrb_xcache_files = os.environ.get('ALRB_XCACHE_FILES', '') if alrb_xcache_files: cmd = 'cat $ALRB_XCACHE_FILES/settings.sh' _, _stdout, _ = execute(cmd) - logger.debug('cmd=%s:\n\n%s\n\n', cmd, _stdout) + logger.debug(f'cmd={cmd}:\n\n{_stdout}\n\n') -def xcache_proxy(output): +def xcache_proxy(output: str): """ Extract env vars from xcache stdout and set them. - :param output: command output (string). - :return: + :param output: command output (str). """ - # loop over each line in the xcache stdout and identify the needed environmental variables for line in output.split('\n'): if 'ALRB_XCACHE_PROXY' in line: suffix = '_REMOTE' if 'REMOTE' in line else '' - name = 'ALRB_XCACHE_PROXY%s' % suffix - pattern = r'\ export\ ALRB_XCACHE_PROXY%s\=\"(.+)\"' % suffix + name = f'ALRB_XCACHE_PROXY{suffix}' + pattern = fr'\ export\ ALRB_XCACHE_PROXY{suffix}\=\"(.+)\"' set_xcache_var(line, name=name, pattern=pattern) elif 'ALRB_XCACHE_MYPROCESS' in line: @@ -2475,94 +2311,91 @@ def xcache_proxy(output): ) -def set_xcache_var(line, name='', pattern=''): +def set_xcache_var(line: str, name: str = '', pattern: str = ''): """ Extract the value of a given environmental variable from a given stdout line. - :param line: line from stdout to be investigated (string). - :param name: name of env var (string). - :param pattern: regex pattern (string). - :return: + :param line: line from stdout to be investigated (str) + :param name: name of env var (str) + :param pattern: regular expression pattern (str). """ - pattern = re.compile(pattern) result = re.findall(pattern, line) if result: os.environ[name] = result[0] -def xcache_activation_command(workdir='', jobid=''): +def xcache_activation_command(workdir: str = '', jobid: str = '') -> dict: """ Return the xcache service activation command. Note: the workdir is not used here, but the function prototype needs it in the called (xcache_activation_command needs it). - :param workdir: unused work directory - do not remove (string). - :param jobid: PanDA job id to guarantee that xcache process is unique (int). - :return: xcache command (string). + :param workdir: unused work directory - do not remove (str) + :param jobid: PanDA job id to guarantee that xcache process is unique (int) + :return: xcache command (str). """ - # a successful startup will set ALRB_XCACHE_PROXY and ALRB_XCACHE_PROXY_REMOTE # so any file access with root://... 
should be replaced with one of # the above (depending on whether you are on the same machine or not) # example: # ${ALRB_XCACHE_PROXY}root://atlasxrootd-kit.gridka.de:1094//pnfs/gridka.de/../DAOD_FTAG4.24348858._000020.pool.root.1 - command = "%s " % get_asetup(asetup=False) + command = f"{get_asetup(asetup=False)} " # add 'xcache list' which will also kill any # orphaned processes lingering in the system command += ( - "lsetup xcache; xcache list; " - "xcache start -d $PWD/%s/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" % jobid) + f"lsetup xcache; xcache list; xcache start -d $PWD/{jobid}/xcache -C centos7 --disklow 4g --diskhigh 5g -b 4" + ) return {'command': command, 'args': ''} -def xcache_deactivation_command(workdir='', jobid=''): +def xcache_deactivation_command(workdir: str = '', jobid: str = '') -> dict: """ Return the xcache service deactivation command. + This service should be stopped after the payload has finished. Copy the messages log before shutting down. Note: the job id is not used here, but the function prototype needs it in the called (xcache_activation_command needs it). - :param workdir: payload work directory (string). - :param jobid: unused job id - do not remove (string). - :return: xcache command (string). + :param workdir: payload work directory (str) + :param jobid: unused job id - do not remove (str) + :return: xcache command (dict). """ - path = os.environ.get('ALRB_XCACHE_LOG', None) if path and os.path.exists(path): - logger.debug('copying xcache messages log file (%s) to work dir (%s)', path, workdir) + logger.debug(f'copying xcache messages log file ({path}) to work dir ({workdir})') dest = os.path.join(workdir, 'xcache-messages.log') try: copy(path, dest) except Exception as exc: - logger.warning('exception caught copying xcache log: %s', exc) + logger.warning(f'exception caught copying xcache log: {exc}') else: if not path: logger.warning('ALRB_XCACHE_LOG is not set') if path and not os.path.exists(path): - logger.warning('path does not exist: %s', path) - command = "%s " % get_asetup(asetup=False) + logger.warning(f'path does not exist: {path}') + command = f"{get_asetup(asetup=False)} " command += "lsetup xcache; xcache kill" # -C centos7 return {'command': command, 'args': '-p $ALRB_XCACHE_MYPROCESS'} -def get_utility_command_setup(name, job, setup=None): +def get_utility_command_setup(name: str, job: Any, setup: str = None) -> str: """ Return the proper setup for the given utility command. + If a payload setup is specified, then the utility command string should be prepended to it. - :param name: name of utility (string). - :param job: job object. - :param setup: optional payload setup string. - :return: utility command setup (string). + :param name: name of utility (str) + :param job: job object (Any) + :param setup: optional payload setup string (str) + :return: utility command setup (str). 
""" - if name == 'MemoryMonitor': # must know if payload is running in a container or not # (enables search for pid in ps output) @@ -2591,11 +2424,11 @@ def get_utility_command_setup(name, job, setup=None): # update the pgrp if the pid changed if pid not in (job.pid, -1): - logger.debug('updating pgrp=%d for pid=%d', job.pgrp, pid) + logger.debug(f'updating pgrp={job.pgrp} for pid={pid}') try: job.pgrp = os.getpgid(pid) except Exception as exc: - logger.warning('os.getpgid(%d) failed with: %s', pid, exc) + logger.warning(f'os.getpgid({pid}) failed with: {exc}') return setup if name == 'NetworkMonitor' and setup: @@ -2610,14 +2443,13 @@ def get_utility_command_setup(name, job, setup=None): return "" -def get_utility_command_execution_order(name): +def get_utility_command_execution_order(name: str) -> int: """ - Should the given utility command be executed before or after the payload? + Determine if the given utility command should be executed before or after the payload. - :param name: utility name (string). - :return: execution order constant. + :param name: utility name (str) + :return: execution order constant (int). """ - # example implementation if name == 'NetworkMonitor': return UTILITY_WITH_PAYLOAD @@ -2625,57 +2457,53 @@ def get_utility_command_execution_order(name): if name == 'MemoryMonitor': return UTILITY_AFTER_PAYLOAD_STARTED - logger.warning('unknown utility name: %s', name) + logger.warning(f'unknown utility name: {name}') + return UTILITY_AFTER_PAYLOAD_STARTED -def post_utility_command_action(name, job): +def post_utility_command_action(name: str, job: Any): """ Perform post action for given utility command. - :param name: name of utility command (string). - :param job: job object. - :return: + :param name: name of utility command (str) + :param job: job object (Any). """ - if name == 'NetworkMonitor': pass elif name == 'MemoryMonitor': post_memory_monitor_action(job) -def get_utility_command_kill_signal(name): +def get_utility_command_kill_signal(name: str) -> int: """ Return the proper kill signal used to stop the utility command. - :param name: name of utility command (string). - :return: kill signal + :param name: name of utility command (str) + :return: kill signal (int). """ - # note that the NetworkMonitor does not require killing (to be confirmed) return SIGUSR1 if name == 'MemoryMonitor' else SIGTERM -def get_utility_command_output_filename(name, selector=None): +def get_utility_command_output_filename(name: str, selector: bool = None) -> str: """ Return the filename to the output of the utility command. - :param name: utility name (string). - :param selector: optional special conditions flag (boolean). - :return: filename (string). + :param name: utility name (str) + :param selector: optional special conditions flag (bool) + :return: filename (str). """ - return get_memory_monitor_summary_filename(selector=selector) if name == 'MemoryMonitor' else "" -def verify_lfn_length(outdata): +def verify_lfn_length(outdata: list) -> (int, str): """ Make sure that the LFNs are all within the allowed length. - :param outdata: FileSpec object. - :return: error code (int), diagnostics (string). + :param outdata: list of FileSpec objects (list) + :return: error code (int), diagnostics (str). 
""" - exitcode = 0 diagnostics = "" max_length = 255 @@ -2683,22 +2511,20 @@ def verify_lfn_length(outdata): # loop over all output files for fspec in outdata: if len(fspec.lfn) > max_length: - diagnostics = "LFN too long (length: %d, must be less than %d characters): %s" % \ - (len(fspec.lfn), max_length, fspec.lfn) + diagnostics = f"LFN too long (length: {len(fspec.lfn)}, " \ + f"must be less than {max_length} characters): {fspec.lfn}" exitcode = errors.LFNTOOLONG break return exitcode, diagnostics -def verify_ncores(corecount): +def verify_ncores(corecount: int): """ - Verify that nCores settings are correct + Verify that nCores settings are correct. :param corecount: number of cores (int). - :return: """ - try: del os.environ['ATHENA_PROC_NUMBER_JOB'] logger.debug("unset existing ATHENA_PROC_NUMBER_JOB") @@ -2716,30 +2542,27 @@ def verify_ncores(corecount): # otherwise use ATHENA_PROC_NUMBER directly; ATHENA_PROC_NUMBER_JOB # will always be the value from the job definition) if athena_proc_number: - logger.info(( - "encountered a set ATHENA_PROC_NUMBER (%d), " - "will not overwrite it"), athena_proc_number) + logger.info(f"encountered a set ATHENA_PROC_NUMBER ({athena_proc_number}), will not overwrite it") logger.info('set ATHENA_CORE_NUMBER to same value as ATHENA_PROC_NUMBER') os.environ['ATHENA_CORE_NUMBER'] = str(athena_proc_number) else: os.environ['ATHENA_PROC_NUMBER_JOB'] = str(corecount) os.environ['ATHENA_CORE_NUMBER'] = str(corecount) - logger.info(( - "set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to %s " - "(ATHENA_PROC_NUMBER will not be overwritten)"), corecount) + logger.info(f"set ATHENA_PROC_NUMBER_JOB and ATHENA_CORE_NUMBER to {corecount} " + f"(ATHENA_PROC_NUMBER will not be overwritten)") -def verify_job(job): +def verify_job(job: Any) -> bool: """ Verify job parameters for specific errors. + Note: in case of problem, the function should set the corresponding pilot error code using: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) - :param job: job object - :return: Boolean. + :param job: job object (Any) + :return: True if verified, False otherwise (bool). """ - status = False # are LFNs of correct lengths? @@ -2757,54 +2580,49 @@ def verify_job(job): return status -def update_stagein(job): +def update_stagein(job: Any): """ Skip DBRelease files during stage-in. - :param job: job object. - :return: + :param job: job object (Any). """ - for fspec in job.indata: if 'DBRelease' in fspec.lfn: fspec.status = 'no_transfer' -def get_metadata(workdir): +def get_metadata(workdir: str) -> dict or None: """ Return the metadata from file. - :param workdir: work directory (string) - :return: + :param workdir: work directory (str) + :return: metadata (dict). """ - path = os.path.join(workdir, config.Payload.jobreport) metadata = read_file(path) if os.path.exists(path) else None - logger.debug('metadata=%s', str(metadata)) + logger.debug(f'metadata={metadata}') return metadata -def should_update_logstash(frequency=10): +def should_update_logstash(frequency: int = 10) -> bool: """ - Should logstash be updated with prmon dictionary? + Determine if logstash should be updated with prmon dictionary. - :param frequency: - :return: return True once per 'frequency' times. + :param frequency: update frequency (int) + :return: return True once per 'frequency' times (bool). """ return randint(0, frequency - 1) == 0 -def update_server(job): +def update_server(job: Any) -> None: """ Perform any user specific server actions. E.g. 
this can be used to send special information to a logstash. - :param job: job object. - :return: + :param job: job object (Any). """ - # attempt to read memory_monitor_output.txt and convert it to json if not should_update_logstash(): logger.debug('no need to update logstash for this job') @@ -2812,7 +2630,7 @@ def update_server(job): path = os.path.join(job.workdir, get_memory_monitor_output_filename()) if not os.path.exists(path): - logger.warning('path does not exist: %s', path) + logger.warning(f'path does not exist: {path}') return # convert memory monitor text output to json and return the selection @@ -2823,58 +2641,42 @@ def update_server(job): # update the path and tell curl to send it new_path = update_extension(path=path, extension='json') - #out = read_json(new_path) - #logger.debug('prmon json=\n%s' % out) - # logger.debug('final logstash prmon dictionary: %s' % str(metadata_dictionary)) + # out = read_json(new_path) + # logger.debug(f'prmon json=\n{out}') + # logger.debug(f'final logstash prmon dictionary: {metadata_dictionary}') url = 'https://pilot.atlas-ml.org' # 'http://collector.atlas-ml.org:80' - - # cmd = ( - # "curl --connect-timeout 20 --max-time 120 " - # "-H \"Content-Type: application/json\" -X POST -d \'%s\' %s" % \ - # (str(metadata_dictionary).replace("'", '"'), url) - #) - - # curl --connect-timeout 20 --max-time 120 -H - # "Content-Type: application/json" -X POST --upload-file test.json - # https://pilot.atlas-ml.org cmd = ( - "curl --connect-timeout 20 --max-time 120 " - "-H \"Content-Type: application/json\" " - "-X POST " - "--upload-file %s %s" % (new_path, url) + f"curl --connect-timeout 20 --max-time 120 -H \"Content-Type: application/json\" -X POST " + f"--upload-file {new_path} {url}" ) - #cmd = "curl --connect-timeout 20 --max-time 120 -F - # 'data=@%s' %s" % (new_path, url) # send metadata to logstash try: _, stdout, stderr = execute(cmd, usecontainer=False) except Exception as exc: - logger.warning('exception caught: %s', exc) + logger.warning(f'exception caught: {exc}') else: logger.debug('sent prmon JSON dictionary to logstash server') - logger.debug('stdout: %s', stdout) - logger.debug('stderr: %s', stderr) + logger.debug(f'stdout: {stdout}') + logger.debug(f'stderr: {stderr}') else: msg = 'no prmon json available - cannot send anything to logstash server' logger.warning(msg) + return + -def preprocess_debug_command(job): +def preprocess_debug_command(job: Any): """ Pre-process the debug command in debug mode. - :param job: Job object. - :return: + :param job: Job object (Any). """ - # Should the pilot do the setup or does jobPars already contain the information? preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) # get the general setup command and then verify it if required resource_name = get_resource_name() # 'grid' if no hpc_resource is set - # Python 3, level: -1 -> 0 - modname = 'pilot.user.atlas.resource.%s' % resource_name - resource = __import__(modname, globals(), locals(), [resource_name], 0) + resource = __import__(f'pilot.user.atlas.resource.{resource_name}', globals(), locals(), [resource_name], 0) cmd = resource.get_setup_command(job, preparesetup) if not cmd.endswith(';'): @@ -2883,8 +2685,10 @@ def preprocess_debug_command(job): job.debug_command = cmd + job.debug_command -def process_debug_command(debug_command, pandaid): +def process_debug_command(debug_command: str, pandaid: str) -> str: """ + Process the debug command in debug mode. 
+ In debug mode, the server can send a special debug command to the piloti via the updateJob backchannel. This function can be used to process that command, i.e. to identify a proper pid to debug (which is unknown @@ -2895,16 +2699,13 @@ def process_debug_command(debug_command, pandaid): (hardcoded) process will be that of athena.py. The pilot will find the corresponding pid. - :param debug_command: debug command (string). - :param pandaid: PanDA id (string). - :return: updated debug command (string). + :param debug_command: debug command (str) + :param pandaid: PanDA id (str) + :return: updated debug command (str). """ - if '--pid %' not in debug_command: return debug_command - pandaid_pid = None - # replace the % with the pid for athena.py # note: if athena.py is not yet running, the --pid % will remain. # Otherwise the % will be replaced by the pid first find the pid @@ -2912,7 +2713,6 @@ def process_debug_command(debug_command, pandaid): cmd = 'ps axo pid,ppid,pgid,args' _, stdout, _ = execute(cmd) if stdout: - #logger.debug('ps=\n\n%s\n' % stdout) # convert the ps output to a dictionary dictionary = convert_ps_to_dict(stdout) @@ -2932,16 +2732,14 @@ def process_debug_command(debug_command, pandaid): try: child = is_child(pid, pandaid_pid, trimmed_dictionary) except RuntimeError as rte: - logger.warning(( - 'too many recursions: %s ' - '(cannot identify athena process)'), rte) + logger.warning(f'too many recursions: {rte} (cannot identify athena process)') else: if child: - logger.info('pid=%d is a child process of the trf of this job', pid) - debug_command = debug_command.replace('--pid %', '--pid %d' % pid) - logger.info('updated debug command: %s', debug_command) + logger.info(f'pid={pid} is a child process of the trf of this job') + debug_command = debug_command.replace('--pid %', f'--pid {pid}') + logger.info(f'updated debug command: {debug_command}') break - logger.info('pid=%d is not a child process of the trf of this job', pid) + logger.info(f'pid={pid} is not a child process of the trf of this job') if not pids or '--pid %' in debug_command: logger.debug('athena is not yet running (no corresponding pid)') @@ -2953,23 +2751,23 @@ def process_debug_command(debug_command, pandaid): return debug_command -def allow_timefloor(submitmode): +def allow_timefloor(submitmode: str) -> bool: """ - Should the timefloor mechanism (multi-jobs) be allowed for the given submit mode? + Decide if the timefloor mechanism (for multi-jobs) should be allowed for the given submit mode. - :param submitmode: submit mode (string). + :param submitmode: submit mode (str) + :return: always True for ATLAS (bool). """ - return True -def get_pilot_id(jobid): +def get_pilot_id(jobid: int) -> str: """ Get the pilot id from the environment variable GTAG. + Update if necessary (not for ATLAS since we want the same pilot id for all multi-jobs). - :param jobid: PanDA job id - UNUSED (int). - :return: pilot id (string). + :param jobid: PanDA job id - UNUSED (int) + :return: pilot id (str). 
""" - return os.environ.get("GTAG", "unknown") diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 01503f5c..e9758055 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -25,7 +25,7 @@ import pipes import re import logging -from typing import Any +from typing import Any, Callable # for user container test: import urllib @@ -35,6 +35,7 @@ from pilot.user.atlas.proxy import get_and_verify_proxy, get_voms_role from pilot.info import InfoService, infosys from pilot.util.config import config +from pilot.util.container import obscure_token from pilot.util.filehandling import ( grep, remove, @@ -45,14 +46,13 @@ errors = ErrorCodes() -def do_use_container(**kwargs): +def do_use_container(**kwargs: Any) -> bool: """ Decide whether to use a container or not. - :param kwargs: dictionary of key-word arguments. - :return: True if function has decided that a container should be used, False otherwise (boolean). + :param kwargs: dictionary of key-word arguments (Any) + :return: True if function has decided that a container should be used, False otherwise (bool). """ - # to force no container use: return False use_container = False @@ -71,7 +71,7 @@ def do_use_container(**kwargs): container_name = queuedata.container_type.get("pilot") if container_name: use_container = True - logger.debug('container_name == \'%s\' -> use_container = True', container_name) + logger.debug(f"container_name == \'{container_name}\' -> use_container = True") else: logger.debug('else -> use_container = False') elif copytool: @@ -84,16 +84,15 @@ def do_use_container(**kwargs): return use_container -def wrapper(executable, **kwargs): +def wrapper(executable: str, **kwargs: Any) -> Callable[..., Any]: """ Wrapper function for any container specific usage. This function will be called by pilot.util.container.execute() and prepends the executable with a container command. - :param executable: command to be executed (string). - :param kwargs: dictionary of key-word arguments. - :return: executable wrapped with container command (string). + :param executable: command to be executed (str) + :param kwargs: dictionary of key-word arguments (Any) + :return: executable wrapped with container command (Callable). """ - workdir = kwargs.get('workdir', '.') pilot_home = os.environ.get('PILOT_HOME', '') job = kwargs.get('job', None) @@ -109,38 +108,36 @@ def wrapper(executable, **kwargs): return fctn(executable, workdir, job=job) -def extract_platform_and_os(platform): +def extract_platform_and_os(platform: str) -> str: """ - Extract the platform and OS substring from platform + Extract the platform and OS substring from platform. - :param platform (string): E.g. "x86_64-slc6-gcc48-opt" - :return: extracted platform specifics (string). E.g. "x86_64-slc6". In case of failure, return the full platform + :param platform: platform info, e.g. "x86_64-slc6-gcc48-opt" (str) + :return: extracted platform specifics, e.g. "x86_64-slc6". In case of failure, return the full platform (str). 
""" - pattern = r"([A-Za-z0-9_-]+)-.+-.+" found = re.findall(re.compile(pattern), platform) if found: ret = found[0] else: - logger.warning("could not extract architecture and OS substring using pattern=%s from platform=%s" - "(will use %s for image name)", pattern, platform, platform) + logger.warning(f"could not extract architecture and OS substring using pattern={pattern} from " + f"platform={platform} (will use {platform} for image name)") ret = platform return ret -def get_grid_image(platform): +def get_grid_image(platform: str) -> str: """ - Return the full path to the singularity/apptainer grid image + Return the full path to the singularity/apptainer grid image. - :param platform: E.g. "x86_64-slc6" (string). - :return: full path to grid image (string). + :param platform: E.g. "x86_64-slc6" (str) + :return: full path to grid image (str). """ - if not platform or platform == "": platform = "x86_64-slc6" - logger.warning("using default platform=%s (cmtconfig not set)", platform) + logger.warning(f"using default platform={platform} (cmtconfig not set)") arch_and_os = extract_platform_and_os(platform) image = arch_and_os + ".img" @@ -151,10 +148,10 @@ def get_grid_image(platform): path = os.path.join(_path, image) if not os.path.exists(path): image = 'x86_64-centos7.img' - logger.warning('path does not exist: %s (trying with image %s instead)', path, image) + logger.warning(f'path does not exist: {path} (trying with image {image} instead)') path = os.path.join(_path, image) if not os.path.exists(path): - logger.warning('path does not exist either: %s', path) + logger.warning(f'path does not exist either: {path}') path = "" return path @@ -182,7 +179,7 @@ def get_middleware_type(): if middleware == _split[0]: middleware_type = _split[1] except IndexError as exc: - logger.warning("failed to parse the container name: %s, %s", container_type, exc) + logger.warning(f"failed to parse the container name: {container_type}, {exc}") else: # logger.warning("container middleware type not specified in queuedata") # no middleware type was specified, assume that middleware is present on worker node @@ -217,7 +214,7 @@ def extract_atlas_setup(asetup, swrelease): cleaned_atlas_setup = asetup.replace(atlas_setup, '').replace(';;', ';') atlas_setup = atlas_setup.replace('source ', '') except AttributeError as exc: - logger.debug('exception caught while extracting asetup command: %s', exc) + logger.debug(f'exception caught while extracting asetup command: {exc}') atlas_setup = '' cleaned_atlas_setup = '' @@ -251,9 +248,9 @@ def extract_full_atlas_setup(cmd, atlas_setup): updated_cmds.append(subcmd) updated_cmd = ';'.join(updated_cmds) except AttributeError as exc: - logger.warning('exception caught while extracting full atlas setup: %s', exc) + logger.warning(f'exception caught while extracting full atlas setup: {exc}') updated_cmd = cmd - logger.debug('updated payload setup command: %s', updated_cmd) + logger.debug(f'updated payload setup command: {updated_cmd}') return extracted_asetup, updated_cmd @@ -273,13 +270,13 @@ def update_alrb_setup(cmd, use_release_setup): _cmd = cmd.split(';') for subcmd in _cmd: if subcmd.startswith('source ${ATLAS_LOCAL_ROOT_BASE}') and use_release_setup: - updated_cmds.append('export ALRB_CONT_SETUPFILE="/srv/%s"' % config.Container.release_setup) + updated_cmds.append(f'export ALRB_CONT_SETUPFILE="/srv/{config.Container.release_setup}"') updated_cmds.append(subcmd) updated_cmd = ';'.join(updated_cmds) except AttributeError as exc: - logger.warning('exception caught while 
extracting full atlas setup: %s', exc) + logger.warning(f'exception caught while extracting full atlas setup: {exc}') updated_cmd = cmd - logger.debug('updated ALRB command: %s', updated_cmd) + logger.debug(f'updated ALRB command: {updated_cmd}') return updated_cmd @@ -332,13 +329,13 @@ def set_platform(job, alrb_setup): """ if job.alrbuserplatform: - alrb_setup += 'export thePlatform=\"%s\";' % job.alrbuserplatform + alrb_setup += f'export thePlatform="{job.alrbuserplatform}";' elif job.preprocess and job.containeroptions: - alrb_setup += 'export thePlatform=\"%s\";' % job.containeroptions.get('containerImage') + alrb_setup += f"export thePlatform=\"{job.containeroptions.get('containerImage')}\";" elif job.imagename: - alrb_setup += 'export thePlatform=\"%s\";' % job.imagename + alrb_setup += f'export thePlatform="{job.imagename}";' elif job.platform: - alrb_setup += 'export thePlatform=\"%s\";' % job.platform + alrb_setup += f'export thePlatform="{job.platform}";' return alrb_setup @@ -364,7 +361,7 @@ def get_container_options(container_options): if '--containall' in container_options: container_options = container_options.replace('--containall', '') if container_options: - opts += '-e \"%s\"' % container_options + opts += f'-e "{container_options}"' else: # consider using options "-c -i -p" instead of "-C". The difference is that the latter blocks all environment # variables by default and the former does not @@ -452,31 +449,20 @@ def alrb_wrapper(cmd: str, workdir: str, job: Any = None) -> str: release_setup, cmd = create_release_setup(cmd, atlas_setup, full_atlas_setup, job.swrelease, job.workdir, queuedata.is_cvmfs) - # prepend the docker login if necessary - # does the pandasecrets dictionary contain any docker login info? - pandasecrets = str(job.pandasecrets) - if pandasecrets and "token" in pandasecrets and \ - has_docker_pattern(pandasecrets, pattern=r'docker://[^/]+/'): - # if so, add it do the container script - logger.info('adding sensitive docker login info') - cmd = add_docker_login(cmd, job.pandasecrets) - # correct full payload command in case preprocess command are used (ie replace trf with setupATLAS -c ..) 
if job.preprocess and job.containeroptions: cmd = replace_last_command(cmd, job.containeroptions.get('containerExec')) # write the full payload command to a script file container_script = config.Container.container_script - _cmd = obscure_token(cmd) # obscure any token if present - if _cmd: - logger.info(f'command to be written to container script file:\n\n{container_script}:\n\n{_cmd}\n') + if cmd: + logger.info(f'command to be written to container script file:\n\n{container_script}:\n\n{cmd}\n') else: logger.warning('will not show container script file since the user token could not be obscured') try: write_file(os.path.join(job.workdir, container_script), cmd, mute=False) - os.chmod(os.path.join(job.workdir, container_script), 0o755) # Python 2/3 - # except (FileHandlingFailure, FileNotFoundError) as exc: # Python 3 - except (FileHandlingFailure, OSError) as exc: # Python 2/3 + os.chmod(os.path.join(job.workdir, container_script), 0o755) + except (FileHandlingFailure, OSError) as exc: logger.warning(f'exception caught: {exc}') return "" @@ -490,28 +476,20 @@ def alrb_wrapper(cmd: str, workdir: str, job: Any = None) -> str: execargs = job.containeroptions.get('execArgs', None) if execargs: cmd += ' ' + execargs - logger.debug(f'\n\nfinal command:\n\n{cmd}\n') - else: - logger.warning('container name not defined in CRIC') - - return cmd - - -def obscure_token(cmd: str) -> str: - """ - Obscure any user token from the payload command. - :param cmd: payload command (str) - :return: updated command (str). - """ + # prepend the docker login if necessary + # does the pandasecrets dictionary contain any docker login info? + pandasecrets = str(job.pandasecrets) + if pandasecrets and "token" in pandasecrets and \ + has_docker_pattern(pandasecrets, pattern=r'docker://[^/]+/'): + # if so, add it to the container script + logger.info('adding sensitive docker login info') + cmd = add_docker_login(cmd, job.pandasecrets) - try: - match = re.search(r'-p (\S+);', cmd) - if match: - cmd = cmd.replace(match.group(1), '********') - except (re.error, AttributeError, IndexError): - logger.warning('an exception was thrown while trying to obscure the user token') - cmd = '' + _cmd = obscure_token(cmd) # obscure any token if present + logger.debug(f'\n\nfinal command:\n\n{_cmd}\n') + else: + logger.warning('container name not defined in CRIC') return cmd @@ -544,7 +522,8 @@ def add_docker_login(cmd: str, pandasecrets: dict) -> dict: try: match = re.search(pattern, registry_path) if match: - cmd = f'docker login {match.group(0)} -u {username} -p {token}; ' + cmd + # cmd = f'docker login {match.group(0)} -u {username} -p {token}; ' + cmd + cmd = f'apptainer remote login -u {username} -p {token} {match.group(0)}; ' + cmd else: logger.warning(f'failed to extract registry from {registry_path}') except re.error as regex_error: @@ -587,11 +566,11 @@ def add_asetup(job, alrb_setup, is_cvmfs, release_setup, container_script, conta job.jobparams, container_path = remove_container_string(job.jobparams) if job.alrbuserplatform: if not is_cvmfs: - alrb_setup += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % job.alrbuserplatform + alrb_setup += f'source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh -c {job.alrbuserplatform}' elif container_path != "": - alrb_setup += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % container_path + alrb_setup += f'source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh -c {container_path}' else: - logger.warning('failed to extract container path from
%s', job.jobparams) + logger.warning(f'failed to extract container path from {job.jobparams}') alrb_setup = "" if alrb_setup and not is_cvmfs: alrb_setup += ' -d' @@ -603,7 +582,7 @@ def add_asetup(job, alrb_setup, is_cvmfs, release_setup, container_script, conta alrb_setup += ' -d' # update the ALRB setup command - alrb_setup += ' -s %s' % release_setup + alrb_setup += f' -s {release_setup}' alrb_setup += ' -r /srv/' + container_script alrb_setup = alrb_setup.replace(' ', ' ').replace(';;', ';') @@ -614,7 +593,7 @@ def add_asetup(job, alrb_setup, is_cvmfs, release_setup, container_script, conta # correct full payload command in case preprocess command are used (ie replace trf with setupATLAS -c ..) #if job.preprocess and job.containeroptions: - # logger.debug('will update cmd=%s', cmd) + # logger.debug(f'will update cmd={cmd}') # cmd = replace_last_command(cmd, 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c $thePlatform') # logger.debug('updated cmd with containerImage') @@ -676,8 +655,8 @@ def create_release_setup(cmd, atlas_setup, full_atlas_setup, release, workdir, i release_setup_name = '/srv/my_release_setup.sh' # extracted_asetup should be written to 'my_release_setup.sh' and cmd to 'container_script.sh' - content = 'echo \"INFO: sourcing %s inside the container. ' \ - 'This should not run if it is a ATLAS standalone container\"' % release_setup_name + content = f'echo \"INFO: sourcing {release_setup_name} inside the container. ' \ + f'This should not run if it is a ATLAS standalone container\"' if is_cvmfs and release and release != 'NULL': content, cmd = extract_full_atlas_setup(cmd, atlas_setup) if not content: @@ -693,11 +672,11 @@ def create_release_setup(cmd, atlas_setup, full_atlas_setup, release, workdir, i content += '\nfi' content += '\nreturn $retCode' - logger.debug('command to be written to release setup file:\n\n%s:\n\n%s\n', release_setup_name, content) + logger.debug(f'command to be written to release setup file:\n\n{release_setup_name}:\n\n{content}\n') try: write_file(os.path.join(workdir, os.path.basename(release_setup_name)), content, mute=False) except FileHandlingFailure as exc: - logger.warning('exception caught: %s', exc) + logger.warning(f'exception caught: {exc}') return release_setup_name, cmd.replace(';;', ';') @@ -746,7 +725,7 @@ def container_wrapper(cmd, workdir, job=None): queuedata = infoservice.queuedata container_name = queuedata.container_type.get("pilot") # resolve container name for user=pilot - logger.debug("resolved container_name from queuedata.container_type: %s", container_name) + logger.debug(f"resolved container_name from queuedata.container_type: {container_name}") if container_name == 'singularity' or container_name == 'apptainer': logger.info("singularity/apptainer has been requested") @@ -758,7 +737,7 @@ def container_wrapper(cmd, workdir, job=None): else: options = "-B " options += "/cvmfs,${workdir},/home" - logger.debug("using options: %s", options) + logger.debug(f"using options: {options}") # Get the image path if job: @@ -772,8 +751,6 @@ def container_wrapper(cmd, workdir, job=None): quote = pipes.quote(f'cd $workdir;pwd;{cmd}') cmd = f"export workdir={workdir}; {container_name} --verbose exec {options} {image_path} " \ f"/bin/bash -c {quote}" - #cmd = "export workdir=" + workdir + "; singularity --verbose exec " + options + " " + image_path + \ - # " /bin/bash -c " + pipes.quote("cd $workdir;pwd;%s" % cmd) # for testing user containers # singularity_options = "-B $PWD:/data --pwd / " @@ -782,20 +759,20 @@ def 
container_wrapper(cmd, workdir, job=None): else: logger.warning("singularity/apptainer options found but image does not exist") - logger.info("updated command: %s", cmd) + logger.info(f"updated command: {cmd}") return cmd -def create_root_container_command(workdir, cmd): +def create_root_container_command(workdir: str, cmd: str) -> str: """ + Create the container command for root. - :param workdir: - :param cmd: - :return: + :param workdir: workdir (str) + :param cmd: command to be containerised (str) + :return: container command to be executed (str). """ - - command = 'cd %s;' % workdir + command = f'cd {workdir};' content = get_root_container_script(cmd) script_name = 'open_file.sh' @@ -808,14 +785,14 @@ def create_root_container_command(workdir, cmd): # generate the final container command x509 = os.environ.get('X509_UNIFIED_DISPATCH', os.environ.get('X509_USER_PROXY', '')) if x509: - command += 'export X509_USER_PROXY=%s;' % x509 - command += 'export ALRB_CONT_RUNPAYLOAD=\"source /srv/%s\";' % script_name + command += f'export X509_USER_PROXY={x509};' + command += f'export ALRB_CONT_RUNPAYLOAD="source /srv/{script_name}";' _asetup = get_asetup(alrb=True) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; _asetup = fix_asetup(_asetup) command += _asetup command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c CentOS7' - logger.debug('container command: %s', command) + logger.debug(f'container command: {command}') return command @@ -856,7 +833,7 @@ def create_middleware_container_command(job, cmd, label='stagein', proxy=True): :return: container command to be executed (string). """ - command = 'cd %s;' % job.workdir + command = f'cd {job.workdir};' # add bits and pieces for the containerisation middleware_container = get_middleware_container(label=label) @@ -873,12 +850,11 @@ def create_middleware_container_command(job, cmd, label='stagein', proxy=True): # for setup container container_script_name = 'container_script.sh' try: - logger.debug('command to be written to container setup file \n\n%s:\n\n%s\n', script_name, content) + logger.debug(f'command to be written to container setup file \n\n{script_name}:\n\n{content}\n') status = write_file(os.path.join(job.workdir, script_name), content) if status: content = 'echo \"Done\"' - logger.debug('command to be written to container command file \n\n%s:\n\n%s\n', container_script_name, - content) + logger.debug(f'command to be written to container command file \n\n{container_script_name}:\n\n{content}\n') status = write_file(os.path.join(job.workdir, container_script_name), content) except PilotException as exc: raise exc @@ -888,55 +864,52 @@ def create_middleware_container_command(job, cmd, label='stagein', proxy=True): if proxy: x509 = os.environ.get('X509_USER_PROXY', '') if x509: - command += 'export X509_USER_PROXY=%s;' % x509 + command += f'export X509_USER_PROXY={x509};' if not label == 'setup': # only for stage-in/out; for setup verification, use -s .. -r .. 
below - command += 'export ALRB_CONT_RUNPAYLOAD=\"source /srv/%s\";' % script_name + command += f'export ALRB_CONT_RUNPAYLOAD="source /srv/{script_name}";' if 'ALRB_CONT_UNPACKEDDIR' in os.environ: - command += 'export ALRB_CONT_UNPACKEDDIR=%s;' % os.environ.get('ALRB_CONT_UNPACKEDDIR') + command += f"export ALRB_CONT_UNPACKEDDIR={os.environ.get('ALRB_CONT_UNPACKEDDIR')};" command += fix_asetup(get_asetup(alrb=True)) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; if label == 'setup': # set the platform info - command += 'export thePlatform=\"%s\";' % job.platform - command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % middleware_container + command += f'export thePlatform="{job.platform}";' + command += f'source ${{ATLAS_LOCAL_ROOT_BASE}}/user/atlasLocalSetup.sh -c {middleware_container}' if label == 'setup': command += f' -s /srv/{script_name} -r /srv/{container_script_name}' else: command += ' ' + get_container_options(job.infosys.queuedata.container_options) command = command.replace(' ', ' ') - logger.debug('container command: %s', command) + logger.debug(f'container command: {command}') return command -def get_root_container_script(cmd): +def get_root_container_script(cmd: str) -> str: """ Return the content of the root container script. - :param cmd: root command (string). - :return: script content (string). + :param cmd: root command (str) + :return: script content (str). """ - - # content = 'lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\'\npython %s\nexit $?' % cmd - # content = f'date\nlsetup \'root pilot\'\ndate\npython {cmd}\nexit $?' - content = f'date\nlsetup \'root pilot\'\ndate\nstdbuf -oL bash -c \"python {cmd}\"\nexit $?' + content = f'date\nlsetup \'root pilot-default\'\ndate\nstdbuf -oL bash -c \"python3 {cmd}\"\nexit $?' logger.debug(f'root setup script content:\n\n{content}\n\n') return content -def get_middleware_container_script(middleware_container, cmd, asetup=False, label=''): +def get_middleware_container_script(middleware_container: str, cmd: str, asetup: bool = False, label: str = '') -> str: """ Return the content of the middleware container script. + If asetup is True, atlasLocalSetup will be added to the command. - :param middleware_container: container image (string). - :param cmd: isolated stage-in/out command (string). - :param asetup: optional True/False (boolean). - :return: script content (string). + :param middleware_container: container image (str) + :param cmd: isolated stage-in/out command (str) + :param asetup: optional True/False (bool) + :return: script content (str). 
""" - - sitename = 'export PILOT_RUCIO_SITENAME=%s; ' % os.environ.get('PILOT_RUCIO_SITENAME') + sitename = f"export PILOT_RUCIO_SITENAME={os.environ.get('PILOT_RUCIO_SITENAME')}; " if label == 'setup': # source $AtlasSetup/scripts/asetup.sh AtlasOffline,21.0.16,notest --platform x86_64-slc6-gcc49-opt --makeflags='$MAKEFLAGS' content = cmd[cmd.find('source $AtlasSetup'):] @@ -945,7 +918,7 @@ def get_middleware_container_script(middleware_container, cmd, asetup=False, lab content += f'export ATLAS_LOCAL_ROOT_BASE={get_file_system_root_path()}/atlas.cern.ch/repo/ATLASLocalRootBase; ' content += "alias setupATLAS=\'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh\'; " content += "setupATLAS -3; " - content = 'lsetup \"python pilot-default\";python3 %s ' % cmd # only works with python 3 + content = f'lsetup "python pilot-default";python3 {cmd} ' else: content = 'export ALRB_LOCAL_PY3=YES; ' if asetup: # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/..;source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh --quiet; @@ -954,13 +927,13 @@ def get_middleware_container_script(middleware_container, cmd, asetup=False, lab content += _asetup if label == 'stagein' or label == 'stageout': content += sitename + 'lsetup rucio davix xrootd; ' - content += 'python3 %s ' % cmd + content += f'python3 {cmd} ' else: content += cmd if not asetup: content += '\nexit $?' - logger.debug('middleware container content:\n%s', content) + logger.debug(f'middleware container content:\n{content}') return content @@ -983,9 +956,9 @@ def get_middleware_container(label=None): else: path = config.Container.middleware_container if path.startswith('/') and not os.path.exists(path): - logger.warning('requested middleware container path does not exist: %s (switching to default value)', path) + logger.warning(f'requested middleware container path does not exist: {path} (switching to default value)') path = 'CentOS7' - logger.info('using image: %s for middleware container', path) + logger.info(f'using image: {path} for middleware container') return path @@ -1016,13 +989,16 @@ def get_docker_pattern() -> str: """ Return the docker login URL pattern for secret verification. - Example: docker login -u -p + Examples: + docker login -u -p + apptainer remote login -u -p :return: pattern (raw string). """ return ( - fr"docker\ login\ {get_url_pattern()}\ \-u\ \S+\ \-p\ \S+;" + # fr"docker\ login\ {get_url_pattern()}\ \-u\ \S+\ \-p\ \S+;" + fr"apptainer\ remote\ login\ \-u\ \S+\ \-p\ \S+\ {get_url_pattern()};" ) diff --git a/pilot/user/atlas/copytool_definitions.py b/pilot/user/atlas/copytool_definitions.py index 8aea8411..5b908cc3 100644 --- a/pilot/user/atlas/copytool_definitions.py +++ b/pilot/user/atlas/copytool_definitions.py @@ -46,7 +46,7 @@ def get_path(scope, lfn): :return: partial rucio path (string). 
""" - s = '%s:%s' % (scope, lfn) + s = f'{scope}:{lfn}' hash_hex = md5(s.encode('utf-8')).hexdigest() paths = scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] paths = [_f for _f in paths if _f] # remove empty parts to avoid double /-chars diff --git a/pilot/user/atlas/cpu.py b/pilot/user/atlas/cpu.py index df4b07b4..ac2e8c7f 100644 --- a/pilot/user/atlas/cpu.py +++ b/pilot/user/atlas/cpu.py @@ -47,7 +47,8 @@ def get_core_count(job): try: job.corecount = int(os.environ.get('ATHENA_PROC_NUMBER')) except (ValueError, TypeError) as exc: - logger.warning("ATHENA_PROC_NUMBER is not properly set: %s (will use existing job.corecount value)", exc) + logger.warning(f"ATHENA_PROC_NUMBER is not properly set: {exc} " + f"(will use existing job.corecount value)") else: try: job.corecount = int(os.environ.get('ATHENA_PROC_NUMBER')) @@ -84,21 +85,6 @@ def set_core_counts(**kwargs): # something like this could be used if prmon also gave info about ncores # (change nprocs -> ncores and add ncores to list in utilities module, get_average_summary_dictionary_prmon()) - #summary_dictionary = get_memory_values(job.workdir, name=job.memorymonitor) - #if summary_dictionary: - # if 'nprocs' in summary_dictionary["Other"]: - # try: - # job.actualcorecount = int(summary_dictionary["Other"]["nprocs"]) - # except Exception as exc: - # logger.warning('exception caught: %s', exc) - # else: - # job.corecounts = add_core_count(job.actualcorecount) - # logger.debug('current core counts list: %s', str(job.corecounts)) - # else: - # logger.debug('summary_dictionary[Other]=%s', summary_dictionary["Other"]) - #else: - # logger.debug('no summary_dictionary') - job = kwargs.get('job', None) walltime = kwargs.get('walltime', None) @@ -129,26 +115,3 @@ def set_core_counts(**kwargs): logger.debug('no summary dictionary') else: logger.debug(f'failed to calculate number of cores (walltime={walltime})') - -# if job and job.pgrp: -# # ps axo pgid,psr -> 154628 8 \n 154628 9 \n 1546280 1 .. -# # sort is redundant; uniq removes any duplicate lines; wc -l gives the final count -# # awk is added to get the pgrp list only and then grep -x makes sure that false positives are removed, e.g. 
1546280 -# cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (job.pgrp, job.pgrp) -# _, stdout, _ = execute(cmd, mute=True) -# logger.debug('%s: %s', cmd, stdout) -# try: -# job.actualcorecount = int(stdout) -# except ValueError as exc: -# logger.warning('failed to convert number of actual cores to int: %s', exc) -# else: -# job.corecounts = add_core_count(job.actualcorecount) #, core_counts=job.corecounts) -# #logger.debug('current core counts list: %s', str(job.corecounts)) -# # check suspicious values -# #if job.actualcorecount > 5: -# # logger.warning('detected large actualcorecount: %d', job.actualcorecount) -# # cmd = "ps axo pgid,stat,euid,ruid,tty,tpgid,sess,pgrp,ppid,pid,pcpu,comm | sort | uniq | grep %d" % job.pgrp -# # exit_code, stdout, stderr = execute(cmd, mute=True) -# # logger.debug('%s (pgrp=%d): %s', cmd, job.pgrp, stdout) -# else: -# logger.debug('payload process group not set - cannot check number of cores used by payload') diff --git a/pilot/user/atlas/dbrelease.py b/pilot/user/atlas/dbrelease.py index 9f343143..dc5d7e71 100644 --- a/pilot/user/atlas/dbrelease.py +++ b/pilot/user/atlas/dbrelease.py @@ -75,9 +75,10 @@ def get_dbrelease_dir(): logger.warning("note: the DBRelease database directory is not available (will not attempt to skip DBRelease stage-in)") else: if os.path.exists(path): - logger.info("local DBRelease path verified: %s (will attempt to skip DBRelease stage-in)", path) + logger.info(f"local DBRelease path verified: {path} (will attempt to skip DBRelease stage-in)") else: - logger.warning("note: local DBRelease path does not exist: %s (will not attempt to skip DBRelease stage-in)", path) + logger.warning(f"note: local DBRelease path does not exist: {path} " + f"(will not attempt to skip DBRelease stage-in)") return path @@ -107,14 +108,14 @@ def is_dbrelease_available(version): # is the required DBRelease version available? 
if dir_list: if version in dir_list: - logger.info("found version %s in path %s (%d releases found)", version, path, len(dir_list)) + logger.info(f"found version {version} in path {path} ({len(dir_list)} releases found)") status = True else: - logger.warning("did not find version %s in path %s (%d releases found)", version, path, len(dir_list)) + logger.warning(f"did not find version {version} in path {path} ({len(dir_list)} releases found)") else: - logger.warning("empty DBRelease directory list: %s", path) + logger.warning(f"empty DBRelease directory list: {path}") else: - logger.warning('no such DBRelease path: %s', path) + logger.warning(f'no such DBRelease path: {path}') return status @@ -135,21 +136,21 @@ def create_setup_file(version, path): if _dir != "" and version != "": # create the python code string to be written to file txt = "import os\n" - txt += "os.environ['DBRELEASE'] = '%s'\n" % version - txt += "os.environ['DATAPATH'] = '%s/%s:' + os.environ['DATAPATH']\n" % (_dir, version) - txt += "os.environ['DBRELEASE_REQUIRED'] = '%s'\n" % version - txt += "os.environ['DBRELEASE_REQUESTED'] = '%s'\n" % version - txt += "os.environ['CORAL_DBLOOKUP_PATH'] = '%s/%s/XMLConfig'\n" % (_dir, version) + txt += f"os.environ['DBRELEASE'] = '{version}'\n" + txt += f"os.environ['DATAPATH'] = '{_dir}/{version}:' + os.environ['DATAPATH']\n" + txt += f"os.environ['DBRELEASE_REQUIRED'] = '{version}'\n" + txt += f"os.environ['DBRELEASE_REQUESTED'] = '{version}'\n" + txt += f"os.environ['CORAL_DBLOOKUP_PATH'] = '{_dir}/{version}/XMLConfig'\n" try: status = write_file(path, txt) except FileHandlingFailure as exc: - logger.warning('failed to create DBRelease setup file: %s', exc) + logger.warning(f'failed to create DBRelease setup file: {exc}') else: - logger.info("Created setup file with the following content:.................................\n%s", txt) + logger.info(f"Created setup file with the following content:.................................\n{txt}") logger.info("...............................................................................") else: - logger.warning('failed to create %s for DBRelease version=%s and directory=%s', path, version, _dir) + logger.warning(f'failed to create {path} for DBRelease version={version} and directory {_dir}') return status @@ -171,37 +172,37 @@ def create_dbrelease(version, path): try: mkdirs(_path, chmod=None) except PilotException as exc: - logger.warning('failed to create directories for DBRelease: %s', exc) + logger.warning(f'failed to create directories for DBRelease: {exc}') else: - logger.debug('created directories: %s', _path) + logger.debug(f'created directories: {_path}') # create the setup file in the DBRelease directory version_path = os.path.join(dbrelease_path, version) setup_filename = "setup.py" _path = os.path.join(version_path, setup_filename) if create_setup_file(version, _path): - logger.info("created DBRelease setup file: %s", _path) + logger.info(f"created DBRelease setup file: {_path}") # now create a new DBRelease tarball - filename = os.path.join(path, "DBRelease-%s.tar.gz" % version) - logger.info("creating file: %s", filename) + filename = os.path.join(path, f"DBRelease-{version}.tar.gz") + logger.info(f"creating file: {filename}") try: tar = tarfile.open(filename, "w:gz") except (IOError, OSError) as exc: - logger.warning("could not create DBRelease tar file: %s", exc) + logger.warning(f"could not create DBRelease tar file: {exc}") else: if tar: # add the setup file to the tar file - tar.add("%s/DBRelease/%s/%s" % (path, 
version, setup_filename)) + tar.add(f"{path}/DBRelease/{version}/{setup_filename}") # create the symbolic link DBRelease/current -> 12.2.1 try: _link = os.path.join(path, "DBRelease/current") os.symlink(version, _link) except OSError as exc: - logger.warning("failed to create symbolic link %s: %s", _link, exc) + logger.warning(f"failed to create symbolic link {_link}: {exc}") else: - logger.warning("created symbolic link: %s", _link) + logger.warning(f"created symbolic link: {_link}") # add the symbolic link to the tar file tar.add(_link) @@ -209,17 +210,17 @@ def create_dbrelease(version, path): # done with the tar archive tar.close() - logger.info("created new DBRelease tar file: %s", filename) + logger.info(f"created new DBRelease tar file: {filename}") status = True else: logger.warning("failed to open DBRelease tar file") # clean up if rmdirs(dbrelease_path): - logger.debug("cleaned up directories in path: %s", dbrelease_path) + logger.debug(f"cleaned up directories in path: {dbrelease_path}") else: logger.warning("failed to create DBRelease setup file") if rmdirs(dbrelease_path): - logger.debug("cleaned up directories in path: %s", dbrelease_path) + logger.debug(f"cleaned up directories in path: {dbrelease_path}") return status diff --git a/pilot/user/atlas/diagnose.py b/pilot/user/atlas/diagnose.py index 6dd3381e..8c2a8949 100644 --- a/pilot/user/atlas/diagnose.py +++ b/pilot/user/atlas/diagnose.py @@ -56,7 +56,7 @@ def interpret(job): if len(job.piloterrorcodes) == 1 and errors.NOPAYLOADMETADATA in job.piloterrorcodes and job.transexitcode != 0: logger.warning('ignore metadata error for now') else: - logger.warning('aborting payload error diagnosis since an error has already been set: %s', str(job.piloterrorcodes)) + logger.warning(f'aborting payload error diagnosis since an error has already been set: {job.piloterrorcodes}') return -1 if job.exitcode != 0: @@ -64,10 +64,10 @@ def interpret(job): # check for special errors if exit_code == 146: - logger.warning('user tarball was not downloaded (payload exit code %d)', exit_code) + logger.warning(f'user tarball was not downloaded (payload exit code {exit_code})') set_error_nousertarball(job) elif exit_code == 160: - logger.info('ignoring harmless preprocess exit code %d', exit_code) + logger.info(f'ignoring harmless preprocess exit code {exit_code}') job.transexitcode = 0 job.exitcode = 0 exit_code = 0 @@ -76,7 +76,7 @@ def interpret(job): try: extract_special_information(job) except PilotException as exc: - logger.error('PilotException caught while extracting special job information: %s', exc) + logger.error(f'PilotException caught while extracting special job information: {exc}') exit_code = exc.get_error_code() job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code) @@ -84,7 +84,7 @@ def interpret(job): try: interpret_payload_exit_info(job) except Exception as exc: - logger.warning('exception caught while interpreting payload exit info: %s', exc) + logger.warning(f'exception caught while interpreting payload exit info: {exc}') return exit_code @@ -125,7 +125,7 @@ def interpret_payload_exit_info(job): else: if disk_space: spaceleft = convert_mb_to_b(disk_space) # B (diskspace is in MB) - logger.info('remaining local space: %d B', spaceleft) + logger.info(f'remaining local space: {spaceleft} B') else: logger.warning('get_local_disk_space() returned None') return @@ -161,16 +161,16 @@ def is_out_of_memory(job): files = {stderr: ["FATAL out of memory: taking the application down"], stdout: ["St9bad_alloc", 
"std::bad_alloc"]} for path in files: if os.path.exists(path): - logger.info('looking for out-of-memory errors in %s', os.path.basename(path)) + logger.info(f'looking for out-of-memory errors in {os.path.basename(path)}') if os.path.getsize(path) > 0: matched_lines = grep(files[path], path) if matched_lines: - logger.warning("identified an out of memory error in %s %s:", job.payload, os.path.basename(path)) + logger.warning(f"identified an out of memory error in {job.payload}") for line in matched_lines: logger.info(line) out_of_memory = True else: - logger.warning('file does not exist: %s (cannot look for out-of-memory error in it)') + logger.warning(f'file does not exist: {path} (cannot look for out-of-memory error in it)') return out_of_memory @@ -188,7 +188,7 @@ def is_user_code_missing(job): return scan_file(stdout, error_messages, - warning_message="identified an \'%s\' message in %s" % (error_messages[0], os.path.basename(stdout))) + warning_message=f"identified an '{error_messages[0]}' message in {os.path.basename(stdout)}") def is_out_of_space(job): @@ -204,7 +204,7 @@ def is_out_of_space(job): return scan_file(stderr, error_messages, - warning_message="identified a \'%s\' message in %s" % (error_messages[0], os.path.basename(stderr))) + warning_message=f"identified a '{error_messages[0]}' message in {os.path.basename(stderr)}") def is_installation_error(job): @@ -248,7 +248,7 @@ def is_nfssqlite_locking_problem(job): return scan_file(stdout, error_messages, - warning_message="identified an NFS/Sqlite locking problem in %s" % os.path.basename(stdout)) + warning_message=f"identified an NFS/Sqlite locking problem in {os.path.basename(stdout)}") def extract_special_information(job): @@ -266,7 +266,7 @@ def extract_special_information(job): try: find_db_info(job) except Exception as exc: - logger.warning('detected problem with parsing job report (in find_db_info()): %s', exc) + logger.warning(f'detected problem with parsing job report (in find_db_info()): {exc}') def find_number_of_events(job): @@ -278,29 +278,29 @@ def find_number_of_events(job): """ if job.nevents: - logger.info('number of events already known: %d', job.nevents) + logger.info(f'number of events already known: {job.nevents}') return logger.info('looking for number of processed events (source #1: jobReport.json)') find_number_of_events_in_jobreport(job) if job.nevents > 0: - logger.info('found %d processed events', job.nevents) + logger.info(f'found {job.nevents} processed events') return logger.info('looking for number of processed events (source #2: metadata.xml)') find_number_of_events_in_xml(job) if job.nevents > 0: - logger.info('found %d processed events', job.nevents) + logger.info(f'found {job.nevents} processed events') return logger.info('looking for number of processed events (source #3: athena summary file(s)') nev1, nev2 = process_athena_summary(job) if nev1 > 0: job.nevents = nev1 - logger.info('found %d processed (read) events', job.nevents) + logger.info(f'found {job.nevents} processed (read) events') if nev2 > 0: job.neventsw = nev2 - logger.info('found %d processed (written) events', job.neventsw) + logger.info(f'found {nev2} processed (written) events') def find_number_of_events_in_jobreport(job): @@ -314,7 +314,7 @@ def find_number_of_events_in_jobreport(job): try: work_attributes = parse_jobreport_data(job.metadata) except Exception as exc: - logger.warning('exception caught while parsing job report: %s', exc) + logger.warning(f'exception caught while parsing job report: {exc}') return if 
'nEvents' in work_attributes: @@ -323,7 +323,7 @@ def find_number_of_events_in_jobreport(job): if n_events: job.nevents = int(n_events) except ValueError as exc: - logger.warning('failed to convert number of events to int: %s', exc) + logger.warning(f'failed to convert number of events to int: {exc}') def find_number_of_events_in_xml(job): @@ -338,7 +338,7 @@ def find_number_of_events_in_xml(job): try: metadata = get_metadata_from_xml(job.workdir) except Exception as exc: - msg = "Exception caught while interpreting XML: %s" % exc + msg = f"Exception caught while interpreting XML: {exc}" raise BadXML(msg) if metadata: @@ -375,18 +375,17 @@ def process_athena_summary(job): recent_summary_file, recent_time, oldest_summary_file, oldest_time = \ find_most_recent_and_oldest_summary_files(file_list) if oldest_summary_file == recent_summary_file: - logger.info("summary file %s will be processed for errors and number of events", - os.path.basename(oldest_summary_file)) + logger.info(f"summary file {os.path.basename(oldest_summary_file)} will be processed for errors and number of events") else: - logger.info("most recent summary file %s (updated at %d) will be processed for errors [to be implemented]", - os.path.basename(recent_summary_file), recent_time) - logger.info("oldest summary file %s (updated at %d) will be processed for number of events", - os.path.basename(oldest_summary_file), oldest_time) + logger.info(f"most recent summary file {os.path.basename(recent_summary_file)} " + f"(updated at {recent_time}) will be processed for errors [to be implemented]") + logger.info(f"oldest summary file {os.path.basename(oldest_summary_file)} " + f"(updated at {oldest_time}) will be processed for number of events") # Get the number of events from the oldest summary file nev1, nev2 = get_number_of_events_from_summary_file(oldest_summary_file) - logger.info("number of events: %d (read)", nev1) - logger.info("number of events: %d (written)", nev2) + logger.info(f"number of events: {nev1} (read)") + logger.info(f"number of events: {nev2} (written)") return nev1, nev2 @@ -407,8 +406,8 @@ def find_most_recent_and_oldest_summary_files(file_list): # get the modification time try: st_mtime = os.path.getmtime(summary_file) - except OSError as exc: # Python 2/3 - logger.warning("could not read modification time of file %s: %s", summary_file, exc) + except OSError as exc: + logger.warning(f"could not read modification time of file {summary_file}: {exc}") else: if st_mtime > recent_time: recent_time = st_mtime @@ -421,8 +420,8 @@ def find_most_recent_and_oldest_summary_files(file_list): recent_summary_file = oldest_summary_file try: oldest_time = os.path.getmtime(oldest_summary_file) - except OSError as exc: # Python 2/3 - logger.warning("could not read modification time of file %s: %s", oldest_summary_file, exc) + except OSError as exc: + logger.warning(f"could not read modification time of file {oldest_summary_file}: {exc}") else: recent_time = oldest_time @@ -449,14 +448,14 @@ def get_number_of_events_from_summary_file(oldest_summary_file): for line in lines: if "Events Read:" in line: try: - nev1 = int(re.match(r'Events Read\: *(\d+)', line).group(1)) # Python 3 (added r) + nev1 = int(re.match(r'Events Read\: *(\d+)', line).group(1)) except ValueError as exc: - logger.warning('failed to convert number of read events to int: %s', exc) + logger.warning(f'failed to convert number of read events to int: {exc}') if "Events Written:" in line: try: - nev2 = int(re.match(r'Events Written\: *(\d+)', line).group(1)) # 
Python 3 (added r) + nev2 = int(re.match(r'Events Written\: *(\d+)', line).group(1)) except ValueError as exc: - logger.warning('failed to convert number of written events to int: %s', exc) + logger.warning(f'failed to convert number of written events to int: {exc}') if nev1 > 0 and nev2 > 0: break else: @@ -481,14 +480,14 @@ def find_db_info(job): try: job.dbtime = int(work_attributes.get('__db_time')) except ValueError as exc: - logger.warning('failed to convert dbtime to int: %s', exc) - logger.info('dbtime (total): %d', job.dbtime) + logger.warning(f'failed to convert dbtime to int: {exc}') + logger.info(f'dbtime (total): {job.dbtime}') if '__db_data' in work_attributes: try: job.dbdata = work_attributes.get('__db_data') except ValueError as exc: - logger.warning('failed to convert dbdata to int: %s', exc) - logger.info('dbdata (total): %d', job.dbdata) + logger.warning(f'failed to convert dbdata to int: {exc}') + logger.info(f'dbdata (total): {job.dbdata}') def set_error_nousertarball(job): @@ -509,7 +508,7 @@ def set_error_nousertarball(job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOUSERTARBALL) job.piloterrorcode = errors.NOUSERTARBALL - job.piloterrordiag = "User tarball %s cannot be downloaded from PanDA server" % tarball_url + job.piloterrordiag = f"User tarball {tarball_url} cannot be downloaded from PanDA server" def extract_tarball_url(_tail): @@ -545,7 +544,7 @@ def process_metadata_from_xml(job): job.metadata = read_file(path) else: if not job.is_analysis() and job.transformation != 'Archive_tf.py': - diagnostics = 'metadata does not exist: %s' % path + diagnostics = f'metadata does not exist: {path}' logger.warning(diagnostics) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.NOPAYLOADMETADATA) job.piloterrorcode = errors.NOPAYLOADMETADATA @@ -559,14 +558,14 @@ def process_metadata_from_xml(job): try: metadata = get_metadata_from_xml(job.workdir) except Exception as exc: - msg = "Exception caught while interpreting XML: %s (ignoring it, but guids must now be generated)" % exc + msg = f"Exception caught while interpreting XML: {exc} (ignoring it, but guids must now be generated)" logger.warning(msg) if metadata: dat.guid = get_guid_from_xml(metadata, dat.lfn) - logger.info('read guid for lfn=%s from xml: %s', dat.lfn, dat.guid) + logger.info(f'read guid for lfn={dat.lfn} from xml: {dat.guid}') else: dat.guid = get_guid() - logger.info('generated guid for lfn=%s: %s', dat.lfn, dat.guid) + logger.info(f'generated guid for lfn={dat.lfn}: {dat.guid}') def process_job_report(job): @@ -584,7 +583,7 @@ def process_job_report(job): # get the job report path = os.path.join(job.workdir, config.Payload.jobreport) if not os.path.exists(path): - logger.warning('job report does not exist: %s', path) + logger.warning(f'job report does not exist: {path}') # get the metadata from the xml file instead, which must exist for most production transforms process_metadata_from_xml(job) @@ -605,21 +604,21 @@ def process_job_report(job): try: job.exitcode = job.metadata['exitCode'] except KeyError as exc: - logger.warning('could not find compulsory payload exitCode in job report: %s (will be set to 0)', exc) + logger.warning(f'could not find compulsory payload exitCode in job report: {exc} (will be set to 0)') job.exitcode = 0 else: - logger.info('extracted exit code from job report: %d', job.exitcode) + logger.info(f'extracted exit code from job report: {job.exitcode}') try: job.exitmsg = job.metadata['exitMsg'] except KeyError as exc: - 
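The summary-file parsing above relies on two small regular expressions; a self-contained illustration on made-up athena summary lines:

import re

lines = ["Events Read: 5000", "Events Written: 4999"]
nev_read = nev_written = 0
for line in lines:
    if "Events Read:" in line:
        match = re.match(r"Events Read\: *(\d+)", line)
        if match:
            nev_read = int(match.group(1))
    if "Events Written:" in line:
        match = re.match(r"Events Written\: *(\d+)", line)
        if match:
            nev_written = int(match.group(1))
# nev_read == 5000, nev_written == 4999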
logger.warning('could not find compulsory payload exitMsg in job report: %s ' - '(will be set to empty string)', exc) + logger.warning(f'could not find compulsory payload exitMsg in job report: {exc} ' + f'(will be set to empty string)') job.exitmsg = "" else: # assign special payload error code if "got a SIGSEGV signal" in job.exitmsg: - diagnostics = 'Invalid memory reference or a segmentation fault in payload: %s (job report)' % \ - job.exitmsg + diagnostics = f'Invalid memory reference or a segmentation fault in payload: ' \ + f'{job.exitmsg} (job report)' logger.warning(diagnostics) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADSIGSEGV, msg=diagnostics) job.piloterrorcode = errors.PAYLOADSIGSEGV @@ -633,7 +632,7 @@ def process_job_report(job): job.piloterrorcode = errors.FRONTIER job.piloterrordiag = msg - logger.info('extracted exit message from job report: %s', job.exitmsg) + logger.info(f'extracted exit message from job report: {job.exitmsg}') if job.exitmsg != 'OK': job.exeerrordiag = job.exitmsg job.exeerrorcode = job.exitcode @@ -758,7 +757,7 @@ def get_job_report_errors(job_report_dictionary): job_report_errors = [] if 'reportVersion' in job_report_dictionary: - logger.info("scanning jobReport (v %s) for error info", job_report_dictionary.get('reportVersion')) + logger.info(f"scanning jobReport (v {job_report_dictionary.get('reportVersion')}) for error info") else: logger.warning("jobReport does not have the reportVersion key") @@ -766,13 +765,13 @@ def get_job_report_errors(job_report_dictionary): try: error_details = job_report_dictionary['executor'][0]['logfileReport']['details']['ERROR'] except (KeyError, TypeError, IndexError) as exc: - logger.warning("WARNING: aborting jobReport scan: %s", exc) + logger.warning(f"WARNING: aborting jobReport scan: {exc}") else: if isinstance(error_details, list): for msg in error_details: job_report_errors.append(msg['message']) else: - logger.warning("did not get a list object: %s", type(error_details)) + logger.warning(f"did not get a list object: {type(error_details)}") else: logger.warning("jobReport does not have the executor key (aborting)") @@ -791,7 +790,7 @@ def is_bad_alloc(job_report_errors): diagnostics = "" for err in job_report_errors: if "bad_alloc" in err: - logger.warning("encountered a bad_alloc error: %s", err) + logger.warning(f"encountered a bad_alloc error: {err}") bad_alloc = True diagnostics = err break @@ -818,7 +817,7 @@ def get_log_extracts(job, state): # for failed/holding jobs, add extracts from the pilot log file, but always add it to the pilot log itself _extracts = get_pilot_log_extracts(job) if _extracts != "": - logger.warning('detected the following tail of warning/fatal messages in the pilot log:\n%s', _extracts) + logger.warning(f'detected the following tail of warning/fatal messages in the pilot log:\n{_extracts}') if state == 'failed' or state == 'holding': extracts += _extracts @@ -843,15 +842,15 @@ def get_panda_tracer_log(job): if os.path.exists(tracerlog): # only add if file is not empty if os.path.getsize(tracerlog) > 0: - message = "PandaID=%s had outbound connections: " % (job.jobid) + message = f"PandaID={job.jobid} had outbound connections: " extracts += message message = read_file(tracerlog) extracts += message logger.warning(message) else: - logger.info("PanDA tracer log (%s) has zero size (no outbound connections detected)", tracerlog) + logger.info(f"PanDA tracer log ({tracerlog}) has zero size (no outbound connections detected)") else: - 
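A hedged sketch of the jobReport interpretation above, using a made-up report dictionary and dict.get() in place of the try/except blocks:

job_report = {"exitCode": 139, "exitMsg": "Transform got a SIGSEGV signal"}

exit_code = job_report.get("exitCode", 0)   # compulsory field, defaults to 0 as above
exit_msg = job_report.get("exitMsg", "")    # defaults to an empty string as above

if "got a SIGSEGV signal" in exit_msg:
    # the pilot maps this case to its PAYLOADSIGSEGV error code
    diagnostics = f"Invalid memory reference or a segmentation fault in payload: {exit_msg} (job report)"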
logger.debug("PanDA tracer log does not exist: %s (ignoring)", tracerlog) + logger.debug(f"PanDA tracer log does not exist: {tracerlog} (ignoring)") return extracts @@ -873,24 +872,9 @@ def get_pilot_log_extracts(job): if _tail != "": if extracts != "": extracts += "\n" - extracts += "- Log from %s -\n" % config.Pilot.pilotlog + extracts += f"- Log from {config.Pilot.pilotlog} -\n" extracts += _tail - - # grep for fatal/critical errors in the pilot log - #errormsgs = ["FATAL", "CRITICAL", "ERROR"] - #matched_lines = grep(errormsgs, path) - #_extracts = "" - #if len(matched_lines) > 0: - # logger.debug("dumping warning messages from %s:\n", os.path.basename(path)) - # for line in matched_lines: - # _extracts += line + "\n" - #if _extracts != "": - # if config.Pilot.error_log != "": - # path = os.path.join(job.workdir, config.Pilot.error_log) - # write_file(path, _extracts) - # extracts += "\n- Error messages from %s -\n" % config.Pilot.pilotlog - # extracts += _extracts else: - logger.warning('pilot log file does not exist: %s', path) + logger.warning(f'pilot log file does not exist: {path}') return extracts diff --git a/pilot/user/atlas/jobdata.py b/pilot/user/atlas/jobdata.py index 013962db..72aed8a2 100644 --- a/pilot/user/atlas/jobdata.py +++ b/pilot/user/atlas/jobdata.py @@ -16,26 +16,28 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-24 -import re +"""Functions related to job data.""" import logging +import re + logger = logging.getLogger(__name__) -def jobparams_prefiltering(value): +def jobparams_prefiltering(value: str) -> (dict, str): """ Perform pre-filtering of raw job parameters to avoid problems with especially quotation marks. + The function can extract some fields from the job parameters to be put back later after actual filtering. E.g. ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" ' will otherwise become ' --athenaopts 'HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER' ' which will prevent the environmental variable to be unfolded. - :param value: job parameters (string). - :return: dictionary of fields excluded from job parameters (dictionary), updated job parameters (string). + :param value: job parameters (str) + :return: dictionary of fields excluded from job parameters (dict), updated job parameters (str). """ - exclusions = {} pattern = re.compile(r' (\-\-athenaopts\ \"?\'?[^"]+\"?\'?)') result = re.findall(pattern, value) @@ -45,22 +47,34 @@ def jobparams_prefiltering(value): value = re.sub(pattern, ' TOBEREPLACED1 ', value) # do not remove the space # add more items to the exclusions as necessary + logger.debug(f'exclusions = {exclusions}') - logger.debug('exclusions = %s', str(exclusions)) return exclusions, value -def jobparams_postfiltering(value, exclusions={}): +def jobparams_postfiltering(value: str, exclusions: dict = None) -> str: """ Perform post-filtering of raw job parameters. + Any items in the optional exclusion list will be added (space separated) at the end of the job parameters. - :param value: job parameters (string). - :param optional exclusion: exclusion dictionary from pre-filtering function (dictinoary). - :return: updated job parameters (string). + :param value: job parameters (str) + :param exclusions: exclusion dictionary from pre-filtering function (dict) + :return: updated job parameters (str). 
""" + if exclusions is None: # avoid pylint warning + exclusions = {} for item in exclusions: value = value.replace(item, exclusions[item]) return value + + +def fail_at_getjob_none() -> bool: + """ + Return a boolean value indicating whether to fail when getJob returns None. + + :return: True (bool). + """ + return True diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index c60d26d7..493d36b1 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -51,7 +51,7 @@ def get_job_metrics_string(job, extra={}): # report core count (will also set corecount in job object) corecount = get_core_count(job) - logger.debug('job definition core count: %d', corecount) + logger.debug(f'job definition core count: {corecount}') #if corecount is not None and corecount != "NULL" and corecount != 'null': # job_metrics += get_job_metrics_entry("coreCount", corecount) @@ -85,7 +85,7 @@ def get_job_metrics_string(job, extra={}): if max_space > zero: job_metrics += get_job_metrics_entry("workDirSize", max_space) else: - logger.info("will not add max space = %d B to job metrics", max_space) + logger.info(f"will not add max space = {max_space} B to job metrics") # is there a detected rucio trace service error? trace_exit_code = get_trace_exit_code(job.workdir) @@ -226,7 +226,7 @@ def add_event_number(job_metrics, workdir): if event_number: job_metrics += get_job_metrics_entry("eventnumber", event_number) else: - logger.debug('file %s does not exist (skip for now)', path) + logger.debug(f'file {path} does not exist (skip for now)') return job_metrics @@ -253,18 +253,18 @@ def get_job_metrics(job, extra={}): job_metrics = job_metrics.lstrip().rstrip() if job_metrics != "": - logger.debug('job metrics=\"%s\"', job_metrics) + logger.debug(f'job metrics=\"{job_metrics}\"') else: logger.debug("no job metrics (all values are zero)") # is job_metrics within allowed size? 
if len(job_metrics) > 500: - logger.warning("job_metrics out of size (%d)", len(job_metrics)) + logger.warning(f"job_metrics out of size ({len(job_metrics)})") # try to reduce the field size and remove the last entry which might be cut job_metrics = job_metrics[:500] job_metrics = " ".join(job_metrics.split(" ")[:-1]) - logger.warning("job_metrics has been reduced to: %s", job_metrics) + logger.warning(f"job_metrics has been reduced to: {job_metrics}") return job_metrics diff --git a/pilot/user/atlas/memory.py b/pilot/user/atlas/memory.py index 89826583..6a72a301 100644 --- a/pilot/user/atlas/memory.py +++ b/pilot/user/atlas/memory.py @@ -51,21 +51,21 @@ def get_ucore_scale_factor(job): try: job_corecount = float(job.corecount) except (ValueError, TypeError) as exc: - logger.warning('exception caught: %s (job.corecount=%s)', exc, str(job.corecount)) + logger.warning(f'exception caught: {exc} (job.corecount={job.corecount})') job_corecount = None try: schedconfig_corecount = float(job.infosys.queuedata.corecount) except (ValueError, TypeError) as exc: - logger.warning('exception caught: %s (job.infosys.queuedata.corecount=%s)', exc, str(job.infosys.queuedata.corecount)) + logger.warning(f'exception caught: {exc} (job.infosys.queuedata.corecount={job.infosys.queuedata.corecount})') schedconfig_corecount = None if job_corecount and schedconfig_corecount: try: scale = job_corecount / schedconfig_corecount - logger.debug('scale: job_corecount / schedconfig_corecount=%f', scale) + logger.debug(f'scale: job_corecount / schedconfig_corecount={scale}') except (ZeroDivisionError, TypeError) as exc: - logger.warning('exception caught: %s (using scale factor 1)', exc) + logger.warning(f'exception caught: {exc} (using scale factor 1)') scale = 1 else: logger.debug('will use scale factor 1') @@ -106,13 +106,13 @@ def memory_usage(job): try: maxrss_int = 2 * int(maxrss * scale) * 1024 # Convert to int and kB except (ValueError, TypeError) as exc: - logger.warning("unexpected value for maxRSS: %s", exc) + logger.warning(f"unexpected value for maxRSS: {exc}") else: # Compare the maxRSS with the maxPSS from memory monitor if maxrss_int > 0 and maxpss_int > 0: if maxpss_int > maxrss_int: - diagnostics = "job has exceeded the memory limit %d kB > %d kB (2 * queuedata.maxrss)" % \ - (maxpss_int, maxrss_int) + diagnostics = f"job has exceeded the memory limit {maxpss_int} kB > {maxrss_int} kB " \ + f"(2 * queuedata.maxrss)" logger.warning(diagnostics) # Create a lockfile to let RunJob know that it should not restart the memory monitor after it has been killed @@ -123,8 +123,8 @@ def memory_usage(job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXCEEDMAXMEM) kill_processes(job.pid) else: - logger.info("max memory (maxPSS) used by the payload is within the allowed limit: " - "%d B (2 * maxRSS = %d B)", maxpss_int, maxrss_int) + logger.info(f"max memory (maxPSS) used by the payload is within the allowed limit: " + f"{maxpss_int} B (2 * maxRSS = {maxrss_int} B)") else: if maxrss == 0 or maxrss == "0": logger.info("queuedata.maxrss set to 0 (no memory checks will be done)") diff --git a/pilot/user/atlas/metadata.py b/pilot/user/atlas/metadata.py index d8c69ce9..e5481271 100644 --- a/pilot/user/atlas/metadata.py +++ b/pilot/user/atlas/metadata.py @@ -162,7 +162,7 @@ def get_metadata_from_xml(workdir, filename="metadata.xml"): metadata_dictionary = {} path = os.path.join(workdir, filename) if not os.path.exists(path): - logger.warning('file does not exist: %s', path) + 
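A worked example, with assumed numbers, of the memory-limit comparison above (queuedata.maxrss taken to be in MB, the memory monitor maxPSS in kB):

maxrss = 2000.0                               # assumed queuedata.maxrss (MB)
scale = 8 / 8                                 # job corecount / schedconfig corecount
maxpss_int = 5_000_000                        # assumed maxPSS from the memory monitor (kB)

maxrss_int = 2 * int(maxrss * scale) * 1024   # allowed limit in kB: 2 * maxRSS
exceeded = maxrss_int > 0 and maxpss_int > 0 and maxpss_int > maxrss_int
# the limit is 4096000 kB here, so maxpss_int = 5000000 kB exceeds it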
logger.warning(f'file does not exist: {path}') return metadata_dictionary tree = ElementTree.parse(path) @@ -209,7 +209,7 @@ def get_number_of_events(metadata_dictionary, filename=''): try: nevents = int(metadata_dictionary[filename].get('events')) except ValueError as exc: - logger.warning('failed to convert number of events to int: %s', exc) + logger.warning(f'failed to convert number of events to int: {exc}') else: logger.warning('number of events could not be extracted from metadata dictionary (based on metadata.xml)') @@ -247,7 +247,7 @@ def get_guid(metadata_dictionary, filename=''): try: guid = metadata_dictionary[filename].get('guid') except ValueError as exc: - logger.warning('failed to get guid from xml: %s', exc) + logger.warning(f'failed to get guid from xml: {exc}') else: logger.warning('guid could not be extracted from metadata dictionary (based on metadata.xml)') diff --git a/pilot/user/atlas/nordugrid.py b/pilot/user/atlas/nordugrid.py index c73c8a85..35c93f24 100644 --- a/pilot/user/atlas/nordugrid.py +++ b/pilot/user/atlas/nordugrid.py @@ -71,7 +71,7 @@ def add_to_list(self, dictionary, rootname="outfiles", itemname="file"): else: pass else: - logger.info("not a dictionary: %s", str(self._dictionary)) + logger.info(f"not a dictionary: {self._dictionary}") def get_dictionary(self): """ @@ -110,8 +110,8 @@ def convert_to_xml(dictionary): single_file_tag = list(dictionary.keys()) # Python 2/3 if len(single_file_tag) != 1: - logger.warning("unexpected format - expected single entry, got %d entries", len(single_file_tag)) - logger.warning('dictionary = %s', str(dictionary)) + logger.warning(f"unexpected format - expected single entry, got {len(single_file_tag)} entries") + logger.warning(f'dictionary = {dictionary}') return None file_tag = single_file_tag[0] @@ -134,13 +134,13 @@ def convert_to_xml(dictionary): entry = ElementTree.SubElement(file_element, dictionary_entry) entry.text = file_dictionary[dictionary_entry] else: - logger.warning("unexpected format - expected a dictionary, got %s", str(file_dictionary)) + logger.warning(f"unexpected format - expected a dictionary, got {file_dictionary}") failed = True else: - logger.warning("unexpected format - expected a length 1 dictionary, got %s", str(file_entry)) + logger.warning(f"unexpected format - expected a length 1 dictionary, got {file_entry}") failed = True else: - logger.warning("unexpected format - expected a list, got %s", str(file_list)) + logger.warning(f"unexpected format - expected a list, got {file_list}") failed = True if failed: diff --git a/pilot/user/atlas/proxy.py b/pilot/user/atlas/proxy.py index 39b11fef..4edee459 100644 --- a/pilot/user/atlas/proxy.py +++ b/pilot/user/atlas/proxy.py @@ -78,7 +78,6 @@ def get_and_verify_proxy(x509, voms_role='', proxy_type='', workdir=''): else: logger.info(f"proxy verified (proxy type=\'{proxy_type}\')") # is commented: no user proxy should be in the command the container will execute - # cmd = cmd.replace("export X509_USER_PROXY=%s;" % x509, "export X509_USER_PROXY=%s;" % x509_payload) x509 = x509_payload else: logger.warning(f"failed to get proxy for role=\'{voms_role}\'") @@ -86,22 +85,22 @@ def get_and_verify_proxy(x509, voms_role='', proxy_type='', workdir=''): return exit_code, diagnostics, x509 -def verify_proxy(limit=None, x509=None, proxy_id="pilot", test=False): +def verify_proxy(limit=None, x509=None, proxy_id="pilot", test=False) -> (int, str): """ Check for a valid voms/grid proxy longer than N hours. + Use `limit` to set required time limit. 
- :param limit: time limit in hours (int). - :param x509: points to the proxy file. If not set (=None) - get proxy file from X509_USER_PROXY environment - :param test: free Boolean test parameter. - :return: exit code (NOPROXY or NOVOMSPROXY), diagnostics (error diagnostics string). + :param limit: time limit in hours (int) + :param x509: points to the proxy file. If not set (=None) - get proxy file from X509_USER_PROXY environment (bool) + :param proxy_id: proxy id (str) + :param test: free Boolean test parameter (bool) + :return: exit code (NOPROXY or NOVOMSPROXY) (int), diagnostics (error diagnostics string) (str). """ - if limit is None: limit = 1 # add setup for arcproxy if it exists - #arcproxy_setup = "%s/atlas.cern.ch/repo/sw/arc/client/latest/slc6/x86_64/setup.sh" % get_file_system_root_path() if x509 is None: x509 = os.environ.get('X509_USER_PROXY', '') if x509 != '': @@ -109,12 +108,6 @@ def verify_proxy(limit=None, x509=None, proxy_id="pilot", test=False): else: envsetup = '' - # envsetup += ". %s/atlas.cern.ch/repo/ATLASLocalRootBase/user/atlasLocalSetup.sh --quiet;" % get_file_system_root_path() - #if os.environ.get('ALRB_noGridMW', '').lower() != "yes": - # envsetup += "lsetup emi;" - #else: - # logger.warning('Skipping "lsetup emi" as ALRB_noGridMW=YES') - # first try to use arcproxy since voms-proxy-info is not working properly on SL6 # (memory issues on queues with limited memory) @@ -126,12 +119,6 @@ def verify_proxy(limit=None, x509=None, proxy_id="pilot", test=False): else: return 0, diagnostics - #exit_code, diagnostics = verify_vomsproxy(envsetup, limit) - #if exit_code != 0: - # return exit_code, diagnostics - #else: - # return 0, diagnostics - return 0, diagnostics @@ -255,10 +242,11 @@ def check_time_left(proxyname, validity, limit): # test bad proxy #if proxyname == 'proxy': # seconds_left = 1000 - logger.info("cache: check %s validity: wanted=%dh (%ds with grace) left=%.2fh (now=%d validity=%d left=%d)", - proxyname, limit, limit * 3600 - 20 * 60, float(seconds_left) / 3600, tnow, validity, seconds_left) + logger.info(f"cache: check {proxyname} validity: wanted={limit}h ({limit * 3600 - 20 * 60}s with grace) " + f"left={float(seconds_left) / 3600:.2f}h (now={tnow} validity={validity} left={seconds_left}s)") + if seconds_left < limit * 3600 - 20 * 60: - diagnostics = 'cert/proxy is about to expire: %.2fh' % (float(seconds_left) / 3600) + diagnostics = f'cert/proxy is about to expire: {float(seconds_left) / 3600:.2f}h' logger.warning(diagnostics) exit_code = errors.CERTIFICATEHASEXPIRED if proxyname == 'cert' else errors.VOMSPROXYABOUTTOEXPIRE else: @@ -299,25 +287,23 @@ def verify_vomsproxy(envsetup, limit): return exit_code, diagnostics -def verify_gridproxy(envsetup, limit): +def verify_gridproxy(envsetup: str, limit: int) -> (int, str): """ Verify proxy using grid-proxy-info command. - :param envsetup: general setup string for proxy commands (string). - :param limit: time limit in hours (int). - :return: exit code (int), error diagnostics (string). + :param envsetup: general setup string for proxy commands (str) + :param limit: time limit in hours (int) + :return: exit code (int), error diagnostics (str). 
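A worked example, with assumed numbers, of the grace-period comparison used in check_time_left() above:

limit = 48                              # required proxy validity (hours)
seconds_left = 47 * 3600                # assumed time left reported for the proxy

threshold = limit * 3600 - 20 * 60      # required seconds minus a 20 minute grace period
about_to_expire = seconds_left < threshold
# threshold = 171600 s and seconds_left = 169200 s, so about_to_expire is True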
""" - ec = 0 diagnostics = "" if limit: # next clause had problems: grid-proxy-info -exists -valid 0.166666666667:00 - #cmd = "%sgrid-proxy-info -exists -valid %s:00" % (envsetup, str(limit)) # more accurate calculation of HH:MM limit_hours = int(limit * 60) / 60 limit_minutes = int(limit * 60 + .999) - limit_hours * 60 - cmd = "%sgrid-proxy-info -exists -valid %d:%02d" % (envsetup, limit_hours, limit_minutes) + cmd = f"{envsetup}grid-proxy-info -exists -valid {limit_hours}:{limit_minutes:02}" else: cmd = f"{envsetup}grid-proxy-info -exists -valid 24:00" @@ -340,7 +326,7 @@ def verify_gridproxy(envsetup, limit): return ec, diagnostics -def interpret_proxy_info(_ec, stdout, stderr, limit): +def interpret_proxy_info(_ec: int, stdout: str, stderr: str, limit: int) -> (int, str, int, int): """ Interpret the output from arcproxy or voms-proxy-info. @@ -348,16 +334,15 @@ def interpret_proxy_info(_ec, stdout, stderr, limit): :param stdout: stdout from proxy command (string). :param stderr: stderr from proxy command (string). :param limit: time limit in hours (int). - :return: exit code (int), diagnostics (string). validity end cert, validity end in seconds if detected, None if not detected (int). + :return: exit code (int), diagnostics (str). validity end cert (int), validity end in seconds if detected, None if not detected (int). """ - exitcode = 0 diagnostics = "" validity_end = None # not detected validity_end_cert = None # not detected - logger.debug('stdout = %s', stdout) - logger.debug('stderr = %s', stderr) + logger.debug(f'stdout = {stdout}') + logger.debug(f'stderr = {stderr}') if _ec != 0: if "Unable to verify signature! Server certificate possibly not installed" in stdout: @@ -413,15 +398,14 @@ def interpret_proxy_info(_ec, stdout, stderr, limit): return exitcode, diagnostics, validity_end_cert, validity_end -def extract_time_left(stdout): +def extract_time_left(stdout: str) -> (int, int, str): """ Extract the time left for the cert and proxy from the proxy command. Some processing on the stdout is done. - :param stdout: stdout (string). - :return: validity_end_cert, validity_end, stdout (int, string)) + :param stdout: stdout (str) + :return: validity_end_cert, validity_end, stdout (int, int, str) """ - validity_end_cert = None validity_end = None @@ -456,15 +440,15 @@ def extract_time_left(stdout): return validity_end_cert, validity_end, stdout -def extract_time_left_old(stdout): +def extract_time_left_old(stdout: str) -> (int, str): """ Extract the time left from the proxy command. + Some processing on the stdout is done. - :param stdout: stdout (string). - :return: validity_end, stdout (int, string)) + :param stdout: stdout (str) + :return: validity_end, stdout (int, str). """ - validity_end = None # remove the last \n in case there is one @@ -489,11 +473,11 @@ def extract_time_left_old(stdout): return validity_end, stdout -def getproxy_dictionary(voms_role): +def getproxy_dictionary(voms_role: str) -> dict: """ - Prepare the dictionary for the getProxy call. + Prepare the dictionary with the VOMS role and DN for the getProxy call. - :param voms_role: VOMS role (string). + :param voms_role: VOMS role (str) + :return: getProxy dictionary (dict). 
""" - return {'role': voms_role, 'dn': 'atlpilo2'} if voms_role == 'atlas' else {'role': voms_role} diff --git a/pilot/user/atlas/resource/grid.py b/pilot/user/atlas/resource/grid.py index e1b7ae36..7e6d22d0 100644 --- a/pilot/user/atlas/resource/grid.py +++ b/pilot/user/atlas/resource/grid.py @@ -19,24 +19,30 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 -from pilot.util.container import execute -from pilot.common.errorcodes import ErrorCodes -from ..setup import get_asetup, get_asetup_options +"""Default grid resources.""" import logging +from typing import Any + +from pilot.common.errorcodes import ErrorCodes +from pilot.util.container import execute +from ..setup import ( + get_asetup, + get_asetup_options +) + logger = logging.getLogger(__name__) errors = ErrorCodes() -def verify_setup_command(cmd): +def verify_setup_command(cmd: str) -> (int, str): """ Verify the setup command (containerised). - :param cmd: command string to be verified (string). - :return: pilot error code (int), diagnostics (string). + :param cmd: command string to be verified (str) + :return: pilot error code (int), diagnostics (str). """ - diagnostics = "" exit_code, stdout, stderr = execute(cmd, timeout=5 * 60) @@ -52,32 +58,32 @@ def verify_setup_command(cmd): return exit_code, diagnostics -def get_setup_command(job, prepareasetup): +def get_setup_command(job: Any, prepareasetup: bool = True) -> str: """ Return the path to asetup command, the asetup command itself and add the options (if desired). + If prepareasetup is False, the function will only return the path to the asetup script. It is then assumed to be part of the job parameters. - :param job: job object. - :param prepareasetup: should the pilot prepare the asetup command itself? boolean. - :return: + :param job: job object (Any) + :param prepareasetup: should the pilot prepare the asetup command itself? (bool) + :return: command string (str). """ - # if cvmfs is not available, assume that asetup is not needed # note that there is an exception for sites (BOINC, some HPCs) that have cvmfs but still # uses is_cvmfs=False.. 
these sites do not use containers, so check for that instead if job.infosys.queuedata.is_cvmfs or not job.infosys.queuedata.container_type: - logger.debug('return asetup path as normal since: is_cvmfs=%s, job.container_type=%s' % - (job.infosys.queuedata.is_cvmfs, job.infosys.queuedata.container_type)) + logger.debug(f'return asetup path as normal since: is_cvmfs={job.infosys.queuedata.is_cvmfs}, ' + f'job.container_type={job.infosys.queuedata.container_type}') else: # if not job.infosys.queuedata.is_cvmfs: - logger.debug('will not return asetup path since: is_cvmfs=%s, job.container_type=%s' % - (job.infosys.queuedata.is_cvmfs, job.infosys.queuedata.container_type)) + logger.debug(f'will not return asetup path since: is_cvmfs={job.infosys.queuedata.is_cvmfs}, ' + f'job.container_type={job.infosys.queuedata.container_type}') return "" # return immediately if there is no release or if user containers are used # if job.swrelease == 'NULL' or (('--containerImage' in job.jobparams or job.imagename) and job.swrelease == 'NULL'): - if job.swrelease == 'NULL' or job.swrelease == '': + if job.swrelease in {'NULL', ''}: logger.debug('will not return asetup path since there is no swrelease set') return "" diff --git a/pilot/user/atlas/resource/jumbojobs.py b/pilot/user/atlas/resource/jumbojobs.py index c6b79ab6..dda665d5 100644 --- a/pilot/user/atlas/resource/jumbojobs.py +++ b/pilot/user/atlas/resource/jumbojobs.py @@ -18,3 +18,5 @@ # # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 + +"""Resource related functions for jumbo jobs (nothing so far).""" diff --git a/pilot/user/atlas/resource/manytoone.py b/pilot/user/atlas/resource/manytoone.py index d0895fd4..6ca1e787 100644 --- a/pilot/user/atlas/resource/manytoone.py +++ b/pilot/user/atlas/resource/manytoone.py @@ -19,34 +19,37 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +"""Resource related functions for many-to-one jobs.""" + +import logging import os +from typing import Any # from pilot.util.container import execute from pilot.common.errorcodes import ErrorCodes -import logging logger = logging.getLogger(__name__) errors = ErrorCodes() -def verify_setup_command(cmd): +def verify_setup_command(cmd: str) -> (int, str): """ Verify the setup command. - :param cmd: command string to be verified (string). - :return: pilot error code (int), diagnostics (string). + :param cmd: command string to be verified (str) + :return: pilot error code (int), diagnostics (str). """ + if not cmd: + logger.debug('cmd is not used by this function') - ec = 0 - diagnostics = "" + return 0, "" - return ec, diagnostics - -def get_setup_command(job, prepareasetup): +def get_setup_command(job: Any, prepareasetup: bool) -> str: """ Return the path to asetup command, the asetup command itself and add the options (if desired). + If prepareasetup is False, the function will only return the path to the asetup script. It is then assumed to be part of the job parameters. @@ -54,22 +57,23 @@ def get_setup_command(job, prepareasetup): HARVESTER_CONTAINER_RELEASE_SETUP_FILE, HARVESTER_LD_LIBRARY_PATH, HARVESTER_PYTHONPATH This will create the string need for the pilot to execute to setup the environment. - :param job: job object. - :param prepareasetup: not used. - :return: setup command (string). + :param job: job object (Any) + :param prepareasetup: not used (bool) + :return: setup command (str). 
""" - + if not prepareasetup: + logger.debug('prepareasetup is not used by this function') cmd = "" # return immediately if there is no release or if user containers are used if job.swrelease == 'NULL' or '--containerImage' in job.jobparams: - logger.debug('get_setup_command return value: {}'.format(str(cmd))) + logger.debug(f'get_setup_command return value: {cmd}') return cmd # test if environmental variable HARVESTER_CONTAINER_RELEASE_SETUP_FILE is defined setupfile = os.environ.get('HARVESTER_CONTAINER_RELEASE_SETUP_FILE', '') if setupfile != "": - cmd = "source {};".format(setupfile) + cmd = f"source {setupfile};" # test if HARVESTER_LD_LIBRARY_PATH is defined if os.environ.get('HARVESTER_LD_LIBRARY_PATH', '') != "": cmd += "export LD_LIBRARY_PATH=$HARVESTER_LD_LIBRARY_PATH:$LD_LIBRARY_PATH;" @@ -79,6 +83,6 @@ def get_setup_command(job, prepareasetup): #unset FRONTIER_SERVER variable cmd += "unset FRONTIER_SERVER" - logger.debug('get_setup_command return value: {}'.format(str(cmd))) + logger.debug(f'get_setup_command return value: {cmd}') return cmd diff --git a/pilot/user/atlas/resource/nersc.py b/pilot/user/atlas/resource/nersc.py index 3f853262..d1123457 100644 --- a/pilot/user/atlas/resource/nersc.py +++ b/pilot/user/atlas/resource/nersc.py @@ -19,34 +19,36 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +"""Resource related functions for NERSC.""" + +import logging import os +from typing import Any # from pilot.util.container import execute from pilot.common.errorcodes import ErrorCodes -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() -def verify_setup_command(cmd): +def verify_setup_command(cmd: str) -> (int, str): """ Verify the setup command. :param cmd: command string to be verified (string). :return: pilot error code (int), diagnostics (string). """ + if not cmd: + logger.debug('cmd is not used by this function') - ec = 0 - diagnostics = "" + return 0, "" - return ec, diagnostics - -def get_setup_command(job, prepareasetup): +def get_setup_command(job: Any, prepareasetup: bool) -> str: """ Return the path to asetup command, the asetup command itself and add the options (if desired). + If prepareasetup is False, the function will only return the path to the asetup script. It is then assumed to be part of the job parameters. @@ -54,22 +56,23 @@ def get_setup_command(job, prepareasetup): HARVESTER_CONTAINER_RELEASE_SETUP_FILE, HARVESTER_LD_LIBRARY_PATH, HARVESTER_PYTHONPATH This will create the string need for the pilot to execute to setup the environment. - :param job: job object. - :param prepareasetup: not used. - :return: setup command (string). + :param job: job object (Any) + :param prepareasetup: not used (bool) + :return: setup command (str). 
""" - + if not prepareasetup: + logger.debug('prepareasetup is not used by this function') cmd = "" # return immediately if there is no release or if user containers are used if job.swrelease == 'NULL' or '--containerImage' in job.jobparams: - logger.debug('get_setup_command return value: {0}'.format(str(cmd))) + logger.debug(f'get_setup_command return value: {cmd}') return cmd # test if environmental variable HARVESTER_CONTAINER_RELEASE_SETUP_FILE is defined setupfile = os.environ.get('HARVESTER_CONTAINER_RELEASE_SETUP_FILE', '') if setupfile != "": - cmd = "source {};".format(setupfile) + cmd = f"source {setupfile};" # test if HARVESTER_LD_LIBRARY_PATH is defined if os.environ.get('HARVESTER_LD_LIBRARY_PATH', '') != "": cmd += "export LD_LIBRARY_PATH=$HARVESTER_LD_LIBRARY_PATH:$LD_LIBRARY_PATH;" @@ -84,6 +87,6 @@ def get_setup_command(job, prepareasetup): "(serverurl=http://atlasfrontier1-ai.cern.ch:8000/atlr)" "(proxyurl=http://frontiercache.nersc.gov:3128)\"") - logger.debug('get_setup_command return value: {0}'.format(str(cmd))) + logger.debug(f'get_setup_command return value: {cmd}') return cmd diff --git a/pilot/user/atlas/setup.py b/pilot/user/atlas/setup.py index cf178b9f..c2616383 100644 --- a/pilot/user/atlas/setup.py +++ b/pilot/user/atlas/setup.py @@ -91,8 +91,8 @@ def get_alrb_export(add_if=False): :return: export command """ - path = "%s/atlas.cern.ch/repo" % get_file_system_root_path() - cmd = "export ATLAS_LOCAL_ROOT_BASE=%s/ATLASLocalRootBase;" % path if os.path.exists(path) else "" + path = f"{get_file_system_root_path()}/atlas.cern.ch/repo" + cmd = f"export ATLAS_LOCAL_ROOT_BASE={path}/ATLASLocalRootBase;" if os.path.exists(path) else "" # if [ -z "$ATLAS_LOCAL_ROOT_BASE" ]; then export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; fi; if cmd and add_if: @@ -132,11 +132,11 @@ def get_asetup(asetup=True, alrb=False, add_if=False): if appdir != "": # make sure that the appdir exists if not os.path.exists(appdir): - msg = 'appdir does not exist: %s' % appdir + msg = f'appdir does not exist: {appdir}' logger.warning(msg) raise NoSoftwareDir(msg) if asetup: - cmd = "source %s/scripts/asetup.sh" % appdir + cmd = f"source {appdir}/scripts/asetup.sh" # do not return an empty string #if not cmd: @@ -160,7 +160,7 @@ def get_asetup_options(release, homepackage): if 'AnalysisTransforms' in homepackage: _homepackage = re.sub('^AnalysisTransforms-*', '', homepackage) - if _homepackage == '' or re.search(r'^\d+\.\d+\.\d+$', release) is None: # Python 3 (added r) + if _homepackage == '' or re.search(r'^\d+\.\d+\.\d+$', release) is None: if release != "": asetupopt.append(release) if _homepackage != '': @@ -210,7 +210,7 @@ def set_inds(dataset): inds = ds break if inds != "": - logger.info("setting INDS environmental variable to: %s", inds) + logger.info(f"setting INDS environmental variable to: {inds}") os.environ['INDS'] = inds else: logger.warning("INDS unknown") @@ -233,25 +233,23 @@ def get_analysis_trf(transform, workdir): # test if $HARVESTER_WORKDIR is set harvester_workdir = os.environ.get('HARVESTER_WORKDIR') if harvester_workdir is not None: - search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir - logger.debug("search_pattern - %s", search_pattern) + search_pattern = f"{harvester_workdir}/jobO.*.tar.gz" jobopt_files = glob.glob(search_pattern) for jobopt_file in jobopt_files: - logger.debug("jobopt_file = %s workdir = %s", jobopt_file, workdir) try: copy(jobopt_file, workdir) except Exception as error: - logger.error("could not copy file %s to %s : 
%s", jobopt_file, workdir, error) + logger.error(f"could not copy file {jobopt_file} to {workdir} : {error}") if '/' in transform: transform_name = transform.split('/')[-1] else: - logger.warning('did not detect any / in %s (using full transform name)', transform) + logger.warning(f'did not detect any / in {transform} (using full transform name)') transform_name = transform # is the command already available? (e.g. if already downloaded by a preprocess/main process step) if os.path.exists(os.path.join(workdir, transform_name)): - logger.info('script %s is already available - no need to download again', transform_name) + logger.info(f'script {transform_name} is already available - no need to download again') return ec, diagnostics, transform_name original_base_url = "" @@ -263,14 +261,14 @@ def get_analysis_trf(transform, workdir): break if original_base_url == "": - diagnostics = "invalid base URL: %s" % transform + diagnostics = f"invalid base URL: {transform}" return errors.TRFDOWNLOADFAILURE, diagnostics, "" # try to download from the required location, if not - switch to backup status = False for base_url in get_valid_base_urls(order=original_base_url): trf = re.sub(original_base_url, base_url, transform) - logger.debug("attempting to download script: %s", trf) + logger.debug(f"attempting to download script: {trf}") status, diagnostics = download_transform(trf, transform_name, workdir) if status: break @@ -280,11 +278,11 @@ def get_analysis_trf(transform, workdir): logger.info("successfully downloaded script") path = os.path.join(workdir, transform_name) - logger.debug("changing permission of %s to 0o755", path) + logger.debug(f"changing permission of {path} to 0o755") try: - os.chmod(path, 0o755) # Python 2/3 + os.chmod(path, 0o755) except Exception as error: - diagnostics = "failed to chmod %s: %s" % (transform_name, error) + diagnostics = f"failed to chmod {transform_name}: {error}" return errors.CHMODTRF, diagnostics, "" return ec, diagnostics, transform_name @@ -302,7 +300,7 @@ def download_transform(url, transform_name, workdir): status = False diagnostics = "" path = os.path.join(workdir, transform_name) - cmd = 'curl -sS \"%s\" > %s' % (url, path) + cmd = f'curl -sS "{url}" > {path}' trial = 1 max_trials = 3 @@ -317,29 +315,29 @@ def download_transform(url, transform_name, workdir): status = True except Exception as error: status = False - diagnostics = "Failed to copy file %s to %s : %s" % (source_path, path, error) + diagnostics = f"Failed to copy file {source_path} to {path} : {error}" logger.error(diagnostics) # try to download the trf a maximum of 3 times while trial <= max_trials: - logger.info("executing command [trial %d/%d]: %s", trial, max_trials, cmd) + logger.info(f"executing command [trial {trial}/{max_trials}]: {cmd}") exit_code, stdout, stderr = execute(cmd, mute=True) if not stdout: stdout = "(None)" if exit_code != 0: # Analyze exit code / output - diagnostics = "curl command failed: %d, %s, %s" % (exit_code, stdout, stderr) + diagnostics = f"curl command failed: {exit_code}, {stdout}, {stderr}" logger.warning(diagnostics) if trial == max_trials: - logger.fatal('could not download transform: %s', stdout) + logger.fatal(f'could not download transform: {stdout}') status = False break else: logger.info("will try again after 60 s") sleep(60) else: - logger.info("curl command returned: %s", stdout) + logger.info(f"curl command returned: {stdout}") status = True break trial += 1 @@ -393,13 +391,13 @@ def get_payload_environment_variables(cmd, job_id, task_id, 
attempt_nr, processi """ variables = [] - variables.append('export PANDA_RESOURCE=\'%s\';' % site_name) - variables.append('export FRONTIER_ID=\"[%s_%s]\";' % (task_id, job_id)) + variables.append(f'export PANDA_RESOURCE=\'{site_name}\';') + variables.append(f'export FRONTIER_ID="[{task_id}_{job_id}]";') variables.append('export CMSSW_VERSION=$FRONTIER_ID;') - variables.append('export PandaID=%s;' % os.environ.get('PANDAID', 'unknown')) - variables.append('export PanDA_TaskID=\'%s\';' % os.environ.get('PanDA_TaskID', 'unknown')) - variables.append('export PanDA_AttemptNr=\'%d\';' % attempt_nr) - variables.append('export INDS=\'%s\';' % os.environ.get('INDS', 'unknown')) + variables.append(f"export PandaID={os.environ.get('PANDAID', 'unknown')};") + variables.append(f"export PanDA_TaskID='{os.environ.get('PanDA_TaskID', 'unknown')}';") + variables.append(f'export PanDA_AttemptNr=\'{attempt_nr}\';') + variables.append(f"export INDS='{os.environ.get('INDS', 'unknown')}';") # Unset ATHENA_PROC_NUMBER if set for event service Merge jobs if "Merge_tf" in cmd and 'ATHENA_PROC_NUMBER' in os.environ: @@ -413,14 +411,14 @@ def get_payload_environment_variables(cmd, job_id, task_id, attempt_nr, processi except Exception: _core_count = 'export ROOTCORE_NCPUS=1;' else: - _core_count = 'export ROOTCORE_NCPUS=%d;' % core_count + _core_count = f'export ROOTCORE_NCPUS={core_count};' variables.append(_core_count) if processing_type == "": logger.warning("RUCIO_APPID needs job.processingType but it is not set!") else: - variables.append('export RUCIO_APPID=\'%s\';' % processing_type) - variables.append('export RUCIO_ACCOUNT=\'%s\';' % os.environ.get('RUCIO_ACCOUNT', 'pilot')) + variables.append(f'export RUCIO_APPID=\'{processing_type}\';') + variables.append(f"export RUCIO_ACCOUNT='{os.environ.get('RUCIO_ACCOUNT', 'pilot')}';") return variables @@ -471,7 +469,7 @@ def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""): # if turl.startswith('root://') and turl not in cmd: if turl not in cmd: cmd = cmd.replace(inputfile, turl) - logger.info("replaced '%s' with '%s' in the run command", inputfile, turl) + logger.info(f"replaced '{inputfile}' with '{turl}' in the run command") # replace the LFNs with TURLs in the writetofile input file list (if it exists) if writetofile and turl_dictionary: @@ -494,9 +492,9 @@ def replace_lfns_with_turls(cmd, workdir, filename, infiles, writetofile=""): if lines: write_file(path, lines) else: - logger.warning("file does not exist: %s", path) + logger.warning(f"file does not exist: {path}") else: - logger.warning("could not find file: %s (cannot locate TURLs for direct access)", filename) + logger.warning(f"could not find file: {filename} (cannot locate TURLs for direct access)") return cmd diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index cc25f1de..ca43c18b 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -29,6 +29,7 @@ from pilot.util.filehandling import read_json, copy, write_json, remove from pilot.util.parameters import convert_to_int from pilot.util.processes import is_process_running +from pilot.util.psutils import get_command_by_pid import logging logger = logging.getLogger(__name__) @@ -95,7 +96,7 @@ def get_memory_monitor_output_filename(suffix='txt'): :return: File name (string). 
""" - return "memory_monitor_output.%s" % suffix + return f"memory_monitor_output.{suffix}" def get_memory_monitor_setup(pid, pgrp, jobid, workdir, command, setup="", use_container=True, transformation="", outdata=None, dump_ps=False): @@ -132,8 +133,8 @@ def get_memory_monitor_setup(pid, pgrp, jobid, workdir, command, setup="", use_c cmd = "prmon" interval = 60 - options = " --pid %d --filename %s --json-summary %s --interval %d" %\ - (pid, get_memory_monitor_output_filename(), get_memory_monitor_summary_filename(), interval) + options = f" --pid {pid} --filename {get_memory_monitor_output_filename()} " \ + f"--json-summary {get_memory_monitor_summary_filename()} --interval {interval}" cmd = "cd " + workdir + ";" + setup + cmd + options return cmd, pid @@ -173,24 +174,16 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", return -1 ps = get_ps_info(pgrp) - #logger.debug('ps:\n%s' % ps) - #_pid = os.getpid() - #logger.debug(f'current pid={_pid}') - #logger.debug(f'current ppid={os.getppid()}') # /bin/bash parent process (parent to pilot and prmon, ..) # lookup the process id using ps aux logger.debug(f'attempting to identify pid from job id ({jobid})') _pid = get_pid_for_jobid(ps, jobid) if _pid: logger.debug(f'discovered pid={_pid} for job id {jobid}') + cmd = get_command_by_pid(_pid) + logger.debug(f'command for pid {_pid}: {cmd}') break - #logger.debug('attempting to identify pid from transform name and its output') - #_pid = get_pid_for_trf(ps, transformation, outdata) if outdata else None - #if _pid: - # logger.debug('discovered pid=%d for transform name \"%s\"' % (_pid, transformation)) - # break - logger.warning(f'payload pid has not yet been identified (#{i + 1}/#{imax})') # wait until the payload has launched @@ -200,7 +193,7 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", if _pid: pid = _pid - logger.info(f'will use pid={pid} for memory monitor') + logger.info(f'will use pid {pid} for memory monitor') return pid @@ -217,11 +210,7 @@ def get_ps_info(pgrp, whoami=None, options='axfo pid,user,args'): if not whoami: whoami = os.getuid() - cmd = "ps -u %s %s" % (whoami, options) - #cmd = "ps %s | grep %s" % (options, whoami) - #cmd = "ps %s | grep %s | awk -v p=%s '$1 == p {print $5}" % (options, whoami, pgrp) - #cmd = "ps %s | awk -v p=%s '$1 == p {print $5}" % (options, pgrp) - exit_code, stdout, stderr = execute(cmd) + exit_code, stdout, stderr = execute(f"ps -u {whoami} {options}") return stdout @@ -243,10 +232,10 @@ def get_pid_for_jobid(ps, jobid): _pid = search(r'(\d+) ', line) try: pid = int(_pid.group(1)) - except Exception as e: - logger.warning('pid has wrong type: %s' % e) + except Exception as exc: + logger.warning(f'pid has wrong type: {exc}') else: - logger.debug('extracted pid=%d from ps output' % pid) + logger.debug(f'extracted pid {pid} from ps output') break return pid @@ -269,7 +258,7 @@ def get_pid_for_trf(ps, transformation, outdata): # in the case of user analysis job, the transformation will contain a URL which should be stripped if "/" in transformation: transformation = transformation.split('/')[-1] - logger.debug('using transformation name: %s' % transformation) + logger.debug(f'using transformation name: {transformation}') for line in ps.split('\n'): if transformation in line: candidates.append(line) @@ -283,15 +272,15 @@ def get_pid_for_trf(ps, transformation, outdata): _pid = search(r'(\d+) ', line) try: pid = int(_pid.group(1)) - except Exception as e: - logger.warning('pid has wrong type: 
%s' % e) + except Exception as exc: + logger.warning(f'pid has wrong type: {exc}') else: - logger.debug('extracted pid=%d from ps output' % pid) + logger.debug(f'extracted pid {pid} from ps output') break if pid: break else: - logger.debug('pid not found in ps output for trf=%s' % transformation) + logger.debug(f'pid not found in ps output for trf={transformation}') return pid @@ -319,12 +308,12 @@ def get_pid_for_command(ps, command="python pilot3/pilot.py"): _pid = search(r'(\d+) ', found) try: pid = int(_pid.group(1)) - except Exception as e: - logger.warning('pid has wrong type: %s' % e) + except Exception as exc: + logger.warning(f'pid has wrong type: {exc}') else: - logger.debug('extracted pid=%d from ps output: %s' % (pid, found)) + logger.debug(f'extracted pid {pid} from ps output: {found}') else: - logger.debug('command not found in ps output: %s' % command) + logger.debug(f'command not found in ps output: {command}') return pid @@ -376,13 +365,13 @@ def get_memory_monitor_info_path(workdir, allowtxtfile=False): if os.path.exists(init_path): path = init_path else: - logger.info("neither %s, nor %s exist" % (path, init_path)) + logger.info(f"neither {path}, nor {init_path} exist") path = "" if path == "" and allowtxtfile: path = os.path.join(workdir, get_memory_monitor_output_filename()) if not os.path.exists(path): - logger.warning("file does not exist either: %s" % (path)) + logger.warning(f"file does not exist either: {path}") return path @@ -403,11 +392,11 @@ def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 # Note that only the final json file will contain the totRBYTES, etc try: summary_dictionary = get_memory_values(workdir, name=name) - except Exception as e: - logger.warning('failed to get memory values from memory monitor tool: %s' % e) + except Exception as exc: + logger.warning(f'failed to get memory values from memory monitor tool: {exc}') summary_dictionary = {} else: - logger.debug("summary_dictionary=%s" % str(summary_dictionary)) + logger.debug(f"summary_dictionary={summary_dictionary}") # Fill the node dictionary if summary_dictionary and summary_dictionary != {}: @@ -428,8 +417,8 @@ def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 node['avgVMEM'] = summary_dictionary['Avg']['avgVMEM'] node['avgSWAP'] = summary_dictionary['Avg']['avgSwap'] node['avgPSS'] = summary_dictionary['Avg']['avgPSS'] - except Exception as e: - logger.warning("exception caught while parsing memory monitor file: %s" % e) + except Exception as exc: + logger.warning(f"exception caught while parsing memory monitor file: {exc}") logger.warning("will add -1 values for the memory info") node['maxRSS'] = -1 node['maxVMEM'] = -1 @@ -464,8 +453,8 @@ def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 node['avgVMEM'] = summary_dictionary['Avg']['vmem'] node['avgSWAP'] = summary_dictionary['Avg']['swap'] node['avgPSS'] = summary_dictionary['Avg']['pss'] - except Exception as e: - logger.warning("exception caught while parsing prmon file: %s" % e) + except Exception as exc: + logger.warning(f"exception caught while parsing prmon file: {exc}") logger.warning("will add -1 values for the memory info") node['maxRSS'] = -1 node['maxVMEM'] = -1 @@ -512,8 +501,8 @@ def get_max_memory_monitor_value(value, maxvalue, totalvalue): # noqa: C90 ec = 0 try: value_int = int(value) - except Exception as e: - logger.warning("exception caught: %s" % e) + except Exception as exc: + logger.warning(f"exception caught: {exc}") ec = 1 
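A small illustration of the pid extraction used by the helpers above, on a fabricated ps output line:

from re import search

line = "  12345 user      python3 pilot3/pilot.py --queue SOME_QUEUE"
match = search(r'(\d+) ', line)
pid = int(match.group(1)) if match else None
# pid == 12345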
else: totalvalue += value_int @@ -618,7 +607,7 @@ def get_metadata_dict_from_txt(path, storejson=False, jobid=None): dictionary['pandaid'] = jobid path = os.path.join(os.path.dirname(path), get_memory_monitor_output_filename(suffix='json')) - logger.debug('writing prmon dictionary to: %s' % path) + logger.debug(f'writing prmon dictionary to: {path}') write_json(path, dictionary) else: logger.debug('nothing to write (no prmon dictionary)') @@ -668,7 +657,7 @@ def convert_text_file_to_dictionary(path): value = convert_to_int(key) dictionary[key_entry].append(value) except Exception: - logger.warning("unexpected format of utility output: %s" % line) + logger.warning(f"unexpected format of utility output: {line}") return dictionary @@ -737,8 +726,8 @@ def get_average_summary_dictionary(path): rbytes = None wbytes = None except Exception: - logger.warning("unexpected format of utility output: %s (expected format: Time, VMEM," - " PSS, RSS, Swap [, RCHAR, WCHAR, RBYTES, WBYTES])" % (line)) + logger.warning(f"unexpected format of utility output: {line} (expected format: Time, VMEM, PSS, " + f"RSS, Swap [, RCHAR, WCHAR, RBYTES, WBYTES])") else: # Convert to int ec1, maxvmem, totalvmem = get_max_memory_monitor_value(vmem, maxvmem, totalvmem) @@ -746,7 +735,7 @@ def get_average_summary_dictionary(path): ec3, maxrss, totalrss = get_max_memory_monitor_value(rss, maxrss, totalrss) ec4, maxswap, totalswap = get_max_memory_monitor_value(swap, maxswap, totalswap) if ec1 or ec2 or ec3 or ec4: - logger.warning("will skip this row of numbers due to value exception: %s" % (line)) + logger.warning(f"will skip this row of numbers due to value exception: {line}") else: n += 1 @@ -793,7 +782,7 @@ def get_memory_values(workdir, name=""): # Get the path to the proper memory info file (priority ordered) path = get_memory_monitor_info_path(workdir, allowtxtfile=True) if os.path.exists(path): - logger.info("using path: %s (trf name=%s)" % (path, name)) + logger.info(f"using path: {path} (trf name={name})") # Does a JSON summary file exist? 
If so, there's no need to calculate maximums and averages in the pilot if path.lower().endswith('json'): @@ -805,7 +794,7 @@ def get_memory_values(workdir, name=""): summary_dictionary = get_average_summary_dictionary_prmon(path) else: summary_dictionary = get_average_summary_dictionary(path) - logger.debug('summary_dictionary=%s (trf name=%s)' % (str(summary_dictionary), name)) + logger.debug(f'summary_dictionary={str(summary_dictionary)} (trf name={name})') else: if path == "": logger.warning("filename not set for memory monitor output") @@ -827,20 +816,20 @@ def post_memory_monitor_action(job): nap = 3 path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename()) path2 = os.environ.get('PILOT_HOME') - i = 0 + counter = 0 maxretry = 20 - while i <= maxretry: + while counter <= maxretry: if os.path.exists(path1): break - logger.info("taking a short nap (%d s) to allow the memory monitor to finish writing to the summary file (#%d/#%d)" - % (nap, i, maxretry)) + logger.info(f"taking a short nap ({nap} s) to allow the memory monitor to finish writing to the summary " + f"file (#{counter}/#{maxretry})") time.sleep(nap) - i += 1 + counter += 1 try: copy(path1, path2) - except Exception as e: - logger.warning('failed to copy memory monitor output: %s' % e) + except Exception as exc: + logger.warning(f'failed to copy memory monitor output: {exc}') def precleanup(): @@ -853,7 +842,7 @@ def precleanup(): logger.debug('performing pre-cleanup of potentially pre-existing files from earlier job in main work dir') path = os.path.join(os.environ.get('PILOT_HOME'), get_memory_monitor_summary_filename()) if os.path.exists(path): - logger.info('removing no longer needed file: %s' % path) + logger.info(f'removing no longer needed file: {path}') remove(path) diff --git a/pilot/user/generic/common.py b/pilot/user/generic/common.py index f60bd077..a76c9ee5 100644 --- a/pilot/user/generic/common.py +++ b/pilot/user/generic/common.py @@ -74,7 +74,7 @@ def get_payload_command(job): if ec != 0: raise TrfDownloadFailure(diagnostics) else: - logger.debug('user analysis trf: %s' % trf_name) + logger.debug(f'user analysis trf: {trf_name}') return get_analysis_run_command(job, trf_name) @@ -94,16 +94,16 @@ def get_analysis_run_command(job, trf_name): # add the user proxy if 'X509_USER_PROXY' in os.environ and not job.imagename: - cmd += 'export X509_USER_PROXY=%s;' % os.environ.get('X509_USER_PROXY') + cmd += f"export X509_USER_PROXY={os.environ.get('X509_USER_PROXY')};" # set up trfs if job.imagename == "": # user jobs with no imagename defined - cmd += './%s %s' % (trf_name, job.jobparams) + cmd += f'./{trf_name} {job.jobparams}' else: if trf_name: - cmd += './%s %s' % (trf_name, job.jobparams) + cmd += f'./{trf_name} {job.jobparams}' else: - cmd += 'python %s %s' % (trf_name, job.jobparams) + cmd += f'python {trf_name} {job.jobparams}' return cmd diff --git a/pilot/user/generic/copytool_definitions.py b/pilot/user/generic/copytool_definitions.py index 37afb24a..7a9ef40d 100644 --- a/pilot/user/generic/copytool_definitions.py +++ b/pilot/user/generic/copytool_definitions.py @@ -45,7 +45,7 @@ def get_path(scope, lfn): :return: partial rucio path (string). 
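# Illustrative evaluation of the deterministic path construction in get_path() below
# (the scope and LFN are made-up values):
#
#   scope, lfn = 'user.jdoe', 'file.root'
#   h = md5(b'user.jdoe:file.root').hexdigest()
#   # paths == ['user', 'jdoe', h[0:2], h[2:4], 'file.root'] before being joined
#   # into the partial rucio path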
""" - s = '%s:%s' % (scope, lfn) + s = f'{scope}:{lfn}' hash_hex = md5(s.encode('utf-8')).hexdigest() paths = scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] paths = [_f for _f in paths if _f] # remove empty parts to avoid double /-chars diff --git a/pilot/user/generic/cpu.py b/pilot/user/generic/cpu.py index 6f2bdc6f..c6ed1b4b 100644 --- a/pilot/user/generic/cpu.py +++ b/pilot/user/generic/cpu.py @@ -58,20 +58,20 @@ def set_core_counts(**kwargs): job = kwargs.get('job', None) if job and job.pgrp: - cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (job.pgrp, job.pgrp) + cmd = f"ps axo pgid,psr | sort | grep {job.pgrp} | uniq | awk '{{print $1}}' | grep -x {job.pgrp} | wc -l" exit_code, stdout, stderr = execute(cmd, mute=True) - logger.debug('%s: %s' % (cmd, stdout)) + logger.debug(f'{cmd}: {stdout}') try: job.actualcorecount = int(stdout) - except Exception as e: - logger.warning('failed to convert number of actual cores to int: %s' % e) + except Exception as exc: + logger.warning(f'failed to convert number of actual cores to int: {exc}') else: - logger.debug('set number of actual cores to: %d' % job.actualcorecount) + logger.debug(f'set number of actual cores to: {job.actualcorecount}') # overwrite the original core count and add it to the list job.corecount = job.actualcorecount job.corecounts = add_core_count(job.actualcorecount) - logger.debug('current core counts list: %s' % str(job.corecounts)) + logger.debug(f'current core counts list: {job.corecounts}') else: logger.debug('payload process group not set - cannot check number of cores used by payload') diff --git a/pilot/user/generic/diagnose.py b/pilot/user/generic/diagnose.py index 3fcff7d5..5cffd4f2 100644 --- a/pilot/user/generic/diagnose.py +++ b/pilot/user/generic/diagnose.py @@ -65,7 +65,7 @@ def get_log_extracts(job, state): extracts = "" _extracts = get_pilot_log_extracts(job) if _extracts != "": - logger.warning('detected the following tail of warning/fatal messages in the pilot log:\n%s' % _extracts) + logger.warning(f'detected the following tail of warning/fatal messages in the pilot log:\n{_extracts}') if state == 'failed' or state == 'holding': extracts += _extracts @@ -89,9 +89,9 @@ def get_pilot_log_extracts(job): if _tail != "": if extracts != "": extracts += "\n" - extracts += "- Log from %s -\n" % config.Pilot.pilotlog + extracts += f"- Log from {config.Pilot.pilotlog} -\n" extracts += _tail else: - logger.warning('pilot log file does not exist: %s' % path) + logger.warning(f'pilot log file does not exist: {path}') return extracts diff --git a/pilot/user/generic/jobdata.py b/pilot/user/generic/jobdata.py index 41c657d0..9a56993a 100644 --- a/pilot/user/generic/jobdata.py +++ b/pilot/user/generic/jobdata.py @@ -16,42 +16,58 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-24 -#import re +"""Functions related to job data.""" #import logging +#import re #logger = logging.getLogger(__name__) -def jobparams_prefiltering(value): +def jobparams_prefiltering(value: str) -> (dict, str): """ Perform pre-filtering of raw job parameters to avoid problems with especially quotation marks. + The function can extract some fields from the job parameters to be put back later after actual filtering. E.g. ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" ' will otherwise become ' --athenaopts 'HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER' ' which will prevent the environmental variable to be unfolded. 
- :param value: job parameters (string). - :return: list of fields excluded from job parameters (list), updated job parameters (string). + :param value: job parameters (str) + :return: dictionary of fields excluded from job parameters (dict), updated job parameters (str). """ - exclusions = {} # Add regex patterns here - + # .. return exclusions, value -def jobparams_postfiltering(value, exclusions={}): +def jobparams_postfiltering(value: str, exclusions: dict = None) -> str: """ Perform post-filtering of raw job parameters. + Any items in the optional exclusion list will be added (space separated) at the end of the job parameters. - :param value: job parameters (string). - :param optional exclusions: exlusions dictionary from pre-filtering function (dictionary). - :return: updated job parameters (string). + :param value: job parameters (str) + :param exclusions: exclusions dictionary from pre-filtering function (dict) + :return: updated job parameters (str). """ + if exclusions is None: # avoid pylint warning + exclusions = {} + + for item in exclusions: + value = value.replace(item, exclusions[item]) return value + + +def fail_at_getjob_none() -> bool: + """ + Return a boolean value indicating whether to fail when getJob returns None. + + :return: True (bool). + """ + return True diff --git a/pilot/user/generic/setup.py b/pilot/user/generic/setup.py index 9429959d..1322e82c 100644 --- a/pilot/user/generic/setup.py +++ b/pilot/user/generic/setup.py @@ -52,25 +52,25 @@ def get_analysis_trf(transform, workdir): # test if $HARVESTER_WORKDIR is set harvester_workdir = os.environ.get('HARVESTER_WORKDIR') if harvester_workdir is not None: - search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir - logger.debug("search_pattern - %s" % search_pattern) + search_pattern = f"{harvester_workdir}/jobO.*.tar.gz" + logger.debug(f"search_pattern - {search_pattern}") jobopt_files = glob.glob(search_pattern) for jobopt_file in jobopt_files: - logger.debug("jobopt_file = %s workdir = %s" % (jobopt_file, workdir)) + logger.debug(f"jobopt_file = {jobopt_file} workdir = {workdir}") try: copy(jobopt_file, workdir) - except Exception as e: - logger.error("could not copy file %s to %s : %s" % (jobopt_file, workdir, e)) + except Exception as exc: + logger.error(f"could not copy file {jobopt_file} to {workdir} : {exc}") if '/' in transform: transform_name = transform.split('/')[-1] else: - logger.warning('did not detect any / in %s (using full transform name)' % transform) + logger.warning(f'did not detect any / in {transform} (using full transform name)') transform_name = transform # is the command already available? (e.g. 
if already downloaded by a preprocess/main process step) if os.path.exists(os.path.join(workdir, transform_name)): - logger.info('script %s is already available - no need to download again' % transform_name) + logger.info(f'script {transform_name} is already available - no need to download again') return ec, diagnostics, transform_name original_base_url = "" @@ -82,14 +82,14 @@ def get_analysis_trf(transform, workdir): break if original_base_url == "": - diagnostics = "invalid base URL: %s" % transform + diagnostics = f"invalid base URL: {transform}" return errors.TRFDOWNLOADFAILURE, diagnostics, "" # try to download from the required location, if not - switch to backup status = False for base_url in get_valid_base_urls(order=original_base_url): trf = re.sub(original_base_url, base_url, transform) - logger.debug("attempting to download script: %s" % trf) + logger.debug(f"attempting to download script: {trf}") status, diagnostics = download_transform(trf, transform_name, workdir) if status: break @@ -99,11 +99,11 @@ def get_analysis_trf(transform, workdir): logger.info("successfully downloaded script") path = os.path.join(workdir, transform_name) - logger.debug("changing permission of %s to 0o755" % path) + logger.debug(f"changing permission of {path} to 0o755") try: os.chmod(path, 0o755) # Python 2/3 - except Exception as e: - diagnostics = "failed to chmod %s: %s" % (transform_name, e) + except Exception as exc: + diagnostics = f"failed to chmod {transform_name}: {exc}" return errors.CHMODTRF, diagnostics, "" return ec, diagnostics, transform_name @@ -147,7 +147,7 @@ def download_transform(url, transform_name, workdir): status = False diagnostics = "" path = os.path.join(workdir, transform_name) - cmd = 'curl -sS \"%s\" > %s' % (url, path) + cmd = f'curl -sS "{url}" > {path}' trial = 1 max_trials = 3 @@ -162,29 +162,29 @@ def download_transform(url, transform_name, workdir): status = True except Exception as error: status = False - diagnostics = "Failed to copy file %s to %s : %s" % (source_path, path, error) + diagnostics = f"Failed to copy file {source_path} to {path} : {error}" logger.error(diagnostics) # try to download the trf a maximum of 3 times while trial <= max_trials: - logger.info("executing command [trial %d/%d]: %s" % (trial, max_trials, cmd)) + logger.info(f"executing command [trial {trial}/{max_trials}]: {cmd}") exit_code, stdout, stderr = execute(cmd, mute=True) if not stdout: stdout = "(None)" if exit_code != 0: # Analyze exit code / output - diagnostics = "curl command failed: %d, %s, %s" % (exit_code, stdout, stderr) + diagnostics = f"curl command failed: {exit_code}, {stdout}, {stderr}" logger.warning(diagnostics) if trial == max_trials: - logger.fatal('could not download transform: %s' % stdout) + logger.fatal(f'could not download transform: {stdout}') status = False break else: logger.info("will try again after 60 s") sleep(60) else: - logger.info("curl command returned: %s" % stdout) + logger.info(f"curl command returned: {stdout}") status = True break trial += 1 diff --git a/pilot/user/rubin/common.py b/pilot/user/rubin/common.py index 62704e8f..8f736223 100644 --- a/pilot/user/rubin/common.py +++ b/pilot/user/rubin/common.py @@ -75,7 +75,7 @@ def get_payload_command(job): if ec != 0: raise TrfDownloadFailure(diagnostics) else: - logger.debug('user analysis trf: %s' % trf_name) + logger.debug(f'user analysis trf: {trf_name}') return get_analysis_run_command(job, trf_name) @@ -95,16 +95,16 @@ def get_analysis_run_command(job, trf_name): # add the user proxy if 
'X509_USER_PROXY' in os.environ and not job.imagename: - cmd += 'export X509_USER_PROXY=%s;' % os.environ.get('X509_USER_PROXY') + cmd += f"export X509_USER_PROXY={os.environ.get('X509_USER_PROXY')};" # set up trfs if job.imagename == "": # user jobs with no imagename defined - cmd += './%s %s' % (trf_name, job.jobparams) + cmd += f'./{trf_name} {job.jobparams}' else: if trf_name: - cmd += './%s %s' % (trf_name, job.jobparams) + cmd += f'./{trf_name} {job.jobparams}' else: - cmd += 'python %s %s' % (trf_name, job.jobparams) + cmd += f'python {trf_name} {job.jobparams}' return cmd diff --git a/pilot/user/rubin/copytool_definitions.py b/pilot/user/rubin/copytool_definitions.py index 37afb24a..7a9ef40d 100644 --- a/pilot/user/rubin/copytool_definitions.py +++ b/pilot/user/rubin/copytool_definitions.py @@ -45,7 +45,7 @@ def get_path(scope, lfn): :return: partial rucio path (string). """ - s = '%s:%s' % (scope, lfn) + s = f'{scope}:{lfn}' hash_hex = md5(s.encode('utf-8')).hexdigest() paths = scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] paths = [_f for _f in paths if _f] # remove empty parts to avoid double /-chars diff --git a/pilot/user/rubin/cpu.py b/pilot/user/rubin/cpu.py index 388fe2ef..4873673b 100644 --- a/pilot/user/rubin/cpu.py +++ b/pilot/user/rubin/cpu.py @@ -58,20 +58,20 @@ def set_core_counts(**kwargs): job = kwargs.get('job', None) if job and job.pgrp: - cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (job.pgrp, job.pgrp) + cmd = f"ps axo pgid,psr | sort | grep {job.pgrp} | uniq | awk '{{print $1}}' | grep -x {job.pgrp} | wc -l" exit_code, stdout, stderr = execute(cmd, mute=True) - logger.debug('%s: %s' % (cmd, stdout)) + logger.debug(f'{cmd}: {stdout}') try: job.actualcorecount = int(stdout) - except Exception as e: - logger.warning('failed to convert number of actual cores to int: %s' % e) + except Exception as exc: + logger.warning(f'failed to convert number of actual cores to int: {exc}') else: - logger.debug('set number of actual cores to: %d' % job.actualcorecount) + logger.debug(f'set number of actual cores to: {job.actualcorecount}') # overwrite the original core count and add it to the list job.corecount = job.actualcorecount job.corecounts = add_core_count(job.actualcorecount) - logger.debug('current core counts list: %s' % str(job.corecounts)) + logger.debug(f'current core counts list: {job.corecounts}') else: logger.debug('payload process group not set - cannot check number of cores used by payload') diff --git a/pilot/user/rubin/diagnose.py b/pilot/user/rubin/diagnose.py index ce510724..34844259 100644 --- a/pilot/user/rubin/diagnose.py +++ b/pilot/user/rubin/diagnose.py @@ -71,7 +71,7 @@ def get_log_extracts(job, state): extracts = "" _extracts = get_pilot_log_extracts(job) if _extracts != "": - logger.warning('detected the following tail of warning/fatal messages in the pilot log:\n%s' % _extracts) + logger.warning(f'detected the following tail of warning/fatal messages in the pilot log:\n{_extracts}') if state == 'failed' or state == 'holding': extracts += _extracts @@ -95,9 +95,9 @@ def get_pilot_log_extracts(job): if _tail != "": if extracts != "": extracts += "\n" - extracts += "- Log from %s -\n" % config.Pilot.pilotlog + extracts += f"- Log from {config.Pilot.pilotlog} -\n" extracts += _tail else: - logger.warning('pilot log file does not exist: %s' % path) + logger.warning(f'pilot log file does not exist: {path}') return extracts diff --git a/pilot/user/rubin/esprocessfinegrainedproc.py 
b/pilot/user/rubin/esprocessfinegrainedproc.py new file mode 100644 index 00000000..11f49cc9 --- /dev/null +++ b/pilot/user/rubin/esprocessfinegrainedproc.py @@ -0,0 +1,891 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Authors: +# - Wen Guan, wen.guan@cern.ch, 2023 - 2024 + +import base64 +import io +import json +import logging +import os +import queue +import re +import signal +import time +import threading +import traceback +from concurrent import futures +from typing import Any + +# from pilot.util.auxiliary import set_pilot_state +from pilot.util.filehandling import read_file +from pilot.common.errorcodes import ErrorCodes +from pilot.common.exception import PilotException, MessageFailure, SetupFailure, RunPayloadFailure +from pilot.util.container import execute + + +logger = logging.getLogger(__name__) +errors = ErrorCodes() + +""" +Main process to handle event service. +It makes use of two hooks get_event_ranges_hook and handle_out_message_hook to communicate with other processes when +it's running. The process will handle the logic of Event service independently. +""" + + +class ESRunnerThreadPool(futures.ThreadPoolExecutor): + def __init__(self, max_workers=None, thread_name_prefix='', initializer=None, initargs=()): + self.futures = {} + self.outputs = {} + self._lock = threading.RLock() + self.max_workers = max_workers + super(ESRunnerThreadPool, self).__init__(max_workers=max_workers, + thread_name_prefix=thread_name_prefix, + initializer=initializer, + initargs=initargs) + + def submit(self, fn, *args, **kwargs): + future = super(ESRunnerThreadPool, self).submit(fn, *args, **kwargs) + return future + + def run_event(self, fn, event): + future = super(ESRunnerThreadPool, self).submit(fn, event) + with self._lock: + self.futures[event['eventRangeID']] = {'event': event, 'future': future} + + def scan(self): + with self._lock: + for event_range_id in list(self.futures.keys()): + event_future = self.futures[event_range_id] + future = event_future['future'] + if future.done(): + result = future.result() + self.outputs[event_range_id] = {'event': self.futures[event_range_id]['event'], 'result': result} + del self.futures[event_range_id] + + def get_outputs(self): + outputs = [] + with self._lock: + for event_range_id in self.outputs: + outputs.append(self.outputs[event_range_id]['result']) + self.outputs = {} + return outputs + + def get_max_workers(self): + return self.max_workers + + def get_num_running_workers(self): + return len(list(self.futures.keys())) + + def has_free_workers(self): + return self.get_num_workers() < self.max_workers + + def get_num_free_workers(self): + return self.max_workers - self.get_num_running_workers() + + +class ESProcessFineGrainedProc(threading.Thread): + """ + Main EventService Process. 
+ """ + def __init__(self, payload, waiting_time=30 * 60): + """ + Init ESProcessFineGrainedProc. + + :param payload: a dict of {'executable': , 'output_file': , 'error_file': } + """ + threading.Thread.__init__(self, name='esprocessFineGrainedProc') + + self.__payload = payload + + self.__thread_pool = None + + self.get_event_ranges_hook = None + self.handle_out_message_hook = None + + self.__monitor_log_time = None + self.is_no_more_events = False + self.__no_more_event_time = None + self.__waiting_time = waiting_time + self.__stop = threading.Event() + self.__stop_time = 180 + self.pid = None + self.__is_payload_started = False + + self.__ret_code = None + self.setName("ESProcessFineGrainedProc") + self.corecount = 1 + self.event_execution_time = None + + self.rubin_es_map = {} + + self._worker_id = -1 + self._lock = threading.RLock() + + def __del__(self): + if self.__thread_pool: + del self.__thread_pool + + def is_payload_started(self): + return self.__is_payload_started + + def stop(self, delay=1800): + if not self.__stop.is_set(): + self.__stop.set() + self.__stop_set_time = time.time() + self.__stop_delay = delay + self.close_logs() + self.__thread_pool.shutdown(wait=False) + + def get_job_id(self): + if 'job' in self.__payload and self.__payload['job'] and self.__payload['job'].jobid: + return self.__payload['job'].jobid + return '' + + def get_job(self): + if 'job' in self.__payload and self.__payload['job']: + return self.__payload['job'] + return None + + def get_transformation(self): + if 'job' in self.__payload and self.__payload['job'] and self.__payload['job'].transformation: + return self.__payload['job'].transformation + return None + + def get_corecount(self): + try: + if os.environ.get("RUBIN_ES_CORES", None) is not None: + rubin_es_cores = int(os.environ.get("RUBIN_ES_CORES")) + return rubin_es_cores + except Exception as ex: + logger.warn("RUBIN_ES_CORES is not defined correctly: %s" % str(ex)) + + if 'job' in self.__payload and self.__payload['job'] and self.__payload['job'].corecount: + core_count = int(self.__payload['job'].corecount) + return core_count + return 1 + + def get_file(self, workdir, file_label='output_file', file_name='payload.stdout'): + """ + Return the requested file. + + :param file_label: + :param workdir: + :return: + """ + + try: + file_type = file # Python 2 + except NameError: + file_type = io.IOBase # Python 3 + + if file_label in self.__payload: + if isinstance(self.__payload[file_label], file_type): + _file_fd = self.__payload[file_label] + else: + _file = self.__payload[file_label] if '/' in self.__payload[file_label] else os.path.join(workdir, self.__payload[file_label]) + _file_fd = open(_file, 'w') + else: + _file = os.path.join(workdir, file_name) + _file_fd = open(_file, 'w') + + return _file_fd + + def get_workdir(self): + """ + Return the workdir. + If the workdir is set but is not a directory, return None. + + :return: workdir (string or None). + :raises SetupFailure: in case workdir is not a directory. + """ + + workdir = '' + if 'workdir' in self.__payload: + workdir = self.__payload['workdir'] + if not os.path.exists(workdir): + os.makedirs(workdir) + elif not os.path.isdir(workdir): + raise SetupFailure('workdir exists but is not a directory') + return workdir + + def get_executable(self, workdir): + """ + Return the executable string. + + :param workdir: work directory (string). + :return: executable (string). 
+ """ + executable = self.__payload['executable'] + # return 'cd %s; %s' % (workdir, executable) + return executable + + def init_logs(self): + workdir = self.get_workdir() + # logger.info("payload: %s", str(self.__payload)) + output_file_fd = self.get_file(workdir, file_label='output_file', file_name='payload.stdout') + error_file_fd = self.get_file(workdir, file_label='error_file', file_name='payload.stderr') + + self.stdout_queue = queue.Queue() + self.stderr_queue = queue.Queue() + self.stdout_file = output_file_fd + self.stderr_file = error_file_fd + + logger.info("stdout_file: %s; stderr_file: %s" % (self.stdout_file, self.stderr_file)) + + realtime_log_files = os.environ.get('REALTIME_LOGFILES', None) + realtime_log_files = re.split('[:,]', realtime_log_files) + # realtime_log_files = [os.path.join(event_dir, f) for f in realtime_log_files] + self.realtime_log_queues = {} + self.realtime_log_files = {} + for realtime_log_file in realtime_log_files: + self.realtime_log_queues[realtime_log_file] = queue.Queue() + self.realtime_log_files[realtime_log_file] = self.get_file(workdir, file_label=realtime_log_file, file_name=realtime_log_file) + logger.info("realtime log %s: %s" % (realtime_log_file, self.realtime_log_files[realtime_log_file])) + logger.info("self.realtime_log_queues: %s" % str(self.realtime_log_queues)) + + def write_logs_from_queue(self): + while not self.stdout_queue.empty(): + item = self.stdout_queue.get(block=False) + itemb = item.encode('utf-8') + self.stdout_file.write(itemb) + # logger.debug("write stdout_file: %s" % item) + while not self.stderr_queue.empty(): + item = self.stderr_queue.get(block=False) + itemb = item.encode('utf-8') + self.stderr_file.write(itemb) + # logger.debug("write stderr_file: %s" % item) + + for fd in self.realtime_log_queues: + while not self.realtime_log_queues[fd].empty(): + item = self.realtime_log_queues[fd].get(block=False) + self.realtime_log_files[fd].write(json.dumps(item)) + # logger.debug("write realtime log %s: %s" % (fd, item)) + + def close_logs(self): + try: + # cmd = "pwd; ls -ltr" + # execute(cmd, stdout=self.stdout_file, stderr=self.stderr_file, timeout=120) + self.stdout_file.close() + self.stderr_file.close() + for fd in self.realtime_log_files: + self.realtime_log_files[fd].close() + except Exception as ex: + logger.error("Failed to close logs: %s" % str(ex)) + + def set_get_event_ranges_hook(self, hook): + """ + set get_event_ranges hook. + + :param hook: a hook method to get event ranges. + """ + + self.get_event_ranges_hook = hook + + def get_get_event_ranges_hook(self): + """ + get get_event_ranges hook. + + :returns: The hook method to get event ranges. + """ + + return self.get_event_ranges_hook + + def set_handle_out_message_hook(self, hook): + """ + set handle_out_message hook. + + :param hook: a hook method to handle payload output and error messages. + """ + + self.handle_out_message_hook = hook + + def get_handle_out_message_hook(self): + """ + get handle_out_message hook. + + :returns: The hook method to handle payload output and error messages. + """ + + return self.handle_out_message_hook + + def init(self): + """ + initialize message thread and payload process. 
+ """ + + try: + self.init_logs() + self.__thread_pool = ESRunnerThreadPool(max_workers=self.get_corecount(), + thread_name_prefix='ESProcessRunner') + except Exception as e: + # TODO: raise exceptions + self.__ret_code = -1 + self.stop() + raise e + + def try_get_events(self, num_free_workers): + events = [] + if num_free_workers: + queue_factor = 1 + if self.event_execution_time and self.event_execution_time < 10 * 60: # 10 minutes + queue_factor = int(10 * 60 / self.event_execution_time) + events = self.get_event_ranges(num_ranges=num_free_workers, queue_factor=queue_factor) + if not events: + self.is_no_more_events = True + self.__no_more_event_time = time.time() + return events + + def get_event_dir(self, event_range_id): + work_dir = self.get_workdir() + event_dir = os.path.join(work_dir, event_range_id) + if not os.path.exists(event_dir): + os.makedirs(event_dir) + return event_dir + + def get_env_item(self, env, str_item): + items = str_item.replace(" ", ";").split(";") + for item in items: + if env in item: + return item.replace(env, "") + return None + + def get_event_range_map_info(self): + executable = self.get_executable(self.get_workdir()) + exec_list = executable.split(" ") + es_map_env, es_map_file = None, None + for exec_item in exec_list: + new_exec_item = None + if self.is_base64(exec_item): + new_exec_item = self.decode_base64(exec_item) + else: + new_exec_item = exec_item + + if "RUBIN_ES_MAP_FILE=" in new_exec_item: + es_map_file = self.get_env_item("RUBIN_ES_MAP_FILE=", new_exec_item) + if "RUBIN_ES_MAP=" in new_exec_item: + es_map_env = self.get_env_item("RUBIN_ES_MAP=", new_exec_item) + + self.rubin_es_map = {} + if es_map_file: + try: + with open(es_map_file) as f: + rubin_es_map_from_file_content = json.load(f) + self.rubin_es_map.update(rubin_es_map_from_file_content) + except Exception as ex: + logger.error("failed to load RUBIN_ES_MAP_FILE: %s" % str(ex)) + if es_map_env: + try: + rubin_es_map_from_env = json.loads(es_map_env) + self.rubin_es_map.update(rubin_es_map_from_env) + except Exception as ex: + logger.error("failed to load RUBIN_ES_MAP: %s" % str(ex)) + + def get_event_range_file_map(self, event): + if not self.rubin_es_map: + self.get_event_range_map_info() + # input_file = self.__payload['job'].input_file + # return {input_file: event['eventRangeID']} + # label = input_file.split(":")[0] + + lfn = event['LFN'] + label = lfn.split(":")[1] + input_file = lfn.split(":")[2] + input_file_name = label + ":" + input_file + event_base_index = int(input_file.split("_")[1]) + event_index = int(event['startEvent']) + event_abs_index = str(event_base_index + event_index - 1) + if label in self.rubin_es_map and event_abs_index in self.rubin_es_map[label]: + return {input_file_name: self.rubin_es_map[label][event_abs_index]} + return {input_file_name: input_file_name + "^" + str(event_index)} + + def is_base64(self, sb): + try: + if isinstance(sb, str): + sb_bytes = bytes(sb, 'ascii') + elif isinstance(sb, bytes): + sb_bytes = sb + else: + return False + return base64.b64encode(base64.b64decode(sb_bytes)) == sb_bytes + except Exception: + # logger.error("is_base64 %s: %s" % (sb, ex)) + return False + + def decode_base64(self, sb): + try: + if isinstance(sb, str): + sb_bytes = bytes(sb, 'ascii') + elif isinstance(sb, bytes): + sb_bytes = sb + else: + return sb + return base64.b64decode(sb_bytes).decode("utf-8") + except Exception as ex: + logger.error("decode_base64 %s: %s" % (sb, ex)) + return sb + + def encode_base64(self, sb): + try: + if isinstance(sb, str): + 
sb_bytes = bytes(sb, 'ascii') + elif isinstance(sb, bytes): + sb_bytes = sb + return base64.b64encode(sb_bytes).decode("utf-8") + except Exception as ex: + logger.error("encode_base64 %s: %s" % (sb, ex)) + return sb + + def replace_executable(self, executable, event_range_file_map): + exec_list = executable.split(" ") + new_exec_list = [] + for exec_item in exec_list: + new_exec_item = None + if self.is_base64(exec_item): + new_exec_item = self.decode_base64(exec_item) + for input_file in event_range_file_map: + new_exec_item = new_exec_item.replace(input_file, event_range_file_map[input_file]) + new_exec_item = self.encode_base64(new_exec_item) + else: + new_exec_item = exec_item + for input_file in event_range_file_map: + new_exec_item = new_exec_item.replace(input_file, event_range_file_map[input_file]) + new_exec_list.append(new_exec_item) + return " ".join(new_exec_list) + + def get_event_executable(self, event_dir, event): + executable = self.get_executable(event_dir) + event_range_file_map = self.get_event_range_file_map(event) + executable = self.replace_executable(executable, event_range_file_map) + # executable = "cd " + event_dir + "; " + executable + + transformation = self.get_transformation() + base_transformation = os.path.basename(transformation) + + executable = "cp -f " + base_transformation + " " + event_dir + "; cd " + event_dir + "; " + executable + + stdout_filename = os.path.join(event_dir, "payload.stdout") + stderr_filename = os.path.join(event_dir, "payload.stderr") + + stdout_file = open(stdout_filename, 'a') + stderr_file = open(stderr_filename, 'a') + realtime_log_files = os.environ.get('REALTIME_LOGFILES', None) + realtime_log_files = re.split('[:,]', realtime_log_files) + realtime_log_files = [os.path.join(event_dir, f) for f in realtime_log_files] + return executable, stdout_file, stderr_file, stdout_filename, stderr_filename, realtime_log_files + + def get_worker_id(self): + worker_id = None + with self._lock: + self._worker_id += 1 + worker_id = self._worker_id + return worker_id + + def open_log_file(self, filename, perm='r'): + if os.path.exists(filename): + fd = open(filename, perm) + fd.seek(0) + return fd + return None + + def redirect_logs(self, graceful_stop, worker_id, stdout_filename, stderr_filename, realtime_log_files, event_dir): # noqa C901 + stdout_file = None + stderr_file = None + realtime_logs = {} + for rt in realtime_log_files: + realtime_logs[rt] = None + # logger.debug("self.realtime_log_queues: %s" % str(self.realtime_log_queues)) + while not graceful_stop.is_set(): + try: + if stdout_file is None: + stdout_file = self.open_log_file(stdout_filename) + if stderr_file is None: + stderr_file = self.open_log_file(stderr_filename) + for rt in realtime_logs: + if realtime_logs[rt] is None: + realtime_logs[rt] = self.open_log_file(rt) + + if stdout_file: + # logger.debug("stdout_file location: %s" % stdout_file.tell()) + lines = stdout_file.readlines() + for line in lines: + line = "Worker %s: " % worker_id + line + self.stdout_queue.put(line) + if stderr_file: + lines = stderr_file.readlines() + for line in lines: + line = "Worker %s: " % worker_id + line + self.stderr_queue.put(line) + for rt in realtime_logs: + if realtime_logs[rt]: + lines = realtime_logs[rt].readlines() + rt_base = os.path.basename(rt) + for line in lines: + try: + line = json.loads(line) + line.update({'worker_id': worker_id}) + except Exception: + line = "Worker %s: " % worker_id + line + self.realtime_log_queues[rt_base].put(line) + + time.sleep(0.1) + except 
Exception as ex: + logger.warn(ex) + logger.debug(traceback.format_exc()) + + try: + # cmd = "cd %s; pwd; ls -ltr" % event_dir + # ls_status, ls_stdout, ls_stderr = execute(cmd, timeout=120) + # logger.info("list files status: %s, output: %s, error: %s" % (ls_status, ls_stdout, ls_stderr)) + + if stdout_file is None: + stdout_file = self.open_log_file(stdout_filename) + if stderr_file is None: + stderr_file = self.open_log_file(stderr_filename) + for rt in realtime_logs: + if realtime_logs[rt] is None: + realtime_logs[rt] = self.open_log_file(rt) + + if stdout_file: + lines = stdout_file.readlines() + for line in lines: + line = "Worker %s: " % worker_id + line + self.stdout_queue.put(line) + stdout_file.close() + if stderr_file: + lines = stderr_file.readlines() + for line in lines: + line = "Worker %s: " % worker_id + line + self.stderr_queue.put(line) + stderr_file.close() + for rt in realtime_logs: + if realtime_logs[rt]: + lines = realtime_logs[rt].readlines() + rt_base = os.path.basename(rt) + for line in lines: + try: + line = json.loads(line) + line.update({'worker_id': worker_id}) + except Exception: + line = "Worker %s: " % worker_id + line + self.realtime_log_queues[rt_base].put(line) + realtime_logs[rt].close() + except Exception as ex: + logger.warn(ex) + logger.debug(traceback.format_exc()) + + def wait_graceful(self, proc: Any) -> int: + """ + Wait for payload process to finish. + + :param proc: subprocess object (Any) + :return: exit code (int). + """ + breaker = False + exit_code = None + iteration = 0 + while True: + time.sleep(0.1) + + iteration += 1 + for _ in range(60): + if self.__stop.is_set(): + breaker = True + logger.info(f'breaking -- sending SIGTERM to pid={proc.pid}') + os.killpg(os.getpgid(proc.pid), signal.SIGTERM) + break + exit_code = proc.poll() + if exit_code is not None: + break + time.sleep(1) + if breaker: + logger.info(f'breaking -- sleep 3s before sending SIGKILL pid={proc.pid}') + time.sleep(3) + proc.kill() + break + + exit_code = proc.poll() + + if iteration % 10 == 0: + logger.info(f'running: iteration={iteration} pid={proc.pid} exit_code={exit_code}') + if exit_code is not None: + break + else: + continue + + return exit_code + + def run_event(self, event): + time_start = time.time() + ret = {} + worker_id = self.get_worker_id() + log_prefix = "worker_id=%s: " % worker_id + try: + event_range_id = event['eventRangeID'] + logger.info(log_prefix + "start to run event " + str(event_range_id)) + + event_dir = self.get_event_dir(event_range_id) + executable, stdout_file, stderr_file, stdout_filename, stderr_filename, realtime_log_files = self.get_event_executable(event_dir, event) + logger.info(log_prefix + "executable: " + executable) + logger.info(log_prefix + "stdout: " + stdout_filename) + logger.info(log_prefix + "stderr: " + stderr_filename) + + # exit_code, stdout, stderr = execute(executable, workdir=event_dir, returnproc=True, stdout=stdout_file, stderr=stderr_file, + # cwd=event_dir, timeout=7 * 24 * 3600) + # logger.info(log_prefix + "exit_code: " + str(exit_code)) + # logger.info(log_prefix + "stdout: " + str(stdout)) + # logger.info(log_prefix + "stderr: " + str(stderr)) + try: + proc = execute(executable, returnproc=True, stdout=stdout_file, stderr=stderr_file, timeout=7 * 24 * 3600) + except Exception as error: + logger.error(f'could not execute: {error}') + raise Exception(f'could not execute: {error}') + if isinstance(proc, tuple) and not proc[0]: + logger.error('failed to execute payload') + raise Exception('failed to execute 
payload') + + logger.info(f'started -- pid={proc.pid} executable={executable}') + # job = self.get_job() + # if job: + # job.pid = proc.pid + # job.pgrp = os.getpgid(job.pid) + # set_pilot_state(job=job, state="running") + + # start a thread to redirect stdout/stderr and realtime logging + graceful_stop = threading.Event() + log_redirect_thread = threading.Thread(target=self.redirect_logs, + args=(graceful_stop, worker_id, stdout_filename, stderr_filename, realtime_log_files, event_dir)) + log_redirect_thread.start() + + exit_code = self.wait_graceful(proc) + logger.info(log_prefix + "exit_code: " + str(exit_code)) + stdout_file.close() + stderr_file.close() + + cmd = "cd %s; pwd; ls -ltr" % event_dir + ls_status, ls_stdout, ls_stderr = execute(cmd, timeout=120) + logger.info("list files status: %s, output: %s, error: %s" % (ls_status, ls_stdout, ls_stderr)) + + # log_redirect_thread.stop() + time.sleep(2) + logger.info(log_prefix + "stopping log_redirect_thread") + graceful_stop.set() + + diagnostics = None + if exit_code: + logger.warning(f'payload returned exit code={exit_code}') + stdout = read_file(stdout_filename) + stderr = read_file(stderr_filename) + err_msg = errors.extract_stderr_error(stderr) + if err_msg == "": + err_msg = errors.extract_stderr_warning(stderr) + + diagnostics = stderr + stdout if stdout and stderr else 'General payload setup verification error (check setup logs)' + # check for special errors in thw output + exit_code = errors.resolve_transform_error(exit_code, diagnostics) + # diagnostics = errors.format_diagnostics(exit_code, diagnostics) + + diagnostics = errors.format_diagnostics(exit_code, err_msg) + _, diagnostics = errors.add_error_code(exit_code, msg=diagnostics) + if stdout_file: + stdout_file.close() + logger.debug(f'closed {stdout_filename}') + if stderr_file: + stderr_file.close() + logger.debug(f'closed {stderr_filename}') + if exit_code: + self.__ret_code = exit_code + ret = {'id': event_range_id, 'status': 'failed', 'error_code': exit_code, 'error_diag': diagnostics} + else: + ret = {'id': event_range_id, 'status': 'finished', 'error_code': exit_code, 'error_diag': diagnostics} + except Exception as ex: + logger.error(ex) + logger.error(traceback.format_exc()) + ret = {'id': event_range_id, 'status': 'failed', 'error_code': -1, 'error_diag': str(ex)} + self.__ret_code = -1 + + logger.info(log_prefix + "ret: " + str(ret)) + + time_used = time.time() - time_start + logger.info(log_prefix + "time used to process this event: " + str(time_used)) + + ret['wall_time'] = time_used + + if self.event_execution_time is None or self.event_execution_time < time_used: + self.event_execution_time = time_used + logger.info(log_prefix + "max event execution time: " + str(time_used)) + return ret + + def send_terminate_events(self, outputs): + for output in outputs: + self.handle_out_message(output) + + def monitor(self, terminate=False): + """ + Monitor whether a process is dead. + + raises: RunPayloadFailure: when the payload process is dead or exited. 
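# Simplified sketch of what one monitor() pass does with the thread pool (not the
# literal code; see the method body below):
#
#   self.__thread_pool.scan()                                # collect finished event futures
#   events = self.try_get_events(self.__thread_pool.get_num_free_workers())
#   for event in events:
#       self.__thread_pool.run_event(self.run_event, event)  # one worker per event range
#   for output in self.__thread_pool.get_outputs():
#       self.handle_out_message(output)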
+ """ + if self.__thread_pool: + self.__thread_pool.scan() + if not terminate: + num_free_workers = self.__thread_pool.get_num_free_workers() + if num_free_workers > 0: + events = self.try_get_events(num_free_workers) + if events: + logger.info("Got %s events: %s" % (len(events), events)) + for event in events: + # self.run_event(event) + self.__thread_pool.run_event(self.run_event, event) + + outputs = self.__thread_pool.get_outputs() + if outputs: + logger.info("Got %s outputs: %s" % (len(outputs), outputs)) + self.send_terminate_events(outputs) + + def get_event_ranges(self, num_ranges=None, queue_factor=1): + """ + Calling get_event_ranges hook to get event ranges. + + :param num_ranges: number of event ranges to get. + + :raises: SetupFailure: If get_event_ranges_hook is not set. + MessageFailure: when failed to get event ranges. + """ + if not num_ranges: + num_ranges = self.corecount + + logger.debug('getting event ranges(num_ranges=%s)' % num_ranges) + if not self.get_event_ranges_hook: + raise SetupFailure("get_event_ranges_hook is not set") + + try: + logger.debug('calling get_event_ranges hook(%s) to get event ranges.' % self.get_event_ranges_hook) + event_ranges = self.get_event_ranges_hook(num_ranges, queue_factor) + logger.debug('got event ranges: %s' % event_ranges) + return event_ranges + except Exception as e: + raise MessageFailure("Failed to get event ranges: %s" % e) + + def parse_out_message(self, message): + """ + Parse output or error messages from payload. + + :param message: The message string received from payload. + + :returns: a dict {'id': , 'status': , 'output': , 'cpu': , 'wall': , 'message': } + :raises: PilotExecption: when a PilotException is caught. + UnknownException: when other unknown exception is caught. + """ + + logger.debug('parsing message: %s' % message) + return message + + def handle_out_message(self, message): + """ + Handle output or error messages from payload. + Messages from payload will be parsed and the handle_out_message hook is called. + + :param message: The message string received from payload. + + :raises: SetupFailure: when handle_out_message_hook is not set. + RunPayloadFailure: when failed to handle an output or error message. + """ + + logger.debug('handling out message: %s' % message) + if not self.handle_out_message_hook: + raise SetupFailure("handle_out_message_hook is not set") + + try: + message_status = self.parse_out_message(message) + logger.debug('parsed out message: %s' % message_status) + logger.debug('calling handle_out_message hook(%s) to handle parsed message.' % self.handle_out_message_hook) + self.handle_out_message_hook(message_status) + except Exception as e: + raise RunPayloadFailure("Failed to handle out message: %s" % e) + + def is_payload_running(self): + """ + Check whether the payload is still running + + :return: True if the payload is running, otherwise False + """ + if (self.__stop.is_set() or self.is_no_more_events) and self.__thread_pool.get_num_running_workers() < 1: + return False + return True + + def poll(self): + """ + poll whether the process is still running. + + :returns: None: still running. + 0: finished successfully. + others: failed. 
+ """ + # if self.is_payload_running(): + # return None + logger.debug("is_alive: %s, ret_code:%s" % (self.is_alive(), self.__ret_code)) + # if self.is_alive(): + # return None + return self.__ret_code + + def clean(self): + """ + Clean left resources + """ + self.stop() + if self.__ret_code is None: + self.__ret_code = 0 + + def run(self): + """ + Main run loops: monitor message thread and payload process. + handle messages from payload and response messages with injecting new event ranges or process outputs. + + :raises: PilotExecption: when a PilotException is caught. + UnknownException: when other unknown exception is caught. + """ + + self.__is_payload_started = True + logger.info('start esprocess with thread ident: %s' % (self.ident)) + logger.debug('initializing') + self.init() + logger.debug('initialization finished.') + + logger.info('starts to main loop') + while self.is_payload_running(): + try: + self.monitor() + self.write_logs_from_queue() + time.sleep(0.01) + except PilotException as e: + logger.error('PilotException caught in the main loop: %s, %s' % (e.get_detail(), traceback.format_exc())) + # TODO: define output message exception. If caught 3 output message exception, terminate + self.stop() + except Exception as e: + logger.error('Exception caught in the main loop: %s, %s' % (e, traceback.format_exc())) + # TODO: catch and raise exceptions + # if catching dead process exception, terminate. + self.stop() + break + logger.info("main loop ends") + self.monitor(terminate=True) + self.write_logs_from_queue() + self.clean() + logger.debug('main loop finished') diff --git a/pilot/user/rubin/jobdata.py b/pilot/user/rubin/jobdata.py index 41c657d0..880e4ba3 100644 --- a/pilot/user/rubin/jobdata.py +++ b/pilot/user/rubin/jobdata.py @@ -16,42 +16,58 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-24 -#import re +"""Functions related to job data.""" #import logging +#import re #logger = logging.getLogger(__name__) -def jobparams_prefiltering(value): +def jobparams_prefiltering(value: str) -> (dict, str): """ Perform pre-filtering of raw job parameters to avoid problems with especially quotation marks. + The function can extract some fields from the job parameters to be put back later after actual filtering. E.g. ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" ' will otherwise become ' --athenaopts 'HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER' ' which will prevent the environmental variable to be unfolded. :param value: job parameters (string). - :return: list of fields excluded from job parameters (list), updated job parameters (string). + :return: dictionary of fields excluded from job parameters (dict), updated job parameters (str). """ - exclusions = {} # Add regex patterns here - + # .. return exclusions, value -def jobparams_postfiltering(value, exclusions={}): +def jobparams_postfiltering(value: str, exclusions: dict = None) -> str: """ Perform post-filtering of raw job parameters. + Any items in the optional exclusion list will be added (space separated) at the end of the job parameters. - :param value: job parameters (string). - :param optional exclusions: exlusions dictionary from pre-filtering function (dictionary). - :return: updated job parameters (string). + :param value: job parameters (str) + :param exclusions: exclusions dictionary from pre-filtering function (dict) + :return: updated job parameters (str). 
""" + if exclusions is None: # avoid pylint warning + exclusions = {} + + for item in exclusions: + value = value.replace(item, exclusions[item]) return value + + +def fail_at_getjob_none() -> bool: + """ + Return a boolean value indicating whether to fail when getJob returns None. + + :return: False (bool). + """ + return False diff --git a/pilot/user/rubin/setup.py b/pilot/user/rubin/setup.py index 8f98fc61..945f7801 100644 --- a/pilot/user/rubin/setup.py +++ b/pilot/user/rubin/setup.py @@ -52,25 +52,25 @@ def get_analysis_trf(transform, workdir): # test if $HARVESTER_WORKDIR is set harvester_workdir = os.environ.get('HARVESTER_WORKDIR') if harvester_workdir is not None: - search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir - logger.debug("search_pattern - %s" % search_pattern) + search_pattern = f"{harvester_workdir}/jobO.*.tar.gz" + logger.debug(f"search_pattern - {search_pattern}") jobopt_files = glob.glob(search_pattern) for jobopt_file in jobopt_files: - logger.debug("jobopt_file = %s workdir = %s" % (jobopt_file, workdir)) + logger.debug(f"jobopt_file = {jobopt_file} workdir = {workdir}") try: copy(jobopt_file, workdir) - except Exception as e: - logger.error("could not copy file %s to %s : %s" % (jobopt_file, workdir, e)) + except Exception as exc: + logger.error(f"could not copy file {jobopt_file} to {workdir} : {exc}") if '/' in transform: transform_name = transform.split('/')[-1] else: - logger.warning('did not detect any / in %s (using full transform name)' % transform) + logger.warning(f'did not detect any / in {transform} (using full transform name)') transform_name = transform # is the command already available? (e.g. if already downloaded by a preprocess/main process step) if os.path.exists(os.path.join(workdir, transform_name)): - logger.info('script %s is already available - no need to download again' % transform_name) + logger.info(f'script {transform_name} is already available - no need to download again') return ec, diagnostics, transform_name original_base_url = "" @@ -82,14 +82,14 @@ def get_analysis_trf(transform, workdir): break if original_base_url == "": - diagnostics = "invalid base URL: %s" % transform + diagnostics = f"invalid base URL: {transform}" return errors.TRFDOWNLOADFAILURE, diagnostics, "" # try to download from the required location, if not - switch to backup status = False for base_url in get_valid_base_urls(order=original_base_url): trf = re.sub(original_base_url, base_url, transform) - logger.debug("attempting to download script: %s" % trf) + logger.debug(f"attempting to download script: {trf}") status, diagnostics = download_transform(trf, transform_name, workdir) if status: break @@ -99,11 +99,11 @@ def get_analysis_trf(transform, workdir): logger.info("successfully downloaded script") path = os.path.join(workdir, transform_name) - logger.debug("changing permission of %s to 0o755" % path) + logger.debug(f"changing permission of {path} to 0o755") try: - os.chmod(path, 0o755) # Python 2/3 - except Exception as e: - diagnostics = "failed to chmod %s: %s" % (transform_name, e) + os.chmod(path, 0o755) + except Exception as exc: + diagnostics = f"failed to chmod {transform_name}: {exc}" return errors.CHMODTRF, diagnostics, "" return ec, diagnostics, transform_name @@ -149,7 +149,7 @@ def download_transform(url, transform_name, workdir): path = os.path.join(workdir, transform_name) ip_version = os.environ.get('PILOT_IP_VERSION', 'IPv6') command = 'curl' if ip_version == 'IPv6' else 'curl -4' - cmd = f'{command} -sS \"%s\" > %s' % (url, path) + cmd 
= f'{command} -sS \"{url}\" > {path}' trial = 1 max_trials = 3 @@ -164,29 +164,29 @@ def download_transform(url, transform_name, workdir): status = True except Exception as error: status = False - diagnostics = "Failed to copy file %s to %s : %s" % (source_path, path, error) + diagnostics = f"Failed to copy file {source_path} to {path} : {error}" logger.error(diagnostics) # try to download the trf a maximum of 3 times while trial <= max_trials: - logger.info("executing command [trial %d/%d]: %s" % (trial, max_trials, cmd)) + logger.info(f"executing command [trial {trial}/{max_trials}]: {cmd}") exit_code, stdout, stderr = execute(cmd, mute=True) if not stdout: stdout = "(None)" if exit_code != 0: # Analyze exit code / output - diagnostics = "curl command failed: %d, %s, %s" % (exit_code, stdout, stderr) + diagnostics = f"curl command failed: {exit_code}, {stdout}, {stderr}" logger.warning(diagnostics) if trial == max_trials: - logger.fatal('could not download transform: %s' % stdout) + logger.fatal(f'could not download transform: {stdout}') status = False break else: logger.info("will try again after 60 s") sleep(60) else: - logger.info("curl command returned: %s" % stdout) + logger.info(f"curl command returned: {stdout}") status = True break trial += 1 diff --git a/pilot/user/rubin/utilities.py b/pilot/user/rubin/utilities.py index 0a328021..9bc0549f 100644 --- a/pilot/user/rubin/utilities.py +++ b/pilot/user/rubin/utilities.py @@ -50,7 +50,7 @@ def get_memory_monitor_output_filename(suffix='txt'): :return: File name (string). """ - return "memory_monitor_output.%s" % suffix + return f"memory_monitor_output.{suffix}" def get_memory_monitor_info_path(workdir, allowtxtfile=False): @@ -74,13 +74,13 @@ def get_memory_monitor_info_path(workdir, allowtxtfile=False): if os.path.exists(init_path): path = init_path else: - logger.info("neither %s, nor %s exist" % (path, init_path)) + logger.info(f"neither {path}, nor {init_path} exist") path = "" if path == "" and allowtxtfile: path = os.path.join(workdir, get_memory_monitor_output_filename()) if not os.path.exists(path): - logger.warning("file does not exist either: %s" % (path)) + logger.warning(f"file does not exist either: {path}") return path @@ -101,11 +101,11 @@ def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 # Note that only the final json file will contain the totRBYTES, etc try: summary_dictionary = get_memory_values(workdir, name=name) - except Exception as e: - logger.warning('failed to get memory values from memory monitor tool: %s' % e) + except Exception as exc: + logger.warning(f'failed to get memory values from memory monitor tool: {exc}') summary_dictionary = {} else: - logger.debug("summary_dictionary=%s" % str(summary_dictionary)) + logger.debug(f"summary_dictionary={str(summary_dictionary)}") # Fill the node dictionary if summary_dictionary and summary_dictionary != {}: @@ -126,8 +126,8 @@ def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 node['avgVMEM'] = summary_dictionary['Avg']['avgVMEM'] node['avgSWAP'] = summary_dictionary['Avg']['avgSwap'] node['avgPSS'] = summary_dictionary['Avg']['avgPSS'] - except Exception as e: - logger.warning("exception caught while parsing memory monitor file: %s" % e) + except Exception as exc: + logger.warning(f"exception caught while parsing memory monitor file: {exc}") logger.warning("will add -1 values for the memory info") node['maxRSS'] = -1 node['maxVMEM'] = -1 @@ -162,8 +162,8 @@ def get_memory_monitor_info(workdir, 
allowtxtfile=False, name=""): # noqa: C901 node['avgVMEM'] = summary_dictionary['Avg']['vmem'] node['avgSWAP'] = summary_dictionary['Avg']['swap'] node['avgPSS'] = summary_dictionary['Avg']['pss'] - except Exception as e: - logger.warning("exception caught while parsing prmon file: %s" % e) + except Exception as exc: + logger.warning(f"exception caught while parsing prmon file: {exc}") logger.warning("will add -1 values for the memory info") node['maxRSS'] = -1 node['maxVMEM'] = -1 @@ -210,8 +210,8 @@ def get_max_memory_monitor_value(value, maxvalue, totalvalue): # noqa: C90 ec = 0 try: value_int = int(value) - except Exception as e: - logger.warning("exception caught: %s" % e) + except Exception as exc: + logger.warning(f"exception caught: {exc}") ec = 1 else: totalvalue += value_int @@ -270,7 +270,7 @@ def filter_value(value): keys = ['vmem', 'pss', 'rss', 'swap'] values = {} for key in keys: - value_list = list(filter(filter_value, dictionary.get(key, 0))) # Python 2/3 + value_list = list(filter(filter_value, dictionary.get(key, 0))) n = len(value_list) average = int(float(sum(value_list)) / float(n)) if n > 0 else 0 maximum = max(value_list) @@ -316,7 +316,7 @@ def get_metadata_dict_from_txt(path, storejson=False, jobid=None): dictionary['pandaid'] = jobid path = os.path.join(os.path.dirname(path), get_memory_monitor_output_filename(suffix='json')) - logger.debug('writing prmon dictionary to: %s' % path) + logger.debug(f'writing prmon dictionary to: {path}') write_json(path, dictionary) else: logger.debug('nothing to write (no prmon dictionary)') @@ -366,7 +366,7 @@ def convert_text_file_to_dictionary(path): value = convert_to_int(key) dictionary[key_entry].append(value) except Exception: - logger.warning("unexpected format of utility output: %s" % line) + logger.warning(f"unexpected format of utility output: {line}") return dictionary @@ -435,8 +435,8 @@ def get_average_summary_dictionary(path): rbytes = None wbytes = None except Exception: - logger.warning("unexpected format of utility output: %s (expected format: Time, VMEM," - " PSS, RSS, Swap [, RCHAR, WCHAR, RBYTES, WBYTES])" % (line)) + logger.warning(f"unexpected format of utility output: {line} (expected format: Time, VMEM, PSS, " + f"RSS, Swap [, RCHAR, WCHAR, RBYTES, WBYTES])") else: # Convert to int ec1, maxvmem, totalvmem = get_max_memory_monitor_value(vmem, maxvmem, totalvmem) @@ -444,7 +444,7 @@ def get_average_summary_dictionary(path): ec3, maxrss, totalrss = get_max_memory_monitor_value(rss, maxrss, totalrss) ec4, maxswap, totalswap = get_max_memory_monitor_value(swap, maxswap, totalswap) if ec1 or ec2 or ec3 or ec4: - logger.warning("will skip this row of numbers due to value exception: %s" % (line)) + logger.warning(f"will skip this row of numbers due to value exception: {line}") else: n += 1 @@ -491,7 +491,7 @@ def get_memory_values(workdir, name=""): # Get the path to the proper memory info file (priority ordered) path = get_memory_monitor_info_path(workdir, allowtxtfile=True) if os.path.exists(path): - logger.info("using path: %s (trf name=%s)" % (path, name)) + logger.info(f"using path: {path} (trf name={name})") # Does a JSON summary file exist? 
If so, there's no need to calculate maximums and averages in the pilot if path.lower().endswith('json'): @@ -503,7 +503,7 @@ def get_memory_values(workdir, name=""): summary_dictionary = get_average_summary_dictionary_prmon(path) else: summary_dictionary = get_average_summary_dictionary(path) - logger.debug('summary_dictionary=%s (trf name=%s)' % (str(summary_dictionary), name)) + logger.debug(f'summary_dictionary={str(summary_dictionary)} (trf name={name})') else: if path == "": logger.warning("filename not set for memory monitor output") @@ -525,20 +525,20 @@ def post_memory_monitor_action(job): nap = 3 path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename()) path2 = os.environ.get('PILOT_HOME') - i = 0 + counter = 0 maxretry = 20 - while i <= maxretry: + while counter <= maxretry: if os.path.exists(path1): break - logger.info("taking a short nap (%d s) to allow the memory monitor to finish writing to the summary file (#%d/#%d)" - % (nap, i, maxretry)) + logger.info(f"taking a short nap ({nap} s) to allow the memory monitor to finish writing to the " + f"summary file (#{counter}/#{maxretry})") time.sleep(nap) - i += 1 + counter += 1 try: copy(path1, path2) - except Exception as e: - logger.warning('failed to copy memory monitor output: %s' % e) + except Exception as exc: + logger.warning(f'failed to copy memory monitor output: {exc}') def precleanup(): diff --git a/pilot/user/sphenix/common.py b/pilot/user/sphenix/common.py index e6bbd263..4d100aae 100644 --- a/pilot/user/sphenix/common.py +++ b/pilot/user/sphenix/common.py @@ -23,7 +23,10 @@ import re from signal import SIGTERM -from pilot.common.exception import TrfDownloadFailure +from pilot.common.exception import ( + TrfDownloadFailure, + FileHandlingFailure +) from pilot.info import FileSpec from pilot.util.config import config from pilot.util.constants import ( @@ -55,7 +58,6 @@ def sanity_check(): :return: exit code (0 if all is ok, otherwise non-zero exit code). """ - return 0 @@ -66,7 +68,6 @@ def validate(job): :param job: job object. :return: Boolean (True if validation is successful). """ - return True @@ -89,7 +90,7 @@ def get_payload_command(job): if ec != 0: raise TrfDownloadFailure(diagnostics) else: - logger.debug('user analysis trf: %s' % trf_name) + logger.debug(f'user analysis trf: {trf_name}') return get_analysis_run_command(job, trf_name) @@ -104,21 +105,20 @@ def get_analysis_run_command(job, trf_name): :param trf_name: name of the transform that will run the job (string). Used when containers are not used. :return: command (string). 
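For illustration, here is a minimal sketch of the command that get_analysis_run_command() assembles in the hunk that follows, assuming a hypothetical transform name and job parameters and ignoring the container branch:

import os

trf_name = 'runjob.sh'                       # hypothetical transform name
jobparams = '--inputFile in.dat'             # hypothetical job parameters
cmd = ''
if 'X509_USER_PROXY' in os.environ:          # add the user proxy if available
    cmd += f"export X509_USER_PROXY={os.environ.get('X509_USER_PROXY')};"
cmd += f'./{trf_name} {jobparams}'           # user job with no image name defined
print(cmd)                                   # e.g. ./runjob.sh --inputFile in.dat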
""" - cmd = "" # add the user proxy if 'X509_USER_PROXY' in os.environ and not job.imagename: - cmd += 'export X509_USER_PROXY=%s;' % os.environ.get('X509_USER_PROXY') + cmd += f"export X509_USER_PROXY={os.environ.get('X509_USER_PROXY')};" # set up trfs if job.imagename == "": # user jobs with no imagename defined - cmd += './%s %s' % (trf_name, job.jobparams) + cmd += f'./{trf_name} {job.jobparams}' else: if trf_name: - cmd += './%s %s' % (trf_name, job.jobparams) + cmd += f'./{trf_name} {job.jobparams}' else: - cmd += 'python %s %s' % (trf_name, job.jobparams) + cmd += f'python {trf_name} {job.jobparams}' return cmd @@ -132,7 +132,6 @@ def update_job_data(job): :param job: job object :return: """ - # in case the job was created with --outputs="regex|DST_.*\.root", we can now look for the corresponding # output files and add them to the output file list outfiles = [] @@ -182,10 +181,9 @@ def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode= :param workdir: working directory (string). :param outputfiles: list of output files. :param piloterrors: list of Pilot assigned error codes (list). - :return: + :return: None """ - - pass + return def get_utility_commands(order=None, job=None): @@ -217,7 +215,6 @@ def get_utility_commands(order=None, job=None): :param job: optional job object. :return: dictionary of utilities to be executed in parallel with the payload. """ - if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: return {} @@ -250,7 +247,6 @@ def get_utility_after_payload_started(): :return: command (dictionary). """ - com = {} try: cmd = config.Pilot.utility_after_payload_started @@ -272,7 +268,6 @@ def get_utility_command_setup(name, job, setup=None): :param setup: optional payload setup string. :return: utility command setup (string). """ - if name == 'MemoryMonitor': # must know if payload is running in a container or not # (enables search for pid in ps output) @@ -301,11 +296,11 @@ def get_utility_command_setup(name, job, setup=None): # update the pgrp if the pid changed if pid not in (job.pid, -1): - logger.debug('updating pgrp=%d for pid=%d', job.pgrp, pid) + logger.debug(f'updating pgrp={job.pgrp} for pid {pid}') try: job.pgrp = os.getpgid(pid) except Exception as exc: - logger.warning('os.getpgid(%d) failed with: %s', pid, exc) + logger.warning(f'os.getpgid({pid}) failed with: {exc}', pid, exc) return setup return "" @@ -318,7 +313,6 @@ def get_utility_command_execution_order(name): :param name: utility name (string). :return: execution order constant (UTILITY_BEFORE_PAYLOAD or UTILITY_AFTER_PAYLOAD_STARTED) """ - # example implementation if name == 'monitor': return UTILITY_BEFORE_PAYLOAD @@ -332,9 +326,7 @@ def post_utility_command_action(name, job): :param name: name of utility command (string). :param job: job object. - :return: """ - if name == 'MemoryMonitor': post_memory_monitor_action(job) @@ -346,7 +338,6 @@ def get_utility_command_kill_signal(name): :param name: :return: kill signal """ - return SIGTERM @@ -358,7 +349,6 @@ def get_utility_command_output_filename(name, selector=None): :param selector: optional special conditions flag (boolean). :return: filename (string). """ - if name == 'MemoryMonitor': filename = get_memory_monitor_summary_filename(selector=selector) else: @@ -377,7 +367,6 @@ def verify_job(job): :param job: job object :return: Boolean. """ - return True @@ -387,23 +376,24 @@ def update_stagein(job): See ATLAS code for an example. :param job: job object. 
- :return: + :return: None """ - - pass + return def get_metadata(workdir): """ Return the metadata from file. - :param workdir: work directory (string) - :return: + :param workdir: work directory (str) + :return: metadata (str or None). """ - path = os.path.join(workdir, config.Payload.jobreport) - metadata = read_file(path) if os.path.exists(path) else None - + try: + metadata = read_file(path) if os.path.exists(path) else None + except FileHandlingFailure as exc: + logger.warning(f'exception caught while opening file: {exc}') + metadata = None return metadata @@ -414,10 +404,9 @@ def update_server(job): E.g. this can be used to send special information to a logstash. :param job: job object. - :return: + :return: None """ - - pass + return def post_prestagein_utility_command(**kwargs): @@ -425,13 +414,11 @@ def post_prestagein_utility_command(**kwargs): Execute any post pre-stage-in utility commands. :param kwargs: kwargs (dictionary). - :return: + :return: None """ - # label = kwargs.get('label', 'unknown_label') # stdout = kwargs.get('output', None) - - pass + return def process_debug_command(debug_command, pandaid): @@ -441,10 +428,9 @@ def process_debug_command(debug_command, pandaid): to the server). :param debug_command: debug command (string), payload pid (int). - :param pandaid: PanDA id (string). - :return: updated debug command (string) + :param pandaid: PanDA id (str) + :return: updated debug command (str). """ - return debug_command @@ -452,9 +438,9 @@ def allow_timefloor(submitmode): """ Should the timefloor mechanism (multi-jobs) be allowed for the given submit mode? - :param submitmode: submit mode (string). + :param submitmode: submit mode (str). + :return: True (bool). """ - return True @@ -463,10 +449,9 @@ def get_pilot_id(jobid): Get the pilot id from the environment variable GTAG. Update if necessary (do not used if you want the same pilot id for all multi-jobs). - :param jobid: PanDA job id - UNUSED (int). - :return: pilot id (string). + :param jobid: PanDA job id - UNUSED (int) + :return: pilot id (str). """ - return os.environ.get("GTAG", "unknown") @@ -476,7 +461,6 @@ def get_rtlogging(): :return: rtlogging (str). """ - return 'logstash;http://splogstash.sdcc.bnl.gov:8080' @@ -484,7 +468,6 @@ def get_rtlogging_ssl(): """ Return the proper ssl_enable and ssl_verify for real-time logging. - :return: ssl_enable (bool), ssl_verify (bool) (tuple). + :return: ssl_enable (bool), ssl_verify (bool). """ - return False, False diff --git a/pilot/user/sphenix/container.py b/pilot/user/sphenix/container.py index bf0572c5..2dc24bc4 100644 --- a/pilot/user/sphenix/container.py +++ b/pilot/user/sphenix/container.py @@ -28,9 +28,8 @@ def do_use_container(**kwargs): Decide whether to use a container or not. :param kwargs: dictionary of key-word arguments. - :return: True is function has decided that a container should be used, False otherwise (boolean). + :return: True if the function has decided that a container should be used, False otherwise (bool). """ - return True @@ -39,11 +38,10 @@ def wrapper(executable, **kwargs): Wrapper function for any container specific usage. This function will be called by pilot.util.container.execute() and prepends the executable with a container command. - :param executable: command to be executed (string). - :param kwargs: dictionary of key-word arguments. - :return: executable wrapped with container command (string).
+ :param executable: command to be executed (str) + :param kwargs: dictionary of key-word arguments (dict) + :return: executable wrapped with container command (str). """ - return executable @@ -55,9 +53,8 @@ def create_stagein_container_command(workdir, cmd): it in a stagein.sh script file. It then generates the actual command that will execute the stage-in script in a container. - :param workdir: working directory where script will be stored (string). - :param cmd: isolated stage-in command (string). - :return: container command to be executed (string). + :param workdir: working directory where script will be stored (str). + :param cmd: isolated stage-in command (str) + :return: container command to be executed (str). """ - return cmd diff --git a/pilot/user/sphenix/copytool_definitions.py b/pilot/user/sphenix/copytool_definitions.py index d4530af0..878c3644 100644 --- a/pilot/user/sphenix/copytool_definitions.py +++ b/pilot/user/sphenix/copytool_definitions.py @@ -26,13 +26,12 @@ def mv_to_final_destination(): """ Is mv allowed to move files to/from final destination? - :return: Boolean. + :return: True (bool). """ - return True -def get_path(scope, lfn): +def get_path(scope: str, lfn: str) -> str: """ Construct a partial Rucio PFN using the scope and the LFN /md5(:)[0:2]/md5()[2:4]/ @@ -40,12 +39,11 @@ def get_path(scope, lfn): E.g. scope = 'user.jwebb2', lfn = 'user.jwebb2.66999._000001.top1outDS.tar' -> 'user/jwebb2/01/9f/user.jwebb2.66999._000001.top1outDS.tar' - :param scope: scope (string). - :param lfn: LFN (string). - :return: partial rucio path (string). + :param scope: scope (str) + :param lfn: LFN (str) + :return: partial rucio path (str). """ - - s = '%s:%s' % (scope, lfn) + s = f'{scope}:{lfn}' hash_hex = md5(s.encode('utf-8')).hexdigest() paths = scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn] paths = [_f for _f in paths if _f] # remove empty parts to avoid double /-chars diff --git a/pilot/user/sphenix/cpu.py b/pilot/user/sphenix/cpu.py index 6f2bdc6f..7f7c967f 100644 --- a/pilot/user/sphenix/cpu.py +++ b/pilot/user/sphenix/cpu.py @@ -19,24 +19,24 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +from typing import Any from pilot.util.container import execute import logging logger = logging.getLogger(__name__) -def get_core_count(job): +def get_core_count(job: Any) -> int: """ Return the core count. - :param job: job object. + :param job: job object (Any) :return: core count (int). """ - return 0 -def add_core_count(corecount, core_counts=[]): +def add_core_count(corecount: int, core_counts: list = []) -> list: """ Add a core count measurement to the list of core counts. @@ -44,7 +44,6 @@ def add_core_count(corecount, core_counts=[]): :param core_counts: list of core counts (list). :return: updated list of core counts (list). """ - return core_counts.append(corecount) @@ -52,26 +51,23 @@ def set_core_counts(**kwargs): """ Set the number of used cores. - :param kwargs: kwargs (dictionary). 
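As a worked example of the get_path() hashing shown above, and reusing the values from its own docstring, the partial Rucio PFN is derived from the md5 of "scope:lfn":

from hashlib import md5

scope = 'user.jwebb2'
lfn = 'user.jwebb2.66999._000001.top1outDS.tar'
hash_hex = md5(f'{scope}:{lfn}'.encode('utf-8')).hexdigest()
paths = scope.split('.') + [hash_hex[0:2], hash_hex[2:4], lfn]
print('/'.join([_f for _f in paths if _f]))
# expected, per the docstring example: user/jwebb2/01/9f/user.jwebb2.66999._000001.top1outDS.tar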
- :return: + :param kwargs: kwargs (dict) """ - job = kwargs.get('job', None) if job and job.pgrp: - cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (job.pgrp, job.pgrp) + cmd = f"ps axo pgid,psr | sort | grep {job.pgrp} | uniq | awk '{{print $1}}' | grep -x {job.pgrp} | wc -l" exit_code, stdout, stderr = execute(cmd, mute=True) - logger.debug('%s: %s' % (cmd, stdout)) + logger.debug(f'{cmd}: {stdout}') try: job.actualcorecount = int(stdout) except Exception as e: - logger.warning('failed to convert number of actual cores to int: %s' % e) + logger.warning(f'failed to convert number of actual cores to int: {e}') else: - logger.debug('set number of actual cores to: %d' % job.actualcorecount) + logger.debug(f'set number of actual cores to: {job.actualcorecount}') # overwrite the original core count and add it to the list job.corecount = job.actualcorecount job.corecounts = add_core_count(job.actualcorecount) - logger.debug('current core counts list: %s' % str(job.corecounts)) - + logger.debug(f'current core counts list: {job.corecounts}') else: logger.debug('payload process group not set - cannot check number of cores used by payload') diff --git a/pilot/user/sphenix/diagnose.py b/pilot/user/sphenix/diagnose.py index b24bd276..9e3dca58 100644 --- a/pilot/user/sphenix/diagnose.py +++ b/pilot/user/sphenix/diagnose.py @@ -20,6 +20,7 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 import os +from typing import Any from pilot.util.config import config from pilot.util.filehandling import read_file, tail @@ -30,14 +31,13 @@ logger = logging.getLogger(__name__) -def interpret(job): +def interpret(job: Any) -> int: """ Interpret the payload, look for specific errors in the stdout. :param job: job object :return: exit code (payload) (int). """ - # since the payload have finished, we can look for output files and update the output file list update_job_data(job) @@ -53,23 +53,22 @@ def interpret(job): return 0 -def get_log_extracts(job, state): +def get_log_extracts(job: Any, state: str) -> str: """ Extract special warnings and other other info from special logs. This function also discovers if the payload had any outbound connections. - :param job: job object. - :param state: job state (string). - :return: log extracts (string). + :param job: job object (Any) + :param state: job state (str) + :return: log extracts (str). """ - logger.info("building log extracts (sent to the server as \'pilotLog\')") # for failed/holding jobs, add extracts from the pilot log file, but always add it to the pilot log itself extracts = "" _extracts = get_pilot_log_extracts(job) if _extracts != "": - logger.warning('detected the following tail of warning/fatal messages in the pilot log:\n%s' % _extracts) + logger.warning(f'detected the following tail of warning/fatal messages in the pilot log:\n{_extracts}') if state == 'failed' or state == 'holding': extracts += _extracts @@ -80,10 +79,9 @@ def get_pilot_log_extracts(job): """ Get the extracts from the pilot log (warning/fatal messages, as well as tail of the log itself). - :param job: job object. - :return: tail of pilot log (string). + :param job: job object (Any) + :return: tail of pilot log (str). 
""" - extracts = "" path = os.path.join(job.workdir, config.Pilot.pilotlog) @@ -93,9 +91,9 @@ def get_pilot_log_extracts(job): if _tail != "": if extracts != "": extracts += "\n" - extracts += "- Log from %s -\n" % config.Pilot.pilotlog + extracts += f"- Log from {config.Pilot.pilotlog} -\n" extracts += _tail else: - logger.warning('pilot log file does not exist: %s' % path) + logger.warning(f'pilot log file does not exist: {path}') return extracts diff --git a/pilot/user/sphenix/jobdata.py b/pilot/user/sphenix/jobdata.py index 41c657d0..9a56993a 100644 --- a/pilot/user/sphenix/jobdata.py +++ b/pilot/user/sphenix/jobdata.py @@ -16,42 +16,58 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-24 -#import re +"""Functions related to job data.""" #import logging +#import re #logger = logging.getLogger(__name__) -def jobparams_prefiltering(value): +def jobparams_prefiltering(value: str) -> (dict, str): """ Perform pre-filtering of raw job parameters to avoid problems with especially quotation marks. + The function can extract some fields from the job parameters to be put back later after actual filtering. E.g. ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" ' will otherwise become ' --athenaopts 'HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER' ' which will prevent the environmental variable to be unfolded. - :param value: job parameters (string). - :return: list of fields excluded from job parameters (list), updated job parameters (string). + :param value: job parameters (str) + :return: dictionary of fields excluded from job parameters (dict), updated job parameters (str). """ - exclusions = {} # Add regex patterns here - + # .. return exclusions, value -def jobparams_postfiltering(value, exclusions={}): +def jobparams_postfiltering(value: str, exclusions: dict = None) -> str: """ Perform post-filtering of raw job parameters. + Any items in the optional exclusion list will be added (space separated) at the end of the job parameters. - :param value: job parameters (string). - :param optional exclusions: exlusions dictionary from pre-filtering function (dictionary). - :return: updated job parameters (string). + :param value: job parameters (str) + :param exclusions: exclusions dictionary from pre-filtering function (dict) + :return: updated job parameters (str). """ + if exclusions is None: # avoid pylint warning + exclusions = {} + + for item in exclusions: + value = value.replace(item, exclusions[item]) return value + + +def fail_at_getjob_none() -> bool: + """ + Return a boolean value indicating whether to fail when getJob returns None. + + :return: True (bool). + """ + return True diff --git a/pilot/user/sphenix/jobmetrics.py b/pilot/user/sphenix/jobmetrics.py index 864983d4..01255974 100644 --- a/pilot/user/sphenix/jobmetrics.py +++ b/pilot/user/sphenix/jobmetrics.py @@ -22,12 +22,15 @@ # from pilot.util.jobmetrics import get_job_metrics_entry import logging +from typing import Any + logger = logging.getLogger(__name__) -def get_job_metrics(job, extra={}): +def get_job_metrics(job: Any, extra: dict = {}) -> str: """ Return a properly formatted job metrics string. + The format of the job metrics string is defined by the server. It will be reported to the server during updateJob. 
Example of job metrics: @@ -35,9 +38,8 @@ Format: nEvents= nEventsW= vmPeakMax= vmPeakMean= RSSMean= hs06= shutdownTime= cpuFactor= cpuLimit= diskLimit= jobStart= memLimit= runLimit= - :param job: job object + :param job: job object (Any) :param extra: any extra information to be added (dict) - :return: job metrics (string). + :return: job metrics (str). """ - return "" diff --git a/pilot/user/sphenix/loopingjob_definitions.py b/pilot/user/sphenix/loopingjob_definitions.py index 782b0fd5..5f9d88fa 100644 --- a/pilot/user/sphenix/loopingjob_definitions.py +++ b/pilot/user/sphenix/loopingjob_definitions.py @@ -20,29 +20,28 @@ # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 -def allow_loopingjob_detection(): +def allow_loopingjob_detection() -> bool: """ Should the looping job detection algorithm be allowed? + The looping job detection algorithm finds recently touched files within the job's workdir. If a found file has not been touched during the allowed time limit (see looping job section in util/default.cfg), the algorithm will kill the job/payload process. - :return: boolean. + :return: True (bool). """ - return True -def remove_unwanted_files(workdir, files): +def remove_unwanted_files(workdir: str, files: list) -> list: """ Remove files from the list that are to be ignored by the looping job algorithm. - :param workdir: working directory (string). Needed in case the find command includes the workdir in the list of - recently touched files. - :param files: list of recently touched files (file names). - :return: filtered files list. + :param workdir: working directory. Needed in case the find command includes the workdir in the list of + recently touched files (str) + :param files: list of recently touched files (list) + :return: filtered files (list). """ - _files = [] for _file in files: if not (workdir == _file or "prmon" in _file or "/pilot" in _file or ".py" in _file or "pandaJob" in _file): _files.append(_file) - return _files diff --git a/pilot/user/sphenix/memory.py b/pilot/user/sphenix/memory.py index aed36cb2..3eafa700 100644 --- a/pilot/user/sphenix/memory.py +++ b/pilot/user/sphenix/memory.py @@ -19,25 +19,25 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +from typing import Any -def allow_memory_usage_verifications(): + +def allow_memory_usage_verifications() -> bool: """ Should memory usage verifications be performed? - :return: boolean. + :return: False (bool). """ - return False -def memory_usage(job): +def memory_usage(job: Any) -> (int, str): """ Perform memory usage verification. - :param job: job object - :return: exit code (int), diagnostics (string). + :param job: job object (Any) + :return: exit code (int), diagnostics (str). """ - exit_code = 0 diagnostics = "" diff --git a/pilot/user/sphenix/monitoring.py b/pilot/user/sphenix/monitoring.py index 4962151c..b42e5917 100644 --- a/pilot/user/sphenix/monitoring.py +++ b/pilot/user/sphenix/monitoring.py @@ -19,15 +19,16 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2021-23 +from typing import Any -def fast_monitor_tasks(job): + +def fast_monitor_tasks(job: Any) -> int: """ Perform fast monitoring tasks. - :param job: job object. - :return: exit code (int) + :param job: job object (Any) + :return: exit code (int).
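As a usage sketch of the remove_unwanted_files() filter above, with a hypothetical list of recently touched files; only the payload output survives:

workdir = '/scratch/panda/job_12345'               # hypothetical work directory
files = [workdir,
         f'{workdir}/prmon.log',                   # memory monitor output, ignored
         f'{workdir}/pilotlog.txt',                # pilot file, ignored ("/pilot" substring)
         f'{workdir}/runjob.py',                   # python script, ignored
         f'{workdir}/pandaJob.out',                # job description, ignored
         f'{workdir}/DST_output.root']             # payload output, kept
kept = [f for f in files
        if not (workdir == f or 'prmon' in f or '/pilot' in f or '.py' in f or 'pandaJob' in f)]
print(kept)   # ['/scratch/panda/job_12345/DST_output.root']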
""" - exit_code = 0 return exit_code diff --git a/pilot/user/sphenix/proxy.py b/pilot/user/sphenix/proxy.py index 3d83b7ee..050bf160 100644 --- a/pilot/user/sphenix/proxy.py +++ b/pilot/user/sphenix/proxy.py @@ -25,52 +25,51 @@ logger = logging.getLogger(__name__) -def verify_proxy(limit=None, x509=None, proxy_id="pilot", test=False): +def verify_proxy(limit: int = None, x509: bool = None, proxy_id: str = "pilot", test: bool = False) -> (int, str): """ Check for a valid voms/grid proxy longer than N hours. Use `limit` to set required time limit. - :param limit: time limit in hours (int). - :param test: free Boolean test parameter. - :return: exit code (NOPROXY or NOVOMSPROXY), diagnostics (error diagnostics string). + :param limit: time limit in hours (int) + :param x509: points to the proxy file. If not set (=None) - get proxy file from X509_USER_PROXY environment (bool) + :param proxy_id: proxy id (str) + :param test: free Boolean test parameter (bool) + :return: exit code (NOPROXY or NOVOMSPROXY) (int), diagnostics (error diagnostics string) (str). """ - return 0, "" -def get_voms_role(role='production'): +def get_voms_role(role: str = 'production') -> str: """ Return the proper voms role. - :param role: proxy role, 'production' or 'user' (string). - :return: voms role (string). + :param role: proxy role, 'production' or 'user' (str). + :return: voms role (str). """ - return '' -def get_and_verify_proxy(x509, voms_role='', proxy_type='', workdir=''): +def get_and_verify_proxy(x509: str, voms_role: str = '', proxy_type: str = '', workdir: str = '') -> (int, str, str): """ Download a payload proxy from the server and verify it. - :param x509: X509_USER_PROXY (string). - :param voms_role: role, e.g. 'sphenix' (string). - :param proxy_type: proxy type ('payload' for user payload proxy, blank for prod/user proxy) (string). - :param workdir: payload work directory (string). - :return: exit code (int), diagnostics (string), updated X509_USER_PROXY (string). + :param x509: X509_USER_PROXY (str) + :param voms_role: role, e.g. 'sphenix' (str) + :param proxy_type: proxy type ('payload' for user payload proxy, blank for prod/user proxy) (str) + :param workdir: payload work directory (str) + :return: exit code (int), diagnostics (str), updated X509_USER_PROXY (str). """ - exit_code = 0 diagnostics = "" return exit_code, diagnostics, x509 -def getproxy_dictionary(voms_role): +def getproxy_dictionary(voms_role: str) -> dict: """ - Prepare the dictionary for the getProxy call. + Prepare the dictionary with the VOMS role for the getProxy call. - :param voms_role: VOMS role (string). + :param voms_role: VOMS role (str) + :return: getProxy dictionary (dict). """ - return {'role': voms_role} diff --git a/pilot/user/sphenix/setup.py b/pilot/user/sphenix/setup.py index 5459b1ca..9099f600 100644 --- a/pilot/user/sphenix/setup.py +++ b/pilot/user/sphenix/setup.py @@ -24,6 +24,7 @@ import glob from time import sleep from datetime import datetime +from typing import Any from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import NoSoftwareDir @@ -38,50 +39,48 @@ errors = ErrorCodes() -def get_file_system_root_path(): +def get_file_system_root_path() -> str: """ Return the root path of the local file system. + The function returns "/cvmfs" or "/(some path)/cvmfs" in case the expected file system root path is not where it usually is (e.g. on an HPC). A site can set the base path by exporting ATLAS_SW_BASE. - :return: path (string) + :return: path (str). 
""" - return os.environ.get('ATLAS_SW_BASE', '/cvmfs') -def get_alrb_export(add_if=False): +def get_alrb_export(add_if: bool = False) -> str: """ Return the export command for the ALRB path if it exists. + If the path does not exist, return empty string. - :param add_if: Boolean. True means that an if statement will be placed around the export. - :return: export command + :param add_if: True means that an if statement will be placed around the export (bool) + :return: export command (str). """ - - path = "%s/atlas.cern.ch/repo" % get_file_system_root_path() - cmd = "export ATLAS_LOCAL_ROOT_BASE=%s/ATLASLocalRootBase;" % path if os.path.exists(path) else "" - - # if [ -z "$ATLAS_LOCAL_ROOT_BASE" ]; then export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; fi; + path = f"{get_file_system_root_path()}/atlas.cern.ch/repo" + cmd = f"export ATLAS_LOCAL_ROOT_BASE={path}/ATLASLocalRootBase;" if os.path.exists(path) else "" if cmd and add_if: cmd = 'if [ -z \"$ATLAS_LOCAL_ROOT_BASE\" ]; then ' + cmd + ' fi;' return cmd -def get_asetup(asetup=True, alrb=False, add_if=False): +def get_asetup(asetup: bool = True, alrb: bool = False, add_if: bool = False) -> str: """ - Define the setup for asetup, i.e. including full path to asetup and setting of ATLAS_LOCAL_ROOT_BASE + Define the setup for asetup, i.e. including full path to asetup and setting of ATLAS_LOCAL_ROOT_BASE. + Only include the actual asetup script if asetup=True. This is not needed if the jobPars contain the payload command but the pilot still needs to add the exports and the atlasLocalSetup. - :param asetup: Boolean. True value means that the pilot should include the asetup command. - :param alrb: Boolean. True value means that the function should return special setup used with ALRB and containers. - :param add_if: Boolean. True means that an if statement will be placed around the export. - :raises: NoSoftwareDir if appdir does not exist. - :return: source /asetup.sh (string). + :param asetup: True value means that the pilot should include the asetup command (bool) + :param alrb: True value means that the function should return special setup used with ALRB and containers (bool) + :param add_if: True means that an if statement will be placed around the export (bool) + :raises: NoSoftwareDir if appdir does not exist + :return: source /asetup.sh (str). """ - cmd = "" alrb_cmd = get_alrb_export(add_if=add_if) if alrb_cmd != "": @@ -100,50 +99,50 @@ def get_asetup(asetup=True, alrb=False, add_if=False): if appdir != "": # make sure that the appdir exists if not os.path.exists(appdir): - msg = 'appdir does not exist: %s' % appdir + msg = f'appdir does not exist: {appdir}' logger.warning(msg) raise NoSoftwareDir(msg) if asetup: - cmd = "source %s/scripts/asetup.sh" % appdir + cmd = f"source {appdir}/scripts/asetup.sh" return cmd -def get_analysis_trf(transform, workdir): +def get_analysis_trf(transform: str, workdir: str) -> (int, str, str): """ Prepare to download the user analysis transform with curl. + The function will verify the download location from a known list of hosts. :param transform: full trf path (url) (string). :param workdir: work directory (string). 
:return: exit code (int), diagnostics (string), transform_name (string) """ - ec = 0 diagnostics = "" # test if $HARVESTER_WORKDIR is set harvester_workdir = os.environ.get('HARVESTER_WORKDIR') if harvester_workdir is not None: - search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir - logger.debug("search_pattern - %s" % search_pattern) + search_pattern = f"{harvester_workdir}/jobO.*.tar.gz" + logger.debug(f"search_pattern - {search_pattern}") jobopt_files = glob.glob(search_pattern) for jobopt_file in jobopt_files: - logger.debug("jobopt_file = %s workdir = %s" % (jobopt_file, workdir)) + logger.debug(f"jobopt_file = {jobopt_file} workdir = {workdir}") try: copy(jobopt_file, workdir) except Exception as e: - logger.error("could not copy file %s to %s : %s" % (jobopt_file, workdir, e)) + logger.error(f"could not copy file {jobopt_file} to {workdir} : {e}") if '/' in transform: transform_name = transform.split('/')[-1] else: - logger.warning('did not detect any / in %s (using full transform name)' % transform) + logger.warning(f'did not detect any / in {transform} (using full transform name)') transform_name = transform # is the command already available? (e.g. if already downloaded by a preprocess/main process step) if os.path.exists(os.path.join(workdir, transform_name)): - logger.info('script %s is already available - no need to download again' % transform_name) + logger.info(f'script {transform_name} is already available - no need to download again') return ec, diagnostics, transform_name original_base_url = "" @@ -155,14 +154,14 @@ def get_analysis_trf(transform, workdir): break if original_base_url == "": - diagnostics = "invalid base URL: %s" % transform + diagnostics = f"invalid base URL: {transform}" return errors.TRFDOWNLOADFAILURE, diagnostics, "" # try to download from the required location, if not - switch to backup status = False for base_url in get_valid_base_urls(order=original_base_url): trf = re.sub(original_base_url, base_url, transform) - logger.debug("attempting to download script: %s" % trf) + logger.debug(f"attempting to download script: {trf}") status, diagnostics = download_transform(trf, transform_name, workdir) if status: break @@ -172,27 +171,27 @@ def get_analysis_trf(transform, workdir): logger.info("successfully downloaded script") path = os.path.join(workdir, transform_name) - logger.debug("changing permission of %s to 0o755" % path) + logger.debug(f"changing permission of {path} to 0o755") try: os.chmod(path, 0o755) # Python 2/3 except Exception as e: - diagnostics = "failed to chmod %s: %s" % (transform_name, e) + diagnostics = f"failed to chmod {transform_name}: {e}" return errors.CHMODTRF, diagnostics, "" return ec, diagnostics, transform_name -def get_valid_base_urls(order=None): +def get_valid_base_urls(order: str = "") -> list: """ Return a list of valid base URLs from where the user analysis transform may be downloaded from. + If order is defined, return given item first. E.g. order=http://atlpan.web.cern.ch/atlpan -> ['http://atlpan.web.cern.ch/atlpan', ...] NOTE: the URL list may be out of date. - :param order: order (string). + :param order: order (str) :return: valid base URLs (list). 
""" - valid_base_urls = [] _valid_base_urls = ["https://storage.googleapis.com/drp-us-central1-containers", "http://pandaserver-doma.cern.ch:25080/trf/user"] @@ -208,19 +207,19 @@ def get_valid_base_urls(order=None): return valid_base_urls -def download_transform(url, transform_name, workdir): - """ - Download the transform from the given url - :param url: download URL with path to transform (string). - :param transform_name: trf name (string). - :param workdir: work directory (string). - :return: +def download_transform(url: str, transform_name: str, workdir: str): """ + Download the transform from the given url. + :param url: download URL with path to transform (str) + :param transform_name: trf name (str) + :param workdir: work directory (str) + :return: status (bool), diagnostics (str). + """ status = False diagnostics = "" path = os.path.join(workdir, transform_name) - cmd = 'curl -sS \"%s\" > %s' % (url, path) + cmd = f'curl -sS "{url}" > {path}' trial = 1 max_trials = 3 @@ -235,29 +234,29 @@ def download_transform(url, transform_name, workdir): status = True except Exception as error: status = False - diagnostics = "Failed to copy file %s to %s : %s" % (source_path, path, error) + diagnostics = f"Failed to copy file {source_path} to {path} : {error}" logger.error(diagnostics) # try to download the trf a maximum of 3 times while trial <= max_trials: - logger.info("executing command [trial %d/%d]: %s" % (trial, max_trials, cmd)) + logger.info(f"executing command [trial {trial}/{max_trials}]: {cmd}") exit_code, stdout, stderr = execute(cmd, mute=True) if not stdout: stdout = "(None)" if exit_code != 0: # Analyze exit code / output - diagnostics = "curl command failed: %d, %s, %s" % (exit_code, stdout, stderr) + diagnostics = f"curl command failed: {exit_code}, {stdout}, {stderr}" logger.warning(diagnostics) if trial == max_trials: - logger.fatal('could not download transform: %s' % stdout) + logger.fatal(f'could not download transform: {stdout}') status = False break else: logger.info("will try again after 60 s") sleep(60) else: - logger.info("curl command returned: %s" % stdout) + logger.info(f"curl command returned: {stdout}") status = True break trial += 1 @@ -265,17 +264,17 @@ def download_transform(url, transform_name, workdir): return status, diagnostics -def get_end_setup_time(path, pattern=r'(\d{2}\:\d{2}\:\d{2}\ \d{4}\/\d{2}\/\d{2})'): +def get_end_setup_time(path: str, pattern: str = r'(\d{2}\:\d{2}\:\d{2}\ \d{4}\/\d{2}\/\d{2})') -> float: """ Extract a more precise end of setup time from the payload stdout. + File path should be verified already. The function will look for a date time in the beginning of the payload stdout with the given pattern. - :param path: path to payload stdout (string). - :param pattern: regular expression pattern (raw string). + :param path: path to payload stdout (str) + :param pattern: regular expression pattern (raw str) :return: time in seconds since epoch (float). """ - end_time = None head_list = head(path, count=50) time_string = find_pattern_in_list(head_list, pattern) @@ -286,50 +285,50 @@ def get_end_setup_time(path, pattern=r'(\d{2}\:\d{2}\:\d{2}\ \d{4}\/\d{2}\/\d{2} return end_time -def get_schedconfig_priority(): +def get_schedconfig_priority() -> list: """ Return the prioritized list for the schedconfig sources. + This list is used to determine which source to use for the queuedatas, which can be different for different users. 
The sources themselves are defined in info/extinfo/load_queuedata() (minimal set) and load_schedconfig_data() (full set). - :return: prioritized DDM source list. + :return: prioritized DDM source list (list). """ - return ['LOCAL', 'CVMFS', 'CRIC', 'PANDA'] -def get_queuedata_priority(): +def get_queuedata_priority() -> list: """ Return the prioritized list for the schedconfig sources. + This list is used to determine which source to use for the queuedatas, which can be different for different users. The sources themselves are defined in info/extinfo/load_queuedata() (minimal set) and load_schedconfig_data() (full set). - :return: prioritized DDM source list. + :return: prioritized DDM source list (list). """ - return ['LOCAL', 'PANDA', 'CVMFS', 'CRIC'] -def get_ddm_source_priority(): +def get_ddm_source_priority() -> list: """ Return the prioritized list for the DDM sources. + This list is used to determine which source to use for the DDM endpoints, which can be different for different users. The sources themselves are defined in info/extinfo/load_storage_data(). - :return: prioritized DDM source list. + :return: prioritized DDM source list (list). """ - return ['USER', 'LOCAL', 'CVMFS', 'CRIC', 'PANDA'] -def should_verify_setup(job): +def should_verify_setup(job: Any) -> bool: """ - Should the setup command be verified? + Determine if the setup command should be verified. - :param job: job object. - :return: Boolean. + :param job: job object (Any) + :return: False (bool). """ return False diff --git a/pilot/user/sphenix/utilities.py b/pilot/user/sphenix/utilities.py index 13a767fa..4515190a 100644 --- a/pilot/user/sphenix/utilities.py +++ b/pilot/user/sphenix/utilities.py @@ -22,6 +22,7 @@ import os import time from re import search +from typing import Any # from pilot.info import infosys # from .setup import get_asetup @@ -34,14 +35,13 @@ logger = logging.getLogger(__name__) -def get_memory_monitor_summary_filename(selector=None): +def get_memory_monitor_summary_filename(selector: bool = False) -> str: """ Return the name for the memory monitor summary file. - :param selector: special conditions flag (boolean). - :return: File name (string). + :param selector: special conditions flag (bool) + :return: File name (str). """ - name = "memory_monitor_summary.json" if selector: name += '_snapshot' @@ -49,36 +49,38 @@ def get_memory_monitor_summary_filename(selector=None): return name -def get_memory_monitor_output_filename(suffix='txt'): +def get_memory_monitor_output_filename(suffix: str = 'txt') -> str: """ Return the filename of the memory monitor text output file. - :return: File name (string). + :param suffix: suffix (str) + :return: File name (str). """ - - return "memory_monitor_output.%s" % suffix + return f"memory_monitor_output.{suffix}" -def get_memory_monitor_setup(pid, pgrp, jobid, workdir, command, setup="", use_container=True, transformation="", outdata=None, dump_ps=False): +def get_memory_monitor_setup(pid: int, pgrp: int, jobid: int, workdir: str, command: str, setup: str = "", + use_container: bool = True, transformation: str = "", outdata: list = [], + dump_ps: bool = False) -> (str, int): """ Return the proper setup for the memory monitor. + If the payload release is provided, the memory monitor can be setup with the same release. Until early 2018, the memory monitor was still located in the release area. After many problems with the memory monitor, it was decided to use a fixed version for the setup. Currently, release 21.0.22 is used. 
- :param pid: job process id (int). - :param pgrp: process group id (int). - :param jobid: job id (int). - :param workdir: job work directory (string). - :param command: payload command (string). - :param setup: optional setup in case asetup can not be used, which uses infosys (string). - :param use_container: optional boolean. - :param transformation: optional name of transformation, e.g. Sim_tf.py (string). - :param outdata: optional list of output fspec objects (list). - :param dump_ps: should ps output be dumped when identifying prmon process? (Boolean). - :return: job work directory (string), pid for process inside container (int). + :param pid: job process id (int) + :param pgrp: process group id (int) + :param jobid: job id (int) + :param workdir: job work directory (str) + :param command: payload command (str) + :param setup: optional setup in case asetup can not be used, which uses infosys (str) + :param use_container: optional boolean (bool) + :param transformation: optional name of transformation, e.g. Sim_tf.py (str) + :param outdata: optional list of output fspec objects (list) + :param dump_ps: should ps output be dumped when identifying prmon process? (bool) + :return: job work directory (str), pid for process inside container (int). """ - # try to get the pid from a pid.txt file which might be created by a container_script pid = get_proper_pid(pid, pgrp, jobid, command=command, transformation=transformation, outdata=outdata, use_container=use_container, dump_ps=dump_ps) if pid == -1: @@ -97,33 +99,33 @@ def get_memory_monitor_setup(pid, pgrp, jobid, workdir, command, setup="", use_c path += '/' cmd = f"{path}prmon" interval = 60 - options = " --pid %d --filename %s --json-summary %s --interval %d" %\ - (pid, get_memory_monitor_output_filename(), get_memory_monitor_summary_filename(), interval) - #cmd = "cd " + workdir + ";" + setup + cmd + options + options = f" --pid {pid} --filename {get_memory_monitor_output_filename()} " \ + f"--json-summary {get_memory_monitor_summary_filename()} --interval {interval}" cmd = "cd " + workdir + ";" + cmd + options return cmd, pid -def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", use_container=True, dump_ps=False): +def get_proper_pid(pid: int, pgrp: int, jobid: int, command: str = "", transformation: str = "", outdata: str = "", + use_container: bool = True, dump_ps: bool = False) -> int: """ Return a pid from the proper source to be used with the memory monitor. + The given pid comes from Popen(), but in the case containers are used, the pid should instead come from a ps aux lookup. If the main process has finished before the proper pid has been identified (it will take time if the payload is running inside a container), then this function will abort and return -1. The called should handle this and not launch the memory monitor as it is not needed any longer. - :param pid: process id (int). - :param pgrp: process group id (int). - :param jobid: job id (int). - :param command: payload command (string). - :param transformation: optional name of transformation, e.g. Sim_tf.py (string). - :param outdata: list of output fspec object (list). - :param use_container: optional boolean. + :param pid: process id (int) + :param pgrp: process group id (int) + :param jobid: job id (int) + :param command: payload command (str) + :param transformation: optional name of transformation, e.g. 
Sim_tf.py (str) + :param outdata: list of output fspec object (list) + :param use_container: optional boolean (bool) :return: pid (int). """ - if not use_container: return pid @@ -151,13 +153,13 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", return -1 ps = get_ps_info(pgrp) - logger.debug('ps:\n%s' % ps) + logger.debug(f'ps:\n{ps}') # lookup the process id using ps aux logger.debug(f'attempting to identify pid from job id ({jobid})') _pid = get_pid_for_jobid(ps, jobid) if _pid: - logger.debug('discovered pid=%d for job id %s' % (_pid, jobid)) + logger.debug(f'discovered pid {_pid} for job id {jobid}') break #logger.debug('attempting to identify pid from transform name and its output') @@ -166,7 +168,7 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", # logger.debug('discovered pid=%d for transform name \"%s\"' % (_pid, transformation)) # break - logger.warning('payload pid has not yet been identified (#%d/#%d)' % (i + 1, imax)) + logger.warning(f'payload pid has not yet been identified (#{i + 1}/#{imax})') # wait until the payload has launched time.sleep(5) @@ -175,41 +177,35 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", if _pid: pid = _pid - logger.info('will use pid=%d for memory monitor' % pid) + logger.info(f'will use pid {pid} for memory monitor') return pid -def get_ps_info(pgrp, whoami=None, options='axfo pid,user,args'): +def get_ps_info(pgrp: int, whoami: str = "", options: str = "axfo pid,user,args") -> str: """ Return ps info for the given user. - :param pgrp: process group id (int). - :param whoami: user name (string). - :return: ps aux for given user (string). + :param pgrp: process group id (int) + :param whoami: user name (str) + :return: ps aux for given user (str). """ - if not whoami: whoami = os.getuid() - cmd = "ps -u %s %s" % (whoami, options) - #cmd = "ps %s | grep %s" % (options, whoami) - #cmd = "ps %s | grep %s | awk -v p=%s '$1 == p {print $5}" % (options, whoami, pgrp) - #cmd = "ps %s | awk -v p=%s '$1 == p {print $5}" % (options, pgrp) - exit_code, stdout, stderr = execute(cmd) + _, stdout, _ = execute(f"ps -u {whoami} {options}") return stdout -def get_pid_for_jobid(ps, jobid): +def get_pid_for_jobid(ps: str, jobid: int) -> int: """ Return the process id for the ps entry that contains the job id. - :param ps: ps command output (string). - :param jobid: PanDA job id (int). + :param ps: ps command output (str) + :param jobid: PanDA job id (int) :return: pid (int) or None if no such process. """ - pid = None for line in ps.split('\n'): @@ -218,18 +214,19 @@ def get_pid_for_jobid(ps, jobid): _pid = search(r'(\d+) ', line) try: pid = int(_pid.group(1)) - except Exception as e: - logger.warning('pid has wrong type: %s' % e) + except Exception as exc: + logger.warning(f'pid has wrong type: {exc}') else: - logger.debug('extracted pid=%d from ps output' % pid) + logger.debug(f'extracted pid {pid} from ps output') break return pid -def get_pid_for_trf(ps, transformation, outdata): +def get_pid_for_trf(ps: str, transformation: str, outdata: Any) -> int: """ Return the process id for the given command and user. + Note: function returns 0 in case pid could not be found. :param ps: ps command output (string). @@ -237,14 +234,13 @@ def get_pid_for_trf(ps, transformation, outdata): :param outdata: fspec objects (list). :return: pid (int) or None if no such process. 
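A small, self-contained illustration of the pid lookup performed by get_pid_for_jobid() above, applied to hypothetical "ps axfo pid,user,args" output:

import re

ps_output = (" 1234 user python3 pilot3/pilot.py\n"
             " 5678 user /bin/bash runcontainer 6219604371\n"    # hypothetical container wrapper
             " 5690 user python3 transform.py")
jobid = '6219604371'

pid = None
for line in ps_output.split('\n'):
    if jobid in line:
        match = re.search(r'(\d+) ', line)      # leading pid column
        if match:
            pid = int(match.group(1))
            break
print(pid)   # -> 5678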
""" - pid = None candidates = [] # in the case of user analysis job, the transformation will contain a URL which should be stripped if "/" in transformation: transformation = transformation.split('/')[-1] - logger.debug('using transformation name: %s' % transformation) + logger.debug(f'using transformation name: {transformation}') for line in ps.split('\n'): if transformation in line: candidates.append(line) @@ -258,30 +254,30 @@ def get_pid_for_trf(ps, transformation, outdata): _pid = search(r'(\d+) ', line) try: pid = int(_pid.group(1)) - except Exception as e: - logger.warning('pid has wrong type: %s' % e) + except Exception as exc: + logger.warning(f'pid has wrong type: {exc}') else: - logger.debug('extracted pid=%d from ps output' % pid) + logger.debug(f'extracted pid {pid} from ps output') break if pid: break else: - logger.debug('pid not found in ps output for trf=%s' % transformation) + logger.debug(f'pid not found in ps output for trf={transformation}') return pid -def get_pid_for_command(ps, command="python pilot3/pilot.py"): +def get_pid_for_command(ps: str, command: str = "python pilot3/pilot.py") -> int: """ Return the process id for the given command and user. + The function returns 0 in case pid could not be found. If no command is specified, the function looks for the "python pilot3/pilot.py" command in the ps output. - :param ps: ps command output (string). - :param command: command string expected to be in ps output (string). + :param ps: ps command output (str) + :param command: command string expected to be in ps output (str) :return: pid (int) or None if no such process. """ - pid = None found = None @@ -294,26 +290,25 @@ def get_pid_for_command(ps, command="python pilot3/pilot.py"): _pid = search(r'(\d+) ', found) try: pid = int(_pid.group(1)) - except Exception as e: - logger.warning('pid has wrong type: %s' % e) + except Exception as exc: + logger.warning(f'pid has wrong type: {exc}') else: - logger.debug('extracted pid=%d from ps output: %s' % (pid, found)) + logger.debug(f'extracted pid {pid} from ps output: {found}') else: - logger.debug('command not found in ps output: %s' % command) + logger.debug(f'command not found in ps output: {command}') return pid -def get_trf_command(command, transformation=""): +def get_trf_command(command: str, transformation: str = "") -> str: """ Return the last command in the full payload command string. Note: this function returns the last command in job.command which is only set for containers. - :param command: full payload command (string). - :param transformation: optional name of transformation, e.g. Sim_tf.py (string). - :return: trf command (string). + :param command: full payload command (str) + :param transformation: optional name of transformation, e.g. Sim_tf.py (str) + :return: trf command (str). """ - payload_command = "" if command: if not transformation: @@ -330,19 +325,19 @@ def get_trf_command(command, transformation=""): return payload_command -def get_memory_monitor_info_path(workdir, allowtxtfile=False): +def get_memory_monitor_info_path(workdir: str, allowtxtfile: bool = False) -> str: """ - Find the proper path to the utility info file + Find the proper path to the utility info file. + Priority order: 1. JSON summary file from workdir 2. JSON summary file from pilot initdir 3. Text output file from workdir (if allowtxtfile is True) - :param workdir: relevant work directory (string). - :param allowtxtfile: boolean attribute to allow for reading the raw memory monitor output. - :return: path (string). 
+ :param workdir: relevant work directory (str) + :param allowtxtfile: boolean attribute to allow for reading the raw memory monitor output (bool) + :return: path (str). """ - pilot_initdir = os.environ.get('PILOT_HOME', '') path = os.path.join(workdir, get_memory_monitor_summary_filename()) init_path = os.path.join(pilot_initdir, get_memory_monitor_summary_filename()) @@ -351,38 +346,37 @@ def get_memory_monitor_info_path(workdir, allowtxtfile=False): if os.path.exists(init_path): path = init_path else: - logger.info("neither %s, nor %s exist" % (path, init_path)) + logger.info(f"neither {path}, nor {init_path} exist") path = "" if path == "" and allowtxtfile: path = os.path.join(workdir, get_memory_monitor_output_filename()) if not os.path.exists(path): - logger.warning("file does not exist either: %s" % (path)) + logger.warning(f"file does not exist either: {path}") return path -def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 +def get_memory_monitor_info(workdir: str, allowtxtfile: bool = False, name: str = "") -> dict: # noqa: C901 """ Add the utility info to the node structure if available. - :param workdir: relevant work directory (string). - :param allowtxtfile: boolean attribute to allow for reading the raw memory monitor output. - :param name: name of memory monitor (string). - :return: node structure (dictionary). + :param workdir: relevant work directory (str) + :param allowtxtfile: boolean attribute to allow for reading the raw memory monitor output (bool) + :param name: name of memory monitor (str) + :return: node structure (dict). """ - node = {} # Get the values from the memory monitor file (json if it exists, otherwise the preliminary txt file) # Note that only the final json file will contain the totRBYTES, etc try: summary_dictionary = get_memory_values(workdir, name=name) - except Exception as e: - logger.warning('failed to get memory values from memory monitor tool: %s' % e) + except Exception as exc: + logger.warning(f'failed to get memory values from memory monitor tool: {exc}') summary_dictionary = {} else: - logger.debug("summary_dictionary=%s" % str(summary_dictionary)) + logger.debug(f"summary_dictionary={summary_dictionary}") # Fill the node dictionary if summary_dictionary and summary_dictionary != {}: @@ -403,8 +397,8 @@ def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 node['avgVMEM'] = summary_dictionary['Avg']['avgVMEM'] node['avgSWAP'] = summary_dictionary['Avg']['avgSwap'] node['avgPSS'] = summary_dictionary['Avg']['avgPSS'] - except Exception as e: - logger.warning("exception caught while parsing memory monitor file: %s" % e) + except Exception as exc: + logger.warning(f"exception caught while parsing memory monitor file: {exc}") logger.warning("will add -1 values for the memory info") node['maxRSS'] = -1 node['maxVMEM'] = -1 @@ -439,8 +433,8 @@ def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 node['avgVMEM'] = summary_dictionary['Avg']['vmem'] node['avgSWAP'] = summary_dictionary['Avg']['swap'] node['avgPSS'] = summary_dictionary['Avg']['pss'] - except Exception as e: - logger.warning("exception caught while parsing prmon file: %s" % e) + except Exception as exc: + logger.warning(f"exception caught while parsing prmon file: {exc}") logger.warning("will add -1 values for the memory info") node['maxRSS'] = -1 node['maxVMEM'] = -1 @@ -473,22 +467,21 @@ def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 return node -def 
get_max_memory_monitor_value(value, maxvalue, totalvalue): # noqa: C90 +def get_max_memory_monitor_value(value: int, maxvalue: int, totalvalue: int) -> (int, int, int): # noqa: C90 """ Return the max and total value (used by memory monitoring). Return an error code, 1, in case of value error. - :param value: value to be tested (integer). - :param maxvalue: current maximum value (integer). - :param totalvalue: total value (integer). - :return: exit code, maximum and total value (tuple of integers). + :param value: value to be tested (int) + :param maxvalue: current maximum value (int) + :param totalvalue: total value (int) + :return: exit code (int), maximum (int) and total value (int). """ - ec = 0 try: value_int = int(value) - except Exception as e: - logger.warning("exception caught: %s" % e) + except Exception as exc: + logger.warning(f"exception caught: {exc}") ec = 1 else: totalvalue += value_int @@ -498,20 +491,19 @@ def get_max_memory_monitor_value(value, maxvalue, totalvalue): # noqa: C90 return ec, maxvalue, totalvalue -def convert_unicode_string(unicode_string): +def convert_unicode_string(unicode_string: str) -> str: """ Convert a unicode string into str. - :param unicode_string: - :return: string. + :param unicode string: unicode string (str) + :return: normal string (str). """ - if unicode_string is not None: return str(unicode_string) return None -def get_average_summary_dictionary_prmon(path): +def get_average_summary_dictionary_prmon(path: str) -> dict: """ Loop over the memory monitor output file and create the averaged summary dictionary. @@ -524,10 +516,9 @@ def get_average_summary_dictionary_prmon(path): later in the function. This means that any change in the format such as new columns will be handled automatically. - :param path: path to memory monitor txt output file (string). - :return: summary dictionary. + :param path: path to memory monitor txt output file (str) + :return: summary dictionary (dict). """ - summary_dictionary = {} # get the raw memory monitor output, convert to dictionary @@ -538,12 +529,19 @@ def get_average_summary_dictionary_prmon(path): summary_dictionary = {"Max": {}, "Avg": {}, "Other": {}, "Time": {}} def filter_value(value): - """ Inline function used to remove any string or None values from data. """ + """ Filter value.""" if isinstance(value, str) or value is None: return False else: return True + def get_last_value(value_list): + """ Return the last value in the given list.""" + value = None + if value_list: + value = value_list[-1] + return value + keys = ['vmem', 'pss', 'rss', 'swap'] values = {} for key in keys: @@ -574,16 +572,15 @@ def filter_value(value): return summary_dictionary -def get_metadata_dict_from_txt(path, storejson=False, jobid=None): +def get_metadata_dict_from_txt(path: str, storejson: bool = False, jobid: str = "") -> dict: """ Convert memory monitor text output to json, store it, and return a selection as a dictionary. - :param path: - :param storejson: store dictionary on disk if True (boolean). - :param jobid: job id (string). - :return: prmon metadata (dictionary). + :param path: path to metadata file (str) + :param storejson: store dictionary on disk if True (bool) + :param jobid: job id (str) + :return: prmon metadata (dict). 
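The Max/Avg reduction carried out by get_average_summary_dictionary_prmon() can be sketched as follows, assuming the per-column lists that convert_text_file_to_dictionary() would produce; the values are made up and the key naming is simplified compared to the real summary dictionary:

columns = {'vmem': [1000000, 1500000, 1200000],
           'pss': [400000, 500000, 450000],
           'rss': [600000, 700000, 650000],
           'swap': [0, 0, 0]}

summary = {'Max': {}, 'Avg': {}}
for key, raw in columns.items():
    # drop strings/None, same idea as the nested filter_value() helper
    values = [v for v in raw if not isinstance(v, str) and v is not None]
    summary['Max'][key] = max(values) if values else 0
    summary['Avg'][key] = int(sum(values) / len(values)) if values else 0
print(summary)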
""" - # get the raw memory monitor output, convert to dictionary dictionary = convert_text_file_to_dictionary(path) @@ -593,7 +590,7 @@ def get_metadata_dict_from_txt(path, storejson=False, jobid=None): dictionary['pandaid'] = jobid path = os.path.join(os.path.dirname(path), get_memory_monitor_output_filename(suffix='json')) - logger.debug('writing prmon dictionary to: %s' % path) + logger.debug(f'writing prmon dictionary to: {path}') write_json(path, dictionary) else: logger.debug('nothing to write (no prmon dictionary)') @@ -604,7 +601,7 @@ def get_metadata_dict_from_txt(path, storejson=False, jobid=None): return dictionary -def convert_text_file_to_dictionary(path): +def convert_text_file_to_dictionary(path: str) -> dict: """ Convert row-column text file to dictionary. User first row identifiers as dictionary keys. @@ -613,10 +610,9 @@ def convert_text_file_to_dictionary(path): value1 value2 .. .. .. .. - :param path: path to file (string). - :return: dictionary. + :param path: path to file (str) + :return: dictionary (dict). """ - summary_keys = [] # to keep track of content header_locked = False dictionary = {} @@ -643,26 +639,18 @@ def convert_text_file_to_dictionary(path): value = convert_to_int(key) dictionary[key_entry].append(value) except Exception: - logger.warning("unexpected format of utility output: %s" % line) + logger.warning(f"unexpected format of utility output: {line}") return dictionary -def get_last_value(value_list): - value = None - if value_list: - value = value_list[-1] - return value - - -def get_average_summary_dictionary(path): +def get_average_summary_dictionary(path: str) -> dict: """ Loop over the memory monitor output file and create the averaged summary dictionary. - :param path: path to memory monitor txt output file (string). - :return: summary dictionary. + :param path: path to memory monitor txt output file (str) + :return: summary dictionary (dict). """ - maxvmem = -1 maxrss = -1 maxpss = -1 @@ -712,8 +700,8 @@ def get_average_summary_dictionary(path): rbytes = None wbytes = None except Exception: - logger.warning("unexpected format of utility output: %s (expected format: Time, VMEM," - " PSS, RSS, Swap [, RCHAR, WCHAR, RBYTES, WBYTES])" % (line)) + logger.warning(f"unexpected format of utility output: {line} (expected format: Time, VMEM, PSS, " + f"RSS, Swap [, RCHAR, WCHAR, RBYTES, WBYTES])") else: # Convert to int ec1, maxvmem, totalvmem = get_max_memory_monitor_value(vmem, maxvmem, totalvmem) @@ -721,7 +709,7 @@ def get_average_summary_dictionary(path): ec3, maxrss, totalrss = get_max_memory_monitor_value(rss, maxrss, totalrss) ec4, maxswap, totalswap = get_max_memory_monitor_value(swap, maxswap, totalswap) if ec1 or ec2 or ec3 or ec4: - logger.warning("will skip this row of numbers due to value exception: %s" % (line)) + logger.warning(f"will skip this row of numbers due to value exception: {line}") else: n += 1 @@ -746,7 +734,7 @@ def get_average_summary_dictionary(path): return summary_dictionary -def get_memory_values(workdir, name=""): +def get_memory_values(workdir: str, name: str = "") -> dict: """ Find the values in the memory monitor output file. @@ -758,17 +746,16 @@ def get_memory_values(workdir, name=""): "Avg":{"avgVMEM":19384236,"avgPSS":5023500,"avgRSS":6501489,"avgSwap":5964997}, "Other":{"rchar":NN,"wchar":NN,"rbytes":NN,"wbytes":NN}} - :param workdir: relevant work directory (string). - :param name: name of memory monitor (string). - :return: memory values dictionary. 
+ :param workdir: relevant work directory (str) + :param name: name of memory monitor (str) + :return: memory values dictionary (dict). """ - summary_dictionary = {} # Get the path to the proper memory info file (priority ordered) path = get_memory_monitor_info_path(workdir, allowtxtfile=True) if os.path.exists(path): - logger.info("using path: %s (trf name=%s)" % (path, name)) + logger.info(f"using path: {path} (trf name={name})") # Does a JSON summary file exist? If so, there's no need to calculate maximums and averages in the pilot if path.lower().endswith('json'): @@ -780,7 +767,7 @@ def get_memory_values(workdir, name=""): summary_dictionary = get_average_summary_dictionary_prmon(path) else: summary_dictionary = get_average_summary_dictionary(path) - logger.debug('summary_dictionary=%s (trf name=%s)' % (str(summary_dictionary), name)) + logger.debug(f'summary_dictionary={summary_dictionary} (trf name={name})') else: if path == "": logger.warning("filename not set for memory monitor output") @@ -791,44 +778,39 @@ def get_memory_values(workdir, name=""): return summary_dictionary -def post_memory_monitor_action(job): +def post_memory_monitor_action(job: Any): """ Perform post action items for memory monitor. - :param job: job object. - :return: + :param job: job object (Any). """ - nap = 3 path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename()) path2 = os.environ.get('PILOT_HOME') - i = 0 + counter = 0 maxretry = 20 - while i <= maxretry: + while counter <= maxretry: if os.path.exists(path1): break - logger.info("taking a short nap (%d s) to allow the memory monitor to finish writing to the summary file (#%d/#%d)" - % (nap, i, maxretry)) + logger.info(f"taking a short nap ({nap} s) to allow the memory monitor to finish writing to the " + f"summary file (#{counter}/#{maxretry})") time.sleep(nap) - i += 1 + counter += 1 try: copy(path1, path2) - except Exception as e: - logger.warning('failed to copy memory monitor output: %s' % e) + except Exception as exc: + logger.warning(f'failed to copy memory monitor output: {exc}') def precleanup(): """ - Pre-cleanup at the beginning of the job to remove any pre-existing files from previous jobs in the main work dir. - - :return: + Remove any pre-existing files from previous jobs in the main work directory. """ - logger.debug('performing pre-cleanup of potentially pre-existing files from earlier job in main work dir') path = os.path.join(os.environ.get('PILOT_HOME'), get_memory_monitor_summary_filename()) if os.path.exists(path): - logger.info('removing no longer needed file: %s' % path) + logger.info(f'removing no longer needed file: {path}') remove(path) @@ -838,7 +820,6 @@ def get_cpu_arch(): If not returned by this function, the pilot will resort to use the internal scripts/cpu_arch.py. - :return: CPU arch (string). + :return: CPU arch (str). """ - return "" diff --git a/pilot/util/activemq.py b/pilot/util/activemq.py index adaae17e..1b4d4485 100644 --- a/pilot/util/activemq.py +++ b/pilot/util/activemq.py @@ -19,6 +19,8 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2022-23 +"""Functions for using ActiveMQ.""" + import socket import json import random @@ -44,50 +46,44 @@ class Listener(connectionlistener): - """ - Messaging listener. - """ + """Messaging listener.""" messages = [] def __init__(self, broker: Any = None, queues: Any = None) -> None: """ - Init function. + Initialize variables. - :param broker: broker - :param queues: queues. + :param broker: broker (Any) + :param queues: queues (Any). 
""" - self.__broker = broker self.__queues = queues self.logger = logging.getLogger(self.__class__.__name__) def set_broker(self, broker: Any) -> None: """ - Define broker for internal use. + Set the broker for internal use. :param broker: broker. """ - self.__broker = broker def on_error(self, frame: Any) -> None: """ - Error handler. + Handle errors. - :param frame: frame. + :param frame: frame (Any). """ - self.logger.warning(f'received an error "{frame}"') # store error in messages? def on_message(self, frame: Any) -> None: """ - Message handler. + Handle messages. - :param frame: frame. + :param frame: frame (Any). """ - self.logger.info(f'received a message "{frame.body}"') body = json.loads(frame.body) if body not in [_obj for _obj in list(self.__queues.mbmessages.queue)]: @@ -101,13 +97,13 @@ def get_messages(self) -> list: :return: messages (list). """ - return self.messages class ActiveMQ: """ ActiveMQ class. + Note: the class can be used for either topic or queue messages. E.g. 'topic': '/queue/panda.pilot' or '/topic/panda.pilot' X.509 authentication using SSL not possible since key+cert cannot easily be reached from WNs. @@ -128,12 +124,12 @@ class ActiveMQ: def __init__(self, **kwargs: dict) -> None: """ - Init function. + Initialize variables. + Note: the init function sets up all connections and starts the listener. - :param kwargs: kwargs dictionary. + :param kwargs: kwargs dictionary (dict). """ - self.logger = logging.getLogger(self.__class__.__name__) self.broker = kwargs.get('broker', '') self.receiver_port = kwargs.get('receiver_port', '') @@ -206,9 +202,8 @@ def send_message(self, message: str) -> None: """ Send a message to a topic or queue. - :param message: message (string). + :param message: message (str). """ - conn = random.choice(self.connections) self.logger.debug(f'sending to {conn} topic/queue={self.topic}') conn.send(destination=self.topic, body=json.dumps(message), id='atlas-pilot-messaging', ack='auto', @@ -216,10 +211,7 @@ def send_message(self, message: str) -> None: self.logger.debug('sent message') def close_connections(self) -> None: - """ - Close all open connections. - """ - + """Close all open connections.""" for conn in self.connections: try: conn.disconnect() @@ -231,9 +223,9 @@ def close_connections(self) -> None: def get_credentials(self) -> None: """ Download username+password from the PanDA server for ActiveMQ authentication. + Note: this function does not return anything, only sets private username and password. """ - res = {} if not self.pandaurl or self.pandaport == 0: self.logger.warning('PanDA server URL and/or port not set - cannot get ActiveMQ credentials') diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 18f6e6a6..a8b0fa8e 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -19,6 +19,8 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +"""Auxiliary functions.""" + import os import re import sys @@ -50,10 +52,7 @@ def pilot_version_banner() -> None: - """ - Print a pilot version banner. - """ - + """Print a pilot version banner.""" version = f'*** PanDA Pilot version {get_pilot_version()} ***' logger.info('*' * len(version)) logger.info(version) @@ -69,13 +68,13 @@ def pilot_version_banner() -> None: def is_virtual_machine() -> bool: """ - Are we running in a virtual machine? + Determine if we are running in a virtual machine. + If we are running inside a VM, then linux will put 'hypervisor' in cpuinfo. This function looks for the presence of that. - :return: Boolean. 
+ :return: True is virtual machine, False otherwise (bool). """ - status = False # look for 'hypervisor' in cpuinfo @@ -90,21 +89,17 @@ def is_virtual_machine() -> bool: def display_architecture_info() -> None: - """ - Display OS/architecture information from /etc/os-release. - """ - + """Display OS/architecture information from /etc/os-release.""" logger.info("architecture information:") dump("/etc/os-release") def get_batchsystem_jobid() -> (str, int): """ - Identify and return the batch system job id (will be reported to the server) + Identify and return the batch system job id (will be reported to the server). :return: batch system name (string), batch system job id (int) """ - # BQS (e.g. LYON) batchsystem_dict = {'QSUB_REQNAME': 'BQS', 'BQSCLUSTER': 'BQS', # BQS alternative @@ -134,9 +129,8 @@ def get_globaljobid() -> str: """ Return the GlobalJobId value from the condor class ad. - :return: GlobalJobId value (string). + :return: GlobalJobId value (str). """ - ret = "" with open(os.environ.get("_CONDOR_JOB_AD"), 'r', encoding='utf-8') as _fp: for line in _fp: @@ -154,9 +148,9 @@ def get_globaljobid() -> str: def get_job_scheduler_id() -> str: """ - Get the job scheduler id from the environment variable PANDA_JSID + Get the job scheduler id from the environment variable PANDA_JSID. - :return: job scheduler id (string) + :return: job scheduler id (str) """ return os.environ.get("PANDA_JSID", "unknown") @@ -167,7 +161,6 @@ def whoami() -> str: :return: whoami output (string). """ - _, who_am_i, _ = execute('whoami', mute=True) return who_am_i @@ -179,7 +172,6 @@ def get_error_code_translation_dictionary() -> dict: :return: populated error code translation dictionary. """ - error_code_translation_dictionary = { -1: [64, "Site offline"], errors.GENERALERROR: [65, "General pilot error, consult batch log"], # added to traces object @@ -203,25 +195,53 @@ def get_error_code_translation_dictionary() -> dict: errors.SIGTERM: [143, "Job killed by signal: SIGTERM"], # 128+15 errors.SIGQUIT: [131, "Job killed by signal: SIGQUIT"], # 128+3 errors.SIGSEGV: [139, "Job killed by signal: SIGSEGV"], # 128+11 - errors.SIGXCPU: [158, "Job killed by signal: SIGXCPU"], # 128+30 - errors.SIGUSR1: [144, "Job killed by signal: SIGUSR1"], # 128+16 - errors.SIGBUS: [138, "Job killed by signal: SIGBUS"] # 128+10 + errors.SIGXCPU: [152, "Job killed by signal: SIGXCPU"], # 128+24 + errors.SIGUSR1: [138, "Job killed by signal: SIGUSR1"], # 128+10 + errors.SIGINT: [130, "Job killed by signal: SIGINT"], # 128+2 + errors.SIGBUS: [135, "Job killed by signal: SIGBUS"] # 128+7 } return error_code_translation_dictionary +def convert_signal_to_exit_code(signal: str) -> int: + """ + Convert a signal to an exit code. + + :param signal: signal (string). + :return: exit code (int). + """ + if signal == "SIGINT": + exitcode = errors.SIGINT + elif signal == "SIGTERM": + exitcode = errors.SIGTERM + elif signal == "SIGQUIT": + exitcode = errors.SIGQUIT + elif signal == "SIGSEGV": + exitcode = errors.SIGSEGV + elif signal == "SIGXCPU": + exitcode = errors.SIGXCPU + elif signal == "SIGUSR1": + exitcode = errors.SIGUSR1 + elif signal == "SIGBUS": + exitcode = errors.SIGBUS + else: + exitcode = errors.KILLSIGNAL + + return exitcode + + def shell_exit_code(exit_code: int) -> int: """ Translate the pilot exit code to a proper exit code for the shell (wrapper). + Any error code that is to be converted by this function, should be added to the traces object like: traces.pilot['error_code'] = errors. 
The traces object will be checked by the pilot module. - :param exit_code: pilot error code (int). + :param exit_code: pilot error code (int) :return: standard shell exit code (int). """ - # Error code translation dictionary # FORMAT: { pilot_error_code : [ shell_error_code, meaning ], .. } @@ -242,10 +262,11 @@ def shell_exit_code(exit_code: int) -> int: def convert_to_pilot_error_code(exit_code: int) -> int: """ - This conversion function is used to revert a batch system exit code back to a pilot error code. + Revert a batch system exit code back to a pilot error code. + Note: the function is used by Harvester. - :param exit_code: batch system exit code (int). + :param exit_code: batch system exit code (int) :return: pilot error code (int). """ error_code_translation_dictionary = get_error_code_translation_dictionary() @@ -263,13 +284,13 @@ def convert_to_pilot_error_code(exit_code: int) -> int: def get_size(obj_0: Any) -> int: """ - Recursively iterate to sum size of object & members. + Recursively iterate to sum size of object and members. + Note: for size measurement to work, the object must have set the data members in the __init__(). :param obj_0: object to be measured. :return: size in Bytes (int). """ - _seen_ids = set() def inner(obj): @@ -308,18 +329,19 @@ def inner(obj): def get_pilot_state(job: Any = None) -> str: """ Return the current pilot (job) state. + If the job object does not exist, the environmental variable PILOT_JOB_STATE will be queried instead. - :param job: - :return: pilot (job) state (string). + :param job: job object (Any) + :return: pilot (job) state (str). """ - return job.state if job else os.environ.get('PILOT_JOB_STATE', 'unknown') def set_pilot_state(job: Any = None, state: str = '') -> None: """ Set the internal pilot state. + Note: this function should update the global/singleton object but currently uses an environmental variable (PILOT_JOB_STATE). The function does not update job.state if it is already set to finished or failed. @@ -328,7 +350,6 @@ def set_pilot_state(job: Any = None, state: str = '') -> None: :param job: optional job object. :param state: internal pilot state (string). """ - os.environ['PILOT_JOB_STATE'] = state if job and job.state != 'failed': @@ -337,38 +358,38 @@ def set_pilot_state(job: Any = None, state: str = '') -> None: def check_for_final_server_update(update_server: bool) -> None: """ + Check for the final server update. + Do not set graceful stop if pilot has not finished sending the final job update i.e. wait until SERVER_UPDATE is DONE_FINAL. This function sleeps for a maximum of 20*30 s until SERVER_UPDATE env variable has been set to SERVER_UPDATE_FINAL. - :param update_server: args.update_server (Boolean). + :param update_server: args.update_server (bool). """ - max_i = 20 - i = 0 + counter = 0 # abort if in startup stage or if in final update stage server_update = os.environ.get('SERVER_UPDATE', '') if server_update == SERVER_UPDATE_NOT_DONE: return - while i < max_i and update_server: + while counter < max_i and update_server: server_update = os.environ.get('SERVER_UPDATE', '') if server_update in (SERVER_UPDATE_FINAL, SERVER_UPDATE_TROUBLE): logger.info('server update done, finishing') break - logger.info('server update not finished (#%d/#%d)', i + 1, max_i) + logger.info(f'server update not finished (#{counter + 1}/#{max_i})') sleep(30) - i += 1 + counter += 1 def get_resource_name() -> str: """ Return the name of the resource (only set for HPC resources; e.g. Cori, otherwise return 'grid'). 
- :return: resource_name (string). + :return: resource_name (str). """ - resource_name = os.environ.get('PILOT_RESOURCE_NAME', '').lower() if not resource_name: resource_name = 'grid' @@ -377,13 +398,12 @@ def get_resource_name() -> str: def get_object_size(obj: Any, seen: Any = None) -> int: """ - Recursively find the size of any objects + Recursively find the size of any objects. - :param obj: object. - :param seen: + :param obj: object (Any) + :param seen: logical seen variable (Any) :return: object size (int). """ - size = sys.getsizeof(obj) if seen is None: seen = set() @@ -406,10 +426,7 @@ def get_object_size(obj: Any, seen: Any = None) -> int: def show_memory_usage() -> None: - """ - Display the current memory usage by the pilot process. - """ - + """Display the current memory usage by the pilot process.""" _, _stdout, _ = get_memory_usage(os.getpid()) _value = extract_memory_usage_value(_stdout) logger.debug(f'current pilot memory usage:\n\n{_stdout}\n\nusage: {_value} kB\n') @@ -422,7 +439,6 @@ def get_memory_usage(pid: int) -> (int, str, str): :param pid: process id (int). :return: ps exit code (int), stderr (strint), stdout (string). """ - return execute(f'ps aux -q {pid}', timeout=60) @@ -434,10 +450,9 @@ def extract_memory_usage_value(output: str) -> int: # usatlas1 13917 1.5 0.0 1324968 152832 ? Sl 09:33 2:55 /bin/python2 .. # -> 152832 (kB) - :param output: ps output (string). - :return: memory value in kB (string). + :param output: ps output (str) + :return: memory value in kB (str). """ - memory_usage = "(unknown)" for row in output.split('\n'): try: @@ -452,14 +467,13 @@ def extract_memory_usage_value(output: str) -> int: def cut_output(txt: str, cutat: int = 1024, separator: str = '\n[...]\n') -> str: """ - Cut the given string if longer that 2*cutat value. + Cut the given string if longer than 2 * cutat value. - :param txt: text to be cut at position cutat (string). - :param cutat: max length of uncut text (int). - :param separator: separator text (string). - :return: cut text (string). + :param txt: text to be cut at position cutat (str) + :param cutat: max length of uncut text (int) + :param separator: separator text (str) + :return: cut text (str). """ - if len(txt) > 2 * cutat: txt = txt[:cutat] + separator + txt[-cutat:] @@ -469,12 +483,12 @@ def cut_output(txt: str, cutat: int = 1024, separator: str = '\n[...]\n') -> str def has_instruction_set(instruction_set: str) -> bool: """ Determine whether a given CPU instruction set is available. + The function will use grep to search in /proc/cpuinfo (both in upper and lower case). - :param instruction_set: instruction set (e.g. AVX2) (string). - :return: Boolean + :param instruction_set: instruction set (e.g. AVX2) (str) + :return: True if given instruction set is available, False otherwise (bool). """ - status = False cmd = fr"grep -o \'{instruction_set.lower()}[^ ]*\|{instruction_set.upper()}[^ ]*\' /proc/cpuinfo" exit_code, stdout, stderr = execute(cmd) @@ -488,14 +502,15 @@ def has_instruction_set(instruction_set: str) -> bool: def has_instruction_sets(instruction_sets: str) -> bool: """ Determine whether a given list of CPU instruction sets is available. + The function will use grep to search in /proc/cpuinfo (both in upper and lower case). Example: instruction_sets = ['AVX', 'AVX2', 'SSE4_2', 'XXX'] -> "AVX|AVX2|SSE4_2" - :param instruction_sets: instruction set (e.g. AVX2) (string). - :return: Boolean - """ - ret = '' - pattern = '' + :param instruction_sets: instruction set (e.g. 
AVX2) (str) + :return: True if given instruction set is available, False otherwise (bool). + """ + ret = "" + pattern = "" for instr in instruction_sets: pattern += fr'\|{instr.lower()}[^ ]*\|{instr.upper()}[^ ]*' if pattern else fr'{instr.lower()}[^ ]*\|{instr.upper()}[^ ]*' @@ -514,11 +529,10 @@ def locate_core_file(cmd: str = '', pid: int = 0) -> str: """ Locate the core file produced by gdb. - :param cmd: optional command containing pid corresponding to core file (string). - :param pid: optional pid to use with core file (core.pid) (int). - :return: path to core file (string). + :param cmd: optional command containing pid corresponding to core file (str) + :param pid: optional pid to use with core file (core.pid) (int) + :return: path to core file (str). """ - path = None if not pid and cmd: pid = get_pid_from_command(cmd) @@ -537,18 +551,17 @@ def locate_core_file(cmd: str = '', pid: int = 0) -> str: def get_pid_from_command(cmd: str, pattern: str = r'gdb --pid (\d+)') -> int: - """ + r""" Identify an explicit process id in the given command. Example: cmd = 'gdb --pid 19114 -ex \'generate-core-file\'' -> pid = 19114 - :param cmd: command containing a pid (string). - :param pattern: regex pattern (raw string). + :param cmd: command containing a pid (str) + :param pattern: regex pattern (raw str) :return: pid (int). """ - pid = None match = re.search(pattern, cmd) if match: @@ -557,7 +570,7 @@ def get_pid_from_command(cmd: str, pattern: str = r'gdb --pid (\d+)') -> int: except (IndexError, ValueError): pid = None else: - logger.warning('no match for pattern \'%s\' in command=\'%s\'', pattern, cmd) + logger.warning(f"no match for pattern \'{pattern}\' in command=\'{cmd}\'") return pid @@ -566,9 +579,8 @@ def list_hardware() -> str: """ Execute lshw to list local hardware. - :return: lshw output (string). + :return: lshw output (str). """ - _, stdout, stderr = execute('lshw -numeric -C display', mute=True) if 'command not found' in stdout or 'command not found' in stderr: stdout = '' @@ -578,14 +590,14 @@ def list_hardware() -> str: def get_display_info() -> (str, str): """ Extract the product and vendor from the lshw command. + E.g. product: GD 5446 [1013:B8] vendor: Cirrus Logic [1013] -> GD 5446, Cirrus Logic - :return: product (string), vendor (string). + :return: product (str), vendor (str). """ - vendor = '' product = '' stdout = list_hardware() @@ -609,11 +621,11 @@ def get_display_info() -> (str, str): def get_key_value(catchall: str, key: str = 'SOMEKEY') -> str: """ Return the value corresponding to key in catchall. - :param catchall: catchall free string. - :param key: key name (string). - :return: value (string). - """ + :param catchall: catchall free string (str) + :param key: key name (str) + :return: value (str). + """ # ignore any non-key-value pairs that might be present in the catchall string _dic = dict(_str.split('=', 1) for _str in catchall.split() if '=' in _str) @@ -624,10 +636,9 @@ def is_string(obj: Any) -> bool: """ Determine if the passed object is a string or not. - :param obj: object (object type). - :return: True if obj is a string (Boolean). + :param obj: object (Any) + :return: True if obj is a string, False otherwise (bool). """ - return isinstance(obj, str) @@ -635,11 +646,10 @@ def find_pattern_in_list(input_list: list, pattern: str) -> str: """ Search for the given pattern in the input list. - :param input_list: list of string. - :param pattern: regular expression pattern (raw string). - :return: found string (or None). 
+ :param input_list: list of strings (list) + :param pattern: regular expression pattern (raw str) + :return: found string (str or None). """ - found = None for line in input_list: out = re.search(pattern, line) @@ -652,13 +662,13 @@ def find_pattern_in_list(input_list: list, pattern: str) -> str: def sort_words(input_str: str) -> str: """ - Sort the words in a given string. + Sort the words in the given string. + E.g. input_str = 'bbb fff aaa' -> output_str = 'aaa bbb fff' - :param input_str: input string. - :return: sorted output string. + :param input_str: input string (str) + :return: sorted output string (str). """ - output_str = input_str try: tmp = output_str.split() @@ -673,6 +683,7 @@ def sort_words(input_str: str) -> str: def encode_globaljobid(jobid: str, maxsize: int = 31) -> str: """ Encode the global job id on HTCondor. + To be used as an environmental variable on HTCondor nodes to facilitate debugging. Format: ::._ @@ -683,11 +694,10 @@ def encode_globaljobid(jobid: str, maxsize: int = 31) -> str: characters (i.e. the left part of the string might get cut). Also, the cluster ID and process IDs are converted to hex to limit the sizes. The schedd host name is further encoded using the last digit in the host name (spce03.sdcc.bnl.gov -> spce03 -> 3). - :param jobid: panda job id (string) + :param jobid: panda job id (str) :param maxsize: max length allowed (int) - :return: encoded global job id (string). + :return: encoded global job id (str). """ - def get_host_name(): # spool1462.sdcc.bnl.gov -> spool1462 if 'PANDA_HOSTNAME' in os.environ: @@ -730,16 +740,16 @@ def get_host_name(): return global_name -def grep_str(patterns, stdout): +def grep_str(patterns: list, stdout: str) -> list: """ Search for the patterns in the given stdout. + For expected large stdout, better to use FileHandling::grep() - :param patterns: list of regexp patterns. - :param stdout: some text (string). - :return: list of matched lines in stdout. + :param patterns: list of regexp patterns (list) + :param stdout: some text (str) + :return: list of matched lines in stdout (list). 
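
    Editorial usage sketch (not part of the pilot code; the text and patterns are invented):

        output = 'INFO transfer done' + '\n' + 'ERROR transfer failed'
        matched = grep_str([r'ERROR', r'WARNING'], output)
        # matched is expected to contain the lines matching either pattern,
        # here ['ERROR transfer failed']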
""" - matched_lines = [] _pats = [] for pattern in patterns: @@ -756,12 +766,16 @@ def grep_str(patterns, stdout): class TimeoutException(Exception): + """Timeout exception.""" - def __init__(self, message, timeout=None, *args): + def __init__(self, message: str, timeout: int = None, *args: Any): + """Initialize variables.""" self.timeout = timeout self.message = message self._errorCode = 1334 super(TimeoutException, self).__init__(*args) def __str__(self): - return "%s: %s, timeout=%s seconds%s" % (self.__class__.__name__, self.message, self.timeout, ' : %s' % repr(self.args) if self.args else '') + """Set and return the error string for string representation of the class instance.""" + tmp = f' : {repr(self.args)}' if self.args else '' + return f"{self.__class__.__name__}: {self.message}, timeout={self.timeout} seconds{tmp}" diff --git a/pilot/util/common.py b/pilot/util/common.py index ae8e2703..bd77f1fb 100644 --- a/pilot/util/common.py +++ b/pilot/util/common.py @@ -19,6 +19,8 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Common functions.""" + import os import logging from typing import Any @@ -36,10 +38,9 @@ def should_abort(args: Any, limit: int = 30, label: str = '') -> bool: :param args: pilot arguments object :param limit: optional time limit (int) - :param label: optional label prepending log messages (string) - :return: True if graceful_stop has been set (and less than optional time limit has passed since maxtime) or False (bool) + :param label: optional label prepending log messages (str) + :return: True if graceful_stop has been set (and less than optional time limit has passed since maxtime) or False (bool). """ - abort = False if args.graceful_stop.wait(1) or args.graceful_stop.is_set(): # 'or' added for 2.6 compatibility reasons if os.environ.get('REACHED_MAXTIME', None) and limit: @@ -59,25 +60,23 @@ def should_abort(args: Any, limit: int = 30, label: str = '') -> bool: def was_pilot_killed(timing: dict) -> bool: """ - Was the pilot killed by a KILL signal? + Check if the pilot was killed by a KILL signal (i.e., is about to be killed). :param timing: args.timing dictionary (dict) - :return: True if pilot was killed by KILL signal (bool). + :return: True if pilot was killed by KILL signal, False otherwise (bool). """ - return any(PILOT_KILL_SIGNAL in timing[i] for i in timing) def is_pilot_check(check: str = '') -> bool: """ - Should the given pilot check be run? + Determine if the given pilot check is to be run. Consult config.Pilot.checks if the given check is listed. - :param check: name of check (string) - :return: True if check is present in config.Pilot.checks (and if config is outdated), False othersise (bool). + :param check: name of check (str) + :return: True if check is present in config.Pilot.checks (and if config is outdated), False otherwise (bool). 
""" - status = False if not check: return status diff --git a/pilot/util/config.py b/pilot/util/config.py index c48ce52a..77cf10d5 100644 --- a/pilot/util/config.py +++ b/pilot/util/config.py @@ -19,6 +19,8 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +"""Functions for configuration file.""" + import os import re import configparser @@ -30,9 +32,7 @@ class _ConfigurationSection(): - """ - Keep the settings for a section of the configuration file - """ + """Keep the settings for a section of the configuration file.""" def __getitem__(self, item: Any) -> Any: return getattr(self, item) @@ -48,12 +48,11 @@ def __getattr__(self, attr: Any) -> Any: def read(config_file: Any) -> Any: """ - Read the settings from file and return a dot notation object + Read the settings from file and return a dot notation object. - :param config_file: file - :return: attribute object. + :param config_file: file (Any) + :return: attribute object (Any). """ - _config = configparser.ConfigParser() _config.read(config_file) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index ef64019f..b620d71e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -20,13 +20,15 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Constamts.""" + from os import environ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '7' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '36' # build number should be reset to '1' for every new development cycle +REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '50' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 @@ -84,19 +86,20 @@ def get_pilot_version() -> str: """ Return the current Pilot version string with the format .. (). + E.g. pilot_version = '2.1.3 (12)' - :return: version string. - """ + :return: version string (str). + """ return f'{RELEASE}.{VERSION}.{REVISION}.{BUILD}' def get_rucio_client_version() -> str: """ Return the current Rucio client version string using the environmental variable ATLAS_LOCAL_RUCIOCLIENTS_VERSION. + If the environmental variable is not set, then an empty string will be returned. - :return: $ATLAS_LOCAL_RUCIOCLIENTS_VERSION (string). + :return: $ATLAS_LOCAL_RUCIOCLIENTS_VERSION (str). """ - return environ.get('ATLAS_LOCAL_RUCIOCLIENTS_VERSION', '') diff --git a/pilot/util/container.py b/pilot/util/container.py index d98e7e74..ede58cc5 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -19,15 +19,18 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Functions for executing commands.""" + import os import subprocess import logging +import re import shlex import threading from os import environ, getcwd, getpgid, kill #, setpgrp, getpgid #setsid -from time import sleep from signal import SIGTERM, SIGKILL +from time import sleep from typing import Any, TextIO from pilot.common.errorcodes import ErrorCodes @@ -43,14 +46,14 @@ def execute(executable: Any, **kwargs: dict) -> Any: """ - Execute the command and its options in the provided executable list. + Execute the command with its options in the provided executable list using subprocess time-out handler. 
+
     The function also determines whether the command should be executed within a container.
 
-    :param executable: command to be executed (string or list).
+    :param executable: command to be executed (str or list)
     :param kwargs: kwargs (dict)
-    :return: exit code (int), stdout (str) and stderr (str) (or process if requested via returnproc argument)
+    :return: exit code (int), stdout (str) and stderr (str) (or process if requested via returnproc argument).
     """
-
     usecontainer = kwargs.get('usecontainer', False)
     job = kwargs.get('job')
     #shell = kwargs.get("shell", False)
@@ -99,7 +102,7 @@ def execute(executable: Any, **kwargs: dict) -> Any:
             return process
 
     try:
-        logger.debug(f'subprocess.communicate() will use timeout={timeout} s')
+        logger.debug(f'subprocess.communicate() will use timeout {timeout} s')
         stdout, stderr = process.communicate(timeout=timeout)
     except subprocess.TimeoutExpired as exc:
         # make sure that stdout buffer gets flushed - in case of time-out exceptions
@@ -133,7 +136,15 @@ def execute(executable: Any, **kwargs: dict) -> Any:
 
 
 def execute2(executable: Any, stdout_file: TextIO, stderr_file: TextIO, timeout_seconds: int, **kwargs: dict) -> int:
+    """
+    Execute the command with its options in the provided executable list using an internal timeout handler.
 
+    The function also determines whether the command should be executed within a container.
+
+    :param executable: command to be executed (str or list)
+    :param stdout_file: file object for the command's stdout (TextIO)
+    :param stderr_file: file object for the command's stderr (TextIO)
+    :param timeout_seconds: internal timeout in seconds (int)
+    :param kwargs: kwargs (dict)
+    :return: exit code (int).
+    """
     exit_code = None
 
     def _timeout_handler():
@@ -203,19 +214,16 @@ def get_timeout(requested_timeout: int) -> int:
     :param requested_timeout: timeout in seconds set by execute() caller (int)
     :return: timeout in seconds (int).
     """
-
     return requested_timeout if requested_timeout else 10 * 24 * 60 * 60 # using a ridiculously large default timeout
 
 
 def execute_command(command: str) -> str:
     """
-    Executes a command using subprocess without using the shell.
-
-    :param command: The command to execute.
+    Execute a command using subprocess without using the shell.
 
-    :return: The output of the command (string).
+    :param command: The command to execute (str)
+    :return: The output of the command (str).
     """
-
     try:
         logger.info(f'executing command: {command}')
         command = shlex.split(command)
@@ -235,11 +243,10 @@ def kill_all(process: Any, stderr: str) -> str:
     """
     Kill all processes after a time-out exception in process.communication().
 
-    :param process: process object
-    :param stderr: stderr (string)
+    :param process: process object (Any)
+    :param stderr: stderr (str)
     :return: stderr (str).
     """
-
     try:
         logger.warning('killing lingering subprocess and process group')
         sleep(1)
@@ -267,12 +274,12 @@ def kill_all(process: Any, stderr: str) -> str:
 def print_executable(executable: str, obscure: str = '') -> None:
     """
     Print out the command to be executed, omitting any secrets.
+
     Any S3_SECRET_KEY=... parts will be removed.
 
-    :param executable: executable (string).
-    :param obscure: sensitive string to be obscured before dumping to log (string)
+    :param executable: executable (str)
+    :param obscure: sensitive string to be obscured before dumping to log (str).
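
    Editorial usage sketch (the command and token below are invented). Any user token given after a
    '-p ' option is masked via obscure_token(), defined further down in this module:

        print_executable('python payload.py -p MySecretToken --input data.root')
        # expected log line: executing command: python payload.py -p ******** --input data.root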
""" - executable_readable = executable for sub_cmd in executable_readable.split(";"): if 'S3_SECRET_KEY=' in sub_cmd: @@ -282,6 +289,9 @@ def print_executable(executable: str, obscure: str = '') -> None: if obscure: executable_readable = executable_readable.replace(obscure, '********') + # also make sure there is no user token present. If so, obscure it as well + executable_readable = obscure_token(executable_readable) + logger.info(f'executing command: {executable_readable}') @@ -289,11 +299,10 @@ def containerise_executable(executable: str, **kwargs: dict) -> (Any, str): """ Wrap the containerisation command around the executable. - :param executable: command to be wrapper (string) - :param kwargs: kwargs dictionary + :param executable: command to be wrapper (str) + :param kwargs: kwargs dictionary (dict) :return: containerised executable (list or None), diagnostics (str). """ - job = kwargs.get('job') user = environ.get('PILOT_USER', 'generic').lower() # TODO: replace with singleton @@ -325,3 +334,21 @@ def containerise_executable(executable: str, **kwargs: dict) -> (Any, str): logger.warning('container module could not be imported') return executable, "" + + +def obscure_token(cmd: str) -> str: + """ + Obscure any user token from the payload command. + + :param cmd: payload command (str) + :return: updated command (str). + """ + try: + match = re.search(r'-p (\S+)\ ', cmd) + if match: + cmd = cmd.replace(match.group(1), '********') + except (re.error, AttributeError, IndexError): + logger.warning('an exception was thrown while trying to obscure the user token') + cmd = '' + + return cmd diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index 18d3c14f..9c18d953 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -58,7 +58,8 @@ iddsserver: https://pandaserver.cern.ch:25443 # The log type and URL for the real-time logging server (format: ;) # (experiment specific values are preferred, see common.py in user area) -rtlogging: logstash;http://aipanda020.cern.ch:8443 +#rtlogging: logstash;http://aipanda020.cern.ch:8443 +rtlogging: logstash;http://aipanda115.cern.ch:8443 ssl_enable: True ssl_verify: False @@ -67,10 +68,14 @@ ssl_verify: False heartbeat: 1800 debug_heartbeat: 60 lost_heartbeat: 10800 +pilot_heartbeat: 60 # Heartbeat message file (only used when Pilot is not sending heartbeats to server) heartbeat_message: heartbeat.json +# Pilot heartbeat (to keep track of pilot activity - esp. 
in case of SIGSTOP/SIGCONT) +pilot_heartbeat_file: pilot_heartbeat.json + # Job IDs can be stored to a file that is picked up by the wrapper jobid_file: pandaIDs.out @@ -165,7 +170,7 @@ pandasecrets: panda_secrets.json # last_heartbeat = time since last successful heartbeat # machinefeatures = look for machinefeatures # jobfeatures = look for jobfeatures -checks: proxy,space,last_heartbeat,machinefeatures,jobfeatures,cpu_usage,threads +checks: proxy,space,last_heartbeat,pilot_heartbeat,machinefeatures,jobfeatures,cpu_usage,threads ################################ # Information service parameters diff --git a/pilot/util/disk.py b/pilot/util/disk.py index e8a76fa3..db24e3a1 100644 --- a/pilot/util/disk.py +++ b/pilot/util/disk.py @@ -20,13 +20,21 @@ # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Paul Nilsson, Paul.Nilsson@cern.ch, 2021-23 +"""Functions for reporting disk usage.""" + import os from collections import namedtuple ntuple_diskusage = namedtuple('usage', 'total used free') if hasattr(os, 'statvfs'): # POSIX - def disk_usage(path): + def disk_usage(path: str): + """ + Return named tuple with disk usage. + + :param path: path (str) + :return: total, used, free (tuple). + """ stat = os.statvfs(path) free = stat.f_bavail * stat.f_frsize total = stat.f_blocks * stat.f_frsize @@ -34,6 +42,7 @@ def disk_usage(path): return ntuple_diskusage(total, used, free) else: def disk_usage(path): + """Return zero-tuple for disk usage on non-POSIX systems.""" return ntuple_diskusage(0, 0, 0) disk_usage.__doc__ = """ diff --git a/pilot/util/features.py b/pilot/util/features.py index 87b21f1b..048d2d43 100644 --- a/pilot/util/features.py +++ b/pilot/util/features.py @@ -19,6 +19,8 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2023 +"""Machine and job features (for sites that support this CERN project).""" + import logging import os from json import dumps, loads @@ -30,23 +32,22 @@ class Features: + """Common machine and job features.""" - def get_data_members(self): + def get_data_members(self) -> list: """ Return all data members. - :return: list of data members. + :return: list of data members (list). """ - return [attr for attr in dir(self) if not callable(getattr(self, attr)) and not attr.startswith("__")] - def get(self): + def get(self) -> dict: """ Convert class to dictionary. - :return: class dictionary. + :return: class dictionary (dict). """ - # convert class data members to a dictionary string (dumps), then to a dictionary (loads) # note that all data members will remain as strings return loads(dumps(self, default=lambda par: par.__dict__)) @@ -56,9 +57,8 @@ def set(self, path: str, label: str): Set all values. :param path: path to job or machine features directory (str) - :param label: machine or job string (str) + :param label: machine or job string (str). """ - if path and os.path.exists(path): data_members = self.get_data_members() for member in data_members: @@ -80,12 +80,10 @@ def set(self, path: str, label: str): class MachineFeatures(Features): + """Machine features.""" def __init__(self): - """ - Default init. - """ - + """Initialize variables.""" super().__init__() # machine features @@ -98,12 +96,10 @@ def __init__(self): class JobFeatures(Features): + """Job features.""" def __init__(self): - """ - Default init. 
- """ - + """Initialize variables.""" super().__init__() # job features diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index b67cd4d3..307bce9a 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -19,6 +19,8 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +"""A collection of functions related to file handling.""" + import fnmatch import hashlib import io @@ -56,20 +58,19 @@ def get_pilot_work_dir(workdir: str) -> str: :param workdir: The full path to where the main work directory should be created (str) :return: The name of main work directory """ - return os.path.join(workdir, f"PanDA_Pilot3_{os.getpid()}_{int(time.time())}") def mkdirs(workdir: str, chmod: int = 0o770) -> None: """ Create a directory. + Perform a chmod if set. :param workdir: Full path to the directory to be created (str) :param chmod: chmod code (default 0770) (octal int) :raises PilotException: MKDirFailure. """ - try: os.makedirs(workdir) if chmod: @@ -85,7 +86,6 @@ def rmdirs(path: str) -> bool: :param path: path to directory to be removed (str) :return: True if success, otherwise False (bool). """ - status = False try: @@ -101,11 +101,11 @@ def rmdirs(path: str) -> bool: def read_file(filename: str, mode: str = 'r') -> str: """ Open, read and close a file. + :param filename: file name (str) :param mode: file mode (str) :return: file contents (str). """ - out = "" _file = open_file(filename, mode) if _file: @@ -118,6 +118,7 @@ def read_file(filename: str, mode: str = 'r') -> str: def write_file(path: str, contents: Any, mute: bool = True, mode: str = 'w', unique: bool = False) -> bool: """ Write the given contents to a file. + If unique=True, then if the file already exists, an index will be added (e.g. 'out.txt' -> 'out-1.txt') :param path: full path for file (str) @@ -128,7 +129,6 @@ def write_file(path: str, contents: Any, mute: bool = True, mode: str = 'w', uni :raises PilotException: FileHandlingFailure :return: True if successful, otherwise False (bool). """ - status = False # add an incremental file name (add -%d if path already exists) if necessary @@ -157,6 +157,7 @@ def write_file(path: str, contents: Any, mute: bool = True, mode: str = 'w', uni def open_file(filename: str, mode: str) -> IO: """ Open and return a file pointer for the given mode. + Note: the caller needs to close the file. :param filename: file name (str) @@ -164,7 +165,6 @@ def open_file(filename: str, mode: str) -> IO: :raises PilotException: FileHandlingFailure :return: file pointer (IO). """ - _file = None try: _file = open(filename, mode, encoding='utf-8') @@ -180,7 +180,6 @@ def find_text_files() -> list: :return: list of files (list). """ - files = [] # -I = ignore binary files cmd = r"find . -type f -exec grep -Iq . {} \; -print" @@ -202,7 +201,6 @@ def get_files(pattern: str = "*.log") -> list: :param pattern: file name pattern (str) :return: list of files (list). """ - files = [] cmd = f"find . -name {pattern}" @@ -219,13 +217,13 @@ def get_files(pattern: str = "*.log") -> list: def tail(filename: str, nlines: int = 10) -> str: """ Return the last n lines of a file. + Note: the function uses the posix tail function. :param filename: name of file to do the tail on (str) :param nlines: number of lines (int) :return: file tail (str). """ - _, stdout, _ = execute(f'tail -n {nlines} {filename}') # protection if not isinstance(stdout, str): @@ -241,7 +239,6 @@ def head(filename: str, count: int = 20) -> list: :param count: number of lines (int) :return: head lines (list). 
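
    Editorial usage sketch (the file name is invented):

        last_part = tail('/srv/pilot/pilotlog.txt', nlines=20)    # single string with the last 20 lines
        first_lines = head('/srv/pilot/pilotlog.txt', count=5)    # list with (up to) the first 5 lines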
""" - ret = None with open(filename, 'r', encoding='utf-8') as _file: lines = [_file.readline() for line in range(1, count + 1)] @@ -264,7 +261,6 @@ def grep(patterns: list, file_name: str) -> list: :param file_name: file name (str) :return: list of matched lines in file (list). """ - matched_lines = [] _pats = [] for pattern in patterns: @@ -288,7 +284,6 @@ def grep(patterns: list, file_name: str) -> list: def convert(data: Union[str, Mapping, Iterable]) -> Union[str, dict, list]: - """ Convert unicode data to utf-8. @@ -311,7 +306,6 @@ def convert(data: Union[str, Mapping, Iterable]) -> Union[str, dict, list]: :param data: unicode object to be converted to utf-8 :return: converted data to utf-8 """ - if isinstance(data, str): ret = str(data) elif isinstance(data, MappingABC): @@ -352,7 +346,6 @@ def read_list(filename: str) -> list: :param filename: file name (str) :return: file content (list). """ - _list = [] # open output file for reading @@ -367,13 +360,12 @@ def read_list(filename: str) -> list: def read_json(filename: str) -> dict: """ - Read a dictionary with unicode to utf-8 conversion + Read a dictionary with unicode to utf-8 conversion. :param filename: file name (str) :raises PilotException: FileHandlingFailure, ConversionFailure :return: json dictionary (dict). """ - dictionary = None _file = open_file(filename, 'r') if _file: @@ -397,25 +389,22 @@ def read_json(filename: str) -> dict: def write_json(filename: str, data: Union[dict, list], sort_keys: bool = True, indent: int = 4, separators: tuple = (',', ': ')) -> bool: - - """ + r""" Write the dictionary to a JSON file. - :param filename: file name (string). - :param data: object to be written to file (dictionary or list). - :param sort_keys: should entries be sorted? (boolean). - :param indent: indentation level, default 4 (int). + :param filename: file name (str) + :param data: object to be written to file (dictionary or list) + :param sort_keys: should entries be sorted? (boolean) + :param indent: indentation level, default 4 (int) :param separators: field separators (default (',', ': ') for dictionaries, use e.g. (',\n') for lists) (tuple) - :raises PilotException: FileHandlingFailure. - :return: status (boolean). + :return: status (bool). """ - status = False try: with open(filename, 'w', encoding='utf-8') as _fh: dumpjson(data, _fh, sort_keys=sort_keys, indent=indent, separators=separators) - except IOError as exc: + except (IOError, TypeError) as exc: logger.warning(f'exception caught in write_json: {exc}') else: status = True @@ -423,14 +412,14 @@ def write_json(filename: str, data: Union[dict, list], sort_keys: bool = True, i return status -def touch(path): +def touch(path: str): """ Touch a file and update mtime in case the file exists. + Default to use execute() if case of python problem with appending to non-existant path. - :param path: full path to file to be touched (string). + :param path: full path to file to be touched (str). """ - try: with open(path, 'a', encoding='utf-8'): os.utime(path, None) @@ -438,14 +427,14 @@ def touch(path): execute(f'touch {path}') -def remove_empty_directories(src_dir): +def remove_empty_directories(src_dir: str): """ - Removal of empty directories in the given src_dir tree. - Only empty directories will be removed. + Remove empty directories in the given src_dir tree. + + Note: Only _empty_ directories will be removed. :param src_dir: directory to be purged of empty directories. 
""" - for dirpath, _, _ in os.walk(src_dir, topdown=False): if dirpath == src_dir: break @@ -455,13 +444,13 @@ def remove_empty_directories(src_dir): pass -def remove(path): - """ - Remove file. - :param path: path to file (string). - :return: 0 if successful, -1 if failed (int) +def remove(path: str): """ + Remove the given file. + :param path: path to file (str) + :return: 0 if successful, -1 if failed (int). + """ ret = -1 try: os.remove(path) @@ -474,13 +463,13 @@ def remove(path): return ret -def remove_dir_tree(path): - """ - Remove directory tree. - :param path: path to directory (string). - :return: 0 if successful, -1 if failed (int) +def remove_dir_tree(path: str) -> int: """ + Remove the given directory tree. + :param path: path to directory (str) + :return: 0 if successful, -1 if failed (int). + """ try: rmtree(path) except OSError as exc: @@ -489,16 +478,17 @@ def remove_dir_tree(path): return 0 -def remove_files(files, workdir=None): +def remove_files(files: list, workdir: str = "") -> int: """ - Remove all given files from workdir. + + Remove all given files from the given workdir. + If workdir is set, it will be used as base path. - :param files: file list - :param workdir: optional working directory (string) - :return: exit code (0 if all went well, -1 otherwise) + :param files: file list (list) + :param workdir: optional working directory (str) + :return: exit code (0 if all went well, -1 otherwise) (int). """ - exitcode = 0 if not isinstance(files, list): logger.warning(f'files parameter not a list: {type(files)}') @@ -513,17 +503,16 @@ def remove_files(files, workdir=None): return exitcode -def tar_files(wkdir, excludedfiles, logfile_name, attempt=0): +def tar_files(wkdir: str, excludedfiles: list, logfile_name: str, attempt: int = 0) -> int: """ - Tarring of files in given directory. + Tar the files in the given directory. - :param wkdir: work directory (string) + :param wkdir: work directory (str) :param excludedfiles: list of files to be excluded from tar operation (list) - :param logfile_name: file name (string) - :param attempt: attempt number (integer) - :return: 0 if successful, 1 in case of error (int) + :param logfile_name: file name (str) + :param attempt: attempt number (int) + :return: 0 if successful, 1 in case of error (int). """ - to_pack = [] pack_start = time.time() for path, _, files in os.walk(wkdir): @@ -558,14 +547,13 @@ def tar_files(wkdir, excludedfiles, logfile_name, attempt=0): return 0 -def move(path1, path2): +def move(path1: str, path2: str): """ Move a file from path1 to path2. - :param path1: source path (string). - :param path2: destination path2 (string). + :param path1: source path (str) + :param path2: destination path (str). """ - if not os.path.exists(path1): diagnostic = f'file copy failure: path does not exist: {path1}' logger.warning(diagnostic) @@ -581,15 +569,14 @@ def move(path1, path2): logger.info(f"moved {path1} to {path2}") -def copy(path1, path2): +def copy(path1: str, path2: str): """ Copy path1 to path2. - :param path1: file path (string). - :param path2: file path (string). - :raises PilotException: FileHandlingFailure, NoSuchFile + :param path1: file path (str) + :param path2: file path (str) + :raises PilotException: FileHandlingFailure, NoSuchFile. 
""" - if not os.path.exists(path1): diagnostics = f'file copy failure: path does not exist: {path1}' logger.warning(diagnostics) @@ -604,15 +591,14 @@ def copy(path1, path2): logger.info(f"copied {path1} to {path2}") -def add_to_total_size(path, total_size): +def add_to_total_size(path: str, total_size: int): """ Add the size of file in the given path to the total size of all in/output files. - :param path: path to file (string). - :param total_size: prior total size of all input/output files (long). - :return: total size of all input/output files (long). + :param path: path to file (str) + :param total_size: prior total size of all input/output files (int) + :return: total size of all input/output files (int). """ - if os.path.exists(path): # Get the file size fsize = get_local_file_size(path) @@ -625,14 +611,13 @@ def add_to_total_size(path, total_size): return total_size -def get_local_file_size(filename): +def get_local_file_size(filename: str) -> int: """ Get the file size of a local file. - :param filename: file name (string). + :param filename: file name (str) :return: file size (int). """ - file_size = None if os.path.exists(filename): @@ -649,17 +634,18 @@ def get_local_file_size(filename): def get_guid(): """ Generate a GUID using the uuid library. + E.g. guid = '92008FAF-BE4C-49CF-9C5C-E12BC74ACD19' - :return: a random GUID (string) + :return: a random GUID (str). """ - return str(uuid.uuid4()).upper() -def get_table_from_file(filename, header=None, separator="\t", convert_to_float=True): +def get_table_from_file(filename: str, header: str = "", separator: str = "\t", convert_to_float: bool = True) -> bool: """ Extract a table of data from a txt file. + E.g. header="Time VMEM PSS RSS Swap rchar wchar rbytes wbytes" or the first line in the file is @@ -670,13 +656,12 @@ def get_table_from_file(filename, header=None, separator="\t", convert_to_float= The output dictionary will have the format {'Time': [ .. data from first row .. ], 'VMEM': [.. data from second row], ..} - :param filename: name of input text file, full path (string). - :param header: header string. - :param separator: separator character (char). - :param convert_to_float: boolean, if True, all values will be converted to floats. - :return: dictionary. + :param filename: name of input text file, full path (str) + :param header: header string (str) + :param separator: separator character (str) + :param convert_to_float: boolean, if True, all values will be converted to floats (bool) + :return: dictionary (dict). """ - tabledict = {} keylist = [] # ordered list of dictionary key names @@ -713,17 +698,17 @@ def get_table_from_file(filename, header=None, separator="\t", convert_to_float= return tabledict -def _define_tabledict_keys(header, fields, separator): +def _define_tabledict_keys(header: str, fields: str, separator: str) -> (dict, list): """ Define the keys for the tabledict dictionary. + Note: this function is only used by parse_table_from_file(). - :param header: header string. - :param fields: header content string. - :param separator: separator character (char). - :return: tabledict (dictionary), keylist (ordered list with dictionary key names). + :param header: header (str) + :param fields: header content (str) + :param separator: separator character (str) + :return: tabledict (dict), keylist (ordered list with dictionary key names). 
""" - tabledict = {} keylist = [] @@ -751,18 +736,18 @@ def _define_tabledict_keys(header, fields, separator): return tabledict, keylist -def calculate_checksum(filename, algorithm='adler32'): +def calculate_checksum(filename: str, algorithm: str = "adler32") -> str: """ Calculate the checksum value for the given file. + The default algorithm is adler32. Md5 is also be supported. Valid algorithms are 1) adler32/adler/ad32/ad, 2) md5/md5sum/md. - :param filename: file name (string). - :param algorithm: optional algorithm string. - :raises FileHandlingFailure, NotImplementedError, Exception. - :return: checksum value (string). + :param filename: file name (str) + :param algorithm: optional algorithm string (str) + :raises FileHandlingFailure, NotImplementedError, Exception + :return: checksum value (str). """ - if not os.path.exists(filename): raise FileHandlingFailure(f'file does not exist: {filename}') @@ -780,17 +765,18 @@ def calculate_checksum(filename, algorithm='adler32'): raise NotImplementedError() -def calculate_adler32_checksum(filename): +def calculate_adler32_checksum(filename: str) -> str: """ + Calculate the adler32 checksum for the given file. + An Adler-32 checksum is obtained by calculating two 16-bit checksums A and B and concatenating their bits into a 32-bit integer. A is the sum of all bytes in the stream plus one, and B is the sum of the individual values of A from each step. - :param filename: file name (string). + :param filename: file name (str) :raises: Exception. - :returns: hexadecimal string, padded to 8 values (string). + :returns: hexadecimal string, padded to 8 values (str). """ - # adler starting value is _not_ 0 adler = 1 @@ -819,18 +805,18 @@ def calculate_adler32_checksum(filename): adler = adler + 2 ** 32 # convert to hex - return "{0:08x}".format(adler) + return f"{adler:08x}" -def calculate_md5_checksum(filename): +def calculate_md5_checksum(filename: str): """ Calculate the md5 checksum for the given file. + The file is assumed to exist. - :param filename: file name (string). - :return: checksum value (string). + :param filename: file name (str) + :return: checksum value (str). """ - length = io.DEFAULT_BUFFER_SIZE md5 = hashlib.md5() @@ -841,9 +827,10 @@ def calculate_md5_checksum(filename): return md5.hexdigest() -def get_checksum_value(checksum): +def get_checksum_value(checksum: str) -> str: """ - Return the checksum value. + Return the actual checksum value from the full checksum string. + The given checksum might either be a standard ad32 or md5 string, or a dictionary with the format { checksum_type: value } as defined in the `FileSpec` class. This function extracts the checksum value from this dictionary (or immediately returns the checksum value if the given value is a string). @@ -851,7 +838,6 @@ def get_checksum_value(checksum): :param checksum: checksum object (string or dictionary). :return: checksum. checksum string. """ - if isinstance(checksum, str): return checksum @@ -864,17 +850,17 @@ def get_checksum_value(checksum): return checksum_value -def get_checksum_type(checksum): +def get_checksum_type(checksum: Any) -> str: """ Return the checksum type (ad32 or md5). + The given checksum can be either be a standard ad32 or md5 value, or a dictionary with the format { checksum_type: value } as defined in the `FileSpec` class. In case the checksum type cannot be identified, the function returns 'unknown'. - :param checksum: checksum string or dictionary. + :param checksum: checksum string or dictionary (Any) :return: checksum type (string). 
""" - checksum_type = 'unknown' if isinstance(checksum, dict): for key in list(checksum.keys()): @@ -890,16 +876,15 @@ def get_checksum_type(checksum): return checksum_type -def scan_file(path, error_messages, warning_message=None): +def scan_file(path: str, error_messages: list, warning_message: str = ""): """ - Scan file for known error messages. + Scan the given file for known error messages. - :param path: path to file (string). - :param error_messages: list of error messages. - :param warning_message: optional warning message to be printed with any of the error_messages have been found (string). - :return: Boolean. (note: True means the error was found) + :param path: path to file (str) + :param error_messages: list of error messages (list) + :param warning_message: optional warning message to be printed with any of the error_messages have been found (str) + :return: True if error was found, False otherwise (bool) """ - found_problem = False matched_lines = grep(error_messages, path) @@ -913,14 +898,13 @@ def scan_file(path, error_messages, warning_message=None): return found_problem -def verify_file_list(list_of_files): +def verify_file_list(list_of_files: list) -> list: """ - Make sure that the files in the given list exist, return the list of files that does exist. + Make sure that the files in the given list exist, return the list of files that do exist. - :param list_of_files: file list. - :return: list of existing files. + :param list_of_files: file list (list) + :return: list of existing files (list). """ - # remove any non-existent files from the input file list filtered_list = [f for f in list_of_files if os.path.exists(f)] @@ -931,15 +915,15 @@ def verify_file_list(list_of_files): return filtered_list -def find_latest_modified_file(list_of_files): +def find_latest_modified_file(list_of_files: list) -> (str, int): """ Find the most recently modified file among the list of given files. + In case int conversion of getmtime() fails, int(time.time()) will be returned instead. - :param list_of_files: list of files with full paths. - :return: most recently updated file (string), modification time (int). + :param list_of_files: list of files with full paths (list) + :return: most recently updated file (str), modification time (int). """ - if not list_of_files: logger.warning('there were no files to check mod time for') return None, None @@ -955,14 +939,14 @@ def find_latest_modified_file(list_of_files): return latest_file, mtime -def list_mod_files(file_list): +def list_mod_files(file_list: list): """ List file names along with the mod times. + Called before looping killer is executed. - :param file_list: list of files with full paths. + :param file_list: list of files with full paths (list). """ - if file_list: logger.info('dumping info for recently modified files prior to looping job kill') for _file in file_list: @@ -973,15 +957,14 @@ def list_mod_files(file_list): logger.info(f'file name={_file} : mod_time={size}') -def dump(path, cmd="cat"): +def dump(path: str, cmd: str = "cat"): """ Dump the content of the file in the given path to the log. - :param path: file path (string). - :param cmd: optional command (string). - :return: cat (string). + :param path: file path (str) + :param cmd: optional command (str) + :return: cat (str). 
""" - if os.path.exists(path) or cmd == "echo": _cmd = f"{cmd} {path}" _, stdout, stderr = execute(_cmd) @@ -990,18 +973,17 @@ def dump(path, cmd="cat"): logger.info(f"path {path} does not exist") -def remove_core_dumps(workdir, pid=None): +def remove_core_dumps(workdir: str, pid: int = 0): """ - Remove any remaining core dumps so they do not end up in the log tarball + Remove any remaining core dumps so they do not end up in the log tarball. A core dump from the payload process should not be deleted if in debug mode (checked by the called). Also, a found core dump from a non-payload process, should be removed but should result in function returning False. - :param workdir: working directory for payload (string). - :param pid: payload pid (integer). - :return: Boolean (True if a payload core dump is found) + :param workdir: working directory for payload (str) + :param pid: payload pid (int) + :return: True if a payload core dump is found, False otherwise (bool). """ - found = False coredumps = glob(f"{workdir}/core.*") + glob(f"{workdir}/core") @@ -1015,34 +997,32 @@ def remove_core_dumps(workdir, pid=None): return found -def get_nonexistant_path(fname_path): +def get_nonexistant_path(fname_path: str) -> str: """ Get the path to a filename which does not exist by incrementing path. - :param fname_path: file name path (string). - :return: file name path (string). + :param fname_path: file name path (str) + :return: file name path (str). """ - if not os.path.exists(fname_path): return fname_path filename, file_extension = os.path.splitext(fname_path) i = 1 - new_fname = "{}-{}{}".format(filename, i, file_extension) + new_fname = f"{filename}-{i}{file_extension}" while os.path.exists(new_fname): i += 1 - new_fname = "{}-{}{}".format(filename, i, file_extension) + new_fname = f"{filename}-{i}{file_extension}" return new_fname -def update_extension(path='', extension=''): +def update_extension(path: str = "", extension: str = "") -> str: """ Update the file name extension to the given extension. - :param path: file path (string). - :param extension: new extension (string). - :return: file path with new extension (string). + :param path: file path (str) + :param extension: new extension (str) + :return: file path with new extension (str). """ - path, _ = os.path.splitext(path) if not extension.startswith('.'): extension = '.' + extension @@ -1051,14 +1031,13 @@ def update_extension(path='', extension=''): return path -def get_valid_path_from_list(paths): +def get_valid_path_from_list(paths: list) -> str: """ Return the first valid path from the given list. - :param paths: list of file paths. - :return: first valid path from list (string). + :param paths: list of file paths (list) + :return: first valid path from list (str). """ - valid_path = None for path in paths: if os.path.exists(path): @@ -1068,16 +1047,16 @@ def get_valid_path_from_list(paths): return valid_path -def copy_pilot_source(workdir, filename=None): +def copy_pilot_source(workdir: str, filename: str = "") -> str: """ Copy the pilot source into the work directory. + If a filename is specified, only that file will be copied. - :param workdir: working directory (string). - :param filename: specific filename (string). - :return: diagnostics (string). + :param workdir: working directory (str) + :param filename: specific filename (str) + :return: diagnostics (str). 
""" - diagnostics = "" srcdir = os.path.join(os.environ.get('PILOT_SOURCE_DIR', '.'), 'pilot3') @@ -1086,6 +1065,9 @@ def copy_pilot_source(workdir, filename=None): try: logger.debug(f'copy {srcdir} to {workdir}') + # replace with: + # pat = f"{filename}" if filename else f"{filename}/*" + # cmd = f"cp -pr {pat} {srcdir} {workdir}" pat = '%s' if filename else '%s/*' cmd = f'cp -pr {pat} %s' % (srcdir, workdir) exit_code, stdout, _ = execute(cmd) @@ -1099,14 +1081,13 @@ def copy_pilot_source(workdir, filename=None): return diagnostics -def create_symlink(from_path='', to_path=''): +def create_symlink(from_path: str = "", to_path: str = ""): """ Create a symlink from/to the given paths. :param from_path: from path (string). :param to_path: to path (string). """ - try: os.symlink(from_path, to_path) except (OSError, FileNotFoundError) as exc: @@ -1115,18 +1096,17 @@ def create_symlink(from_path='', to_path=''): logger.debug(f'created symlink from {from_path} to {to_path}') -def locate_file(pattern): +def locate_file(pattern: str) -> str: """ - Locate a file defined by the pattern. + Locate a file defined by the given pattern. Example: pattern = os.path.join(os.getcwd(), '**/core.123') -> /Users/Paul/Development/python/tt/core.123 - :param pattern: pattern name (string). - :return: path (string). + :param pattern: pattern name (str) + :return: path (str). """ - path = None for fname in glob(pattern): if os.path.isfile(fname): @@ -1135,14 +1115,13 @@ def locate_file(pattern): return path -def find_last_line(filename): +def find_last_line(filename: str) -> str: """ Find the last line in a (not too large) file. - :param filename: file name, full path (string). - :return: last line (string). + :param filename: file name, full path (str) + :return: last line (str). """ - last_line = "" with open(filename) as _file: line = "" @@ -1154,14 +1133,13 @@ def find_last_line(filename): return last_line -def get_disk_usage(start_path='.'): +def get_disk_usage(start_path: str = "."): """ Calculate the disk usage of the given directory (including any sub-directories). - :param start_path: directory (string). + :param start_path: directory (str) :return: disk usage in bytes (int). """ - total_size = 0 for dirpath, _, filenames in os.walk(start_path): for fname in filenames: @@ -1177,15 +1155,14 @@ def get_disk_usage(start_path='.'): return total_size -def extract_lines_from_file(pattern, filename): +def extract_lines_from_file(pattern: str, filename: str) -> str: """ - Extract all lines containing 'pattern' from given file. + Extract all lines containing the given pattern from the given file. - :param pattern: text (string). - :param filename: file name (string). - :return: text (string). + :param pattern: text (str) + :param filename: file name (str) + :return: text (str). """ - _lines = '' try: with open(filename, 'r') as _file: @@ -1199,15 +1176,14 @@ def extract_lines_from_file(pattern, filename): return _lines -def find_file(filename, startdir): +def find_file(filename: str, startdir: str) -> str: """ - Locate a file in a subdirectory to the given start directory. + Locate a file in a subdirectory of the given start directory. - :param filename: file name (string). - :param startdir: start directory for search (string). - :return: full path (string). + :param filename: file name (str) + :param startdir: start directory for search (str) + :return: full path (str). 
""" - logger.debug(f'looking for {filename} in start dir {startdir}') _path = None for path in Path(startdir).rglob(filename): @@ -1218,15 +1194,14 @@ def find_file(filename, startdir): return _path -def zip_files(archivename, files): +def zip_files(archivename: str, files: list) -> bool: """ - Zip a list of files with standard compression level. + Compress a list of files with the standard compression level. - :param archivename: archive name (string). - :param files: list of files. - :return: status (Boolean) + :param archivename: archive name (str) + :param files: list of files (list) + :return: status (bool). """ - status = False try: @@ -1247,27 +1222,43 @@ def zip_files(archivename, files): return status -def generate_test_file(filename, filesize=1024): +def generate_test_file(filename: str, filesize: int = 1024): """ Generate a binary file with the given size in Bytes. - :param filename: full path, file name (string) - :param filesize: file size in Bytes (int) + :param filename: full path, file name (str) + :param filesize: file size in Bytes (int). """ - with open(filename, 'wb') as fout: fout.write(os.urandom(filesize)) # replace 1024 with a size in kilobytes if it is not unreasonably large def get_directory_size(directory: str) -> float: + """ + Measure the size of the given directory. + + :param directory: full directory path (str) + :return: size in MB (float). + """ + + size_mb = None + try: + size_mb = get_disk_usage(directory) / 1024 / 1024 + except Exception as exc: + logger.warning(f'failed to get directory size: {exc}') + + return size_mb + + +def old_get_directory_size(directory: str) -> float: """ Measure the size of the given directory with du -sh. + The function will return None in case of failure. - :param directory: full directory path (string). + :param directory: full directory path (str) :return: size in MB (float). """ - size_mb = None command = ["du", "-sh", directory] output = subprocess.check_output(command) @@ -1288,10 +1279,9 @@ def get_total_input_size(files: Any, nolib: bool = True) -> int: Calculate the total input file size, but do not include the lib file if present. :param files: files object (list of FileSpec) - :param nolib: if True, do not include the lib file in the calculation + :param nolib: if True, do not include the lib file in the calculation (bool) :return: total input file size in bytes (int). """ - if not nolib: total_size = reduce(lambda x, y: x + y.filesize, files, 0) else: @@ -1305,13 +1295,12 @@ def get_total_input_size(files: Any, nolib: bool = True) -> int: def append_to_file(from_file: str, to_file: str) -> bool: """ - Appends the contents of one file to another. + Append the contents of one file to another. :param from_file: The path to the source file to read from (str) :param to_file: The path to the target file to append to (str) :return: True if the operation was successful, False otherwise (bool). """ - status = False try: # 1 kB chunk size @@ -1342,7 +1331,7 @@ def append_to_file(from_file: str, to_file: str) -> bool: return status -def find_files_with_pattern(directory, pattern): +def find_files_with_pattern(directory: str, pattern: str) -> list: """ Find files in a directory that match a specified pattern. @@ -1350,7 +1339,6 @@ def find_files_with_pattern(directory, pattern): :param pattern: The pattern to match filenames (str) :return: a list of matching filenames found in the directory (list). 
""" - try: if not os.path.exists(directory): raise FileNotFoundError(f"directory '{directory}' does not exist") @@ -1368,7 +1356,6 @@ def rename_xrdlog(name: str): :param name: local file name (str). """ - xrd_logfile = os.environ.get('XRD_LOGFILE', None) if xrd_logfile: # xrootd is then expected to have produced a corresponding log file diff --git a/pilot/util/filestate.py b/pilot/util/filestate.py index f76ad679..a2445f0d 100644 --- a/pilot/util/filestate.py +++ b/pilot/util/filestate.py @@ -19,14 +19,17 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2022-23 +"""Handling of file states.""" + import logging logger = logging.getLogger(__name__) class FileState(object): - """ + File state class. + FS = FileState(file_status={'lfns': ['LFN1.DAT', 'LFN2.DAT']}) FS.update(lfn='LFN1.DAT', state='TRANSFERRED') print(FS.get_file_states()) @@ -38,32 +41,33 @@ class FileState(object): def __init__(self, file_states={}): """ - Default init function. - """ + Initialize variables. + :param file_states: file states (dict). + """ self._lfns = file_states.get('lfns', []) self.set_initial_list() def set_initial_list(self): - """ - Set the initial file states list. - """ - + """Set the initial file states list.""" for lfn in self._lfns: self._file_states[lfn] = 'NOT_YET_TRANSFERRED' - def get_file_states(self): - """ - Return the current file states list. + def get_file_states(self) -> dict: """ + Return the current file states dictionary. + :return: file states (dict). + """ return self._file_states def update(self, lfn='', state=''): """ Update the state for a given LFN. - """ + :param lfn: file name (str) + :param state: file state (str). + """ if not lfn or not state: logger.warning('must set lfn/state') return diff --git a/pilot/util/harvester.py b/pilot/util/harvester.py index b761f7dc..b76faf1b 100644 --- a/pilot/util/harvester.py +++ b/pilot/util/harvester.py @@ -19,9 +19,12 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Functions for interactiving with Harvester.""" + import os import os.path import socket +from typing import Any from pilot.common.exception import FileHandlingFailure from pilot.util.config import config @@ -32,21 +35,23 @@ logger = logging.getLogger(__name__) -def dump(obj): +def dump(obj: Any): """ - function for debugging - dumps object to sysout + Dump given object to stdout. + + :param obj: object (Any). """ for attr in dir(obj): print(f"obj.{attr} = {getattr(obj, attr)}") -def is_harvester_mode(args): +def is_harvester_mode(args: Any) -> bool: """ Determine if the pilot is running in Harvester mode. - :param args: Pilot arguments object. - :return: Boolean. - """ + :param args: Pilot arguments object (Any) + :return: True if Harvester mode, False otherwise (bool). + """ if (args.harvester_workdir != '' or args.harvester_datadir != '') and not args.update_server: harvester = True elif (args.harvester_eventstatusdump != '' or args.harvester_workerattributes != '') and not args.update_server: @@ -59,23 +64,17 @@ def is_harvester_mode(args): return harvester -def get_job_request_file_name(): +def get_job_request_file_name() -> str: """ Return the name of the job request file as defined in the pilot config file. - :return: job request file name. + :return: job request file name (str). """ - return os.path.join(os.environ['PILOT_HOME'], config.Harvester.job_request_file) def remove_job_request_file(): - """ - Remove an old job request file when it is no longer needed. 
- - :return: - """ - + """Remove an old job request file when it is no longer needed.""" path = get_job_request_file_name() if os.path.exists(path): if remove(path) == 0: @@ -86,14 +85,14 @@ def remove_job_request_file(): def request_new_jobs(njobs: int = 1): """ - Inform Harvester that the pilot is ready to process new jobs by creating a job request file with the desired - number of jobs. + Inform Harvester that the pilot is ready to process new jobs by creating a job request file. + + The request file will contain the desired number of jobs. :param njobs: Number of jobs. Default is 1 since on grids and clouds the pilot does not know how many jobs it can - process before it runs out of time + process before it runs out of time (int) :raises: FileHandlingFailure if write_json() fails. """ - path = get_job_request_file_name() dictionary = {'nJobs': njobs} @@ -106,48 +105,41 @@ def request_new_jobs(njobs: int = 1): def kill_worker(): """ Create (touch) a kill_worker file in the pilot launch directory. - This file will let Harverster know that the pilot has finished. - :return: + This file will let Harverster know that the pilot has finished. """ - touch(os.path.join(os.environ['PILOT_HOME'], config.Harvester.kill_worker_file)) -def get_initial_work_report(): +def get_initial_work_report() -> dict: """ Prepare the work report dictionary. + Note: the work_report should also contain all fields defined in parse_jobreport_data(). - :return: work report dictionary. + :return: work report dictionary (dict). """ - - hostname = os.environ.get('PANDA_HOSTNAME', socket.gethostname()) - work_report = {'jobStatus': 'starting', - 'messageLevel': logging.getLevelName(logger.getEffectiveLevel()), - 'cpuConversionFactor': 1.0, - 'cpuConsumptionTime': '', - 'node': hostname, - 'workdir': '', - 'timestamp': time_stamp(), - 'endTime': '', - 'transExitCode': 0, - 'pilotErrorCode': 0, # only add this in case of failure? - } - - return work_report - - -def get_event_status_file(args): + return {'jobStatus': 'starting', + 'messageLevel': logging.getLevelName(logger.getEffectiveLevel()), + 'cpuConversionFactor': 1.0, + 'cpuConsumptionTime': '', + 'node': os.environ.get('PANDA_HOSTNAME', socket.gethostname()), + 'workdir': '', + 'timestamp': time_stamp(), + 'endTime': '', + 'transExitCode': 0, + 'pilotErrorCode': 0, # only add this in case of failure? + } + + +def get_event_status_file(args: Any) -> str: """ - Return the name of the event_status.dump file as defined in the pilot config file - and from the pilot arguments. + Return the name of the event_status.dump file. - :param args: Pilot arguments object. - :return: event staus file name. + :param args: Pilot arguments object (Any) + :return: event staus file name (str). """ - - logger.debug('config.Harvester.__dict__ : {0}'.format(config.Harvester.__dict__)) + logger.debug(f'config.Harvester.__dict__ : {config.Harvester.__dict__}') if args.harvester_workdir != '': work_dir = args.harvester_workdir @@ -155,21 +147,19 @@ def get_event_status_file(args): work_dir = os.environ['PILOT_HOME'] event_status_file = config.Harvester.stageoutnfile event_status_file = os.path.join(work_dir, event_status_file) - logger.debug('event_status_file = {}'.format(event_status_file)) + logger.debug(f'event_status_file = {event_status_file}') return event_status_file -def get_worker_attributes_file(args): +def get_worker_attributes_file(args: Any): """ - Return the name of the worker attributes file as defined in the pilot config file - and from the pilot arguments. 
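# Minimal sketch of the Harvester job-request handshake used by
# request_new_jobs(): drop a small JSON file ({'nJobs': N}) into PILOT_HOME.
# The default file name below is an assumption standing in for
# config.Harvester.job_request_file.
import json
import os

def write_job_request(njobs: int = 1, request_file: str = "worker_requestjob.json") -> str:
    path = os.path.join(os.environ.get('PILOT_HOME', os.getcwd()), request_file)
    with open(path, 'w', encoding='utf-8') as fh:
        json.dump({'nJobs': njobs}, fh)
    return path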
+ Return the name of the worker attributes file. - :param args: Pilot arguments object. - :return: worker attributes file name. + :param args: Pilot arguments object (Any) + :return: worker attributes file name (str). """ - - logger.debug('config.Harvester.__dict__ : {0}'.format(config.Harvester.__dict__)) + logger.debug(f'config.Harvester.__dict__ : {config.Harvester.__dict__}') if args.harvester_workdir != '': work_dir = args.harvester_workdir @@ -177,39 +167,38 @@ def get_worker_attributes_file(args): work_dir = os.environ['PILOT_HOME'] worker_attributes_file = config.Harvester.workerattributesfile worker_attributes_file = os.path.join(work_dir, worker_attributes_file) - logger.debug('worker_attributes_file = {}'.format(worker_attributes_file)) + logger.debug(f'worker_attributes_file = {worker_attributes_file}') return worker_attributes_file -def findfile(path, name): +def findfile(path: str, name: str) -> str: """ - find the first instance of file in the directory tree + Find the first instance of file in the directory tree. - :param path: directory tree to search - :param name: name of the file to search - - :return: the path to the first instance of the file + :param path: directory tree to search (str) + :param name: name of the file to search (str) + :return: the path to the first instance of the file (str). """ - + filename = "" for root, dirs, files in os.walk(path): if name in files: - return os.path.join(root, name) - return '' + filename = os.path.join(root, name) + break + return filename -def publish_stageout_files(job, event_status_file): + +def publish_stageout_files(job: Any, event_status_file: str) -> bool: """ - Publishing of work report to file. - The work report dictionary should contain the fields defined in get_initial_work_report(). + Publish the work report for stageout. - :param args: Pilot arguments object. - :param job: job object. - :param event status file name: + The work report dictionary should contain the fields defined in get_initial_work_report(). - :return: Boolean. status of writing the file information to a json + :param job: job object (Any) + :param event_status_file: file ane (str) + :return: status of writing the file information to a json (bool). 
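# Example (values are made up) of the per-file description assembled below by
# publish_stageout_files() for each log/output file before it is written to
# the event status dump.
example_file_desc = {
    'type': 'log',                       # fspec.filetype
    'path': '/work/dir/pilotlog.txt',    # located via findfile()
    'guid': '6cd34a54-e354-4b33-9c4c',   # fspec.guid (illustrative)
    'fsize': 1024,                       # fspec.filesize
    'chksum': '0a1b2c3d',                # get_checksum_value(fspec.checksum)
}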
""" - # get the harvester workdir from the event_status_file work_dir = os.path.dirname(event_status_file) @@ -218,11 +207,11 @@ def publish_stageout_files(job, event_status_file): # first look at the logfile information (logdata) from the FileSpec objects for fspec in job.logdata: - logger.debug("File {} will be checked and declared for stage out".format(fspec.lfn)) + logger.debug(f"file {fspec.lfn} will be checked and declared for stage out") # find the first instance of the file filename = os.path.basename(fspec.surl) path = findfile(work_dir, filename) - logger.debug("Found File {} at path - {}".format(fspec.lfn, path)) + logger.debug(f"found File {fspec.lfn} at path - {path}") # file_desc = {} file_desc['type'] = fspec.filetype @@ -230,12 +219,12 @@ def publish_stageout_files(job, event_status_file): file_desc['guid'] = fspec.guid file_desc['fsize'] = fspec.filesize file_desc['chksum'] = get_checksum_value(fspec.checksum) - logger.debug("File description - {} ".format(file_desc)) + logger.debug(f"file description - {file_desc} ") out_file_report[job.jobid].append(file_desc) # Now look at the output file(s) information (outdata) from the FileSpec objects for fspec in job.outdata: - logger.debug("File {} will be checked and declared for stage out".format(fspec.lfn)) + logger.debug(f"file {fspec.lfn} will be checked and declared for stage out") if fspec.status != 'transferred': logger.debug('will not add the output file to the json since it was not produced or transferred') else: @@ -243,9 +232,9 @@ def publish_stageout_files(job, event_status_file): filename = os.path.basename(fspec.surl) path = findfile(work_dir, filename) if not path: - logger.warning('file %s was not found - will not be added to json') + logger.warning(f'file {path} was not found - will not be added to json') else: - logger.debug("Found File {} at path - {}".format(fspec.lfn, path)) + logger.debug(f"found File {fspec.lfn} at {path}") # file_desc = {} file_desc['type'] = fspec.filetype @@ -253,33 +242,33 @@ def publish_stageout_files(job, event_status_file): file_desc['guid'] = fspec.guid file_desc['fsize'] = fspec.filesize file_desc['chksum'] = get_checksum_value(fspec.checksum) - logger.debug("File description - {} ".format(file_desc)) + logger.debug(f"File description - {file_desc} ") out_file_report[job.jobid].append(file_desc) if out_file_report[job.jobid]: if write_json(event_status_file, out_file_report): - logger.debug('Stagout declared in: {0}'.format(event_status_file)) - logger.debug('Report for stageout: {}'.format(out_file_report)) + logger.debug(f'stagout declared in: {event_status_file}') + logger.debug(f'report for stageout: {out_file_report}') return True else: - logger.debug('Failed to declare stagout in: {0}'.format(event_status_file)) + logger.debug(f'failed to declare stagout in: {event_status_file}') return False else: - logger.debug('No Report for stageout') + logger.debug('no report for stageout') return False -def publish_work_report(work_report=None, worker_attributes_file="worker_attributes.json"): +def publish_work_report(work_report: dict = {}, worker_attributes_file: str = "worker_attributes.json") -> bool: """ - Publishing of work report to file. + Publish the work report. + The work report dictionary should contain the fields defined in get_initial_work_report(). - :param work_report: work report dictionary. - :param worker_attributes_file: - :raises FileHandlingFailure: in case of IOError. 
- :return: True or False + :param work_report: work report dictionary (dict) + :param worker_attributes_file: file name (str) + :raises FileHandlingFailure: in case of IOError + :return: True if successfully published, False otherwise (bool). """ - if work_report: work_report['timestamp'] = time_stamp() if "outputfiles" in work_report: @@ -291,33 +280,33 @@ def publish_work_report(work_report=None, worker_attributes_file="worker_attribu ec = write_json(worker_attributes_file, work_report) if ec: - logger.error("work report publish failed: {0}".format(work_report)) + logger.error(f"work report publish failed: {work_report}") return False else: - logger.info("work report published: {0}".format(work_report)) + logger.info(f"work report published: {work_report}") return True else: # No work_report return False return False -def publish_job_report(job, args, job_report_file="jobReport.json"): +def publish_job_report(job: Any, args: Any, job_report_file: str = "jobReport.json") -> str: """ + Publish the job report. + Copy job report file to make it accessible by Harvester. Shrink job report file. - :param job: job object. - :param args: Pilot arguments object. - :param job_report_file: name of job report (string). - :raises FileHandlingFailure: in case of IOError. - :return True or False + :param job: job object (Any) + :param args: Pilot arguments object (Any) + :param job_report_file: name of job report (str) + :raises FileHandlingFailure: in case of IOError + :return True if successfully published, False otherwise (bool). """ - src_file = os.path.join(job.workdir, job_report_file) dst_file = os.path.join(args.harvester_workdir, job_report_file) try: - logger.info( - "copy of payload report [{0}] to access point: {1}".format(job_report_file, args.harvester_workdir)) + logger.info(f"copy of payload report [{job_report_file}] to access point: {args.harvester_workdir}") # shrink jobReport job_report = read_json(src_file) if 'executor' in job_report: @@ -335,18 +324,18 @@ def publish_job_report(job, args, job_report_file="jobReport.json"): return False -def parse_job_definition_file(filename): +def parse_job_definition_file(filename: str) -> list: """ - This function parses the Harvester job definition file and re-packages the job definition dictionaries. + Parse the Harvester job definition file and re-package the job definition dictionaries. + The format of the Harvester job definition dictionary is: dict = { job_id: { key: value, .. }, .. } The function returns a list of these dictionaries each re-packaged as dict = { key: value } (where the job_id is now one of the key-value pairs: 'jobid': job_id) - :param filename: file name (string). - :return: list of job definition dictionaries. + :param filename: file name (str) + :return: list of job definition dictionaries (list). """ - job_definitions_list = [] # re-package dictionaries diff --git a/pilot/util/heartbeat.py b/pilot/util/heartbeat.py new file mode 100644 index 00000000..31f1135b --- /dev/null +++ b/pilot/util/heartbeat.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Authors: +# - Paul Nilsson, paul.nilsson@cern.ch, 2023 + +"""Functions related to heartbeat messages. It is especually needed for the pilot to know if it has been suspended.""" + +import logging +import os +import threading +import time + +# from pilot.common.errorcodes import ErrorCodes +from pilot.common.exception import ( + PilotException, + FileHandlingFailure, + ConversionFailure +) +from pilot.util.config import config +from pilot.util.filehandling import ( + read_json, + write_json +) + +lock = threading.Lock() +logger = logging.getLogger(__name__) +# errors = ErrorCodes() + + +def update_pilot_heartbeat(update_time: float, detected_job_suspension: bool = False, time_since_detection: int = 0, name: str = 'pilot') -> bool: + """ + Update the pilot heartbeat file. + + Dictionary = {last_pilot_heartbeat: , last_server_update: , ( last_looping_check: {job_id: : }, .. ) } + (optionally add looping job info later). + + :param update_time: time of last update (float) + :param detected_job_suspension: True if a job suspension was detected, False otherwise (bool) + :param time_since_detection: time since the job suspension was detected, in seconds (int) + :param name: name of the heartbeat to update, 'pilot' or 'server' (str) + :return: True if successfully updated heartbeat file, False otherwise (bool). + """ + path = os.path.join(os.getenv('PILOT_HOME', os.getcwd()), config.Pilot.pilot_heartbeat_file) + dictionary = read_pilot_heartbeat(path) + if not dictionary: # redundancy + dictionary = {} + + with lock: + # add the diff time (time between updates) to the dictionary if not present (ie the first time) + if not dictionary.get('max_diff_time', None): + # ie add the new field + dictionary['max_diff_time'] = 0 + if not dictionary.get(f'last_{name}_update', None): + # ie add the new field + dictionary[f'last_{name}_update'] = int(update_time) + max_diff_time = int(update_time) - dictionary.get(f'last_{name}_update', 0) + if max_diff_time >= dictionary.get('max_diff_time', 0): + dictionary['max_diff_time'] = max_diff_time + dictionary[f'last_{name}_update'] = int(update_time) + dictionary['time_since_detection'] = time_since_detection if detected_job_suspension else 0 + if detected_job_suspension: + logger.warning(f'job suspension detected: time since detection: {time_since_detection} seconds') + else: + logger.debug('no job suspension detected') + + status = write_json(path, dictionary) + if not status: + logger.warning(f'failed to update heartbeat file: {path}') + return False + else: + logger.debug(f'updated pilot heartbeat file: {path}') + + return True + + +def read_pilot_heartbeat(path: str) -> dict: + """ + Read the pilot heartbeat file. + + :param path: path to heartbeat file (str) + :return: dictionary with pilot heartbeat info (dict). 
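# Standalone sketch of the bookkeeping done by update_pilot_heartbeat():
# record the latest update time and the largest gap observed between
# consecutive updates (the file name here is an assumption).
import json
import os
import time

def touch_heartbeat(path: str = "pilot_heartbeat.json", name: str = "pilot") -> dict:
    data = {}
    if os.path.exists(path):
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
    now = int(time.time())
    last = data.get(f"last_{name}_update", now)
    data["max_diff_time"] = max(data.get("max_diff_time", 0), now - last)
    data[f"last_{name}_update"] = now
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(data, fh)
    return data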
+ """ + dictionary = {} + + with lock: + if os.path.exists(path): + try: + dictionary = read_json(path) + except (PilotException, FileHandlingFailure, ConversionFailure) as exc: + logger.warning(f'failed to read heartbeat file: {exc}') + + return dictionary + + +def get_last_update(name: str = 'pilot') -> int: + """ + Return the time of the last pilot or server update. + + :param name: name of the heartbeat to return (str) + :return: time of last pilot or server update (int). + """ + dictionary = read_pilot_heartbeat() + if dictionary: + return dictionary.get(f'last_{name}_update', 0) + + return 0 + + +def time_since_suspension() -> int: + """ + Return the time since the pilot detected a job suspension. + + If non-zero, reset the time since detection to zero. + + :return: time since the pilot detected a job suspension (int). + """ + path = os.path.join(os.getenv('PILOT_HOME', os.getcwd()), config.Pilot.pilot_heartbeat_file) + dictionary = read_pilot_heartbeat(path) + if dictionary: + time_since_detection = dictionary.get('time_since_detection', 0) + if time_since_detection: + # reset the time since detection to zero + update_pilot_heartbeat(time.time(), False, 0) + logger.info('reset time since detection to zero') + return time_since_detection + + return 0 + + +def is_suspended(limit: int = 10 * 60) -> bool: + """ + Check if the pilot was suspended. + + :param limit: time limit in seconds (int) + :return: True if the pilot is suspended, False otherwise (bool). + """ + last_pilot_update = get_last_update() + if last_pilot_update: + # check if more than ten minutes has passed + if int(time.time()) - last_pilot_update > limit: + return True + + return False diff --git a/pilot/util/https.py b/pilot/util/https.py index 0741c03b..b33245e3 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -21,8 +21,12 @@ # - Mario Lassnig, mario.lassnig@cern.ch, 2017 # - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +"""Functions for https interactions.""" + import json +import logging import os +import pipes import platform import random import socket @@ -31,19 +35,18 @@ import urllib.request import urllib.error import urllib.parse -import pipes from collections import namedtuple -from time import sleep, time from re import findall +from time import sleep, time +from typing import Callable, Any -from .filehandling import write_file, read_file from .config import config from .constants import get_pilot_version from .container import execute +from .filehandling import write_file, read_file from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import FileHandlingFailure -import logging logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -59,18 +62,18 @@ ctx = type('ctx', (object,), dict(ssl_context=None, user_agent='Pilot3 client', capath=None, cacert=None)) -def _tester(func, *args): +def _tester(func: Callable[..., Any], *args: Any) -> Any: """ - Tests function ``func`` on arguments and returns first positive. + Test function ``func`` on the given arguments and return the first positive. >>> _tester(lambda x: x%3 == 0, 1, 2, 3, 4, 5, 6) 3 >>> _tester(lambda x: x%3 == 0, 1, 2) None - :param func: function(arg)->boolean - :param args: other arguments - :return: something or none + :param func: the function to be tested (Callable) + :param args: other arguments (Any) + :return: something or none (Any). 
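# Illustrative check mirroring is_suspended(): if the stored pilot heartbeat
# is older than the limit, assume the batch system suspended the process.
import time

def looks_suspended(last_pilot_update: int, limit: int = 10 * 60) -> bool:
    if not last_pilot_update:
        return False
    return int(time.time()) - last_pilot_update > limit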
""" for arg in args: if arg is not None and func(arg): @@ -79,78 +82,72 @@ def _tester(func, *args): return None -def capath(args=None): +def capath(args: Any = None) -> Any: """ - Tries to get :abbr:`CA (Certification Authority)` path with certificates. - Testifies it to be a directory. - Tries next locations: + Try to get :abbr:`CA (Certification Authority)` path with certificates. + Tries 1. :option:`--capath` from arguments 2. :envvar:`X509_CERT_DIR` from env 3. Path ``/etc/grid-security/certificates`` - :param args: arguments, parsed by `argparse` - :returns: `str` -- directory path, or `None` + :param args: arguments, parsed by argparse (Any) + :returns: directory path (str), or None. """ - return _tester(os.path.isdir, args and args.capath, os.environ.get('X509_CERT_DIR'), '/etc/grid-security/certificates') -def cacert_default_location(): +def cacert_default_location() -> Any: """ - Tries to get current user ID through `os.getuid`, and get the posix path for x509 certificate. + Try to get current user ID through `os.getuid`, and get the posix path for x509 certificate. + :returns: `str` -- posix default x509 path, or `None` """ try: - return '/tmp/x509up_u%s' % str(os.getuid()) + return f'/tmp/x509up_u{os.getuid()}' except AttributeError: - logger.warning('No UID available? System not POSIX-compatible... trying to continue') + logger.warning('no UID available? System not POSIX-compatible... trying to continue') pass return None -def cacert(args=None): +def cacert(args: Any = None) -> Any: """ - Tries to get :abbr:`CA (Certification Authority)` certificate or X509 one. - Testifies it to be a regular file. - Tries next locations: + Try to get :abbr:`CA (Certification Authority)` certificate or X509. + Checks that it is a regular file. + Tries 1. :option:`--cacert` from arguments 2. :envvar:`X509_USER_PROXY` from env 3. Path ``/tmp/x509up_uXXX``, where ``XXX`` refers to ``UID`` - :param args: arguments, parsed by `argparse` - :returns: `str` -- certificate file path, or `None` + :param args: arguments, parsed by argparse (Any) + :returns: `str` -- certificate file path, or `None` (Any). """ - return _tester(os.path.isfile, args and args.cacert, os.environ.get('X509_USER_PROXY'), cacert_default_location()) -def https_setup(args=None, version=None): +def https_setup(args: Any = None, version: str = ""): """ - Sets up the context for future HTTPS requests: + Set up the context for HTTPS requests. 1. Selects the certificate paths 2. Sets up :mailheader:`User-Agent` 3. Tries to create `ssl.SSLContext` for future use (falls back to :command:`curl` if fails) - :param args: arguments, parsed by `argparse` - :param str version: pilot version string (for :mailheader:`User-Agent`) + :param args: arguments, parsed by argparse (Any) + :param version: pilot version string (for :mailheader:`User-Agent`) (str). """ - version = version or get_pilot_version() - _ctx.user_agent = 'pilot/%s (Python %s; %s %s)' % (version, - sys.version.split()[0], - platform.system(), - platform.machine()) + _ctx.user_agent = f'pilot/{version} (Python {sys.version.split()[0]}; {platform.system()} {platform.machine()})' _ctx.capath = capath(args) _ctx.cacert = cacert(args) @@ -173,9 +170,10 @@ def https_setup(args=None, version=None): logger.warning(f'Failed to initialize SSL context .. 
skipped, error: {exc}') -def request(url, data=None, plain=False, secure=True, ipv='IPv6'): +def request(url: str, data: dict = {}, plain: bool = False, secure: bool = True, ipv: str = 'IPv6') -> Any: """ - This function sends a request using HTTPS. + Send a request using HTTPS. + Sends :mailheader:`User-Agent` and certificates previously being set up by `https_setup`. If `ssl.SSLContext` is available, uses `urllib2` as a request processor. Otherwise uses :command:`curl`. @@ -184,11 +182,6 @@ def request(url, data=None, plain=False, secure=True, ipv='IPv6'): Treats the request as JSON unless a parameter ``plain`` is `True`. If JSON is expected, sends ``Accept: application/json`` header. - :param string url: the URL of the resource. - :param dict data: data to send. - :param boolean plain: if true, treats the response as a plain text. - :param secure: Boolean (default: True, ie use certificates). - :param ipv: internet protocol version (string). Usage: .. code-block:: python @@ -197,12 +190,16 @@ def request(url, data=None, plain=False, secure=True, ipv='IPv6'): https_setup(args, PILOT_VERSION) # sets up ssl and other stuff response = request('https://some.url', {'some':'data'}) - Returns: + :param url: the URL of the resource (str) + :param data: data to send (dict) + :param plain: if true, treats the response as a plain text (bool) + :param secure: default: True, i.e. use certificates (bool) + :param ipv: internet protocol version (str). + :returns: - :keyword:`dict` -- if everything went OK - `str` -- if ``plain`` parameter is `True` - `None` -- if something went wrong """ - _ctx.ssl_context = None # certificates are not available on the grid, use curl # note that X509_USER_PROXY might change during running (in the case of proxy downloads), so @@ -270,10 +267,7 @@ def request(url, data=None, plain=False, secure=True, ipv='IPv6'): def update_ctx(): - """ - Update the ctx object in case X509_USER_PROXY has been updated. - """ - + """Update the ctx object in case X509_USER_PROXY has been updated.""" x509 = os.environ.get('X509_USER_PROXY', _ctx.cacert) if x509 != _ctx.cacert and os.path.exists(x509): _ctx.cacert = x509 @@ -282,16 +276,15 @@ def update_ctx(): _ctx.capath = certdir -def get_curl_command(plain, dat, ipv): +def get_curl_command(plain: bool, dat: str, ipv: str) -> (Any, str): """ Get the curl command. - :param plain: - :param dat: curl config option (string). - :param ipv: internet protocol version (string). - :return: curl command (string), sensitive string to be obscured before dumping to log (string). + :param plain: if true, treats the response as a plain text (bool) + :param dat: curl config option (str) + :param ipv: internet protocol version (str) + :return: curl command (str or None), sensitive string to be obscured before dumping to log (str). 
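# Hedged sketch of the refresh performed by update_ctx(): if X509_USER_PROXY
# points somewhere new (e.g. after a proxy download), switch the cached
# certificate and CA-path settings to the new locations.
import os

def refresh_cert_paths(cacert_path: str, capath_dir: str):
    x509 = os.environ.get('X509_USER_PROXY', cacert_path)
    if x509 != cacert_path and os.path.exists(x509):
        cacert_path = x509
    certdir = os.environ.get('X509_CERT_DIR', capath_dir)
    if certdir != capath_dir and os.path.exists(certdir):
        capath_dir = certdir
    return cacert_path, capath_dir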
""" - auth_token_content = '' auth_token = os.environ.get('OIDC_AUTH_TOKEN', os.environ.get('PANDA_AUTH_TOKEN', None)) # file name of the token auth_origin = os.environ.get('OIDC_AUTH_ORIGIN', os.environ.get('PANDA_AUTH_ORIGIN', None)) # origin of the token (panda_dev.pilot) @@ -329,21 +322,20 @@ def get_curl_command(plain, dat, ipv): f'--cert {pipes.quote(_ctx.cacert or "")} ' \ f'--cacert {pipes.quote(_ctx.cacert or "")} ' \ f'--key {pipes.quote(_ctx.cacert or "")} '\ - f'-H {pipes.quote("User-Agent: %s" % _ctx.user_agent)} ' \ + f'-H {pipes.quote(f"User-Agent: {_ctx.user_agent}")} ' \ f'-H {pipes.quote("Accept: application/json") if not plain else ""} {dat}' #logger.info('request: %s', req) return req, auth_token_content -def locate_token(auth_token): +def locate_token(auth_token: str) -> str: """ Locate the token file. - :param auth_token: file name of token (string). - :return: path to token (string). + :param auth_token: file name of token (str) + :return: path to token (str). """ - _primary = os.path.dirname(os.environ.get('OIDC_AUTH_DIR', os.environ.get('PANDA_AUTH_DIR', os.environ.get('X509_USER_PROXY', '')))) paths = [os.path.join(_primary, auth_token), os.path.join(os.environ.get('PILOT_SOURCE_DIR', ''), auth_token), @@ -361,39 +353,35 @@ def locate_token(auth_token): return path -def get_vars(url, data): +def get_vars(url: str, data: dict) -> (str, str): """ Get the filename and strdata for the curl config file. - :param url: URL (string). - :param data: data to be written to file (dictionary). - :return: filename (string), strdata (string). + :param url: URL (str) + :param data: data to be written to file (dict) + :return: filename (str), strdata (str). """ - strdata = "" for key in data: - strdata += 'data="%s"\n' % urllib.parse.urlencode({key: data[key]}) - jobid = '' - if 'jobId' in list(data.keys()): - jobid = '_%s' % data['jobId'] + strdata += f'data="{urllib.parse.urlencode({key: data[key]})}"\n' + jobid = f"_{data['jobId']}" if 'jobId' in list(data.keys()) else "" # write data to temporary config file - filename = '%s/curl_%s%s.config' % (os.getenv('PILOT_HOME'), os.path.basename(url), jobid) + filename = f"{os.getenv('PILOT_HOME')}/curl_{os.path.basename(url)}{jobid}.config" return filename, strdata -def get_curl_config_option(writestatus, url, data, filename): +def get_curl_config_option(writestatus: bool, url: str, data: dict, filename: str) -> str: """ Get the curl config option. - :param writestatus: status of write_file call (Boolean). - :param url: URL (string). - :param data: data structure (dictionary). - :param filename: file name of config file (string). - :return: config option (string). + :param writestatus: status of write_file call (bool) + :param url: URL (str) + :param data: data structure (dict) + :param filename: file name of config file (str) + :return: config option (str). """ - if not writestatus: logger.warning('failed to create curl config file (will attempt to urlencode data directly)') dat = pipes.quote(url + '?' + urllib.parse.urlencode(data) if data else '') @@ -403,15 +391,16 @@ def get_curl_config_option(writestatus, url, data, filename): return dat -def execute_urllib(url, data, plain, secure): +def execute_urllib(url: str, data: dict, plain: bool, secure: bool) -> Any: """ Execute the request using urllib. - :param url: URL (string). - :param data: data structure - :return: urllib request structure. 
+ :param url: URL (str) + :param data: data structure (dict) + :param plain: if true, treats the response as a plain text (bool) + :param secure: default: True, i.e. use certificates (bool) + :return: urllib request structure (Any). """ - req = urllib.request.Request(url, urllib.parse.urlencode(data)) if not plain: req.add_header('Accept', 'application/json') @@ -421,15 +410,14 @@ def execute_urllib(url, data, plain, secure): return req -def get_urlopen_output(req, context): +def get_urlopen_output(req: Any, context: Any) -> (int, str): """ Get the output from the urlopen request. - :param req: - :param context: - :return: ec (int), output (string). + :param req: urllib request structure (Any) + :param context: ssl context (Any) + :return: exit code (int), output (str). """ - exitcode = -1 output = "" try: @@ -444,19 +432,18 @@ def get_urlopen_output(req, context): return exitcode, output -def send_update(update_function, data, url, port, job=None, ipv='IPv6'): +def send_update(update_function: str, data: dict, url: str, port: str, job: Any = None, ipv: str = 'IPv6') -> dict: """ Send the update to the server using the given function and data. - :param update_function: 'updateJob' or 'updateWorkerPilotStatus' (string). - :param data: data (dictionary). - :param url: server url (string). - :param port: server port (string). - :param job: job object. - :param ipv: internet protocol version, IPv4 or IPv6 (string). - :return: server response (dictionary). + :param update_function: 'updateJob' or 'updateWorkerPilotStatus' (str) + :param data: data (dict) + :param url: server url (str) + :param port: server port (str) + :param job: job object (Any) + :param ipv: internet protocol version, IPv4 or IPv6 (str) + :return: server response (dict). """ - time_before = int(time()) max_attempts = 10 attempt = 0 @@ -519,17 +506,17 @@ def send_update(update_function, data, url, port, job=None, ipv='IPv6'): return res -def get_panda_server(url, port, update_server=True): +def get_panda_server(url: str, port: str, update_server: bool = True) -> str: """ Get the URL for the PanDA server. + The URL will be randomized if the server can be contacted (otherwise fixed). - :param url: URL string, if set in pilot option (port not included). - :param port: port number, if set in pilot option (int). - :param update_server: True if the server can be contacted (Boolean). - :return: full URL (either from pilot options or from config file). + :param url: URL string, if set in pilot option (port not included) (str) + :param port: port number, if set in pilot option (str) + :param update_server: True if the server can be contacted, False otherwise (bool) + :return: full URL (either from pilot options or from config file) (str). """ - if url != '': parsedurl = url.split('://') scheme = None @@ -568,15 +555,13 @@ def get_panda_server(url, port, update_server=True): return pandaserver -def add_error_codes(data, job): +def add_error_codes(data: dict, job: Any): """ Add error codes to data structure. - :param data: data dictionary. - :param job: job object. - :return: + :param data: data dictionary (dict) + :param job: job object (Any). """ - # error codes pilot_error_code = job.piloterrorcode pilot_error_codes = job.piloterrorcodes @@ -599,19 +584,18 @@ def add_error_codes(data, job): data['exeErrorDiag'] = job.exeerrordiag -def get_server_command(url, port, cmd='getJob'): +def get_server_command(url: str, port: str, cmd: str = 'getJob') -> str: """ Prepare the getJob server command. 
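# Sketch of the urlopen call wrapped by get_urlopen_output(): read the
# response body and map any HTTP/URL/socket error to a non-zero exit code
# (URLError and HTTPError are OSError subclasses).
import ssl
import urllib.request

def fetch(req: urllib.request.Request, context: ssl.SSLContext):
    try:
        with urllib.request.urlopen(req, context=context, timeout=120) as response:
            return 0, response.read().decode('utf-8')
    except OSError as exc:
        return -1, str(exc)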
- :param url: PanDA server URL (string) - :param port: PanDA server port - :return: full server command (URL string) + :param url: PanDA server URL (str) + :param port: PanDA server port (str) + :return: full server command (str). """ - if url != "": port_pattern = '.:([0-9]+)' if not findall(port_pattern, url): - url = url + ':%s' % port + url = url + f':{port}' else: logger.debug(f'URL already contains port: {url}') else: diff --git a/pilot/util/jobmetrics.py b/pilot/util/jobmetrics.py index 134c66ea..2eda49bf 100644 --- a/pilot/util/jobmetrics.py +++ b/pilot/util/jobmetrics.py @@ -19,22 +19,25 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +"""Functions for building job metrics.""" + from os import environ +from typing import Any import logging logger = logging.getLogger(__name__) -def get_job_metrics_entry(name, value): +def get_job_metrics_entry(name: str, value: str) -> str: """ Get a formatted job metrics entry. + Return a job metrics substring with the format 'name=value ' (return empty entry if value is not set). - :param name: job metrics parameter name (string). - :param value: job metrics parameter value (string). - :return: job metrics entry (string). + :param name: job metrics parameter name (str) + :param value: job metrics parameter value (str) + :return: job metrics entry (str). """ - job_metrics_entry = "" if value != "": job_metrics_entry += f"{name}={value} " @@ -42,9 +45,10 @@ def get_job_metrics_entry(name, value): return job_metrics_entry -def get_job_metrics(job, extra={}): +def get_job_metrics(job: Any, extra: dict = {}) -> str: """ Return a properly formatted job metrics string. + Job metrics are highly user specific, so this function merely calls a corresponding get_job_metrics() in the user code. The format of the job metrics string is defined by the server. It will be reported to the server during updateJob. @@ -54,11 +58,10 @@ def get_job_metrics(job, extra={}): Format: nEvents= nEventsW= vmPeakMax= vmPeakMean= RSSMean= hs06= shutdownTime= cpuFactor= cpuLimit= diskLimit= jobStart= memLimit= runLimit= - :param job: job object + :param job: job object (Any) :param extra: any extra information to be added (dict) - :return: job metrics (string). + :return: job metrics (str). """ - user = environ.get('PILOT_USER', 'generic').lower() # TODO: replace with singleton try: job_metrics_module = __import__(f'pilot.user.{user}.jobmetrics', globals(), locals(), [user], 0) diff --git a/pilot/util/loggingsupport.py b/pilot/util/loggingsupport.py index 8e9e8352..53bd2a12 100644 --- a/pilot/util/loggingsupport.py +++ b/pilot/util/loggingsupport.py @@ -21,6 +21,8 @@ # This module contains functions related to logging. +"""Functions for logging.""" + import logging import sys from time import gmtime @@ -31,7 +33,7 @@ def establish_logging(debug: bool = True, nopilotlog: bool = False, filename: str = config.Pilot.pilotlog, loglevel: int = 0, redirectstdout: str = ""): """ - Setup and establish logging. + Set up and establish logging. Option loglevel can be used to decide which (predetermined) logging format to use. Example: @@ -42,13 +44,12 @@ def establish_logging(debug: bool = True, nopilotlog: bool = False, filename: st will be too much stdout. If to a file, it is recommended to then also set an appropriate max pilot lifetime to prevent it from creating too much stdout. - :param debug: debug mode (Boolean), - :param nopilotlog: True when pilot log is not known (Boolean). - :param filename: name of log file (string). 
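# Worked example of the 'name=value ' job metrics format described above;
# the keys and values are illustrative.
def metrics_entry(name: str, value) -> str:
    return f"{name}={value} " if value != "" else ""

job_metrics = ""
job_metrics += metrics_entry("nEvents", 1000)
job_metrics += metrics_entry("vmPeakMax", 2048576)
job_metrics = job_metrics.rstrip()  # -> 'nEvents=1000 vmPeakMax=2048576'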
- :param loglevel: selector for logging level (int). - :param redirectstdout: file name, or /dev/null (string). + :param debug: debug mode (bool) + :param nopilotlog: True when pilot log is not known (bool) + :param filename: name of log file (str) + :param loglevel: selector for logging level (int) + :param redirectstdout: file name, or /dev/null (str). """ - if redirectstdout: with open(redirectstdout, 'w', encoding="utf-8") as sys.stdout: pass # use with open to prevent pylint complaint @@ -84,11 +85,11 @@ def establish_logging(debug: bool = True, nopilotlog: bool = False, filename: st def flush_handler(name: str = ""): """ Flush the stdout buffer for the given handler. + Useful e.g. in case of time-out exceptions. - :param name: name of handler (string) + :param name: name of handler (str). """ - if not name: return for handler in logging.getLogger().handlers: diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index 22c42690..25fa7934 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -17,38 +17,62 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 + +"""Functions for identifying looping payloads.""" + +import os +import time +import logging +from typing import Any from pilot.common.errorcodes import ErrorCodes -from pilot.util.auxiliary import whoami, set_pilot_state, cut_output, locate_core_file +from pilot.util.auxiliary import ( + whoami, + set_pilot_state, + cut_output, + locate_core_file +) from pilot.util.config import config from pilot.util.container import execute #, execute_command -from pilot.util.filehandling import remove_files, find_latest_modified_file, verify_file_list, copy, list_mod_files +from pilot.util.filehandling import ( + remove_files, + find_latest_modified_file, + verify_file_list, + copy, + list_mod_files +) +from pilot.util.heartbeat import time_since_suspension +from pilot.util.math import convert_seconds_to_hours_minutes_seconds from pilot.util.parameters import convert_to_int -from pilot.util.processes import kill_process, find_zombies, handle_zombies, reap_zombies -from pilot.util.psutils import get_child_processes, get_subprocesses +from pilot.util.processes import ( + kill_process, + find_zombies, + handle_zombies, + reap_zombies +) +from pilot.util.psutils import ( + get_child_processes, + get_subprocesses +) from pilot.util.timing import time_stamp -import os -import time -import logging logger = logging.getLogger(__name__) - errors = ErrorCodes() -def looping_job(job, montime): +def looping_job(job: Any, montime: Any) -> (int, str): """ - Looping job detection algorithm. - Identify hanging tasks/processes. Did the stage-in/out finish within allowed time limit, or did the payload update + Identify looping payload, processes and tasks. + + Did the stage-in/out finish within allowed time limit, or did the payload update any files recently? The files must have been touched within the given looping_limit, or the process will be terminated. - :param job: job object. - :param montime: `MonitoringTime` object. - :return: exit code (int), diagnostics (string). + :param job: job object (Any) + :param montime: `MonitoringTime` object (Any) + :return: exit code (int), diagnostics (str). 
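# Minimal sketch of the per-handler flush done by flush_handler(): locate a
# named handler on the root logger and flush its stream (useful before a
# time-out kill so buffered log lines are not lost).
import logging

def flush_named_handler(name: str):
    for handler in logging.getLogger().handlers:
        if handler.get_name() == name:
            handler.flush()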
""" - exit_code = 0 diagnostics = "" @@ -66,13 +90,17 @@ def looping_job(job, montime): # check, the returned value will be the same as the previous time time_last_touched, recent_files = get_time_for_last_touch(job, montime, looping_limit) + # correct for job suspension if detected + time_since_job_suspension = time_since_suspension() + if time_since_job_suspension: + logger.info(f'looping job killer adjusting for job suspension: {time_since_job_suspension} s (adding to time_last_touched))') + time_last_touched += time_since_job_suspension + # the payload process is considered to be looping if it's files have not been touched within looping_limit time if time_last_touched: currenttime = int(time.time()) - logger.info(f'current time: {currenttime}') - logger.info(f'last time files were touched: {time_last_touched}') - logger.info(f'looping limit: {looping_limit} s') - + hours, minutes, seconds = convert_seconds_to_hours_minutes_seconds(currenttime - time_last_touched) + logger.info(f'files were last touched {hours}h {minutes}m {seconds}s ago (current time: {currenttime})') if currenttime - time_last_touched > looping_limit: try: # which were the considered files? @@ -92,13 +120,12 @@ def looping_job(job, montime): return exit_code, diagnostics -def create_core_dump(job): +def create_core_dump(job: Any): """ - Create core dump and copy it to work directory + Create core dump and copy it to work directory. - :param job: job object. + :param job: job object (Any). """ - if not job.pid or not job.workdir: logger.warning('cannot create core file since pid or workdir is unknown') return @@ -137,24 +164,24 @@ def create_core_dump(job): logger.warning(f'exception caught: {exp}') -def get_time_for_last_touch(job, montime, looping_limit): +def get_time_for_last_touch(job: Any, montime: Any, looping_limit: int) -> (int, list): """ Return the time when the files in the workdir were last touched. - in case no file was touched since the last check, the returned value will be the same as the previous time. - :param job: job object. - :param montime: `MonitoringTime` object. - :param looping_limit: looping limit in seconds. + In case no file was touched since the last check, the returned value will be the same as the previous time. + + :param job: job object (Any) + :param montime: `MonitoringTime` object (Any) + :param looping_limit: looping limit in seconds (int) :return: time in seconds since epoch (int) (or None in case of failure), recent files (list). """ - updated_files = [] pilot_user = os.environ.get('PILOT_USER', 'generic').lower() loopingjob_definitions = __import__(f'pilot.user.{pilot_user}.loopingjob_definitions', globals(), locals(), [pilot_user], 0) # locate all files that were modified the last N minutes - cmd = "find %s -mmin -%d" % (job.workdir, int(looping_limit / 60)) + cmd = f"find {job.workdir} -mmin -{int(looping_limit / 60)}" exit_code, stdout, stderr = execute(cmd) if exit_code == 0: if stdout != "": @@ -190,14 +217,12 @@ def get_time_for_last_touch(job, montime, looping_limit): return montime.ct_looping_last_touched, updated_files -def kill_looping_job(job): +def kill_looping_job(job: Any): """ Kill the looping process. - :param job: job object. - :return: (updated job object.) + :param job: job object (Any). 
""" - # the child process is looping, kill it diagnostics = f"pilot has decided to kill looping job {job.jobid} at {time_stamp()}" logger.fatal(diagnostics) @@ -252,13 +277,12 @@ def kill_looping_job(job): kill_process(pid) -def get_looping_job_limit(): +def get_looping_job_limit() -> int: """ Get the time limit for looping job detection. :return: looping job time limit in seconds (int). """ - looping_limit = convert_to_int(config.Pilot.looping_limit_default, default=2 * 3600) looping_limit_min_default = convert_to_int(config.Pilot.looping_limit_min_default, default=2 * 3600) looping_limit = max(looping_limit, looping_limit_min_default) diff --git a/pilot/util/math.py b/pilot/util/math.py index 2cc56e23..f8e8bed2 100644 --- a/pilot/util/math.py +++ b/pilot/util/math.py @@ -19,10 +19,13 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2018-23 -from pilot.common.exception import NotDefined +"""Common math functions.""" from decimal import Decimal from re import split, sub +from typing import Any + +from pilot.common.exception import NotDefined SYMBOLS = { 'customary': ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'), @@ -33,14 +36,13 @@ } -def mean(data): +def mean(data: list) -> float: """ Return the sample arithmetic mean of data. - :param data: list of floats or ints. + :param data: list of floats or ints (list) :return: mean value (float). """ - n = len(data) if n < 1: raise ValueError('mean requires at least one data point') @@ -48,95 +50,94 @@ def mean(data): return sum(data) / float(n) -def sum_square_dev(data): +def sum_square_dev(data: list) -> float: """ Return sum of square deviations of sequence data. + Sum (x - x_mean)**2 :param data: list of floats or ints. :return: sum of squares (float). """ - c = mean(data) return sum((x - c) ** 2 for x in data) -def sum_dev(x, y): +def sum_dev(x: list, y: list) -> float: """ Return sum of deviations of sequence data. + Sum (x - x_mean)**(y - y_mean) - :param x: list of ints or floats. - :param y: list of ints or floats. + :param x: list of ints or floats (list) + :param y: list of ints or floats (list) :return: sum of deviations (float). """ + return sum((_x - mean(x)) * (_y - mean(y)) for _x, _y in zip(x, y)) - c1 = mean(x) - c2 = mean(y) - - return sum((_x - c1) * (_y - c2) for _x, _y in zip(x, y)) - -def chi2(observed, expected): +def chi2(observed: list, expected: list) -> float: """ Return the chi2 sum of the provided observed and expected values. - :param observed: list of floats. - :param expected: list of floats. + :param observed: list of floats (list) + :param expected: list of floats (list) :return: chi2 (float). """ - if 0 in expected: return 0.0 return sum((_o - _e) ** 2 / _e ** 2 for _o, _e in zip(observed, expected)) -def float_to_rounded_string(num, precision=3): +def float_to_rounded_string(num: float, precision: int = 3) -> str: """ Convert float to a string with a desired number of digits (the precision). + E.g. num=3.1415, precision=2 -> '3.14'. round_to_n = lambda x, n: x if x == 0 else round(x, -int(math.floor(math.log10(abs(x)))) + (n - 1)) round_to_n(x=0.123,n=2) 0.12 - :param num: number to be converted (float). + + :param num: number to be converted (float) :param precision: number of desired digits (int) - :raises NotDefined: for undefined precisions and float conversions to Decimal. - :return: rounded string. + :raises NotDefined: for undefined precisions and float conversions to Decimal + :return: rounded string (str). 
""" - try: _precision = Decimal(10) ** -precision except Exception as exc: - raise NotDefined(f'failed to define precision={precision}: {exc}') + raise NotDefined(f'failed to define precision={precision}: {exc}') from exc try: s = Decimal(str(num)).quantize(_precision) except Exception as exc: - raise NotDefined(f'failed to convert {num} to Decimal: {exc}') + raise NotDefined(f'failed to convert {num} to Decimal: {exc}') from exc return str(s) -def tryint(x): +def tryint(x: Any) -> Any: """ + Try to convert given number to integer. + Used by numbered string comparison (to protect against unexpected letters in version number). - :param x: possible int. - :return: converted int or original value in case of ValueError. + :param x: possible int (Any) + :return: converted int or original value in case of ValueError (Any). """ - try: return int(x) except ValueError: return x -def split_version(s): +def split_version(version: str) -> tuple: """ Split version string into parts and convert the parts into integers when possible. + Any encountered strings are left as they are. The function is used with release strings. split_version("1.2.3") = (1,2,3) @@ -147,78 +148,78 @@ def split_version(s): > sorted(names, key=splittedname) ['4.3', '4.10', 'PT2.9', 'PT2.19', 'YT4.2', 'YT4.11'] - :param s: release string. - :return: converted release tuple. + :param version: release string (str) + :return: converted release tuple (tuple). """ - - return tuple(tryint(x) for x in split('([^.]+)', s)) + return tuple(tryint(x) for x in split('([^.]+)', version)) -def is_greater_or_equal(a, b): +def is_greater_or_equal(num_a: str, num_b: str) -> bool: """ - Is the numbered string a >= b? + Check if the numbered string num_a >= num_b. + "1.2.3" > "1.2" -- more digits "1.2.3" > "1.2.2" -- rank based comparison "1.3.2" > "1.2.3" -- rank based comparison "1.2.N" > "1.2.2" -- nightlies checker, always greater - :param a: numbered string. - :param b: numbered string. - :return: boolean. + :param num_a: numbered string (str) + :param num_b: numbered string (str) + :return: True if num_a >= num_b, False otherwise (bool). """ - - return split_version(a) >= split_version(b) + return split_version(num_a) >= split_version(num_b) -def add_lists(list1, list2): +def add_lists(list1: list, list2: list) -> list: """ Add list1 and list2 and remove any duplicates. + Example: list1=[1,2,3,4] list2=[3,4,5,6] add_lists(list1, list2) = [1, 2, 3, 4, 5, 6] - :param list1: input list 1 - :param list2: input list 2 - :return: added lists with removed duplicates + :param list1: input list 1 (list) + :param list2: input list 2 (list) + :return: added lists with removed duplicates (list). """ return list1 + list(set(list2) - set(list1)) -def convert_mb_to_b(size): +def convert_mb_to_b(size: Any) -> int: """ Convert value from MB to B for the given size variable. + If the size is a float, the function will convert it to int. - :param size: size in MB (float or int). - :return: size in B (int). + :param size: size in MB (float or int) (Any) :raises: ValueError for conversion error. + :return: size in B (int). """ - try: size = int(size) except Exception as exc: - raise ValueError(f'cannot convert {size} to int: {exc}') + raise ValueError(f'cannot convert {size} to int: {exc}') from exc return size * 1024 ** 2 -def diff_lists(list_a, list_b): +def diff_lists(list_a: list, list_b: list) -> list: """ Return the difference between list_a and list_b. - :param list_a: input list a. - :param list_b: input list b. 
+    :param list_a: input list a (list)
+    :param list_b: input list b (list)
     :return: difference (list).
     """
-
     return list(set(list_a) - set(list_b))
 
 
-def bytes2human(n, _format='%(value).1f %(symbol)s', symbols='customary'):
+def bytes2human(num: Any, symbols: str = 'customary') -> str:
     """
-    Convert n bytes into a human readable string based on format.
-    symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
+    Convert `num` bytes into a human-readable string based on format.
+
+    Symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
     see: http://goo.gl/kTQMs
 
     >>> bytes2human(0)
@@ -251,26 +252,36 @@ def bytes2human(n, _format='%(value).1f %(symbol)s', symbols='customary'):
     >>> # precision can be adjusted by playing with %f operator
     >>> bytes2human(10000, _format="%(value).5f %(symbol)s")
     '9.76562 K'
+
+    :param num: input number (Any)
+    :param symbols: symbol string (str)
+    :return: human-readable string (str).
     """
-    n = int(n)
-    if n < 0:
+    _format = '%(value).1f %(symbol)s'
+
+    try:
+        number = int(num)
+    except ValueError as exc:
+        raise exc
+    if number < 0:
         raise ValueError("n < 0")
     symbols = SYMBOLS[symbols]
     prefix = {}
     for i, s in enumerate(symbols[1:]):
         prefix[s] = 1 << (i + 1) * 10
     for symbol in reversed(symbols[1:]):
-        if n >= prefix[symbol]:
-            value = float(n) / prefix[symbol]
+        if number >= prefix[symbol]:
+            value = float(number) / prefix[symbol]
             return _format % locals()
-    return _format % dict(symbol=symbols[0], value=n)
+
+    return _format % {"symbol": symbols[0], "value": number}
 
 
-def human2bytes(s, divider=None):
+def human2bytes(snumber: str, divider: Any = None) -> int:
     """
-    Attempts to guess the string format based on default symbols
-    set and return the corresponding bytes as an integer.
-    When unable to recognize the format ValueError is raised.
+    Guess the string format based on default symbols set and return the corresponding bytes as an integer.
+
+    When unable to recognize the format, a ValueError is raised.
 
     If no digit passed, only a letter, it is interpreted as a one of a kind. Eg "KB" = "1 KB".
     If no letter passed, it is assumed to be in bytes. Eg "512" = "512 B"
@@ -310,22 +321,32 @@ def human2bytes(s, divider=None):
     2048
     >>> human2bytes('G', '2M')
     512
+
+    :param snumber: number string (str)
+    :param divider: divider (Any)
+    :return: converted integer (int)
+    :raises ValueError: for conversion error.
""" - init = s + init = snumber num = "" - while s and s[0:1].isdigit() or s[0:1] == '.': - num += s[0] - s = s[1:] + while snumber and snumber[0:1].isdigit() or snumber[0:1] == '.': + num += snumber[0] + snumber = snumber[1:] if len(num) == 0: num = "1" - num = float(num) - letter = s.strip() + + try: + number = float(num) + except ValueError as exc: + raise exc + + letter = snumber.strip() letter = sub(r'(?i)(?<=.)(bi?|bytes?)$', "", letter) if len(letter) == 0: letter = "B" - for name, sset in list(SYMBOLS.items()): + for _, sset in list(SYMBOLS.items()): if letter in sset: break else: @@ -334,10 +355,31 @@ def human2bytes(s, divider=None): sset = SYMBOLS['customary'] letter = letter.upper() else: - raise ValueError("can't interpret %r" % init) + raise ValueError(f"can't interpret {init!r}") # = repr(init) prefix = {sset[0]: 1} - for i, s in enumerate(sset[1:]): - prefix[s] = 1 << (i + 1) * 10 + for inum, snum in enumerate(sset[1:]): + prefix[snum] = 1 << (inum + 1) * 10 div = 1 if divider is None else human2bytes(divider) - return int(num * prefix[letter] / div) + + try: + ret = int(number * prefix[letter] / div) + except ValueError as exc: + raise exc + + return ret + + +def convert_seconds_to_hours_minutes_seconds(seconds: int) -> tuple: + """ + Convert seconds to hours, minutes, and remaining seconds. + + :param seconds: seconds (int) + :return: hours, minutes, remaining seconds (tuple). + """ + hours = seconds // 3600 + remaining_seconds = seconds % 3600 + minutes = remaining_seconds // 60 + remaining_seconds %= 60 + + return hours, minutes, remaining_seconds diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 5adced08..03b39ce4 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -118,6 +118,8 @@ def job_monitor_tasks(job, mt, args): # noqa: C901 job.cpuconsumptiontime = int(round(cpuconsumptiontime)) job.cpuconversionfactor = 1.0 logger.info(f'(instant) CPU consumption time for pid={job.pid}: {cpuconsumptiontime} (rounded to {job.cpuconsumptiontime})') + elif _cpuconsumptiontime == -1: + logger.warning('could not get CPU consumption time') else: logger.warning(f'process {job.pid} is no longer using CPU - aborting') return 0, "" diff --git a/pilot/util/networking.py b/pilot/util/networking.py new file mode 100644 index 00000000..1b540cb5 --- /dev/null +++ b/pilot/util/networking.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+#
+# Authors:
+# - Paul Nilsson, paul.nilsson@cern.ch, 2023
+
+"""Functions related to networking."""
+
+import ipaddress
+import logging
+import re
+
+from pilot.util.container import execute
+
+logger = logging.getLogger(__name__)
+
+
+def dump_ipv6_info() -> None:
+    """Dump the IPv6 info to the log."""
+    _, stdout, stderr = execute('ifconfig', timeout=10)
+    if stdout:
+        ipv6 = extract_ipv6(stdout)
+        if ipv6:
+            logger.info(f'IPv6 addresses: {ipv6}')
+        else:
+            logger.warning('no IPv6 addresses found - this WN does not support IPv6')
+    else:
+        logger.warning(f'failed to run ifconfig: {stderr}')
+
+
+def extract_ipv6(ifconfig: str) -> list:
+    """
+    Extract IPv6 addresses from the ifconfig output.
+
+    :param ifconfig: ifconfig output (str)
+    :return: list of valid IPv6 addresses (list).
+    """
+    # Regular expression pattern to match MAC addresses
+    mac_pattern = r'([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})'
+
+    # Replace MAC addresses with placeholders so they are not mistaken for IPv6 addresses
+    placeholder = '__MAC_ADDRESS__'
+    text_without_mac = re.sub(mac_pattern, placeholder, ifconfig)
+
+    # Regular expression pattern to match potential IPv6 addresses
+    ipv6_pattern = r'\b[0-9a-fA-F:]+\b'
+
+    # Extract potential addresses from the text without MAC addresses
+    potential_addresses = re.findall(ipv6_pattern, text_without_mac)
+
+    # Keep only candidates that contain a colon
+    valid_ipv6_addresses = [addr for addr in potential_addresses if ':' in addr]
+
+    # Validate the candidates, skipping any that do not parse as IPv6
+    ipv6_addresses = []
+    for addr in valid_ipv6_addresses:
+        try:
+            ipv6_addresses.append(str(ipaddress.IPv6Address(addr)))
+        except ValueError:
+            continue
+
+    return ipv6_addresses
diff --git a/pilot/util/processes.py b/pilot/util/processes.py
index 3374679c..7c382392 100644
--- a/pilot/util/processes.py
+++ b/pilot/util/processes.py
@@ -35,77 +35,44 @@
 logger = logging.getLogger(__name__)
 
 
-def find_processes_in_group(cpids, pid, ps_cache):
+def find_processes_in_group(cpids: list, pid: int, ps_cache: str = ""):
     """
     Find all processes that belong to the same group using the given ps command output.
+
     Recursively search for the children processes belonging to pid and return their pid's.
     pid is the parent pid and cpids is a list that has to be initialized before calling this function and it contains
     the pids of the children AND the parent.
     ps_cache is expected to be the output from the command "ps -eo pid,ppid -m".
+    The cpids input parameter list gets updated in the function.
+
     :param cpids: list of pid's for all child processes to the parent pid, as well as the parent pid itself (int).
     :param pid: parent process id (int).
     :param ps_cache: ps command output (string).
-    :return: (updated cpids input parameter list).
""" + if pid: + cpids.append(pid) + lines = grep_str([str(pid)], ps_cache) - if not pid: - return - - cpids.append(pid) - lines = grep_str([str(pid)], ps_cache) + if lines and lines != ['']: + for i in range(0, len(lines)): + try: + thispid = int(lines[i].split()[0]) + thisppid = int(lines[i].split()[1]) + except Exception as error: + logger.warning(f'exception caught: {error}') + if thisppid == pid: + find_processes_in_group(cpids, thispid, ps_cache) - if lines and lines != ['']: - for i in range(0, len(lines)): - try: - thispid = int(lines[i].split()[0]) - thisppid = int(lines[i].split()[1]) - except Exception as error: - logger.warning(f'exception caught: {error}') - if thisppid == pid: - find_processes_in_group(cpids, thispid, ps_cache) - -def find_processes_in_group_old(cpids, pid): +def is_zombie(pid: int): """ - Find all processes that belong to the same group. - Recursively search for the children processes belonging to pid and return their pid's. - pid is the parent pid and cpids is a list that has to be initialized before calling this function and it contains - the pids of the children AND the parent. + Check if the given process is a zombie process. - :param cpids: list of pid's for all child processes to the parent pid, as well as the parent pid itself (int). - :param pid: parent process id (int). - :return: (updated cpids input parameter list). - """ - - if not pid: - return - - cpids.append(pid) - - cmd = "ps -eo pid,ppid -m | grep %d" % pid - _, psout, _ = execute(cmd, mute=True) - - lines = psout.split("\n") - if lines != ['']: - for i in range(0, len(lines)): - try: - thispid = int(lines[i].split()[0]) - thisppid = int(lines[i].split()[1]) - except Exception as error: - logger.warning(f'exception caught: {error}') - if thisppid == pid: - find_processes_in_group(cpids, thispid) - - -def is_zombie(pid): - """ - Is the given process a zombie? - :param pid: process id (int). - :return: boolean. + :param pid: process id (int) + :return: True if process is defunct, False otherwise (bool). """ - status = False cmd = "ps aux | grep %d" % (pid) @@ -601,8 +568,12 @@ def get_current_cpu_consumption_time(pid): # get all the child processes children = [] - _, ps_cache, _ = execute("ps -eo pid,ppid -m", mute=True) - find_processes_in_group(children, pid, ps_cache) + _, ps_cache, _ = execute("ps -eo pid,ppid -m", mute=True, timeout=60) + if ps_cache: + find_processes_in_group(children, pid, ps_cache) + else: + logger.warning('failed to get ps_cache') + return -1 cpuconsumptiontime = 0 for _pid in children: diff --git a/pilot/util/psutils.py b/pilot/util/psutils.py index 8fbdcbb0..c8968f5d 100644 --- a/pilot/util/psutils.py +++ b/pilot/util/psutils.py @@ -19,7 +19,7 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2023 -from re import findall +import logging import os import subprocess try: @@ -29,18 +29,24 @@ _is_psutil_available = False else: _is_psutil_available = True +from re import findall # from pilot.common.exception import MiddlewareImportFailure -import logging logger = logging.getLogger(__name__) -def is_process_running_by_pid(pid): +def is_process_running_by_pid(pid: int) -> bool: + """ + Is the given process still running? + + :param pid: process id (int) + :return: True (process still running), False (process not running). + """ return os.path.exists(f"/proc/{pid}") -def is_process_running(pid): +def is_process_running(pid: int) -> bool: """ Is the given process still running? 
@@ -50,25 +56,24 @@ def is_process_running(pid): :return: True (process still running), False (process not running) :raises: MiddlewareImportFailure if psutil module is not available. """ - if not _is_psutil_available: is_running = is_process_running_by_pid(pid) logger.warning(f'using /proc/{pid} instead of psutil (is_running={is_running})') return is_running # raise MiddlewareImportFailure("required dependency could not be imported: psutil") - else: - return psutil.pid_exists(pid) + + return psutil.pid_exists(pid) -def get_pid(jobpid): +def get_pid(jobpid: int) -> int: """ Try to figure out the pid for the memory monitoring tool. + Attempt to use psutil, but use a fallback to ps-command based code if psutil is not available. :param jobpid: job.pid (int) :return: pid (int|None). """ - pid = None if _is_psutil_available: @@ -95,15 +100,14 @@ def get_pid(jobpid): return pid -def find_pid_by_command_and_ppid(command, payload_pid): +def find_pid_by_command_and_ppid(command: str, payload_pid: int) -> int: """ Find the process id corresponding to the given command, and ensure that it belongs to the given payload. - :param command: command (string) + :param command: command (str) :param payload_pid: payload process id (int) - :return: process id (int) or None + :return: process id (int) or None. """ - if not _is_psutil_available: logger.warning('find_pid_by_command_and_ppid(): psutil not available - aborting') return None @@ -117,19 +121,19 @@ def find_pid_by_command_and_ppid(command, payload_pid): logger.debug(f"command={command} is in {process.info['cmdline'][0]}") logger.debug(f"ok returning pid={process.info['pid']}") return process.info['pid'] - except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + except (psutil.AccessDenied, psutil.ZombieProcess): pass + return None -def get_parent_pid(pid): +def get_parent_pid(pid: int) -> int or None: """ Return the parent process id for the given pid. :param pid: process id (int) :return: parent process id (int or None). """ - try: process = psutil.Process(pid) parent_pid = process.ppid() @@ -138,23 +142,23 @@ def get_parent_pid(pid): return None -def get_child_processes(parent_pid): +def get_child_processes(parent_pid: int) -> list: """ Return a list of all child processes belonging to the same parent process id. - Using a fallback to /proc/{pid} in case psutil is not available. + + Uses a fallback to /proc/{pid} in case psutil is not available. :param parent_pid: parent process id (int) :return: child processes (list). """ - if not _is_psutil_available: logger.warning('get_child_processes(): psutil not available - using legacy code as a fallback') return get_child_processes_legacy(parent_pid) - else: - return get_all_descendant_processes(parent_pid) + + return get_all_descendant_processes(parent_pid) -def get_all_descendant_processes(parent_pid, top_pid=os.getpid()): +def get_all_descendant_processes(parent_pid: int, top_pid: int = os.getpid()) -> list: """ Recursively find child processes using the given parent pid as a starting point. @@ -162,8 +166,7 @@ def get_all_descendant_processes(parent_pid, top_pid=os.getpid()): :param top_pid: do not include os.getpid() in the list (int) :return: descendant process ids and cmdline (list). 
""" - - def find_descendant_processes(pid, top_pid): + def find_descendant_processes(pid: int, top_pid: int) -> list: try: descendants = [] for process in psutil.process_iter(attrs=['pid', 'ppid', 'cmdline']): @@ -175,16 +178,17 @@ def find_descendant_processes(pid, top_pid): descendants.append((child_pid, cmdline)) descendants.extend(find_descendant_processes(child_pid, top_pid)) return descendants - except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + except (psutil.AccessDenied, psutil.ZombieProcess): return [] - all_descendant_processes = find_descendant_processes(parent_pid, top_pid) + return all_descendant_processes -def get_child_processes_legacy(parent_pid): +def get_child_processes_legacy(parent_pid: int) -> list: """ Return a list of all child processes belonging to the same parent process id. + Note: this approach is not efficient if one is to find all child processes using the parent pid as a starting point. Better to use a recursive function using psutil. This method should be removed once psutil is available everywhere. @@ -192,7 +196,6 @@ def get_child_processes_legacy(parent_pid): :param parent_pid: parent process id (int) :return: child processes (list). """ - child_processes = [] # Iterate through all directories in /proc @@ -227,7 +230,7 @@ def get_child_processes_legacy(parent_pid): return child_processes -def get_subprocesses(pid, debug=False): +def get_subprocesses(pid: int, debug: bool = False) -> list: """ Return the subprocesses belonging to the given PID as a list. @@ -235,13 +238,29 @@ def get_subprocesses(pid, debug=False): :param debug: control debug mode (bool) :return: list of subprocess PIDs. """ - pids = get_child_processes(pid) if debug: # always dump for looping jobs e.g. logger.info(f'child processes for pid={pid}: {pids}') else: # otherwise, only in debug mode logger.debug(f'child processes for pid={pid}: {pids}') + return [pid[0] for pid in pids] #cmd = f'ps -opid --no-headers --ppid {pid}' #_, out, _ = execute(cmd) #return [int(line) for line in out.splitlines()] if out else [] + + +def get_command_by_pid(pid: int) -> str or None: + """ + Return the command corresponding to the given process id. + + :param pid: process id (int) + :return: command (str or None). 
+ """ + try: + process = psutil.Process(pid) + command = " ".join(process.cmdline()) + return command + except psutil.NoSuchProcess: + print(f"process with PID {pid} not found") + return None diff --git a/pilot/util/queuehandling.py b/pilot/util/queuehandling.py index 5a0c233e..8d7dbc80 100644 --- a/pilot/util/queuehandling.py +++ b/pilot/util/queuehandling.py @@ -71,7 +71,7 @@ def scan_for_jobs(queues): _queue = getattr(queues, queue) jobs = list(_queue.queue) if len(jobs) > 0: - logger.info(f'found {len(jobs)} job(s) in queue {queue} after {time.time() - _t0} s - will begin queue monitoring') + logger.debug(f'found {len(jobs)} job(s) in queue {queue} after {time.time() - _t0} s - will begin queue monitoring') found_job = True break if found_job: diff --git a/pilot/util/timer.py b/pilot/util/timer.py index e8e90ae2..22e28ee9 100644 --- a/pilot/util/timer.py +++ b/pilot/util/timer.py @@ -80,7 +80,10 @@ def run(self, func, args, kwargs, timeout=None): timeout = timeout if timeout is not None else self.timeout - thread.join(timeout) + try: + thread.join(timeout) + except Exception as exc: + print(f'exception caught while joining timer thread: {exc}') if thread.is_alive(): self.is_timeout = True diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index 07b29578..aeeba599 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -327,7 +327,7 @@ def lscpu(): cmd = 'lscpu' if not which(cmd): - logger.warning('command={cmd} does not exist - cannot check number of available cores') + logger.warning(f'command={cmd} does not exist - cannot check number of available cores') return 1, "" ec, stdout, _ = execute(cmd) diff --git a/pilot/workflow/generic.py b/pilot/workflow/generic.py index 23de5cf2..1f164145 100644 --- a/pilot/workflow/generic.py +++ b/pilot/workflow/generic.py @@ -122,7 +122,14 @@ def run(args): """ logger.info('setting up signal handling') - register_signals([signal.SIGINT, signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV, signal.SIGXCPU, signal.SIGUSR1, signal.SIGBUS], args) + register_signals([signal.SIGINT, + signal.SIGTERM, + signal.SIGQUIT, + signal.SIGSEGV, + signal.SIGXCPU, + signal.SIGUSR1, + signal.SIGBUS], + args) logger.info('setting up queues') queues = namedtuple('queues', ['jobs', 'payloads', 'data_in', 'data_out', 'current_data_in', @@ -193,32 +200,37 @@ def run(args): # the thread_count is the total number of threads, not just the ExcThreads above thread_count = threading.activeCount() abort = False - while threading.activeCount() > 1 or not abort: - # Note: this loop only includes at ExcThreads, not MainThread or Thread - # threading.activeCount() will also include MainThread and any daemon threads (will be ignored) - for thread in threads: - bucket = thread.get_bucket() - try: - exc = bucket.get(block=False) - except queue.Empty: - pass - else: - exc_type, exc_obj, exc_trace = exc - # deal with the exception - print(f'received exception from bucket queue in generic workflow: {exc_obj}', file=stderr) - - thread.join(0.1) - - # have all threads finished? 
- abort = threads_aborted(caller='run') - if abort: - logger.debug('will proceed to set job_aborted') - args.job_aborted.set() - sleep(5) # allow monitor thread to finish (should pick up job_aborted within 1 second) - logger.debug(f'all relevant threads have aborted (thread count={threading.activeCount()})') - break - - sleep(1) + try: + while threading.activeCount() > 1 or not abort: + # Note: this loop only includes at ExcThreads, not MainThread or Thread + # threading.activeCount() will also include MainThread and any daemon threads (will be ignored) + for thread in threads: + bucket = thread.get_bucket() + try: + exc = bucket.get(block=False) + except queue.Empty: + pass + else: + exc_type, exc_obj, exc_trace = exc + # deal with the exception + print(f'received exception from bucket queue in generic workflow: {exc_obj}', file=stderr) + + thread.join(0.1) + + # have all threads finished? + abort = threads_aborted(caller='run') + if abort: + logger.debug('will proceed to set job_aborted') + args.job_aborted.set() + sleep(5) # allow monitor thread to finish (should pick up job_aborted within 1 second) + logger.debug(f'all relevant threads have aborted (thread count={threading.activeCount()})') + break + + sleep(1) + except Exception as exc: + logger.warning(f"exception caught while handling threads: {exc}") + finally: + logger.info('all workflow threads have been joined') logger.info(f'end of generic workflow (traces error code: {traces.pilot["error_code"]})') diff --git a/pilot/workflow/stager.py b/pilot/workflow/stager.py index 3452c748..2281ce9b 100644 --- a/pilot/workflow/stager.py +++ b/pilot/workflow/stager.py @@ -146,27 +146,32 @@ def run(args): logger.info('waiting for interrupts') thread_count = threading.activeCount() - while threading.activeCount() > 1: - for thread in threads: - bucket = thread.get_bucket() - try: - exc = bucket.get(block=False) - except queue.Empty: - pass - else: - exc_type, exc_obj, exc_trace = exc - # deal with the exception - print('received exception from bucket queue in generic workflow: %s' % exc_obj, file=stderr) - - thread.join(0.1) - - abort = False - if thread_count != threading.activeCount(): - # has all threads finished? - #abort = threads_aborted(abort_at=1) - abort = threads_aborted(caller='run') - if abort: - break + try: + while threading.activeCount() > 1: + for thread in threads: + bucket = thread.get_bucket() + try: + exc = bucket.get(block=False) + except queue.Empty: + pass + else: + exc_type, exc_obj, exc_trace = exc + # deal with the exception + print('received exception from bucket queue in generic workflow: %s' % exc_obj, file=stderr) + + thread.join(0.1) + + abort = False + if thread_count != threading.activeCount(): + # has all threads finished? 
+ #abort = threads_aborted(abort_at=1) + abort = threads_aborted(caller='run') + if abort: + break + except Exception as exc: + logger.warning(f"exception caught while handling threads: {exc}") + finally: + logger.info('all stager threads have been joined') logger.info(f"end of stager workflow (traces error code: {traces.pilot['error_code']})") diff --git a/setup.py b/setup.py index d7687493..4807f379 100644 --- a/setup.py +++ b/setup.py @@ -26,26 +26,26 @@ from setuptools import setup, find_packages -sys.path.insert(0, '.') +sys.path.insert(0, ".") # get release version -with open('PILOTVERSION') as reader: +with open("PILOTVERSION") as reader: release_version = reader.read() setup( name="panda-pilot", version=release_version, - description='PanDA Pilot 3', - long_description='''This package contains the PanDA Pilot 3 source code''', - license='Apache License 2.0', - author='PanDA Team', - author_email='atlas-adc-panda@cern.ch', - url='https://github.com/PanDAWMS/pilot3/wiki', - python_requires='>=3.6', + description="PanDA Pilot 3", + long_description="""This package contains the PanDA Pilot 3 source code""", + license="Apache License 2.0", + author="PanDA Team", + author_email="atlas-adc-panda@cern.ch", + url="https://github.com/PanDAWMS/pilot3/wiki", + python_requires=">=3.6", packages=find_packages(), install_requires=[], data_files=[], - package_data={'': ['PILOTVERSION']}, + package_data={"": ["PILOTVERSION"]}, include_package_data=True, - scripts=[] + scripts=[], )
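
For reference, the snippet below is a minimal, self-contained sketch (not part of the patch) of how the new convert_seconds_to_hours_minutes_seconds() helper added to pilot/util/math.py produces the values used in the new looping-job log line; the helper body is duplicated here only so the example runs without the pilot package on the path.

# Standalone sketch of the convert_seconds_to_hours_minutes_seconds() helper
# introduced in pilot/util/math.py and its use by the looping-job check.

def convert_seconds_to_hours_minutes_seconds(seconds: int) -> tuple:
    """Convert seconds to hours, minutes and remaining seconds."""
    hours = seconds // 3600
    remaining_seconds = seconds % 3600
    minutes = remaining_seconds // 60
    remaining_seconds %= 60

    return hours, minutes, remaining_seconds


if __name__ == "__main__":
    age = 7325  # pretend the payload files were last touched 7325 s ago
    hours, minutes, seconds = convert_seconds_to_hours_minutes_seconds(age)
    print(f"files were last touched {hours}h {minutes}m {seconds}s ago")  # 2h 2m 5s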