diff --git a/PILOTVERSION b/PILOTVERSION index 9c0db0082..4b3244f51 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.7.9.1 \ No newline at end of file +3.8.1.66 \ No newline at end of file diff --git a/doc/components/info/index.rst b/doc/components/info/index.rst index ae616650f..e70573df8 100644 --- a/doc/components/info/index.rst +++ b/doc/components/info/index.rst @@ -7,7 +7,7 @@ http://www.apache.org/licenses/LICENSE-2.0 Authors: - - Paul Nilsson, paul.nilsson@cern.ch, 2018 + - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 info components =============== @@ -23,6 +23,5 @@ info components infoservice jobdata jobinfo - jobinfoservice queuedata storagedata diff --git a/doc/components/info/jobinfoservice.rst b/doc/components/info/jobinfoservice.rst deleted file mode 100644 index 615ac6b8d..000000000 --- a/doc/components/info/jobinfoservice.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. - Pilot 2 pilot.info.jobinfoservice doc file - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Paul Nilsson, paul.nilsson@cern.ch, 2018 - -jobinfoservice -============== - -.. automodule:: pilot.info.jobinfoservice - :members: - :private-members: - :special-members: - :undoc-members: diff --git a/doc/components/resource/index.rst b/doc/components/resource/index.rst index 01562015c..81f0dd3c1 100644 --- a/doc/components/resource/index.rst +++ b/doc/components/resource/index.rst @@ -7,7 +7,7 @@ http://www.apache.org/licenses/LICENSE-2.0 Authors: - - Paul Nilsson, paul.nilsson@cern.ch, 2018-2019 + - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 resource components =================== @@ -19,5 +19,4 @@ resource components bnl generic nersc - summit titan diff --git a/doc/components/resource/summit.rst b/doc/components/resource/summit.rst deleted file mode 100644 index 6274ccbd9..000000000 --- a/doc/components/resource/summit.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. - Pilot 2 pilot.resource.summit doc file - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Paul Nilsson, paul.nilsson@cern.ch, 2019 - -summit -====== - -.. automodule:: pilot.resource.summit - :members: - :private-members: - :special-members: - :undoc-members: diff --git a/pilot.py b/pilot.py index 3384c3fa1..bb1bd797f 100755 --- a/pilot.py +++ b/pilot.py @@ -17,9 +17,9 @@ # under the License. 
# # Authors: -# - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 +# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 """This is the entry point for the PanDA Pilot, executed with 'python3 pilot.py '.""" @@ -39,29 +39,30 @@ from pilot.common.exception import PilotException from pilot.info import infosys from pilot.util.auxiliary import ( + convert_signal_to_exit_code, pilot_version_banner, shell_exit_code, - convert_signal_to_exit_code ) from pilot.util.config import config from pilot.util.constants import ( get_pilot_version, - SUCCESS, - FAILURE, ERRNO_NOJOBS, - PILOT_START_TIME, + FAILURE, PILOT_END_TIME, - SERVER_UPDATE_NOT_DONE, PILOT_MULTIJOB_START_TIME, + PILOT_START_TIME, + SERVER_UPDATE_NOT_DONE, + SUCCESS, ) from pilot.util.cvmfs import ( cvmfs_diagnostics, + get_last_update, is_cvmfs_available, - get_last_update ) from pilot.util.filehandling import ( get_pilot_work_dir, mkdirs, + store_base_urls ) from pilot.util.harvester import ( is_harvester_mode, @@ -72,6 +73,7 @@ get_panda_server, https_setup, send_update, + update_local_oidc_token_info ) from pilot.util.loggingsupport import establish_logging from pilot.util.networking import dump_ipv6_info @@ -116,8 +118,11 @@ def main() -> int: https_setup(args, get_pilot_version()) args.amq = None + # update the OIDC token if necessary + update_local_oidc_token_info(args.url, args.port) + # let the server know that the worker has started - if args.update_server: + if args.update_server and args.workerpilotstatusupdate: send_worker_status( "started", args.queue, args.url, args.port, logger, "IPv6" ) # note: assuming IPv6, fallback in place @@ -160,6 +165,9 @@ def main() -> int: ) logger.debug(f'PILOT_RUCIO_SITENAME={os.environ.get("PILOT_RUCIO_SITENAME")}') + #os.environ['RUCIO_ACCOUNT'] = 'atlpilo1' + #logger.warning(f"enforcing RUCIO_ACCOUNT={os.environ.get('RUCIO_ACCOUNT')}") + # store the site name as set with a pilot option environ[ "PILOT_SITENAME" @@ -171,6 +179,8 @@ def main() -> int: f"pilot.workflow.{args.workflow}", globals(), locals(), [args.workflow], 0 ) + # check if real-time logging is requested for this queue + #rtloggingtype # update the pilot heartbeat file update_pilot_heartbeat(time.time()) @@ -182,7 +192,7 @@ def main() -> int: exitcode = None # let the server know that the worker has finished - if args.update_server: + if args.update_server and args.workerpilotstatusupdate: send_worker_status( "finished", args.queue, @@ -357,8 +367,6 @@ def get_args() -> Any: required=False, # From v 2.2.1 the site name is internally set help="OBSOLETE: site name (e.g., AGLT2_TEST)", ) - - # graciously stop pilot process after hard limit arg_parser.add_argument( "-j", "--joblabel", @@ -366,6 +374,13 @@ def get_args() -> Any: default="ptest", help="Job prod/source label (default: ptest)", ) + arg_parser.add_argument( + "-g", + "--baseurls", + dest="baseurls", + default="", + help="Comma separated list of base URLs for validation of trf download", + ) # pilot version tag; PR or RC arg_parser.add_argument( @@ -385,6 +400,15 @@ def get_args() -> Any: help="Disable server updates", ) + arg_parser.add_argument( + "-k", + "--noworkerpilotstatusupdate", + dest="workerpilotstatusupdate", + action="store_false", + default=True, + help="Disable updates to updateWorkerPilotStatus", + ) + arg_parser.add_argument( "-t", "--noproxyverification", @@ -842,7 +866,7 @@ def send_worker_status( port: str, 
logger: Any, internet_protocol_version: str, -) -> None: +): """ Send worker info to the server to let it know that the worker has started. @@ -956,6 +980,10 @@ def list_zombies(): # set environment variables (to be replaced with singleton implementation) set_environment_variables() + # store base URLs in a file if set + if args.baseurls: + store_base_urls(args.baseurls) + # execute main function trace = main() diff --git a/pilot/api/data.py b/pilot/api/data.py index 02785845e..3305b47f9 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -80,8 +80,15 @@ class StagingClient: # list of allowed schemas to be used for transfers from REMOTE sites remoteinput_allowed_schemas = ['root', 'gsiftp', 'dcap', 'srm', 'storm', 'https'] - def __init__(self, infosys_instance: Any = None, acopytools: dict = None, logger: Any = None, - default_copytools: str = 'rucio', trace_report: dict = None, ipv: str = 'IPv6', workdir: str = ""): + def __init__(self, + infosys_instance: Any = None, + acopytools: dict = None, + logger: Any = None, + default_copytools: str = 'rucio', + trace_report: dict = None, + ipv: str = 'IPv6', + workdir: str = "", + altstageout: str = None): """ Set default/init values. @@ -106,6 +113,7 @@ def __init__(self, infosys_instance: Any = None, acopytools: dict = None, logger self.infosys = infosys_instance or infosys self.ipv = ipv self.workdir = workdir + self.altstageout = altstageout if isinstance(acopytools, str): acopytools = {'default': [acopytools]} if acopytools else {} @@ -221,7 +229,7 @@ def print_replicas(self, replicas: list, label: str = 'unsorted'): """ number = 1 maxnumber = 10 - self.logger.info(f'{label} list of replicas: (max {maxnumber})') + self.logger.debug(f'{label} list of replicas: (max {maxnumber})') for pfn, xdat in replicas: self.logger.debug(f"{number}. 
" f"lfn={pfn}, " diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 890763a6b..123a50adc 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -179,6 +179,9 @@ class ErrorCodes: LOGCREATIONTIMEOUT = 1376 CVMFSISNOTALIVE = 1377 LSETUPTIMEDOUT = 1378 + PREEMPTION = 1379 + ARCPROXYFAILURE = 1380 + ARCPROXYLIBFAILURE = 1381 _error_messages = { GENERALERROR: "General pilot error, consult batch log", @@ -320,6 +323,9 @@ class ErrorCodes: LOGCREATIONTIMEOUT: "Log file creation timed out", CVMFSISNOTALIVE: "CVMFS is not responding", LSETUPTIMEDOUT: "Lsetup command timed out during remote file open", + PREEMPTION: "Job was preempted", + ARCPROXYFAILURE: "General arcproxy failure", + ARCPROXYLIBFAILURE: "Arcproxy failure while loading shared libraries", } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] diff --git a/pilot/control/data.py b/pilot/control/data.py index 12d6a33f7..3c76a9b9f 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -30,6 +30,7 @@ import time import traceback import queue +from collections import namedtuple from typing import Any from pathlib import Path @@ -42,11 +43,12 @@ from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import ( ExcThread, - PilotException, + FileHandlingFailure, LogFileCreationFailure, NoSuchFile, - FileHandlingFailure + PilotException, ) +from pilot.info import JobData from pilot.util.auxiliary import ( set_pilot_state, check_for_final_server_update @@ -54,28 +56,28 @@ from pilot.util.common import should_abort from pilot.util.config import config from pilot.util.constants import ( - PILOT_PRE_STAGEIN, + LOG_TRANSFER_DONE, + LOG_TRANSFER_FAILED, + LOG_TRANSFER_IN_PROGRESS, + LOG_TRANSFER_NOT_DONE, + MAX_KILL_WAIT_TIME, + PILOT_POST_LOG_TAR, PILOT_POST_STAGEIN, - PILOT_PRE_STAGEOUT, PILOT_POST_STAGEOUT, PILOT_PRE_LOG_TAR, - PILOT_POST_LOG_TAR, - LOG_TRANSFER_IN_PROGRESS, - LOG_TRANSFER_DONE, - LOG_TRANSFER_NOT_DONE, - LOG_TRANSFER_FAILED, + PILOT_PRE_STAGEIN, + PILOT_PRE_STAGEOUT, SERVER_UPDATE_RUNNING, - MAX_KILL_WAIT_TIME, UTILITY_BEFORE_STAGEIN ) from pilot.util.container import execute from pilot.util.filehandling import ( - remove, - write_file, copy, - get_directory_size, find_files_with_pattern, - rename_xrdlog + get_directory_size, + remove, + rename_xrdlog, + write_file, ) from pilot.util.middleware import ( containerise_middleware, @@ -94,13 +96,13 @@ errors = ErrorCodes() -def control(queues: Any, traces: Any, args: Any): +def control(queues: namedtuple, traces: Any, args: object): """ Set up data control threads. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object). """ targets = {'copytool_in': copytool_in, 'copytool_out': copytool_out, 'queue_monitoring': queue_monitoring} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'queues': queues, 'traces': traces, 'args': args}, @@ -153,13 +155,13 @@ def control(queues: Any, traces: Any, args: Any): logger.info('[data] control thread has finished') -def skip_special_files(job: Any): +def skip_special_files(job: JobData): """ Consult user defined code if any files should be skipped during stage-in. ATLAS code will skip DBRelease files e.g. as they should already be available in CVMFS. 
- :param job: job object (Any). + :param job: job object (JobData). """ pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__(f'pilot.user.{pilot_user}.common', globals(), locals(), [pilot_user], 0) @@ -169,11 +171,11 @@ def skip_special_files(job: Any): logger.warning('caught exception: %s', error) -def update_indata(job: Any): +def update_indata(job: JobData): """ Remove files marked as no_transfer files from stage-in. - :param job: job object (Any). + :param job: job object (JobData). """ toberemoved = [] for fspec in job.indata: @@ -184,11 +186,11 @@ def update_indata(job: Any): job.indata.remove(fspec) -def get_trace_report_variables(job: Any, label: str = 'stage-in') -> (str, str, str): +def get_trace_report_variables(job: JobData, label: str = 'stage-in') -> (str, str, str): """ Get some of the variables needed for creating the trace report. - :param job: job object (Any) + :param job: job object (JobData) :param label: 'stage-[in|out]' (str) :return: event_type (str), localsite (str), remotesite (str). """ @@ -201,11 +203,11 @@ def get_trace_report_variables(job: Any, label: str = 'stage-in') -> (str, str, return event_type, localsite, remotesite -def create_trace_report(job: Any, label: str = 'stage-in') -> Any: +def create_trace_report(job: JobData, label: str = 'stage-in') -> Any: """ Create the trace report object. - :param job: job object (Any) + :param job: job object (JobData) :param label: 'stage-[in|out]' (str) :return: trace report object (Any). """ @@ -217,12 +219,12 @@ def create_trace_report(job: Any, label: str = 'stage-in') -> Any: return trace_report -def get_stagein_client(job: Any, args: Any, label: str = 'stage-in') -> (Any, str): +def get_stagein_client(job: JobData, args: object, label: str = 'stage-in') -> (Any, str): """ Return the proper stage-in client. - :param job: job object (Any) - :param args: pilot args object (Any) + :param job: job object (JobData) + :param args: pilot args object (object) :param label: 'stage-in' (str) :return: stage-in client (StageInClient). """ @@ -240,12 +242,12 @@ def get_stagein_client(job: Any, args: Any, label: str = 'stage-in') -> (Any, st return client, activity -def _stage_in(args: Any, job: Any) -> bool: +def _stage_in(args: object, job: JobData) -> bool: """ Call the stage-in client. - :param args: pilot args object (Any) - :param job: job object (Any) + :param args: pilot args object (object) + :param job: job object (JobData) :return: True in case of success, False otherwise (bool). """ # tested ok: @@ -271,7 +273,7 @@ def _stage_in(args: Any, job: Any) -> bool: try: eventtype, localsite, remotesite = get_trace_report_variables(job, label=label) containerise_middleware(job, args, job.indata, eventtype, localsite, remotesite, - job.infosys.queuedata.container_options, label=label, + label=label, container_type=job.infosys.queuedata.container_type.get("middleware")) except PilotException as error: logger.warning('stage-in containerisation threw a pilot exception: %s', error) @@ -422,15 +424,15 @@ def write_utility_output(workdir: str, step: str, stdout: str, stderr: str): write_output(os.path.join(workdir, step + '_stderr.txt'), stderr) -def copytool_in(queues: Any, traces: Any, args: Any): # noqa: C901 +def copytool_in(queues: namedtuple, traces: Any, args: object): # noqa: C901 """ Call the stage-in function and put the job object in the proper queue. Main stage-in thread. 
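The copytool threads below all use the same cooperative-shutdown idiom: poll the shared args.graceful_stop event every half second and leave the loop once it is set. A stripped-down, runnable sketch of that idiom (everything except the event name is illustrative):

import threading
import time

graceful_stop = threading.Event()

def copytool_worker():
    while not graceful_stop.is_set():
        time.sleep(0.5)  # same polling interval as the pilot control loops
        # ... pull a job object from a queue and stage files here ...
    print("copytool worker observed graceful_stop - exiting")

thread = threading.Thread(target=copytool_worker)
thread.start()
time.sleep(2)
graceful_stop.set()  # request shutdown; the worker exits at its next poll
thread.join()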
- :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object). """ abort = False while not args.graceful_stop.is_set() and not abort: @@ -569,15 +571,15 @@ def copytool_in(queues: Any, traces: Any, args: Any): # noqa: C901 logger.info('[data] copytool_in thread has finished') -def copytool_out(queues: Any, traces: Any, args: Any): # noqa: C901 +def copytool_out(queues: namedtuple, traces: Any, args: object): # noqa: C901 """ Perform stage-out as soon as a job object can be extracted from the data_out queue. Main stage-out thread. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object). """ cont = True if args.graceful_stop.is_set(): @@ -652,14 +654,14 @@ def copytool_out(queues: Any, traces: Any, args: Any): # noqa: C901 logger.info('[data] copytool_out thread has finished') -def is_already_processed(queues: Any, processed_jobs: list) -> bool: +def is_already_processed(queues: namedtuple, processed_jobs: list) -> bool: """ Skip stage-out in case the job has already been processed. This should not be necessary so this is a fail-safe but it seems there is a case when a job with multiple output files enters the stage-out more than once. - :param queues: queues object (Any) + :param queues: queues object (namedtuple) :param processed_jobs: list of already processed jobs (list) :return: True if stage-out queues contain a job object that has already been processed, False otherwise (bool). """ @@ -857,15 +859,15 @@ def get_tar_timeout(dirsize: float) -> int: return min(timeout, timeout_max) -def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, ipv: str = 'IPv6') -> bool: +def _do_stageout(job: JobData, args: object, xdata: list, activity: list, title: str, ipv: str = 'IPv6') -> bool: """ Use the `StageOutClient` in the Data API to perform stage-out. The rucio host is internally set by Rucio via the client config file. This can be set directly as a pilot option --rucio-host. 
- :param job: job object (Any) - :param args: pilot args object (Any) + :param job: job object (JobData) + :param args: pilot args object (object) :param xdata: list of FileSpec objects (list) :param activity: copytool activity or preferred list of activities to resolve copytools (list) :param title: type of stage-out (output, log) (str) @@ -894,7 +896,7 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i try: eventtype, localsite, remotesite = get_trace_report_variables(job, label=label) containerise_middleware(job, args, xdata, eventtype, localsite, remotesite, - job.infosys.queuedata.container_options, label=label, + label=label, container_type=job.infosys.queuedata.container_type.get("middleware")) except PilotException as error: logger.warning('stage-out containerisation threw a pilot exception: %s', error) @@ -907,7 +909,7 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i # create the trace report trace_report = create_trace_report(job, label=label) - client = StageOutClient(job.infosys, logger=logger, trace_report=trace_report, ipv=ipv, workdir=job.workdir) + client = StageOutClient(job.infosys, logger=logger, trace_report=trace_report, ipv=ipv, workdir=job.workdir, altstageout=job.altstageout) kwargs = {'workdir': job.workdir, 'cwd': job.workdir, 'usecontainer': False, 'job': job, 'output_dir': args.output_dir, 'catchall': job.infosys.queuedata.catchall, 'rucio_host': args.rucio_host} #, mode='stage-out') @@ -946,14 +948,14 @@ def _do_stageout(job: Any, args: Any, xdata: list, activity: list, title: str, i return not remain_files -def _stage_out_new(job: Any, args: Any) -> bool: +def _stage_out_new(job: JobData, args: object) -> bool: """ Stage out all output files. If job.stageout=log then only log files will be transferred. - :param job: job object (Any) - :param args: pilot args object (Any) + :param job: job object (JobData) + :param args: pilot args object (object) :return: True in case of success, False otherwise (bool). """ #logger.info('testing sending SIGUSR1') @@ -1048,11 +1050,11 @@ def _stage_out_new(job: Any, args: Any) -> bool: return is_success -def generate_fileinfo(job: Any) -> dict: +def generate_fileinfo(job: JobData) -> dict: """ Generate fileinfo details to be sent to Panda. - :param job: job object (Any) + :param job: job object (JobData) :return: file info (dict). """ fileinfo = {} @@ -1067,15 +1069,15 @@ def generate_fileinfo(job: Any) -> dict: return fileinfo -def queue_monitoring(queues: Any, traces: Any, args: Any): +def queue_monitoring(queues: namedtuple, traces: Any, args: object): """ Monitor data queues. Thread. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object) """ while True: # will abort when graceful_stop has been set time.sleep(0.5) diff --git a/pilot/control/interceptor.py b/pilot/control/interceptor.py index bf1ee766f..b80e5f01d 100644 --- a/pilot/control/interceptor.py +++ b/pilot/control/interceptor.py @@ -17,7 +17,7 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 # Note: leave this module for now - the code might be useful for reuse @@ -26,7 +26,6 @@ import time import queue import logging -from typing import Any from pilot.common.exception import ExcThread from pilot.util.processes import threads_aborted @@ -34,13 +33,13 @@ logger = logging.getLogger(__name__) -def run(args: Any): +def run(args: object): """ Set up all interceptor threads. Main execution function for the interceptor communication layer. - :param args: pilot arguments (Any) + :param args: pilot arguments (object) """ targets = {'receive': receive, 'send': send} threads = [ExcThread(bucket=queue.Queue(), target=target, kwargs={'args': args}, @@ -78,11 +77,11 @@ def run(args: Any): logger.debug('[interceptor] run thread has finished') -def receive(args: Any): +def receive(args: object): """ Look for interceptor messages. - :param args: Pilot args object (Any). + :param args: Pilot args object (object). """ while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -97,7 +96,7 @@ def receive(args: Any): logger.debug('[interceptor] receive thread has finished') -def send(args: Any): +def send(args: object): """ Send message to interceptor. @@ -117,15 +116,13 @@ def send(args: Any): # implement if necessary -# def interceptor(queues: Any, traces: Any, args: Any): +# def interceptor(queues: namedtuple, traces: Any, args: object): # """ # -# :param queues: internal queues for job handling. -# :param traces: tuple containing internal pilot states. -# :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc). -# :return: +# :param queues: internal queues for job handling (namedtuple) +# :param traces: tuple containing internal pilot states (tupl) +# :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object). # """ -# # # overall loop counter (ignoring the fact that more than one job may be running) # counter = 0 # while not args.graceful_stop.is_set(): diff --git a/pilot/control/job.py b/pilot/control/job.py index 952ee7b3e..d01d9b453 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -17,9 +17,9 @@ # under the License. 
# # Authors: -# - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 +# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 # - Wen Guan, wen.guan@cern.ch, 2018 """Job module with functions for job handling.""" @@ -38,29 +38,29 @@ from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import ( ExcThread, + FileHandlingFailure, PilotException, - FileHandlingFailure ) from pilot.info import ( infosys, - JobData, InfoService, + JobData, JobInfoProvider ) from pilot.util import https from pilot.util.activemq import ActiveMQ from pilot.util.auxiliary import ( + check_for_final_server_update, + encode_globaljobid, get_batchsystem_jobid, + get_display_info, get_job_scheduler_id, - set_pilot_state, get_pilot_state, - check_for_final_server_update, - pilot_version_banner, - is_virtual_machine, has_instruction_sets, + is_virtual_machine, locate_core_file, - get_display_info, - encode_globaljobid + pilot_version_banner, + set_pilot_state, ) from pilot.util.config import config from pilot.util.common import ( @@ -83,78 +83,78 @@ ) from pilot.util.container import execute from pilot.util.filehandling import ( + copy, + create_symlink, find_text_files, - tail, + get_total_input_size, is_json, - copy, remove, + tail, write_file, - create_symlink, write_json, - get_total_input_size ) from pilot.util.harvester import ( - request_new_jobs, - remove_job_request_file, - parse_job_definition_file, is_harvester_mode, + get_event_status_file, get_worker_attributes_file, + parse_job_definition_file, publish_job_report, + publish_stageout_files, publish_work_report, - get_event_status_file, - publish_stageout_files + remove_job_request_file, + request_new_jobs, ) from pilot.util.jobmetrics import get_job_metrics from pilot.util.loggingsupport import establish_logging from pilot.util.math import mean, float_to_rounded_string from pilot.util.middleware import containerise_general_command from pilot.util.monitoring import ( + check_local_space, job_monitor_tasks, - check_local_space ) from pilot.util.monitoringtime import MonitoringTime from pilot.util.processes import ( cleanup, - threads_aborted, + kill_defunct_children, kill_process, kill_processes, - kill_defunct_children + threads_aborted, ) from pilot.util.proxy import get_distinguished_name from pilot.util.queuehandling import ( - scan_for_jobs, + purge_queue, put_in_queue, queue_report, - purge_queue + scan_for_jobs, ) from pilot.util.realtimelogger import cleanup as rtcleanup from pilot.util.timing import ( add_to_pilot_timing, - timing_report, get_postgetjob_time, get_time_since, - time_stamp + time_stamp, + timing_report, ) from pilot.util.workernode import ( - get_disk_space, collect_workernode_info, - get_node_name, - get_cpu_model, + get_cpu_arch, get_cpu_cores, - get_cpu_arch + get_cpu_model, + get_disk_space, + get_node_name, ) logger = logging.getLogger(__name__) errors = ErrorCodes() -def control(queues: Any, traces: Any, args: Any): +def control(queues: namedtuple, traces: Any, args: object): """ Set up job control threads. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param args: Pilot arguments (e.g. 
containing queue name, queuedata dictionary, etc) (object) """ targets = {'validate': validate, 'retrieve': retrieve, 'create_data_payload': create_data_payload, 'queue_monitor': queue_monitor, 'job_monitor': job_monitor, 'fast_job_monitor': fast_job_monitor, @@ -356,6 +356,18 @@ def is_final_update(job: Any, state: str, tag: str = 'sending') -> bool: :param tag: optional tag ('sending'/'writing') (str) :return: final state (bool). """ + # make sure that the log transfer has been attempted + log_transfer = get_job_status(job, 'LOG_TRANSFER') + actual_state = state + if log_transfer in {LOG_TRANSFER_DONE, LOG_TRANSFER_FAILED}: + logger.info(f'log transfer has been attempted: {log_transfer}') + elif not job.logdata: + # make sure that there should actually be a log transfer (i.e. is there a known log file defined in the job def) + logger.info('no logdata defined in job definition - no log transfer will be attempted') + else: + logger.info(f'log transfer has not been attempted: {log_transfer}') + state = 'not_ready_for_final_state' + if state in {'finished', 'failed', 'holding'}: final = True os.environ['SERVER_UPDATE'] = SERVER_UPDATE_UPDATING @@ -371,7 +383,7 @@ def is_final_update(job: Any, state: str, tag: str = 'sending') -> bool: verify_error_code(job) else: final = False - logger.info(f'job {job.jobid} has state \'{state}\' - {tag} heartbeat') + logger.info(f'job {job.jobid} has state \'{actual_state}\' - {tag} heartbeat') return final @@ -446,7 +458,7 @@ def send_state(job: Any, args: Any, state: str, xml: str = "", metadata: str = " if final and os.path.exists(job.workdir): # ignore if workdir doesn't exist - might be a delayed jobUpdate os.environ['SERVER_UPDATE'] = SERVER_UPDATE_FINAL - if state in {'finished', 'holding', 'failed'}: + if final and state in {'finished', 'holding', 'failed'}: logger.info(f'setting job as completed (state={state})') job.completed = True @@ -904,7 +916,7 @@ def get_general_command_stdout(job: Any): _containerisation = False # set this with some logic instead - not used for now if _containerisation: try: - containerise_general_command(job, job.infosys.queuedata.container_options, + containerise_general_command(job, label='general', container_type='container') except PilotException as error: @@ -1127,15 +1139,15 @@ def get_latest_log_tail(files: list) -> str: return stdout_tail -def validate(queues: Any, traces: Any, args: Any): +def validate(queues: namedtuple, traces: Any, args: object): """ Perform validation of job. Thread. - :param queues: queues object (Any) + :param queues: queues object (namedtuple) :param traces: traces object (Any) - :param args: args object (Any). + :param args: args object (object). """ while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -1272,14 +1284,14 @@ def verify_ctypes(): logger.debug('all child subprocesses will be parented') -def delayed_space_check(queues: Any, traces: Any, args: Any, job: Any): +def delayed_space_check(queues: namedtuple, traces: Any, args: object, job: object): """ Run the delayed space check if necessary. - :param queues: queues object (Any) + :param queues: queues object (namedtuple) :param traces: traces object (Any) - :param args: args object (Any) - :param job: job object (Any). + :param args: args object (object) + :param job: job object (object). 
""" proceed_with_local_space_check = args.harvester_submitmode.lower() == 'push' and args.update_server if proceed_with_local_space_check: @@ -1332,7 +1344,7 @@ def store_jobid(jobid: int, init_dir: str): logger.warning(f'exception caught while trying to store job id: {error}') -def create_data_payload(queues: Any, traces: Any, args: Any): +def create_data_payload(queues: namedtuple, traces: Any, args: object): """ Get a Job object from the "validated_jobs" queue. @@ -1341,9 +1353,9 @@ def create_data_payload(queues: Any, traces: Any, args: Any): the thread also places the Job object in the "payloads" queue (another thread will retrieve it and wait for any stage-in to finish). - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object) """ while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -1546,6 +1558,8 @@ def proceed_with_getjob(timefloor: int, starttime: int, jobnumber: int, getjob_r exit_code, diagnostics = userproxy.verify_proxy(test=False) if traces.pilot['error_code'] == 0: # careful so we don't overwrite another error code traces.pilot['error_code'] = exit_code + if exit_code == errors.ARCPROXYLIBFAILURE: + logger.warning("currently ignoring arcproxy library failure") if exit_code in {errors.NOPROXY, errors.NOVOMSPROXY, errors.CERTIFICATEHASEXPIRED}: logger.warning(diagnostics) return False @@ -1565,25 +1579,13 @@ def proceed_with_getjob(timefloor: int, starttime: int, jobnumber: int, getjob_r maximum_getjob_requests = 60 if harvester else max_getjob_requests # 1 s apart (if harvester) if getjob_requests > int(maximum_getjob_requests): - logger.warning(f'reached maximum number of getjob requests ({maximum_getjob_requests}) -- will abort pilot') - # use singleton: - # instruct the pilot to wrap up quickly - os.environ['PILOT_WRAP_UP'] = 'QUICKLY' - return False + return wrap_up_quickly(f'reached maximum number of getjob requests ({maximum_getjob_requests}) -- will abort pilot') if timefloor == 0 and jobnumber > 0: - logger.warning("since timefloor is set to 0, pilot was only allowed to run one job") - # use singleton: - # instruct the pilot to wrap up quickly - os.environ['PILOT_WRAP_UP'] = 'QUICKLY' - return False + return wrap_up_quickly("since timefloor is set to 0, pilot was only allowed to run one job") if (currenttime - starttime > timefloor) and jobnumber > 0: - logger.warning(f"the pilot has run out of time (timefloor={timefloor} has been passed)") - # use singleton: - # instruct the pilot to wrap up quickly - os.environ['PILOT_WRAP_UP'] = 'QUICKLY' - return False + return wrap_up_quickly(f"the pilot has run out of time (timefloor={timefloor} has been passed)") # timefloor not relevant for the first job if jobnumber > 0: @@ -1593,7 +1595,9 @@ def proceed_with_getjob(timefloor: int, starttime: int, jobnumber: int, getjob_r # unless it's the first job (which is preplaced in the init dir), instruct Harvester to place another job # in the init dir logger.info('asking Harvester for another job') - request_new_jobs() + status = request_new_jobs() + if not status: + return False if os.environ.get('SERVER_UPDATE', '') == SERVER_UPDATE_UPDATING: logger.info('still updating previous job, will not ask for a new job yet') @@ -1603,6 +1607,20 @@ def 
proceed_with_getjob(timefloor: int, starttime: int, jobnumber: int, getjob_r return True +def wrap_up_quickly(message: str) -> bool: + """ + Wrap up quickly. + + Helper function to reduce complexity of proceed_with_getjob(). + + :param message: message to log (str) + :return: False. + """ + logger.warning(message) + os.environ['PILOT_WRAP_UP'] = 'QUICKLY' + return False + + def get_job_definition_from_file(path: str, harvester: bool, pod: bool) -> dict: """ Get a job definition from a pre-placed file. @@ -1670,7 +1688,7 @@ def get_job_definition_from_server(args: Any, taskid: str = "") -> str: cmd = https.get_server_command(args.url, args.port) if cmd != "": logger.info(f'executing server command: {cmd}') - res = https.request2(cmd, data=data) # will be a dictionary + res = https.request2(cmd, data=data, panda=True) # will be a dictionary logger.debug(f"request2 response: {res}") # should be StatusCode=0 if all is ok if not res: # fallback to curl solution res = https.request(cmd, data=data) @@ -1706,15 +1724,21 @@ def locate_job_definition(args: Any) -> str: if path == "": logger.info('did not find any local job definition file') + # make sure there are no secondary job definition copies + _path = os.path.join(os.environ.get('PILOT_HOME'), config.Pilot.pandajobdata) + if _path != path and os.path.exists(_path): + logger.info(f'removing useless secondary job definition file: {_path}') + remove(_path) + return path -def get_job_definition(queues: Any, args: Any) -> dict: +def get_job_definition(queues: namedtuple, args: object) -> dict: """ Get a job definition from a source (server or pre-placed local file). - :param queues: queues object (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param queues: queues object (namedtuple) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object) :return: job definition (dict). """ res = {} @@ -1851,11 +1875,11 @@ def get_message(args: Any, message_queue: Any): message_queue.put(message) -def get_kwargs_for_mb(queues: Any, url: str, port: str, allow_same_user: bool, debug: bool): +def get_kwargs_for_mb(queues: namedtuple, url: str, port: str, allow_same_user: bool, debug: bool): """ Get the kwargs dictinoary for the message broker. - :param queues: queues object (Any) + :param queues: queues object (namedtuple) :param url: PanDA server URL (str) :param port: PanDA server port (str) :param allow_same_user: allow the same user or not (bool) @@ -2051,10 +2075,10 @@ def get_job_retrieval_delay(harvester: bool) -> int: :param harvester: True if Harvester is being used (determined from args.harvester), otherwise False (bool) :return: sleep (s) (int) """ - return 1 if harvester else 60 + return 10 if harvester else 60 -def retrieve(queues: Any, traces: Any, args: Any): # noqa: C901 +def retrieve(queues: namedtuple, traces: Any, args: object): # noqa: C901 """ Retrieve all jobs from the proper source. @@ -2068,9 +2092,9 @@ def retrieve(queues: Any, traces: Any, args: Any): # noqa: C901 WARNING: this function is nearly too complex. Be careful with adding more lines as flake8 will fail it. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param args: Pilot arguments object (e.g. 
containing queue name, queuedata dictionary, etc) (object) :raises PilotException: if create_job fails (e.g. because queuedata could not be downloaded). """ timefloor = infosys.queuedata.timefloor @@ -2120,7 +2144,7 @@ def retrieve(queues: Any, traces: Any, args: Any): # noqa: C901 if not res: getjob_failures += 1 - if getjob_failures >= args.getjob_failures: + if getjob_failures >= get_nr_getjob_failures(args.getjob_failures, args.harvester_submitmode): logger.warning(f'did not get a job -- max number of job request failures reached: {getjob_failures} (setting graceful_stop)') args.graceful_stop.set() break @@ -2137,7 +2161,7 @@ def retrieve(queues: Any, traces: Any, args: Any): # noqa: C901 # it seems the PanDA server returns StatusCode as an int, but the aCT returns it as a string # note: StatusCode keyword is not available in job definition files from Harvester (not needed) getjob_failures += 1 - if getjob_failures >= args.getjob_failures: + if getjob_failures >= get_nr_getjob_failures(args.getjob_failures, args.harvester_submitmode): logger.warning(f'did not get a job -- max number of job request failures reached: {getjob_failures}') args.graceful_stop.set() break @@ -2154,6 +2178,10 @@ def retrieve(queues: Any, traces: Any, args: Any): # noqa: C901 except PilotException as error: raise error + # inform the server if this job should be in debug mode (real-time logging), decided by queuedata + if "loggingfile" in job.infosys.queuedata.catchall: + set_debug_mode(job.jobid, args.url, args.port) + logger.info('resetting any existing errors') job.reset_errors() @@ -2215,6 +2243,49 @@ def retrieve(queues: Any, traces: Any, args: Any): # noqa: C901 logger.info('[job] retrieve thread has finished') +def set_debug_mode(jobid: int, url: str, port: int): + """ + Inform the server that the given job should be in debug mode. + + Note, this is decided by queuedata.catchall. + + :param jobid: job id (int) + :param url: server url (str) + :param port: server port (int). + """ + # worker node structure to be sent to the server + data = {} + data["pandaID"] = jobid + data["modeOn"] = True + + # attempt to send the info to the server + res = https.send_update("setDebugMode", data, url, port) + if not res: + logger.warning('could not inform server to set job in debug mode') + + +def get_nr_getjob_failures(getjob_failures: int, harvester_submitmode: str) -> int: + """ + Return the number of max getjob failures. + + Note: the default max number of getjob failures is set to 5 in pilot.py. However, for PUSH mode, it makes more + sense to have a larger max attempt number since Harvester only checks for job requests once per five minutes. + So, if the pilot is started in PUSH mode, the max number of getjob failures is set to a higher number unless + args.getjob_failures is set (to a number not equal to five). + + :param getjob_failures: max getjob failures (int) + :param harvester_submitmode: Harvester submit mode, PUSH or PULL (str) + :return: max getjob failures (int). + """ + if harvester_submitmode.lower() == 'push': + if getjob_failures == 5: + return 12 + else: + return getjob_failures + else: + return getjob_failures + + def htcondor_envvar(jobid: str): """ On HTCondor nodes, set special env var (HTCondor_PANDA) for debugging Lustre. 
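The effect of get_nr_getjob_failures() above is easiest to see as a small decision table: the untouched default of 5 is raised to 12 in PUSH mode, while any explicitly chosen value, and every PULL-mode value, passes through unchanged. A self-contained restatement of that logic with the behaviour asserted:

def max_getjob_failures(requested: int, submitmode: str, default: int = 5, push_default: int = 12) -> int:
    """Mirror of the decision above: bump only the untouched default in push mode."""
    if submitmode.lower() == "push" and requested == default:
        return push_default
    return requested

assert max_getjob_failures(5, "PUSH") == 12  # default gets bumped for push mode
assert max_getjob_failures(8, "push") == 8   # an explicit choice is respected
assert max_getjob_failures(5, "pull") == 5   # pull mode keeps the normal default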
@@ -2307,14 +2378,14 @@ def create_job(dispatcher_response: dict, queuename: str) -> Any: return job -def has_job_completed(queues: Any, args: Any) -> bool: +def has_job_completed(queues: namedtuple, args: object) -> bool: """ Check if the current job has completed (finished or failed). Note: the job object was extracted from monitored_payloads queue before this function was called. - :param queues: Pilot queues object (Any) - :param args: Pilot arguments object (Any) + :param queues: Pilot queues object (namedtuple) + :param args: Pilot arguments object (object) :return: True is the payload has finished or failed, False otherwise (bool). """ # check if the job has finished @@ -2367,13 +2438,13 @@ def has_job_completed(queues: Any, args: Any) -> bool: return False -def get_job_from_queue(queues: Any, state: str) -> Any: +def get_job_from_queue(queues: namedtuple, state: str) -> object or None: """ Check if the job has finished or failed and if so return it. - :param queues: Pilot queues object (Any) + :param queues: Pilot queues object (namedtuple) :param state: job state (e.g. finished/failed) (str) - :return: job object (Any). + :return: job object (object or None). """ try: if state == "finished": @@ -2392,11 +2463,11 @@ def get_job_from_queue(queues: Any, state: str) -> Any: return job -def is_queue_empty(queues: Any, queuename: str) -> bool: +def is_queue_empty(queues: namedtuple, queuename: str) -> bool: """ Check if the given queue is empty (without pulling). - :param queues: Pilot queues object (Any) + :param queues: Pilot queues object (namedtuple) :param queuename: queue name (str) :return: True if queue is empty, False otherwise (bool) """ @@ -2415,12 +2486,12 @@ def is_queue_empty(queues: Any, queuename: str) -> bool: return status -def order_log_transfer(queues: Any, job: Any): +def order_log_transfer(queues: namedtuple, job: object): """ Order a log transfer for a failed job. - :param queues: Pilot queues object (Any) - :param job: job object (Any). + :param queues: Pilot queues object (namedtuple) + :param job: job object (object). """ # add the job object to the data_out queue to have it staged out job.stageout = 'log' # only stage-out log file @@ -2448,13 +2519,13 @@ def order_log_transfer(queues: Any, job: Any): logger.info('proceeding with server update') -def wait_for_aborted_job_stageout(args: Any, queues: Any, job: Any): +def wait_for_aborted_job_stageout(args: object, queues: namedtuple, job: object): """ Wait for stage-out to finish for aborted job. - :param args: Pilot arguments object (Any) - :param queues: Pilot queues object (Any) - :param job: job object (Any). + :param args: Pilot arguments object (object) + :param queues: Pilot queues object (namedtuple) + :param job: job object (object). """ # if the pilot received a kill signal, how much time has passed since the signal was intercepted? try: @@ -2505,7 +2576,7 @@ def get_job_status(job: Any, key: str) -> str: return value -def queue_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 +def queue_monitor(queues: namedtuple, traces: Any, args: object): # noqa: C901 """ Monitor queue activity. @@ -2513,9 +2584,9 @@ def queue_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 This function monitors queue activity, specifically if a job has finished or failed and then reports to the server. 
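The type hints in this module change from Any to namedtuple because the internal queues container is a named tuple whose fields hold individual queue.Queue objects (e.g. queues.finished_jobs, queues.failed_jobs). A minimal illustration of such a container; the field names here are an assumed subset of the pilot's real queues:

import queue
from collections import namedtuple

Queues = namedtuple("Queues", ["jobs", "payloads", "finished_jobs", "failed_jobs"])
queues = Queues(*(queue.Queue() for _ in Queues._fields))

queues.jobs.put("job object placeholder")
print(queues.jobs.qsize())         # 1
print(queues.failed_jobs.empty())  # True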
- :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object). """ # scan queues until at least one queue has a job object. abort if it takes too long time if not scan_for_jobs(queues): @@ -2632,14 +2703,14 @@ def pause_queue_monitor(delay: int): time.sleep(delay) -def get_finished_or_failed_job(args: Any, queues: Any) -> Any: +def get_finished_or_failed_job(args: object, queues: namedtuple) -> Any: """ Check if the job has either finished or failed and if so return it. If failed, order a log transfer. If the job is in state 'failed' and abort_job is set, set job_aborted. - :param args: Pilot arguments object (Any) - :param queues: Pilot queues object (Any) + :param args: Pilot arguments object (object) + :param queues: Pilot queues object (namedtuple) :return: job object (Any). """ job = get_job_from_queue(queues, "finished") @@ -2725,15 +2796,15 @@ def fast_monitor_tasks(job: Any) -> int: return exit_code -def message_listener(queues: Any, traces: Any, args: Any): +def message_listener(queues: namedtuple, traces: Any, args: object): """ Listen for messages from ActiveMQ. Thread. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object) """ while not args.graceful_stop.is_set() and args.subscribe_to_msgsvc: @@ -2777,7 +2848,7 @@ def message_listener(queues: Any, traces: Any, args: Any): logger.info('[job] message listener thread has finished') -def fast_job_monitor(queues: Any, traces: Any, args: Any) -> None: +def fast_job_monitor(queues: namedtuple, traces: Any, args: object) -> None: """ Fast monitoring of job parameters. @@ -2785,9 +2856,9 @@ def fast_job_monitor(queues: Any, traces: Any, args: Any) -> None: This function can be used for monitoring processes below the one minute threshold of the normal job_monitor thread. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object) """ # peeking and current time; peeking_time gets updated if and when jobs are being monitored, update_time is only # used for sending the heartbeat and is updated after a server update @@ -2843,7 +2914,7 @@ def fast_job_monitor(queues: Any, traces: Any, args: Any) -> None: logger.info('[job] fast job monitor thread has finished') -def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 +def job_monitor(queues: namedtuple, traces: Any, args: object): # noqa: C901 """ Monitor job parameters. @@ -2854,9 +2925,9 @@ def job_monitor(queues: Any, traces: Any, args: Any): # noqa: C901 looping jobs are checked once every ten minutes (default) and the heartbeat is sent once every 30 minutes. Memory usage is checked once a minute. 
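job_monitor above multiplexes several checks with different cadences inside one loop: looping-job checks roughly every ten minutes, heartbeats every 30 minutes and memory checks once a minute. A compact, self-contained sketch of that kind of interval gating, with the intervals taken from the docstring and the dispatch-table style purely illustrative:

import time
from typing import Optional

INTERVALS = {"memory": 60, "looping_job": 600, "heartbeat": 1800}  # seconds between checks
last_run = dict.fromkeys(INTERVALS, 0.0)

def run_due_checks(now: Optional[float] = None):
    """Run every check whose interval has elapsed since its previous run."""
    current = now if now is not None else time.time()
    for name, interval in INTERVALS.items():
        if current - last_run[name] >= interval:
            last_run[name] = current
            print(f"running {name} check")

run_due_checks()  # first call: every check is due
run_due_checks()  # called again immediately: nothing is due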
- :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object) """ # initialize the monitoring time object mt = MonitoringTime() @@ -3121,7 +3192,10 @@ def download_new_proxy(role: str = 'production', proxy_type: str = '', workdir: ec, _, new_x509 = user.get_and_verify_proxy(x509, voms_role=voms_role, proxy_type=proxy_type, workdir=workdir) if ec != 0: # do not return non-zero exit code if only download fails logger.warning('failed to download/verify new proxy') - exit_code = errors.CERTIFICATEHASEXPIRED if ec == errors.CERTIFICATEHASEXPIRED else errors.NOVOMSPROXY + if ec == errors.ARCPROXYLIBFAILURE: + logger.warning("currently ignoring arcproxy library failure") + else: + exit_code = errors.CERTIFICATEHASEXPIRED if ec == errors.CERTIFICATEHASEXPIRED else errors.NOVOMSPROXY elif new_x509 and new_x509 != x509 and 'unified' in new_x509 and os.path.exists(new_x509): os.environ['X509_UNIFIED_DISPATCH'] = new_x509 logger.debug(f'set X509_UNIFIED_DISPATCH to {new_x509}') @@ -3160,14 +3234,14 @@ def send_heartbeat_if_time(job: Any, args: Any, update_time: float) -> int: return int(update_time) -def fail_monitored_job(job: Any, exit_code: int, diagnostics: str, queues: Any, traces: Any): +def fail_monitored_job(job: object, exit_code: int, diagnostics: str, queues: namedtuple, traces: Any): """ Fail a monitored job. - :param job: job object (Any) + :param job: job object (object) :param exit_code: exit code from job_monitor_tasks (int) :param diagnostics: pilot error diagnostics (str) - :param queues: queues object (Any) + :param queues: queues object (namedtuple) :param traces: traces object (Any). """ set_pilot_state(job=job, state="failed") diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 66ab1840a..24e056def 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -18,50 +18,70 @@ # # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 # NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is # a task for the job_monitor thread in the Job component. 
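download_new_proxy() above now treats an arcproxy shared-library failure as non-fatal: it is logged but does not replace the exit code, unlike an expired certificate or a missing VOMS proxy. A condensed restatement of that decision; only the 1381 value comes from this change set, the other two numeric values are placeholders:

ARCPROXYLIBFAILURE = 1381
CERTIFICATEHASEXPIRED = 9001  # placeholder value, for illustration only
NOVOMSPROXY = 9002            # placeholder value, for illustration only

def proxy_exit_code(ec: int, current_exit_code: int = 0) -> int:
    """Return the exit code to report after a failed proxy download/verification."""
    if ec == ARCPROXYLIBFAILURE:
        return current_exit_code  # tolerated: warn only, keep whatever was already set
    return CERTIFICATEHASEXPIRED if ec == CERTIFICATEHASEXPIRED else NOVOMSPROXY

print(proxy_exit_code(ARCPROXYLIBFAILURE))     # 0, the failure is ignored
print(proxy_exit_code(CERTIFICATEHASEXPIRED))  # 9001, propagated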
-"""Functions for monitoring of threads.""" +"""Functions for monitoring of pilot and threads.""" import logging import threading import time import re + +from collections import namedtuple from os import environ, getuid -from subprocess import Popen, PIPE +from subprocess import ( + Popen, + PIPE +) from typing import Any from pilot.common.exception import PilotException, ExceededMaxWaitTime -from pilot.util.auxiliary import check_for_final_server_update, set_pilot_state +from pilot.util.auxiliary import ( + check_for_final_server_update, + set_pilot_state +) from pilot.util.common import is_pilot_check from pilot.util.config import config from pilot.util.constants import MAX_KILL_WAIT_TIME # from pilot.util.container import execute from pilot.util.features import MachineFeatures from pilot.util.heartbeat import update_pilot_heartbeat -from pilot.util.queuehandling import get_queuedata_from_job, get_maxwalltime_from_job, abort_jobs_in_queues +from pilot.util.https import ( + get_local_oidc_token_info, + update_local_oidc_token_info +) +from pilot.util.queuehandling import ( + abort_jobs_in_queues, + get_maxwalltime_from_job, + get_queuedata_from_job, +) from pilot.util.timing import get_time_since_start logger = logging.getLogger(__name__) -def control(queues: Any, traces: Any, args: Any): # noqa: C901 +def control(queues: namedtuple, traces: Any, args: object): # noqa: C901 """ Monitor threads. Main control function, run from the relevant workflow module. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any) + :param args: Pilot arguments (e.g. 
containing queue name, queuedata dictionary, etc) (object) """ t_0 = time.time() traces.pilot['lifetime_start'] = t_0 # ie referring to when pilot monitoring began traces.pilot['lifetime_max'] = t_0 threadchecktime = int(config.Pilot.thread_check) + # if OIDC tokens are used, define the time interval for checking the token + # otherwise the following variable is None + tokendownloadchecktime = get_oidc_check_time() + last_token_check = t_0 # for CPU usage debugging # cpuchecktime = int(config.Pilot.cpu_check) @@ -72,7 +92,7 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 push = args.harvester and args.harvester_submitmode.lower() == 'push' try: # overall loop counter (ignoring the fact that more than one job may be running) - niter = 0 + n_iterations = 0 max_running_time_old = 0 while not args.graceful_stop.is_set(): @@ -82,6 +102,12 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 run_checks(queues, args) break + # check if the OIDC token needs to be refreshed + if tokendownloadchecktime: + if int(time.time() - last_token_check) > tokendownloadchecktime: + last_token_check = time.time() + update_local_oidc_token_info(args.url, args.port) + # abort if kill signal arrived too long time ago, ie loop is stuck if args.kill_time and int(time.time()) - args.kill_time > MAX_KILL_WAIT_TIME: logger.warning('loop has run for too long time - will abort') @@ -110,7 +136,7 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 f'exceeded - time to abort pilot') reached_maxtime_abort(args) break - if niter % 60 == 0: + if n_iterations % 60 == 0: logger.info(f'{time_since_start}s have passed since pilot start') # every minute run the following check @@ -149,7 +175,7 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 logger.fatal(f'thread \'{thread.name}\' is not alive') # args.graceful_stop.set() - niter += 1 + n_iterations += 1 except Exception as error: print((f"monitor: exception caught: {error}")) @@ -158,6 +184,25 @@ def control(queues: Any, traces: Any, args: Any): # noqa: C901 logger.info('[monitor] control thread has ended') +def get_oidc_check_time() -> int or None: + """ + Return the time interval for checking the OIDC token. + + :return: time interval for checking the OIDC token (int or None). + """ + auth_token, auth_origin = get_local_oidc_token_info() + use_oidc_token = True if auth_token and auth_origin else False + if use_oidc_token: + try: + token_check = int(config.Token.download_check) + except (AttributeError, ValueError): + token_check = None + else: + token_check = None + + return token_check + + def run_shutdowntime_minute_check(time_since_start: int) -> bool: """ Run checks on machine features shutdowntime once a minute. @@ -299,12 +344,12 @@ def get_proper_pilot_heartbeat() -> int: return 60 -def run_checks(queues: Any, args: Any) -> None: +def run_checks(queues: namedtuple, args: object) -> None: """ Perform non-job related monitoring checks. - :param queues: queues object (Any) - :param args: Pilot arguments object (Any) + :param queues: queues object (namedtuple) + :param args: Pilot arguments object (object) :raises: ExceedMaxWaitTime. 
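The token handling added to control() above refreshes the local OIDC token only when config.Token.download_check seconds have passed since the previous check, and skips the check entirely when no local token is configured (interval None). A small standalone sketch of that gating; the 3600 s interval is an assumed example value:

import time
from typing import Optional

def token_refresh_due(last_check: float, interval: Optional[int], now: Optional[float] = None) -> bool:
    """Return True when a token refresh is due; interval=None disables the check."""
    if interval is None:
        return False
    current = now if now is not None else time.time()
    return int(current - last_check) > interval

print(token_refresh_due(time.time() - 7200, 3600))  # True: more than an hour has passed
print(token_refresh_due(time.time(), 3600))         # False: checked just now
print(token_refresh_due(0.0, None))                 # False: token checks are disabled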
""" # check how long time has passed since last successful heartbeat @@ -381,7 +426,7 @@ def run_checks(queues: Any, args: Any) -> None: # raise ExceededMaxWaitTime(diagnostics) -def get_max_running_time(lifetime: int, queuedata: Any, queues: Any, push: bool, pod: bool) -> int: +def get_max_running_time(lifetime: int, queuedata: Any, queues: namedtuple, push: bool, pod: bool) -> int: """ Return the maximum allowed running time for the pilot. @@ -390,7 +435,7 @@ def get_max_running_time(lifetime: int, queuedata: Any, queues: Any, push: bool, :param lifetime: optional pilot option time in seconds (int) :param queuedata: queuedata object (Any) - :param queues: queues object (Any) + :param queues: queues object (namedtuple) :param push: push mode (bool) :param pod: pod mode (bool) :return: max running time in seconds (int). diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 8ef1c4538..25adce158 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -30,8 +30,16 @@ import time import traceback import queue -from re import findall, split -from typing import Any, TextIO +from collections import namedtuple +from re import ( + findall, + split, + search +) +from typing import ( + Any, + TextIO +) from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import ( @@ -39,11 +47,12 @@ PilotException ) from pilot.control.payloads import ( - generic, eventservice, - eventservicemerge + eventservicemerge, + generic, ) from pilot.control.job import send_state +from pilot.info import JobData from pilot.util.auxiliary import set_pilot_state from pilot.util.container import execute from pilot.util.config import config @@ -66,13 +75,13 @@ errors = ErrorCodes() -def control(queues: Any, traces: Any, args: Any): +def control(queues: namedtuple, traces: Any, args: object): """ Set up payload threads. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object). """ targets = {'validate_pre': validate_pre, 'execute_payloads': execute_payloads, 'validate_post': validate_post, 'failed_post': failed_post, 'run_realtimelog': run_realtimelog} @@ -126,7 +135,7 @@ def control(queues: Any, traces: Any, args: Any): logger.info('[payload] control thread has finished') -def validate_pre(queues: Any, traces: Any, args: Any): +def validate_pre(queues: namedtuple, traces: Any, args: object): """ Get a Job object from the "payloads" queue and validate it. @@ -135,9 +144,9 @@ def validate_pre(queues: Any, traces: Any, args: Any): If the payload is successfully validated (user defined), the Job object is placed in the "validated_payloads" queue, otherwise it is placed in the "failed_payloads" queue. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc) (object). 
""" while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -160,11 +169,11 @@ def validate_pre(queues: Any, traces: Any, args: Any): logger.info('[payload] validate_pre thread has finished') -def _validate_payload(job: Any) -> bool: +def _validate_payload(job: JobData) -> bool: """ Perform user validation tests for the payload. - :param job: job object (Any) + :param job: job object (JobData) :return: boolean (bool). """ status = True @@ -181,12 +190,12 @@ def _validate_payload(job: Any) -> bool: return status -def get_payload_executor(args: Any, job: Any, out: TextIO, err: TextIO, traces: Any) -> Any: +def get_payload_executor(args: object, job: JobData, out: TextIO, err: TextIO, traces: Any) -> Any: """ Get payload executor function for different payload. - :param args: Pilot arguments object (Any) - :param job: job object (Any) + :param args: Pilot arguments object (object) + :param job: job object (JobData) :param out: stdout file object (TextIO) :param err: stderr file object (TextIO) :param traces: traces object (Any) @@ -202,7 +211,7 @@ def get_payload_executor(args: Any, job: Any, out: TextIO, err: TextIO, traces: return payload_executor -def execute_payloads(queues: Any, traces: Any, args: Any): # noqa: C901 +def execute_payloads(queues: namedtuple, traces: Any, args: object): # noqa: C901 """ Execute queued payloads. @@ -212,9 +221,9 @@ def execute_payloads(queues: Any, traces: Any, args: Any): # noqa: C901 is started, the thread will wait for it to finish and then check for any failures. A successfully completed job is placed in the "finished_payloads" queue, and a failed job will be placed in the "failed_payloads" queue. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object). """ job = None while not args.graceful_stop.is_set(): @@ -385,7 +394,7 @@ def get_rtlogging() -> str: return rtlogging -def get_logging_info(job: Any, args: Any) -> dict: +def get_logging_info(job: JobData, args: object) -> dict: """ Extract the logging type/protocol/url/port from catchall if present, or from args fields. @@ -396,13 +405,13 @@ def get_logging_info(job: Any, args: Any) -> dict: Note: the returned dictionary can be built with either args (has priority) or catchall info. - :param job: job object (Any) - :param args: Pilot arguments object (Any) + :param job: job object (JobData) + :param args: Pilot arguments object (object) :return: info dictionary (logging_type (string), protocol (string), url (string), port (int)) (dict). 
""" info_dic = {} - if not job.realtimelogging: + if not job.realtimelogging and "loggingfile" not in job.infosys.queuedata.catchall: logger.info("job.realtimelogging is not enabled") return {} @@ -410,30 +419,36 @@ def get_logging_info(job: Any, args: Any) -> dict: info_dic['logname'] = args.realtime_logname if args.realtime_logname else "pilot-log" logserver = args.realtime_logging_server if args.realtime_logging_server else "" - pattern = r'(\S+)\;(\S+)\:\/\/(\S+)\:(\d+)' - info = findall(pattern, get_rtlogging()) - + info = findall(r'(\S+)\;(\S+)\:\/\/(\S+)\:(\d+)', get_rtlogging()) if not logserver and not info: - logger.warning('not enough info available for activating real-time logging') + logger.warning(f"not enough info available for activating real-time logging (info='{info}', logserver='{logserver}')") return {} if len(logserver) > 0: - items = logserver.split(':') - info_dic['logging_type'] = items[0].lower() - pattern = r'(\S+)\:\/\/(\S+)' - if len(items) > 2: - _address = findall(pattern, items[1]) - info_dic['port'] = items[2] - else: - _address = None - info_dic['port'] = 24224 - if _address: - info_dic['protocol'] = _address[0][0] - info_dic['url'] = _address[0][1] + if ';' not in logserver: + logger.warning(f'wrong format of logserver: does not contain a \';\' character: {logserver}') + logger.info("correct logserver formal: logging_type;protocol://hostname:port") + return {} + + regex = r"logserver='(?P[^;]+);(?P[^:]+)://(?P[^:]+):(?P\d+)'" + match = search(regex, logserver) + if match: + logging_type = match.group('logging_type') + protocol = match.group('protocol') + hostname = match.group('hostname') + port = match.group('port') + + # Print the extracted values + logger.debug(f"extracted logging_type='{logging_type}', protocol='{protocol}', hostname='{hostname}'," + f"port='{port}' from logserver='{logserver}'") + + info_dic['logging_type'] = logging_type + info_dic['protocol'] = protocol + info_dic['url'] = hostname + info_dic['port'] = port else: - logger.warning(f'protocol/url could not be extracted from {items}') - info_dic['protocol'] = '' - info_dic['url'] = '' + logger.warning(f"no match found in logserver='{logserver}' for pattern=r'{regex}'") + return {} elif info: try: info_dic['logging_type'] = info[0][0] @@ -445,7 +460,7 @@ def get_logging_info(job: Any, args: Any) -> dict: return {} # find the log file to tail - path = find_log_to_tail(job.debug_command, job.workdir, args, job.is_analysis()) + path = find_log_to_tail(job.debug_command, job.workdir, args, job.is_analysis(), job.infosys.queuedata.catchall) logger.info(f'using {path} for real-time logging') info_dic['logfiles'] = [path] @@ -458,14 +473,15 @@ def get_logging_info(job: Any, args: Any) -> dict: return info_dic -def find_log_to_tail(debug_command: str, workdir: str, args: Any, is_analysis: bool) -> str: +def find_log_to_tail(debug_command: str, workdir: str, args: object, is_analysis: bool, catchall: str) -> str: """ Find the log file to tail in the RT logging. :param debug_command: requested debug command (str) :param workdir: job working directory (str) - :param args: Pilot arguments object (Any) + :param args: Pilot arguments object (object) :param is_analysis: True for user jobs, False otherwise (bool) + :param catchall: catchall field from queuedata (str) :return: path to log file (str). 
""" path = "" @@ -488,27 +504,31 @@ def find_log_to_tail(debug_command: str, workdir: str, args: Any, is_analysis: b break counter += 10 + if not path and "loggingfile" in catchall: + # extract the path from the catchall "..,loggingfile=path,.." + _path = findall(r'loggingfile=([^,]+)', catchall) + if _path: + path = _path[0] + logger.debug(f'found path in catchall: {path}') + # fallback to known log file if no other file could be found logf = path if path else config.Payload.payloadstdout - if not path: - if filename: - logger.warning(f'file {filename} was not found for {maxwait} s, using default') - else: - logger.info(f'using {logf} for real-time logging') + if not path and filename: + logger.warning(f'file {filename} was not found for {maxwait} s, using default') return logf -def run_realtimelog(queues: Any, traces: Any, args: Any): # noqa: C901 +def run_realtimelog(queues: namedtuple, traces: Any, args: object): # noqa: C901 """ Validate finished payloads. If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as well but only for log stage-out (in failed_post() below). - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object). """ info_dic = None while not args.graceful_stop.is_set(): @@ -594,11 +614,11 @@ def run_realtimelog(queues: Any, traces: Any, args: Any): # noqa: C901 logger.info('[payload] run_realtimelog thread has finished') -def set_cpu_consumption_time(job: Any): +def set_cpu_consumption_time(job: JobData): """ Set the CPU consumption time. - :param job: job object (Any). + :param job: job object (JobData). """ cpuconsumptiontime = get_cpu_consumption_time(job.t0) job.cpuconsumptiontime = int(round(cpuconsumptiontime)) @@ -607,13 +627,13 @@ def set_cpu_consumption_time(job: Any): logger.info(f'CPU consumption time: {cpuconsumptiontime} {job.cpuconsumptionunit} (rounded to {job.cpuconsumptiontime} {job.cpuconsumptionunit})') -def perform_initial_payload_error_analysis(job: Any, exit_code: int): +def perform_initial_payload_error_analysis(job: JobData, exit_code: int): """ Perform an initial analysis of the payload. Singularity/apptainer errors are caught here. - :param job: job object (Any) + :param job: job object (JobData) :param exit_code: exit code from payload execution (int). """ if exit_code != 0: @@ -748,7 +768,7 @@ def set_error_code_from_stderr(msg: str, fatal: bool) -> int: return exit_code -def validate_post(queues: Any, traces: Any, args: Any): +def validate_post(queues: namedtuple, traces: Any, args: object): """ Validate finished payloads. @@ -757,9 +777,9 @@ def validate_post(queues: Any, traces: Any, args: Any): If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as well but only for log stage-out (in failed_post() below). - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object). 
""" while not args.graceful_stop.is_set(): time.sleep(0.5) @@ -785,7 +805,7 @@ def validate_post(queues: Any, traces: Any, args: Any): logger.info('[payload] validate_post thread has finished') -def failed_post(queues: Any, traces: Any, args: Any): +def failed_post(queues: namedtuple, traces: Any, args: object): """ Handle failed jobs. @@ -794,9 +814,9 @@ def failed_post(queues: Any, traces: Any, args: Any): Get a Job object from the "failed_payloads" queue. Set the pilot state to "stageout" and the stageout field to "log", and add the Job object to the "data_out" queue. - :param queues: internal queues for job handling (Any) + :param queues: internal queues for job handling (namedtuple) :param traces: tuple containing internal pilot states (Any) - :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (Any). + :param args: Pilot arguments object (e.g. containing queue name, queuedata dictionary, etc) (object). """ while not args.graceful_stop.is_set(): time.sleep(0.5) diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index ebff6c7a0..ede9fb60b 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -18,7 +18,7 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2017-2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2021-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2021-24 """Executor module for event service payloads.""" @@ -30,6 +30,7 @@ from pilot.common import exception from pilot.control.payloads import generic from pilot.eventservice.workexecutor.workexecutor import WorkExecutor +from pilot.info import JobData logger = logging.getLogger(__name__) @@ -39,27 +40,27 @@ class Executor(generic.Executor): # only define the __init__ function if it actually does anything - otherwise it can be omitted since the # parent __init__ function will be called automatically - # def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + # def __init__(self, args: Any, job: JobData, out: TextIO, err: TextIO, traces: Any): # """ # Set initial values. # # :param args: args object (Any) - # :param job: job object (Any) + # :param job: job object (JobData) # :param out: stdout file object (TextIO) # :param err: stderr file object (TextIO) # :param traces: traces object (Any). # """ # super().__init__(args, job, out, err, traces) - def run_payload(self, job: Any, cmd: str, out: TextIO, err: TextIO) -> Any: + def run_payload(self, job: JobData, cmd: str, out: TextIO, err: TextIO) -> Any: """ Run the payload for the given job and return the executor. - :param job: job object - :param cmd: (unused in ES mode) - :param out: stdout file object - :param err: stderr file object - :return: executor instance. + :param job: job object (JobData) + :param cmd: (unused in ES mode) command to run (str) + :param out: stdout file object (TextIO) + :param err: stderr file object (TextIO) + :return: executor instance (Any). """ self.pre_setup(job) @@ -119,18 +120,18 @@ def get_executor_type(self) -> dict: This is usually the 'generic' type, which means normal event service. It can also be 'raythena' if specified in the Pilot options. - :return: executor type dictionary. + :return: executor type dictionary (dict). 
""" # executor_type = 'hpo' if job.is_hpo else os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') # return {'executor_type': executor_type} return {"executor_type": os.environ.get("PILOT_ES_EXECUTOR_TYPE", "generic")} - def wait_graceful(self, args: Any, proc: Any) -> int: + def wait_graceful(self, args: object, proc: Any) -> int: """ Wait for the graceful signal bit to be set in the args object. - :param args: args object - :param proc: process + :param args: args object (object) + :param proc: process object (Any) :return: exit code (int). """ t_1 = time.time() diff --git a/pilot/control/payloads/eventservicemerge.py b/pilot/control/payloads/eventservicemerge.py index bd3be12b8..a8f3483bc 100644 --- a/pilot/control/payloads/eventservicemerge.py +++ b/pilot/control/payloads/eventservicemerge.py @@ -18,15 +18,15 @@ # # Authors: # - Wen Guan, wen.guan@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 """Executor module for event service merge payloads.""" import logging import os -from typing import Any # , TextIO from pilot.control.payloads import generic +from pilot.info import JobData from pilot.util.container import execute logger = logging.getLogger(__name__) @@ -37,12 +37,12 @@ class Executor(generic.Executor): # only define the __init__ function if it actually does anything - otherwise it can be omitted since the # parent __init__ function will be called automatically - # def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + # def __init__(self, args: Any, job: JobData, out: TextIO, err: TextIO, traces: Any): # """ # Set initial values. # # :param args: args object (Any) - # :param job: job object (Any) + # :param job: job object (JobData) # :param out: stdout file object (TextIO) # :param err: stderr file object (TextIO) # :param traces: traces object (Any). @@ -62,13 +62,13 @@ def untar_file(self, lfn: str, workdir: str): exit_code, stdout, stderr = execute(command) logger.info(f"exit_code: {exit_code}, stdout: {stdout}, stderr: {stderr}\n") - def utility_before_payload(self, job: Any): + def utility_before_payload(self, job: JobData): """ Run utility functions before payload. Note: this function updates job.jobparams (process_writetofile() call) - :param job: job object. + :param job: job object (JobData). """ logger.info("untar input tar files for eventservicemerge job") for fspec in job.indata: diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 91d98268f..2317ff7b1 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -17,10 +17,10 @@ # under the License. 
# # Authors: -# - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 +# - Mario Lassnig, mario.lassnig@cern.ch, 2016-17 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Tobias Wegner, tobias.wegner@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 # - Wen Guan, wen.guan@cern.ch, 2018 """Executor module for generic payloads.""" @@ -35,24 +35,31 @@ from pilot.common.errorcodes import ErrorCodes from pilot.control.job import send_state +from pilot.info import JobData from pilot.util.auxiliary import set_pilot_state # , show_memory_usage from pilot.util.config import config from pilot.util.container import execute from pilot.util.constants import ( + UTILITY_AFTER_PAYLOAD_FINISHED, + UTILITY_AFTER_PAYLOAD_STARTED, UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, - UTILITY_AFTER_PAYLOAD_STARTED, - UTILITY_AFTER_PAYLOAD_FINISHED, - PILOT_PRE_SETUP, + PILOT_POST_PAYLOAD, PILOT_POST_SETUP, + PILOT_PRE_SETUP, PILOT_PRE_PAYLOAD, - PILOT_POST_PAYLOAD, - UTILITY_AFTER_PAYLOAD_STARTED2, UTILITY_AFTER_PAYLOAD_FINISHED2, + UTILITY_AFTER_PAYLOAD_STARTED2, +) +from pilot.util.filehandling import ( + write_file, + read_file ) -from pilot.util.filehandling import write_file, read_file from pilot.util.processes import kill_processes -from pilot.util.timing import add_to_pilot_timing, get_time_measurement +from pilot.util.timing import ( + add_to_pilot_timing, + get_time_measurement +) from pilot.common.exception import PilotException logger = logging.getLogger(__name__) @@ -62,12 +69,12 @@ class Executor: """Executor class for generic payloads.""" - def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): + def __init__(self, args: object, job: JobData, out: TextIO, err: TextIO, traces: Any): """ Set initial values. - :param args: args object (Any) - :param job: job object (Any) + :param args: args object (object) + :param job: job object (JobData) :param out: stdout file object (TextIO) :param err: stderr file object (TextIO) :param traces: traces object (Any). @@ -85,19 +92,19 @@ def __init__(self, args: Any, job: Any, out: TextIO, err: TextIO, traces: Any): # self.__postprocess_stdout_name = '' # self.__postprocess_stderr_name = '' - def get_job(self): + def get_job(self) -> object: """ Get the job object. - :return: job object. + :return: job object (object). """ return self.__job - def pre_setup(self, job: Any): + def pre_setup(self, job: JobData): """ Run pre setup functions. - :param job: job object (Any). + :param job: job object (JobData). """ # write time stamps to pilot timing file update_time = time.time() @@ -105,12 +112,12 @@ def pre_setup(self, job: Any): logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, update_time, self.__args) - def post_setup(self, job: Any, update_time: bool = None): + def post_setup(self, job: JobData, update_time: bool = None): """ Run post run functions. - :param job: job object - :param update_time: should time stamps be written to timing file? (bool) + :param job: job object (JobData) + :param update_time: should time stamps be written to timing file? (bool). """ # write time stamps to pilot timing file if not update_time: @@ -159,7 +166,7 @@ def improve_post_setup(self): ) self.post_setup(self.__job, update_time=end_setup_time) - def utility_before_payload(self, job: Any) -> str: + def utility_before_payload(self, job: JobData) -> str: """ Prepare commands/utilities to run before payload. 
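Note on the timing calls touched in the generic.py hunks above: pre_setup() and post_setup() simply record wall-clock checkpoints per job id via add_to_pilot_timing(). A minimal, self-contained sketch of that pattern follows, assuming a hypothetical record_checkpoint() helper and an in-memory dict in place of the pilot's timing-file machinery (this is not the pilot API):

    import time

    # Hypothetical stand-in for the pilot timing bookkeeping (add_to_pilot_timing);
    # the real implementation persists the values to a timing file via the args object.
    PILOT_PRE_SETUP = "PILOT_PRE_SETUP"
    PILOT_POST_SETUP = "PILOT_POST_SETUP"

    timings = {}  # {job id: {timing constant: wall-clock timestamp}}

    def record_checkpoint(job_id: str, constant: str, timestamp: float) -> None:
        """Store a wall-clock timestamp for the given job id under a timing constant."""
        timings.setdefault(job_id, {})[constant] = timestamp

    record_checkpoint("1234567", PILOT_PRE_SETUP, time.time())
    time.sleep(0.1)  # payload setup would run here
    record_checkpoint("1234567", PILOT_POST_SETUP, time.time())

    print(f"setup took {timings['1234567'][PILOT_POST_SETUP] - timings['1234567'][PILOT_PRE_SETUP]:.2f} s")
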
@@ -168,7 +175,7 @@ def utility_before_payload(self, job: Any) -> str: REFACTOR - :param job: job object + :param job: job object (JobData) :return: utility command (str). """ cmd = "" @@ -192,13 +199,13 @@ def utility_before_payload(self, job: Any) -> str: return cmd - def utility_with_payload(self, job: Any) -> str: + def utility_with_payload(self, job: JobData) -> str: """ Run functions alongside payload. REFACTOR - :param job: job object. + :param job: job object (JobData) :return: utility command (str). """ cmd = "" @@ -249,11 +256,11 @@ def get_utility_command(self, order: str = "") -> str: return cmd - def utility_after_payload_started(self, job: Any): + def utility_after_payload_started(self, job: JobData): """ Run utility functions after payload started. - :param job: job object (Any). + :param job: job object (JobData). """ # get the payload command from the user specific code pilot_user = os.environ.get("PILOT_USER", "generic").lower() @@ -322,13 +329,13 @@ def utility_after_payload_started(self, job: Any): # else: # logger.info(f'could not extract any pid from ps for cmd={cmd}') - def utility_after_payload_started_new(self, job: Any) -> str: + def utility_after_payload_started_new(self, job: JobData) -> str: """ Run utility functions after payload started. REFACTOR - :param job: job object + :param job: job object (JobData) :return: utility command (str). """ cmd = "" @@ -364,7 +371,7 @@ def utility_after_payload_started_new(self, job: Any) -> str: # # also store the full command in case it needs to be restarted later (by the job_monitor() thread) # job.utilities[cmd_dictionary.get('command')] = [proc, 1, utilitycommand] - def utility_after_payload_finished(self, job: Any, order: str) -> (str, str, bool): + def utility_after_payload_finished(self, job: JobData, order: str) -> (str, str, bool): """ Prepare commands/utilities to run after payload has finished. @@ -372,7 +379,7 @@ def utility_after_payload_finished(self, job: Any, order: str) -> (str, str, boo The order constant can be UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2 - :param job: job object + :param job: job object (JobData) :param order: string constant used for utility selection (str) :return: command (str), label (str), ignore failure (bool). """ @@ -398,12 +405,12 @@ def utility_after_payload_finished(self, job: Any, order: str) -> (str, str, boo ) return cmd, label, ignore_failure - def execute_utility_command(self, cmd: str, job: Any, label: str) -> int: + def execute_utility_command(self, cmd: str, job: JobData, label: str) -> int: """ Execute a utility command (e.g. pre/postprocess commands; label=preprocess etc). :param cmd: full command to be executed (str) - :param job: job object + :param job: job object (JobData) :param label: command label (str) :return: exit code (int). """ @@ -471,13 +478,13 @@ def write_utility_output(self, workdir: str, step: str, stdout: str, stderr: str else: logger.debug(f"wrote {name}") - def pre_payload(self, job: Any): + def pre_payload(self, job: JobData): """ Run functions before payload. E.g. write time stamps to timing file. - :param job: job object. + :param job: job object (JobData). """ # write time stamps to pilot timing file update_time = time.time() @@ -485,13 +492,13 @@ def pre_payload(self, job: Any): logger.debug(f"gmtime is {time.gmtime(update_time)}") add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, update_time, self.__args) - def post_payload(self, job: Any): + def post_payload(self, job: JobData): """ Run functions after payload. E.g. 
write time stamps to timing file. - :param job: job object. + :param job: job object (JobData). """ # write time stamps to pilot timing file update_time = time.time() @@ -546,17 +553,17 @@ def run_command(self, cmd: str, label: str = "") -> Any: return proc - def run_payload(self, job: Any, cmd: str, out: Any, err: Any) -> Any: + def run_payload(self, job: JobData, cmd: str, out: Any, err: Any) -> Any: """ Set up and execute the main payload process. REFACTOR using run_command() - :param job: job object (Any) + :param job: job object (JobData) :param cmd: command (str) - :param out: (currently not used; deprecated) - :param err: (currently not used; deprecated) - :return: proc (subprocess returned by Popen()). + :param out: (currently not used; deprecated) stdout file object (Any) + :param err: (currently not used; deprecated) stderr file object (Any) + :return: proc (subprocess returned by Popen()) (Any). """ # main payload process steps @@ -639,11 +646,11 @@ def cut_str_from_last_semicolon(_cmd: str) -> str: return setup - def wait_graceful(self, args: Any, proc: Any) -> int: + def wait_graceful(self, args: object, proc: Any) -> int: """ Wait for payload process to finish. - :param args: pilot arguments object (Any) + :param args: pilot arguments object (object) :param proc: subprocess object (Any) :return: exit code (int). """ @@ -684,11 +691,11 @@ def wait_graceful(self, args: Any, proc: Any) -> int: return exit_code - def get_payload_command(self, job: Any) -> str: + def get_payload_command(self, job: JobData) -> str: """ Return the payload command string. - :param job: job object (Any) + :param job: job object (JobData) :return: command (str). """ cmd = "" @@ -712,11 +719,11 @@ def get_payload_command(self, job: Any) -> str: return cmd - def run_preprocess(self, job: Any): + def run_preprocess(self, job: JobData): """ Run any preprocess payloads. - :param job: job object (Any) + :param job: job object (JobData) :return: exit code (int) :raises: Exception. """ @@ -764,7 +771,7 @@ def run_preprocess(self, job: Any): return exit_code - def should_verify_setup(self): + def should_verify_setup(self) -> bool: """ Determine if the setup command should be verified. @@ -774,9 +781,10 @@ def should_verify_setup(self): user = __import__( f"pilot.user.{pilot_user}.setup", globals(), locals(), [pilot_user], 0 ) + return user.should_verify_setup(self.__job) - def run(self) -> (int, str): # noqa: C901 + def run(self) -> tuple[int, str]: # noqa: C901 """ Run all payload processes (including pre- and post-processes, and utilities). @@ -801,7 +809,8 @@ def run(self) -> (int, str): # noqa: C901 # should the setup be verified? 
(user defined) verify_setup = self.should_verify_setup() if verify_setup: - logger.debug(f"extracted setup to be verified:\n\n{self.__job.setup}") + logger.info(f"extracted setup to be verified:\n\n{self.__job.setup}") + logger.warning('setup verification will lead to some repeated messages next, before the payload is executed') try: _cmd = self.__job.setup stdout_filename = os.path.join(self.__job.workdir, "setup.stdout") @@ -1095,7 +1104,6 @@ def kill_and_wait_for_process(self, pid: int, user: str, utcmd: str) -> int: logger.warning(f"Error sending signal to/waiting for process {pid}: {exc}") return None - # try: # # Send SIGUSR1 signal to the process # os.kill(pid, sig) diff --git a/pilot/info/basedata.py b/pilot/info/basedata.py index 43a9edcc5..337ffce54 100644 --- a/pilot/info/basedata.py +++ b/pilot/info/basedata.py @@ -138,6 +138,8 @@ def clean_numeric(self, raw: Any, ktype: Any, kname: Any = None, defval: int = 0 if isinstance(raw, str): raw = raw.strip() + if raw.upper() == "NULL": # Handle "NULL" as a special case + return defval try: return ktype(raw) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index a760341d4..f936dbbf1 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -33,21 +33,28 @@ :date: February 2018 """ +import ast +import logging import os import re -import ast import shlex -import pipes +from json import dumps from time import sleep +from typing import Any -from .basedata import BaseData -from .filespec import FileSpec -from pilot.util.auxiliary import get_object_size, get_key_value +from pilot.util.auxiliary import ( + get_object_size, + get_key_value +) from pilot.util.constants import LOG_TRANSFER_NOT_DONE -from pilot.util.filehandling import get_guid, get_valid_path_from_list +from pilot.util.filehandling import ( + get_guid, + get_valid_path_from_list +) from pilot.util.timing import get_elapsed_real_time +from .basedata import BaseData +from .filespec import FileSpec -import logging logger = logging.getLogger(__name__) @@ -139,36 +146,37 @@ class JobData(BaseData): usecontainer = False # boolean, True if a container is to be used for the payload # from job definition - attemptnr = 0 # job attempt number - destinationdblock = "" ## to be moved to FileSpec (job.outdata) - datasetin = "" ## TO BE DEPRECATED: moved to FileSpec (job.indata) - debug = False # debug mode, when True, pilot will send debug info back to the server - debug_command = '' # debug command (can be defined on the task side) - produserid = "" # the user DN (added to trace report) - jobdefinitionid = "" # the job definition id (added to trace report) - infilesguids = "" # - indata = [] # list of `FileSpec` objects for input files (aggregated inFiles, ddmEndPointIn, scopeIn, filesizeIn, etc) - outdata = [] # list of `FileSpec` objects for output files - logdata = [] # list of `FileSpec` objects for log file(s) + attemptnr = 0 # job attempt number + destinationdblock = "" ## to be moved to FileSpec (job.outdata) + datasetin = "" ## TO BE DEPRECATED: moved to FileSpec (job.indata) + debug = False # debug mode, when True, pilot will send debug info back to the server + debug_command = '' # debug command (can be defined on the task side) + produserid = "" # the user DN (added to trace report) + jobdefinitionid = "" # the job definition id (added to trace report) + infilesguids = "" # guids for input files + indata = [] # list of `FileSpec` objects for input files (aggregated inFiles, ddmEndPointIn, scopeIn, filesizeIn, etc) + outdata = [] # list of `FileSpec` objects for 
output files + logdata = [] # list of `FileSpec` objects for log file(s) # preprocess = {u'args': u'preprocess', u'command': u'echo'} # postprocess = {u'args': u'postprocess', u'command': u'echo'} - preprocess = {} # preprocess dictionary with command to execute before payload, {'command': '..', 'args': '..'} - postprocess = {} # postprocess dictionary with command to execute after payload, {'command': '..', 'args': '..'} - coprocess = {} # coprocess dictionary with command to execute during payload, {'command': '..', 'args': '..'} + preprocess = {} # preprocess dictionary with command to execute before payload, {'command': '..', 'args': '..'} + postprocess = {} # postprocess dictionary with command to execute after payload, {'command': '..', 'args': '..'} + coprocess = {} # coprocess dictionary with command to execute during payload, {'command': '..', 'args': '..'} # coprocess = {u'args': u'coprocess', u'command': u'echo'} containeroptions = {} # - use_vp = False # True for VP jobs - maxwalltime = 0 # maxWalltime in s - dask_scheduler_ip = '' # enhanced job definition for Dask jobs + use_vp = False # True for VP jobs + maxwalltime = 0 # maxWalltime in s + dask_scheduler_ip = '' # enhanced job definition for Dask jobs jupyter_session_ip = '' # enhanced job definition for Dask jobs - + minramcount = 0 # minimum number of RAM required by the payload + altstageout = None # alternative stage-out method, on, off, force # home package string with additional payload release information; does not need to be added to # the conversion function since it's already lower case - homepackage = "" # - jobsetid = "" # job set id - noexecstrcnv = None # server instruction to the pilot if it should take payload setup from job parameters - swrelease = "" # software release string - writetofile = "" # + homepackage = "" # home package for TRF + jobsetid = "" # job set id + noexecstrcnv = None # server instruction to the pilot if it should take payload setup from job parameters + swrelease = "" # software release string + writetofile = "" # # cmtconfig encoded info alrbuserplatform = "" # ALRB_USER_PLATFORM encoded in platform/cmtconfig value @@ -179,7 +187,7 @@ class JobData(BaseData): # specify the type of attributes for proper data validation and casting _keys = {int: ['corecount', 'piloterrorcode', 'transexitcode', 'exitcode', 'cpuconversionfactor', 'exeerrorcode', 'attemptnr', 'nevents', 'neventsw', 'pid', 'cpuconsumptiontime', 'maxcpucount', 'actualcorecount', - 'requestid', 'maxwalltime'], + 'requestid', 'maxwalltime', 'minramcount'], str: ['jobid', 'taskid', 'jobparams', 'transformation', 'destinationdblock', 'exeerrordiag' 'state', 'serverstate', 'workdir', 'stageout', 'platform', 'piloterrordiag', 'exitmsg', 'produserid', 'jobdefinitionid', 'writetofile', @@ -187,7 +195,7 @@ class JobData(BaseData): 'swrelease', 'zipmap', 'imagename', 'imagename_jobdef', 'accessmode', 'transfertype', 'datasetin', ## TO BE DEPRECATED: moved to FileSpec (job.indata) 'infilesguids', 'memorymonitor', 'allownooutput', 'pandasecrets', 'prodproxy', 'alrbuserplatform', - 'debug_command', 'dask_scheduler_ip', 'jupyter_session_ip'], + 'debug_command', 'dask_scheduler_ip', 'jupyter_session_ip', 'altstageout'], list: ['piloterrorcodes', 'piloterrordiags', 'workdirsizes', 'zombies', 'corecounts', 'subprocesses', 'logdata', 'outdata', 'indata'], dict: ['status', 'fileinfo', 'metadata', 'utilities', 'overwrite_queuedata', 'sizes', 'preprocess', @@ -196,22 +204,26 @@ class JobData(BaseData): 'use_vp', 'looping_check'] } - def 
__init__(self, data, use_kmap=True): - """ - :param data: input dictionary of data settings + def __init__(self, data: dict, use_kmap: bool = True): """ + Initialize JobData object. + :param data: input dictionary of data settings (dict) + :param use_kmap: use kmap for data conversion (bool). + """ self.infosys = None # reference to Job specific InfoService instance self._rawdata = data self.load(data, use_kmap=use_kmap) # for native HPO pilot support - if self.is_hpo and False: - self.is_eventservice = True + # if self.is_hpo: + # self.is_eventservice = True - def init(self, infosys): + def init(self, infosys: Any): """ - :param infosys: infosys object + Initialize JobData object with InfoService instance. + + :param infosys: infosys object (Any). """ self.infosys = infosys self.indata = self.prepare_infiles(self._rawdata) @@ -241,16 +253,17 @@ def init(self, infosys): #if image_base and not os.path.isabs(self.imagename) and not self.imagename.startswith('docker'): # self.imagename = os.path.join(image_base, self.imagename) - def prepare_infiles(self, data): - """ - Construct FileSpec objects for input files from raw dict `data` - :return: list of validated `FileSpec` objects + def prepare_infiles(self, data: dict) -> list: """ + Construct FileSpec objects for input files from raw dict `data`. + :param data: input dictionary of data settings (dict) + :return: list of validated `FileSpec` objects. + """ # direct access handling self.set_accessmode() - access_keys = ['allow_lan', 'allow_wan', 'direct_access_lan', 'direct_access_wan'] + access_keys = {'allow_lan', 'allow_wan', 'direct_access_lan', 'direct_access_wan'} if not self.infosys or not self.infosys.queuedata: self.show_access_settings(access_keys) @@ -260,7 +273,7 @@ def prepare_infiles(self, data): ksources = dict([item, self.clean_listdata(data.get(item, ''), list, item, [])] for item in list(kmap.values())) ret, lfns = [], set() for ind, lfn in enumerate(ksources.get('inFiles', [])): - if lfn in ['', 'NULL'] or lfn in lfns: # exclude null data and duplicates + if lfn in {'', 'NULL'} or lfn in lfns: # exclude null data and duplicates continue lfns.add(lfn) idat = {} @@ -289,11 +302,7 @@ def prepare_infiles(self, data): return ret def set_accessmode(self): - """ - Set the accessmode field using jobparams. - - :return: - """ + """Set the accessmode field using jobparams.""" self.accessmode = None if '--accessmode=direct' in self.jobparams: self.accessmode = 'direct' @@ -301,19 +310,18 @@ def set_accessmode(self): self.accessmode = 'copy' @staticmethod - def show_access_settings(access_keys): + def show_access_settings(access_keys: list): """ Show access settings for the case job.infosys.queuedata is not initialized. :param access_keys: list of access keys (list). - :return: """ dat = dict([item, getattr(FileSpec, item, None)] for item in access_keys) msg = ', '.join([f"{item}={value}" for item, value in sorted(dat.items())]) logger.info(f'job.infosys.queuedata is not initialized: the following access settings will be used by default: {msg}') @staticmethod - def get_kmap(): + def get_kmap() -> dict: """ Return the kmap dictionary for server data to pilot conversions. @@ -333,17 +341,17 @@ def get_kmap(): return kmap - def prepare_outfiles(self, data): + def prepare_outfiles(self, data: dict) -> tuple: """ - Construct validated FileSpec objects for output and log files from raw dict `data` + Construct validated FileSpec objects for output and log files from raw dict `data`. 
+ Note: final preparation for output files can only be done after the payload has finished in case the payload has produced a job report with e.g. output file guids. For ATLAS, this is verified in pilot/user/atlas/diagnose/process_job_report(). - :param data: - :return: (list of `FileSpec` for output, list of `FileSpec` for log) + :param data: input dictionary of data settings (dict) + :return: (list of `FileSpec` for output, list of `FileSpec` for log) (tuple). """ - # form raw list data from input comma-separated values for further validataion by FileSpec kmap = { # 'internal_name': 'ext_key_structure' @@ -383,23 +391,23 @@ def prepare_outfiles(self, data): return self._get_all_output(ksources, kmap, log_lfn, data) - def _get_all_output(self, ksources, kmap, log_lfn, data): + def _get_all_output(self, ksources: dict, kmap: dict, log_lfn: str, data: dict) -> tuple: """ Create lists of FileSpecs for output + log files. + Helper function for prepare_output(). - :param ksources: - :param kmap: - :param log_lfn: log file name (string). - :param data: - :return: ret_output (list of FileSpec), ret_log (list of FileSpec) + :param ksources: dictionary of sources (dict) + :param kmap: dictionary of mappings (dict) + :param log_lfn: log file name (str) + :param data: input dictionary of data settings (dict) + :return: ret_output (list of FileSpec), ret_log (list of FileSpec). """ - ret_output, ret_log = [], [] lfns = set() for ind, lfn in enumerate(ksources['outFiles']): - if lfn in ['', 'NULL'] or lfn in lfns: # exclude null data and duplicates + if lfn in {'', 'NULL'} or lfn in lfns: # exclude null data and duplicates continue lfns.add(lfn) idat = {} @@ -420,12 +428,16 @@ def _get_all_output(self, ksources, kmap, log_lfn, data): return ret_output, ret_log - def __getitem__(self, key): - """ - Temporary Integration function to keep dict-based access for old logic in compatible way - TO BE REMOVED ONCE all fields will be moved to Job object attributes + def __getitem__(self, key: str): """ + Return the value of the given key. + Temporary Integration function to keep dict-based access for old logic in compatible way + TO BE REMOVED ONCE all fields will be moved to Job object attributes + + :param key: key (str) + :return: value (Any). + """ if key == 'infosys': return self.infosys @@ -436,34 +448,48 @@ def __getitem__(self, key): def __setitem__(self, key, val): """ - Temporary Integration function to keep dict-based access for old logic in compatible way - TO BE REMOVED ONCE all fields will be moved to Job object attributes - """ + Set the value of the given key. - self._rawdata[key] = val + Temporary Integration function to keep dict-based access for old logic in compatible way + TO BE REMOVED ONCE all fields will be moved to Job object attributes. - def __contains__(self, key): + :param key: key (str) + :param val: value (Any). """ - Temporary Integration function to keep dict-based access for old logic in compatible way - TO BE REMOVED ONCE all fields will be moved to Job object attributes + self._rawdata[key] = val + + def __contains__(self, key: str) -> bool: """ + Check if the key is in the raw data. - return key in self._rawdata + Temporary Integration function to keep dict-based access for old logic in compatible way + TO BE REMOVED ONCE all fields will be moved to Job object attributes - def get(self, key, defval=None): + :param key: key (str) + :return: boolean. 
""" - Temporary Integration function to keep dict-based access for old logic in compatible way - TO BE REMOVED ONCE all fields will be moved to Job object attributes + return key in self._rawdata + + def get(self, key: str, defval: Any = None): """ + Return the value of the given key. - return self._rawdata.get(key, defval) + Temporary Integration function to keep dict-based access for old logic in compatible way + TO BE REMOVED ONCE all fields will be moved to Job object attributes - def load(self, data, use_kmap=True): + :param key: key (str) + :param defval: default value (Any + :return: value (Any). """ - Construct and initialize data from ext source - :param data: input dictionary of job data settings + return self._rawdata.get(key, defval) + + def load(self, data: dict, use_kmap: bool = True): """ + Construct and initialize data from ext source. + :param data: input dictionary of job data settings (dict) + :param use_kmap: use kmap for data conversion (bool). + """ ## the translation map of the container attributes from external data to internal schema ## 'internal_name':('ext_name1', 'extname2_if_any') ## 'internal_name2':'ext_name3' @@ -504,63 +530,57 @@ def load(self, data, use_kmap=True): 'requestid': 'reqID', 'maxwalltime': 'maxWalltime', 'dask_scheduler_ip': 'scheduler_ip', - 'jupyter_session_ip': 'session_ip' + 'jupyter_session_ip': 'session_ip', + 'minramcount': 'minRamCount', + 'altstageout': 'altStageOut' } if use_kmap else {} self._load_data(data, kmap) - def is_analysis(self): ## if it's experiment specific logic then it could be isolated into extended JobDataATLAS class - """ - Determine whether the job is an analysis user job or not. - :return: True in case of user analysis job + def is_analysis(self) -> bool: ## if it's experiment specific logic then it could be isolated into extended JobDataATLAS class """ + Determine whether the job is an analysis user job or not. - is_analysis = self.transformation.startswith('https://') or self.transformation.startswith('http://') - - # apply addons checks later if need - - return is_analysis + :return: True in case of user analysis job (bool). + """ + return self.transformation.startswith('https://') or self.transformation.startswith('http://') - def is_build_job(self): + def is_build_job(self) -> bool: """ Check if the job is a build job. + (i.e. check if the job has an output file that is a lib file). - :return: boolean + :return: boolean. """ + return any('.lib.' in fspec.lfn and '.log.' not in fspec.lfn for fspec in self.outdata) - for fspec in self.outdata: - if '.lib.' in fspec.lfn and '.log.' not in fspec.lfn: - return True + def is_local(self) -> bool: + """ + Check if the input files should be accessed locally. - return False + Confusing function, since it does not consider real status of applied transfer, TOBE DEPRECATED, use `has_remoteio()` instead - def is_local(self): ## confusing function, since it does not consider real status of applied transfer, TOBE DEPRECATED, use `has_remoteio()` instead of - """ - Should the input files be accessed locally? Note: all input files will have storage_token set to local in that case. :return: boolean. """ + return any(fspec.storage_token == 'local' and '.lib.' not in fspec.lfn for fspec in self.indata) - for fspec in self.indata: - if fspec.storage_token == 'local' and '.lib.' not in fspec.lfn: - return True - - def has_remoteio(self): - """ - Check status of input file transfers and determine either direct access mode will be used or not. 
- :return: True if at least one file should use direct access mode + def has_remoteio(self) -> bool: """ + Check status of input file transfers and determine if direct access mode will be used or not. - return any([fspec.status == 'remote_io' for fspec in self.indata]) + :return: True if at least one file should use direct access mode (bool). + """ + return any(fspec.status == 'remote_io' for fspec in self.indata) def clean(self): """ - Validate and finally clean up required data values (object properties) if need - :return: None - """ + Validate and finally clean up required data values (object properties) if needed. + Not used. + """ pass ## custom function pattern to apply extra validation to the key values @@ -570,11 +590,14 @@ def clean(self): ## ## return value - def clean__corecount(self, raw, value): - """ - Verify and validate value for the corecount key (set to 1 if not set) + def clean__corecount(self, raw: Any, value: int) -> Any: """ + Verify and validate value for the corecount key (set to 1 if not set). + :param raw: (unused) (Any) + :param value: core count (int) + :return: updated core count (int). + """ # note: experiment specific # Overwrite the corecount value with ATHENA_PROC_NUMBER if it is set @@ -587,16 +610,16 @@ def clean__corecount(self, raw, value): return value if value else 1 - def clean__platform(self, raw, value): + def clean__platform(self, raw: Any, value: str) -> str: """ Verify and validate value for the platform key. + Set the alrbuserplatform value if encoded in platform/cmtconfig string. - :param raw: (unused). - :param value: platform (string). - :return: updated platform (string). + :param raw: (unused) (Any) + :param value: platform (str) + :return: updated platform (str). """ - v = value if value.lower() not in ['null', 'none'] else '' # handle encoded alrbuserplatform in cmtconfig/platform string if '@' in v: @@ -605,18 +628,18 @@ def clean__platform(self, raw, value): return v - def clean__jobparams(self, raw, value): + def clean__jobparams(self, raw: Any, value: str) -> str: """ - Verify and validate value for the jobparams key + Verify and validate value for the jobparams key. + Extract value from jobparams not related to job options. The function will in particular extract and remove --overwriteQueueData, ZIP_MAP and --containerimage. It will remove the old Pilot 1 option --overwriteQueuedata which should be replaced with --overwriteQueueData. - :param raw: (unused). - :param value: job parameters (string). - :return: updated job parameters (string). + :param raw: (unused) (Any) + :param value: job parameters (str) + :return: updated job parameters (str). """ - # value += ' --athenaopts "HITtoRDO:--nprocs=$ATHENA_CORE_NUMBER" someblah' logger.info(f'cleaning jobparams: {value}') @@ -665,14 +688,13 @@ def clean__jobparams(self, raw, value): return ret - def extract_container_image(self, jobparams): + def extract_container_image(self, jobparams: str) -> tuple: """ Extract the container image from the job parameters if present, and remove it. - :param jobparams: job parameters (string). - :return: updated job parameters (string), extracted image name (string). + :param jobparams: job parameters (str) + :return: string with updated job parameters, string with extracted image name (tuple). 
""" - imagename = "" # define regexp pattern for the full container image option @@ -702,25 +724,25 @@ def extract_container_image(self, jobparams): return jobparams, imagename @classmethod - def parse_args(self, data, options, remove=False): - """ - Extract option/values from string containing command line options (arguments) - :param data: input command line arguments (raw string) - :param options: dict of option names to be considered: (name, type), type is a cast function to be applied with result value - :param remove: boolean, if True then exclude specified options from returned raw string of command line arguments - :return: tuple: (dict of extracted options, raw string of final command line options) + def parse_args(cls, data: str, options: dict, remove: bool = False) -> tuple: """ + Extract option/values from string containing command line options (arguments). + :param data: input command line arguments (str) + :param options: dict of option names to be considered: (name, type), type is a cast function to be applied with result value (dict) + :param remove: boolean, if True then exclude specified options from returned raw string of command line arguments (bool) + :return: Dict of extracted options, raw string of final command line options (tuple). + """ logger.debug(f'extract options={list(options.keys())} from data={data}') if not options: return {}, data - opts, pargs = self.get_opts_pargs(data) + opts, pargs = cls.get_opts_pargs(data) if not opts: return {}, data - ret = self.get_ret(options, opts) + ret = cls.get_ret(options, opts) ## serialize parameters back to string rawdata = data @@ -734,24 +756,23 @@ def parse_args(self, data, options, remove=False): final_args.extend(arg) else: final_args.append(arg) - rawdata = " ".join(pipes.quote(e) for e in final_args) + rawdata = " ".join(shlex.quote(e) for e in final_args) return ret, rawdata @staticmethod - def get_opts_pargs(data): + def get_opts_pargs(data: str) -> tuple[dict, list]: """ Get the opts and pargs variables. - :param data: input command line arguments (raw string) - :return: opts (dict), pargs (list) + :param data: input command line arguments (str) + :return: opts dict, pargs list (tuple). """ - try: args = shlex.split(data) except ValueError as exc: logger.error(f'Failed to parse input arguments from data={data}, error={exc} .. skipped.') - return {}, data + return {}, [] opts, curopt, pargs = {}, None, [] for arg in args: @@ -773,15 +794,14 @@ def get_opts_pargs(data): return opts, pargs @staticmethod - def get_ret(options, opts): + def get_ret(options: dict, opts: dict): """ Get the ret variable from the options. - :param options: - :param opts: + :param options: dict of option names to be considered: (name, type) (dict) + :param opts: dict of extracted options (dict) :return: ret (dict). """ - ret = {} for opt, fcast in list(options.items()): val = opts.get(opt) @@ -794,15 +814,14 @@ def get_ret(options, opts): return ret - def add_workdir_size(self, workdir_size): + def add_workdir_size(self, workdir_size: int): """ Add a measured workdir size to the workdirsizes field. + The function will deduce any input and output file sizes from the workdir size. :param workdir_size: workdir size (int). 
- :return: """ - if not isinstance(workdir_size, int): try: workdir_size = int(workdir_size) @@ -826,8 +845,7 @@ def add_workdir_size(self, workdir_size): continue pfn = os.path.join(self.workdir, fspec.lfn) if not os.path.isfile(pfn): - msg = f"pfn file={pfn} does not exist (skip from workdir size calculation)" - logger.info(msg) + logger.info(f"pfn file={pfn} does not exist (skip from workdir size calculation)") else: total_size += os.path.getsize(pfn) @@ -836,15 +854,14 @@ def add_workdir_size(self, workdir_size): self.workdirsizes.append(workdir_size) - def get_max_workdir_size(self): + def get_max_workdir_size(self) -> int: """ Return the maximum disk space used by the payload. :return: workdir size (int). """ - maxdirsize = 0 - if self.workdirsizes != []: + if self.workdirsizes: # Get the maximum value from the list maxdirsize = max(self.workdirsizes) else: @@ -852,13 +869,12 @@ def get_max_workdir_size(self): return maxdirsize - def get_lfns_and_guids(self): + def get_lfns_and_guids(self) -> tuple[list, list]: """ Return ordered lists with the input file LFNs and GUIDs. - :return: list of input files, list of corresponding GUIDs. + :return: list of input files, list of corresponding GUIDs (tuple). """ - lfns = [] guids = [] @@ -868,17 +884,16 @@ def get_lfns_and_guids(self): return lfns, guids - def get_status(self, key): + def get_status(self, key: str) -> str: """ Return the value for the given key (e.g. LOG_TRANSFER) from the status dictionary. LOG_TRANSFER_NOT_DONE is returned if job object is not defined for key='LOG_TRANSFER'. If no key is found, None will be returned. - :param key: key name (string). - :return: corresponding key value in job.status dictionary (string). + :param key: key name (str) + :return: corresponding key value in job.status dictionary (str). """ - log_transfer = self.status.get(key, None) if not log_transfer: @@ -887,21 +902,27 @@ def get_status(self, key): return log_transfer - def get_job_option_for_input_name(self, input_name): + def get_job_option_for_input_name(self, input_name: str) -> str or None: """ + Get the job option for the given input name. + Expecting something like --inputHitsFile=@input_name in jobparams. - :returns: job_option such as --inputHitsFile + :param input_name: input name (str) + :return: job_option such as --inputHitsFile (str). """ job_options = self.jobparams.split(' ') input_name_option = f'=@{input_name}' for job_option in job_options: if input_name_option in job_option: return job_option.split("=")[0] + return None def process_writetofile(self): """ + Process the writetofile field. + Expecting writetofile from the job definition. 
The format is 'inputFor_file1:lfn1,lfn2^inputFor_file2:lfn3,lfn4' @@ -918,19 +939,20 @@ def process_writetofile(self): logger.error(f"writeToFile doesn't have the correct format, expecting a separator \':\' for {fileinfo}") if writetofile_dictionary: - for input_name in writetofile_dictionary: + for input_name, input_files in writetofile_dictionary.items(): input_name_new = input_name + '.txt' input_name_full = os.path.join(self.workdir, input_name_new) - f = open(input_name_full, 'w') - job_option = self.get_job_option_for_input_name(input_name) - if not job_option: - logger.error("unknown job option format, expected job options such as \'--inputHitsFile\' for input file: {input_name}") - else: - f.write(f"{job_option}\n") - for input_file in writetofile_dictionary[input_name]: - f.write(f"{input_file}\n") - f.close() - logger.info(f"wrote input file list to file {input_name_full}: {writetofile_dictionary[input_name]}") + + with open(input_name_full, 'w', encoding='utf-8') as f: + job_option = self.get_job_option_for_input_name(input_name) + if not job_option: + logger.error("unknown job option format, " + "expected job options such as \'--inputHitsFile\' for input file: {input_name}") + else: + f.write(f"{job_option}\n") + for input_file in input_files: + f.write(f"{input_file}\n") + logger.info(f"wrote input file list to file {input_name_full}: {input_files}") self.jobparams = self.jobparams.replace(input_name, input_name_new) if job_option: @@ -938,15 +960,14 @@ def process_writetofile(self): self.jobparams = self.jobparams.replace('--autoConfiguration=everything', '') logger.info(f"jobparams after processing writeToFile: {self.jobparams}") - def add_size(self, size): + def add_size(self, size: int): """ Add a size measurement to the sizes field at the current time stamp. + A size measurement is in Bytes. :param size: size of object in Bytes (int). - :return: """ - # is t0 set? if not, set it if not self.t0: self.t0 = os.times() @@ -957,81 +978,111 @@ def add_size(self, size): # add a data point to the sizes dictionary self.sizes[time_stamp] = size - def get_size(self): + def get_size(self) -> int: """ Determine the size (B) of the job object. :return: size (int). """ - # protect against the case where the object changes size during calculation (rare) try: self.currentsize = get_object_size(self) except Exception: pass + return self.currentsize - def collect_zombies(self, depth=None): - """ - Collect zombie child processes, depth is the max number of loops, plus 1, - to avoid infinite looping even if some child processes really get wedged; - depth=None means it will keep going until all child zombies have been collected. +# def collect_zombies(self, depth: int = None): +# """ +# Collect zombie child processes. +# +# Depth is the max number of loops, plus 1, to avoid infinite looping even if some child processes get really +# wedged; depth=None means it will keep going until all child zombies have been collected. +# +# :param depth: max depth (int). 
+# """ +# sleep(1) +# +# if self.zombies and depth > 1: +# logger.info(f"--- collectZombieJob: --- {depth}, {self.zombies}") +# depth -= 1 +# for zombie in self.zombies: +# try: +# logger.info(f"zombie collector waiting for pid {zombie}") +# _id, _ = os.waitpid(zombie, os.WNOHANG) +# except OSError as exc: +# logger.info(f"harmless exception when collecting zombies: {exc}") +# self.zombies.remove(zombie) +# else: +# if _id: # finished +# self.zombies.remove(zombie) +# self.collect_zombies(depth=depth) # recursion +# +# if self.zombies and not depth: +# # for the infinite waiting case, we have to use blocked waiting, otherwise it throws +# # RuntimeError: maximum recursion depth exceeded +# for zombie in self.zombies: +# try: +# _id, _ = os.waitpid(zombie, 0) +# except OSError as exc: +# logger.info(f"harmless exception when collecting zombie jobs: {exc}") +# self.zombies.remove(zombie) +# else: +# if _id: # finished +# self.zombies.remove(zombie) +# self.collect_zombies(depth=depth) # recursion + + def collect_zombies(self, depth: int = None): + """ + Collect zombie child processes. + + Depth is the max number of loops, plus 1, to avoid infinite looping even if some child processes get really + wedged; depth=None means it will keep going until all child zombies have been collected. :param depth: max depth (int). - :return: """ - sleep(1) - if self.zombies and depth > 1: - logger.info(f"--- collectZombieJob: --- {depth}, {self.zombies}") - depth -= 1 + current_depth = depth + while self.zombies and (current_depth is None or current_depth > 0): + if current_depth: + logger.info(f"--- collectZombieJob: --- {current_depth}, {self.zombies}") + current_depth -= 1 + + zombies_to_remove = [] for zombie in self.zombies: try: logger.info(f"zombie collector waiting for pid {zombie}") - _id, _ = os.waitpid(zombie, os.WNOHANG) + _id, _ = os.waitpid(zombie, os.WNOHANG if current_depth else 0) except OSError as exc: logger.info(f"harmless exception when collecting zombies: {exc}") - self.zombies.remove(zombie) + zombies_to_remove.append(zombie) else: if _id: # finished - self.zombies.remove(zombie) - self.collect_zombies(depth=depth) # recursion + zombies_to_remove.append(zombie) - if self.zombies and not depth: - # for the infinite waiting case, we have to use blocked waiting, otherwise it throws - # RuntimeError: maximum recursion depth exceeded - for zombie in self.zombies: - try: - _id, _ = os.waitpid(zombie, 0) - except OSError as exc: - logger.info(f"harmless exception when collecting zombie jobs: {exc}") - self.zombies.remove(zombie) - else: - if _id: # finished - self.zombies.remove(zombie) - self.collect_zombies(depth=depth) # recursion + # Remove collected zombies from the list + for zombie in zombies_to_remove: + self.zombies.remove(zombie) - def only_copy_to_scratch(self): ## TO BE DEPRECATED, use `has_remoteio()` instead of + if current_depth == 0: + break + + def only_copy_to_scratch(self) -> bool: ## TO BE DEPRECATED, use `has_remoteio()` instead of """ Determine if the payload only has copy-to-scratch input. + In this case, there should be no --usePFCTurl or --directIn in the job parameters. - :return: True if only copy-to-scratch. False if at least one file should use direct access mode + :return: True if only copy-to-scratch. 
False if at least one file should use direct access mode (bool) """ - - for fspec in self.indata: - if fspec.status == 'remote_io': - return False - - return True + return not any(fspec.status == 'remote_io' for fspec in self.indata) + # for fspec in self.indata: + # if fspec.status == 'remote_io': + # return False def reset_errors(self): # temporary fix, make sure all queues are empty before starting new job - """ - - :return: - """ - + """Reset error codes and messages.""" self.piloterrorcode = 0 self.piloterrorcodes = [] self.piloterrordiag = "" @@ -1045,9 +1096,5 @@ def reset_errors(self): # temporary fix, make sure all queues are empty before self.subprocesses = [] def to_json(self): - """ - Convert class to dictionary. - """ - - from json import dumps + """Convert class to dictionary.""" return dumps(self, default=lambda par: par.__dict__) diff --git a/pilot/info/jobinfo.py b/pilot/info/jobinfo.py index af9562c96..1b557eb4a 100644 --- a/pilot/info/jobinfo.py +++ b/pilot/info/jobinfo.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-2024 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-24 """ @@ -29,41 +29,47 @@ :date: January 2018 """ +from typing import Any + import logging logger = logging.getLogger(__name__) class JobInfoProvider: """ - Job info provider which is used to extract settings specific for given Job - and overwrite general configuration used by Information Service + Job info provider used to extract settings specific for a given job + and to overwrite the general configuration used by the Information Service. """ - job = None ## Job instance + job = None # Job instance - def __init__(self, job): - self.job = job + def __init__(self, job: Any): + """ + Initialize JobInfoProvider with Job instance. - def resolve_schedconf_sources(self): + :param job: Job object (Any). """ - Resolve Job specific prioritized list of source names to be used for SchedConfig data load - :return: prioritized list of source names + self.job = job + + def resolve_schedconf_sources(self) -> None: """ + Resolve Job specific prioritized list of source names to be used for SchedConfig data load + :return: prioritized list of source names (None if not implemented yet) + """ ## FIX ME LATER ## quick stub implementation: extract later from jobParams, e.g. from overwriteAGISData.. ## an example of return data: ## return ['AGIS', 'LOCAL', 'CVMFS'] ## - return None ## Not implemented yet - def resolve_queuedata(self, pandaqueue, **kwargs): - """ - Resolve Job specific settings for queue data (overwriteQueueData) - :return: dict of settings for given PandaQueue as a key + def resolve_queuedata(self, pandaqueue: str, **kwargs: dict) -> dict: """ + Resolve Job specific settings for queue data (overwriteQueueData) + :return: Dictionary of settings for given PandaQueue as a key (dict). 
+ """ # use following keys from job definition # keys format: [(inputkey, outputkey), inputkey2] # outputkey is the name of external source attribute @@ -80,15 +86,15 @@ def resolve_queuedata(self, pandaqueue, **kwargs): data[okey] = val data.update(self.job.overwrite_queuedata) ## use job.overwrite_queuedata as a master source - logger.info(f'queuedata: following keys will be overwritten by Job values: {data}') return {pandaqueue: data} def resolve_storage_data(self, ddmendpoints: list = None, **kwargs: dict) -> dict: """ - Resolve Job specific settings for storage data (including data passed via --overwriteStorageData) - :return: dict of settings for requested DDMEndpoints with ddmendpoin as a key + Resolve Job specific settings for storage data (including data passed via --overwriteStorageData) + + :return: dict of settings for requested DDMEndpoints with ddmendpoin as a key """ if ddmendpoints is None: ddmendpoints = [] @@ -96,10 +102,7 @@ def resolve_storage_data(self, ddmendpoints: list = None, **kwargs: dict) -> dic ## use job.overwrite_storagedata as a master source master_data = self.job.overwrite_storagedata or {} - try: - data.update((k, v) for k, v in master_data.iteritems() if k in set(ddmendpoints or master_data) & set(master_data)) # Python 2 - except Exception: - data.update((k, v) for k, v in list(master_data.items()) if k in set(ddmendpoints or master_data) & set(master_data)) # Python 3 + data.update((k, v) for k, v in list(master_data.items()) if k in set(ddmendpoints or master_data) & set(master_data)) if data: logger.info(f'storagedata: following data extracted from Job definition will be used: {data}') diff --git a/pilot/info/jobinfoservice.py b/pilot/info/jobinfoservice.py deleted file mode 100644 index ba7cb0bc6..000000000 --- a/pilot/info/jobinfoservice.py +++ /dev/null @@ -1,48 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Authors: -# - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2023 - -""" -Job specific Info Service -It could customize/overwrite settings provided by the main Info Service - -:author: Alexey Anisenkov -:contact: anisyonk@cern.ch -:date: January 2018 -""" - -from .infoservice import InfoService -from .jobinfo import JobInfoProvider - -import logging -logger = logging.getLogger(__name__) - - -class JobInfoService(InfoService): ## TO BE DEPRECATED/REMOVED - """ - Info service: Job specific - Job could overwrite settings provided by Info Service - - *** KEPT for a while in repo .. 
most probably will be deprecated and removed soon ** - """ - - def __init__(self, job): - - self.jobinfo = JobInfoProvider(job) diff --git a/pilot/info/queuedata.py b/pilot/info/queuedata.py index 5e89075c1..3e3ee1b76 100644 --- a/pilot/info/queuedata.py +++ b/pilot/info/queuedata.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018-19 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-24 """ @@ -37,17 +37,18 @@ :date: January 2018 """ +import logging import re +from typing import Any from .basedata import BaseData -import logging logger = logging.getLogger(__name__) class QueueData(BaseData): """ - High-level object to host all queuedata settings associated to given PandaQueue + High-level object to host all queuedata settings associated to given PandaQueue """ # ## put explicit list of all the attributes with comments for better inline-documentation by sphinx @@ -59,11 +60,9 @@ class QueueData(BaseData): appdir = "" # catchall = "" # General catchall field environ = "" # Special field for key=value pairs to be added as exports to payload command - platform = "" # cmtconfig value container_options = "" # singularity only options? to be reviewed and forced to be a dict (support options for other containers?) container_type = {} # dict of container names by user as a key - copytools = None acopytools = None @@ -76,61 +75,51 @@ class QueueData(BaseData): astorages = None aprotocols = None params = {} - state = None # AGIS PQ state, e.g. ACTIVE status = "" # PQ status, e.g. online site = None # ATLAS Site name direct_access_lan = False # Prefer remote io (True) or use only copy2scratch method (False) for stage-in over LAN direct_access_wan = False # Prefer remote io (True) or use only copy2scratch method (False) for stage-in over WAN - allow_lan = True # Allow LAN access (whatever method) for stage-in allow_wan = False # Allow WAN access (whatever method) for stage-in use_pcache = False - maxwdir = 0 # in MB maxrss = 0 maxinputsize = 0 - timefloor = 0 # The maximum time during which the pilot is allowed to start a new job, in seconds corecount = 1 # - maxtime = 0 # maximum allowed lifetime for pilot to run on the resource (0 will be ignored, fallback to default) - pledgedcpu = 0 # es_stageout_gap = 0 ## time gap value in seconds for ES stageout - is_cvmfs = True # has cvmfs installed + memkillgrace = 100 # memory kill grace value in percentage # specify the type of attributes for proper data validation and casting _keys = {int: ['timefloor', 'maxwdir', 'pledgedcpu', 'es_stageout_gap', - 'corecount', 'maxrss', 'maxtime', 'maxinputsize'], + 'corecount', 'maxrss', 'maxtime', 'maxinputsize', 'memkillgrace'], str: ['name', 'type', 'appdir', 'catchall', 'platform', 'container_options', 'container_type', 'resource', 'state', 'status', 'site', 'environ'], dict: ['copytools', 'acopytools', 'astorages', 'aprotocols', 'acopytools_schemas', 'params'], bool: ['allow_lan', 'allow_wan', 'direct_access_lan', 'direct_access_wan', 'is_cvmfs', 'use_pcache'] } - def __init__(self, data): + def __init__(self, data: dict): """ - Init class instance. + Initialize class instance. :param data: input dictionary of queue data settings (dict). 
""" self.load(data) - - # DEBUG - #import pprint - #logger.debug(f'initialize QueueData from raw:\n{pprint.pformat(data)}') logger.debug(f'final parsed QueueData content:\n{self}') - def load(self, data): - """ - Construct and initialize data from ext source - :param data: input dictionary of queue data settings + def load(self, data: dict): """ + Construct and initialize data from ext source + :param data: input dictionary of queue data settings (dict). + """ # the translation map of the queue data attributes from external data to internal schema # 'internal_name':('ext_name1', 'extname2_if_any') # 'internal_name2':'ext_name3' @@ -149,22 +138,25 @@ def load(self, data): self._load_data(data, kmap) - def resolve_allowed_schemas(self, activity, copytool=None): - """ - Resolve list of allowed schemas for given activity and requested copytool based on `acopytools_schemas` settings - :param activity: str or ordered list of transfer activity names to resolve acopytools related data - :return: list of protocol schemes + def resolve_allowed_schemas(self, activity: str or list, copytool: str = None) -> list: """ + Resolve list of allowed schemas for given activity and requested copytool based on `acopytools_schemas` settings + :param activity: str or ordered list of transfer activity names to resolve acopytools related data (str or list) + :param copytool: requested copytool name (str) + :return: list of protocol schemes (list). + """ if not activity: activity = 'default' if isinstance(activity, str): - activity = [activity] - if 'default' not in activity: - activity = activity + ['default'] + activity_list = list(activity) + else: + activity_list = activity + if 'default' not in activity_list: + activity_list.append('default') adat = {} - for aname in activity: + for aname in activity_list: adat = self.acopytools_schemas.get(aname) if adat: break @@ -180,11 +172,7 @@ def resolve_allowed_schemas(self, activity, copytool=None): return adat.get(copytool) or [] def clean(self): - """ - Validate and finally clean up required data values (required object properties) if need - :return: None - """ - + """Validate and finally clean up required data values (required object properties) if needed.""" # validate es_stageout_gap value if not self.es_stageout_gap: is_opportunistic = self.pledgedcpu and self.pledgedcpu == -1 @@ -209,8 +197,6 @@ def clean(self): self.container_options = self.container_options.replace(" --contain", ",${workdir} --contain") logger.info(f"note: added missing $workdir to container_options: {self.container_options}") - pass - ## custom function pattern to apply extra validation to the key values ##def clean__keyname(self, raw, value): ## :param raw: raw value passed from ext source as input @@ -218,22 +204,27 @@ def clean(self): ## ## return value - def clean__timefloor(self, raw, value): - """ - Verify and validate value for the timefloor key (convert to seconds) + def clean__timefloor(self, raw: Any, value: int) -> int: """ + Verify and validate value for the timefloor key (convert to seconds). + :param raw: raw value passed from ext source as input - unused (Any) + :param value: preliminary cleaned and cast to proper type value (int) + :return: timefloor value in seconds (int). + """ return value * 60 - def clean__container_type(self, raw, value): + def clean__container_type(self, raw: Any, value: str) -> dict: """ - Parse and prepare value for the container_type key - Expected raw data in format 'container_name:user_name;' - E.g. 
container_type = 'singularity:pilot;docker:wrapper', 'apptainer:pilot;docker:wrapper' + Parse and prepare value for the container_type key. - :return: dict of container names by user as a key - """ + Expected raw data in format 'container_name:user_name;' + E.g. container_type = 'singularity:pilot;docker:wrapper', 'apptainer:pilot;docker:wrapper' + :param raw: raw value passed from ext source as input - unused (Any) + :param value: preliminary cleaned and cast to proper type value (str) + :return: dictionary of container names by user as a key (dict). + """ ret = {} val = value or '' for e in val.split(';'): @@ -244,16 +235,22 @@ def clean__container_type(self, raw, value): return ret - def clean__container_options(self, raw, value): - """ - Verify and validate value for the container_options key (remove bad values) + def clean__container_options(self, raw: Any, value: str) -> str: """ + Verify and validate value for the container_options key (remove bad values) + :param raw: raw value passed from ext source as input - unused (Any) + :param value: preliminary cleaned and cast to proper type value (str) + :return: cleaned container_options value (str). + """ return value if value.lower() not in ['none'] else '' - def clean__corecount(self, raw, value): - """ - Verify and validate value for the corecount key (set to 1 if not set) + def clean__corecount(self, raw: Any, value: int) -> int: """ + Verify and validate value for the corecount key (set to 1 if not set) + :param raw: raw value passed from ext source as input - unused (Any) + :param value: preliminary cleaned and cast to proper type value (int) + :return: corecount value (int). + """ return value if value else 1 diff --git a/pilot/info/storagedata.py b/pilot/info/storagedata.py index ea5bab8ba..5998fde34 100644 --- a/pilot/info/storagedata.py +++ b/pilot/info/storagedata.py @@ -17,7 +17,7 @@ # # Authors: # - Alexey Anisenkov, anisyonk@cern.ch, 2018 -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2019-24 """ The implementation of data structure to host storage data description. @@ -31,20 +31,21 @@ :contact: anisyonk@cern.ch :date: January 2018 """ +import logging import traceback from os import environ +from typing import Any from pilot.util import https from pilot.util.config import config from .basedata import BaseData -import logging logger = logging.getLogger(__name__) class StorageData(BaseData): """ - High-level object to host Storage details (available protocols, etc.) + High-level object to host Storage details (available protocols, etc.) """ ## put explicit list of all the attributes with comments for better inline-documentation by sphinx @@ -74,11 +75,12 @@ class StorageData(BaseData): bool: ['is_deterministic'] } - def __init__(self, data): - """ - :param data: input dictionary of storage description by DDMEndpoint name as key + def __init__(self, data: dict): """ + Initialize StorageData object with input data. + :param data: input dictionary of storage description by DDMEndpoint name as key (dict). + """ self.load(data) # DEBUG @@ -86,12 +88,12 @@ def __init__(self, data): # logger.debug(f'initialize StorageData from raw:\n{pprint.pformat(data)}') # logger.debug(f'final parsed StorageData content:\n{self}') - def load(self, data): - """ - Construct and initialize data from ext source - :param data: input dictionary of storage description by DDMEndpoint name as key + def load(self, data: dict): """ + Construct and initialize data from ext source. 
+ :param data: input dictionary of storage description by DDMEndpoint name as key (dict). + """ # the translation map of the queue data attributes from external data to internal schema # first defined ext field name will be used # if key is not explicitly specified then ext name will be used as is @@ -113,41 +115,41 @@ def load(self, data): ## return value # to be improved: move it to some data loader - def get_security_key(self, secret_key, access_key): + def get_security_key(self, secret_key: str, access_key: str) -> dict: """ - Get security key pair from panda - :param secret_key: secrect key name as string - :param access_key: access key name as string - :return: setup as a string + Get security key pair from panda. + + :param secret_key: secret key name (str) + :param access_key: access key name (str) + :return: dictionary with public and private keys (dict). """ try: data = {'privateKeyName': secret_key, 'publicKeyName': access_key} - logger.info(f"Getting key pair: {data}") url = environ.get('PANDA_SERVER_URL', config.Pilot.pandaserver) + logger.info(f"requesting key pair from {url}: {data}") res = https.request(f'{url}/server/panda/getKeyPair', data=data) if res and res['StatusCode'] == 0: return {"publicKey": res["publicKey"], "privateKey": res["privateKey"]} - else: - logger.info(f"Got key pair returns wrong value: {res}") + logger.info(f"key pair returned wrong value: {res}") except Exception as exc: - logger.error(f"Failed to get key pair({access_key},{secret_key}): {exc}, {traceback.format_exc()}") + logger.error(f"failed to get key pair ({access_key},{secret_key}): {exc}, {traceback.format_exc()}") return {} - def get_special_setup(self, protocol_id=None): - """ - Construct special setup for ddms such as objectstore - :param protocol_id: protocol id. - :return: setup as a string + def get_special_setup(self, protocol_id: Any = None): """ + Construct special setup for ddms such as objectstores. - logger.info(f"get special setup for protocol id({protocol_id})") + :param protocol_id: protocol id (Any) + :return: special setup string (str). 
+ """ + logger.debug(f"get special setup for protocol id ({protocol_id})") if protocol_id in self.special_setup and self.special_setup[protocol_id]: return self.special_setup[protocol_id] - if protocol_id is None or str(protocol_id) not in list(self.rprotocols.keys()): # Python 2/3 + if protocol_id is None or str(protocol_id) not in self.rprotocols: return None - if self.type in ['OS_ES', 'OS_LOGS']: + if self.type in {'OS_ES', 'OS_LOGS'}: self.special_setup[protocol_id] = None settings = self.rprotocols.get(str(protocol_id), {}).get('settings', {}) diff --git a/pilot/resource/jobdescription.py b/pilot/resource/jobdescription.py index 7fc7ad3cd..5f6b5e18f 100755 --- a/pilot/resource/jobdescription.py +++ b/pilot/resource/jobdescription.py @@ -18,7 +18,7 @@ # # Authors: # - Danila Oleynik, 2018-2021 -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 """Function library for Titan.""" @@ -581,9 +581,9 @@ def get_traceback(self) -> str: continue # we don't need inner scopes of this and subsequent calls i = ii[1] tb_str += f'{i[0]}:{i[1]} (in {i[2]}): {i[3]}\n' - thread = threading.currentThread() + thread = threading.current_thread() - return 'Traceback: (latest call first)' + tb_str + f'Thread: {thread.getName()}({thread.ident})' + return 'Traceback: (latest call first)' + tb_str + f'Thread: {thread.name}({thread.ident})' def __getattr__(self, key: str) -> str: """ diff --git a/pilot/resource/summit.py b/pilot/resource/summit.py deleted file mode 100644 index bceccc603..000000000 --- a/pilot/resource/summit.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2019-23 - -"""Functions for Summit.""" - -import logging -from typing import Any - -logger = logging.getLogger(__name__) - - -def get_setup(job: Any = None) -> list: - """ - Return the resource specific setup. - - :param job: optional job object (Any) - :return: setup commands (list). - """ - if not job: - logger.warning('job object not sent to get_setup') - - return [] diff --git a/pilot/resource/titan.py b/pilot/resource/titan.py index 043bd9f0c..d25ceb1c8 100644 --- a/pilot/resource/titan.py +++ b/pilot/resource/titan.py @@ -17,7 +17,7 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-24 # - Danila Oleynik danila.oleynik@cern.ch, 2018 """Functions for Titan.""" @@ -185,7 +185,7 @@ def set_scratch_workdir(job: Any, work_dir: str, args: dict) -> str: except IOError as exc: logger.error(f"i/o error({exc.errno}): {exc.strerror}") logger.error(f"copy to scratch failed, execution terminated': \n {sys.exc_info()[1]} ") - raise FileHandlingFailure("Copy to RAM disk failed") + raise FileHandlingFailure("Copy to RAM disk failed") from exc finally: add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args) else: @@ -225,9 +225,9 @@ def process_jobreport(payload_report_file: str, job_scratch_path: str, job_commu write_json(dst_file, job_report) - except IOError: + except IOError as exc: logger.error(f"job report copy failed, execution terminated': \n {sys.exc_info()[1]} ") - raise FileHandlingFailure("job report copy from RAM failed") + raise FileHandlingFailure("job report copy from RAM failed") from exc def postprocess_workdir(workdir: str): @@ -241,8 +241,8 @@ def postprocess_workdir(workdir: str): try: if os.path.exists(pseudo_dir): remove(os.path.join(workdir, pseudo_dir)) - except IOError: - raise FileHandlingFailure("Post processing of working directory failed") + except IOError as exc: + raise FileHandlingFailure("Post processing of working directory failed") from exc def command_fix(command: str, job_scratch_dir: str) -> str: @@ -254,13 +254,13 @@ def command_fix(command: str, job_scratch_dir: str) -> str: :return: updated/fixed payload command (str). """ subs_a = command.split() - for i in range(len(subs_a)): + for i, sub in enumerate(subs_a): if i > 0: - if '(' in subs_a[i] and not subs_a[i][0] == '"': - subs_a[i] = '"' + subs_a[i] + '"' - if subs_a[i].startswith("--inputEVNTFile"): - filename = subs_a[i].split("=")[1] - subs_a[i] = subs_a[i].replace(filename, os.path.join(job_scratch_dir, filename)) + if '(' in sub and not sub[0] == '"': + subs_a[i] = '"' + sub + '"' + if sub.startswith("--inputEVNTFile"): + filename = sub.split("=")[1] + subs_a[i] = sub.replace(filename, os.path.join(job_scratch_dir, filename)) fixed_command = ' '.join(subs_a) fixed_command = fixed_command.strip() diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index b6f20ad1a..45488de65 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -16,7 +16,7 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 """Script for remote file open verification.""" @@ -36,12 +36,10 @@ import ROOT from pilot.util.config import config -from pilot.util.filehandling import ( - write_json, -) +from pilot.util.filehandling import write_json from pilot.util.loggingsupport import ( - flush_handler, establish_logging, + flush_handler, ) from pilot.util.processes import kill_processes @@ -114,10 +112,10 @@ def get_file_lists(turls_string: str) -> dict: """ _turls = [] - try: + if isinstance(turls_string, str): _turls = turls_string.split(',') - except Exception as _error: - message(f"exception caught: {_error}") + else: + message(f"unexpected type for turls_string: {type(turls_string).__name__}") return {'turls': _turls} @@ -141,8 +139,8 @@ def try_open_file(turl_str: str, _queues: namedtuple): # message(f"internal TFile.Open() time-out set to {_timeout} ms") message(f'opening {turl_str}') in_file = ROOT.TFile.Open(turl_str) - except Exception as exc: - message(f'caught exception: {exc}') + except Exception as e: + message(f'caught exception: {e}') else: if in_file and in_file.IsOpen(): in_file.Close() @@ -226,7 +224,7 @@ def interrupt(_args: Any, signum: Any, frame: Any): try: logname = config.Pilot.remotefileverification_log - except Exception as error: + except AttributeError as error: print(f"caught exception: {error} (skipping remote file open verification)") sys.exit(1) else: @@ -267,21 +265,15 @@ def interrupt(_args: Any, signum: Any, frame: Any): except queue.Empty: message("reached time-out") break - except Exception as error: - message(f"caught exception: {error}") thread = spawn_file_open_thread(queues, turls) if thread: threads.append(thread) # wait until all threads have finished - try: - for thread in threads: - thread.join() - except Exception as exc: - logger.warning(f"exception caught while handling threads: {exc}") - finally: - logger.info('all remote file open threads have been joined') + for thread in threads: + thread.join() + logger.info('all remote file open threads have been joined') opened_turls = list(queues.opened.queue) opened_turls.sort() diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index 6fc6f1fcd..4a3e52f9d 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -17,7 +17,7 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 """This script is executed by the pilot in a container to perform stage-in of input files.""" @@ -31,9 +31,9 @@ from pilot.api.es_data import StageInESClient from pilot.common.exception import ConversionFailure from pilot.info import ( + infosys, InfoService, FileSpec, - infosys, ) from pilot.util.config import config from pilot.util.filehandling import ( @@ -226,7 +226,10 @@ def message(msg: str): :param msg: message (str). """ - print(msg) if not logger else logger.info(msg) + if not logger: + print(msg) + else: + logger.info(msg) def str_to_int_list(_list: list) -> list: diff --git a/pilot/scripts/stageout.py b/pilot/scripts/stageout.py index e04b8f3ef..01c28a7f7 100644 --- a/pilot/scripts/stageout.py +++ b/pilot/scripts/stageout.py @@ -17,7 +17,7 @@ # under the License. 
# # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-24 """This script is executed by the pilot in a container to perform stage-out of output files.""" @@ -26,14 +26,15 @@ import os import re import sys +import traceback from pilot.api.data import StageOutClient from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException from pilot.info import ( + infosys, InfoService, FileSpec, - infosys, ) from pilot.util.config import config from pilot.util.filehandling import write_json @@ -191,7 +192,10 @@ def message(msg: str): :param msg: message (str). """ - print(msg) if not logger else logger.info(msg) + if not logger: + print(msg) + else: + logger.info(msg) def get_file_lists(_lfns: str, _scopes: str, _ddmendpoints: str, _datasets: str, _guids: str) -> tuple: @@ -332,7 +336,6 @@ def extract_error_info(_err: str) -> tuple: try: r = client.transfer(xfiles, activity=activity, **kwargs) except PilotException as error: - import traceback error_msg = traceback.format_exc() logger.error(error_msg) err = errors.format_diagnostics(error.get_error_code(), error_msg) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 006148f06..4f1149117 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -47,6 +47,7 @@ FileHandlingFailure ) from pilot.info.filespec import FileSpec +from pilot.info.jobdata import JobData from pilot.util.config import config from pilot.util.constants import ( UTILITY_BEFORE_PAYLOAD, @@ -100,7 +101,6 @@ get_network_monitor_setup, post_memory_monitor_action, get_memory_monitor_summary_filename, - get_prefetcher_setup, get_memory_monitor_output_filename, get_metadata_dict_from_txt, ) @@ -132,13 +132,13 @@ def sanity_check() -> int: return 0 -def validate(job: Any) -> bool: +def validate(job: JobData) -> bool: """ Perform user specific payload/job validation. This function will produce a local DBRelease file if necessary (old releases). - :param job: job object (Any) + :param job: job object (JobData) :return: True if validation is successful, False otherwise (bool). """ status = True @@ -180,14 +180,14 @@ def validate(job: Any) -> bool: return status -def open_remote_files(indata: list, workdir: str, nthreads: int) -> (int, str, list, int): # noqa: C901 +def open_remote_files(indata: list, workdir: str, nthreads: int) -> tuple[int, str, list, int]: # noqa: C901 """ Verify that direct i/o files can be opened. :param indata: list of FileSpec (list) :param workdir: working directory (str) :param nthreads: number of concurrent file open threads (int) - :return: exit code (int), diagnostics (str), not opened files (list), lsetup time (int). + :return: exit code (int), diagnostics (str), not opened files (list), lsetup time (int) (tuple). :raises PilotException: in case of pilot error. """ exitcode = 0 @@ -329,14 +329,14 @@ def get_timeout_for_remoteio(indata: list) -> int: return len(remote_io) * 30 + 900 -def parse_remotefileverification_dictionary(workdir: str) -> (int, str, list): +def parse_remotefileverification_dictionary(workdir: str) -> tuple[int, str, list]: """ Verify that all files could be remotely opened. Note: currently ignoring if remote file dictionary doesn't exist. :param workdir: work directory needed for opening remote file dictionary (str) - :return: exit code (int), diagnostics (str), not opened files (list). + :return: exit code (int), diagnostics (str), not opened files (list) (tuple). 
""" exitcode = 0 diagnostics = "" @@ -409,7 +409,7 @@ def extract_turls(indata: list) -> str: ) -def process_remote_file_traces(path: str, job: Any, not_opened_turls: list): +def process_remote_file_traces(path: str, job: JobData, not_opened_turls: list): """ Report traces for remote files. @@ -417,7 +417,7 @@ def process_remote_file_traces(path: str, job: Any, not_opened_turls: list): and updates it per file before reporting it to the Rucio server. :param path: path to base trace report (str) - :param job: job object (Any) + :param job: job object (JobData) :param not_opened_turls: list of turls that could not be opened (list) """ try: @@ -487,12 +487,12 @@ def get_nthreads(catchall: str) -> int: return _nthreads if _nthreads else 1 -def get_payload_command(job: Any) -> str: +def get_payload_command(job: JobData) -> str: """ Return the full command for executing the payload, including the sourcing of all setup files and setting of environment variables. - :param job: job object (Any) - :return: command (string). + :param job: job object (JobData) + :return: command (str). :raises TrfDownloadFailure: in case of download failure. """ # Should the pilot do the setup or does jobPars already contain the information? @@ -516,7 +516,6 @@ def get_payload_command(job: Any) -> str: diagnostics = "" try: - logger.debug('executing open_remote_files()') exitcode, diagnostics, not_opened_turls, lsetup_time = open_remote_files(job.indata, job.workdir, get_nthreads(catchall)) except Exception as exc: logger.warning(f'caught std exception: {exc}') @@ -531,7 +530,7 @@ def get_payload_command(job: Any) -> str: logger.warning(f'base trace report does not exist ({path}) - ' f'input file traces should already have been sent') else: - process_remote_file_traces(path, job, not_opened_turls) + process_remote_file_traces(path, job, not_opened_turls) # ignore PyCharm warning, path is str # fail the job if the remote files could not be verified if exitcode != 0: @@ -623,9 +622,7 @@ def prepend_env_vars(environ: str, cmd: str) -> str: :return: updated payload command (str). """ exports = get_exports(environ) - exports_to_add = '' - for _cmd in exports: - exports_to_add += _cmd + exports_to_add = ''.join(exports) # add the UTC time zone exports_to_add += "export TZ=\'UTC\'; " @@ -658,8 +655,7 @@ def get_exports(from_string: str) -> list: key_values = get_key_values(from_string) logger.debug(f'extracted key-values: {key_values}') if key_values: - for number in range(len(key_values)): - raw_val = key_values[number] + for _, raw_val in enumerate(key_values): _key = raw_val[0] _value = raw_val[1] key_value = '' @@ -672,12 +668,12 @@ def get_exports(from_string: str) -> list: return exports -def get_normal_payload_command(cmd: str, job: Any, preparesetup: bool, userjob: bool) -> str: +def get_normal_payload_command(cmd: str, job: JobData, preparesetup: bool, userjob: bool) -> str: """ Return the payload command for a normal production/analysis job. :param cmd: any preliminary command setup (str) - :param job: job object (Any) + :param job: job object (JobData) :param userjob: True for user analysis jobs, False otherwise (bool) :param preparesetup: True if the pilot should prepare the setup, False if already in the job parameters (bool) :return: normal payload command (str). 
@@ -723,12 +719,12 @@ def get_normal_payload_command(cmd: str, job: Any, preparesetup: bool, userjob: return cmd -def get_generic_payload_command(cmd: str, job: Any, preparesetup: bool, userjob: bool) -> str: +def get_generic_payload_command(cmd: str, job: JobData, preparesetup: bool, userjob: bool) -> str: """ Return the payload command for a generic job. :param cmd: any preliminary command setup (str) - :param job: job object (Any) + :param job: job object (JobData) :param preparesetup: True if the pilot should prepare the setup, False if already in the job parameters (bool) :param userjob: True for user analysis jobs, False otherwise (bool) :return: generic job command (str). @@ -866,14 +862,14 @@ def add_makeflags(job_core_count: int, cmd: str) -> str: return cmd -def get_analysis_run_command(job: Any, trf_name: str) -> str: # noqa: C901 +def get_analysis_run_command(job: JobData, trf_name: str) -> str: # noqa: C901 """ Return the proper run command for the user job. Example output: export X509_USER_PROXY=<..>;./runAthena --usePFCTurl --directIn - :param job: job object (Any) + :param job: job object (JobData) :param trf_name: name of the transform that will run the job (str) :return: command (str). """ @@ -881,17 +877,20 @@ def get_analysis_run_command(job: Any, trf_name: str) -> str: # noqa: C901 # add the user proxy if 'X509_USER_PROXY' in os.environ and not job.imagename: - logger.debug(f'X509_UNIFIED_DISPATCH={os.environ.get("X509_UNIFIED_DISPATCH")}') x509 = os.environ.get('X509_UNIFIED_DISPATCH', os.environ.get('X509_USER_PROXY', '')) cmd += f'export X509_USER_PROXY={x509};' - if 'OIDC_AUTH_TOKEN' in os.environ: - cmd += 'unset OIDC_AUTH_TOKEN;' - if 'OIDC_AUTH_ORIGIN' in os.environ: - cmd += 'unset OIDC_AUTH_ORIGIN;' - if 'PANDA_AUTH_TOKEN' in os.environ: - cmd += 'unset PANDA_AUTH_TOKEN;' - if 'PANDA_AUTH_ORIGIN' in os.environ: - cmd += 'unset PANDA_AUTH_ORIGIN;' + + env_vars_to_unset = [ + 'OIDC_AUTH_TOKEN', + 'OIDC_AUTH_ORIGIN', + 'PANDA_AUTH_TOKEN', + 'PANDA_AUTH_ORIGIN', + 'OIDC_REFRESHED_AUTH_TOKEN' + ] + + for var in env_vars_to_unset: + if var in os.environ: + cmd += f'unset {var};' # set up trfs if job.imagename == "": # user jobs with no imagename defined @@ -1008,11 +1007,11 @@ def get_guids_from_jobparams(jobparams: str, infiles: list, infilesguids: list) return guidlist -def test_job_data(job: Any): +def test_job_data(job: JobData): """ Test function to verify that the job object contains the expected data. - :param job: job object (Any) + :param job: job object (JobData). """ # in case the job was created with --outputs="regex|DST_.*\.root", we can now look for the corresponding # output files and add them to the output file list @@ -1066,7 +1065,7 @@ def test_job_data(job: Any): logger.debug('no regex found in outdata file list') -def update_job_data(job: Any): +def update_job_data(job: JobData): """ Update the job object. @@ -1075,7 +1074,7 @@ def update_job_data(job: Any): In the case of ATLAS, information is extracted from the metadata field and added to other job object fields. - :param job: job object (Any). + :param job: job object (JobData). """ ## comment from Alexey: ## it would be better to reallocate this logic (as well as parse @@ -1131,14 +1130,14 @@ def update_job_data(job: Any): validate_output_data(job) -def validate_output_data(job: Any): +def validate_output_data(job: JobData): """ Validate output data. Set any missing GUIDs and make sure the output file names follow the ATLAS naming convention - if not, set the error code. 
- :param job: job object (Any). + :param job: job object (JobData). """ ## validate output data (to be moved into the JobData) ## warning: do no execute this code unless guid lookup in job report @@ -1190,11 +1189,11 @@ def naming_convention_pattern() -> str: return fr"^[A-Za-z0-9][A-Za-z0-9.\-_]{{1,{max_filename_size}}}$" -def get_stageout_label(job: Any): +def get_stageout_label(job: JobData): """ Get a proper stage-out label. - :param job: job object (Any) + :param job: job object (JobData) :return: "all"/"log" depending on stage-out type (str). """ stageout = "all" @@ -1214,11 +1213,11 @@ def get_stageout_label(job: Any): return stageout -def update_output_for_hpo(job: Any): +def update_output_for_hpo(job: JobData): """ Update the output (outdata) for HPO jobs. - :param job: job object (Any). + :param job: job object (JobData). """ try: new_outdata = discover_new_outdata(job) @@ -1230,12 +1229,12 @@ def update_output_for_hpo(job: Any): job.outdata = new_outdata -def discover_new_outdata(job: Any): +def discover_new_outdata(job: JobData) -> list: """ Discover new outdata created by HPO job. - :param job: job object (Any) - :return: new_outdata (list of FileSpec objects). + :param job: job object (JobData) + :return: new_outdata (list of FileSpec objects) (list). """ new_outdata = [] @@ -1243,7 +1242,7 @@ def discover_new_outdata(job: Any): new_output = discover_new_output(outdata_file.lfn, job.workdir) if new_output: # create new FileSpec objects out of the new output - for outfile in new_output: + for outfile, file_info in new_output.items(): # note: guid will be taken from job report # after this function has been called files = [{ @@ -1253,8 +1252,8 @@ def discover_new_outdata(job: Any): 'dataset': outdata_file.dataset, 'ddmendpoint': outdata_file.ddmendpoint, 'ddmendpoint_alt': None, - 'filesize': new_output[outfile]['filesize'], - 'checksum': new_output[outfile]['checksum'], + 'filesize': file_info['filesize'], + 'checksum': file_info['checksum'], 'guid': '' }] @@ -1301,7 +1300,7 @@ def discover_new_output(name_pattern: str, workdir: str) -> dict: return new_output -def extract_output_file_guids(job: Any) -> None: +def extract_output_file_guids(job: JobData): """ Extract output file info from the job report and make sure all guids are assigned. @@ -1310,8 +1309,7 @@ def extract_output_file_guids(job: Any) -> None: this function might not be called if metadata info is not found prior to the call. - :param job: job object (Any) - :return: None. + :param job: job object (JobData). """ # make sure there is a defined output file list in the job report - # unless it is allowed by task parameter allowNoOutput @@ -1369,10 +1367,8 @@ def extract_output_file_guids(job: Any) -> None: # will overwrite output file list: extra=%s' % extra) #job.outdata = extra - return - -def verify_output_files(job: Any) -> bool: +def verify_output_files(job: JobData) -> bool: """ Verify that the output files from the job definition are listed in the job report. @@ -1385,7 +1381,7 @@ def verify_output_files(job: Any) -> bool: there with zero events. Then if allownooutput is not set - fail the job. If it is set, then do not store the output, and finish ok. - :param job: job object (Any) + :param job: job object (JobData) :return: True if output files were validated correctly, False otherwise (bool). 
""" failed = False @@ -1441,7 +1437,7 @@ def verify_output_files(job: Any) -> bool: return status -def verify_extracted_output_files(output: list, lfns_jobdef: list, job: Any) -> (bool, int): +def verify_extracted_output_files(output: list, lfns_jobdef: list, job: JobData) -> tuple[bool, int]: """ Make sure all output files extracted from the job report are listed. @@ -1449,8 +1445,8 @@ def verify_extracted_output_files(output: list, lfns_jobdef: list, job: Any) -> :param output: list of FileSpecs (list) :param lfns_jobdef: list of lfns strings from job definition (list) - :param job: job object (Any) - :return: True if successful, False if failed (bool), number of events (int). + :param job: job object (JobData) + :return: True if successful, False if failed (bool), number of events (int) (tuple). """ failed = False nevents = 0 @@ -1518,12 +1514,12 @@ def verify_extracted_output_files(output: list, lfns_jobdef: list, job: Any) -> return status, nevents -def remove_from_stageout(lfn: str, job: Any): +def remove_from_stageout(lfn: str, job: JobData): """ Remove the given lfn from the stage-out list. :param lfn: local file name (str) - :param job: job object (Any). + :param job: job object (JobData). """ outdata = [] for fspec in job.outdata: @@ -1534,11 +1530,11 @@ def remove_from_stageout(lfn: str, job: Any): job.outdata = outdata -def remove_no_output_files(job: Any): +def remove_no_output_files(job: JobData): """ Remove files from output file list if they are listed in allowNoOutput and do not exist. - :param job: job object (Any). + :param job: job object (JobData). """ # first identify the files to keep _outfiles = [] @@ -1604,7 +1600,6 @@ def get(self, path: str, dst_dict: dict, dst_key: str): :param path: path to the value (str) :param dst_dict: destination dictionary (dict) :param dst_key: destination key (str) - :return: None. """ keys = path.split("/") if len(keys) == 0: @@ -1620,8 +1615,6 @@ def get(self, path: str, dst_dict: dict, dst_key: str): if last_key in me_: dst_dict[dst_key] = me_[last_key] - return - def parse_jobreport_data(job_report: dict) -> dict: # noqa: C901 """ @@ -1738,7 +1731,7 @@ def get_resimevents(jobreport_dictionary: dict) -> int or None: return resimevents -def get_db_info(jobreport_dictionary) -> (int, int): +def get_db_info(jobreport_dictionary: dict) -> tuple[int, int]: """ Extract and add up the DB info from the job report. @@ -1748,7 +1741,7 @@ def get_db_info(jobreport_dictionary) -> (int, int): been done already by the transform and stored in dbDataTotal and dbTimeTotal. :param jobreport_dictionary: job report dictionary (dict) - :return: db_time (int), db_data (int). + :return: db_time (int), db_data (int) (tuple). """ db_time = 0 db_data = 0 @@ -1797,7 +1790,7 @@ def get_db_info_str(db_time: int, db_data: int) -> (str, str): return db_time_s, db_data_s -def get_cpu_times(jobreport_dictionary: dict) -> (str, int, float): +def get_cpu_times(jobreport_dictionary: dict) -> tuple[str, int, float]: """ Extract and add up the total CPU times from the job report. @@ -1806,7 +1799,7 @@ def get_cpu_times(jobreport_dictionary: dict) -> (str, int, float): Note: this function is used with Event Service jobs :param jobreport_dictionary: job report dictionary (dict) - :return: cpu_conversion_unit (str), total_cpu_time (int), conversion_factor (output consistent with set_time_consumed()) (float). + :return: cpu_conversion_unit (str), total_cpu_time (int), conversion_factor (output consistent with set_time_consumed()) (float) (tuple). 
""" total_cpu_time = 0 @@ -1826,14 +1819,14 @@ def get_cpu_times(jobreport_dictionary: dict) -> (str, int, float): return cpu_conversion_unit, total_cpu_time, conversion_factor -def get_exit_info(jobreport_dictionary: dict) -> (int, str): +def get_exit_info(jobreport_dictionary: dict) -> tuple[int, str]: """ Return the exit code (exitCode) and exit message (exitMsg). E.g. (0, 'OK'). :param jobreport_dictionary: - :return: exit_code (int), exit_message (str). + :return: exit_code (int), exit_message (str) (tuple). """ return jobreport_dictionary.get('exitCode'), jobreport_dictionary.get('exitMsg') @@ -2096,7 +2089,7 @@ def remove_redundant_files(workdir: str, outputfiles: list = None, piloterrors: :param workdir: working directory (str) :param outputfiles: list of protected output files (list) - :param errors: list of Pilot assigned error codes (list) + :param piloterrors: list of Pilot assigned error codes (list) :param debugmode: True if debug mode has been switched on (bool). """ if outputfiles is None: @@ -2180,7 +2173,7 @@ def download_command(process: dict, workdir: str) -> dict: return process -def get_utility_commands(order: int = None, job: Any = None) -> dict or None: +def get_utility_commands(order: int = None, job: JobData = None) -> dict or None: """ Return a dictionary of utility commands and arguments to be executed in parallel with the payload. @@ -2204,9 +2197,9 @@ def get_utility_commands(order: int = None, job: Any = None) -> dict or None: FORMAT: {'command': , 'args': , 'label': , 'ignore_failure': } - :param order: optional sorting order (see pilot.util.constants). - :param job: optional job object. - :return: dictionary of utilities to be executed in parallel with the payload. + :param order: optional sorting order (see pilot.util.constants) (int) + :param job: optional job object (JobData) + :return: dictionary of utilities to be executed in parallel with the payload (dict or None). """ if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: return get_precopostprocess_command(job.preprocess, job.workdir, 'preprocess') @@ -2391,6 +2384,8 @@ def xcache_activation_command(workdir: str = '', jobid: str = '') -> dict: :param jobid: PanDA job id to guarantee that xcache process is unique (int) :return: xcache command (str). """ + if workdir: # to bypass pylint warning + pass # a successful startup will set ALRB_XCACHE_PROXY and ALRB_XCACHE_PROXY_REMOTE # so any file access with root://... should be replaced with one of # the above (depending on whether you are on the same machine or not) @@ -2421,6 +2416,8 @@ def xcache_deactivation_command(workdir: str = '', jobid: str = '') -> dict: :param jobid: unused job id - do not remove (str) :return: xcache command (dict). """ + if jobid: # to bypass pylint warning + pass path = os.environ.get('ALRB_XCACHE_LOG', None) if path and os.path.exists(path): logger.debug(f'copying xcache messages log file ({path}) to work dir ({workdir})') @@ -2440,14 +2437,14 @@ def xcache_deactivation_command(workdir: str = '', jobid: str = '') -> dict: return {'command': command, 'args': '-p $ALRB_XCACHE_MYPROCESS'} -def get_utility_command_setup(name: str, job: Any, setup: str = None) -> str: +def get_utility_command_setup(name: str, job: JobData, setup: str = None) -> str: """ Return the proper setup for the given utility command. If a payload setup is specified, then the utility command string should be prepended to it. 
:param name: name of utility (str) - :param job: job object (Any) + :param job: job object (JobData) :param setup: optional payload setup string (str) :return: utility command setup (str). """ @@ -2455,18 +2452,12 @@ def get_utility_command_setup(name: str, job: Any, setup: str = None) -> str: # must know if payload is running in a container or not # (enables search for pid in ps output) use_container = job.usecontainer or 'runcontainer' in job.transformation - dump_ps = ("PRMON_DEBUG" in job.infosys.queuedata.catchall) setup, pid = get_memory_monitor_setup( job.pid, - job.pgrp, job.jobid, job.workdir, - job.command, - use_container=use_container, - transformation=job.transformation, - outdata=job.outdata, - dump_ps=dump_ps + use_container=use_container ) _pattern = r"([\S]+)\ ." @@ -2482,16 +2473,13 @@ def get_utility_command_setup(name: str, job: Any, setup: str = None) -> str: logger.debug(f'updating pgrp={job.pgrp} for pid={pid}') try: job.pgrp = os.getpgid(pid) - except Exception as exc: + except ProcessLookupError as exc: logger.warning(f'os.getpgid({pid}) failed with: {exc}') return setup if name == 'NetworkMonitor' and setup: return get_network_monitor_setup(setup, job) - if name == 'Prefetcher': - return get_prefetcher_setup(job) - return "" @@ -2514,12 +2502,12 @@ def get_utility_command_execution_order(name: str) -> int: return UTILITY_AFTER_PAYLOAD_STARTED -def post_utility_command_action(name: str, job: Any): +def post_utility_command_action(name: str, job: JobData): """ Perform post action for given utility command. :param name: name of utility command (str) - :param job: job object (Any). + :param job: job object (JobData). """ if name == 'NetworkMonitor': pass @@ -2549,12 +2537,12 @@ def get_utility_command_output_filename(name: str, selector: bool = None) -> str return get_memory_monitor_summary_filename(selector=selector) if name == 'MemoryMonitor' else "" -def verify_lfn_length(outdata: list) -> (int, str): +def verify_lfn_length(outdata: list) -> tuple[int, str]: """ Make sure that the LFNs are all within the allowed length. :param outdata: list of FileSpec objects (list) - :return: error code (int), diagnostics (str). + :return: error code (int), diagnostics (str) (tuple). """ exitcode = 0 diagnostics = "" @@ -2604,7 +2592,7 @@ def verify_ncores(corecount: int): f"(ATHENA_PROC_NUMBER will not be overwritten)") -def verify_job(job: Any) -> bool: +def verify_job(job: JobData) -> bool: """ Verify job parameters for specific errors. @@ -2612,7 +2600,7 @@ def verify_job(job: Any) -> bool: in case of problem, the function should set the corresponding pilot error code using: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) - :param job: job object (Any) + :param job: job object (JobData) :return: True if verified, False otherwise (bool). """ status = False @@ -2632,11 +2620,11 @@ def verify_job(job: Any) -> bool: return status -def update_stagein(job: Any): +def update_stagein(job: JobData): """ Skip DBRelease files during stage-in. - :param job: job object (Any). + :param job: job object (JobData). """ for fspec in job.indata: if 'DBRelease' in fspec.lfn: @@ -2667,13 +2655,13 @@ def should_update_logstash(frequency: int = 10) -> bool: return randint(0, frequency - 1) == 0 -def update_server(job: Any) -> None: +def update_server(job: JobData) -> None: """ Perform any user specific server actions. E.g. this can be used to send special information to a logstash. - :param job: job object (Any). + :param job: job object (JobData). 
""" # attempt to read memory_monitor_output.txt and convert it to json if not should_update_logstash(): @@ -2721,11 +2709,11 @@ def update_server(job: Any) -> None: return -def preprocess_debug_command(job: Any): +def preprocess_debug_command(job: JobData): """ Pre-process the debug command in debug mode. - :param job: Job object (Any). + :param job: Job object (JobData). """ # Should the pilot do the setup or does jobPars already contain the information? preparesetup = should_pilot_prepare_setup(job.noexecstrcnv, job.jobparams) @@ -2814,16 +2802,22 @@ def allow_timefloor(submitmode: str) -> bool: :param submitmode: submit mode (str) :return: always True for ATLAS (bool). """ + if submitmode: # to bypass pylint score 0 + pass + return True -def get_pilot_id(jobid: int) -> str: +def get_pilot_id(jobid: str) -> str: """ Get the pilot id from the environment variable GTAG. Update if necessary (not for ATLAS since we want the same pilot id for all multi-jobs). - :param jobid: PanDA job id - UNUSED (int) + :param jobid: PanDA job id - UNUSED (str) :return: pilot id (str). """ + if jobid: # to bypass pylint score 0 + pass + return os.environ.get("GTAG", "unknown") diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index f6ada08d6..adc7de779 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -17,26 +17,43 @@ # under the License. # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-23 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-24 # - Alexander Bogdanchikov, Alexander.Bogdanchikov@cern.ch, 2019-20 +"""Functions related to containerisation for ATLAS.""" + import fcntl import json import logging import os -import pipes import re +import shlex import subprocess import time -from typing import Any, Callable + +from collections.abc import Callable +from typing import Any # for user container test: import urllib from pilot.common.errorcodes import ErrorCodes -from pilot.common.exception import PilotException, FileHandlingFailure -from pilot.user.atlas.setup import get_asetup, get_file_system_root_path -from pilot.user.atlas.proxy import get_and_verify_proxy, get_voms_role -from pilot.info import InfoService, infosys +from pilot.common.exception import ( + PilotException, + FileHandlingFailure +) +from pilot.user.atlas.setup import ( + get_asetup, + get_file_system_root_path +) +from pilot.user.atlas.proxy import ( + get_and_verify_proxy, + get_voms_role +) +from pilot.info import ( + InfoService, + infosys, + JobData +) from pilot.util.config import config from pilot.util.constants import get_rucio_client_version from pilot.util.container import obscure_token @@ -50,11 +67,11 @@ errors = ErrorCodes() -def do_use_container(**kwargs: Any) -> bool: +def do_use_container(**kwargs: dict) -> bool: """ Decide whether to use a container or not. - :param kwargs: dictionary of key-word arguments (Any) + :param kwargs: dictionary of key-word arguments (dict) :return: True if function has decided that a container should be used, False otherwise (bool). 
""" # to force no container use: return False @@ -66,35 +83,28 @@ def do_use_container(**kwargs: Any) -> bool: # for user jobs, TRF option --containerImage must have been used, ie imagename must be set if job.imagename and job.imagename != 'NULL': use_container = True - logger.debug('job.imagename set -> use_container = True') elif not (job.platform or job.alrbuserplatform): use_container = False - logger.debug('not (job.platform or job.alrbuserplatform) -> use_container = False') else: queuedata = job.infosys.queuedata container_name = queuedata.container_type.get("pilot") if container_name: use_container = True - logger.debug(f"container_name == \'{container_name}\' -> use_container = True") - else: - logger.debug('else -> use_container = False') elif copytool: # override for copytools - use a container for stage-in/out use_container = True - logger.debug('copytool -> use_container = False') - else: - logger.debug('not job -> use_container = False') return use_container -def wrapper(executable: str, **kwargs: Any) -> Callable[..., Any]: +def wrapper(executable: str, **kwargs: dict) -> Callable[..., Any]: """ - Wrapper function for any container specific usage. + Wrap given function for any container specific usage. + This function will be called by pilot.util.container.execute() and prepends the executable with a container command. :param executable: command to be executed (str) - :param kwargs: dictionary of key-word arguments (Any) + :param kwargs: dictionary of key-word arguments (dict) :return: executable wrapped with container command (Callable). """ workdir = kwargs.get('workdir', '.') @@ -147,7 +157,7 @@ def get_grid_image(platform: str) -> str: image = arch_and_os + ".img" _path1 = os.path.join(get_file_system_root_path(), "atlas.cern.ch/repo/containers/images/apptainer") _path2 = os.path.join(get_file_system_root_path(), "atlas.cern.ch/repo/containers/images/singularity") - paths = [path for path in [_path1, _path2] if os.path.isdir(path)] + paths = tuple(path for path in (_path1, _path2) if os.path.isdir(path)) _path = paths[0] path = os.path.join(_path, image) if not os.path.exists(path): @@ -161,16 +171,16 @@ def get_grid_image(platform: str) -> str: return path -def get_middleware_type(): +def get_middleware_type() -> str: """ Return the middleware type from the container type. + E.g. container_type = 'singularity:pilot;docker:wrapper;container:middleware' get_middleware_type() -> 'container', meaning that middleware should be taken from the container. The default is otherwise 'workernode', i.e. middleware is assumed to be present on the worker node. - :return: middleware_type (string) + :return: middleware_type (str). """ - middleware_type = "" container_type = infosys.queuedata.container_type @@ -192,20 +202,19 @@ def get_middleware_type(): return middleware_type -def extract_atlas_setup(asetup, swrelease): +def extract_atlas_setup(asetup: str, swrelease: str) -> tuple[str, str]: """ Extract the asetup command from the full setup command for jobs that have a defined release. + export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh --quiet;source $AtlasSetup/scripts/asetup.sh -> $AtlasSetup/scripts/asetup.sh, export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh --quiet; - :param asetup: full asetup command (string). - :param swrelease: ATLAS release (string). 
@@ -225,21 +234,19 @@ def extract_atlas_setup(asetup, swrelease):
     return atlas_setup, cleaned_atlas_setup
 
 
-def extract_full_atlas_setup(cmd, atlas_setup):
+def extract_full_atlas_setup(cmd: str, atlas_setup: str) -> tuple[str, str]:
     """
     Extract the full asetup (including options) from the payload setup command.
+
     atlas_setup is typically '$AtlasSetup/scripts/asetup.sh'.
 
-    :param cmd: full payload setup command (string).
-    :param atlas_setup: asetup command (string).
-    :return: extracted full asetup command, updated full payload setup command without asetup part (string).
+    :param cmd: full payload setup command (str)
+    :param atlas_setup: asetup command (str)
+    :return: extracted full asetup command (str), updated full payload setup command without asetup part (str) (tuple).
     """
-
     updated_cmds = []
     extracted_asetup = ""
 
-    logger.debug(f'cmd={cmd}, atlas_setup={atlas_setup}')
-
     if not atlas_setup:
         return extracted_asetup, cmd
@@ -254,21 +261,20 @@ def extract_full_atlas_setup(cmd, atlas_setup):
     except AttributeError as exc:
         logger.warning(f'exception caught while extracting full atlas setup: {exc}')
         updated_cmd = cmd
-    logger.debug(f'updated payload setup command: {updated_cmd}')
 
     return extracted_asetup, updated_cmd
 
 
-def update_alrb_setup(cmd, use_release_setup):
+def update_alrb_setup(cmd: str, use_release_setup: str) -> str:
     """
     Update the ALRB setup command.
+
     Add the ALRB_CONT_SETUPFILE in case the release setup file was created earlier (required available cvmfs).
 
     :param cmd: full ALRB setup command (string).
     :param use_release_setup: should the release setup file be added to the setup command? (Boolean).
     :return: updated ALRB setup command (string).
     """
-
     updated_cmds = []
     try:
         _cmd = cmd.split(';')
@@ -280,30 +286,28 @@ def update_alrb_setup(cmd, use_release_setup):
     except AttributeError as exc:
         logger.warning(f'exception caught while extracting full atlas setup: {exc}')
         updated_cmd = cmd
-    logger.debug(f'updated ALRB command: {updated_cmd}')
 
     return updated_cmd
 
 
-def update_for_user_proxy(_cmd, cmd, is_analysis=False, queue_type=''):
+def update_for_user_proxy(setup_cmd: str, cmd: str, is_analysis: bool = False, queue_type: str = '') -> tuple[int, str, str, str]:
     """
     Add the X509 user proxy to the container sub command string if set, and remove it from the main container command.
+
     Try to receive payload proxy and update X509_USER_PROXY in container setup command
     In case payload proxy from server is required, this function will also download and verify this proxy.
 
-    :param _cmd: container setup command (string).
-    :param cmd: command the container will execute (string).
-    :param is_analysis: True for user job (Boolean).
-    :param queue_type: queue type (e.g. 'unified') (string).
-    :return: exit_code (int), diagnostics (string), updated _cmd (string), updated cmd (string).
+    :param setup_cmd: container setup command (str)
+    :param cmd: command the container will execute (str)
+    :param is_analysis: True for user job (bool)
+    :param queue_type: queue type (e.g. 'unified') (str)
+    :return: exit_code (int), diagnostics (str), updated _cmd (str), updated cmd (str) (tuple).
""" - exit_code = 0 diagnostics = "" #x509 = os.environ.get('X509_USER_PROXY', '') x509 = os.environ.get('X509_UNIFIED_DISPATCH', os.environ.get('X509_USER_PROXY', '')) - logger.debug(f'using X509_USER_PROXY={x509}') if x509 != "": # do not include the X509_USER_PROXY in the command the container will execute cmd = cmd.replace(f"export X509_USER_PROXY={x509};", '') @@ -318,20 +322,19 @@ def update_for_user_proxy(_cmd, cmd, is_analysis=False, queue_type=''): logger.warning('payload proxy verification failed') # add X509_USER_PROXY setting to the container setup command - _cmd = f"export X509_USER_PROXY={x509};" + _cmd + setup_cmd = f"export X509_USER_PROXY={x509};" + setup_cmd - return exit_code, diagnostics, _cmd, cmd + return exit_code, diagnostics, setup_cmd, cmd -def set_platform(job, alrb_setup): +def set_platform(job: JobData, alrb_setup: str) -> str: """ Set thePlatform variable and add it to the sub container command. - :param job: job object. - :param alrb_setup: ALRB setup (string). - :return: updated ALRB setup (string). + :param job: job object (JobData) + :param alrb_setup: ALRB setup (str) + :return: updated ALRB setup (str). """ - if job.alrbuserplatform: alrb_setup += f'export thePlatform="{job.alrbuserplatform}";' elif job.preprocess and job.containeroptions: @@ -344,15 +347,15 @@ def set_platform(job, alrb_setup): return alrb_setup -def get_container_options(container_options): +def get_container_options(container_options: str) -> str: """ Get the container options from AGIS for the container execution command. + For Raythena ES jobs, replace the -C with "" (otherwise IPC does not work, needed by yampl). - :param container_options: container options from AGIS (string). - :return: updated container command (string). + :param container_options: container options from AGIS (str) + :return: updated container command (str). """ - is_raythena = os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') == 'raythena' opts = '' @@ -366,21 +369,20 @@ def get_container_options(container_options): container_options = container_options.replace('--containall', '') if container_options: opts += f'-e "{container_options}"' + # consider using options "-c -i -p" instead of "-C". The difference is that the latter blocks all environment + # variables by default and the former does not + # update: skip the -i to allow IPC, otherwise yampl won't work + elif is_raythena: + pass + # opts += 'export ALRB_CONT_CMDOPTS=\"$ALRB_CONT_CMDOPTS -c -i -p\";' else: - # consider using options "-c -i -p" instead of "-C". The difference is that the latter blocks all environment - # variables by default and the former does not - # update: skip the -i to allow IPC, otherwise yampl won't work - if is_raythena: - pass - # opts += 'export ALRB_CONT_CMDOPTS=\"$ALRB_CONT_CMDOPTS -c -i -p\";' - else: - #opts += '-e \"-C\"' - opts += '-e \"-c -i\"' + #opts += '-e \"-C\"' + opts += '-e \"-c -i\"' return opts -def alrb_wrapper(cmd: str, workdir: str, job: Any = None) -> str: +def alrb_wrapper(cmd: str, workdir: str, job: JobData = None) -> str: """ Wrap the given command with the special ALRB setup for containers E.g. cmd = /bin/bash hello_world.sh @@ -389,12 +391,13 @@ def alrb_wrapper(cmd: str, workdir: str, job: Any = None) -> str: export ALRB_CONT_RUNPAYLOAD="cmd' setupATLAS -c $thePlatform - :param cmd (string): command to be executed in a container. - :param workdir: (not used) - :param job: job object. - :return: prepended command with singularity/apptainer execution command (string). 
@@ -510,7 +513,6 @@ def add_docker_login(cmd: str, pandasecrets: dict) -> dict:
     :param pandasecrets: panda secrets (dict)
     :return: updated payload command (str).
     """
-
     pattern = r'docker://[^/]+/'
     tmp = json.loads(pandasecrets)
     docker_tokens = tmp.get('DOCKER_TOKENS', None)
@@ -552,19 +554,18 @@ def add_docker_login(cmd: str, pandasecrets: dict) -> dict:
     return cmd
 
 
-def add_asetup(job, alrb_setup, is_cvmfs, release_setup, container_script, container_options):
+def add_asetup(job: JobData, alrb_setup: str, is_cvmfs: bool, release_setup: str, container_script: str, container_options: str) -> str:
     """
     Add atlasLocalSetup and options to form the final payload command.
 
-    :param job: job object.
-    :param alrb_setup: ALRB setup (string).
-    :param is_cvmfs: True for cvmfs sites (Boolean).
-    :param release_setup: release setup (string).
-    :param container_script: container script name (string).
-    :param container_options: container options (string).
-    :return: final payload command (string).
+    :param job: job object (JobData)
+    :param alrb_setup: ALRB setup (str)
+    :param is_cvmfs: True for cvmfs sites (bool)
+    :param release_setup: release setup (str)
+    :param container_script: container script name (str)
+    :param container_options: container options (str)
+    :return: final payload command (str).
     """
-
     # this should not be necessary after the extract_container_image() in JobData update
     # containerImage should have been removed already
     if '--containerImage' in job.jobparams:
@@ -605,19 +606,19 @@ def add_asetup(job, alrb_setup, is_cvmfs, release_setup, container_script, conta
     return cmd
 
 
-def get_full_asetup(cmd, atlas_setup):
+def get_full_asetup(cmd: str, atlas_setup: str) -> str:
     """
     Extract the full asetup command from the payload execution command.
+
     (Easier that generating it again). We need to remove this command for stand-alone containers.
     Alternatively: do not include it in the first place (but this seems to trigger the need for further changes).
     atlas_setup is "source $AtlasSetup/scripts/asetup.sh", which is extracted in a previous step.
     The function typically returns:
     "source $AtlasSetup/scripts/asetup.sh 21.0,Athena,2020-05-19T2148,notest --makeflags='$MAKEFLAGS';".
 
-    :param cmd: payload execution command (string).
-    :param atlas_setup: extracted atlas setup (string).
-    :return: full atlas setup (string).
+    :param cmd: payload execution command (str)
+    :param atlas_setup: extracted atlas setup (str)
+    :return: full atlas setup (str).
     """
-
     pos = cmd.find(atlas_setup)
     cmd = cmd[pos:]  # remove everything before 'source $AtlasSetup/..'
     pos = cmd.find(';')
@@ -626,15 +627,14 @@ def get_full_asetup(cmd, atlas_setup):
     return cmd
 
 
-def replace_last_command(cmd, replacement):
+def replace_last_command(cmd: str, replacement: str) -> str:
     """
     Replace the last command in cmd with given replacement.
 
-    :param cmd: command (string).
-    :param replacement: replacement (string).
-    :return: updated command (string).
+    :param cmd: command (str)
+    :param replacement: replacement (str)
+    :return: updated command (str).
     """
-
     cmd = cmd.strip('; ')
     last_bit = cmd.split(';')[-1]
     cmd = cmd.replace(last_bit.strip(), replacement)
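# Quick illustration (not part of the patch) of replace_last_command(), whose body is fully
# visible above; the command strings are made up for the example.
original = "source setup.sh; echo start; ./payload.sh"
updated = original.replace(original.split(';')[-1].strip(), "./debug_payload.sh")
assert updated == "source setup.sh; echo start; ./debug_payload.sh"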
""" - cmd = cmd.strip('; ') last_bit = cmd.split(';')[-1] cmd = cmd.replace(last_bit.strip(), replacement) @@ -642,21 +642,20 @@ def replace_last_command(cmd, replacement): return cmd -def create_release_setup(cmd, atlas_setup, full_atlas_setup, release, workdir, is_cvmfs): +def create_release_setup(cmd: str, atlas_setup: str, full_atlas_setup: str, release: str, workdir: str, is_cvmfs: bool) -> tuple[str, str]: """ Get the proper release setup script name, and create the script if necessary. This function also updates the cmd string (removes full asetup from payload command). - :param cmd: Payload execution command (string). - :param atlas_setup: asetup command (string). - :param full_atlas_setup: full asetup command (string). - :param release: software release, needed to determine Athena environment (string). - :param workdir: job workdir (string). - :param is_cvmfs: does the queue have cvmfs? (Boolean). - :return: proper release setup name (string), updated cmd (string). + :param cmd: Payload execution command (str) + :param atlas_setup: asetup command (str) + :param full_atlas_setup: full asetup command (str) + :param release: software release, needed to determine Athena environment (str) + :param workdir: job workdir (str) + :param is_cvmfs: does the queue have cvmfs? (bool) + :return: proper release setup name (str), updated cmd (str). """ - release_setup_name = '/srv/my_release_setup.sh' # extracted_asetup should be written to 'my_release_setup.sh' and cmd to 'container_script.sh' @@ -687,9 +686,13 @@ def create_release_setup(cmd, atlas_setup, full_atlas_setup, release, workdir, i ## DEPRECATED, remove after verification with user container job -def remove_container_string(job_params): - """ Retrieve the container string from the job parameters """ +def remove_container_string(job_params: str) -> tuple[str, str]: + """ + Retrieve the container string from the job parameters. + :param job_params: job parameters (str) + :return: updated job parameters (str), extracted container path (str) (tuple). + """ pattern = r" \'?\-\-containerImage\=?\ ?([\S]+)\ ?\'?" compiled_pattern = re.compile(pattern) @@ -706,9 +709,10 @@ def remove_container_string(job_params): return job_params, container_path -def container_wrapper(cmd, workdir, job=None): +def container_wrapper(cmd: str, workdir: str, job: JobData = None) -> str: """ - Prepend the given command with the singularity/apptainer execution command + Prepend the given command with the singularity/apptainer execution command. + E.g. cmd = /bin/bash hello_world.sh -> singularity_command = singularity exec -B /bin/bash hello_world.sh singularity exec -B /cvmfs/atlas.cern.ch/repo/images/singularity/x86_64-slc6.img