From 042c9f68b5a1362fa5cb38c05aa268744a5b34a9 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 24 Sep 2020 10:45:55 +0200 Subject: [PATCH 01/33] New version 2.8.4.1 --- PILOTVERSION | 2 +- pilot/util/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f358d8cf6..38b918e79 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.3.3 \ No newline at end of file +2.8.4.1 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 842e02b54..6b044a76a 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -13,8 +13,8 @@ # Pilot version RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates -REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '3' # build number should be reset to '1' for every new development cycle +REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '1' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 806c49d918bf654f2f5e54c0cd182988d341ae5d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 24 Sep 2020 14:25:01 +0200 Subject: [PATCH 02/33] Improved actualcorecount calculation (any false positives are removed with grep -x) --- PILOTVERSION | 2 +- pilot/user/atlas/cpu.py | 5 ++++- pilot/util/constants.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 38b918e79..1a7596640 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.1 \ No newline at end of file +2.8.4.2 \ No newline at end of file diff --git a/pilot/user/atlas/cpu.py b/pilot/user/atlas/cpu.py index ddedaf618..8ca48e78c 100644 --- a/pilot/user/atlas/cpu.py +++ b/pilot/user/atlas/cpu.py @@ -70,7 +70,10 @@ def set_core_counts(job): log = get_logger(job.jobid) if job.pgrp: - cmd = "ps axo pgid,psr | sort | grep %d | uniq | wc -l" % job.pgrp + # ps axo pgid,psr -> 154628 8 \n 154628 9 \n 1546280 1 .. + # sort is redundant; uniq removes any duplicate lines; wc -l gives the final count + # awk is added to get the pgrp list only and then grep -x makes sure that false positives are removed, e.g. 1546280 + cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (job.pgrp, job.pgrp) exit_code, stdout, stderr = execute(cmd, mute=True) log.debug('%s: %s' % (cmd, stdout)) try: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6b044a76a..3519ed8da 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '1' # build number should be reset to '1' for every new development cycle +BUILD = '2' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From c130f891a04b31892b35155f5202aa1436871a5c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 24 Sep 2020 14:51:53 +0200 Subject: [PATCH 03/33] Printing mean actualcorecount to log --- pilot/control/job.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 49eaeaf1e..40c7bb823 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -43,7 +43,7 @@ is_harvester_mode, get_worker_attributes_file, publish_job_report, publish_work_report, get_event_status_file, \ publish_stageout_files from pilot.util.jobmetrics import get_job_metrics -# from pilot.util.math import mean +from pilot.util.math import mean from pilot.util.monitoring import job_monitor_tasks, check_local_space from pilot.util.monitoringtime import MonitoringTime from pilot.util.processes import cleanup, threads_aborted @@ -539,6 +539,8 @@ def get_data_structure(job, state, args, xml=None, metadata=None): if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': data['coreCount'] = job.corecount #data['coreCount'] = mean(job.corecounts) if job.corecounts else job.corecount + if job.corecounts: + log.info('mean actualcorecount: %f' % mean(job.corecounts)) # get the number of events, should report in heartbeat in case of preempted. if job.nevents != 0: From 4f01986c992c981a76ff7e95db778cda6a3eacd5 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 24 Sep 2020 15:13:23 +0200 Subject: [PATCH 04/33] Removed useless function. Started refactoring of main payload function in preparation for HPO loop. Added preliminary HPO loop --- pilot/control/payloads/generic.py | 45 +++++++++++++------------------ 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 7ec249c2a..96d3bb07a 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -47,28 +47,6 @@ def get_job(self): """ return self.__job - def setup_payload(self, job, out, err): - """ - (add description) - :param job: - :param out: - :param err: - :return: - """ - # log = get_logger(job.jobid, logger) - - # try: - # create symbolic link for sqlite200 and geomDB in job dir - # for db_name in ['sqlite200', 'geomDB']: - # src = '/cvmfs/atlas.cern.ch/repo/sw/database/DBRelease/current/%s' % db_name - # link_name = 'job-%s/%s' % (job.jobid, db_name) - # os.symlink(src, link_name) - # except Exception as e: - # log.error('could not create symbolic links to database files: %s' % e) - # return False - - return True - def pre_setup(self, job): """ Functions to run pre setup @@ -253,7 +231,7 @@ def post_payload(self, job): def run_payload(self, job, out, err): """ - Setup and execute the preprocess, payload and postprocess commands. + Setup and execute the main payload process. :param job: job object. :param out: (currently not used; deprecated) @@ -399,7 +377,9 @@ def wait_graceful(self, args, proc, job): def run(self): """ - (add description) + Run all payload processes (including pre- and post-processes, and utilities). + In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special + exit code. :return: """ log = get_logger(str(self.__job.jobid), logger) @@ -407,9 +387,19 @@ def run(self): exit_code = 1 pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - if self.setup_payload(self.__job, self.__out, self.__err): + # prepare for main payload + + # a loop is needed for HPO jobs + # abort when nothing more to run, or when the preprocess returns a special exit code + is_hpo = False + while True: + # first run the preprocess (if necessary) + + # now run the main payload, when it finishes, run the postprocess (if necessary) proc = self.run_payload(self.__job, self.__out, self.__err) - if proc is not None: + if proc is None: + break + else: # the process is now running, update the server send_state(self.__job, self.__args, self.__job.state) @@ -451,4 +441,7 @@ def run(self): user.post_utility_command_action(utcmd, self.__job) + if not is_hpo: + break + return exit_code From 12f7b6c27b33ab9c980f0447a84722caf603c830 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 24 Sep 2020 17:00:04 +0200 Subject: [PATCH 05/33] Refactorings --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 134 +++++++++++++++++++----------- pilot/util/constants.py | 2 +- 3 files changed, 86 insertions(+), 52 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 1a7596640..7885c2ec3 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.2 \ No newline at end of file +2.8.4.3 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 96d3bb07a..8c2a4236c 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -88,8 +88,9 @@ def utility_before_payload(self, job): def utility_with_payload(self, job): """ - Functions to run with payload - :param job: job object + Functions to run with payload. + + :param job: job object. """ log = get_logger(job.jobid, logger) @@ -103,6 +104,8 @@ def utility_with_payload(self, job): cmd = '%s %s' % (cmd_dictionary.get('command'), cmd_dictionary.get('args')) log.debug('utility command to be executed with the payload: %s' % cmd) + return cmd + def utility_after_payload_started(self, job): """ Functions to run after payload started @@ -229,7 +232,7 @@ def post_payload(self, job): # write time stamps to pilot timing file add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), self.__args) - def run_payload(self, job, out, err): + def run_payload(self, job, cmd, out, err): """ Setup and execute the main payload process. @@ -241,55 +244,14 @@ def run_payload(self, job, out, err): log = get_logger(job.jobid, logger) - self.pre_setup(job) - - # get the payload command from the user specific code - pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 - - self.post_setup(job) - - try: - cmd_before_payload = self.utility_before_payload(job) - except Exception as e: - log.error(e) - raise e + # main payload process steps # add time for PILOT_PRE_PAYLOAD self.pre_payload(job) - self.utility_with_payload(job) - - # for testing looping job: cmd = user.get_payload_command(job) + ';sleep 240' - try: - cmd = user.get_payload_command(job) - except PilotException as error: - import traceback - log.error(traceback.format_exc()) - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) - self.__traces.pilot['error_code'] = job.piloterrorcodes[0] - log.fatal('could not define payload command (traces error set to: %d)' % self.__traces.pilot['error_code']) - return None - - # preprocess - - # extract the setup in case the preprocess command needs it - job.setup = self.extract_setup(cmd) - if cmd_before_payload: - cmd_before_payload = job.setup + cmd_before_payload - log.info("\n\npreprocess execution command:\n\n%s\n" % cmd_before_payload) - exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') - if exit_code: - log.fatal('cannot continue since preprocess failed') - return None - else: - # in case the preprocess produced a command, chmod it - path = os.path.join(job.workdir, job.containeroptions.get('containerExec', 'does_not_exist')) - if os.path.exists(path): - log.debug('chmod 0o755: %s' % path) - os.chmod(path, 0o755) - - # main payload process + _cmd = self.utility_with_payload(job) + if _cmd: + log.info('could have executed: %s (currently not used)' % _cmd) log.info("\n\npayload execution command:\n\n%s\n" % cmd) try: @@ -375,6 +337,70 @@ def wait_graceful(self, args, proc, job): return exit_code + def get_payload_command(self, job): + """ + Return the payload command string. + + :param job: job object. + :return: command (string). + """ + + log = get_logger(str(job), logger) + + cmd = "" + # for testing looping job: cmd = user.get_payload_command(job) + ';sleep 240' + try: + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], + 0) # Python 2/3 + cmd = user.get_payload_command(job) + except PilotException as error: + self.post_setup(job) + import traceback + log.error(traceback.format_exc()) + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code()) + self.__traces.pilot['error_code'] = job.piloterrorcodes[0] + log.fatal( + 'could not define payload command (traces error set to: %d)' % self.__traces.pilot['error_code']) + + return cmd + + def run_preprocess(self, job): + """ + Run any preprocess payloads. + + :param job: job object. + :return: + """ + + log = get_logger(str(self.__job.jobid), logger) + exit_code = 0 + + try: + cmd_before_payload = self.utility_before_payload(job) + except Exception as e: + log.error(e) + raise e + + if cmd_before_payload: + cmd_before_payload = job.setup + cmd_before_payload + log.info("\n\npreprocess execution command:\n\n%s\n" % cmd_before_payload) + exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') + if exit_code == 42: + log.fatal('no more HP points - time to abort') + elif exit_code: + # set error code + # .. + log.fatal('cannot continue since preprocess failed') + else: + # in case the preprocess produced a command, chmod it + path = os.path.join(job.workdir, job.containeroptions.get('containerExec', 'does_not_exist')) + if os.path.exists(path): + log.debug('chmod 0o755: %s' % path) + os.chmod(path, 0o755) + + return exit_code + def run(self): """ Run all payload processes (including pre- and post-processes, and utilities). @@ -387,16 +413,24 @@ def run(self): exit_code = 1 pilot_user = os.environ.get('PILOT_USER', 'generic').lower() - # prepare for main payload + # get the payload command from the user specific code + self.pre_setup(self.__job) + cmd = self.get_payload_command(self.__job) + # extract the setup in case the preprocess command needs it + self.__job.setup = self.extract_setup(cmd) + self.post_setup(self.__job) # a loop is needed for HPO jobs # abort when nothing more to run, or when the preprocess returns a special exit code is_hpo = False while True: # first run the preprocess (if necessary) + exit_code = self.run_preprocess(self.__job) + if exit_code: + break # now run the main payload, when it finishes, run the postprocess (if necessary) - proc = self.run_payload(self.__job, self.__out, self.__err) + proc = self.run_payload(self.__job, cmd, self.__out, self.__err) if proc is None: break else: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3519ed8da..0c08bbe93 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '2' # build number should be reset to '1' for every new development cycle +BUILD = '3' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 443c94b2d840db52c2d77f9b1f2636bcf4ae3d0f Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sat, 26 Sep 2020 19:12:15 -0500 Subject: [PATCH 06/33] fix file type check for python 3 --- pilot/eventservice/esprocess/esprocess.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 78e4a7607..765356094 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -7,6 +7,7 @@ # - Wen Guan, wen.guan@cern.ch, 2017-2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2019 +import io import json import logging import os @@ -29,6 +30,12 @@ logger = logging.getLogger(__name__) +try: + file_type = file +except NameError: + file_type = io.IOBase + + """ Main process to handle event service. It makes use of two hooks get_event_ranges_hook and handle_out_message_hook to communicate with other processes when @@ -161,7 +168,7 @@ def init_payload_process(self): # noqa: C901 executable = 'cd %s; %s' % (workdir, executable) if 'output_file' in self.__payload: - if type(self.__payload['output_file']) in [file]: + if isinstance(self.__payload['output_file'], file_type): output_file_fd = self.__payload['output_file'] else: if '/' in self.__payload['output_file']: @@ -174,7 +181,7 @@ def init_payload_process(self): # noqa: C901 output_file_fd = open(output_file, 'w') if 'error_file' in self.__payload: - if type(self.__payload['error_file']) in [file]: + if isinstance(self.__payload['error_file'], file_type): error_file_fd = self.__payload['error_file'] else: if '/' in self.__payload['error_file']: From b163a9b6bf1125212dda3e6bb57188022413cbf0 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sun, 27 Sep 2020 06:53:47 -0500 Subject: [PATCH 07/33] fix python3 threading _stop and change isSet() to is_set() --- .../communicationmanager/communicationmanager.py | 2 +- pilot/eventservice/esprocess/esmessage.py | 6 +++--- pilot/eventservice/workexecutor/plugins/baseexecutor.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pilot/eventservice/communicationmanager/communicationmanager.py b/pilot/eventservice/communicationmanager/communicationmanager.py index 9f3ba599b..2d1053cff 100644 --- a/pilot/eventservice/communicationmanager/communicationmanager.py +++ b/pilot/eventservice/communicationmanager/communicationmanager.py @@ -159,7 +159,7 @@ def is_stop(self): :returns: True if the stop signal is set, otherwise False """ - return self.stop_event.isSet() + return self.stop_event.is_set() def get_jobs(self, njobs=1, post_hook=None, args=None): """ diff --git a/pilot/eventservice/esprocess/esmessage.py b/pilot/eventservice/esprocess/esmessage.py index 7965a89b8..e2d65e37d 100644 --- a/pilot/eventservice/esprocess/esmessage.py +++ b/pilot/eventservice/esprocess/esmessage.py @@ -39,7 +39,7 @@ def __init__(self, message_queue, socket_name=None, context='local', **kwds): self.setName("MessageThread") self.__message_queue = message_queue self._socket_name = socket_name - self._stop = threading.Event() + self.__stop = threading.Event() logger.info('try to import yampl') try: @@ -82,7 +82,7 @@ def stop(self): Set stop event. """ logger.debug('set stop event') - self._stop.set() + self.__stop.set() def is_stopped(self): """ @@ -90,7 +90,7 @@ def is_stopped(self): :returns: True if stop event is set, otherwise False. """ - return self._stop.isSet() + return self.__stop.is_set() def terminate(self): """ diff --git a/pilot/eventservice/workexecutor/plugins/baseexecutor.py b/pilot/eventservice/workexecutor/plugins/baseexecutor.py index 613ea9a0b..ae565399e 100644 --- a/pilot/eventservice/workexecutor/plugins/baseexecutor.py +++ b/pilot/eventservice/workexecutor/plugins/baseexecutor.py @@ -66,7 +66,7 @@ def stop(self): self.__stop.set() def is_stop(self): - return self.__stop.isSet() + return self.__stop.is_set() def stop_communicator(self): logger.info("Stopping communication manager") From 2203afa855bd72fe233ac43c93e769fca98dfbfb Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sun, 27 Sep 2020 08:53:24 -0500 Subject: [PATCH 08/33] fix python3 type error --- pilot/eventservice/esprocess/esprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 765356094..443e7463c 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -475,7 +475,7 @@ def handle_messages(self): pass else: logger.debug('received message from payload: %s' % message) - if "Ready for events" in message: + if "Ready for events" in str(message): event_ranges = self.get_event_range_to_payload() if not event_ranges: event_ranges = "No more events" From 100db31bb8817aa4d95283073b6eec05dbf6ecdd Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sun, 27 Sep 2020 10:53:52 -0500 Subject: [PATCH 09/33] fix parse_out_message function --- pilot/eventservice/esprocess/esprocess.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 443e7463c..d46432d6b 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -403,6 +403,7 @@ def parse_out_message(self, message): UnknownException: when other unknown exception is caught. """ + message = str(message) # needed for Python 3 logger.debug('parsing message: %s' % message) try: if message.startswith("/"): From 04662ec3b153cdd99c8988b27ec659150d42d14d Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sun, 27 Sep 2020 12:37:19 -0500 Subject: [PATCH 10/33] convert out message to string --- pilot/eventservice/esprocess/esprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index d46432d6b..a4c6c0c4a 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -475,8 +475,9 @@ def handle_messages(self): except queue.Empty: pass else: + message = str(message) # convert to string - works Python 2 or Python 3 logger.debug('received message from payload: %s' % message) - if "Ready for events" in str(message): + if "Ready for events" in message: event_ranges = self.get_event_range_to_payload() if not event_ranges: event_ranges = "No more events" From 9bf8ba11cede2efbb7a3f56f98c049cf5f1f58d2 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sun, 27 Sep 2020 17:00:16 -0500 Subject: [PATCH 11/33] convert out message to str - Python3 --- pilot/eventservice/esprocess/esprocess.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index a4c6c0c4a..cb004d4bc 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -13,6 +13,7 @@ import os import re import subprocess +import sys import time import threading import traceback @@ -403,7 +404,6 @@ def parse_out_message(self, message): UnknownException: when other unknown exception is caught. """ - message = str(message) # needed for Python 3 logger.debug('parsing message: %s' % message) try: if message.startswith("/"): @@ -475,8 +475,11 @@ def handle_messages(self): except queue.Empty: pass else: - message = str(message) # convert to string - works Python 2 or Python 3 logger.debug('received message from payload: %s' % message) + logger.debug('type of received message from payload: %s' % type(message)) + if (sys.version_info > (3, 0)): # needed for Python 3 + message = message.decode('utf-8') + logger.debug('type of converted received message : %s' % type(message)) if "Ready for events" in message: event_ranges = self.get_event_range_to_payload() if not event_ranges: From 99c391c8c36e583d5373ad60ab56f1e791848b66 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sun, 27 Sep 2020 17:39:24 -0500 Subject: [PATCH 12/33] remove extra debug print messages --- pilot/eventservice/esprocess/esprocess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index cb004d4bc..7079d5534 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -475,11 +475,9 @@ def handle_messages(self): except queue.Empty: pass else: - logger.debug('received message from payload: %s' % message) - logger.debug('type of received message from payload: %s' % type(message)) if (sys.version_info > (3, 0)): # needed for Python 3 message = message.decode('utf-8') - logger.debug('type of converted received message : %s' % type(message)) + logger.debug('received message from payload: %s' % message) if "Ready for events" in message: event_ranges = self.get_event_range_to_payload() if not event_ranges: From 84adbd63e8ef7210235ee1016fd47e90043469a4 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sun, 27 Sep 2020 21:46:35 -0500 Subject: [PATCH 13/33] add python3 message encoding --- pilot/eventservice/esprocess/esprocess.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 7079d5534..335f5cc76 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -384,6 +384,8 @@ def send_event_ranges_to_payload(self, event_ranges): msg = None if "No more events" in event_ranges: msg = event_ranges + if (sys.version_info > (3, 0)): # needed for Python 3 + msg = msg.encode('utf-8') self.is_no_more_events = True self.__no_more_event_time = time.time() else: From 5ac989ea0c84fb8dac352f46f062db8511939855 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 1 Oct 2020 16:08:34 +0200 Subject: [PATCH 14/33] Now sending meanCoreCount. Fallback mechanism for middleware container, rucio->CentOS7. Specifying middleware_container in default.cfg. --- PILOTVERSION | 2 +- pilot/control/interceptor.py | 22 +++++++++++++++ pilot/control/job.py | 11 +++++--- pilot/control/payload.py | 7 +++++ pilot/scripts/stagein.py | 2 +- pilot/scripts/stageout.py | 2 +- pilot/user/atlas/container.py | 50 ++++++++++++++++++++++++++++------- pilot/user/atlas/cpu.py | 5 ++++ pilot/user/generic/cpu.py | 2 +- pilot/util/constants.py | 2 +- pilot/util/default.cfg | 4 +-- pilot/workflow/generic.py | 4 +-- 12 files changed, 92 insertions(+), 21 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 7885c2ec3..00652e1ad 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.3 \ No newline at end of file +2.8.4.8 \ No newline at end of file diff --git a/pilot/control/interceptor.py b/pilot/control/interceptor.py index ebd19902c..e7987a3a3 100644 --- a/pilot/control/interceptor.py +++ b/pilot/control/interceptor.py @@ -15,6 +15,7 @@ import queue # Python 3 from pilot.common.exception import ExcThread +from pilot.util.processes import threads_aborted import logging logger = logging.getLogger(__name__) @@ -57,6 +58,13 @@ def run(args): time.sleep(0.5) + # proceed to set the job_aborted flag? + if threads_aborted(): + logger.debug('will proceed to set job_aborted') + args.job_aborted.set() + else: + logger.debug('will not set job_aborted yet') + logger.debug('[interceptor] run thread has finished') @@ -71,6 +79,13 @@ def receive(args): while not args.graceful_stop.is_set(): time.sleep(0.5) + # proceed to set the job_aborted flag? + if threads_aborted(): + logger.debug('will proceed to set job_aborted') + args.job_aborted.set() + else: + logger.debug('will not set job_aborted yet') + logger.debug('[interceptor] receive thread has finished') @@ -85,4 +100,11 @@ def send(args): while not args.graceful_stop.is_set(): time.sleep(0.5) + # proceed to set the job_aborted flag? + if threads_aborted(): + logger.debug('will proceed to set job_aborted') + args.job_aborted.set() + else: + logger.debug('will not set job_aborted yet') + logger.debug('[interceptor] receive send has finished') diff --git a/pilot/control/job.py b/pilot/control/job.py index 40c7bb823..ec3fc53cb 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -539,8 +539,10 @@ def get_data_structure(job, state, args, xml=None, metadata=None): if job.corecount and job.corecount != 'null' and job.corecount != 'NULL': data['coreCount'] = job.corecount #data['coreCount'] = mean(job.corecounts) if job.corecounts else job.corecount - if job.corecounts: - log.info('mean actualcorecount: %f' % mean(job.corecounts)) + if job.corecounts: + _mean = mean(job.corecounts) + log.info('mean actualcorecount: %f' % _mean) + data['meanCoreCount'] = _mean # get the number of events, should report in heartbeat in case of preempted. if job.nevents != 0: @@ -1474,6 +1476,9 @@ def retrieve(queues, traces, args): # noqa: C901 jobnumber += 1 while not args.graceful_stop.is_set(): if has_job_completed(queues, args): + #import signal + #os.kill(os.getpid(), signal.SIGTERM) + args.job_aborted.clear() args.abort_job.clear() logger.info('ready for new job') @@ -1913,7 +1918,7 @@ def get_finished_or_failed_job(args, queues): def get_heartbeat_period(debug=False): """ Return the proper heartbeat period, as determined by normal or debug mode. - In normal mode, the hearbeat period is 30*60 s, while in debug mode it is 5*60 s. Both values are defined in the + In normal mode, the heartbeat period is 30*60 s, while in debug mode it is 5*60 s. Both values are defined in the config file. :param debug: Boolean, True for debug mode. False otherwise. diff --git a/pilot/control/payload.py b/pilot/control/payload.py index a5a917383..4de430b9b 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -441,4 +441,11 @@ def failed_post(queues, traces, args): set_pilot_state(job=job, state='stageout') put_in_queue(job, queues.data_out) + # proceed to set the job_aborted flag? + if threads_aborted(): + logger.debug('will proceed to set job_aborted') + args.job_aborted.set() + else: + logger.debug('will not set job_aborted yet') + logger.info('[payload] failed_post thread has finished') diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index d27e8bd19..a67f7c65a 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#do not use: #!/usr/bin/env python3 import argparse import os import re diff --git a/pilot/scripts/stageout.py b/pilot/scripts/stageout.py index 2b694e4da..0f9e2b230 100644 --- a/pilot/scripts/stageout.py +++ b/pilot/scripts/stageout.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#do not use: #!/usr/bin/env python3 import argparse import os import re diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 0762bdd6d..2499a5f23 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -752,9 +752,8 @@ def create_middleware_container_command(workdir, cmd, label='stagein'): command = 'cd %s;' % workdir # add bits and pieces for the containerisation - content = 'lsetup rucio davix xrootd\n%s\nexit $?' % cmd - logger.debug('setup.sh content:\n%s' % content) - + middleware_container = get_middleware_container() + content = get_middleware_container_script(middleware_container, cmd) # store it in setup.sh script_name = 'stagein.sh' if label == 'stage-in' else 'stageout.sh' try: @@ -767,14 +766,47 @@ def create_middleware_container_command(workdir, cmd, label='stagein'): x509 = os.environ.get('X509_USER_PROXY', '') if x509: command += 'export X509_USER_PROXY=%s;' % x509 - pythonpath = 'export PYTHONPATH=%s:$PYTHONPATH;' % os.path.join(workdir, 'pilot2') + pythonpath = '' #'export PYTHONPATH=%s;' % os.path.join(workdir, 'pilot2') + #pythonpath = 'export PYTHONPATH=%s:$PYTHONPATH;' % os.path.join(workdir, 'pilot2') #pythonpath = 'export PYTHONPATH=/cvmfs/atlas.cern.ch/repo/sw/PandaPilot/pilot2/latest:$PYTHONPATH;' command += 'export ALRB_CONT_RUNPAYLOAD=\"%ssource /srv/%s\";' % (pythonpath, script_name) command += get_asetup(alrb=True) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; - command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c centos7' - #path = '/cvmfs/unpacked.cern.ch/registry.hub.docker.com/atlas/rucio-clients:default' - # verify path .. - #command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % path - + command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % middleware_container + #if not 'rucio' in middleware_container: + # command += ' --nocvmfs' logger.debug('container command: %s' % command) return command + + +def get_middleware_container_script(middleware_container, cmd): + """ + Return the content of the middleware container script. + + :param middleware_container: container image (string). + :param cmd: isolated stage-in/out command (string). + :return: script content (string). + """ + + if 'rucio' in middleware_container: + content = 'python3 %s\nexit $?' % cmd + else: + content = 'lsetup rucio davix xrootd;python %s\nexit $?' % cmd + logger.debug('setup.sh content:\n%s' % content) + + return content + + +def get_middleware_container(): + """ + Return the middleware container. + + :return: path (string). + """ + + path = config.Container.middleware_container + if not os.path.exists(path): + logger.warning('requested middleware container path does not exist: %s (switching to default value)' % path) + path = 'CentOS7' + logger.info('using image: %s for middleware container' % path) + + return path diff --git a/pilot/user/atlas/cpu.py b/pilot/user/atlas/cpu.py index 8ca48e78c..14fea895d 100644 --- a/pilot/user/atlas/cpu.py +++ b/pilot/user/atlas/cpu.py @@ -70,6 +70,11 @@ def set_core_counts(job): log = get_logger(job.jobid) if job.pgrp: + # for debugging + cmd = "ps axo pgid,psr,comm,args | grep %d" % job.pgrp + exit_code, stdout, stderr = execute(cmd, mute=True) + log.debug('%s:\n%s\n' % (cmd, stdout)) + # ps axo pgid,psr -> 154628 8 \n 154628 9 \n 1546280 1 .. # sort is redundant; uniq removes any duplicate lines; wc -l gives the final count # awk is added to get the pgrp list only and then grep -x makes sure that false positives are removed, e.g. 1546280 diff --git a/pilot/user/generic/cpu.py b/pilot/user/generic/cpu.py index 327791968..68d412584 100644 --- a/pilot/user/generic/cpu.py +++ b/pilot/user/generic/cpu.py @@ -45,7 +45,7 @@ def set_core_counts(job): log = get_logger(job.jobid) if job.pgrp: - cmd = "ps axo pgid,psr | sort | grep %d | uniq | wc -l" % job.pgrp + cmd = "ps axo pgid,psr | sort | grep %d | uniq | awk '{print $1}' | grep -x %d | wc -l" % (job.pgrp, job.pgrp) exit_code, stdout, stderr = execute(cmd, mute=True) log.debug('%s: %s' % (cmd, stdout)) try: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 0c08bbe93..af5bc11a7 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '3' # build number should be reset to '1' for every new development cycle +BUILD = '8' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index 567c1323a..3621c1545 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -214,10 +214,10 @@ stageout_dictionary: stageout_dictionary.json middleware_stageout_stdout: stageout_stdout.txt middleware_stageout_stderr: stageout_stderr.txt -# Name of middleware image (to be revised) +# Name of middleware image # This image is used if middleware is not found locally on the worker node. Middleware is expected to be present # in the container image -middleware_container: +middleware_container: /cvmfs/unpacked.cern.ch/registry.hub.docker.com/atlas/rucio-clients:default ################################ # Harvester parameters diff --git a/pilot/workflow/generic.py b/pilot/workflow/generic.py index 4800da1fd..27f5c84e8 100644 --- a/pilot/workflow/generic.py +++ b/pilot/workflow/generic.py @@ -79,10 +79,10 @@ def interrupt(args, signum, frame): args.signal = sig logger.warning('will instruct threads to abort and update the server') args.abort_job.set() + logger.warning('setting graceful stop (in case it was not set already)') + args.graceful_stop.set() logger.warning('waiting for threads to finish') args.job_aborted.wait() - logger.warning('setting graceful stop (in case it was not set already), pilot will abort') - args.graceful_stop.set() def register_signals(signals, args): From 8d3347807a4de7b76e0b925082b9675de02bdb2d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 1 Oct 2020 16:59:11 +0200 Subject: [PATCH 15/33] Renaming log files after each HPO iteration --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 46 +++++++++++++++++++++++++++---- pilot/util/constants.py | 2 +- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 00652e1ad..fc16c35b4 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.8 \ No newline at end of file +2.8.4.9 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 8c2a4236c..72a446528 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -19,6 +19,7 @@ from pilot.common.errorcodes import ErrorCodes from pilot.control.job import send_state from pilot.util.auxiliary import get_logger, set_pilot_state +from pilot.util.config import config from pilot.util.container import execute from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED, \ UTILITY_AFTER_PAYLOAD_FINISHED, PILOT_PRE_SETUP, PILOT_POST_SETUP, PILOT_PRE_PAYLOAD, PILOT_POST_PAYLOAD @@ -39,6 +40,12 @@ def __init__(self, args, job, out, err, traces): self.__out = out self.__err = err self.__traces = traces + self.__payload_stdout = config.Payload.payloadstdout + self.__payload_stderr = config.Payload.payloadstderr + self.__preprocess_stdout = '' + self.__preprocess_stderr = '' + self.__postprocess_stdout = '' + self.__postprocess_stderr = '' def get_job(self): """ @@ -204,6 +211,14 @@ def write_utility_output(self, workdir, step, stdout, stderr): # dump to file try: + name_stdout = step + '_stdout.txt' + name_stderr = step + '_stderr.txt' + if step == 'preprocess': + self.__preprocess_stdout = name_stdout + self.__preprocess_stderr = name_stderr + elif step == 'postprocess': + self.__postprocess_stdout = name_stdout + self.__postprocess_stderr = name_stderr write_file(os.path.join(workdir, step + '_stdout.txt'), stdout, unique=True) except PilotException as e: logger.warning('failed to write utility stdout to file: %s, %s' % (e, stdout)) @@ -386,12 +401,12 @@ def run_preprocess(self, job): cmd_before_payload = job.setup + cmd_before_payload log.info("\n\npreprocess execution command:\n\n%s\n" % cmd_before_payload) exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') - if exit_code == 42: + if exit_code == 160: log.fatal('no more HP points - time to abort') elif exit_code: # set error code - # .. - log.fatal('cannot continue since preprocess failed') + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) + log.fatal('cannot continue since preprocess failed: exit_code=%d' % exit_code) else: # in case the preprocess produced a command, chmod it path = os.path.join(job.workdir, job.containeroptions.get('containerExec', 'does_not_exist')) @@ -422,8 +437,10 @@ def run(self): # a loop is needed for HPO jobs # abort when nothing more to run, or when the preprocess returns a special exit code - is_hpo = False + iteration = 1 while True: + log.info('payload iteration loop #%d' % iteration) + # first run the preprocess (if necessary) exit_code = self.run_preprocess(self.__job) if exit_code: @@ -475,7 +492,26 @@ def run(self): user.post_utility_command_action(utcmd, self.__job) - if not is_hpo: + if self.__job.is_hpo: + # in case there are more hyper-parameter points, move away the previous log files + self.rename_log_files(iteration) + iteration += 1 + else: break return exit_code + + def rename_log_files(self, iteration): + """ + + :param iteration: + :return: + """ + + names = [self.__payload_stdout, self.__payload_stderr, self.__preprocess_stdout, self.__preprocess_stderr, + self.__postprocess_stdout, self.__postprocess_stderr] + for name in names: + if os.path.exists(name): + os.rename(name, name + '%d' % iteration) + else: + logger.warning('cannot rename %s since it does not exist' % name) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index af5bc11a7..44c26f3e3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '8' # build number should be reset to '1' for every new development cycle +BUILD = '9' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From b1827f7b862debeda20dc6a826e931688fa2e5a0 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 2 Oct 2020 11:23:45 +0200 Subject: [PATCH 16/33] Added reset of exit code 160 --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 6 ++++-- pilot/util/constants.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index fc16c35b4..d36b254ae 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.9 \ No newline at end of file +2.8.4.10 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 72a446528..99143e90f 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -402,7 +402,7 @@ def run_preprocess(self, job): log.info("\n\npreprocess execution command:\n\n%s\n" % cmd_before_payload) exit_code = self.execute_utility_command(cmd_before_payload, job, 'preprocess') if exit_code == 160: - log.fatal('no more HP points - time to abort') + log.fatal('no more HP points - time to abort processing loop') elif exit_code: # set error code job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PREPROCESSFAILURE) @@ -444,6 +444,8 @@ def run(self): # first run the preprocess (if necessary) exit_code = self.run_preprocess(self.__job) if exit_code: + if exit_code == 160: + exit_code = 0 break # now run the main payload, when it finishes, run the postprocess (if necessary) @@ -494,7 +496,7 @@ def run(self): if self.__job.is_hpo: # in case there are more hyper-parameter points, move away the previous log files - self.rename_log_files(iteration) + #self.rename_log_files(iteration) iteration += 1 else: break diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 44c26f3e3..3fb094f10 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '9' # build number should be reset to '1' for every new development cycle +BUILD = '10' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 2670b7595133880dd82d29669e6a83330841c0cb Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 2 Oct 2020 17:07:02 +0200 Subject: [PATCH 17/33] Corrections --- PILOTVERSION | 2 +- pilot/control/payloads/generic.py | 9 +++++++-- pilot/user/atlas/diagnose.py | 5 +++++ pilot/util/constants.py | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index d36b254ae..7d73b5cec 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.10 \ No newline at end of file +2.8.4.13 \ No newline at end of file diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 99143e90f..6030a98dd 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -182,14 +182,15 @@ def execute_utility_command(self, cmd, job, label): log = get_logger(job.jobid, logger) exit_code, stdout, stderr = execute(cmd, workdir=job.workdir, cwd=job.workdir, usecontainer=False) if exit_code: - log.warning('failed to run command: %s (exit code = %d) - see utility logs for details' % (cmd, exit_code)) + log.warning('command returned non-zero exit code: %s (exit code = %d) - see utility logs for details' % (cmd, exit_code)) if label == 'preprocess': err = errors.PREPROCESSFAILURE elif label == 'postprocess': err = errors.POSTPROCESSFAILURE else: err = errors.UNKNOWNPAYLOADFAILURE - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(err) + if exit_code != 160: # ignore no-more-data-points exit code + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(err) # write output to log files self.write_utility_output(job.workdir, label, stdout, stderr) @@ -449,6 +450,10 @@ def run(self): break # now run the main payload, when it finishes, run the postprocess (if necessary) + log.debug('job=%s' % str(self.__job)) + log.debug('cmd=%s' % str(cmd)) + log.debug('out=%s' % self.__out) + log.debug('err=%s' % self.__err) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) if proc is None: break diff --git a/pilot/user/atlas/diagnose.py b/pilot/user/atlas/diagnose.py index cd9934bed..d882213b8 100644 --- a/pilot/user/atlas/diagnose.py +++ b/pilot/user/atlas/diagnose.py @@ -53,6 +53,11 @@ def interpret(job): if exit_code == 146: log.warning('user tarball was not downloaded (payload exit code %d)' % exit_code) set_error_nousertarball(job) + elif exit_code == 160: + log.info('ignoring harmless preprocess exit code %d' % exit_code) + job.transexitcode = 0 + job.exitcode = 0 + exit_code = 0 # extract special information, e.g. number of events try: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3fb094f10..6c460863f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '10' # build number should be reset to '1' for every new development cycle +BUILD = '13' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 4ebbbc48eccf5e80695cbee774b2bab356179935 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 6 Oct 2020 10:12:59 +0200 Subject: [PATCH 18/33] Corrections --- PILOTVERSION | 2 +- pilot/control/payloads/eventservice.py | 9 +++++---- pilot/control/payloads/generic.py | 4 ---- pilot/util/constants.py | 2 +- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 7d73b5cec..e3631dbe0 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.13 \ No newline at end of file +2.8.4.14 \ No newline at end of file diff --git a/pilot/control/payloads/eventservice.py b/pilot/control/payloads/eventservice.py index 8cbd03096..2682ddb59 100644 --- a/pilot/control/payloads/eventservice.py +++ b/pilot/control/payloads/eventservice.py @@ -26,13 +26,14 @@ class Executor(generic.Executor): def __init__(self, args, job, out, err, traces): super(Executor, self).__init__(args, job, out, err, traces) - def run_payload(self, job, out, err): + def run_payload(self, job, cmd, out, err): """ (add description) - :param job: - :param out: - :param err: + :param job: job object. + :param cmd: (unused in ES mode) + :param out: stdout file object. + :param err: stderr file object. :return: """ log = get_logger(job.jobid, logger) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 6030a98dd..3a752a0d1 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -450,10 +450,6 @@ def run(self): break # now run the main payload, when it finishes, run the postprocess (if necessary) - log.debug('job=%s' % str(self.__job)) - log.debug('cmd=%s' % str(cmd)) - log.debug('out=%s' % self.__out) - log.debug('err=%s' % self.__err) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) if proc is None: break diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6c460863f..d841e9af8 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '13' # build number should be reset to '1' for every new development cycle +BUILD = '14' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From b407d6d21e4074e24ff2b5a170d8b2235847bc59 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 7 Oct 2020 13:23:20 +0200 Subject: [PATCH 19/33] Added remote file open verification script --- PILOTVERSION | 2 +- pilot/scripts/open_remote_file.py | 117 ++++++++++++++++++++++++++++++ pilot/scripts/stagein.py | 8 ++ pilot/scripts/stageout.py | 7 ++ pilot/util/constants.py | 2 +- pilot/util/default.cfg | 6 +- 6 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 pilot/scripts/open_remote_file.py diff --git a/PILOTVERSION b/PILOTVERSION index e3631dbe0..eda15db34 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.14 \ No newline at end of file +2.8.4.15 \ No newline at end of file diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py new file mode 100644 index 000000000..97c259c26 --- /dev/null +++ b/pilot/scripts/open_remote_file.py @@ -0,0 +1,117 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Paul Nilsson, paul.nilsson@cern.ch, 2020 + +import argparse +import os +import logging +import ROOT + +from pilot.util.config import config +from pilot.util.filehandling import establish_logging, write_json + +logger = logging.getLogger(__name__) + + +def get_args(): + """ + Return the args from the arg parser. + + :return: args (arg parser object). + """ + + arg_parser = argparse.ArgumentParser() + + arg_parser.add_argument('-d', + dest='debug', + action='store_true', + default=False, + help='Enable debug mode for logging messages') + arg_parser.add_argument('-w', + dest='workdir', + required=False, + default=os.getcwd(), + help='Working directory') + arg_parser.add_argument('--turls', + dest='turls', + required=True, + help='TURL list (e.g., filepath1,filepath2') + + return arg_parser.parse_args() + + +def message(msg): + print(msg) if not logger else logger.info(msg) + + +def get_file_lists(turls): + _turls = [] + + try: + _turls = turls.split(',') + except Exception as error: + message("exception caught: %s" % error) + + return {'turls': _turls} + + +def try_open_file(turl): + turl_opened = False + try: + in_file = ROOT.TFile.Open(turl) + except Exception as error: + message('caught exception: %s' % error) + else: + in_file.Close() + turl_opened = True + + if turl_opened: + message('turl=%s could be opened') + else: + message('turl=%s could not be opened') + + return turl_opened + + +if __name__ == '__main__': + """ + Main function of the remote file open script. + """ + + # get the args from the arg parser + args = get_args() + args.debug = True + + logname = 'default.log' + try: + logname = config.Pilot.remotefileverification_log + except Exception as error: + print("caught exception: %s (skipping remote file open verification)" % error) + exit(1) + else: + if not logname: + print("remote file open verification not desired") + exit(0) + + establish_logging(args, filename=logname) + logger = logging.getLogger(__name__) + + # get the file info + file_list_dictionary = get_file_lists(args.turls) + turls = file_list_dictionary.get('turls') + processed_turls_dictionary = {} + if turls: + message('got TURLs: %s' % str(turls)) + for turl in turls: + processed_turls_dictionary[turl] = try_open_file(turl) + + # write dictionary to file with results + _status = write_json(os.path.join(args.workdir, config.Pilot.remotefileverification_dictionary), processed_turls_dictionary) + else: + message('no TURLs to verify') + + exit(0) diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index a67f7c65a..2b2d0c9e0 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -1,4 +1,12 @@ #do not use: #!/usr/bin/env python3 +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Paul Nilsson, paul.nilsson@cern.ch, 2020 + import argparse import os import re diff --git a/pilot/scripts/stageout.py b/pilot/scripts/stageout.py index 0f9e2b230..5e79103fb 100644 --- a/pilot/scripts/stageout.py +++ b/pilot/scripts/stageout.py @@ -1,4 +1,11 @@ #do not use: #!/usr/bin/env python3 +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Paul Nilsson, paul.nilsson@cern.ch, 2020 import argparse import os import re diff --git a/pilot/util/constants.py b/pilot/util/constants.py index d841e9af8..cb37c56a3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '14' # build number should be reset to '1' for every new development cycle +BUILD = '15' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index 3621c1545..127e1733a 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -21,7 +21,7 @@ name: ATLAS [Pilot] -# The default file name for the pilot log +# Pilot logs pilotlog: pilotlog.txt stageinlog: stageinlog.txt stageoutlog: stageoutlog.txt @@ -121,6 +121,10 @@ utility_with_stagein: http_connect_timeout: 100 http_maxtime: 120 +# Remote file open verification +remotefileverification_dictionary: remotefileverification_dictionary.json +remotefileverification_log: remotefileslog.txt + ################################ # Information service parameters From 0cceae30af6fc2ae9cdca3759bbaeb405378c9f1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 8 Oct 2020 12:42:46 +0200 Subject: [PATCH 20/33] Update --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 74 +++++++++++++++++++++++++++++++++++++- pilot/util/default.cfg | 2 +- pilot/util/filehandling.py | 24 +++++++++++++ pilot/util/middleware.py | 67 +++++++--------------------------- 5 files changed, 112 insertions(+), 57 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index eda15db34..834a300fe 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.15 \ No newline at end of file +2.8.4.15f \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 66a1e911b..4d2e00d24 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -33,7 +33,7 @@ from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ UTILITY_AFTER_PAYLOAD, UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_WITH_STAGEIN from pilot.util.container import execute -from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps +from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy #from pilot.info import FileSpec @@ -122,6 +122,69 @@ def get_resource_name(): resource_name = 'grid' return resource_name +def open_remote_files(indata, workdir, cmd): + """ + Verify that direct i/o files can be opened. + + :param indata: list of FileSpec. + :param workdir: working directory (string). + :return: exit code (int), diagnostics (string). + """ + + ec = 0 + diagnostics = "" + + # extract direct i/o files from indata (string of comma-separated turls) + turls = extract_turls(indata) + if turls: + # execute file open script which will attempt to open each file + + script = 'open_remote_file.py' + final_script_path = os.path.join(workdir, script) + os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir + script_path = os.path.join('pilot/scripts', script) + full_script_path = os.path.join(os.path.join(workdir, script_path)) + copy(full_script_path, final_script_path) + + cmd = cmd + '; lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\'; ' + get_file_open_command(final_script_path, turls) + logger.info('*** executing \'%s\' ***' % cmd) + exit_code, stdout, stderr = execute(cmd, usecontainer=False) + logger.debug('ec=%d' % exit_code) + logger.debug('stdout=%s' % stdout) + logger.debug('stderr=%s' % stderr) + # error handling + else: + logger.info('nothing to verify (for remote files)') + + return ec, diagnostics + + +def get_file_open_command(script_path, turls): + """ + + :param script_path: path to script (string). + :return: comma-separated list of turls (string). + """ + + py = "python3" if is_python3() else "python" + return "%s %s --turls=%s -w %s" % (py, script_path, turls, os.path.dirname(script_path)) + + +def extract_turls(indata): + """ + Extract TURLs from indata for direct i/o files. + + :param indata: list of FileSpec. + :return: comma-separated list of turls (string). + """ + + turls = "" + for f in indata: + if f.status == 'remote_io' or True: + turls += f.turl if not turls else ",%s" % f.turl + + return turls + def get_payload_command(job): """ @@ -154,6 +217,15 @@ def get_payload_command(job): if ec != 0: raise PilotException(diagnostics, code=ec) + # make sure that remote file can be opened before executing payload + if config.Pilot.remotefileverification_log: + try: + ec, diagnostics = open_remote_files(job.indata, job.workdir, cmd) + #if ec != 0: + # raise PilotException(diagnostics, code=ec) + except Exception as e: + log.warning('caught exception: %s' % e) + if is_standard_atlas_job(job.swrelease): # Normal setup (production and user jobs) diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index 127e1733a..2a372ec4f 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -121,7 +121,7 @@ utility_with_stagein: http_connect_timeout: 100 http_maxtime: 120 -# Remote file open verification +# Remote file open verification (if not wanted, clear the remotefileverification_log) remotefileverification_dictionary: remotefileverification_dictionary.json remotefileverification_log: remotefileslog.txt diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 9aa9b1472..89f1497bc 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -998,3 +998,27 @@ def get_valid_path_from_list(paths): break return valid_path + + +def copy_pilot_source(workdir): + """ + Copy the pilot source into the work directory. + + :param workdir: working directory (string). + :return: diagnostics (string). + """ + + diagnostics = "" + srcdir = os.path.join(os.environ.get('PILOT_SOURCE_DIR', '.'), 'pilot2') + try: + logger.debug('copy %s to %s' % (srcdir, workdir)) + cmd = 'cp -r %s/* %s' % (srcdir, workdir) + exit_code, stdout, stderr = execute(cmd) + if exit_code != 0: + diagnostics = 'file copy failed: %d, %s' % (exit_code, stdout) + logger.warning(diagnostics) + except Exception as e: + diagnostics = 'exception caught when copying pilot2 source: %s' % e + logger.warning(diagnostics) + + return diagnostics diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index ff42bfb0c..5e226c56c 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -7,13 +7,13 @@ # Authors: # - Paul Nilsson, paul.nilsson@cern.ch, 2020 -from os import environ, path, chmod, getcwd +from os import environ, path, getcwd #, chmod from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException, StageInFailure, StageOutFailure from pilot.util.config import config from pilot.util.container import execute -from pilot.util.filehandling import copy, read_json, write_file #, find_executable +from pilot.util.filehandling import copy, read_json, write_file, copy_pilot_source #, find_executable import logging logger = logging.getLogger(__name__) @@ -127,63 +127,22 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, lab :param remotesite: :param label: 'stage-[in|out]' (string). :return: stage-in/out command (string). - :raises StageInFailure: for stage-in failures - :raises StageOutFailure: for stage-out failures + :raises PilotException: for stage-in/out related failures """ - try: - filedata_dictionary = get_filedata_strings(xdata) - except Exception: - import traceback - msg = traceback.format_exc() - logger.warning('exception caught: %s' % msg) - if label == 'stage-in': - raise StageInFailure(msg) - else: - raise StageOutFailure(msg) - - srcdir = path.join(environ.get('PILOT_SOURCE_DIR', '.'), 'pilot2') - if not path.exists(srcdir): - msg = 'pilot source directory not correct: %s' % srcdir - logger.debug(msg) - if label == 'stage-in': - raise StageInFailure(msg) - else: - raise StageOutFailure(msg) - else: - logger.debug('using pilot source directory: %s' % srcdir) + filedata_dictionary = get_filedata_strings(xdata) # copy pilot source into container directory, unless it is already there - final_script_path = path.join(job.workdir, script) - try: - logger.debug('copy %s to %s' % (srcdir, job.workdir)) - cmd = 'cp -r %s/* %s' % (srcdir, job.workdir) - exit_code, stdout, stderr = execute(cmd) - if exit_code != 0: - msg = 'file copy failed: %d, %s' % (exit_code, stdout) - logger.warning(msg) - if label == 'stage-in': - raise StageInFailure(msg) - else: - raise StageOutFailure(msg) - - environ['PYTHONPATH'] = environ.get('PYTHONPATH') + ':' + job.workdir - logger.debug('PYTHONPATH=%s' % environ.get('PYTHONPATH')) - - script_path = path.join('pilot/scripts', script) - full_script_path = path.join(path.join(job.workdir, script_path)) + diagnostics = copy_pilot_source(job.workdir) + if diagnostics: + raise PilotException(diagnostics) - copy(full_script_path, final_script_path) - logger.debug('full_script_path=%s' % full_script_path) - logger.debug('final_script_path=%s' % final_script_path) - chmod(final_script_path, 0o755) # Python 2/3 - except Exception as e: - msg = 'exception caught when copying pilot2 source: %s' % e - logger.warning(msg) - if label == 'stage-in': - raise StageInFailure(msg) - else: - raise StageOutFailure(msg) + final_script_path = path.join(job.workdir, script) + environ['PYTHONPATH'] = environ.get('PYTHONPATH') + ':' + job.workdir + script_path = path.join('pilot/scripts', script) + full_script_path = path.join(path.join(job.workdir, script_path)) + copy(full_script_path, final_script_path) + # chmod(final_script_path, 0o755) # Python 2/3 if config.Container.use_middleware_container: # correct the path when containers have been used From 8e6e541ffa3c3a9c64300d1d36259cf82fc6c56e Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 8 Oct 2020 17:41:47 +0200 Subject: [PATCH 21/33] Containerised file open script --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 13 +++++++-- pilot/user/atlas/container.py | 54 +++++++++++++++++++++++++++++++---- pilot/util/constants.py | 2 +- 4 files changed, 61 insertions(+), 10 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 834a300fe..d331233fa 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.15f \ No newline at end of file +2.8.4.16 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 4d2e00d24..da440402b 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -20,6 +20,7 @@ except Exception: pass +from .container import create_root_container_command from .dbrelease import get_dbrelease_version, create_dbrelease from .setup import should_pilot_prepare_setup, is_standard_atlas_job,\ set_inds, get_analysis_trf, get_payload_environment_variables, replace_lfns_with_turls @@ -128,6 +129,7 @@ def open_remote_files(indata, workdir, cmd): :param indata: list of FileSpec. :param workdir: working directory (string). + :param cmd: asetup path (string). :return: exit code (int), diagnostics (string). """ @@ -146,7 +148,14 @@ def open_remote_files(indata, workdir, cmd): full_script_path = os.path.join(os.path.join(workdir, script_path)) copy(full_script_path, final_script_path) - cmd = cmd + '; lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\'; ' + get_file_open_command(final_script_path, turls) + # correct the path when containers have been used + final_script_path = os.path.join('.', script) + + _cmd = get_file_open_command(final_script_path, turls) + logger.debug('_cmd=%s' % _cmd) + cmd = cmd + '; ' + create_root_container_command('/srv', _cmd) + logger.debug('cmd=%s' % cmd) + logger.info('*** executing \'%s\' ***' % cmd) exit_code, stdout, stderr = execute(cmd, usecontainer=False) logger.debug('ec=%d' % exit_code) @@ -166,7 +175,7 @@ def get_file_open_command(script_path, turls): :return: comma-separated list of turls (string). """ - py = "python3" if is_python3() else "python" + py = 'python' #"python3" if is_python3() else "python" return "%s %s --turls=%s -w %s" % (py, script_path, turls, os.path.dirname(script_path)) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 2499a5f23..b5c26ba09 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -728,6 +728,37 @@ def singularity_wrapper(cmd, workdir, job=None): return cmd +def create_root_container_command(workdir, cmd): + """ + + :param workdir: + :param cmd: + :return: + """ + + command = 'cd %s;' % workdir + content = get_root_container_script(cmd) + script_name = 'open_file.sh' + + try: + status = write_file(os.path.join(workdir, script_name), content) + except PilotException as e: + raise e + else: + if status: + # generate the final container command + x509 = os.environ.get('X509_USER_PROXY', '') + if x509: + command += 'export X509_USER_PROXY=%s;' % x509 + command += 'export ALRB_CONT_RUNPAYLOAD=\"source /srv/%s\";' % script_name + command += get_asetup(alrb=True) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; + command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c CentOS7' + + logger.debug('container command: %s' % command) + + return command + + def create_middleware_container_command(workdir, cmd, label='stagein'): """ Create the stage-in/out container command. @@ -766,18 +797,29 @@ def create_middleware_container_command(workdir, cmd, label='stagein'): x509 = os.environ.get('X509_USER_PROXY', '') if x509: command += 'export X509_USER_PROXY=%s;' % x509 - pythonpath = '' #'export PYTHONPATH=%s;' % os.path.join(workdir, 'pilot2') - #pythonpath = 'export PYTHONPATH=%s:$PYTHONPATH;' % os.path.join(workdir, 'pilot2') - #pythonpath = 'export PYTHONPATH=/cvmfs/atlas.cern.ch/repo/sw/PandaPilot/pilot2/latest:$PYTHONPATH;' - command += 'export ALRB_CONT_RUNPAYLOAD=\"%ssource /srv/%s\";' % (pythonpath, script_name) + command += 'export ALRB_CONT_RUNPAYLOAD=\"source /srv/%s\";' % script_name command += get_asetup(alrb=True) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % middleware_container - #if not 'rucio' in middleware_container: - # command += ' --nocvmfs' + logger.debug('container command: %s' % command) + return command +def get_root_container_script(cmd): + """ + Return the content of the root container script. + + :param cmd: root command (string). + :return: script content (string). + """ + + content = 'lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\';\npython %s\nexit $?' % cmd + logger.debug('setup.sh content:\n%s' % content) + + return content + + def get_middleware_container_script(middleware_container, cmd): """ Return the content of the middleware container script. diff --git a/pilot/util/constants.py b/pilot/util/constants.py index cb37c56a3..093701f85 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '15' # build number should be reset to '1' for every new development cycle +BUILD = '16' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 84f63f503113e3bf677ba75494605e0b882d89fd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 9 Oct 2020 15:08:11 +0200 Subject: [PATCH 22/33] Update --- PILOTVERSION | 2 +- pilot/scripts/open_remote_file.py | 6 ++++ pilot/user/atlas/common.py | 53 +++++++++++++++++++------------ pilot/user/atlas/container.py | 2 +- pilot/util/constants.py | 2 +- 5 files changed, 42 insertions(+), 23 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index d331233fa..503ec8e00 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.16 \ No newline at end of file +2.8.4.16b \ No newline at end of file diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 97c259c26..aeeddb933 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -40,6 +40,11 @@ def get_args(): dest='turls', required=True, help='TURL list (e.g., filepath1,filepath2') + arg_parser.add_argument('--no-pilot-log', + dest='nopilotlog', + action='store_true', + default=False, + help='Do not write the pilot log to file') return arg_parser.parse_args() @@ -85,6 +90,7 @@ def try_open_file(turl): # get the args from the arg parser args = get_args() args.debug = True + args.nopilotlog = False logname = 'default.log' try: diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index da440402b..8f06e1b7d 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -34,7 +34,7 @@ from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ UTILITY_AFTER_PAYLOAD, UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_WITH_STAGEIN from pilot.util.container import execute -from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy +from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy, copy_pilot_source #from pilot.info import FileSpec @@ -141,27 +141,41 @@ def open_remote_files(indata, workdir, cmd): if turls: # execute file open script which will attempt to open each file + # copy pilot source into container directory, unless it is already there + diagnostics = copy_pilot_source(workdir) + if diagnostics: + raise PilotException(diagnostics) + script = 'open_remote_file.py' final_script_path = os.path.join(workdir, script) os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir script_path = os.path.join('pilot/scripts', script) - full_script_path = os.path.join(os.path.join(workdir, script_path)) - copy(full_script_path, final_script_path) - - # correct the path when containers have been used - final_script_path = os.path.join('.', script) - - _cmd = get_file_open_command(final_script_path, turls) - logger.debug('_cmd=%s' % _cmd) - cmd = cmd + '; ' + create_root_container_command('/srv', _cmd) - logger.debug('cmd=%s' % cmd) - - logger.info('*** executing \'%s\' ***' % cmd) - exit_code, stdout, stderr = execute(cmd, usecontainer=False) - logger.debug('ec=%d' % exit_code) - logger.debug('stdout=%s' % stdout) - logger.debug('stderr=%s' % stderr) - # error handling + d1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot2'), script_path) + d2 = os.path.join(workdir, script_path) + logger.debug('d1=%s (exists: %s)' % (d1, str(os.path.exists(d1)))) + logger.debug('d2=%s (exists: %s)' % (d2, str(os.path.exists(d2)))) + full_script_path = d1 if os.path.exists(d1) else d2 + try: + copy(full_script_path, final_script_path) + except Exception as e: + diagnostics = 'pilot source copy failed: %s (cannot verify remote file open)' % e + logger.warning(diagnostics) + else: + # correct the path when containers have been used + final_script_path = os.path.join('.', script) + + _cmd = get_file_open_command(final_script_path, turls) + logger.debug('_cmd=%s' % _cmd) + cmd = cmd + '; ' + create_root_container_command(workdir, _cmd) + logger.debug('cmd=%s' % cmd) + + logger.info('*** executing \'%s\' ***' % cmd) + exit_code, stdout, stderr = execute(cmd, usecontainer=False) + logger.debug('ec=%d' % exit_code) + logger.debug('stdout=%s' % stdout) + logger.debug('stderr=%s' % stderr) + + # error handling else: logger.info('nothing to verify (for remote files)') @@ -175,8 +189,7 @@ def get_file_open_command(script_path, turls): :return: comma-separated list of turls (string). """ - py = 'python' #"python3" if is_python3() else "python" - return "%s %s --turls=%s -w %s" % (py, script_path, turls, os.path.dirname(script_path)) + return "%s --turls=%s -w %s" % (script_path, turls, os.path.dirname(script_path)) def extract_turls(indata): diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index b5c26ba09..1f5c766dc 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -815,7 +815,7 @@ def get_root_container_script(cmd): """ content = 'lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\';\npython %s\nexit $?' % cmd - logger.debug('setup.sh content:\n%s' % content) + logger.debug('setup content:\n%s' % content) return content diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 093701f85..0ae12f486 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '16' # build number should be reset to '1' for every new development cycle +BUILD = '16b' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From dcd26c568f15440ab1300df1661c571e15f5e88b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 9 Oct 2020 15:15:27 +0200 Subject: [PATCH 23/33] Update --- pilot/scripts/open_remote_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index aeeddb933..ab6c9316f 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -75,9 +75,9 @@ def try_open_file(turl): turl_opened = True if turl_opened: - message('turl=%s could be opened') + message('turl=%s could be opened' % turl) else: - message('turl=%s could not be opened') + message('turl=%s could not be opened' % turl) return turl_opened From 863a21cd8200ec2dc8521e1ce47cfee66ebcd6f1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 9 Oct 2020 15:19:15 +0200 Subject: [PATCH 24/33] Update --- pilot/scripts/open_remote_file.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index ab6c9316f..098d5115e 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -71,8 +71,9 @@ def try_open_file(turl): except Exception as error: message('caught exception: %s' % error) else: - in_file.Close() - turl_opened = True + if in_file: + in_file.Close() + turl_opened = True if turl_opened: message('turl=%s could be opened' % turl) From 4ad58eaaeff1a8225a61ec98e1f2da77c3b2751f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 13 Oct 2020 11:37:27 +0200 Subject: [PATCH 25/33] Update --- PILOTVERSION | 2 +- pilot/scripts/open_remote_file.py | 7 +---- pilot/user/atlas/common.py | 52 ++++++++++++++++++++++--------- pilot/user/atlas/container.py | 6 ++-- pilot/util/constants.py | 2 +- pilot/util/middleware.py | 4 +-- 6 files changed, 45 insertions(+), 28 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 503ec8e00..1b5f9cf22 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.16b \ No newline at end of file +2.8.4.18 \ No newline at end of file diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 098d5115e..94c883a7d 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -71,15 +71,10 @@ def try_open_file(turl): except Exception as error: message('caught exception: %s' % error) else: - if in_file: + if in_file and in_file.IsOpen(): in_file.Close() turl_opened = True - if turl_opened: - message('turl=%s could be opened' % turl) - else: - message('turl=%s could not be opened' % turl) - return turl_opened diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 8f06e1b7d..0cc6f9bd0 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2019 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2020 # - Wen Guan, wen.guan@cern.ch, 2018 import os @@ -34,9 +34,8 @@ from pilot.util.constants import UTILITY_BEFORE_PAYLOAD, UTILITY_WITH_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED,\ UTILITY_AFTER_PAYLOAD, UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_WITH_STAGEIN from pilot.util.container import execute -from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy, copy_pilot_source - -#from pilot.info import FileSpec +from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ + copy_pilot_source, write_file, read_json import logging logger = logging.getLogger(__name__) @@ -152,30 +151,53 @@ def open_remote_files(indata, workdir, cmd): script_path = os.path.join('pilot/scripts', script) d1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot2'), script_path) d2 = os.path.join(workdir, script_path) - logger.debug('d1=%s (exists: %s)' % (d1, str(os.path.exists(d1)))) - logger.debug('d2=%s (exists: %s)' % (d2, str(os.path.exists(d2)))) full_script_path = d1 if os.path.exists(d1) else d2 + if not os.path.exists(full_script_path): + # do not set ec since this will be a pilot issue rather than site issue + diagnostics = 'cannot perform file open test - script path does not exist: %s' % full_script_path + logger.warning(diagnostics) + logger.warning('tested both path=%s and path=%s (none exists)' % (d1, d2)) + return ec, diagnostics try: copy(full_script_path, final_script_path) except Exception as e: - diagnostics = 'pilot source copy failed: %s (cannot verify remote file open)' % e + # do not set ec since this will be a pilot issue rather than site issue + diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % e logger.warning(diagnostics) + return ec, diagnostics else: # correct the path when containers have been used final_script_path = os.path.join('.', script) _cmd = get_file_open_command(final_script_path, turls) - logger.debug('_cmd=%s' % _cmd) cmd = cmd + '; ' + create_root_container_command(workdir, _cmd) - logger.debug('cmd=%s' % cmd) - logger.info('*** executing \'%s\' ***' % cmd) + logger.info('*** executing file open verification script:\n\n\'%s\'\n\n' % cmd) exit_code, stdout, stderr = execute(cmd, usecontainer=False) - logger.debug('ec=%d' % exit_code) - logger.debug('stdout=%s' % stdout) - logger.debug('stderr=%s' % stderr) + if config.Pilot.remotefileverification_log: + write_file(os.path.join(workdir, config.Pilot.remotefileverification_log), stdout + stderr, mute=False) # error handling + if exit_code: + logger.warning('script %s finished with ec=%d' % (script, exit_code)) + else: + dictionary_path = os.path.join(workdir, config.Pilot.remotefileverification_dictionary) + if not dictionary_path: + logger.warning('file does not exist: %s' % dictionary_path) + else: + file_dictionary = read_json(dictionary_path) + if not file_dictionary: + logger.warning('could not read dictionary from %s' % dictionary_path) + else: + not_opened = "" + for turl in file_dictionary: + opened = file_dictionary[turl] + logger.info('turl %s could be opened' % turl) if opened else logger.info('turl %s could not be opened' % turl) + if not opened: + not_opened += turl if not not_opened else ",%s" % turl + if not_opened: + ec = 1 + diagnostics = "turl not opened:%s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened else: logger.info('nothing to verify (for remote files)') @@ -243,8 +265,8 @@ def get_payload_command(job): if config.Pilot.remotefileverification_log: try: ec, diagnostics = open_remote_files(job.indata, job.workdir, cmd) - #if ec != 0: - # raise PilotException(diagnostics, code=ec) + if ec != 0: + raise PilotException(diagnostics, code=ec) except Exception as e: log.warning('caught exception: %s' % e) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 1f5c766dc..96890ecff 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -143,7 +143,7 @@ def get_grid_image_for_singularity(platform): def get_middleware_type(): """ Return the middleware type from the container type. - E.g. container_type = 'singularity:pilot;docker:wrapper;middleware:container' + E.g. container_type = 'singularity:pilot;docker:wrapper;container:middleware' get_middleware_type() -> 'container', meaning that middleware should be taken from the container. The default is otherwise 'workernode', i.e. middleware is assumed to be present on the worker node. @@ -814,8 +814,8 @@ def get_root_container_script(cmd): :return: script content (string). """ - content = 'lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\';\npython %s\nexit $?' % cmd - logger.debug('setup content:\n%s' % content) + content = 'lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\'\npython %s\nexit $?' % cmd + logger.debug('root setup script content:\n\n%s\n\n' % content) return content diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 0ae12f486..834e69ec6 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '16b' # build number should be reset to '1' for every new development cycle +BUILD = '18' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index 5e226c56c..e3cb23141 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -77,8 +77,8 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, _stdout_name, _stderr_name = get_logfile_names(label) write_file(path.join(job.workdir, _stdout_name), stdout, mute=False) write_file(path.join(job.workdir, _stderr_name), stderr, mute=False) - logger.debug('stage-in/out stdout=\n%s' % stdout) - logger.debug('stage-in/out stderr=\n%s' % stderr) + #logger.debug('stage-in/out stdout=\n%s' % stdout) + #logger.debug('stage-in/out stderr=\n%s' % stderr) except PilotException as e: msg = 'exception caught: %s' % e if label == 'stage-in': From c3438967e6b7b6682ef383167dbd7d11f974a98d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 13 Oct 2020 11:41:26 +0200 Subject: [PATCH 26/33] Added use_vp --- pilot/info/jobdata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index 4edc814e1..2ac8936b8 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -126,6 +126,7 @@ class JobData(BaseData): preprocess = {} # preprocess dictionary with command to execute before payload, {'command': '..', 'args': '..'} postprocess = {} # postprocess dictionary with command to execute after payload, {'command': '..', 'args': '..'} containeroptions = {} # + use_vp = False # True for VP jobs # home package string with additional payload release information; does not need to be added to # the conversion function since it's already lower case @@ -154,7 +155,7 @@ class JobData(BaseData): list: ['piloterrorcodes', 'piloterrordiags', 'workdirsizes', 'zombies', 'corecounts'], dict: ['status', 'fileinfo', 'metadata', 'utilities', 'overwrite_queuedata', 'sizes', 'preprocess', 'postprocess', 'containeroptions'], - bool: ['is_eventservice', 'is_eventservicemerge', 'is_hpo', 'noexecstrcnv', 'debug', 'usecontainer'] + bool: ['is_eventservice', 'is_eventservicemerge', 'is_hpo', 'noexecstrcnv', 'debug', 'usecontainer', 'use_vp'] } def __init__(self, data): @@ -429,6 +430,7 @@ def load(self, data): 'is_eventservice': 'eventService', 'is_eventservicemerge': 'eventServiceMerge', 'is_hpo': 'isHPO', + 'use_vp': 'useVP', 'maxcpucount': 'maxCpuCount', 'allownooutput': 'allowNoOutput', 'imagename_jobdef': 'container_name', From acada8d37ce872ccd90af26258797a2028585bdd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 13 Oct 2020 13:30:04 +0200 Subject: [PATCH 27/33] Now failing remote io job if file could not be opened. added support for useVP --- PILOTVERSION | 2 +- pilot/api/data.py | 20 ++++++++++++++++---- pilot/common/errorcodes.py | 4 +++- pilot/control/data.py | 3 ++- pilot/scripts/stagein.py | 8 +++++++- pilot/user/atlas/common.py | 4 ++-- pilot/util/constants.py | 2 +- pilot/util/middleware.py | 6 ++++-- 8 files changed, 36 insertions(+), 13 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 1b5f9cf22..be5439d32 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.18 \ No newline at end of file +2.8.4.20 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index 743636c0b..7bb6702e1 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -198,10 +198,12 @@ def sort_replicas(self, replicas, inputddms): return replicas - def resolve_replicas(self, files): # noqa: C901 + def resolve_replicas(self, files, use_vp=False): # noqa: C901 """ - Populates filespec.replicas for each entry from `files` list - :param files: list of `FileSpec` objects + Populates filespec.replicas for each entry from `files` list + + :param files: list of `FileSpec` objects + fdat.replicas = [{'ddmendpoint':'ddmendpoint', 'pfn':'replica', 'domain':'domain value'}] :return: `files` """ @@ -237,6 +239,9 @@ def resolve_replicas(self, files): # noqa: C901 query.update(sort='geoip', client_location=location) logger.info('calling rucio.list_replicas() with query=%s' % query) + # reset the schemas for VP jobs + if use_vp: + query['schemes'] = ['root'] try: replicas = c.list_replicas(**query) except Exception as e: @@ -447,6 +452,7 @@ def transfer(self, files, activity='default', **kwargs): # noqa: C901 continue try: + self.logger.debug('kwargs=%s' % str(kwargs)) result = self.transfer_files(copytool, remain_files, activity, **kwargs) self.logger.debug('transfer_files() using copytool=%s completed with result=%s' % (copytool, str(result))) break @@ -712,13 +718,19 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 if getattr(copytool, 'require_replicas', False) and files: if files[0].replicas is None: # look up replicas only once - files = self.resolve_replicas(files) + files = self.resolve_replicas(files, use_vp=kwargs['use_vp']) allowed_schemas = getattr(copytool, 'allowed_schemas', None) if self.infosys and self.infosys.queuedata: copytool_name = copytool.__name__.rsplit('.', 1)[-1] allowed_schemas = self.infosys.queuedata.resolve_allowed_schemas(activity, copytool_name) or allowed_schemas + # overwrite allowed_schemas for VP jobs + if kwargs['use_vp']: + allowed_schemas = ['root'] + self.logger.debug('overwrote allowed_schemas for VP job: %s' % str(allowed_schemas)) + else: + self.logger.debug('allowed_schemas=%s' % str(allowed_schemas)) for fspec in files: resolve_replica = getattr(copytool, 'resolve_replica', None) diff --git a/pilot/common/errorcodes.py b/pilot/common/errorcodes.py index 534434cd1..844eba0eb 100644 --- a/pilot/common/errorcodes.py +++ b/pilot/common/errorcodes.py @@ -143,6 +143,7 @@ class ErrorCodes: MISSINGRELEASEUNPACKED = 1358 PANDAQUEUENOTACTIVE = 1359 IMAGENOTFOUND = 1360 + REMOTEFILECOULDNOTBEOPENED = 1361 _error_messages = { GENERALERROR: "General pilot error, consult batch log", @@ -264,7 +265,8 @@ class ErrorCodes: POSTPROCESSFAILURE: "Post-process command failed", MISSINGRELEASEUNPACKED: "Missing release setup in unpacked container", PANDAQUEUENOTACTIVE: "PanDA queue is not active", - IMAGENOTFOUND: "Image not found" + IMAGENOTFOUND: "Image not found", + REMOTEFILECOULDNOTBEOPENED: "Remote file could not be opened" } put_error_codes = [1135, 1136, 1137, 1141, 1152, 1181] diff --git a/pilot/control/data.py b/pilot/control/data.py index 91f2dce28..10a89e12d 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -213,7 +213,8 @@ def _stage_in(args, job): client = StageInClient(job.infosys, logger=log, trace_report=trace_report) activity = 'pr' use_pcache = job.infosys.queuedata.use_pcache - kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, use_pcache=use_pcache, use_bulk=False, input_dir=args.input_dir) + kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, use_pcache=use_pcache, use_bulk=False, + input_dir=args.input_dir, use_vp=job.use_vp) client.prepare_sources(job.indata) client.transfer(job.indata, activity=activity, **kwargs) except PilotException as error: diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index 2b2d0c9e0..be9be6093 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -137,6 +137,11 @@ def get_args(): dest='istars', required=True, help='Replica is_tar') + arg_parser.add_argument('--usevp', + dest='usevp', + type=str2bool, + default=False, + help='Job object boolean use_vp') arg_parser.add_argument('--accessmodes', dest='accessmodes', required=True, @@ -382,7 +387,8 @@ def extract_error_info(err): else: client = StageInClient(infoservice, logger=logger, trace_report=trace_report) activity = 'pr' - kwargs = dict(workdir=args.workdir, cwd=args.workdir, usecontainer=False, use_pcache=args.usepcache, use_bulk=False) + kwargs = dict(workdir=args.workdir, cwd=args.workdir, usecontainer=False, use_pcache=args.usepcache, use_bulk=False, + use_vp=args.usevp) xfiles = [] for lfn, scope, filesize, checksum, allowlan, allowwan, dalan, dawan, istar, accessmode, sttoken, guid in list(zip(lfns, scopes, diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 0cc6f9bd0..169961ccc 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -192,11 +192,11 @@ def open_remote_files(indata, workdir, cmd): not_opened = "" for turl in file_dictionary: opened = file_dictionary[turl] - logger.info('turl %s could be opened' % turl) if opened else logger.info('turl %s could not be opened' % turl) + logger.info('turl could be opened: %s' % turl) if opened else logger.info('turl could not be opened: %s' % turl) if not opened: not_opened += turl if not not_opened else ",%s" % turl if not_opened: - ec = 1 + ec = errors.REMOTEFILECOULDNOTBEOPENED diagnostics = "turl not opened:%s" % not_opened if "," not in not_opened else "turls not opened:%s" % not_opened else: logger.info('nothing to verify (for remote files)') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 834e69ec6..ff93f9891 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '18' # build number should be reset to '1' for every new development cycle +BUILD = '20' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index e3cb23141..b5fe6504a 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -155,13 +155,15 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, lab cmd = '%s --lfns=%s --scopes=%s -w %s -d -q %s --eventtype=%s --localsite=%s ' \ '--remotesite=%s --produserid=\"%s\" --jobid=%s --taskid=%s --jobdefinitionid=%s ' \ '--eventservicemerge=%s --usepcache=%s --filesizes=%s --checksums=%s --allowlans=%s --allowwans=%s ' \ - '--directaccesslans=%s --directaccesswans=%s --istars=%s --accessmodes=%s --storagetokens=%s --guids=%s' % \ + '--directaccesslans=%s --directaccesswans=%s --istars=%s --accessmodes=%s --storagetokens=%s --guids=%s ' \ + '--usevp=%s' % \ (final_script_path, filedata_dictionary['lfns'], filedata_dictionary['scopes'], workdir, queue, eventtype, localsite, remotesite, job.produserid.replace(' ', '%20'), job.jobid, job.taskid, job.jobdefinitionid, job.is_eventservicemerge, job.infosys.queuedata.use_pcache, filedata_dictionary['filesizes'], filedata_dictionary['checksums'], filedata_dictionary['allowlans'], filedata_dictionary['allowwans'], filedata_dictionary['directaccesslans'], filedata_dictionary['directaccesswans'], filedata_dictionary['istars'], - filedata_dictionary['accessmodes'], filedata_dictionary['storagetokens'], filedata_dictionary['guids']) + filedata_dictionary['accessmodes'], filedata_dictionary['storagetokens'], filedata_dictionary['guids'], + job.use_vp) else: # stage-out cmd = '%s --lfns=%s --scopes=%s -w %s -d -q %s --eventtype=%s --localsite=%s ' \ '--remotesite=%s --produserid=\"%s\" --jobid=%s --taskid=%s --jobdefinitionid=%s ' \ From 1faf3e80a8560d57e90e74ba154b181b42450efd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 13 Oct 2020 13:32:40 +0200 Subject: [PATCH 28/33] Cleanup --- pilot/api/data.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index 7bb6702e1..5a0ee58a3 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -202,21 +202,17 @@ def resolve_replicas(self, files, use_vp=False): # noqa: C901 """ Populates filespec.replicas for each entry from `files` list - :param files: list of `FileSpec` objects - fdat.replicas = [{'ddmendpoint':'ddmendpoint', 'pfn':'replica', 'domain':'domain value'}] - :return: `files` + + :param files: list of `FileSpec` objects. + :param use_vp: True for VP jobs (boolean). + :return: `files` """ logger = self.logger xfiles = [] - #ddmconf = self.infosys.resolve_storage_data() for fdat in files: - #ddmdat = ddmconf.get(fdat.ddmendpoint) - #if not ddmdat: - # raise Exception("Failed to resolve input ddmendpoint by name=%s (from PanDA), please check configuration. fdat=%s" % (fdat.ddmendpoint, fdat)) - ## skip fdat if need for further workflow (e.g. to properly handle OS ddms) xfiles.append(fdat) @@ -235,13 +231,12 @@ def resolve_replicas(self, files, use_vp=False): # noqa: C901 'schemes': ['srm', 'root', 'davs', 'gsiftp', 'https', 'storm'], 'dids': [dict(scope=e.scope, name=e.lfn) for e in xfiles], } - query.update(sort='geoip', client_location=location) - logger.info('calling rucio.list_replicas() with query=%s' % query) - # reset the schemas for VP jobs if use_vp: query['schemes'] = ['root'] + logger.info('calling rucio.list_replicas() with query=%s' % query) + try: replicas = c.list_replicas(**query) except Exception as e: From f2c8a58f85941f272a5c37476eb9b8c576f1b262 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 21 Oct 2020 13:30:52 +0200 Subject: [PATCH 29/33] Update --- PILOTVERSION | 2 +- pilot/api/data.py | 12 +++- pilot/scripts/stagein.py | 119 +++++++++++++++++++++------------- pilot/scripts/stageout.py | 2 +- pilot/user/atlas/common.py | 17 ++--- pilot/user/atlas/container.py | 8 ++- pilot/user/atlas/cpu.py | 28 ++++++-- pilot/user/atlas/utilities.py | 2 +- pilot/util/auxiliary.py | 13 ++++ pilot/util/constants.py | 2 +- pilot/util/default.cfg | 6 +- pilot/util/middleware.py | 94 ++++++++++++++++++++++----- 12 files changed, 211 insertions(+), 94 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index be5439d32..6bdc480f5 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.20 \ No newline at end of file +2.8.4.27 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index 5a0ee58a3..ae6e493f2 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -720,12 +720,11 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 if self.infosys and self.infosys.queuedata: copytool_name = copytool.__name__.rsplit('.', 1)[-1] allowed_schemas = self.infosys.queuedata.resolve_allowed_schemas(activity, copytool_name) or allowed_schemas + # overwrite allowed_schemas for VP jobs if kwargs['use_vp']: allowed_schemas = ['root'] self.logger.debug('overwrote allowed_schemas for VP job: %s' % str(allowed_schemas)) - else: - self.logger.debug('allowed_schemas=%s' % str(allowed_schemas)) for fspec in files: resolve_replica = getattr(copytool, 'resolve_replica', None) @@ -842,7 +841,14 @@ def set_status_for_direct_access(self, files): self.trace_report.update(scope=fspec.scope, dataset=fspec.dataset) self.trace_report.update(url=fspec.turl, clientState='FOUND_ROOT', stateReason='direct_access') - self.trace_report.send() + + # do not send the trace report at this point if remote file verification is to be done + # (the job object is needed for setting up the required script, and this is not known here) + if config.Pilot.remotefileverification_log: + # store the trace report for later use + write_json + else: + self.trace_report.send() def check_availablespace(self, files): """ diff --git a/pilot/scripts/stagein.py b/pilot/scripts/stagein.py index be9be6093..bb9bfad96 100644 --- a/pilot/scripts/stagein.py +++ b/pilot/scripts/stagein.py @@ -15,7 +15,7 @@ from pilot.api.es_data import StageInESClient from pilot.info import InfoService, FileSpec, infosys from pilot.util.config import config -from pilot.util.filehandling import establish_logging, write_json +from pilot.util.filehandling import establish_logging, write_json, read_json from pilot.util.tracereport import TraceReport import logging @@ -60,11 +60,11 @@ def get_args(): help='Working directory') arg_parser.add_argument('--scopes', dest='scopes', - required=True, + required=False, help='List of Rucio scopes (e.g., mc16_13TeV,mc16_13TeV') arg_parser.add_argument('--lfns', dest='lfns', - required=True, + required=False, help='LFN list (e.g., filename1,filename2') arg_parser.add_argument('--eventtype', dest='eventtype', @@ -111,31 +111,31 @@ def get_args(): help='Do not write the pilot log to file') arg_parser.add_argument('--filesizes', dest='filesizes', - required=True, + required=False, help='Replica file sizes') arg_parser.add_argument('--checksums', dest='checksums', - required=True, + required=False, help='Replica checksums') arg_parser.add_argument('--allowlans', dest='allowlans', - required=True, + required=False, help='Replica allow_lan') arg_parser.add_argument('--allowwans', dest='allowwans', - required=True, + required=False, help='Replica allow_wan') arg_parser.add_argument('--directaccesslans', dest='directaccesslans', - required=True, + required=False, help='Replica direct_access_lan') arg_parser.add_argument('--directaccesswans', dest='directaccesswans', - required=True, + required=False, help='Replica direct_access_wan') arg_parser.add_argument('--istars', dest='istars', - required=True, + required=False, help='Replica is_tar') arg_parser.add_argument('--usevp', dest='usevp', @@ -144,16 +144,20 @@ def get_args(): help='Job object boolean use_vp') arg_parser.add_argument('--accessmodes', dest='accessmodes', - required=True, + required=False, help='Replica accessmodes') arg_parser.add_argument('--storagetokens', dest='storagetokens', - required=True, + required=False, help='Replica storagetokens') arg_parser.add_argument('--guids', dest='guids', - required=True, + required=False, help='Replica guids') + arg_parser.add_argument('--replicadictionary', + dest='replicadictionary', + required=True, + help='Replica dictionary') return arg_parser.parse_args() @@ -349,21 +353,27 @@ def extract_error_info(err): # exit(ret) # get the file info - file_list_dictionary = get_file_lists(args.lfns, args.scopes, args.filesizes, args.checksums, args.allowlans, - args.allowwans, args.directaccesslans, args.directaccesswans, args.istars, - args.accessmodes, args.storagetokens, args.guids) - lfns = file_list_dictionary.get('lfns') - scopes = file_list_dictionary.get('scopes') - filesizes = file_list_dictionary.get('filesizes') - checksums = file_list_dictionary.get('checksums') - allowlans = file_list_dictionary.get('allowlans') - allowwans = file_list_dictionary.get('allowwans') - directaccesslans = file_list_dictionary.get('directaccesslans') - directaccesswans = file_list_dictionary.get('directaccesswans') - istars = file_list_dictionary.get('istars') - accessmodes = file_list_dictionary.get('accessmodes') - storagetokens = file_list_dictionary.get('storagetokens') - guids = file_list_dictionary.get('guids') + try: + replica_dictionary = read_json(os.path.join(args.workdir, args.replicadictionary)) + except Exception as e: + message('exception caught reading json: %s' % e) + exit(1) + +# file_list_dictionary = get_file_lists(args.lfns, args.scopes, args.filesizes, args.checksums, args.allowlans, +# args.allowwans, args.directaccesslans, args.directaccesswans, args.istars, +# args.accessmodes, args.storagetokens, args.guids) +# lfns = file_list_dictionary.get('lfns') +# scopes = file_list_dictionary.get('scopes') +# filesizes = file_list_dictionary.get('filesizes') +# checksums = file_list_dictionary.get('checksums') +# allowlans = file_list_dictionary.get('allowlans') +# allowwans = file_list_dictionary.get('allowwans') +# directaccesslans = file_list_dictionary.get('directaccesslans') +# directaccesswans = file_list_dictionary.get('directaccesswans') +# istars = file_list_dictionary.get('istars') +# accessmodes = file_list_dictionary.get('accessmodes') +# storagetokens = file_list_dictionary.get('storagetokens') +# guids = file_list_dictionary.get('guids') # generate the trace report trace_report = TraceReport(pq=os.environ.get('PILOT_SITENAME', ''), localSite=args.localsite, remoteSite=args.remotesite, dataset="", @@ -390,26 +400,45 @@ def extract_error_info(err): kwargs = dict(workdir=args.workdir, cwd=args.workdir, usecontainer=False, use_pcache=args.usepcache, use_bulk=False, use_vp=args.usevp) xfiles = [] - for lfn, scope, filesize, checksum, allowlan, allowwan, dalan, dawan, istar, accessmode, sttoken, guid in list(zip(lfns, - scopes, - filesizes, - checksums, - allowlans, - allowwans, - directaccesslans, - directaccesswans, - istars, - accessmodes, - storagetokens, - guids)): - files = [{'scope': scope, 'lfn': lfn, 'workdir': args.workdir, 'filesize': filesize, 'checksum': checksum, - 'allow_lan': allowlan, 'allow_wan': allowwan, 'direct_access_lan': dalan, 'guid': guid, - 'direct_access_wan': dawan, 'is_tar': istar, 'accessmode': accessmode, 'storage_token': sttoken}] + for lfn in replica_dictionary: + files = [{'scope': replica_dictionary[lfn]['scope'], + 'lfn': lfn, + 'guid': replica_dictionary[lfn]['guid'], + 'workdir': args.workdir, + 'filesize': replica_dictionary[lfn]['filesize'], + 'checksum': replica_dictionary[lfn]['checksum'], + 'allow_lan': replica_dictionary[lfn]['allowlan'], + 'allow_wan': replica_dictionary[lfn]['allowwan'], + 'direct_access_lan': replica_dictionary[lfn]['directaccesslan'], + 'direct_access_wan': replica_dictionary[lfn]['directaccesswan'], + 'is_tar': replica_dictionary[lfn]['istar'], + 'accessmode': replica_dictionary[lfn]['accessmode'], + 'storage_token': replica_dictionary[lfn]['storagetoken']}] # do not abbreviate the following two lines as otherwise the content of xfiles will be a list of generator objects _xfiles = [FileSpec(type='input', **f) for f in files] xfiles += _xfiles +# for lfn, scope, filesize, checksum, allowlan, allowwan, dalan, dawan, istar, accessmode, sttoken, guid in list(zip(lfns, +# scopes, +# filesizes, +# checksums, +# allowlans, +# allowwans, +# directaccesslans, +# directaccesswans, +# istars, +# accessmodes, +# storagetokens, +# guids)): +# files = [{'scope': scope, 'lfn': lfn, 'workdir': args.workdir, 'filesize': filesize, 'checksum': checksum, +# 'allow_lan': allowlan, 'allow_wan': allowwan, 'direct_access_lan': dalan, 'guid': guid, +# 'direct_access_wan': dawan, 'is_tar': istar, 'accessmode': accessmode, 'storage_token': sttoken}] +# +# # do not abbreviate the following two lines as otherwise the content of xfiles will be a list of generator objects +# _xfiles = [FileSpec(type='input', **f) for f in files] +# xfiles += _xfiles + try: r = client.transfer(xfiles, activity=activity, **kwargs) except Exception as e: @@ -430,12 +459,10 @@ def extract_error_info(err): if err: errcode, err = extract_error_info(err) add_to_dictionary(file_dictionary, 'error', err, errcode, None) - path = os.path.join(args.workdir, config.Container.stagein_dictionary) - _status = write_json(path, file_dictionary) + _status = write_json(os.path.join(args.workdir, config.Container.stagein_status_dictionary), file_dictionary) if err: message("containerised file transfers failed: %s" % err) exit(TRANSFER_ERROR) - message("wrote %s" % path) message("containerised file transfers finished") exit(0) diff --git a/pilot/scripts/stageout.py b/pilot/scripts/stageout.py index 5e79103fb..d61e1d29c 100644 --- a/pilot/scripts/stageout.py +++ b/pilot/scripts/stageout.py @@ -375,7 +375,7 @@ def extract_error_info(err): if err: errcode, err = extract_error_info(err) add_to_dictionary(file_dictionary, 'error', err, errcode, None, None, None, None) - path = os.path.join(args.workdir, config.Container.stageout_dictionary) + path = os.path.join(args.workdir, config.Container.stageout_status_dictionary) if os.path.exists(path): path += '.log' _status = write_json(path, file_dictionary) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 169961ccc..543ec96b2 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -27,6 +27,7 @@ from .utilities import get_memory_monitor_setup, get_network_monitor_setup, post_memory_monitor_action,\ get_memory_monitor_summary_filename, get_prefetcher_setup, get_benchmark_setup +from pilot.util.auxiliary import get_resource_name from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import TrfDownloadFailure, PilotException from pilot.util.auxiliary import get_logger, is_python3 @@ -110,18 +111,6 @@ def validate(job): return status -def get_resource_name(): - """ - Return the name of the resource (only set for HPC resources; e.g. Cori, otherwise return 'grid'). - - :return: resource_name (string). - """ - - resource_name = os.environ.get('PILOT_RESOURCE_NAME', '').lower() - if not resource_name: - resource_name = 'grid' - return resource_name - def open_remote_files(indata, workdir, cmd): """ Verify that direct i/o files can be opened. @@ -224,7 +213,7 @@ def extract_turls(indata): turls = "" for f in indata: - if f.status == 'remote_io' or True: + if f.status == 'remote_io': turls += f.turl if not turls else ",%s" % f.turl return turls @@ -259,6 +248,7 @@ def get_payload_command(job): if cmd: ec, diagnostics = resource.verify_setup_command(cmd) if ec != 0: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) raise PilotException(diagnostics, code=ec) # make sure that remote file can be opened before executing payload @@ -266,6 +256,7 @@ def get_payload_command(job): try: ec, diagnostics = open_remote_files(job.indata, job.workdir, cmd) if ec != 0: + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) raise PilotException(diagnostics, code=ec) except Exception as e: log.warning('caught exception: %s' % e) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index 96890ecff..d9add7c27 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -814,7 +814,8 @@ def get_root_container_script(cmd): :return: script content (string). """ - content = 'lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\'\npython %s\nexit $?' % cmd + # content = 'lsetup \'root 6.20.06-x86_64-centos7-gcc8-opt\'\npython %s\nexit $?' % cmd + content = 'lsetup \'root pilot\'\npython %s\nexit $?' % cmd logger.debug('root setup script content:\n\n%s\n\n' % content) return content @@ -829,10 +830,11 @@ def get_middleware_container_script(middleware_container, cmd): :return: script content (string). """ + content = 'export PILOT_RUCIO_SITENAME=%s; ' % os.environ.get('PILOT_RUCIO_SITENAME') if 'rucio' in middleware_container: - content = 'python3 %s\nexit $?' % cmd + content += 'python3 %s\nexit $?' % cmd else: - content = 'lsetup rucio davix xrootd;python %s\nexit $?' % cmd + content += 'lsetup rucio davix xrootd;python %s\nexit $?' % cmd logger.debug('setup.sh content:\n%s' % content) return content diff --git a/pilot/user/atlas/cpu.py b/pilot/user/atlas/cpu.py index 14fea895d..54a9a1292 100644 --- a/pilot/user/atlas/cpu.py +++ b/pilot/user/atlas/cpu.py @@ -9,6 +9,7 @@ import os +from .utilities import get_memory_values from pilot.util.auxiliary import get_logger from pilot.util.container import execute @@ -69,11 +70,29 @@ def set_core_counts(job): log = get_logger(job.jobid) + # something like this could be used if prmon also gave info about ncores + # (change nprocs -> ncores and add ncores to list in utilities module, get_average_summary_dictionary_prmon()) + + #summary_dictionary = get_memory_values(job.workdir, name=job.memorymonitor) + #if summary_dictionary: + # if 'nprocs' in summary_dictionary["Other"]: + # try: + # job.actualcorecount = int(summary_dictionary["Other"]["nprocs"]) + # except Exception as e: + # log.warning('exception caught: %s' % e) + # else: + # job.corecounts = add_core_count(job.actualcorecount) + # log.debug('current core counts list: %s' % str(job.corecounts)) + # else: + # log.debug('summary_dictionary[Other]=%s' % summary_dictionary["Other"]) + #else: + # log.debug('no summary_dictionary') + if job.pgrp: # for debugging - cmd = "ps axo pgid,psr,comm,args | grep %d" % job.pgrp - exit_code, stdout, stderr = execute(cmd, mute=True) - log.debug('%s:\n%s\n' % (cmd, stdout)) + #cmd = "ps axo pgid,psr,comm,args | grep %d" % job.pgrp + #exit_code, stdout, stderr = execute(cmd, mute=True) + #log.debug('%s:\n%s\n' % (cmd, stdout)) # ps axo pgid,psr -> 154628 8 \n 154628 9 \n 1546280 1 .. # sort is redundant; uniq removes any duplicate lines; wc -l gives the final count @@ -86,10 +105,7 @@ def set_core_counts(job): except Exception as e: log.warning('failed to convert number of actual cores to int: %s' % e) else: - # overwrite the original core count (see discussion with Tadashi, 18/8/20) and add it to the list - # job.corecount = job.actualcorecount job.corecounts = add_core_count(job.actualcorecount) #, core_counts=job.corecounts) log.debug('current core counts list: %s' % str(job.corecounts)) - else: log.debug('payload process group not set - cannot check number of cores used by payload') diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index e4faf8b5b..d9a5e8a6b 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -656,7 +656,7 @@ def filter_value(value): "avgRSS": values['rss'].get('avg'), "avgSwap": values['swap'].get('avg')} # add the last of the rchar, .., values - keys = ['rchar', 'wchar', 'read_bytes', 'write_bytes'] + keys = ['rchar', 'wchar', 'read_bytes', 'write_bytes', 'nprocs'] # warning: should read_bytes/write_bytes be reported as rbytes/wbytes? for key in keys: value = get_last_value(dictionary.get(key, None)) diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index fcb0276f4..0a9ed9cc8 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -385,3 +385,16 @@ def is_python3(): """ return sys.version_info >= (3, 0) + + +def get_resource_name(): + """ + Return the name of the resource (only set for HPC resources; e.g. Cori, otherwise return 'grid'). + + :return: resource_name (string). + """ + + resource_name = os.environ.get('PILOT_RESOURCE_NAME', '').lower() + if not resource_name: + resource_name = 'grid' + return resource_name diff --git a/pilot/util/constants.py b/pilot/util/constants.py index ff93f9891..a57ca3d70 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '20' # build number should be reset to '1' for every new development cycle +BUILD = '27' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index 2a372ec4f..390aa65ea 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -211,10 +211,12 @@ use_middleware_container: True middleware_container_stagein_script: stagein.py middleware_container_stageout_script: stageout.py # error information and stage-in file status is saved in a json file by the stage-in script and later read by the pilot -stagein_dictionary: stagein_dictionary.json +stagein_status_dictionary: stagein_status.json +# replica information is passed to the stage-in script using a json file to avoid problems with very long argument lists +stagein_replica_dictionary: stagein_replicas.json middleware_stagein_stdout: stagein_stdout.txt middleware_stagein_stderr: stagein_stderr.txt -stageout_dictionary: stageout_dictionary.json +stageout_status_dictionary: stageout_status.json middleware_stageout_stdout: stageout_stdout.txt middleware_stageout_stderr: stageout_stderr.txt diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index b5fe6504a..992cb2a00 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -13,7 +13,7 @@ from pilot.common.exception import PilotException, StageInFailure, StageOutFailure from pilot.util.config import config from pilot.util.container import execute -from pilot.util.filehandling import copy, read_json, write_file, copy_pilot_source #, find_executable +from pilot.util.filehandling import copy, read_json, write_json, write_file, copy_pilot_source #, find_executable import logging logger = logging.getLogger(__name__) @@ -77,8 +77,8 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, _stdout_name, _stderr_name = get_logfile_names(label) write_file(path.join(job.workdir, _stdout_name), stdout, mute=False) write_file(path.join(job.workdir, _stderr_name), stderr, mute=False) - #logger.debug('stage-in/out stdout=\n%s' % stdout) - #logger.debug('stage-in/out stderr=\n%s' % stderr) + logger.debug('stage-in/out stdout=\n%s' % stdout) + logger.debug('stage-in/out stderr=\n%s' % stderr) except PilotException as e: msg = 'exception caught: %s' % e if label == 'stage-in': @@ -130,7 +130,23 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, lab :raises PilotException: for stage-in/out related failures """ - filedata_dictionary = get_filedata_strings(xdata) + if label == 'stage-out': + filedata_dictionary = get_filedata_strings(xdata) + else: + filedata_dictionary = get_filedata(xdata) + + # write file data to file + try: + status = write_json(path.join(job.workdir, config.Container.stagein_replica_dictionary), filedata_dictionary) + except Exception as e: + diagnostics = 'exception caught in get_command(): %s' % e + logger.warning(diagnostics) + raise PilotException(diagnostics) + else: + if not status: + diagnostics = 'failed to write replica dictionary to file' + logger.warning(diagnostics) + raise PilotException(diagnostics) # copy pilot source into container directory, unless it is already there diagnostics = copy_pilot_source(job.workdir) @@ -152,18 +168,24 @@ def get_command(job, xdata, queue, script, eventtype, localsite, remotesite, lab workdir = job.workdir if label == 'stage-in': - cmd = '%s --lfns=%s --scopes=%s -w %s -d -q %s --eventtype=%s --localsite=%s ' \ - '--remotesite=%s --produserid=\"%s\" --jobid=%s --taskid=%s --jobdefinitionid=%s ' \ - '--eventservicemerge=%s --usepcache=%s --filesizes=%s --checksums=%s --allowlans=%s --allowwans=%s ' \ - '--directaccesslans=%s --directaccesswans=%s --istars=%s --accessmodes=%s --storagetokens=%s --guids=%s ' \ - '--usevp=%s' % \ - (final_script_path, filedata_dictionary['lfns'], filedata_dictionary['scopes'], workdir, queue, eventtype, localsite, - remotesite, job.produserid.replace(' ', '%20'), job.jobid, job.taskid, job.jobdefinitionid, - job.is_eventservicemerge, job.infosys.queuedata.use_pcache, filedata_dictionary['filesizes'], - filedata_dictionary['checksums'], filedata_dictionary['allowlans'], filedata_dictionary['allowwans'], - filedata_dictionary['directaccesslans'], filedata_dictionary['directaccesswans'], filedata_dictionary['istars'], - filedata_dictionary['accessmodes'], filedata_dictionary['storagetokens'], filedata_dictionary['guids'], - job.use_vp) + cmd = "%s -w %s -d -q %s --eventtype=%s --localsite=%s --remotesite=%s --produserid=\"%s\" --jobid=%s " \ + "--taskid=%s --jobdefinitionid=%s --eventservicemerge=%s --usepcache=%s --usevp=%s " \ + "--replicadictionary=%s" % (final_script_path, workdir, queue, eventtype, localsite, remotesite, + job.produserid.replace(' ', '%20'), job.jobid, job.taskid, job.jobdefinitionid, + job.is_eventservicemerge, job.infosys.queuedata.use_pcache, job.use_vp, + config.Container.stagein_replica_dictionary) +# cmd = '%s --lfns=%s --scopes=%s -w %s -d -q %s --eventtype=%s --localsite=%s ' \ +# '--remotesite=%s --produserid=\"%s\" --jobid=%s --taskid=%s --jobdefinitionid=%s ' \ +# '--eventservicemerge=%s --usepcache=%s --filesizes=%s --checksums=%s --allowlans=%s --allowwans=%s ' \ +# '--directaccesslans=%s --directaccesswans=%s --istars=%s --accessmodes=%s --storagetokens=%s --guids=%s ' \ +# '--usevp=%s' % \ +# (final_script_path, filedata_dictionary['lfns'], filedata_dictionary['scopes'], workdir, queue, eventtype, localsite, +# remotesite, job.produserid.replace(' ', '%20'), job.jobid, job.taskid, job.jobdefinitionid, +# job.is_eventservicemerge, job.infosys.queuedata.use_pcache, filedata_dictionary['filesizes'], +# filedata_dictionary['checksums'], filedata_dictionary['allowlans'], filedata_dictionary['allowwans'], +# filedata_dictionary['directaccesslans'], filedata_dictionary['directaccesswans'], filedata_dictionary['istars'], +# filedata_dictionary['accessmodes'], filedata_dictionary['storagetokens'], filedata_dictionary['guids'], +# job.use_vp) else: # stage-out cmd = '%s --lfns=%s --scopes=%s -w %s -d -q %s --eventtype=%s --localsite=%s ' \ '--remotesite=%s --produserid=\"%s\" --jobid=%s --taskid=%s --jobdefinitionid=%s ' \ @@ -185,7 +207,7 @@ def handle_containerised_errors(job, xdata, label='stage-in'): :raises: StageInFailure, StageOutFailure """ - dictionary_name = config.Container.stagein_dictionary if label == 'stage-in' else config.Container.stageout_dictionary + dictionary_name = config.Container.stagein_status_dictionary if label == 'stage-in' else config.Container.stageout_status_dictionary # read the JSON file created by the stage-in/out script if path.exists(path.join(job.workdir, dictionary_name + '.log')): @@ -250,6 +272,44 @@ def get_logfile_names(label): return _stdout_name, _stderr_name +def get_filedata(data): + """ + Return a dictionary with LFNs, guids, scopes, datasets, ddmendpoints, etc. + Note: this dictionary will be written to a file that will be read back by the stage-in script inside the container. + Dictionary format: + { lfn1: { 'guid': guid1, 'scope': scope1, 'dataset': dataset1, 'ddmendpoint': ddmendpoint1, + 'filesize': filesize1, 'checksum': checksum1, 'allowlan': allowlan1, 'allowwan': allowwan1, + 'directaccesslan': directaccesslan1, 'directaccesswan': directaccesswan1, 'istar': istar1, + 'accessmode': accessmode1, 'storagetoken': storagetoken1}, lfn2: .. } + :param data: + :type data: + :return: + :rtype: + """ + + file_dictionary = {} + for fspec in data: + try: + _type = 'md5' if ('md5' in fspec.checksum and 'adler32' not in fspec.checksum) else 'adler32' + file_dictionary[fspec.lfn] = {'guid': fspec.guid, + 'scope': fspec.scope, + 'dataset': fspec.dataset, + 'ddmendpoint': fspec.ddmendpoint, + 'filesize': fspec.filesize, + 'checksum': fspec.checksum.get(_type, 'None'), + 'allowlan': fspec.allow_lan, + 'allowwan': fspec.allow_wan, + 'directaccesslan': fspec.direct_access_lan, + 'directaccesswan': fspec.direct_access_wan, + 'istar': fspec.is_tar, + 'accessmode': fspec.accessmode, + 'storagetoken': fspec.storage_token} + except Exception as e: + logger.warning('exception caught in get_filedata(): %s' % e) + + return file_dictionary + + def get_filedata_strings(data): """ Return a dictionary with comma-separated list of LFNs, guids, scopes, datasets, ddmendpoints, etc. From 457c7f15e217f3ddac0bd313f6ea2dafaf8c5b1b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 27 Oct 2020 10:47:35 +0100 Subject: [PATCH 30/33] Added rse_expression for VP jobs. Remote file verification now works. Fixed ES problem due to payload loop update. Added base_trace_report to config file --- PILOTVERSION | 2 +- pilot/api/data.py | 26 ++++++---- pilot/control/payloads/eventservicemerge.py | 1 + pilot/control/payloads/generic.py | 9 +++- pilot/user/atlas/common.py | 53 +++++++++++++++++---- pilot/util/constants.py | 2 +- pilot/util/default.cfg | 3 ++ 7 files changed, 76 insertions(+), 20 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6bdc480f5..7e4113d05 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.27 \ No newline at end of file +2.8.4.31 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index ae6e493f2..11068b8cc 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -25,7 +25,7 @@ from pilot.info import infosys from pilot.common.exception import PilotException, ErrorCodes, SizeTooLarge, NoLocalSpace, ReplicasNotFound from pilot.util.config import config -from pilot.util.filehandling import calculate_checksum +from pilot.util.filehandling import calculate_checksum, write_json from pilot.util.math import convert_mb_to_b from pilot.util.parameters import get_maximum_input_sizes from pilot.util.workernode import get_local_disk_space @@ -235,6 +235,8 @@ def resolve_replicas(self, files, use_vp=False): # noqa: C901 # reset the schemas for VP jobs if use_vp: query['schemes'] = ['root'] + query['rse_expression'] = 'istape=False\\type=SPECIAL' + logger.info('calling rucio.list_replicas() with query=%s' % query) try: @@ -778,7 +780,7 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 self.require_protocols(files, copytool, activity, local_dir=kwargs['input_dir']) # mark direct access files with status=remote_io - self.set_status_for_direct_access(files) + self.set_status_for_direct_access(files, kwargs.get('workdir', '')) # get remain files that need to be transferred by copytool remain_files = [e for e in files if e.status not in ['remote_io', 'transferred', 'no_transfer']] @@ -812,12 +814,13 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # return copytool.copy_in_bulk(remain_files, **kwargs) return copytool.copy_in(remain_files, **kwargs) - def set_status_for_direct_access(self, files): + def set_status_for_direct_access(self, files, workdir): """ Update the FileSpec status with 'remote_io' for direct access mode. Should be called only once since the function sends traces :param files: list of FileSpec objects. + :param workdir: work directory (string). :return: None """ @@ -830,7 +833,7 @@ def set_status_for_direct_access(self, files): fspec.status_code = 0 fspec.status = 'remote_io' - self.logger.info('stage-in: direct access (remoteio) will be used for lfn=%s (direct_lan=%s, direct_wan=%s), turl=%s' % + self.logger.info('stage-in: direct access (remote i/o) will be used for lfn=%s (direct_lan=%s, direct_wan=%s), turl=%s' % (fspec.lfn, direct_lan, direct_wan, fspec.turl)) # send trace @@ -839,14 +842,21 @@ def set_status_for_direct_access(self, files): self.trace_report.update(localSite=localsite, remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) self.trace_report.update(filename=fspec.lfn, guid=fspec.guid.replace('-', '')) self.trace_report.update(scope=fspec.scope, dataset=fspec.dataset) - self.trace_report.update(url=fspec.turl, clientState='FOUND_ROOT', stateReason='direct_access') # do not send the trace report at this point if remote file verification is to be done - # (the job object is needed for setting up the required script, and this is not known here) + # note also that we can't verify the files at this point since root will not be available from inside + # the rucio container if config.Pilot.remotefileverification_log: - # store the trace report for later use - write_json + # store the trace report for later use (the trace report class inherits from dict, so just write it as JSON) + # outside of the container, it will be available in the normal work dir + # use the normal work dir if we are not in a container + _workdir = workdir if os.path.exists(workdir) else '.' + path = os.path.join(_workdir, config.Pilot.base_trace_report) + if not os.path.exists(_workdir): + path = os.path.join('/srv', config.Pilot.base_trace_report) + self.logger.debug('writing base trace report to: %s' % path) + write_json(path, self.trace_report) else: self.trace_report.send() diff --git a/pilot/control/payloads/eventservicemerge.py b/pilot/control/payloads/eventservicemerge.py index 310bc54fd..012c76895 100644 --- a/pilot/control/payloads/eventservicemerge.py +++ b/pilot/control/payloads/eventservicemerge.py @@ -32,6 +32,7 @@ def untar_file(self, lfn, job): def utility_before_payload(self, job): """ Functions to run before payload + Note: this function updates job.jobparams (process_writetofile() call) :param job: job object """ diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 3a752a0d1..39ce6d362 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -393,6 +393,7 @@ def run_preprocess(self, job): exit_code = 0 try: + # note: this might update the jobparams cmd_before_payload = self.utility_before_payload(job) except Exception as e: log.error(e) @@ -442,12 +443,18 @@ def run(self): while True: log.info('payload iteration loop #%d' % iteration) - # first run the preprocess (if necessary) + # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd + jobparams_pre = self.__job.jobparams exit_code = self.run_preprocess(self.__job) + jobparams_post = self.__job.jobparams if exit_code: if exit_code == 160: exit_code = 0 break + if jobparams_pre != jobparams_post: + log.debug('jobparams were updated by utility_before_payload()') + # must update cmd + cmd = cmd.replace(jobparams_pre, jobparams_post) # now run the main payload, when it finishes, run the postprocess (if necessary) proc = self.run_payload(self.__job, cmd, self.__out, self.__err) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 543ec96b2..d8004b839 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -37,6 +37,7 @@ from pilot.util.container import execute from pilot.util.filehandling import remove, get_guid, remove_dir_tree, read_list, remove_core_dumps, copy,\ copy_pilot_source, write_file, read_json +from pilot.util.tracereport import TraceReport import logging logger = logging.getLogger(__name__) @@ -111,18 +112,18 @@ def validate(job): return status -def open_remote_files(indata, workdir, cmd): +def open_remote_files(indata, workdir): """ Verify that direct i/o files can be opened. :param indata: list of FileSpec. :param workdir: working directory (string). - :param cmd: asetup path (string). :return: exit code (int), diagnostics (string). """ ec = 0 diagnostics = "" + not_opened = "" # extract direct i/o files from indata (string of comma-separated turls) turls = extract_turls(indata) @@ -146,20 +147,20 @@ def open_remote_files(indata, workdir, cmd): diagnostics = 'cannot perform file open test - script path does not exist: %s' % full_script_path logger.warning(diagnostics) logger.warning('tested both path=%s and path=%s (none exists)' % (d1, d2)) - return ec, diagnostics + return ec, diagnostics, not_opened try: copy(full_script_path, final_script_path) except Exception as e: # do not set ec since this will be a pilot issue rather than site issue diagnostics = 'cannot perform file open test - pilot source copy failed: %s' % e logger.warning(diagnostics) - return ec, diagnostics + return ec, diagnostics, not_opened else: # correct the path when containers have been used final_script_path = os.path.join('.', script) _cmd = get_file_open_command(final_script_path, turls) - cmd = cmd + '; ' + create_root_container_command(workdir, _cmd) + cmd = create_root_container_command(workdir, _cmd) logger.info('*** executing file open verification script:\n\n\'%s\'\n\n' % cmd) exit_code, stdout, stderr = execute(cmd, usecontainer=False) @@ -190,7 +191,7 @@ def open_remote_files(indata, workdir, cmd): else: logger.info('nothing to verify (for remote files)') - return ec, diagnostics + return ec, diagnostics, not_opened def get_file_open_command(script_path, turls): @@ -253,13 +254,47 @@ def get_payload_command(job): # make sure that remote file can be opened before executing payload if config.Pilot.remotefileverification_log: + ec = 0 + diagnostics = "" + not_opened_turls = "" try: - ec, diagnostics = open_remote_files(job.indata, job.workdir, cmd) + ec, diagnostics, not_opened_turls = open_remote_files(job.indata, job.workdir) + except Exception as e: + log.warning('caught exception: %s' % e) + else: + # read back the base trace report + path = os.path.join(job.workdir, config.Pilot.base_trace_report) + if not os.path.exists(path): + log.warning('base trace report does not exist (cannot send trace reports): %s' % path) + try: + base_trace_report = read_json(path) + except PilotException as e: + log.warning('failed to open base trace report (cannot send trace reports): %s' % e) + else: + if not base_trace_report: + log.warning('failed to read back base trace report (cannot send trace reports)') + else: + # update and send the trace info + for fspec in job.indata: + if fspec.status == 'remote_io': + base_trace_report.update(url=fspec.turl) + base_trace_report.update(remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) + base_trace_report.update(filename=fspec.lfn, guid=fspec.guid.replace('-', '')) + base_trace_report.update(scope=fspec.scope, dataset=fspec.dataset) + if fspec.turl in not_opened_turls: + base_trace_report.update(clientState='FAILED_REMOTE_OPEN') + + # copy the base trace report (only a dictionary) into a real trace report object + trace_report = TraceReport(**base_trace_report) + if trace_report: + trace_report.send() + else: + log.warning('failed to create trace report for turl=%s' % fspec.turl) + # fail the job if the remote files could not be verified if ec != 0: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) raise PilotException(diagnostics, code=ec) - except Exception as e: - log.warning('caught exception: %s' % e) + if is_standard_atlas_job(job.swrelease): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index a57ca3d70..9e0a2fc6c 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '27' # build number should be reset to '1' for every new development cycle +BUILD = '31' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index 390aa65ea..41eb0e7d8 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -125,6 +125,9 @@ http_maxtime: 120 remotefileverification_dictionary: remotefileverification_dictionary.json remotefileverification_log: remotefileslog.txt +# The name of the base trace report (the base trace report is written to file for later use) +base_trace_report: base_trace_report.json + ################################ # Information service parameters From 0c8a08d4fce81a8c5e4c3cf9614a66ce055162cb Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 28 Oct 2020 13:28:28 +0100 Subject: [PATCH 31/33] Added container options from queuedata to middleware container setup --- PILOTVERSION | 2 +- pilot/api/data.py | 8 ++++++ pilot/control/data.py | 6 +++-- pilot/user/atlas/common.py | 47 ++++++++++++++++++----------------- pilot/user/atlas/container.py | 5 +++- pilot/util/constants.py | 2 +- pilot/util/middleware.py | 5 ++-- 7 files changed, 45 insertions(+), 30 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 7e4113d05..da786fd0e 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -2.8.4.31 \ No newline at end of file +2.8.4.34 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index 11068b8cc..ddc2fa2cf 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -829,6 +829,14 @@ def set_status_for_direct_access(self, files, workdir): fspec.is_directaccess(ensure_replica=True, allowed_replica_schemas=self.direct_localinput_allowed_schemas)) direct_wan = (fspec.domain == 'wan' and fspec.direct_access_wan and fspec.is_directaccess(ensure_replica=True, allowed_replica_schemas=self.remoteinput_allowed_schemas)) + + if not direct_lan and not direct_wan: + self.logger.debug('direct lan/wan transfer will not be used for lfn=%s' % fspec.lfn) + self.logger.debug('lfn=%s, direct_lan=%s, direct_wan=%s, direct_access_lan=%s, direct_access_wan=%s, ' + 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s' % \ + (fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, + str(self.direct_localinput_allowed_schemas), str(self.remoteinput_allowed_schemas))) + if direct_lan or direct_wan: fspec.status_code = 0 fspec.status = 'remote_io' diff --git a/pilot/control/data.py b/pilot/control/data.py index 10a89e12d..db1a93277 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -192,7 +192,8 @@ def _stage_in(args, job): logger.info('stage-in will be done in a container') try: eventtype, localsite, remotesite = get_trace_report_variables(job, label=label) - pilot.util.middleware.containerise_middleware(job, job.indata, args.queue, eventtype, localsite, remotesite, label=label) + pilot.util.middleware.containerise_middleware(job, job.indata, args.queue, eventtype, localsite, remotesite, + job.infosys.queuedata.container_options, label=label) except PilotException as e: logger.warning('stage-in containerisation threw a pilot exception: %s' % e) except Exception as e: @@ -755,7 +756,8 @@ def _do_stageout(job, xdata, activity, queue, title, output_dir=''): logger.info('stage-out will be done in a container') try: eventtype, localsite, remotesite = get_trace_report_variables(job, label=label) - pilot.util.middleware.containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, label=label) + pilot.util.middleware.containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, + job.infosys.queuedata.container_options, label=label) except PilotException as e: logger.warning('stage-out containerisation threw a pilot exception: %s' % e) except Exception as e: diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index d8004b839..26d204375 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -265,31 +265,32 @@ def get_payload_command(job): # read back the base trace report path = os.path.join(job.workdir, config.Pilot.base_trace_report) if not os.path.exists(path): - log.warning('base trace report does not exist (cannot send trace reports): %s' % path) - try: - base_trace_report = read_json(path) - except PilotException as e: - log.warning('failed to open base trace report (cannot send trace reports): %s' % e) + log.warning('base trace report does not exist (%s) - input file traces should already have been sent' % path) else: - if not base_trace_report: - log.warning('failed to read back base trace report (cannot send trace reports)') + try: + base_trace_report = read_json(path) + except PilotException as e: + log.warning('failed to open base trace report (cannot send trace reports): %s' % e) else: - # update and send the trace info - for fspec in job.indata: - if fspec.status == 'remote_io': - base_trace_report.update(url=fspec.turl) - base_trace_report.update(remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) - base_trace_report.update(filename=fspec.lfn, guid=fspec.guid.replace('-', '')) - base_trace_report.update(scope=fspec.scope, dataset=fspec.dataset) - if fspec.turl in not_opened_turls: - base_trace_report.update(clientState='FAILED_REMOTE_OPEN') - - # copy the base trace report (only a dictionary) into a real trace report object - trace_report = TraceReport(**base_trace_report) - if trace_report: - trace_report.send() - else: - log.warning('failed to create trace report for turl=%s' % fspec.turl) + if not base_trace_report: + log.warning('failed to read back base trace report (cannot send trace reports)') + else: + # update and send the trace info + for fspec in job.indata: + if fspec.status == 'remote_io': + base_trace_report.update(url=fspec.turl) + base_trace_report.update(remoteSite=fspec.ddmendpoint, filesize=fspec.filesize) + base_trace_report.update(filename=fspec.lfn, guid=fspec.guid.replace('-', '')) + base_trace_report.update(scope=fspec.scope, dataset=fspec.dataset) + if fspec.turl in not_opened_turls: + base_trace_report.update(clientState='FAILED_REMOTE_OPEN') + + # copy the base trace report (only a dictionary) into a real trace report object + trace_report = TraceReport(**base_trace_report) + if trace_report: + trace_report.send() + else: + log.warning('failed to create trace report for turl=%s' % fspec.turl) # fail the job if the remote files could not be verified if ec != 0: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) diff --git a/pilot/user/atlas/container.py b/pilot/user/atlas/container.py index d9add7c27..05ec5f43f 100644 --- a/pilot/user/atlas/container.py +++ b/pilot/user/atlas/container.py @@ -759,7 +759,7 @@ def create_root_container_command(workdir, cmd): return command -def create_middleware_container_command(workdir, cmd, label='stagein'): +def create_middleware_container_command(workdir, cmd, container_options, label='stagein'): """ Create the stage-in/out container command. @@ -776,6 +776,7 @@ def create_middleware_container_command(workdir, cmd, label='stagein'): :param workdir: working directory where script will be stored (string). :param cmd: isolated stage-in/out command (string). + :param container_options: container options from queuedata (string). :param label: 'stage-[in|out]' (string). :return: container command to be executed (string). """ @@ -800,6 +801,8 @@ def create_middleware_container_command(workdir, cmd, label='stagein'): command += 'export ALRB_CONT_RUNPAYLOAD=\"source /srv/%s\";' % script_name command += get_asetup(alrb=True) # export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; command += 'source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh -c %s' % middleware_container + command += ' ' + get_container_options(container_options) + command = command.replace(' ', ' ') logger.debug('container command: %s' % command) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9e0a2fc6c..5200e4ce8 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '31' # build number should be reset to '1' for every new development cycle +BUILD = '34' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/middleware.py b/pilot/util/middleware.py index 992cb2a00..239c20736 100644 --- a/pilot/util/middleware.py +++ b/pilot/util/middleware.py @@ -20,7 +20,7 @@ errors = ErrorCodes() -def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, label='stage-in'): +def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, container_options, label='stage-in'): """ Containerise the middleware by performing stage-in/out steps in a script that in turn can be run in a container. @@ -30,6 +30,7 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, :param eventtype: :param localsite: :param remotesite: + :param container_options: container options from queuedata (string). :param label: 'stage-in/out' (String). :return: :raises NotImplemented: if stagein=False, until stage-out script has been written @@ -53,7 +54,7 @@ def containerise_middleware(job, xdata, queue, eventtype, localsite, remotesite, pilot_user = environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.container' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: - cmd = user.create_middleware_container_command(job.workdir, cmd, label=label) + cmd = user.create_middleware_container_command(job.workdir, cmd, container_options, label=label) except PilotException as e: raise e else: From 978a11b2f10e4e647c601c2d53d5f4ff6c8f8c71 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 28 Oct 2020 13:45:33 +0100 Subject: [PATCH 32/33] Flake8 corrections --- pilot/api/data.py | 2 +- pilot/control/payloads/generic.py | 2 +- pilot/scripts/open_remote_file.py | 4 ++-- pilot/user/atlas/common.py | 3 +-- pilot/user/atlas/cpu.py | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index ddc2fa2cf..5f81953d6 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -833,7 +833,7 @@ def set_status_for_direct_access(self, files, workdir): if not direct_lan and not direct_wan: self.logger.debug('direct lan/wan transfer will not be used for lfn=%s' % fspec.lfn) self.logger.debug('lfn=%s, direct_lan=%s, direct_wan=%s, direct_access_lan=%s, direct_access_wan=%s, ' - 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s' % \ + 'direct_localinput_allowed_schemas=%s, remoteinput_allowed_schemas=%s' % (fspec.lfn, direct_lan, direct_wan, fspec.direct_access_lan, fspec.direct_access_wan, str(self.direct_localinput_allowed_schemas), str(self.remoteinput_allowed_schemas))) diff --git a/pilot/control/payloads/generic.py b/pilot/control/payloads/generic.py index 39ce6d362..17e579d2e 100644 --- a/pilot/control/payloads/generic.py +++ b/pilot/control/payloads/generic.py @@ -418,7 +418,7 @@ def run_preprocess(self, job): return exit_code - def run(self): + def run(self): # noqa: C901 """ Run all payload processes (including pre- and post-processes, and utilities). In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special diff --git a/pilot/scripts/open_remote_file.py b/pilot/scripts/open_remote_file.py index 94c883a7d..96ab6805f 100644 --- a/pilot/scripts/open_remote_file.py +++ b/pilot/scripts/open_remote_file.py @@ -79,8 +79,8 @@ def try_open_file(turl): if __name__ == '__main__': - """ - Main function of the remote file open script. + """ + Main function of the remote file open script. """ # get the args from the arg parser diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 26d204375..2a1d87bfb 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -220,7 +220,7 @@ def extract_turls(indata): return turls -def get_payload_command(job): +def get_payload_command(job): # noqa: C901 """ Return the full command for executing the payload, including the sourcing of all setup files and setting of environment variables. @@ -296,7 +296,6 @@ def get_payload_command(job): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(ec) raise PilotException(diagnostics, code=ec) - if is_standard_atlas_job(job.swrelease): # Normal setup (production and user jobs) diff --git a/pilot/user/atlas/cpu.py b/pilot/user/atlas/cpu.py index 54a9a1292..af82e7112 100644 --- a/pilot/user/atlas/cpu.py +++ b/pilot/user/atlas/cpu.py @@ -9,7 +9,7 @@ import os -from .utilities import get_memory_values +# from .utilities import get_memory_values from pilot.util.auxiliary import get_logger from pilot.util.container import execute From 8492da1f63dcd2aa67f39b978de47ae4e7c2cef8 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 28 Oct 2020 13:46:33 +0100 Subject: [PATCH 33/33] Updated build number --- pilot/util/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 5200e4ce8..0ed9b2a5e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '2' # released number should be fixed at 2 for Pilot 2 VERSION = '8' # version number is '1' for first real Pilot 2 release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '34' # build number should be reset to '1' for every new development cycle +BUILD = '35' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1