From 86847b6d53aeb35a7f6029e27a04e230fede81e6 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 14 Mar 2023 13:39:35 +0100 Subject: [PATCH 001/154] New version --- PILOTVERSION | 2 +- pilot/util/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 7858b9c11..291ab8a6c 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.1.17 \ No newline at end of file +3.5.2.1 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 76bf639dd..1652bbd33 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -13,8 +13,8 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '1' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '17' # build number should be reset to '1' for every new development cycle +REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '1' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From e499d0f2f9624e43813b976d930803fc9afe2771 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 17 Mar 2023 10:19:59 +0100 Subject: [PATCH 002/154] Fixed wrong order --- pilot/copytool/gs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/copytool/gs.py b/pilot/copytool/gs.py index 6f8a2e743..30d73e2bc 100644 --- a/pilot/copytool/gs.py +++ b/pilot/copytool/gs.py @@ -18,9 +18,9 @@ try: from google.cloud import storage except Exception: - storage_client = storage.Client() -else: storage_client = None +else: + storage_client = storage.Client() try: import pathlib # Python 3 From 757d1381053f04e93a54b5eff82ed2ece990b356 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Mon, 20 Mar 2023 09:15:23 +0100 Subject: 
[PATCH 003/154] prmon pid selection debugging turned on --- PILOTVERSION | 2 +- pilot/user/atlas/utilities.py | 7 ++++--- pilot/util/constants.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 291ab8a6c..4b07ed0a1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.1 \ No newline at end of file +3.5.2.2 \ No newline at end of file diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index dd61397ff..ca5083b03 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -156,8 +156,9 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", #_cmd = get_trf_command(command, transformation=transformation) # get ps info using group id ps = get_ps_info(pgrp) - if dump_ps: - logger.debug('ps:\n%s' % ps) + #if dump_ps: + # logger.debug('ps:\n%s' % ps) + #logger.debug('ps:\n%s' % ps) #logger.debug('attempting to identify pid for Singularity (v.3) runtime parent process') #_pid = get_pid_for_command(ps, command="Singularity runtime parent") #if _pid: @@ -172,7 +173,7 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", return -1 ps = get_ps_info(pgrp) - #logger.debug('ps:\n%s' % ps) + logger.debug('ps:\n%s' % ps) # lookup the process id using ps aux logger.debug('attempting to identify pid from job id') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 1652bbd33..d3eff030e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '1' # build number should be reset to '1' for every new development cycle +BUILD = '2' # build number should be reset to '1' for every new development 
cycle SUCCESS = 0 FAILURE = 1 From d68b499e0c736553c818d98a465f40b8adf36670 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Mon, 20 Mar 2023 10:46:26 +0100 Subject: [PATCH 004/154] Added CPU arch script --- PILOTVERSION | 2 +- pilot/control/job.py | 6 +- pilot/scripts/cpu_arch.py | 180 ++++++++++++++++++++++++++++++++++ pilot/user/atlas/common.py | 5 +- pilot/user/atlas/utilities.py | 2 +- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 11 ++- pilot/util/workernode.py | 56 +++++++++++ 8 files changed, 256 insertions(+), 8 deletions(-) create mode 100755 pilot/scripts/cpu_arch.py diff --git a/PILOTVERSION b/PILOTVERSION index 4b07ed0a1..bf24575a6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.2 \ No newline at end of file +3.5.2.5 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 36808a34f..5e3a12496 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -52,7 +52,7 @@ from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue from pilot.util.realtimelogger import cleanup as rtcleanup from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp -from pilot.util.workernode import get_disk_space, collect_workernode_info, get_node_name, get_cpu_model, get_cpu_cores +from pilot.util.workernode import get_disk_space, collect_workernode_info, get_node_name, get_cpu_model, get_cpu_cores, get_cpu_arch logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -653,6 +653,10 @@ def get_data_structure(job, state, args, xml=None, metadata=None, final=False): if product and vendor: logger.debug(f'cpuConsumptionUnit: could have added: product={product}, vendor={vendor}') + cpu_arch = get_cpu_arch(job.workdir) + if cpu_arch: + logger.debug(f'cpuConsumptionUnit: could have added: {cpu_arch}') + # add memory information if available add_memory_info(data, job.workdir, name=job.memorymonitor) if state == 'finished' or 
state == 'failed': diff --git a/pilot/scripts/cpu_arch.py b/pilot/scripts/cpu_arch.py new file mode 100755 index 000000000..0557be28b --- /dev/null +++ b/pilot/scripts/cpu_arch.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Alaettin Serhan Mete, alaettin.serhan.mete@cern.ch, 2023 +# - Paul Nilsson, paul.nilsson@cern.ch, 2023 + +import argparse +import logging +import re + +must_v4 = [] +must_not_v4 = [] +must_v3 = [] +must_not_v3 = [] +must_v2 = [] +must_not_v2 = [] + + +def get_flags_cpuinfo(): + """ + Get the CPU (model) name, number of cores of the corresponding CPU and the CPU flags from the /proc/cpuinfo + """ + cpu, cpu_core, flags = None, None, None + with open('/proc/cpuinfo', 'r') as fiile: + for line in fiile.readlines(): + if 'model name' in line: + cpu = line.split(':')[-1].strip() + if 'cpu cores' in line: + cpu_core = line.split(':')[-1].strip() + if 'flags' in line: + flags = line.split(':')[-1].strip() + if all([cpu, cpu_core, flags]): + return {"cpu": cpu, "cpu_core": cpu_core, "flags": flags} + + +def get_flags_pilotlog(pilotlogname): + """ + Get the site/queue name, the CPU (model) name, number of cores of the corresponding CPU and the CPU flags from the downloaded pilotlog + """ + site, cpu, cpu_core, flags = None, None, None, None + with open(pilotlogname, 'r') as fiile: + for line in fiile.readlines(): + if 'PANDA_RESOURCE' in line: + site = line.split('=')[-1].strip() + if 'model name' in line: + cpu = line.split(':')[-1].strip() + if 'coreCount' in line: + cpu_core = line.split(':')[-1].strip() + if 'flags' in line: + flags = line.split(':')[-1].strip() + if all([site, cpu, cpu_core, flags]): + return {"site": site, "cpu": cpu, "cpu_core": cpu_core, "flags": flags} + + +def set_naive(): + """ + Make a 
decision on the CPU architecture based on the simplified lists (must_'s) of flags + The must_not_'s have been left blank, these could be filled if need be + """ + global must_v4 + global must_not_v4 + global must_v3 + global must_not_v3 + global must_v2 + global must_not_v2 + + must_v4 = [r'AVX512.*'] + must_not_v4 = [] + + must_v3 = [r'AVX2.*'] + must_not_v3 = [] + + must_v2 = [r'SSE4_2.*'] + must_not_v2 = [] + + +def set_gcc(): + """ + Make a decision on the CPU architecture based on the modified lists (must_'s) of flags from gcc: LAHF_SAHF --> LAHF_LM; LZCNT --> ABM; removal of SSE3 + References: + https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/testsuite/gcc.target/i386/x86-64-v4.c;hb=324bec558e95584e8c1997575ae9d75978af59f1 + https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/testsuite/gcc.target/i386/x86-64-v3.c;hb=324bec558e95584e8c1997575ae9d75978af59f1 + https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/testsuite/gcc.target/i386/x86-64-v2.c;hb=324bec558e95584e8c1997575ae9d75978af59f1 + + The must_not_'s have been left blank, these could be filled if need be + """ + global must_v4 + global must_not_v4 + global must_v3 + global must_not_v3 + global must_v2 + global must_not_v2 + + must_v4 = [r'MMX', r'SSE', r'SSE2', r'LAHF_LM', r'POPCNT', r'SSE4_1', r'SSE4_2', r'SSSE3', r'AVX', r'AVX2', r'F16C', + r'FMA', r'ABM', r'MOVBE', r'XSAVE', r'AVX512F', r'AVX512BW', r'AVX512CD', r'AVX512DQ', r'AVX512VL'] + must_not_v4 = [] + + must_v3 = [r'MMX', r'SSE', r'SSE2', r'LAHF_LM', r'POPCNT', r'SSE4_1', r'SSE4_2', r'SSSE3', r'AVX', r'AVX2', r'F16C', + r'FMA', r'ABM', r'MOVBE', r'XSAVE'] + must_not_v3 = [] + + must_v2 = [r'MMX', r'SSE', r'SSE2', r'LAHF_LM', r'POPCNT', r'SSE4_1', r'SSE4_2', r'SSSE3'] + must_not_v2 = [] + + +def check_flags(must, must_not, flags): + """ + Matching of the actual CPU flags w.r.t. 
the lists of flags defined for deciding on architecture + """ + failed = False + for flag in must: + if not any([re.match(flag, test_flag, re.IGNORECASE) for test_flag in flags]): + logging.debug("Missing must-have: {0}".format(flag)) + failed = True + for flag in must_not: + if not any([re.match(flag, test_flag, re.IGNORECASE) for test_flag in flags]): + logging.debug("Present must-not-have: {0}".format(flag)) + failed = True + return failed + + +def all_version_checks(flag_string, name): + """ + Architecture is assigned to the CPU based on the check_flags() function + """ + flag_list = flag_string.split() + logging.debug("-------Checking V4 for {0}--------".format(name)) + failed_v4 = check_flags(must_v4, must_not_v4, flag_list) + if not failed_v4: + return "x86-64-v4" + else: + pass + logging.debug("-------Checking V3 for {0}--------".format(name)) + failed_v3 = check_flags(must_v3, must_not_v3, flag_list) + if not failed_v3: + return "x86-64-v3" + else: + pass + logging.debug("-------Checking V2 for {0}--------".format(name)) + failed_v2 = check_flags(must_v2, must_not_v2, flag_list) + if not failed_v2: + return "x86-64-v2" + else: + pass + logging.debug("-------Defaulting {0} to V1--------".format(name)) + if failed_v2 and failed_v3 and failed_v4: + return "x86-64-v1" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--logpath", default=None, type=str, help="Enter the full path to pilotlog") + parser.add_argument("--alg", default="naive", choices=["naive", "gcc"], help="algorithm type") + parser.add_argument("-d", "--debug", help="Enable additional logging", action="store_true") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO, + format="CPUFLAGS-%(asctime)s-%(process)d-%(levelname)s-%(message)s", + ) + + if args.alg == "naive": + set_naive() + elif args.alg == "gcc": + set_gcc() + else: + raise RuntimeError("Invalid option specified") + + if args.logpath is not 
None: + pilotlog = args.logpath + loginfo = get_flags_pilotlog(pilotlog) + arch_pilotlog = all_version_checks(loginfo["flags"], loginfo["cpu"]) + print(arch_pilotlog) + else: + cpuinfo = get_flags_cpuinfo() + arch_cpuinfo = all_version_checks(cpuinfo["flags"], cpuinfo["cpu"]) + print(arch_cpuinfo) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index aa90c450f..50e307727 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -179,13 +179,14 @@ def open_remote_files(indata, workdir, nthreads): # execute file open script which will attempt to open each file # copy pilot source into container directory, unless it is already there + script = 'open_remote_file.py' diagnostics = copy_pilot_source(workdir) if diagnostics: raise PilotException(diagnostics) - script = 'open_remote_file.py' final_script_path = os.path.join(workdir, script) - os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir + if workdir not in os.environ['PYTHONPATH']: + os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir script_path = os.path.join('pilot/scripts', script) dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot3'), script_path) dir2 = os.path.join(workdir, script_path) diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index ca5083b03..f903d2893 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -176,7 +176,7 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", logger.debug('ps:\n%s' % ps) # lookup the process id using ps aux - logger.debug('attempting to identify pid from job id') + logger.debug(f'attempting to identify pid from job id ({jobid})') _pid = get_pid_for_jobid(ps, jobid) if _pid: logger.debug('discovered pid=%d for job id %s' % (_pid, jobid)) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index d3eff030e..39e6f1dbb 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ 
-14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '2' # build number should be reset to '1' for every new development cycle +BUILD = '5' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index d59776296..62974aa63 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -1093,19 +1093,26 @@ def get_valid_path_from_list(paths): return valid_path -def copy_pilot_source(workdir): +def copy_pilot_source(workdir, filename=None): """ Copy the pilot source into the work directory. + If a filename is specified, only that file will be copied. :param workdir: working directory (string). + :param filename: specific filename (string). :return: diagnostics (string). 
""" diagnostics = "" srcdir = os.path.join(os.environ.get('PILOT_SOURCE_DIR', '.'), 'pilot3') + + if filename: + srcdir = os.path.join(srcdir, filename) + try: logger.debug(f'copy {srcdir} to {workdir}') - cmd = 'cp -r %s/* %s' % (srcdir, workdir) + # cmd = 'cp -r %s/* %s' % (srcdir, workdir) + cmd = 'cp -r %s %s' % (srcdir, workdir) exit_code, stdout, _ = execute(cmd) if exit_code != 0: diagnostics = f'file copy failed: {exit_code}, {stdout}' diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index a1a33696b..08f15dd4b 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -17,6 +17,7 @@ from pilot.util.auxiliary import sort_words from pilot.common.exception import PilotException, ErrorCodes from pilot.util.container import execute +from pilot.util.filehandling import copy_pilot_source, copy from pilot.info import infosys from pilot.util.disk import disk_usage @@ -121,6 +122,61 @@ def get_cpu_flags(sorted=True): return flags +def get_cpu_arch(workdir): + """ + Return the CPU architecture string. + + The CPU architecture string is determined by a script (pilot/scripts/cpu_arch.py), run by the pilot. + For details about this script, see: https://its.cern.ch/jira/browse/ATLINFR-4844 + + :param workdir: job workdir (string). + :return: CPU arch (string). 
+ """ + + cpu_arch = '' + + # copy pilot source into container directory, unless it is already there + script = 'cpu_arch.py' + script_path = os.path.join('pilot/scripts', script) + + diagnostics = copy_pilot_source(workdir, filename=script_path) + if diagnostics: + logger.warning('failed to read CPU architecture string') + return "" + + final_script_path = os.path.join(workdir, script) + if workdir not in os.environ['PYTHONPATH']: + os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir + + dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot3'), script_path) + dir2 = os.path.join(workdir, script_path) + full_script_path = dir1 if os.path.exists(dir1) else dir2 + if not os.path.exists(full_script_path): + logger.warning(f'failed to locate CPU architecture script: {full_script_path} does not exist') + return "" + + if os.path.exists(final_script_path): + logger.debug('CPU arch script already copied') + else: + try: + copy(full_script_path, final_script_path) + except PilotException as exc: + # do not set ec since this will be a pilot issue rather than site issue + diagnostics = f'cannot perform file open test - pilot source copy failed: {exc}' + logger.warning(diagnostics) + return "" + + # CPU arch script has now been copied, time to execute it + ec, stdout, stderr = execute('python3 cpu_arch.py --alg gcc') + if ec: + logger.debug(f'ec={ec}, stdout={stdout}, stderr={stderr}') + else: + cpu_arch = stdout + logger.debug(f'CPU arch script returned: {cpu_arch}') + + return cpu_arch + + def collect_workernode_info(path=None): """ Collect node information (cpu, memory and disk space). 
From 554588ece4a379fc4b24258a78947af8ac6ed255 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 20 Mar 2023 22:01:16 +0100 Subject: [PATCH 005/154] Path update --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/workernode.py | 59 ++++++++++++++++++++-------------------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bf24575a6..cb895838c 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.5 \ No newline at end of file +3.5.2.10 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 39e6f1dbb..b6806ccae 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '5' # build number should be reset to '1' for every new development cycle +BUILD = '10' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index 08f15dd4b..6b0fb8317 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -137,37 +137,38 @@ def get_cpu_arch(workdir): # copy pilot source into container directory, unless it is already there script = 'cpu_arch.py' - script_path = os.path.join('pilot/scripts', script) - - diagnostics = copy_pilot_source(workdir, filename=script_path) - if diagnostics: - logger.warning('failed to read CPU architecture string') - return "" - - final_script_path = os.path.join(workdir, script) - if workdir not in os.environ['PYTHONPATH']: - os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir - - dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot3'), script_path) - dir2 = os.path.join(workdir, 
script_path) - full_script_path = dir1 if os.path.exists(dir1) else dir2 - if not os.path.exists(full_script_path): - logger.warning(f'failed to locate CPU architecture script: {full_script_path} does not exist') - return "" - - if os.path.exists(final_script_path): - logger.debug('CPU arch script already copied') - else: - try: - copy(full_script_path, final_script_path) - except PilotException as exc: - # do not set ec since this will be a pilot issue rather than site issue - diagnostics = f'cannot perform file open test - pilot source copy failed: {exc}' - logger.warning(diagnostics) - return "" + srcdir = os.path.join(os.environ.get('PILOT_SOURCE_DIR', '.'), 'pilot3') + script_dir = os.path.join(srcdir, 'pilot/scripts') + + #diagnostics = copy_pilot_source(workdir, filename=script_path) + #if diagnostics: + # logger.warning('failed to read CPU architecture string') + # return "" + + #final_script_path = os.path.join(workdir, script) + if script_dir not in os.environ['PYTHONPATH']: + os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + script_dir + + #dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot3'), script_path) + #dir2 = os.path.join(workdir, script_path) + #full_script_path = dir1 if os.path.exists(dir1) else dir2 + #if not os.path.exists(full_script_path): + # logger.warning(f'failed to locate CPU architecture script: {full_script_path} does not exist') + # return "" + + #if os.path.exists(final_script_path): + # logger.debug('CPU arch script already copied') + #else: + # try: + # copy(full_script_path, final_script_path) + # except PilotException as exc: + # # do not set ec since this will be a pilot issue rather than site issue + # diagnostics = f'cannot perform file open test - pilot source copy failed: {exc}' + # logger.warning(diagnostics) + # return "" # CPU arch script has now been copied, time to execute it - ec, stdout, stderr = execute('python3 cpu_arch.py --alg gcc') + ec, stdout, stderr = execute(f'python3 
{script_dir}/{script} --alg gcc') if ec: logger.debug(f'ec={ec}, stdout={stdout}, stderr={stderr}') else: From 9021617d96d8f3a5c5d2a60d85e2dc31c209fce1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 21 Mar 2023 11:52:51 +0100 Subject: [PATCH 006/154] Fixed bad cp pattern --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 3 +-- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index cb895838c..e78bdbccb 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.10 \ No newline at end of file +3.5.2.12 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 50e307727..5ce68bdfa 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -185,8 +185,7 @@ def open_remote_files(indata, workdir, nthreads): raise PilotException(diagnostics) final_script_path = os.path.join(workdir, script) - if workdir not in os.environ['PYTHONPATH']: - os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir + os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + workdir script_path = os.path.join('pilot/scripts', script) dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot3'), script_path) dir2 = os.path.join(workdir, script_path) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b6806ccae..5ec2f5b14 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '10' # build number should be reset to '1' for every new development cycle +BUILD = '12' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 
diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 62974aa63..bd8c6c634 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -1111,8 +1111,8 @@ def copy_pilot_source(workdir, filename=None): try: logger.debug(f'copy {srcdir} to {workdir}') - # cmd = 'cp -r %s/* %s' % (srcdir, workdir) - cmd = 'cp -r %s %s' % (srcdir, workdir) + pat = '%s' if filename else '%s/*' + cmd = f'cp -r {pat} %s' % (srcdir, workdir) exit_code, stdout, _ = execute(cmd) if exit_code != 0: diagnostics = f'file copy failed: {exit_code}, {stdout}' From a04bbf6db1cd3feb23a32e41dbbd0eb36e894ed2 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 21 Mar 2023 11:53:32 +0100 Subject: [PATCH 007/154] Updated mod times --- pilot/user/atlas/common.py | 2 +- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 5ce68bdfa..8e1a6ae12 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2022 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 # - Wen Guan, wen.guan@cern.ch, 2018 from collections import defaultdict diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 5ec2f5b14..78f750468 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -6,7 +6,7 @@ # # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2022 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 from os import environ diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index bd8c6c634..9fb18ae45 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2022 +# - Paul Nilsson, 
paul.nilsson@cern.ch, 2017-2023 import hashlib import io From eaeda639158a552b78c2fb77015f48afa9c3cb0a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 21 Mar 2023 11:54:06 +0100 Subject: [PATCH 008/154] Removed unused imports --- pilot/util/workernode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index 6b0fb8317..a72dd650e 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -17,7 +17,7 @@ from pilot.util.auxiliary import sort_words from pilot.common.exception import PilotException, ErrorCodes from pilot.util.container import execute -from pilot.util.filehandling import copy_pilot_source, copy +#from pilot.util.filehandling import copy_pilot_source, copy from pilot.info import infosys from pilot.util.disk import disk_usage From 8739f14bdac8d1fdeda47a94376c57ee888fd78a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 21 Mar 2023 12:27:15 +0100 Subject: [PATCH 009/154] Now making sure that the job workdir still exists for tobekilled instruction - if not, ignore (will lead to later problem though) --- PILOTVERSION | 2 +- pilot/control/job.py | 5 ++++- pilot/util/constants.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e78bdbccb..5117ff5ab 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.12 \ No newline at end of file +3.5.2.13 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 5e3a12496..c3a224ae2 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -346,7 +346,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) # does the server update contain any backchannel information? 
if so, update the job object handle_backchannel_command(res, job, args, test_tobekilled=test_tobekilled) - if final: + if final and os.path.exists(job.workdir): # ignore if workdir doesn't exist - might be a delayed jobUpdate os.environ['SERVER_UPDATE'] = SERVER_UPDATE_FINAL return True @@ -508,6 +508,9 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): logger.debug(f'exception caught in get_debug_command(): {error}') elif 'tobekilled' in cmd: logger.info(f'pilot received a panda server signal to kill job {job.jobid} at {time_stamp()}') + if not os.path.exists(job.workdir): # jobUpdate might be delayed - do not cause problems for new downloaded job + logger.warning(f'job.workdir ({job.workdir}) does not exist - ignore kill instruction') + return set_pilot_state(job=job, state="failed") job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PANDAKILL) if job.pid: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 78f750468..28a4e5244 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '12' # build number should be reset to '1' for every new development cycle +BUILD = '13' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From f34bb914f351a54e55ff82b3d7a48597f9e70a5d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 21 Mar 2023 17:26:45 +0100 Subject: [PATCH 010/154] Now informing exactly which log file is too big --- pilot/util/monitoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 685c62484..ea1cf5728 100644 --- a/pilot/util/monitoring.py +++ 
b/pilot/util/monitoring.py @@ -530,7 +530,7 @@ def check_payload_stdout(job): localsizelimit_stdout = get_local_size_limit_stdout() if fsize > localsizelimit_stdout: exit_code = errors.STDOUTTOOBIG - diagnostics = f"Payload stdout file too big: {fsize} B (larger than limit {localsizelimit_stdout} B)" + diagnostics = f"log file {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B)" logger.warning(diagnostics) # kill the job From 08f6f994b0d409db26e960f303a52e2555370588 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 21 Mar 2023 17:30:58 +0100 Subject: [PATCH 011/154] Update --- pilot/user/atlas/diagnose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/user/atlas/diagnose.py b/pilot/user/atlas/diagnose.py index 15c255674..f7e53528b 100644 --- a/pilot/user/atlas/diagnose.py +++ b/pilot/user/atlas/diagnose.py @@ -726,7 +726,7 @@ def is_bad_alloc(job_report_errors): def get_log_extracts(job, state): """ - Extract special warnings and other other info from special logs. + Extract special warnings and other info from special logs. This function also discovers if the payload had any outbound connections. :param job: job object. 
From 7116e750bd948de9496e35b5b76950b57c898a68 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 22 Mar 2023 08:47:23 +0100 Subject: [PATCH 012/154] Updated log message --- pilot/control/job.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index c3a224ae2..5283b7b3e 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -658,7 +658,8 @@ def get_data_structure(job, state, args, xml=None, metadata=None, final=False): cpu_arch = get_cpu_arch(job.workdir) if cpu_arch: - logger.debug(f'cpuConsumptionUnit: could have added: {cpu_arch}') + # data['architecture_version'] = cpu_arch + logger.debug(f'architecture_version: could have added: {cpu_arch}') # add memory information if available add_memory_info(data, job.workdir, name=job.memorymonitor) From b54a7f18d7d37835986b09685352e65e2a04995d Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 22 Mar 2023 11:09:41 +0100 Subject: [PATCH 013/154] Zipping too large log files --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 30 ++++++++++++++++++++++++++++++ pilot/util/monitoring.py | 27 +++++++++++++++++---------- 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 5117ff5ab..0d2274ed2 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.13 \ No newline at end of file +3.5.2.14 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 28a4e5244..a6a9ee4ea 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '13' # build number should be reset to '1' for every new development cycle +BUILD = '14' # build 
number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 9fb18ae45..e0b8328ba 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -25,6 +25,7 @@ from zlib import adler32 from functools import partial from mmap import mmap +from zipfile import ZipFile, ZIP_DEFLATED from pilot.common.exception import ConversionFailure, FileHandlingFailure, MKDirFailure, NoSuchFile from pilot.util.config import config @@ -1241,3 +1242,32 @@ def find_file(filename, startdir): break return _path + + +def zip_files(archivename, files): + """ + Zip a list of files with standard compression level. + + :param archivename: archive name (string). + :param files: list of files. + :return: status (Boolean) + """ + + status = False + try: + + zipped = False + with ZipFile(archivename, 'w', ZIP_DEFLATED) as zip: + for _file in files: + if os.path.exists(_file): + zip.write(_file) + zipped = True + if not zipped: + print('nothing was zipped') + else: + status = True + + except Exception as exc: + print(f'failed to create archive {archivename}: {exc}') + + return status diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index ea1cf5728..e1281a6f2 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -20,7 +20,7 @@ from pilot.util.config import config from pilot.util.constants import PILOT_PRE_PAYLOAD from pilot.util.container import execute -from pilot.util.filehandling import get_disk_usage, remove_files, get_local_file_size, read_file +from pilot.util.filehandling import get_disk_usage, remove_files, get_local_file_size, read_file, zip_files from pilot.util.loopingjob import looping_job from pilot.util.math import convert_mb_to_b, human2bytes from pilot.util.parameters import convert_to_int, get_maximum_input_sizes @@ -512,6 +512,7 @@ def check_payload_stdout(job): logger.debug(f'file list={file_list}') # now loop over all files and check 
each individually (any large enough file will fail the job) + to_be_zipped = [] for filename in file_list: logger.debug(f'check_payload_stdout: filename={filename}') @@ -530,26 +531,32 @@ def check_payload_stdout(job): localsizelimit_stdout = get_local_size_limit_stdout() if fsize > localsizelimit_stdout: exit_code = errors.STDOUTTOOBIG - diagnostics = f"log file {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B)" + diagnostics = f"log file {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B) [will be zipped]" logger.warning(diagnostics) - - # kill the job - set_pilot_state(job=job, state="failed") - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code) - kill_processes(job.pid) - - # remove the payload stdout file after the log extracts have been created + to_be_zipped.append(filename) # remove any lingering input files from the work dir lfns, guids = job.get_lfns_and_guids() if lfns: # remove any lingering input files from the work dir - exit_code = remove_files(job.workdir, lfns) + remove_files(job.workdir, lfns) else: logger.info(f"payload log ({os.path.basename(filename)}) within allowed size limit ({localsizelimit_stdout} B): {fsize} B") else: logger.info(f"skipping file size check of payload stdout file ({filename}) since it has not been created yet") + if to_be_zipped: + logger.warning(f'the following files will be zipped: {to_be_zipped}') + archivename = os.path.join(job.workdir, 'oversized_files.zip') + status = zip_files(archivename, to_be_zipped) + if status: + logger.info(f'created archive {archivename}') + + # kill the job + set_pilot_state(job=job, state="failed") + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code) + kill_processes(job.pid) # will not return + return exit_code, diagnostics From 4edc3e1d5a95d2a378592750b7f48939baf401ad Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 22 Mar 2023 11:13:50 +0100 Subject: [PATCH 014/154] Simplified 
handle_backchannel_command() --- PILOTVERSION | 2 +- pilot/control/job.py | 7 +------ pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 0d2274ed2..8a143c0c5 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.14 \ No newline at end of file +3.5.2.15 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 5283b7b3e..1d7ded82b 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -502,10 +502,7 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): cmd = res.get('command') # is it a 'command options'-type? debug_command=tail .., ls .., gdb .., ps .., du .. if ' ' in cmd and 'tobekilled' not in cmd: - try: - job.debug, job.debug_command = get_debug_command(cmd) - except Exception as error: - logger.debug(f'exception caught in get_debug_command(): {error}') + job.debug, job.debug_command = get_debug_command(cmd) elif 'tobekilled' in cmd: logger.info(f'pilot received a panda server signal to kill job {job.jobid} at {time_stamp()}') if not os.path.exists(job.workdir): # jobUpdate might be delayed - do not cause problems for new downloaded job @@ -516,8 +513,6 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): if job.pid: logger.debug('killing payload process') kill_process(job.pid) - else: - logger.debug('no pid to kill') args.abort_job.set() elif 'softkill' in cmd: logger.info(f'pilot received a panda server signal to softkill job {job.jobid} at {time_stamp()}') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index a6a9ee4ea..968780d9c 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for 
small updates -BUILD = '14' # build number should be reset to '1' for every new development cycle +BUILD = '15' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From e3ce6e3628cdd73fd04e536ce805bb942e264582 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 28 Mar 2023 16:40:10 +0200 Subject: [PATCH 015/154] Now sending cpu_architecture_level --- PILOTVERSION | 2 +- pilot/control/job.py | 3 +-- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 8a143c0c5..3ecd57c6a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.15 \ No newline at end of file +3.5.2.16 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 1d7ded82b..9fa20961f 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -653,8 +653,7 @@ def get_data_structure(job, state, args, xml=None, metadata=None, final=False): cpu_arch = get_cpu_arch(job.workdir) if cpu_arch: - # data['architecture_version'] = cpu_arch - logger.debug(f'architecture_version: could have added: {cpu_arch}') + data['cpu_architecture_level'] = cpu_arch # add memory information if available add_memory_info(data, job.workdir, name=job.memorymonitor) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 968780d9c..46ddbf104 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '15' # build number should be reset to '1' for every new development cycle +BUILD = '16' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 547735f5ec14448a2b7675e90d517511e8c2e6ed Mon Sep 17 00:00:00 2001 
From: PalNilsson Date: Wed, 29 Mar 2023 08:40:10 +0200 Subject: [PATCH 016/154] Cleaned up get_cpu_arch() --- PILOTVERSION | 2 +- pilot/control/job.py | 2 +- pilot/util/constants.py | 2 +- pilot/util/workernode.py | 27 +-------------------------- 4 files changed, 4 insertions(+), 29 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 3ecd57c6a..2efc852dc 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.16 \ No newline at end of file +3.5.2.17 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 9fa20961f..3a1caaa96 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -651,7 +651,7 @@ def get_data_structure(job, state, args, xml=None, metadata=None, final=False): if product and vendor: logger.debug(f'cpuConsumptionUnit: could have added: product={product}, vendor={vendor}') - cpu_arch = get_cpu_arch(job.workdir) + cpu_arch = get_cpu_arch() if cpu_arch: data['cpu_architecture_level'] = cpu_arch diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 46ddbf104..ce7ff85a6 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '16' # build number should be reset to '1' for every new development cycle +BUILD = '17' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index a72dd650e..2882aabed 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -122,14 +122,13 @@ def get_cpu_flags(sorted=True): return flags -def get_cpu_arch(workdir): +def get_cpu_arch(): """ Return the CPU architecture string. 
The CPU architecture string is determined by a script (pilot/scripts/cpu_arch.py), run by the pilot. For details about this script, see: https://its.cern.ch/jira/browse/ATLINFR-4844 - :param workdir: job workdir (string). :return: CPU arch (string). """ @@ -140,33 +139,9 @@ def get_cpu_arch(workdir): srcdir = os.path.join(os.environ.get('PILOT_SOURCE_DIR', '.'), 'pilot3') script_dir = os.path.join(srcdir, 'pilot/scripts') - #diagnostics = copy_pilot_source(workdir, filename=script_path) - #if diagnostics: - # logger.warning('failed to read CPU architecture string') - # return "" - - #final_script_path = os.path.join(workdir, script) if script_dir not in os.environ['PYTHONPATH']: os.environ['PYTHONPATH'] = os.environ.get('PYTHONPATH') + ':' + script_dir - #dir1 = os.path.join(os.path.join(os.environ['PILOT_HOME'], 'pilot3'), script_path) - #dir2 = os.path.join(workdir, script_path) - #full_script_path = dir1 if os.path.exists(dir1) else dir2 - #if not os.path.exists(full_script_path): - # logger.warning(f'failed to locate CPU architecture script: {full_script_path} does not exist') - # return "" - - #if os.path.exists(final_script_path): - # logger.debug('CPU arch script already copied') - #else: - # try: - # copy(full_script_path, final_script_path) - # except PilotException as exc: - # # do not set ec since this will be a pilot issue rather than site issue - # diagnostics = f'cannot perform file open test - pilot source copy failed: {exc}' - # logger.warning(diagnostics) - # return "" - # CPU arch script has now been copied, time to execute it ec, stdout, stderr = execute(f'python3 {script_dir}/{script} --alg gcc') if ec: From f9bfbd245de13ebf82152a1154e8e79671b351cc Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 29 Mar 2023 09:30:37 +0200 Subject: [PATCH 017/154] Checking for 'running' state before sending heartbeat --- pilot/control/job.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py 
index 3a1caaa96..de34e545f 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2909,7 +2909,11 @@ def send_heartbeat_if_time(job, args, update_time): """ if int(time.time()) - update_time >= get_heartbeat_period(job.debug and job.debug_command): - if job.serverstate != 'finished' and job.serverstate != 'failed': + # check for state==running here, and send explicit 'running' in send_state, rather than sending job.state + # since the job state can actually change in the meantime by another thread + # job.completed will anyway be checked in https::send_update() + if job.serverstate != 'finished' and job.serverstate != 'failed' and job.state == 'running': + logger.info('will send heartbeat for job in \'running\' state') send_state(job, args, 'running') update_time = int(time.time()) From c8cd0422152644be5b9501ce018e0cc2a25f30b3 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 29 Mar 2023 09:32:38 +0200 Subject: [PATCH 018/154] Added completed field --- pilot/info/jobdata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index c0ee703b5..c826ac1bd 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -112,6 +112,7 @@ class JobData(BaseData): checkinputsize = True # False when mv copytool is used and input reside on non-local disks subprocesses = [] # list of PIDs for payload subprocesses prodproxy = "" # to keep track of production proxy on unified queues + completed = False # True when job has finished or failed, used by https::send_update() # time variable used for on-the-fly cpu consumption time measurements done by job monitoring t0 = None # payload startup time From 2aad97a11fabfa0a47141ba76078a4f6b5605c39 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 29 Mar 2023 09:40:41 +0200 Subject: [PATCH 019/154] Aborting send_update() for running state if job has already completed --- pilot/util/https.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pilot/util/https.py 
b/pilot/util/https.py index 6ed6f69d6..f0613fdbf 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -452,6 +452,12 @@ def send_update(update_function, data, url, port, job=None, ipv='IPv6'): job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.REACHEDMAXTIME, msg=msg) add_error_codes(data, job) + # do not allow any delayed heartbeat messages for running state, if the job has completed (ie another call to this + # function was already made by another thread for finished/failed state) + if job.completed and job.state == 'running': + logger.warning('will not send job update for running state since the job has already completed') + return None # should be ignored + while attempt < max_attempts and not done: logger.info(f'server update attempt {attempt + 1}/{max_attempts}') From 0f4ca5609e4a2d2aafb676cd547901ced75ce636 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 29 Mar 2023 09:44:26 +0200 Subject: [PATCH 020/154] Aborting send_update() for running state if job has already completed --- pilot/util/https.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pilot/util/https.py b/pilot/util/https.py index f0613fdbf..f02bc811d 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -454,9 +454,10 @@ def send_update(update_function, data, url, port, job=None, ipv='IPv6'): # do not allow any delayed heartbeat messages for running state, if the job has completed (ie another call to this # function was already made by another thread for finished/failed state) - if job.completed and job.state == 'running': - logger.warning('will not send job update for running state since the job has already completed') - return None # should be ignored + if job: # ignore for updateWorkerPilotStatus calls + if job.completed and job.state == 'running': + logger.warning('will not send job update for running state since the job has already completed') + return None # should be ignored while attempt < max_attempts and not done: 
logger.info(f'server update attempt {attempt + 1}/{max_attempts}') From 1b7c6d9393939e026d81a3d2dc785f43a62e0585 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 29 Mar 2023 09:59:51 +0200 Subject: [PATCH 021/154] Aborting send_update() for running state if job has already completed --- pilot/control/job.py | 3 +++ pilot/util/https.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index de34e545f..de8e05942 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -313,6 +313,9 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) job.state = state state = get_proper_state(job, state) + if state == 'finished' or state == 'holding' or state == 'failed': + logger.info(f'this job has now completed (state={state})') + job.completed = True # should the pilot make any server updates? if not args.update_server: diff --git a/pilot/util/https.py b/pilot/util/https.py index f02bc811d..ba2acf776 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -447,6 +447,7 @@ def send_update(update_function, data, url, port, job=None, ipv='IPv6'): data['state'] = 'failed' if job: job.state = 'failed' + job.completed = True msg = 'the max batch system time limit has been reached' logger.warning(msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.REACHEDMAXTIME, msg=msg) @@ -455,8 +456,8 @@ def send_update(update_function, data, url, port, job=None, ipv='IPv6'): # do not allow any delayed heartbeat messages for running state, if the job has completed (ie another call to this # function was already made by another thread for finished/failed state) if job: # ignore for updateWorkerPilotStatus calls - if job.completed and job.state == 'running': - logger.warning('will not send job update for running state since the job has already completed') + if job.completed and (job.state == 'running' or job.state == 'starting'): + logger.warning(f'will not send 
job update for {job.state} state since the job has already completed') return None # should be ignored while attempt < max_attempts and not done: From 703890f5cd8e525a88aeaed18153c39aa11e036f Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 29 Mar 2023 10:01:43 +0200 Subject: [PATCH 022/154] Not accepting any stderr from cpu_arch.py --- pilot/util/workernode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index 2882aabed..7131da494 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -144,7 +144,7 @@ def get_cpu_arch(): # CPU arch script has now been copied, time to execute it ec, stdout, stderr = execute(f'python3 {script_dir}/{script} --alg gcc') - if ec: + if ec or stderr: logger.debug(f'ec={ec}, stdout={stdout}, stderr={stderr}') else: cpu_arch = stdout From a7587cbd9983acdcb523b8f722a910fc8b3e428d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 29 Mar 2023 18:39:07 +0200 Subject: [PATCH 023/154] Now reporting received kill signal immediately --- pilot/control/job.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 9fa20961f..af8ab7b65 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2781,6 +2781,18 @@ def job_monitor(queues, traces, args): # noqa: C901 for i in range(len(jobs)): current_id = jobs[i].jobid + # if abort_job and signal was set + if abort_job and args.signal: + error_code = get_signal_error(args.signal) + jobs[i].state = 'failed' + jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(error_code) + jobs[i].completed = True + # update server immediately + send_state(jobs[i], args, jobs[i].state) + if jobs[i].pid: + logger.debug('killing payload processes') + kill_processes(jobs[i].pid) + if os.environ.get('REACHED_MAXTIME', None): # the batch system max time has been reached, time to abort (in the next step) 
jobs[i].state = 'failed' @@ -2815,7 +2827,7 @@ def job_monitor(queues, traces, args): # noqa: C901 break # run this check again in case job_monitor_tasks() takes a long time to finish (and the job object - # has expired in the mean time) + # has expired in the meantime) try: _job = jobs[i] except Exception: @@ -2860,6 +2872,26 @@ def job_monitor(queues, traces, args): # noqa: C901 logger.info('[job] job monitor thread has finished') +def get_signal_error(sig): + """ + Return a corresponding pilot error code for the given signal. + + :param sig: signal. + :return: pilot error code (int). + """ + + _sig = str(sig) # e.g. 'SIGTERM' + codes = {'SIGBUS': errors.SIGBUS, + 'SIGQUIT': errors.SIGQUIT, + 'SIGSEGV': errors.SIGSEGV, + 'SIGTERM': errors.SIGTERM, + 'SIGXCPU': errors.SIGXCPU, + 'SIGUSR1': errors.SIGUSR1, + 'USERKILL': errors.USERKILL} + ret = codes.get(_sig) if _sig in codes else errors.KILLSIGNAL + return ret + + def download_new_proxy(role='production', proxy_type='', workdir=''): """ The production proxy has expired, try to download a new one. 
From 031bd71098e04045942f190041415d3586bc37be Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 29 Mar 2023 19:05:49 +0200 Subject: [PATCH 024/154] Merged version --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 2efc852dc..af1b6026d 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.17 \ No newline at end of file +3.5.2.18 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index ce7ff85a6..d089c8bf1 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '17' # build number should be reset to '1' for every new development cycle +BUILD = '18' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d7495bcd2fba0bf5c64d09c24e87296942eba5c0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 30 Mar 2023 14:22:49 +0200 Subject: [PATCH 025/154] Refactored check_payoad_stdout(). Updated remove_files(). 
Checking size of archive --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 7 ++-- pilot/util/loopingjob.py | 2 +- pilot/util/monitoring.py | 82 +++++++++++++++++++++++++++----------- 5 files changed, 65 insertions(+), 30 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index af1b6026d..21660dcca 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.18 \ No newline at end of file +3.5.2.19 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index d089c8bf1..ae7a07b65 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '18' # build number should be reset to '1' for every new development cycle +BUILD = '20' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index e0b8328ba..52bc74662 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -467,12 +467,13 @@ def remove_dir_tree(path): return 0 -def remove_files(workdir, files): +def remove_files(files, workdir=None): """ Remove all given files from workdir. + If workdir is set, it will be used as base path. - :param workdir: working directory (string). - :param files: file list. 
+ :param files: file list + :param workdir: optional working directory (string) :return: exit code (0 if all went well, -1 otherwise) """ diff --git a/pilot/util/loopingjob.py b/pilot/util/loopingjob.py index eebfabba9..fec63978d 100644 --- a/pilot/util/loopingjob.py +++ b/pilot/util/loopingjob.py @@ -188,7 +188,7 @@ def kill_looping_job(job): # remove any lingering input files from the work dir lfns, _ = job.get_lfns_and_guids() if lfns: - _ec = remove_files(job.workdir, lfns) + _ec = remove_files(lfns, workdir=job.workdir) if _ec != 0: logger.warning('failed to remove all files') diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index e1281a6f2..e188d24f6 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -515,42 +515,38 @@ def check_payload_stdout(job): to_be_zipped = [] for filename in file_list: - logger.debug(f'check_payload_stdout: filename={filename}') if "job.log.tgz" in filename: - logger.info(f"skipping file size check of file ({filename}) since it is a special log file") + logger.debug(f"skipping file size check of file ({filename}) since it is a special log file") continue if os.path.exists(filename): - try: - # get file size in bytes - fsize = os.path.getsize(filename) - except Exception as error: - logger.warning(f"could not read file size of {filename}: {error}") - else: - # is the file too big? 
- localsizelimit_stdout = get_local_size_limit_stdout() - if fsize > localsizelimit_stdout: - exit_code = errors.STDOUTTOOBIG - diagnostics = f"log file {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B) [will be zipped]" - logger.warning(diagnostics) - to_be_zipped.append(filename) - - # remove any lingering input files from the work dir - lfns, guids = job.get_lfns_and_guids() - if lfns: - # remove any lingering input files from the work dir - remove_files(job.workdir, lfns) - else: - logger.info(f"payload log ({os.path.basename(filename)}) within allowed size limit ({localsizelimit_stdout} B): {fsize} B") + _exit_code, to_be_zipped = check_log_size(filename, to_be_zipped=to_be_zipped) + if _exit_code: # do not break loop so that other logs can get zipped if necessary + exit_code = _exit_code else: logger.info(f"skipping file size check of payload stdout file ({filename}) since it has not been created yet") + if exit_code: + # remove any lingering input files from the work dir + lfns, guids = job.get_lfns_and_guids() + if lfns: + # remove any lingering input files from the work dir + remove_files(lfns, workdir=job.workdir) + if to_be_zipped: logger.warning(f'the following files will be zipped: {to_be_zipped}') archivename = os.path.join(job.workdir, 'oversized_files.zip') status = zip_files(archivename, to_be_zipped) if status: logger.info(f'created archive {archivename}') + # verify that the new file size is not too big (ignore exit code, should already be set above) + _exit_code, to_be_zipped = check_log_size(archivename, to_be_zipped=None) + if _exit_code: + logger.warning('also the archive was too large - will be removed') + remove_files([archivename]) + + # remove logs + remove_files(to_be_zipped) # kill the job set_pilot_state(job=job, state="failed") @@ -560,6 +556,44 @@ def check_payload_stdout(job): return exit_code, diagnostics +def check_log_size(filename, to_be_zipped=None, archive=False): + """ + Check the payload log file 
size. + The log will be added to the list of files to be zipped, if too large. + + :param filename: file path (string) + :param to_be_zipped: list of files to be zipped + :param archive: is this file an archive? (boolean) + :return: exit code (int), to_be_zipped (list) + """ + + exit_code = 0 + + try: + # get file size in bytes + fsize = os.path.getsize(filename) + except Exception as error: + logger.warning(f"could not read file size of {filename}: {error}") + else: + # is the file too big? + localsizelimit_stdout = get_local_size_limit_stdout() + + + lim = 10 if not archive else localsizelimit_stdout + if fsize > lim: #localsizelimit_stdout: + exit_code = errors.STDOUTTOOBIG + label = 'archive' if archive else 'log file' + diagnostics = f"{label} {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B) [will be zipped]" + logger.warning(diagnostics) + if not to_be_zipped == None: + to_be_zipped.append(filename) + else: + logger.info( + f"payload log ({os.path.basename(filename)}) within allowed size limit ({localsizelimit_stdout} B): {fsize} B") + + return exit_code, to_be_zipped + + def check_local_space(initial=True): """ Do we have enough local disk space left to run the job? 
@@ -637,7 +671,7 @@ def check_work_dir(job): # remove any lingering input files from the work dir lfns, guids = job.get_lfns_and_guids() if lfns: - remove_files(job.workdir, lfns) + remove_files(lfns, workdir=job.workdir) # remeasure the size of the workdir at this point since the value is stored below workdirsize = get_disk_usage(job.workdir) From 52c25400dbc95c3dc881bad2ebc1eea327838498 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 10:15:04 +0200 Subject: [PATCH 026/154] Updates for maxwalltime --- PILOTVERSION | 2 +- pilot/control/monitor.py | 28 +++++++++++++++++++--------- pilot/info/jobdata.py | 5 +++-- pilot/util/queuehandling.py | 28 ++++++++++++++++++++++++++-- 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 21660dcca..2d7ccb7d4 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.19 \ No newline at end of file +3.5.2.21 \ No newline at end of file diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 00de98a1b..ef0f39b4c 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -24,7 +24,7 @@ from pilot.util.constants import MAX_KILL_WAIT_TIME # from pilot.util.container import execute from pilot.util.features import MachineFeatures -from pilot.util.queuehandling import get_queuedata_from_job, abort_jobs_in_queues +from pilot.util.queuehandling import get_queuedata_from_job, get_maxwalltime_from_job, abort_jobs_in_queues from pilot.util.timing import get_time_since_start logger = logging.getLogger(__name__) @@ -54,8 +54,7 @@ def control(queues, traces, args): # noqa: C901 last_minute_check = t_0 queuedata = get_queuedata_from_job(queues) - max_running_time = get_max_running_time(args.lifetime, queuedata) - + push = args.harvester and args.harvester_submitmode.lower() == 'push' try: # overall loop counter (ignoring the fact that more than one job may be running) niter = 0 @@ -77,6 +76,8 @@ def control(queues, traces, args): # noqa: C901 
grace_time = 10 * 60 if time_since_start - grace_time < 0: grace_time = 0 + # get the current max_running_time (can change with job) + max_running_time = get_max_running_time(args.lifetime, queuedata, queues, push) if time_since_start > max_running_time - grace_time: logger.fatal(f'max running time ({max_running_time}s) minus grace time ({grace_time}s) has been ' f'exceeded - time to abort pilot') @@ -318,22 +319,31 @@ def run_checks(queues, args): raise ExceededMaxWaitTime(diagnostics) -def get_max_running_time(lifetime, queuedata): +def get_max_running_time(lifetime, queuedata, queues, push): """ Return the maximum allowed running time for the pilot. The max time is set either as a pilot option or via the schedconfig.maxtime for the PQ in question. :param lifetime: optional pilot option time in seconds (int). :param queuedata: queuedata object - :return: max running time in seconds (int). + :param queues: + :param push: push mode (boolean) + :return: max running time in seconds (int) """ + # for push queues: try to get the walltime from the job object first, in case it exists and is set + if push: + max_running_time = get_maxwalltime_from_job() + if max_running_time: + logger.debug(f'using max running time from job: {max_running_time}s') + return max_running_time + max_running_time = lifetime # use the schedconfig value if set, otherwise use the pilot option lifetime value if not queuedata: logger.warning(f'queuedata could not be extracted from queues, will use default for max running time ' - f'({max_running_time} s)') + f'({max_running_time}s)') else: if queuedata.maxtime: try: @@ -341,12 +351,12 @@ def get_max_running_time(lifetime, queuedata): except Exception as error: logger.warning(f'exception caught: {error}') logger.warning(f'failed to convert maxtime from queuedata, will use default value for max running time ' - f'({max_running_time} s)') + f'({max_running_time}s)') else: if max_running_time == 0: max_running_time = lifetime # fallback to default value 
- logger.info(f'will use default value for max running time: {max_running_time} s') + logger.info(f'will use default value for max running time: {max_running_time}s') else: - logger.info(f'will use queuedata.maxtime value for max running time: {max_running_time} s') + logger.info(f'will use queuedata.maxtime value for max running time: {max_running_time}s') return max_running_time diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index c826ac1bd..ddd1d4070 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -145,6 +145,7 @@ class JobData(BaseData): # coprocess = {u'args': u'coprocess', u'command': u'echo'} containeroptions = {} # use_vp = False # True for VP jobs + maxwalltime = 0 # maxWalltime in s # home package string with additional payload release information; does not need to be added to # the conversion function since it's already lower case @@ -163,7 +164,7 @@ class JobData(BaseData): # specify the type of attributes for proper data validation and casting _keys = {int: ['corecount', 'piloterrorcode', 'transexitcode', 'exitcode', 'cpuconversionfactor', 'exeerrorcode', 'attemptnr', 'nevents', 'neventsw', 'pid', 'cpuconsumptiontime', 'maxcpucount', 'actualcorecount', - 'requestid'], + 'requestid', 'maxwalltime'], str: ['jobid', 'taskid', 'jobparams', 'transformation', 'destinationdblock', 'exeerrordiag' 'state', 'serverstate', 'workdir', 'stageout', 'platform', 'piloterrordiag', 'exitmsg', 'produserid', 'jobdefinitionid', 'writetofile', @@ -309,7 +310,7 @@ def get_kmap(): 'filesize': 'fsize', 'checksum': 'checksum', 'scope': 'scopeIn', ##'??define_internal_key': 'prodDBlocks', 'storage_token': 'prodDBlockToken', - 'ddmendpoint': 'ddmEndPointIn', + 'ddmendpoint': 'ddmEndPointIn', 'maxwalltime': 'maxWalltime' } return kmap diff --git a/pilot/util/queuehandling.py b/pilot/util/queuehandling.py index 02febd58d..ccf339d58 100644 --- a/pilot/util/queuehandling.py +++ b/pilot/util/queuehandling.py @@ -6,7 +6,7 @@ # # Authors: # - Paul Nilsson, 
paul.nilsson@cern.ch, 2018-2021 - +import os import time from pilot.common.errorcodes import ErrorCodes @@ -66,6 +66,31 @@ def scan_for_jobs(queues): return jobs +def get_maxwalltime_from_job(queues): + """ + Return the maxwalltime from the job object. + The algorithm requires a set PANDAID environmental variable, in order to find the correct walltime. + + :param queues: + :return: job object variable + """ + + maxwalltime = None + current_job_id = os.environ.get('PANDAID', None) + if not current_job_id: + return None + + # extract jobs from the queues + jobs = scan_for_jobs(queues) + if jobs: + for job in jobs: + if current_job_id == job.jobid: + maxwalltime = job.maxwalltime if job.maxwalltime else None + break + + return maxwalltime + + def get_queuedata_from_job(queues): """ Return the queuedata object from a job in the given queues object. @@ -88,7 +113,6 @@ def get_queuedata_from_job(queues): return queuedata - def abort_jobs_in_queues(queues, sig): """ Find all jobs in the queues and abort them. 
From 544da706020bc913cf1fb2c19b627748d0768273 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 10:15:27 +0200 Subject: [PATCH 027/154] Update --- pilot/util/queuehandling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/util/queuehandling.py b/pilot/util/queuehandling.py index ccf339d58..27b4f3fcc 100644 --- a/pilot/util/queuehandling.py +++ b/pilot/util/queuehandling.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 +# - Paul Nilsson, paul.nilsson@cern.ch, 2018-2023 import os import time From 53371b5e0374e944ca0fa1502124edc0baeb15ca Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 10:28:48 +0200 Subject: [PATCH 028/154] Flake8 --- pilot/util/monitoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index e188d24f6..a8b6347c0 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -585,7 +585,7 @@ def check_log_size(filename, to_be_zipped=None, archive=False): label = 'archive' if archive else 'log file' diagnostics = f"{label} {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B) [will be zipped]" logger.warning(diagnostics) - if not to_be_zipped == None: + if not to_be_zipped is None: to_be_zipped.append(filename) else: logger.info( From c69ee625095a573f4b1c310cd9224459e4ed310a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 14:30:06 +0200 Subject: [PATCH 029/154] Checking zipped archive file size --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/filehandling.py | 5 +++-- pilot/util/monitoring.py | 9 +++------ 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 21660dcca..2ab2e4c04 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.19 \ No newline at end of file +3.5.2.24 \ No newline at end of file diff --git 
a/pilot/util/constants.py b/pilot/util/constants.py index ae7a07b65..bd3419ddc 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '20' # build number should be reset to '1' for every new development cycle +BUILD = '24' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/filehandling.py b/pilot/util/filehandling.py index 52bc74662..34fa2ead0 100644 --- a/pilot/util/filehandling.py +++ b/pilot/util/filehandling.py @@ -479,11 +479,12 @@ def remove_files(files, workdir=None): exitcode = 0 if not isinstance(files, list): - logger.warning(f'files parameter not a list: {type(list)}') + logger.warning(f'files parameter not a list: {type(files)}') exitcode = -1 else: for _file in files: - _ec = remove(os.path.join(workdir, _file)) + path = os.path.join(workdir, _file) if workdir else _file + _ec = remove(path) if _ec != 0 and exitcode == 0: exitcode = _ec diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index e188d24f6..b0697eff0 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -540,7 +540,7 @@ def check_payload_stdout(job): if status: logger.info(f'created archive {archivename}') # verify that the new file size is not too big (ignore exit code, should already be set above) - _exit_code, to_be_zipped = check_log_size(archivename, to_be_zipped=None) + _exit_code, _ = check_log_size(archivename, to_be_zipped=None, archive=True) if _exit_code: logger.warning('also the archive was too large - will be removed') remove_files([archivename]) @@ -577,15 +577,12 @@ def check_log_size(filename, to_be_zipped=None, archive=False): else: # is the file too big? 
localsizelimit_stdout = get_local_size_limit_stdout() - - - lim = 10 if not archive else localsizelimit_stdout - if fsize > lim: #localsizelimit_stdout: + if fsize > localsizelimit_stdout: exit_code = errors.STDOUTTOOBIG label = 'archive' if archive else 'log file' diagnostics = f"{label} {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B) [will be zipped]" logger.warning(diagnostics) - if not to_be_zipped == None: + if not to_be_zipped is None: to_be_zipped.append(filename) else: logger.info( From c12088bdcd8093afdcc74d4bda368d1d685d2af8 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 14:45:34 +0200 Subject: [PATCH 030/154] Exception handling --- pilot/control/monitor.py | 14 +++++++++----- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 12 ++++++++---- pilot/util/queuehandling.py | 1 + 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index ef0f39b4c..7de15e20f 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -6,7 +6,7 @@ # # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2022 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 # NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is # a task for the job_monitor thread in the Job component. 
@@ -333,10 +333,14 @@ def get_max_running_time(lifetime, queuedata, queues, push): # for push queues: try to get the walltime from the job object first, in case it exists and is set if push: - max_running_time = get_maxwalltime_from_job() - if max_running_time: - logger.debug(f'using max running time from job: {max_running_time}s') - return max_running_time + try: + max_running_time = get_maxwalltime_from_job() + except Exception as exc: + logger.warning(f'caught exception: {exc}') + else: + if max_running_time: + logger.debug(f'using max running time from job: {max_running_time}s') + return max_running_time max_running_time = lifetime diff --git a/pilot/util/constants.py b/pilot/util/constants.py index bd3419ddc..658d4e260 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '24' # build number should be reset to '1' for every new development cycle +BUILD = '25' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index b0697eff0..6714487a6 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -351,9 +351,13 @@ def verify_disk_usage(current_time, mt, job): # time to check the disk space # check the size of the payload stdout - exit_code, diagnostics = check_payload_stdout(job) - if exit_code != 0: - return exit_code, diagnostics + try: + exit_code, diagnostics = check_payload_stdout(job) + except Exception as exc: + logger.warning(f'caught exception: {exc}') + else: + if exit_code != 0: + return exit_code, diagnostics # check the local space, if it's enough left to keep running the job exit_code, diagnostics = 
check_local_space(initial=False) @@ -582,7 +586,7 @@ def check_log_size(filename, to_be_zipped=None, archive=False): label = 'archive' if archive else 'log file' diagnostics = f"{label} {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B) [will be zipped]" logger.warning(diagnostics) - if not to_be_zipped is None: + if to_be_zipped is not None: to_be_zipped.append(filename) else: logger.info( diff --git a/pilot/util/queuehandling.py b/pilot/util/queuehandling.py index 27b4f3fcc..cdade7183 100644 --- a/pilot/util/queuehandling.py +++ b/pilot/util/queuehandling.py @@ -113,6 +113,7 @@ def get_queuedata_from_job(queues): return queuedata + def abort_jobs_in_queues(queues, sig): """ Find all jobs in the queues and abort them. From b9123c02bfaf2d259e588bacd56dd1116cb69dfe Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 14:50:03 +0200 Subject: [PATCH 031/154] removed useless log messages --- pilot/util/features.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pilot/util/features.py b/pilot/util/features.py index a3fd9f9c7..9ef4faf37 100644 --- a/pilot/util/features.py +++ b/pilot/util/features.py @@ -91,8 +91,7 @@ def __init__(self): self.shutdowntime = "" self.total_cpu = "" self.grace_secs = "" - - logger.info('collecting machine features') + # logger.info('collecting machine features') self.set(os.environ.get('MACHINEFEATURES', ''), 'machine') @@ -117,6 +116,5 @@ def __init__(self): self.max_rss_bytes = "" self.max_swap_bytes = "" self.scratch_limit_bytes = "" - - logger.info('collecting job features') + # logger.info('collecting job features') self.set(os.environ.get('JOBFEATURES', ''), 'job') From 8cbade623b235c5b287a95dd68e5de2b98a94d4b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 14:52:53 +0200 Subject: [PATCH 032/154] corrected useless info message --- PILOTVERSION | 2 +- pilot/util/features.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/PILOTVERSION b/PILOTVERSION index 2ab2e4c04..8d796621b 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.24 \ No newline at end of file +3.5.2.25 \ No newline at end of file diff --git a/pilot/util/features.py b/pilot/util/features.py index 9ef4faf37..00d0b7bd2 100644 --- a/pilot/util/features.py +++ b/pilot/util/features.py @@ -74,7 +74,8 @@ def set(self, path, label): value = value[:-1] if value.endswith('\n') else value setattr(self, member, value) else: - logger.info(f'{label} features path does not exist (path=\"{path}\")') + if path: + logger.warning(f'{label} features path does not exist (path=\"{path}\")') class MachineFeatures(Features): From 5547543e20892dc8f1ae4b40dbc599b2fd0a1cd4 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 15:33:58 +0200 Subject: [PATCH 033/154] Added protection against non-int maxwalltime values --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 5 ++++- pilot/util/queuehandling.py | 3 +++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 8d796621b..c0898c15e 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.25 \ No newline at end of file +3.5.2.25b \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 658d4e260..b56da1723 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '25' # build number should be reset to '1' for every new development cycle +BUILD = '25b' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 6714487a6..9114bc438 
100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -581,7 +581,10 @@ def check_log_size(filename, to_be_zipped=None, archive=False): else: # is the file too big? localsizelimit_stdout = get_local_size_limit_stdout() - if fsize > localsizelimit_stdout: + + + + if fsize > 10: #localsizelimit_stdout: exit_code = errors.STDOUTTOOBIG label = 'archive' if archive else 'log file' diagnostics = f"{label} {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B) [will be zipped]" diff --git a/pilot/util/queuehandling.py b/pilot/util/queuehandling.py index cdade7183..5f5a839b0 100644 --- a/pilot/util/queuehandling.py +++ b/pilot/util/queuehandling.py @@ -86,6 +86,9 @@ def get_maxwalltime_from_job(queues): for job in jobs: if current_job_id == job.jobid: maxwalltime = job.maxwalltime if job.maxwalltime else None + # make sure maxwalltime is an int (might be 'NULL') + if not isinstance(maxwalltime, int): + maxwalltime = None break return maxwalltime From e922cc9987ec2b6804d9c0889fcf5f8eb29bf907 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 31 Mar 2023 16:46:13 +0200 Subject: [PATCH 034/154] fixed problem with maxwalltime and requestid, removed test code --- PILOTVERSION | 2 +- pilot/control/monitor.py | 2 +- pilot/info/jobdata.py | 14 +++++++++----- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 5 +---- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c0898c15e..0fb9dc0a9 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.25b \ No newline at end of file +3.5.2.29 \ No newline at end of file diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 7de15e20f..0f548161d 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -334,7 +334,7 @@ def get_max_running_time(lifetime, queuedata, queues, push): # for push queues: try to get the walltime from the job object first, in case it exists and is set if push: try: - 
max_running_time = get_maxwalltime_from_job() + max_running_time = get_maxwalltime_from_job(queues) except Exception as exc: logger.warning(f'caught exception: {exc}') else: diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index ddd1d4070..980844e2b 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -171,8 +171,10 @@ class JobData(BaseData): 'cpuconsumptionunit', 'homepackage', 'jobsetid', 'payload', 'processingtype', 'swrelease', 'zipmap', 'imagename', 'imagename_jobdef', 'accessmode', 'transfertype', 'datasetin', ## TO BE DEPRECATED: moved to FileSpec (job.indata) - 'infilesguids', 'memorymonitor', 'allownooutput', 'pandasecrets', 'prodproxy'], - list: ['piloterrorcodes', 'piloterrordiags', 'workdirsizes', 'zombies', 'corecounts', 'subprocesses'], + 'infilesguids', 'memorymonitor', 'allownooutput', 'pandasecrets', 'prodproxy', 'alrbuserplatform', + 'debug_command'], + list: ['piloterrorcodes', 'piloterrordiags', 'workdirsizes', 'zombies', 'corecounts', 'subprocesses', + 'logdata', 'outdata', 'indata'], dict: ['status', 'fileinfo', 'metadata', 'utilities', 'overwrite_queuedata', 'sizes', 'preprocess', 'postprocess', 'coprocess', 'containeroptions', 'pilotsecrets'], bool: ['is_eventservice', 'is_eventservicemerge', 'is_hpo', 'noexecstrcnv', 'debug', 'usecontainer', @@ -306,11 +308,11 @@ def get_kmap(): # 'internal_name': 'ext_key_structure' 'lfn': 'inFiles', ##'??': 'dispatchDblock', '??define_proper_internal_name': 'dispatchDBlockToken', - 'dataset': 'realDatasetsIn', 'guid': 'GUID', 'requestid': 'reqID', + 'dataset': 'realDatasetsIn', 'guid': 'GUID', 'filesize': 'fsize', 'checksum': 'checksum', 'scope': 'scopeIn', ##'??define_internal_key': 'prodDBlocks', 'storage_token': 'prodDBlockToken', - 'ddmendpoint': 'ddmEndPointIn', 'maxwalltime': 'maxWalltime' + 'ddmendpoint': 'ddmEndPointIn' } return kmap @@ -482,7 +484,9 @@ def load(self, data, use_kmap=True): 'containeroptions': 'containerOptions', 'looping_check': 'loopingCheck', 
'pandasecrets': 'secrets', - 'pilotsecrets': 'pilotSecrets' + 'pilotsecrets': 'pilotSecrets', + 'requestid': 'reqID', + 'maxwalltime': 'maxWalltime' } if use_kmap else {} self._load_data(data, kmap) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b56da1723..f222fc952 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '25b' # build number should be reset to '1' for every new development cycle +BUILD = '29' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 9114bc438..6714487a6 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -581,10 +581,7 @@ def check_log_size(filename, to_be_zipped=None, archive=False): else: # is the file too big? localsizelimit_stdout = get_local_size_limit_stdout() - - - - if fsize > 10: #localsizelimit_stdout: + if fsize > localsizelimit_stdout: exit_code = errors.STDOUTTOOBIG label = 'archive' if archive else 'log file' diagnostics = f"{label} {filename} is too big: {fsize} B (larger than limit {localsizelimit_stdout} B) [will be zipped]" From 28c5ffb3cb1ea614a85ca5c212a63624959ad624 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 3 Apr 2023 13:50:29 +0200 Subject: [PATCH 035/154] Using job_maxwalltime. 
Test code --- PILOTVERSION | 2 +- pilot/control/monitor.py | 57 +++++++++++++++++++++---------------- pilot/util/constants.py | 2 +- pilot/util/queuehandling.py | 12 ++++++-- 4 files changed, 44 insertions(+), 29 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 0fb9dc0a9..4772eb8b3 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.29 \ No newline at end of file +3.5.2.35 \ No newline at end of file diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 0f548161d..1b43d3987 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -73,11 +73,17 @@ def control(queues, traces, args): # noqa: C901 # check if the pilot has run out of time (stop ten minutes before PQ limit) time_since_start = get_time_since_start(args) - grace_time = 10 * 60 + grace_time = 3 * 60 if time_since_start - grace_time < 0: grace_time = 0 # get the current max_running_time (can change with job) - max_running_time = get_max_running_time(args.lifetime, queuedata, queues, push) + try: + max_running_time = get_max_running_time(args.lifetime, queuedata, queues, push) + except Exception as exc: + logger.warning(f'caught exception: {exc}') + max_running_time = args.lifetime + + max_running_time = 4 * 60 if time_since_start > max_running_time - grace_time: logger.fatal(f'max running time ({max_running_time}s) minus grace time ({grace_time}s) has been ' f'exceeded - time to abort pilot') @@ -331,36 +337,37 @@ def get_max_running_time(lifetime, queuedata, queues, push): :return: max running time in seconds (int) """ + max_running_time = lifetime + + if not queuedata: + logger.warning(f'queuedata could not be extracted from queues, will use default for max running time ' + f'({max_running_time}s)') + return max_running_time + # for push queues: try to get the walltime from the job object first, in case it exists and is set - if push: + if push or True: try: - max_running_time = get_maxwalltime_from_job(queues) + _max_running_time = 
get_maxwalltime_from_job(queues, queuedata.get('params', None)) except Exception as exc: logger.warning(f'caught exception: {exc}') else: - if max_running_time: - logger.debug(f'using max running time from job: {max_running_time}s') - return max_running_time - - max_running_time = lifetime + if _max_running_time: + logger.debug(f'using max running time from job: {_max_running_time}s') + return _max_running_time # use the schedconfig value if set, otherwise use the pilot option lifetime value - if not queuedata: - logger.warning(f'queuedata could not be extracted from queues, will use default for max running time ' - f'({max_running_time}s)') - else: - if queuedata.maxtime: - try: - max_running_time = int(queuedata.maxtime) - except Exception as error: - logger.warning(f'exception caught: {error}') - logger.warning(f'failed to convert maxtime from queuedata, will use default value for max running time ' - f'({max_running_time}s)') + if queuedata.maxtime: + try: + max_running_time = int(queuedata.maxtime) + except Exception as error: + logger.warning(f'exception caught: {error}') + logger.warning(f'failed to convert maxtime from queuedata, will use default value for max running time ' + f'({max_running_time}s)') + else: + if max_running_time == 0: + max_running_time = lifetime # fallback to default value + logger.info(f'will use default value for max running time: {max_running_time}s') else: - if max_running_time == 0: - max_running_time = lifetime # fallback to default value - logger.info(f'will use default value for max running time: {max_running_time}s') - else: - logger.info(f'will use queuedata.maxtime value for max running time: {max_running_time}s') + logger.info(f'will use queuedata.maxtime value for max running time: {max_running_time}s') return max_running_time diff --git a/pilot/util/constants.py b/pilot/util/constants.py index f222fc952..31537c7c3 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released 
number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '29' # build number should be reset to '1' for every new development cycle +BUILD = '35' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/queuehandling.py b/pilot/util/queuehandling.py index 5f5a839b0..ea655ab6c 100644 --- a/pilot/util/queuehandling.py +++ b/pilot/util/queuehandling.py @@ -66,26 +66,34 @@ def scan_for_jobs(queues): return jobs -def get_maxwalltime_from_job(queues): +def get_maxwalltime_from_job(queues, params): """ Return the maxwalltime from the job object. The algorithm requires a set PANDAID environmental variable, in order to find the correct walltime. :param queues: + :param params: queuedata.params (dictionary) :return: job object variable """ maxwalltime = None + use_job_maxwalltime = False current_job_id = os.environ.get('PANDAID', None) if not current_job_id: return None + # on push queues, one can set params.use_job_maxwalltime to decide if job.maxwalltime should be used to check + # job running time + if params: + use_job_maxwalltime = params.get('job_maxwalltime', False) + logger.debug(f'use_job_maxwalltime={use_job_maxwalltime} (type={type(use_job_maxwalltime)}, current job id={current_job_id})') + # extract jobs from the queues jobs = scan_for_jobs(queues) if jobs: for job in jobs: if current_job_id == job.jobid: - maxwalltime = job.maxwalltime if job.maxwalltime else None + maxwalltime = job.maxwalltime if job.maxwalltime and use_job_maxwalltime else None # make sure maxwalltime is an int (might be 'NULL') if not isinstance(maxwalltime, int): maxwalltime = None From 6b88d9575230868d4ce97b68b194bc25fd7b816c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 3 Apr 2023 15:44:50 +0200 Subject: [PATCH 
036/154] Removed bad get --- PILOTVERSION | 2 +- pilot/control/monitor.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 4772eb8b3..eddcda9a1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.35 \ No newline at end of file +3.5.2.36 \ No newline at end of file diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 1b43d3987..a9b7f9986 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -347,7 +347,7 @@ def get_max_running_time(lifetime, queuedata, queues, push): # for push queues: try to get the walltime from the job object first, in case it exists and is set if push or True: try: - _max_running_time = get_maxwalltime_from_job(queues, queuedata.get('params', None)) + _max_running_time = get_maxwalltime_from_job(queues, queuedata.params) except Exception as exc: logger.warning(f'caught exception: {exc}') else: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 31537c7c3..af34083b1 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '35' # build number should be reset to '1' for every new development cycle +BUILD = '36' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From ca56fa4c87f1cb95ed33f650480ca4ca4188c7e9 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 3 Apr 2023 19:36:06 +0200 Subject: [PATCH 037/154] Removed args.abort_job.set() --- PILOTVERSION | 2 +- pilot/control/job.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index eddcda9a1..b161406c6 100644 --- 
a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.36 \ No newline at end of file +3.5.2.37 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index f853cb79e..535ace92d 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -516,7 +516,7 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): if job.pid: logger.debug('killing payload process') kill_process(job.pid) - args.abort_job.set() + #args.abort_job.set() elif 'softkill' in cmd: logger.info(f'pilot received a panda server signal to softkill job {job.jobid} at {time_stamp()}') # event service kill instruction diff --git a/pilot/util/constants.py b/pilot/util/constants.py index af34083b1..ffe2c518f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '36' # build number should be reset to '1' for every new development cycle +BUILD = '38' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 98904c9c00347d41f0e7145a2a1e4037a1d75cbb Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 4 Apr 2023 10:59:42 +0200 Subject: [PATCH 038/154] Added abort --- pilot/control/job.py | 1 + pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 535ace92d..5c37b25dc 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2803,6 +2803,7 @@ def job_monitor(queues, traces, args): # noqa: C901 logger.info('monitor loop #%d: job %d:%s is in state \'%s\'', n, i, current_id, jobs[i].state) if jobs[i].state == 'finished' or jobs[i].state == 'failed': logger.info('will abort job monitoring soon since job state=%s (job 
is still in queue)', jobs[i].state) + abort = True break # perform the monitoring tasks diff --git a/pilot/util/constants.py b/pilot/util/constants.py index ffe2c518f..ebcb312bc 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '38' # build number should be reset to '1' for every new development cycle +BUILD = '39' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 4237c2cf1693326aff9da08748836df883506cdb Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 4 Apr 2023 11:20:57 +0200 Subject: [PATCH 039/154] Corrected lost heartbeat --- PILOTVERSION | 2 +- pilot/util/default.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index b161406c6..e28cf7893 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.37 \ No newline at end of file +3.5.2.39 \ No newline at end of file diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index ae9f3cfbd..cb4ad0e46 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -51,7 +51,7 @@ rtlogging:logstash;http://aipanda020.cern.ch:8443 # A lost heartbeat is 60*60*3 s, i.e. 
3h heartbeat: 1800 debug_heartbeat: 60 -lost_heartbeat = 10800 +lost_heartbeat: 10800 # Heartbeat message file (only used when Pilot is not sending heartbeats to server) heartbeat_message: heartbeat.json From fb861d6f241ed23dd5ae28f82fb94c7a355a3bae Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 5 Apr 2023 14:45:13 +0200 Subject: [PATCH 040/154] Created get_globaljobid() --- pilot/control/monitor.py | 4 ++-- pilot/util/auxiliary.py | 33 ++++++++++++++++++++++++++------- pilot/util/constants.py | 2 +- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index a9b7f9986..cae399769 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -83,7 +83,7 @@ def control(queues, traces, args): # noqa: C901 logger.warning(f'caught exception: {exc}') max_running_time = args.lifetime - max_running_time = 4 * 60 + # for testing: max_running_time = 4 * 60 if time_since_start > max_running_time - grace_time: logger.fatal(f'max running time ({max_running_time}s) minus grace time ({grace_time}s) has been ' f'exceeded - time to abort pilot') @@ -345,7 +345,7 @@ def get_max_running_time(lifetime, queuedata, queues, push): return max_running_time # for push queues: try to get the walltime from the job object first, in case it exists and is set - if push or True: + if push: try: _max_running_time = get_maxwalltime_from_job(queues, queuedata.params) except Exception as exc: diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 83c71e729..07d12264b 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -127,17 +127,36 @@ def get_batchsystem_jobid(): # Condor (get jobid from classad file) if '_CONDOR_JOB_AD' in os.environ: try: - with open(os.environ.get("_CONDOR_JOB_AD"), 'r') as _fp: - for line in _fp: - res = re.search(r'^GlobalJobId\s*=\s*"(.*)"', line) - if res is None: - continue - return "Condor", res.group(1) + ret = get_globaljobid() except OSError as exc: - 
logger.warning("failed to read HTCondor job classAd: %s", exc) + logger.warning(f"failed to read HTCondor job classAd: {exc}") + else: + return "Condor", ret return None, "" +def get_globaljobid(): + """ + Return the GlobalJobId value from the condor class ad. + + :return: GlobalJobId value (string). + """ + + ret = "" + with open(os.environ.get("_CONDOR_JOB_AD"), 'r') as _fp: + for line in _fp: + res = re.search(r'^GlobalJobId\s*=\s*"(.*)"', line) + if res is None: + continue + try: + ret = res.group(1) + except Exception as exc: + logger.warning(f'failed to interpret GlobalJobId: {exc}') + break + + return ret + + def get_job_scheduler_id(): """ Get the job scheduler id from the environment variable PANDA_JSID diff --git a/pilot/util/constants.py b/pilot/util/constants.py index ebcb312bc..3a2992d9f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '39' # build number should be reset to '1' for every new development cycle +BUILD = '40' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 43d256f55b87b6cdf67faa291ec6b939d5e75703 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 5 Apr 2023 19:19:29 +0200 Subject: [PATCH 041/154] Setting graceful stop after lost heartbeat --- pilot/control/monitor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index cae399769..bee85000c 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -276,6 +276,7 @@ def run_checks(queues, args): logger.warning(diagnostics) logger.warning('aborting pilot - no need to wait for job to finish - kill everything') args.job_aborted.set() + 
args.graceful_stop.set() args.abort_job.clear() raise ExceededMaxWaitTime(diagnostics) #else: From 7804f6fbaa7222c8fa285815dbd6aeca235a01a2 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 10:11:34 +0200 Subject: [PATCH 042/154] Removed cpu_architect_level pending lsetup update --- PILOTVERSION | 2 +- pilot/control/job.py | 6 +++--- pilot/util/constants.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e28cf7893..76f433215 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.39 \ No newline at end of file +3.5.2.41 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 5c37b25dc..60a9fea08 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -654,9 +654,9 @@ def get_data_structure(job, state, args, xml=None, metadata=None, final=False): if product and vendor: logger.debug(f'cpuConsumptionUnit: could have added: product={product}, vendor={vendor}') - cpu_arch = get_cpu_arch() - if cpu_arch: - data['cpu_architecture_level'] = cpu_arch + #cpu_arch = get_cpu_arch() + #if cpu_arch: + # data['cpu_architecture_level'] = cpu_arch # add memory information if available add_memory_info(data, job.workdir, name=job.memorymonitor) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3a2992d9f..3a54fd1e5 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '40' # build number should be reset to '1' for every new development cycle +BUILD = '41' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From f154e920b071b6dc25b18d2702d60f5bb162f5eb Mon Sep 17 00:00:00 2001 From: Paul 
Nilsson Date: Fri, 7 Apr 2023 12:19:28 +0200 Subject: [PATCH 043/154] Improved exception handling after command execution timeout --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/container.py | 26 ++++++++++++++++++-------- pilot/util/processes.py | 2 +- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 76f433215..6f7534f0f 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.41 \ No newline at end of file +3.5.2.42 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3a54fd1e5..060d3de17 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '41' # build number should be reset to '1' for every new development cycle +BUILD = '42' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/container.py b/pilot/util/container.py index 39f4c8d55..c578c2c4e 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -9,9 +9,11 @@ import subprocess import logging -from os import environ, getcwd, setpgrp, getpgid, killpg, kill #, getpgid #setsid -from signal import SIGTERM +from os import environ, getcwd, setpgrp, getpgid, kill #, getpgid #setsid +from time import sleep +from signal import SIGTERM, SIGKILL from pilot.common.errorcodes import ErrorCodes +from pilot.util.processes import kill_process_group logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -72,12 +74,20 @@ def execute(executable, **kwargs): logger.warning(stderr) exit_code = errors.COMMANDTIMEDOUT process.kill() - #logger.debug('XXX executing process.communicate()') - #stdout, stderr = 
process.communicate() - #stderr += '\n' + _stderr - killpg(getpgid(process.pid), SIGTERM) - kill(process.pid, SIGTERM) - logger.debug('Sent soft kill signals') + try: + logger.warning('killing lingering process group') + kill_process_group(getpgid(process.pid)) + except ProcessLookupError as exc: + stderr += f'\n(kill process group) ProcessLookupError={exc}' + try: + logger.warning('killing lingering process') + kill(process.pid, SIGTERM) + logger.warning('sleeping a bit before sending SIGKILL') + sleep(10) + kill(process.pid, SIGKILL) + except ProcessLookupError as exc: + stderr += f'\n(kill process) ProcessLookupError={exc}' + logger.warning(f'sent soft kill signals - final stderr: {stderr}') else: exit_code = process.poll() diff --git a/pilot/util/processes.py b/pilot/util/processes.py index cb885cb0a..ce96b4e35 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -224,7 +224,7 @@ def kill_process_group(pgrp): Kill the process group. :param pgrp: process group id (int). 
- :return: boolean (True if SIGMTERM followed by SIGKILL signalling was successful) + :return: boolean (True if SIGTERM followed by SIGKILL signalling was successful) """ status = False From 349bcf473fd7cc14df86da8a17720f753d2500b5 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 12:39:05 +0200 Subject: [PATCH 044/154] Added encode_globaljobid() --- pilot/util/auxiliary.py | 78 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 07d12264b..bb9fc62cf 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2022 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 import os import re @@ -676,3 +676,79 @@ def sort_words(input_str): logger.warning(f'failed to sort input string: {input_str}, exc={exc}') return output_str + + +def encode_globaljobid(jobid, processingtype, maxsize=31): + """ + Encode the global job id on HTCondor. + To be used as an environmental variable on HTCondor nodes to facilitate debugging. + + Format: ::._ + + Note: due to batch system restrictions, this string is limited to 31 (maxsize) characters, using the least significant + characters (i.e. the left part of the string might get cut). Also, the cluster ID and process IDs are converted to hex + to limit the sizes. The schedd host name is further encoded using the last digit in the host name (spce03.sdcc.bnl.gov -> spce03 -> 3). + + :param jobid: panda job id (string) + :param processingtype: panda processing type (string) + :return: encoded global job id (string). + """ + ret = "" + + def reformat(num, maxsize=8): + # can handle clusterid=4294967297, ie larger than 0xffffffff + try: + num_hex = hex(int(num)).replace('0x', '') + if len(num_hex) > maxsize: # i.e. 
larger than 'ffffffff' or 'ff' + num_hex = num_hex[-maxsize:] # take the least significant bits + num_hex = '0x' + num_hex + num_int = int(num_hex, base=16) + size = "{0:0" + str(maxsize) + "x}" # e.g. "{0:08x}" + num_hex = size.format(num_int) + except Exception as exc: + logger.warning(exc) + num_hex = "" + return num_hex + + def get_schedd_id(host): + # spce03.sdcc.bnl.gov -> spce03 -> 3 + try: + schedd_id = host.split('.')[0][-1] + except Exception as exc: + logger.warning(f'failed to extract schedd from host={host}: {exc}') + schedd_id = None + return schedd_id + + globaljobid = get_globaljobid() + if not globaljobid: + return "" + + try: + _globaljobid = globaljobid.split('#') + host = _globaljobid[0] + tmp = _globaljobid[1].split('.') + # timestamp = _globaljobid[2] - ignore this one + clusterid = tmp[0] + processid = tmp[1] + except Exception as exc: + logger.warning(exc) + return "" + + logger.debug(f'clusterid={clusterid}') + logger.debug(f'host name={host}') + clusterid_hex = reformat(clusterid, maxsize=8) # 00283984 + processid_hex = reformat(processid, maxsize=2) # 00 + schedd_id = get_schedd_id(host) # 3 + if clusterid_hex and processid_hex and schedd_id: + global_name = f'{jobid}:{processingtype}:{clusterid_hex}.{processid_hex}_{schedd_id}' + else: + global_name = '' + + if len(global_name) > maxsize: + logger.warning(f'HTCondor: global name is exceeding maxsize({maxsize}), will be truncated: {global_name}') + global_name = global_name[-maxsize:] + logger.debug(f'HTCondor: final global name={global_name}') + else: + logger.debug(f'HTCondor: global name is within limits: {global_name} (length={len(global_name)}, max size={maxsize})') + + return global_name From 00d08eabc20f32ecd4c7eef67fee1ac9ba2f6ddb Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 12:50:05 +0200 Subject: [PATCH 045/154] Created htcondor_envvar() --- pilot/control/job.py | 25 +++++++++++++++++++++++-- pilot/util/constants.py | 2 +- 2 files changed, 24 insertions(+), 
3 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 60a9fea08..57393d243 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -7,7 +7,7 @@ # Authors: # - Mario Lassnig, mario.lassnig@cern.ch, 2016-2017 # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2022 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 # - Wen Guan, wen.guan@cern.ch, 2018 from __future__ import print_function # Python 2 @@ -30,7 +30,7 @@ from pilot.util.activemq import ActiveMQ from pilot.util.auxiliary import get_batchsystem_jobid, get_job_scheduler_id, \ set_pilot_state, get_pilot_state, check_for_final_server_update, pilot_version_banner, is_virtual_machine, \ - has_instruction_sets, locate_core_file, get_display_info + has_instruction_sets, locate_core_file, get_display_info, encode_globaljobid from pilot.util.config import config from pilot.util.common import should_abort, was_pilot_killed from pilot.util.constants import PILOT_MULTIJOB_START_TIME, PILOT_PRE_GETJOB, PILOT_POST_GETJOB, PILOT_KILL_SIGNAL, LOG_TRANSFER_NOT_DONE, \ @@ -2005,6 +2005,9 @@ def retrieve(queues, traces, args): # noqa: C901 add_to_pilot_timing(job.jobid, PILOT_PRE_GETJOB, time_pre_getjob, args) add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args) + # for debugging on HTCondor purposes, set special env var + htcondor_envvar(job.jobid, job.processingtype) + # add the job definition to the jobs queue and increase the job counter, # and wait until the job has finished put_in_queue(job, queues.jobs) @@ -2041,6 +2044,24 @@ def retrieve(queues, traces, args): # noqa: C901 logger.info('[job] retrieve thread has finished') +def htcondor_envvar(jobid, processingtype): + """ + On HTCondor nodes, set special env var (HTCondor_JOB_ID) for debugging Lustre. 
+ + :param jobid: PanDA job id (string) + :param processingtype: PanDA processing type (string) + :return: + """ + + # only proceed if there is a condor class ad + if os.environ.get('_CONDOR_JOB_AD', None): + globaljobid = encode_globaljobid(jobid, processingtype) + if globaljobid: + os.environ['HTCondor_JOB_ID'] = globaljobid + else: + logger.debug('not a condor batch system - will not set HTCondor_JOB_ID') # REMOVE ME + + def handle_proxy(job): """ Handle the proxy in unified dispatch. diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 060d3de17..d506f094c 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '42' # build number should be reset to '1' for every new development cycle +BUILD = '43' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From c980f0d3c4e15b526325557fb9282516f3b943f2 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 12:56:44 +0200 Subject: [PATCH 046/154] Flake8 --- PILOTVERSION | 2 +- pilot/util/auxiliary.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6f7534f0f..046f82092 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.42 \ No newline at end of file +3.5.2.43 \ No newline at end of file diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index bb9fc62cf..82a1283cf 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -693,7 +693,6 @@ def encode_globaljobid(jobid, processingtype, maxsize=31): :param processingtype: panda processing type (string) :return: encoded global job id (string). 
""" - ret = "" def reformat(num, maxsize=8): # can handle clusterid=4294967297, ie larger than 0xffffffff From ef2877b474bf022148af11079568af6b9c97bdf7 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 14:01:54 +0200 Subject: [PATCH 047/154] Command not found -> command not found. Refactoring and proctetions --- PILOTVERSION | 2 +- pilot/control/job.py | 9 ++++++--- pilot/util/auxiliary.py | 41 +++++++++++++++++++++++++++++++++++++++-- pilot/util/constants.py | 2 +- pilot/util/container.py | 2 +- pilot/util/processes.py | 39 +-------------------------------------- 6 files changed, 49 insertions(+), 46 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 046f82092..52ffb50a4 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.43 \ No newline at end of file +3.5.2.44 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 57393d243..e98c863d6 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2055,9 +2055,12 @@ def htcondor_envvar(jobid, processingtype): # only proceed if there is a condor class ad if os.environ.get('_CONDOR_JOB_AD', None): - globaljobid = encode_globaljobid(jobid, processingtype) - if globaljobid: - os.environ['HTCondor_JOB_ID'] = globaljobid + try: + globaljobid = encode_globaljobid(jobid, processingtype) + if globaljobid: + os.environ['HTCondor_JOB_ID'] = globaljobid + except Exception as exc: + logger.warning(f'caught exception: {exc}') else: logger.debug('not a condor batch system - will not set HTCondor_JOB_ID') # REMOVE ME diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 82a1283cf..f3453fe4e 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -91,7 +91,7 @@ def display_architecture_info(): logger.info("architecture information:") _, stdout, stderr = execute("lsb_release -a", mute=True) - if 'Command not found' in stdout or 'Command not found' in stderr: + if 'command not found' in stdout or 'command not found' in stderr: # 
Dump standard architecture info files if available dump("/etc/lsb-release") dump("/etc/SuSE-release") @@ -578,7 +578,7 @@ def list_hardware(): """ exit_code, stdout, stderr = execute('lshw -numeric -C display', mute=True) - if 'Command not found' in stdout or 'Command not found' in stderr: + if 'command not found' in stdout or 'command not found' in stderr: stdout = '' return stdout @@ -751,3 +751,40 @@ def get_schedd_id(host): logger.debug(f'HTCondor: global name is within limits: {global_name} (length={len(global_name)}, max size={maxsize})') return global_name + + +def kill_process_group(pgrp): + """ + Kill the process group. + DO NOT MOVE TO PROCESSES.PY - will lead to circular import since execute() needs it as well. + :param pgrp: process group id (int). + :return: boolean (True if SIGTERM followed by SIGKILL signalling was successful) + """ + + status = False + _sleep = True + + # kill the process gracefully + logger.info("killing group process %d", pgrp) + try: + os.killpg(pgrp, signal.SIGTERM) + except Exception as error: + logger.warning("exception thrown when killing child group process under SIGTERM: %s", error) + _sleep = False + else: + logger.info("SIGTERM sent to process group %d", pgrp) + + if _sleep: + _t = 30 + logger.info("sleeping %d s to allow processes to exit", _t) + time.sleep(_t) + + try: + os.killpg(pgrp, signal.SIGKILL) + except Exception as error: + logger.warning("exception thrown when killing child group process with SIGKILL: %s", error) + else: + logger.info("SIGKILL sent to process group %d", pgrp) + status = True + + return status diff --git a/pilot/util/constants.py b/pilot/util/constants.py index d506f094c..246f87bf8 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every 
new version release, increased for small updates -BUILD = '43' # build number should be reset to '1' for every new development cycle +BUILD = '45' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/container.py b/pilot/util/container.py index c578c2c4e..af8314b0b 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -13,7 +13,7 @@ from time import sleep from signal import SIGTERM, SIGKILL from pilot.common.errorcodes import ErrorCodes -from pilot.util.processes import kill_process_group +from pilot.util.auxiliary import kill_process_group logger = logging.getLogger(__name__) errors = ErrorCodes() diff --git a/pilot/util/processes.py b/pilot/util/processes.py index ce96b4e35..30bf7f902 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -14,7 +14,7 @@ import threading from pilot.util.container import execute -from pilot.util.auxiliary import whoami +from pilot.util.auxiliary import whoami, kill_process_group from pilot.util.filehandling import read_file, remove_dir_tree import logging @@ -219,43 +219,6 @@ def kill_child_processes(pid): kill_process(i) -def kill_process_group(pgrp): - """ - Kill the process group. - - :param pgrp: process group id (int). 
- :return: boolean (True if SIGTERM followed by SIGKILL signalling was successful) - """ - - status = False - _sleep = True - - # kill the process gracefully - logger.info("killing group process %d", pgrp) - try: - os.killpg(pgrp, signal.SIGTERM) - except Exception as error: - logger.warning("exception thrown when killing child group process under SIGTERM: %s", error) - _sleep = False - else: - logger.info("SIGTERM sent to process group %d", pgrp) - - if _sleep: - _t = 30 - logger.info("sleeping %d s to allow processes to exit", _t) - time.sleep(_t) - - try: - os.killpg(pgrp, signal.SIGKILL) - except Exception as error: - logger.warning("exception thrown when killing child group process with SIGKILL: %s", error) - else: - logger.info("SIGKILL sent to process group %d", pgrp) - status = True - - return status - - def kill_process(pid): """ Kill process. From 60b516ff21561ffb2d635d1fb41273255128cc30 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 15:06:36 +0200 Subject: [PATCH 048/154] Refactoring --- PILOTVERSION | 2 +- pilot/util/auxiliary.py | 38 +-------------------------- pilot/util/constants.py | 6 ++--- pilot/util/container.py | 6 ++--- pilot/util/processes.py | 3 ++- pilot/util/processgroups.py | 52 +++++++++++++++++++++++++++++++++++++ 6 files changed, 62 insertions(+), 45 deletions(-) create mode 100644 pilot/util/processgroups.py diff --git a/PILOTVERSION b/PILOTVERSION index 52ffb50a4..11ade4cb5 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.5.2.44 \ No newline at end of file +3.6.0.47 \ No newline at end of file diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index f3453fe4e..dc6496956 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -15,6 +15,7 @@ from collections.abc import Set, Mapping from collections import deque, OrderedDict from numbers import Number +from signal import SIGTERM, SIGKILL from time import sleep from pilot.util.constants import ( @@ -751,40 +752,3 @@ def 
get_schedd_id(host): logger.debug(f'HTCondor: global name is within limits: {global_name} (length={len(global_name)}, max size={maxsize})') return global_name - - -def kill_process_group(pgrp): - """ - Kill the process group. - DO NOT MOVE TO PROCESSES.PY - will lead to circular import since execute() needs it as well. - :param pgrp: process group id (int). - :return: boolean (True if SIGTERM followed by SIGKILL signalling was successful) - """ - - status = False - _sleep = True - - # kill the process gracefully - logger.info("killing group process %d", pgrp) - try: - os.killpg(pgrp, signal.SIGTERM) - except Exception as error: - logger.warning("exception thrown when killing child group process under SIGTERM: %s", error) - _sleep = False - else: - logger.info("SIGTERM sent to process group %d", pgrp) - - if _sleep: - _t = 30 - logger.info("sleeping %d s to allow processes to exit", _t) - time.sleep(_t) - - try: - os.killpg(pgrp, signal.SIGKILL) - except Exception as error: - logger.warning("exception thrown when killing child group process with SIGKILL: %s", error) - else: - logger.info("SIGKILL sent to process group %d", pgrp) - status = True - - return status diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 246f87bf8..c07cc6a9f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -12,9 +12,9 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 -VERSION = '5' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '45' # build number should be reset to '1' for every new development cycle +VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates +REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '47' # build number should be 
reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/container.py b/pilot/util/container.py index af8314b0b..c54bcd2e1 100644 --- a/pilot/util/container.py +++ b/pilot/util/container.py @@ -13,7 +13,7 @@ from time import sleep from signal import SIGTERM, SIGKILL from pilot.common.errorcodes import ErrorCodes -from pilot.util.auxiliary import kill_process_group +from pilot.util.processgroups import kill_process_group logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -73,9 +73,9 @@ def execute(executable, **kwargs): stderr = f'subprocess communicate sent TimeoutExpired: {exc}' logger.warning(stderr) exit_code = errors.COMMANDTIMEDOUT - process.kill() try: - logger.warning('killing lingering process group') + logger.warning('killing lingering subprocess and process group') + process.kill() kill_process_group(getpgid(process.pid)) except ProcessLookupError as exc: stderr += f'\n(kill process group) ProcessLookupError={exc}' diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 30bf7f902..7465d7769 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -14,8 +14,9 @@ import threading from pilot.util.container import execute -from pilot.util.auxiliary import whoami, kill_process_group +from pilot.util.auxiliary import whoami from pilot.util.filehandling import read_file, remove_dir_tree +from pilot.util.processgroups import kill_process_group import logging logger = logging.getLogger(__name__) diff --git a/pilot/util/processgroups.py b/pilot/util/processgroups.py new file mode 100644 index 000000000..43007e1cf --- /dev/null +++ b/pilot/util/processgroups.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Paul Nilsson, paul.nilsson@cern.ch, 2023 + +import os +from signal import SIGTERM, SIGKILL +from time import sleep + +import logging +logger = logging.getLogger(__name__) + + +def kill_process_group(pgrp): + """ + Kill the process group. + DO NOT MOVE TO PROCESSES.PY - will lead to circular import since execute() needs it as well. + :param pgrp: process group id (int). + :return: boolean (True if SIGTERM followed by SIGKILL signalling was successful) + """ + + status = False + _sleep = True + + # kill the process gracefully + logger.info(f"killing group process {pgrp}") + try: + os.killpg(pgrp, SIGTERM) + except Exception as error: + logger.warning(f"exception thrown when killing child group process under SIGTERM: {error}") + _sleep = False + else: + logger.info(f"SIGTERM sent to process group {pgrp}") + + if _sleep: + nap = 30 + logger.info(f"sleeping {nap} s to allow processes to exit") + sleep(nap) + + try: + os.killpg(pgrp, SIGKILL) + except Exception as error: + logger.warning(f"exception thrown when killing child group process with SIGKILL: {error}") + else: + logger.info(f"SIGKILL sent to process group {pgrp}") + status = True + + return status From 73bdbc52f5de13fcb259ae3e12c870c507fa38e5 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 15:28:22 +0200 Subject: [PATCH 049/154] Flake8 --- pilot/util/auxiliary.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index dc6496956..4f14e82fc 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -15,7 +15,6 @@ from collections.abc import Set, Mapping from collections import deque, OrderedDict from numbers import Number -from signal import SIGTERM, SIGKILL from time import sleep from pilot.util.constants import ( From 8b7dfc6615a13afead3ef4f9ca8642b1c0456db9 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 
17:09:20 +0200 Subject: [PATCH 050/154] Added log message. Dumping /etc/os-release instead of executing lsb_release command --- pilot/control/job.py | 1 + pilot/util/auxiliary.py | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index e98c863d6..bb87c3cd8 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2059,6 +2059,7 @@ def htcondor_envvar(jobid, processingtype): globaljobid = encode_globaljobid(jobid, processingtype) if globaljobid: os.environ['HTCondor_JOB_ID'] = globaljobid + logger.info(f'set env var HTCondor_JOB_ID={globaljobid}') except Exception as exc: logger.warning(f'caught exception: {exc}') else: diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index 4f14e82fc..a952c7ea4 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -89,18 +89,19 @@ def display_architecture_info(): """ logger.info("architecture information:") - - _, stdout, stderr = execute("lsb_release -a", mute=True) - if 'command not found' in stdout or 'command not found' in stderr: - # Dump standard architecture info files if available - dump("/etc/lsb-release") - dump("/etc/SuSE-release") - dump("/etc/redhat-release") - dump("/etc/debian_version") - dump("/etc/issue") - dump("$MACHTYPE", cmd="echo") - else: - logger.info("\n%s", stdout) + dump("/etc/os-release") + + #_, stdout, stderr = execute("lsb_release -a", mute=True) + #if 'command not found' in stdout or 'command not found' in stderr: + # # Dump standard architecture info files if available + # dump("/etc/lsb-release") + # dump("/etc/SuSE-release") + # dump("/etc/redhat-release") + # dump("/etc/debian_version") + # dump("/etc/issue") + # dump("$MACHTYPE", cmd="echo") + #else: + # logger.info("\n%s", stdout) def get_batchsystem_jobid(): From 9cd8fe190dbe20a9d59c65568fffbbc9ef14b533 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 7 Apr 2023 17:09:55 +0200 Subject: [PATCH 051/154] Update --- 
PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 11ade4cb5..bca62904e 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.47 \ No newline at end of file +3.6.0.48 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c07cc6a9f..4461ef244 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '47' # build number should be reset to '1' for every new development cycle +BUILD = '48' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 04a1d26d7f4949e6a7778e8aba36ea5e89bac8ad Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Apr 2023 12:11:24 +0200 Subject: [PATCH 052/154] Piping remote open output to files --- PILOTVERSION | 2 +- pilot/control/job.py | 2 +- pilot/user/atlas/common.py | 7 +++++-- pilot/util/constants.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bca62904e..ce7211ab1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.48 \ No newline at end of file +3.6.0.49 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index bb87c3cd8..26e0508c6 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -52,7 +52,7 @@ from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue from pilot.util.realtimelogger import cleanup as rtcleanup from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp -from pilot.util.workernode import get_disk_space, 
collect_workernode_info, get_node_name, get_cpu_model, get_cpu_cores, get_cpu_arch +from pilot.util.workernode import get_disk_space, collect_workernode_info, get_node_name, get_cpu_model, get_cpu_cores #, get_cpu_arch logger = logging.getLogger(__name__) errors = ErrorCodes() diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 8e1a6ae12..97b5cd4ae 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -307,7 +307,7 @@ def parse_remotefileverification_dictionary(workdir): return exitcode, diagnostics, not_opened -def get_file_open_command(script_path, turls, nthreads): +def get_file_open_command(script_path, turls, nthreads, stdout='remote_open.stdout', stderr='remote_open.stderr'): """ :param script_path: path to script (string). @@ -316,7 +316,10 @@ def get_file_open_command(script_path, turls, nthreads): :return: comma-separated list of turls (string). """ - return "%s --turls=\'%s\' -w %s -t %s" % (script_path, turls, os.path.dirname(script_path), str(nthreads)) + cmd = f"{script_path} --turls=\'{turls}\' -w {os.path.dirname(script_path)} -t {nthreads}" + if stdout and stderr: + cmd + f' 1>{stdout} 2>{stderr}' + return cmd def extract_turls(indata): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4461ef244..e9c2b6212 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '48' # build number should be reset to '1' for every new development cycle +BUILD = '50' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From fa5e5ee3951044e902490f8dab36b8418f74369b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Apr 2023 15:33:12 +0200 
Subject: [PATCH 053/154] Correction --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ce7211ab1..f59bf36da 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.49 \ No newline at end of file +3.6.0.51 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 97b5cd4ae..d11def620 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -318,7 +318,7 @@ def get_file_open_command(script_path, turls, nthreads, stdout='remote_open.stdo cmd = f"{script_path} --turls=\'{turls}\' -w {os.path.dirname(script_path)} -t {nthreads}" if stdout and stderr: - cmd + f' 1>{stdout} 2>{stderr}' + cmd += f' 1>{stdout} 2>{stderr}' return cmd diff --git a/pilot/util/constants.py b/pilot/util/constants.py index e9c2b6212..87e55fbd5 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '50' # build number should be reset to '1' for every new development cycle +BUILD = '51' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From f8677296084edcd43622fd6fa3627ad0b672157d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Apr 2023 18:04:31 +0200 Subject: [PATCH 054/154] Special cric download priority for dask queue --- PILOTVERSION | 2 +- pilot/info/extinfo.py | 5 ++++- pilot/util/constants.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f59bf36da..ac64b0f1a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.51 \ No newline at end of file +3.6.0.52 \ No 
newline at end of file diff --git a/pilot/info/extinfo.py b/pilot/info/extinfo.py index 76bfff15d..cd16dbd44 100644 --- a/pilot/info/extinfo.py +++ b/pilot/info/extinfo.py @@ -215,7 +215,10 @@ def load_storage_data(self, ddmendpoints=[], priority=[], cache_time=60): pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.setup' % pilot_user, globals(), locals(), [pilot_user], 0) ddm_source_priority = user.get_ddm_source_priority() - priority = priority or ddm_source_priority + if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': + priority = ['LOCAL'] + else: + priority = priority or ddm_source_priority logger.debug(f'storage data priority={priority}') return self.load_data(sources, priority, cache_time) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 87e55fbd5..3c57bdceb 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '51' # build number should be reset to '1' for every new development cycle +BUILD = '52' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d804eba9e1cd101c1683a8cb0379485a8f1455e1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Apr 2023 18:46:27 +0200 Subject: [PATCH 055/154] Reformat res on dask queue --- PILOTVERSION | 2 +- pilot/control/job.py | 14 ++++++++++++++ pilot/util/constants.py | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ac64b0f1a..c66fae635 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.52 \ No newline at end of file +3.6.0.53 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 
26e0508c6..ce7b8052e 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -12,6 +12,7 @@ from __future__ import print_function # Python 2 +import json import os import time import hashlib @@ -1951,6 +1952,19 @@ def retrieve(queues, traces, args): # noqa: C901 args.graceful_stop.set() break + # reformat res on dask queue + if res and os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': + # res = {'{"5816670699": {"StatusCode": 0, .. + tmp = '' + for _res in res: + tmp = _res # tmp = '{"5816670699": {"StatusCode": 0, .. + break + tmp = json.loads(tmp) + for _res in tmp: + res = tmp[_res] + break + # res = {"StatusCode": 0, .. + if not res: getjob_failures += 1 if getjob_failures >= args.getjob_failures: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3c57bdceb..2e81310fe 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '52' # build number should be reset to '1' for every new development cycle +BUILD = '53' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 9eb784ba40134b9b8742747055737e0edcb67e25 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Apr 2023 19:02:25 +0200 Subject: [PATCH 056/154] Reformat res on dask queue --- pilot/control/job.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index ce7b8052e..9385c321e 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1955,14 +1955,21 @@ def retrieve(queues, traces, args): # noqa: C901 # reformat res on dask queue if res and os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': # res = {'{"5816670699": {"StatusCode": 0, 
.. + logger.debug(f'res={res}') tmp = '' for _res in res: tmp = _res # tmp = '{"5816670699": {"StatusCode": 0, .. break - tmp = json.loads(tmp) - for _res in tmp: - res = tmp[_res] - break + logger.debug(f'tmp={tmp}') + try: + tmp = json.loads(tmp) + except Exception as exc: + logger.debug(f'exc={exc}') + res = None + else: + for _res in tmp: + res = tmp[_res] + break # res = {"StatusCode": 0, .. if not res: From 23401eb30f8b39e1b0613a94cbf5857523166d9c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 13 Apr 2023 11:12:38 +0200 Subject: [PATCH 057/154] Removed reformatting code --- pilot/control/job.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 9385c321e..98e9e97d1 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1952,26 +1952,6 @@ def retrieve(queues, traces, args): # noqa: C901 args.graceful_stop.set() break - # reformat res on dask queue - if res and os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': - # res = {'{"5816670699": {"StatusCode": 0, .. - logger.debug(f'res={res}') - tmp = '' - for _res in res: - tmp = _res # tmp = '{"5816670699": {"StatusCode": 0, .. - break - logger.debug(f'tmp={tmp}') - try: - tmp = json.loads(tmp) - except Exception as exc: - logger.debug(f'exc={exc}') - res = None - else: - for _res in tmp: - res = tmp[_res] - break - # res = {"StatusCode": 0, .. 
- if not res: getjob_failures += 1 if getjob_failures >= args.getjob_failures: From 258bd8d019313b9eeef6b26224368b43f92d6b73 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 13 Apr 2023 12:01:51 +0200 Subject: [PATCH 058/154] Support for pilot running in a pod --- PILOTVERSION | 2 +- pilot.py | 7 +++++++ pilot/control/job.py | 12 +++++++----- pilot/util/constants.py | 2 +- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c66fae635..8b506cf30 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.53 \ No newline at end of file +3.6.0.54 \ No newline at end of file diff --git a/pilot.py b/pilot.py index e339f2809..29912234b 100755 --- a/pilot.py +++ b/pilot.py @@ -308,6 +308,13 @@ def get_args(): required=True, help='Pilot user (e.g. name of experiment corresponding to pilot plug-in)') + # Kubernetes (pilot running in a pod) + arg_parser.add_argument('--pod', + dest='pod', + type=str2bool, + default=False, + help='Pilot running in a Kubernetes pod') + # Harvester specific options (if any of the following options are used, args.harvester will be set to True) arg_parser.add_argument('--harvester-workdir', dest='harvester_workdir', diff --git a/pilot/control/job.py b/pilot/control/job.py index 98e9e97d1..13ec9c7b4 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1458,19 +1458,21 @@ def proceed_with_getjob(timefloor, starttime, jobnumber, getjob_requests, max_ge return True -def get_job_definition_from_file(path, harvester): +def get_job_definition_from_file(path, harvester, pod): """ Get a job definition from a pre-placed file. In Harvester mode, also remove any existing job request files since it is no longer needed/wanted. - :param path: path to job definition file. 
+ :param path: path to job definition file :param harvester: True if Harvester is being used (determined from args.harvester), otherwise False + :param pod: True if pilot is running in a pod, otherwise False :return: job definition dictionary. """ # remove any existing Harvester job request files (silent in non-Harvester mode) and read the JSON - if harvester: - remove_job_request_file() + if harvester or pod: + if harvester: + remove_job_request_file() if is_json(path): job_definition_list = parse_job_definition_file(path) if not job_definition_list: @@ -1579,7 +1581,7 @@ def get_job_definition(queues, args): res = get_fake_job() elif os.path.exists(path): logger.info(f'will read job definition from file: {path}') - res = get_job_definition_from_file(path, args.harvester) + res = get_job_definition_from_file(path, args.harvester, args.pod) else: if args.harvester and args.harvester_submitmode.lower() == 'push': pass # local job definition file not found (go to sleep) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 2e81310fe..786eaadef 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '53' # build number should be reset to '1' for every new development cycle +BUILD = '54' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 724055661184816a2686ed96dfd129f763e75fed Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 13 Apr 2023 12:30:20 +0200 Subject: [PATCH 059/154] Updated --pod option --- pilot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot.py b/pilot.py index 29912234b..f8aacf184 100755 --- a/pilot.py +++ b/pilot.py @@ -311,7 +311,7 @@ def 
get_args(): # Kubernetes (pilot running in a pod) arg_parser.add_argument('--pod', dest='pod', - type=str2bool, + action='store_true', default=False, help='Pilot running in a Kubernetes pod') From 893a03ad43d31c69c095579f25b25a6ec3b5072e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 13 Apr 2023 12:59:23 +0200 Subject: [PATCH 060/154] Removed excessive logging about max running time --- pilot/control/monitor.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index bee85000c..85692f210 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -59,6 +59,7 @@ def control(queues, traces, args): # noqa: C901 # overall loop counter (ignoring the fact that more than one job may be running) niter = 0 + max_running_time_old = 0 while not args.graceful_stop.is_set(): # every few seconds, run the monitoring checks if args.graceful_stop.wait(1) or args.graceful_stop.is_set(): @@ -82,6 +83,10 @@ def control(queues, traces, args): # noqa: C901 except Exception as exc: logger.warning(f'caught exception: {exc}') max_running_time = args.lifetime + else: + if max_running_time != max_running_time_old: + max_running_time_old = max_running_time + logger.info(f'using max running time = {max_running_time}s') # for testing: max_running_time = 4 * 60 if time_since_start > max_running_time - grace_time: @@ -353,7 +358,7 @@ def get_max_running_time(lifetime, queuedata, queues, push): logger.warning(f'caught exception: {exc}') else: if _max_running_time: - logger.debug(f'using max running time from job: {_max_running_time}s') + #logger.debug(f'using max running time from job: {_max_running_time}s') return _max_running_time # use the schedconfig value if set, otherwise use the pilot option lifetime value @@ -367,8 +372,8 @@ def get_max_running_time(lifetime, queuedata, queues, push): else: if max_running_time == 0: max_running_time = lifetime # fallback to default value - 
logger.info(f'will use default value for max running time: {max_running_time}s') - else: - logger.info(f'will use queuedata.maxtime value for max running time: {max_running_time}s') + # logger.debug(f'will use default value for max running time: {max_running_time}s') + #else: + # logger.debug(f'will use queuedata.maxtime value for max running time: {max_running_time}s') return max_running_time From 26b667df78264e5dbbc8f1f772fce1222da29c8e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 13 Apr 2023 13:00:24 +0200 Subject: [PATCH 061/154] Removed excessive logging about max running time --- pilot/control/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 85692f210..633461eb2 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -59,7 +59,7 @@ def control(queues, traces, args): # noqa: C901 # overall loop counter (ignoring the fact that more than one job may be running) niter = 0 - max_running_time_old = 0 + max_running_time_old = args.lifetime while not args.graceful_stop.is_set(): # every few seconds, run the monitoring checks if args.graceful_stop.wait(1) or args.graceful_stop.is_set(): From a255ae187a2653a130f0cac59cb900b7ed9aec46 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 13 Apr 2023 13:00:44 +0200 Subject: [PATCH 062/154] Removed excessive logging about max running time --- pilot/control/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 633461eb2..85692f210 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -59,7 +59,7 @@ def control(queues, traces, args): # noqa: C901 # overall loop counter (ignoring the fact that more than one job may be running) niter = 0 - max_running_time_old = args.lifetime + max_running_time_old = 0 while not args.graceful_stop.is_set(): # every few seconds, run the monitoring checks if args.graceful_stop.wait(1) or 
args.graceful_stop.is_set(): From 8e5b329c1e9738c19e6df4bd076160fc12ca6d15 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 13 Apr 2023 16:38:37 +0200 Subject: [PATCH 063/154] Skip storing jobid on file if pod --- pilot/control/job.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 13ec9c7b4..f93905e38 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1054,7 +1054,8 @@ def validate(queues, traces, args): logger.warning(f'exception caught: {error}') # store the PanDA job id for the wrapper to pick up - store_jobid(job.jobid, args.sourcedir) + if not args.pod: + store_jobid(job.jobid, args.sourcedir) # run the delayed space check now delayed_space_check(queues, traces, args, job) From b4898146907cd031eb3c1bbda87b2c89293b041b Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 13 Apr 2023 16:42:35 +0200 Subject: [PATCH 064/154] Always use args.lifetime as max running time if pod --- pilot/control/monitor.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 85692f210..9c68c1c15 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -79,7 +79,7 @@ def control(queues, traces, args): # noqa: C901 grace_time = 0 # get the current max_running_time (can change with job) try: - max_running_time = get_max_running_time(args.lifetime, queuedata, queues, push) + max_running_time = get_max_running_time(args.lifetime, queuedata, queues, push, args.pod) except Exception as exc: logger.warning(f'caught exception: {exc}') max_running_time = args.lifetime @@ -331,18 +331,23 @@ def run_checks(queues, args): raise ExceededMaxWaitTime(diagnostics) -def get_max_running_time(lifetime, queuedata, queues, push): +def get_max_running_time(lifetime, queuedata, queues, push, pod): """ Return the maximum allowed running time for the pilot. 
The max time is set either as a pilot option or via the schedconfig.maxtime for the PQ in question. + If running in a Kubernetes pod, always use the args.lifetime as maxtime (it will be determined by the harvester submitter). :param lifetime: optional pilot option time in seconds (int). :param queuedata: queuedata object :param queues: :param push: push mode (boolean) + :param pod: pod mode (boolean) :return: max running time in seconds (int) """ + if pod: + return lifetime + max_running_time = lifetime if not queuedata: From 4aaceb7080550bbbd4652e0a5b8ac085317a3b03 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 13 Apr 2023 17:08:07 +0200 Subject: [PATCH 065/154] Log message --- pilot/control/job.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index f93905e38..b1953bf3c 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1057,11 +1057,12 @@ def validate(queues, traces, args): if not args.pod: store_jobid(job.jobid, args.sourcedir) + # make sure that ctypes is available (needed at the end by orphan killer) + verify_ctypes(queues, job) + # run the delayed space check now delayed_space_check(queues, traces, args, job) - # make sure that ctypes is available (needed at the end by orphan killer) - verify_ctypes(queues, job) else: logger.debug(f'failed to validate job={job.jobid}') put_in_queue(job, queues.failed_jobs) @@ -2373,6 +2374,8 @@ def queue_monitor(queues, traces, args): # noqa: C901 if not scan_for_jobs(queues): logger.warning('queues are still empty of jobs - will begin queue monitoring anyway') + logger.debug('starting queue_monitor()') + job = None while True: # will abort when graceful_stop has been set or if enough time has passed after kill signal time.sleep(1) From 5d6f2d80beb1fac43b2a3e3f62f6384e4ff3382b Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 14 Apr 2023 11:51:22 +0200 Subject: [PATCH 066/154] Improved machinefeatures handling --- 
pilot/control/job.py | 2 -- pilot/control/monitor.py | 16 ++++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index b1953bf3c..966c6e5b2 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2374,8 +2374,6 @@ def queue_monitor(queues, traces, args): # noqa: C901 if not scan_for_jobs(queues): logger.warning('queues are still empty of jobs - will begin queue monitoring anyway') - logger.debug('starting queue_monitor()') - job = None while True: # will abort when graceful_stop has been set or if enough time has passed after kill signal time.sleep(1) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 9c68c1c15..43fdbca1f 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -159,12 +159,16 @@ def run_shutdowntime_minute_check(time_since_start): return False # will be ignored # ignore shutdowntime if not known - try: - shutdowntime = int(machinefeatures.get('shutdowntime')) - except (TypeError, ValueError) as exc: - logger.debug(f'failed to convert shutdowntime: {exc}') - return False # will be ignored - logger.debug(f'machinefeatures shutdowntime={shutdowntime} - now={now}') + shutdowntime = None + _shutdowntime = machinefeatures.get('shutdowntime', None) + if _shutdowntime: + try: + shutdowntime = int(_shutdowntime) + except (TypeError, ValueError) as exc: + logger.debug(f'failed to convert shutdowntime: {exc}') + return False # will be ignored + else: + logger.debug(f'machinefeatures shutdowntime={shutdowntime} - now={now}') if not shutdowntime: logger.debug('ignoring shutdowntime since it is not set') return False # will be ignored From 4044e57f93191ac2fb8053124cd998d6ca604154 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 14 Apr 2023 12:55:21 +0200 Subject: [PATCH 067/154] Sending job update for pod --- PILOTVERSION | 2 +- pilot/control/job.py | 10 ++++++++++ pilot/util/constants.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) 
diff --git a/PILOTVERSION b/PILOTVERSION index 8b506cf30..38b692f70 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.54 \ No newline at end of file +3.6.0.55 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 966c6e5b2..3b5a0df70 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -317,6 +317,9 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) if state == 'finished' or state == 'holding' or state == 'failed': logger.info(f'this job has now completed (state={state})') job.completed = True + #elif args.pod and args.workflow == 'stager' and state == 'running': + # logger.info(f'this job has now completed (state={state})') + # job.completed = True # should the pilot make any server updates? if not args.update_server: @@ -1226,6 +1229,13 @@ def create_data_payload(queues, traces, args): # if the job does not have any input data, then pretend that stage-in has finished and put the job # in the finished_data_in queue put_in_queue(job, queues.finished_data_in) + # for stager jobs in pod mode, let the server know the job is running, then terminate the pilot as it is no longer needed + if args.pod and args.workflow == 'stager': + set_pilot_state(job=job, state='running') + send_state(job, args, 'running') + logger.info('pilot is no longer needed - terminating') + args.job_aborted.set() + args.graceful_stop.set() # only in normal workflow; in the stager workflow there is no payloads queue if not args.workflow == 'stager': diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 786eaadef..13f0e9d9d 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD 
= '54' # build number should be reset to '1' for every new development cycle +BUILD = '55' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From ee024c836085bfd018fab6af90a478e386767f8c Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 14 Apr 2023 13:26:02 +0200 Subject: [PATCH 068/154] Making sure that capath is set in context --- pilot/util/https.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pilot/util/https.py b/pilot/util/https.py index ba2acf776..a3c8d30cd 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -7,7 +7,7 @@ # Authors: # - Daniel Drizhuk, d.drizhuk@gmail.com, 2017 # - Mario Lassnig, mario.lassnig@cern.ch, 2017 -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2022 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 import json import os @@ -43,7 +43,7 @@ # anisyonk: public copy of `_ctx` to avoid logic break since ssl_context is reset inside the request() -- FIXME # anisyonk: public instance, should be properly initialized by `https_setup()` # anisyonk: use lightweight class definition instead of namedtuple since tuple is immutable and we don't need/use any tuple features here -ctx = type('ctx', (object,), dict(ssl_context=None, user_agent='Pilot2 client', capath=None, cacert=None)) +ctx = type('ctx', (object,), dict(ssl_context=None, user_agent='Pilot3 client', capath=None, cacert=None)) def _tester(func, *args): @@ -260,6 +260,9 @@ def update_ctx(): x509 = os.environ.get('X509_USER_PROXY', _ctx.cacert) if x509 != _ctx.cacert and os.path.exists(x509): _ctx.cacert = x509 + certdir = os.environ.get('X509_CERT_DIR', _ctx.capath) + if certdir != _ctx.capath and os.path.exists(certdir): + _ctx.capath = certdir def get_curl_command(plain, dat, ipv): From 98dba03c9377173353342da5662a7b828ac65f78 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 14 Apr 2023 13:33:16 +0200 Subject: [PATCH 069/154] Enforcing running state for stager pods --- pilot/control/job.py | 6 +++--- 
1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 3b5a0df70..ad8d5f0b5 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -305,7 +305,6 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) """ # insert out of batch time error code if MAXTIME has been reached - logger.debug(f"REACHED_MAXTIME={os.environ.get('REACHED_MAXTIME', None)}") if os.environ.get('REACHED_MAXTIME', None): msg = 'the max batch system time limit has been reached' logger.warning(msg) @@ -317,8 +316,9 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) if state == 'finished' or state == 'holding' or state == 'failed': logger.info(f'this job has now completed (state={state})') job.completed = True - #elif args.pod and args.workflow == 'stager' and state == 'running': - # logger.info(f'this job has now completed (state={state})') + elif args.pod and args.workflow == 'stager': + state = 'running' # stager pods should only send 'running' since harvester already has set the 'starting' state + job.state = state # job.completed = True # should the pilot make any server updates? 
From bb6b5d98424f22f800c051edd256b90b7cfd53a4 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 14 Apr 2023 13:53:06 +0200 Subject: [PATCH 070/154] Added debug --- pilot/util/https.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pilot/util/https.py b/pilot/util/https.py index a3c8d30cd..d585c97a9 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -263,7 +263,10 @@ def update_ctx(): certdir = os.environ.get('X509_CERT_DIR', _ctx.capath) if certdir != _ctx.capath and os.path.exists(certdir): _ctx.capath = certdir - + logger.debug(f"X509_CERT_DIR={os.environ.get('X509_CERT_DIR')}") + logger.debug(f"_ctx.capath={_ctx.capath}") + logger.debug(f"certdir={certdir}") + logger.debug(f"os.path.exists(certdir)={os.path.exists(certdir)}") def get_curl_command(plain, dat, ipv): """ From 7f35ad2aa79cee64fd71328d086474029c959b16 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 17 Apr 2023 18:28:27 +0200 Subject: [PATCH 071/154] Setting pilot exit error code when server communication fails for pod --- PILOTVERSION | 2 +- pilot/control/job.py | 7 ++++++- pilot/util/constants.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 38b692f70..e8afd3564 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.55 \ No newline at end of file +3.6.0.57 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index ad8d5f0b5..f14602c00 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1232,7 +1232,12 @@ def create_data_payload(queues, traces, args): # for stager jobs in pod mode, let the server know the job is running, then terminate the pilot as it is no longer needed if args.pod and args.workflow == 'stager': set_pilot_state(job=job, state='running') - send_state(job, args, 'running') + ret = send_state(job, args, 'running') + if not ret: + job.state = 'failed' + job.piloterrorcodes, job.piloterrordiags = 
errors.add_error_code(errors.COMMUNICATIONFAILURE) + put_in_queue(job, queues.failed_jobs) + traces.pilot['error_code'] = errors.COMMUNICATIONFAILURE logger.info('pilot is no longer needed - terminating') args.job_aborted.set() args.graceful_stop.set() diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 13f0e9d9d..8282e49e2 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '55' # build number should be reset to '1' for every new development cycle +BUILD = '57' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From a40e468a0a0e2f2b6dbc3b9d3952a416c074ea70 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 17 Apr 2023 18:45:59 +0200 Subject: [PATCH 072/154] Removed useless code. 
Added timeout for https request --- pilot/control/job.py | 3 --- pilot/util/https.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index f14602c00..0eb478a1a 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1234,9 +1234,6 @@ def create_data_payload(queues, traces, args): set_pilot_state(job=job, state='running') ret = send_state(job, args, 'running') if not ret: - job.state = 'failed' - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.COMMUNICATIONFAILURE) - put_in_queue(job, queues.failed_jobs) traces.pilot['error_code'] = errors.COMMUNICATIONFAILURE logger.info('pilot is no longer needed - terminating') args.job_aborted.set() diff --git a/pilot/util/https.py b/pilot/util/https.py index d585c97a9..f724a71f3 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -216,7 +216,7 @@ def request(url, data=None, plain=False, secure=True, ipv='IPv6'): failed = True break try: - status, output, stderr = execute(req, obscure=obscure) + status, output, stderr = execute(req, obscure=obscure, timeout=130) except Exception as exc: logger.warning(f'exception: {exc}') failed = True From d23ca3aa6bc13d5c1787775193a679df0c1b71c1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 17 Apr 2023 19:17:55 +0200 Subject: [PATCH 073/154] Added shell exit code --- pilot/util/auxiliary.py | 1 + pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/util/auxiliary.py b/pilot/util/auxiliary.py index a952c7ea4..7697f6f6d 100644 --- a/pilot/util/auxiliary.py +++ b/pilot/util/auxiliary.py @@ -204,6 +204,7 @@ def get_error_code_translation_dictionary(): errors.MIDDLEWAREIMPORTFAILURE: [76, "Failed to import middleware module"], # added to traces object errors.MISSINGINPUTFILE: [77, "Missing input file in SE"], # should pilot report this type of error to wrapper? 
errors.PANDAQUEUENOTACTIVE: [78, "PanDA queue is not active"], + errors.COMMUNICATIONFAILURE: [79, "PanDA server communication failure"], errors.KILLSIGNAL: [137, "General kill signal"], # Job terminated by unknown kill signal errors.SIGTERM: [143, "Job killed by signal: SIGTERM"], # 128+15 errors.SIGQUIT: [131, "Job killed by signal: SIGQUIT"], # 128+3 diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 8282e49e2..9a0e1932e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '57' # build number should be reset to '1' for every new development cycle +BUILD = '58' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 8d8d006799209789ef961ab39e5937dff35671d5 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 18 Apr 2023 15:08:10 +0200 Subject: [PATCH 074/154] Displaying pilot exit codes at end --- pilot.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pilot.py b/pilot.py index f8aacf184..eb2516c35 100755 --- a/pilot.py +++ b/pilot.py @@ -547,10 +547,11 @@ def wrap_up(): logging.warning(f'failed to convert exit code to int: {exitcode}, {exc}') exitcode = 1008 - logging.info('pilot has finished') + sec = shell_exit_code(exitcode) + logging.info(f'pilot has finished (exit code={exitcode}, shell exit code={sec})') logging.shutdown() - return shell_exit_code(exitcode) + return sec def get_pilot_source_dir(): From b67a8836a9d9f20f66be7167dc8e772adc247509 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 19 Apr 2023 11:20:19 +0200 Subject: [PATCH 075/154] Added memory monitoring for sPHENIX --- PILOTVERSION | 2 +- pilot/user/sphenix/setup.py | 73 +++ 
pilot/user/sphenix/utilities.py | 798 +++++++++++++++++++++++++++++++- pilot/util/constants.py | 2 +- 4 files changed, 871 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e8afd3564..54c848536 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.57 \ No newline at end of file +3.6.0.58 \ No newline at end of file diff --git a/pilot/user/sphenix/setup.py b/pilot/user/sphenix/setup.py index d180f72d1..aa4ee3dc6 100644 --- a/pilot/user/sphenix/setup.py +++ b/pilot/user/sphenix/setup.py @@ -14,6 +14,8 @@ from datetime import datetime from pilot.common.errorcodes import ErrorCodes +from pilot.common.exception import NoSoftwareDir +from pilot.info import infosys from pilot.util.auxiliary import find_pattern_in_list from pilot.util.container import execute from pilot.util.filehandling import copy, head @@ -24,6 +26,77 @@ errors = ErrorCodes() +def get_file_system_root_path(): + """ + Return the root path of the local file system. + The function returns "/cvmfs" or "/(some path)/cvmfs" in case the expected file system root path is not + where it usually is (e.g. on an HPC). A site can set the base path by exporting ATLAS_SW_BASE. + + :return: path (string) + """ + + return os.environ.get('ATLAS_SW_BASE', '/cvmfs') + + +def get_alrb_export(add_if=False): + """ + Return the export command for the ALRB path if it exists. + If the path does not exist, return empty string. + + :param add_if: Boolean. True means that an if statement will be placed around the export. 
+ :return: export command + """ + + path = "%s/atlas.cern.ch/repo" % get_file_system_root_path() + cmd = "export ATLAS_LOCAL_ROOT_BASE=%s/ATLASLocalRootBase;" % path if os.path.exists(path) else "" + + # if [ -z "$ATLAS_LOCAL_ROOT_BASE" ]; then export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase; fi; + if cmd and add_if: + cmd = 'if [ -z \"$ATLAS_LOCAL_ROOT_BASE\" ]; then ' + cmd + ' fi;' + + return cmd + + +def get_asetup(asetup=True, alrb=False, add_if=False): + """ + Define the setup for asetup, i.e. including full path to asetup and setting of ATLAS_LOCAL_ROOT_BASE + Only include the actual asetup script if asetup=True. This is not needed if the jobPars contain the payload command + but the pilot still needs to add the exports and the atlasLocalSetup. + + :param asetup: Boolean. True value means that the pilot should include the asetup command. + :param alrb: Boolean. True value means that the function should return special setup used with ALRB and containers. + :param add_if: Boolean. True means that an if statement will be placed around the export. + :raises: NoSoftwareDir if appdir does not exist. + :return: source /asetup.sh (string). 
+ """ + + cmd = "" + alrb_cmd = get_alrb_export(add_if=add_if) + if alrb_cmd != "": + cmd = alrb_cmd + if not alrb: + cmd += "source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh --quiet;" + if asetup: + cmd += "source $AtlasSetup/scripts/asetup.sh" + else: + try: # use try in case infosys has not been initiated + appdir = infosys.queuedata.appdir + except Exception: + appdir = "" + if appdir == "": + appdir = os.environ.get('VO_ATLAS_SW_DIR', '') + if appdir != "": + # make sure that the appdir exists + if not os.path.exists(appdir): + msg = 'appdir does not exist: %s' % appdir + logger.warning(msg) + raise NoSoftwareDir(msg) + if asetup: + cmd = "source %s/scripts/asetup.sh" % appdir + + return cmd + + def get_analysis_trf(transform, workdir): """ Prepare to download the user analysis transform with curl. diff --git a/pilot/user/sphenix/utilities.py b/pilot/user/sphenix/utilities.py index d9c177ec2..ec52d8a60 100644 --- a/pilot/user/sphenix/utilities.py +++ b/pilot/user/sphenix/utilities.py @@ -5,12 +5,802 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2022 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023 + +import os +import time +from re import search + +# from pilot.info import infosys +from .setup import get_asetup +from pilot.util.container import execute +from pilot.util.filehandling import read_json, copy, write_json, remove +from pilot.util.parameters import convert_to_int +from pilot.util.processes import is_process_running import logging logger = logging.getLogger(__name__) +def get_memory_monitor_summary_filename(selector=None): + """ + Return the name for the memory monitor summary file. + + :param selector: special conditions flag (boolean). + :return: File name (string). 
+ """ + + name = "memory_monitor_summary.json" + if selector: + name += '_snapshot' + + return name + + +def get_memory_monitor_output_filename(suffix='txt'): + """ + Return the filename of the memory monitor text output file. + + :return: File name (string). + """ + + return "memory_monitor_output.%s" % suffix + + +def get_memory_monitor_setup(pid, pgrp, jobid, workdir, command, setup="", use_container=True, transformation="", outdata=None, dump_ps=False): + """ + Return the proper setup for the memory monitor. + If the payload release is provided, the memory monitor can be setup with the same release. Until early 2018, the + memory monitor was still located in the release area. After many problems with the memory monitor, it was decided + to use a fixed version for the setup. Currently, release 21.0.22 is used. + + :param pid: job process id (int). + :param pgrp: process group id (int). + :param jobid: job id (int). + :param workdir: job work directory (string). + :param command: payload command (string). + :param setup: optional setup in case asetup can not be used, which uses infosys (string). + :param use_container: optional boolean. + :param transformation: optional name of transformation, e.g. Sim_tf.py (string). + :param outdata: optional list of output fspec objects (list). + :param dump_ps: should ps output be dumped when identifying prmon process? (Boolean). + :return: job work directory (string), pid for process inside container (int). 
+ """ + + # try to get the pid from a pid.txt file which might be created by a container_script + pid = get_proper_pid(pid, pgrp, jobid, command=command, transformation=transformation, outdata=outdata, use_container=use_container, dump_ps=dump_ps) + if pid == -1: + logger.warning('process id was not identified before payload finished - will not launch memory monitor') + return "", pid + + if not setup: + setup = get_asetup(asetup=False) + setup += 'lsetup prmon;' + if not setup.endswith(';'): + setup += ';' + + cmd = "prmon" + interval = 60 + options = " --pid %d --filename %s --json-summary %s --interval %d" %\ + (pid, get_memory_monitor_output_filename(), get_memory_monitor_summary_filename(), interval) + cmd = "cd " + workdir + ";" + setup + cmd + options + + return cmd, pid + + +def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", use_container=True, dump_ps=False): + """ + Return a pid from the proper source to be used with the memory monitor. + The given pid comes from Popen(), but in the case containers are used, the pid should instead come from a ps aux + lookup. + If the main process has finished before the proper pid has been identified (it will take time if the payload is + running inside a container), then this function will abort and return -1. The called should handle this and not + launch the memory monitor as it is not needed any longer. + + :param pid: process id (int). + :param pgrp: process group id (int). + :param jobid: job id (int). + :param command: payload command (string). + :param transformation: optional name of transformation, e.g. Sim_tf.py (string). + :param outdata: list of output fspec object (list). + :param use_container: optional boolean. + :return: pid (int). 
+ """ + + if not use_container: + return pid + + # abort if main process has finished already + if not is_process_running(pid): + return -1 + + #_cmd = get_trf_command(command, transformation=transformation) + # get ps info using group id + ps = get_ps_info(pgrp) + #if dump_ps: + # logger.debug('ps:\n%s' % ps) + #logger.debug('ps:\n%s' % ps) + #logger.debug('attempting to identify pid for Singularity (v.3) runtime parent process') + #_pid = get_pid_for_command(ps, command="Singularity runtime parent") + #if _pid: + # logger.debug('discovered pid=%d for process \"%s\"' % (_pid, _cmd)) + # return _pid + + i = 0 + imax = 120 + while i < imax: + # abort if main process has finished already + if not is_process_running(pid): + return -1 + + ps = get_ps_info(pgrp) + logger.debug('ps:\n%s' % ps) + + # lookup the process id using ps aux + logger.debug(f'attempting to identify pid from job id ({jobid})') + _pid = get_pid_for_jobid(ps, jobid) + if _pid: + logger.debug('discovered pid=%d for job id %s' % (_pid, jobid)) + break + + #logger.debug('attempting to identify pid from transform name and its output') + #_pid = get_pid_for_trf(ps, transformation, outdata) if outdata else None + #if _pid: + # logger.debug('discovered pid=%d for transform name \"%s\"' % (_pid, transformation)) + # break + + logger.warning('payload pid has not yet been identified (#%d/#%d)' % (i + 1, imax)) + + # wait until the payload has launched + time.sleep(5) + i += 1 + + if _pid: + pid = _pid + + logger.info('will use pid=%d for memory monitor' % pid) + + return pid + + +def get_ps_info(pgrp, whoami=None, options='axfo pid,user,args'): + """ + Return ps info for the given user. + + :param pgrp: process group id (int). + :param whoami: user name (string). + :return: ps aux for given user (string). 
+ """ + + if not whoami: + whoami = os.getuid() + + cmd = "ps -u %s %s" % (whoami, options) + #cmd = "ps %s | grep %s" % (options, whoami) + #cmd = "ps %s | grep %s | awk -v p=%s '$1 == p {print $5}" % (options, whoami, pgrp) + #cmd = "ps %s | awk -v p=%s '$1 == p {print $5}" % (options, pgrp) + exit_code, stdout, stderr = execute(cmd) + + return stdout + + +def get_pid_for_jobid(ps, jobid): + """ + Return the process id for the ps entry that contains the job id. + + :param ps: ps command output (string). + :param jobid: PanDA job id (int). + :return: pid (int) or None if no such process. + """ + + pid = None + + for line in ps.split('\n'): + if jobid in line and 'xrootd' not in line: + # extract pid + _pid = search(r'(\d+) ', line) + try: + pid = int(_pid.group(1)) + except Exception as e: + logger.warning('pid has wrong type: %s' % e) + else: + logger.debug('extracted pid=%d from ps output' % pid) + break + + return pid + + +def get_pid_for_trf(ps, transformation, outdata): + """ + Return the process id for the given command and user. + Note: function returns 0 in case pid could not be found. + + :param ps: ps command output (string). + :param transformation: transformation name, e.g. Sim_tf.py (String). + :param outdata: fspec objects (list). + :return: pid (int) or None if no such process. 
+ """ + + pid = None + candidates = [] + + # in the case of user analysis job, the transformation will contain a URL which should be stripped + if "/" in transformation: + transformation = transformation.split('/')[-1] + logger.debug('using transformation name: %s' % transformation) + for line in ps.split('\n'): + if transformation in line: + candidates.append(line) + break + + if candidates: + for line in candidates: + for fspec in outdata: + if fspec.lfn in line: + # extract pid + _pid = search(r'(\d+) ', line) + try: + pid = int(_pid.group(1)) + except Exception as e: + logger.warning('pid has wrong type: %s' % e) + else: + logger.debug('extracted pid=%d from ps output' % pid) + break + if pid: + break + else: + logger.debug('pid not found in ps output for trf=%s' % transformation) + + return pid + + +def get_pid_for_command(ps, command="python pilot3/pilot.py"): + """ + Return the process id for the given command and user. + The function returns 0 in case pid could not be found. + If no command is specified, the function looks for the "python pilot3/pilot.py" command in the ps output. + + :param ps: ps command output (string). + :param command: command string expected to be in ps output (string). + :return: pid (int) or None if no such process. + """ + + pid = None + found = None + + for line in ps.split('\n'): + if command in line: + found = line + break + if found: + # extract pid + _pid = search(r'(\d+) ', found) + try: + pid = int(_pid.group(1)) + except Exception as e: + logger.warning('pid has wrong type: %s' % e) + else: + logger.debug('extracted pid=%d from ps output: %s' % (pid, found)) + else: + logger.debug('command not found in ps output: %s' % command) + + return pid + + +def get_trf_command(command, transformation=""): + """ + Return the last command in the full payload command string. + Note: this function returns the last command in job.command which is only set for containers. + + :param command: full payload command (string). 
+ :param transformation: optional name of transformation, e.g. Sim_tf.py (string). + :return: trf command (string). + """ + + payload_command = "" + if command: + if not transformation: + payload_command = command.split(';')[-2] + else: + if transformation in command: + payload_command = command[command.find(transformation):] + + # clean-up the command, remove '-signs and any trailing ; + payload_command = payload_command.strip() + payload_command = payload_command.replace("'", "") + payload_command = payload_command.rstrip(";") + + return payload_command + + +def get_memory_monitor_info_path(workdir, allowtxtfile=False): + """ + Find the proper path to the utility info file + Priority order: + 1. JSON summary file from workdir + 2. JSON summary file from pilot initdir + 3. Text output file from workdir (if allowtxtfile is True) + + :param workdir: relevant work directory (string). + :param allowtxtfile: boolean attribute to allow for reading the raw memory monitor output. + :return: path (string). + """ + + pilot_initdir = os.environ.get('PILOT_HOME', '') + path = os.path.join(workdir, get_memory_monitor_summary_filename()) + init_path = os.path.join(pilot_initdir, get_memory_monitor_summary_filename()) + + if not os.path.exists(path): + if os.path.exists(init_path): + path = init_path + else: + logger.info("neither %s, nor %s exist" % (path, init_path)) + path = "" + + if path == "" and allowtxtfile: + path = os.path.join(workdir, get_memory_monitor_output_filename()) + if not os.path.exists(path): + logger.warning("file does not exist either: %s" % (path)) + + return path + + +def get_memory_monitor_info(workdir, allowtxtfile=False, name=""): # noqa: C901 + """ + Add the utility info to the node structure if available. + + :param workdir: relevant work directory (string). + :param allowtxtfile: boolean attribute to allow for reading the raw memory monitor output. + :param name: name of memory monitor (string). + :return: node structure (dictionary). 
+ """ + + node = {} + + # Get the values from the memory monitor file (json if it exists, otherwise the preliminary txt file) + # Note that only the final json file will contain the totRBYTES, etc + try: + summary_dictionary = get_memory_values(workdir, name=name) + except Exception as e: + logger.warning('failed to get memory values from memory monitor tool: %s' % e) + summary_dictionary = {} + else: + logger.debug("summary_dictionary=%s" % str(summary_dictionary)) + + # Fill the node dictionary + if summary_dictionary and summary_dictionary != {}: + # first determine which memory monitor version was running (MemoryMonitor or prmon) + if 'maxRSS' in summary_dictionary['Max']: + version = 'MemoryMonitor' + elif 'rss' in summary_dictionary['Max']: + version = 'prmon' + else: + version = 'unknown' + if version == 'MemoryMonitor': + try: + node['maxRSS'] = summary_dictionary['Max']['maxRSS'] + node['maxVMEM'] = summary_dictionary['Max']['maxVMEM'] + node['maxSWAP'] = summary_dictionary['Max']['maxSwap'] + node['maxPSS'] = summary_dictionary['Max']['maxPSS'] + node['avgRSS'] = summary_dictionary['Avg']['avgRSS'] + node['avgVMEM'] = summary_dictionary['Avg']['avgVMEM'] + node['avgSWAP'] = summary_dictionary['Avg']['avgSwap'] + node['avgPSS'] = summary_dictionary['Avg']['avgPSS'] + except Exception as e: + logger.warning("exception caught while parsing memory monitor file: %s" % e) + logger.warning("will add -1 values for the memory info") + node['maxRSS'] = -1 + node['maxVMEM'] = -1 + node['maxSWAP'] = -1 + node['maxPSS'] = -1 + node['avgRSS'] = -1 + node['avgVMEM'] = -1 + node['avgSWAP'] = -1 + node['avgPSS'] = -1 + else: + logger.info("extracted standard info from memory monitor json") + try: + node['totRCHAR'] = summary_dictionary['Max']['totRCHAR'] + node['totWCHAR'] = summary_dictionary['Max']['totWCHAR'] + node['totRBYTES'] = summary_dictionary['Max']['totRBYTES'] + node['totWBYTES'] = summary_dictionary['Max']['totWBYTES'] + node['rateRCHAR'] = 
summary_dictionary['Avg']['rateRCHAR'] + node['rateWCHAR'] = summary_dictionary['Avg']['rateWCHAR'] + node['rateRBYTES'] = summary_dictionary['Avg']['rateRBYTES'] + node['rateWBYTES'] = summary_dictionary['Avg']['rateWBYTES'] + except Exception: + logger.warning("standard memory fields were not found in memory monitor json (or json doesn't exist yet)") + else: + logger.info("extracted standard memory fields from memory monitor json") + elif version == 'prmon': + try: + node['maxRSS'] = int(summary_dictionary['Max']['rss']) + node['maxVMEM'] = int(summary_dictionary['Max']['vmem']) + node['maxSWAP'] = int(summary_dictionary['Max']['swap']) + node['maxPSS'] = int(summary_dictionary['Max']['pss']) + node['avgRSS'] = summary_dictionary['Avg']['rss'] + node['avgVMEM'] = summary_dictionary['Avg']['vmem'] + node['avgSWAP'] = summary_dictionary['Avg']['swap'] + node['avgPSS'] = summary_dictionary['Avg']['pss'] + except Exception as e: + logger.warning("exception caught while parsing prmon file: %s" % e) + logger.warning("will add -1 values for the memory info") + node['maxRSS'] = -1 + node['maxVMEM'] = -1 + node['maxSWAP'] = -1 + node['maxPSS'] = -1 + node['avgRSS'] = -1 + node['avgVMEM'] = -1 + node['avgSWAP'] = -1 + node['avgPSS'] = -1 + else: + logger.info("extracted standard info from prmon json") + try: + node['totRCHAR'] = int(summary_dictionary['Max']['rchar']) + node['totWCHAR'] = int(summary_dictionary['Max']['wchar']) + node['totRBYTES'] = int(summary_dictionary['Max']['read_bytes']) + node['totWBYTES'] = int(summary_dictionary['Max']['write_bytes']) + node['rateRCHAR'] = summary_dictionary['Avg']['rchar'] + node['rateWCHAR'] = summary_dictionary['Avg']['wchar'] + node['rateRBYTES'] = summary_dictionary['Avg']['read_bytes'] + node['rateWBYTES'] = summary_dictionary['Avg']['write_bytes'] + except Exception: + logger.warning("standard memory fields were not found in prmon json (or json doesn't exist yet)") + else: + logger.info("extracted standard memory fields 
from prmon json") + else: + logger.warning('unknown memory monitor version') + else: + logger.info("memory summary dictionary not yet available") + + return node + + +def get_max_memory_monitor_value(value, maxvalue, totalvalue): # noqa: C90 + """ + Return the max and total value (used by memory monitoring). + Return an error code, 1, in case of value error. + + :param value: value to be tested (integer). + :param maxvalue: current maximum value (integer). + :param totalvalue: total value (integer). + :return: exit code, maximum and total value (tuple of integers). + """ + + ec = 0 + try: + value_int = int(value) + except Exception as e: + logger.warning("exception caught: %s" % e) + ec = 1 + else: + totalvalue += value_int + if value_int > maxvalue: + maxvalue = value_int + + return ec, maxvalue, totalvalue + + +def convert_unicode_string(unicode_string): + """ + Convert a unicode string into str. + + :param unicode_string: + :return: string. + """ + + if unicode_string is not None: + return str(unicode_string) + return None + + +def get_average_summary_dictionary_prmon(path): + """ + Loop over the memory monitor output file and create the averaged summary dictionary. + + prmon keys: + 'Time', 'nprocs', 'nthreads', 'pss', 'rchar', 'read_bytes', 'rss', 'rx_bytes', + 'rx_packets', 'stime', 'swap', 'tx_bytes', 'tx_packets', 'utime', 'vmem', 'wchar', + 'write_bytes', 'wtime' + + The function uses the first line in the output file to define the dictionary keys used + later in the function. This means that any change in the format such as new columns + will be handled automatically. + + :param path: path to memory monitor txt output file (string). + :return: summary dictionary. 
+ """ + + summary_dictionary = {} + + # get the raw memory monitor output, convert to dictionary + dictionary = convert_text_file_to_dictionary(path) + + if dictionary: + # Calculate averages and store all values + summary_dictionary = {"Max": {}, "Avg": {}, "Other": {}, "Time": {}} + + def filter_value(value): + """ Inline function used to remove any string or None values from data. """ + if type(value) == str or value is None: + return False + else: + return True + + keys = ['vmem', 'pss', 'rss', 'swap'] + values = {} + for key in keys: + value_list = list(filter(filter_value, dictionary.get(key, 0))) # Python 2/3 + n = len(value_list) + average = int(float(sum(value_list)) / float(n)) if n > 0 else 0 + maximum = max(value_list) + values[key] = {'avg': average, 'max': maximum} + + summary_dictionary["Max"] = {"maxVMEM": values['vmem'].get('max'), "maxPSS": values['pss'].get('max'), + "maxRSS": values['rss'].get('max'), "maxSwap": values['swap'].get('max')} + summary_dictionary["Avg"] = {"avgVMEM": values['vmem'].get('avg'), "avgPSS": values['pss'].get('avg'), + "avgRSS": values['rss'].get('avg'), "avgSwap": values['swap'].get('avg')} + + # add the last of the rchar, .., values + keys = ['rchar', 'wchar', 'read_bytes', 'write_bytes', 'nprocs'] + time_keys = ['stime', 'utime'] + keys = keys + time_keys + # warning: should read_bytes/write_bytes be reported as rbytes/wbytes? + for key in keys: + value = get_last_value(dictionary.get(key, None)) + if value: + if key in time_keys: + summary_dictionary["Time"][key] = value + else: + summary_dictionary["Other"][key] = value + + return summary_dictionary + + +def get_metadata_dict_from_txt(path, storejson=False, jobid=None): + """ + Convert memory monitor text output to json, store it, and return a selection as a dictionary. + + :param path: + :param storejson: store dictionary on disk if True (boolean). + :param jobid: job id (string). + :return: prmon metadata (dictionary). 
+ """ + + # get the raw memory monitor output, convert to dictionary + dictionary = convert_text_file_to_dictionary(path) + + if dictionary and storejson: + # add metadata + dictionary['type'] = 'MemoryMonitorData' + dictionary['pandaid'] = jobid + + path = os.path.join(os.path.dirname(path), get_memory_monitor_output_filename(suffix='json')) + logger.debug('writing prmon dictionary to: %s' % path) + write_json(path, dictionary) + else: + logger.debug('nothing to write (no prmon dictionary)') + + # filter dictionary? + # .. + + return dictionary + + +def convert_text_file_to_dictionary(path): + """ + Convert row-column text file to dictionary. + User first row identifiers as dictionary keys. + Note: file must follow the convention: + NAME1 NAME2 .. + value1 value2 .. + .. .. .. + + :param path: path to file (string). + :return: dictionary. + """ + + summary_keys = [] # to keep track of content + header_locked = False + dictionary = {} + + with open(path) as f: + for line in f: + line = convert_unicode_string(line) + if line != "": + try: + # Remove empty entries from list (caused by multiple \t) + _l = line.replace('\n', '') + _l = [_f for _f in _l.split('\t') if _f] + + # define dictionary keys + if type(_l[0]) == str and not header_locked: + summary_keys = _l + for key in _l: + dictionary[key] = [] + header_locked = True + else: # sort the memory measurements in the correct columns + for i, key in enumerate(_l): + # for key in _l: + key_entry = summary_keys[i] # e.g. Time + value = convert_to_int(key) + dictionary[key_entry].append(value) + except Exception: + logger.warning("unexpected format of utility output: %s" % line) + + return dictionary + + +def get_last_value(value_list): + value = None + if value_list: + value = value_list[-1] + return value + + +def get_average_summary_dictionary(path): + """ + Loop over the memory monitor output file and create the averaged summary dictionary. + + :param path: path to memory monitor txt output file (string). 
+ :return: summary dictionary. + """ + + maxvmem = -1 + maxrss = -1 + maxpss = -1 + maxswap = -1 + avgvmem = 0 + avgrss = 0 + avgpss = 0 + avgswap = 0 + totalvmem = 0 + totalrss = 0 + totalpss = 0 + totalswap = 0 + n = 0 + summary_dictionary = {} + + rchar = None + wchar = None + rbytes = None + wbytes = None + + first = True + with open(path) as f: + for line in f: + # Skip the first line + if first: + first = False + continue + line = convert_unicode_string(line) + if line != "": + try: + # Remove empty entries from list (caused by multiple \t) + _l = [_f for _f in line.split('\t') if _f] + # _time = _l[0] # 'Time' not user + vmem = _l[1] + pss = _l[2] + rss = _l[3] + swap = _l[4] + # note: the last rchar etc values will be reported + if len(_l) == 9: + rchar = int(_l[5]) + wchar = int(_l[6]) + rbytes = int(_l[7]) + wbytes = int(_l[8]) + else: + rchar = None + wchar = None + rbytes = None + wbytes = None + except Exception: + logger.warning("unexpected format of utility output: %s (expected format: Time, VMEM," + " PSS, RSS, Swap [, RCHAR, WCHAR, RBYTES, WBYTES])" % (line)) + else: + # Convert to int + ec1, maxvmem, totalvmem = get_max_memory_monitor_value(vmem, maxvmem, totalvmem) + ec2, maxpss, totalpss = get_max_memory_monitor_value(pss, maxpss, totalpss) + ec3, maxrss, totalrss = get_max_memory_monitor_value(rss, maxrss, totalrss) + ec4, maxswap, totalswap = get_max_memory_monitor_value(swap, maxswap, totalswap) + if ec1 or ec2 or ec3 or ec4: + logger.warning("will skip this row of numbers due to value exception: %s" % (line)) + else: + n += 1 + + # Calculate averages and store all values + summary_dictionary = {"Max": {}, "Avg": {}, "Other": {}} + summary_dictionary["Max"] = {"maxVMEM": maxvmem, "maxPSS": maxpss, "maxRSS": maxrss, "maxSwap": maxswap} + if rchar: + summary_dictionary["Other"]["rchar"] = rchar + if wchar: + summary_dictionary["Other"]["wchar"] = wchar + if rbytes: + summary_dictionary["Other"]["rbytes"] = rbytes + if wbytes: + 
summary_dictionary["Other"]["wbytes"] = wbytes + if n > 0: + avgvmem = int(float(totalvmem) / float(n)) + avgpss = int(float(totalpss) / float(n)) + avgrss = int(float(totalrss) / float(n)) + avgswap = int(float(totalswap) / float(n)) + summary_dictionary["Avg"] = {"avgVMEM": avgvmem, "avgPSS": avgpss, "avgRSS": avgrss, "avgSwap": avgswap} + + return summary_dictionary + + +def get_memory_values(workdir, name=""): + """ + Find the values in the memory monitor output file. + + In case the summary JSON file has not yet been produced, create a summary dictionary with the same format + using the output text file (produced by the memory monitor and which is updated once per minute). + + FORMAT: + {"Max":{"maxVMEM":40058624,"maxPSS":10340177,"maxRSS":16342012,"maxSwap":16235568}, + "Avg":{"avgVMEM":19384236,"avgPSS":5023500,"avgRSS":6501489,"avgSwap":5964997}, + "Other":{"rchar":NN,"wchar":NN,"rbytes":NN,"wbytes":NN}} + + :param workdir: relevant work directory (string). + :param name: name of memory monitor (string). + :return: memory values dictionary. + """ + + summary_dictionary = {} + + # Get the path to the proper memory info file (priority ordered) + path = get_memory_monitor_info_path(workdir, allowtxtfile=True) + if os.path.exists(path): + logger.info("using path: %s (trf name=%s)" % (path, name)) + + # Does a JSON summary file exist? 
If so, there's no need to calculate maximums and averages in the pilot + if path.lower().endswith('json'): + # Read the dictionary from the JSON file + summary_dictionary = read_json(path) + else: + # Loop over the output file, line by line, and look for the maximum PSS value + if name == "prmon": + summary_dictionary = get_average_summary_dictionary_prmon(path) + else: + summary_dictionary = get_average_summary_dictionary(path) + logger.debug('summary_dictionary=%s (trf name=%s)' % (str(summary_dictionary), name)) + else: + if path == "": + logger.warning("filename not set for memory monitor output") + else: + # Normally this means that the memory output file has not been produced yet + pass + + return summary_dictionary + + +def post_memory_monitor_action(job): + """ + Perform post action items for memory monitor. + + :param job: job object. + :return: + """ + + nap = 3 + path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename()) + path2 = os.environ.get('PILOT_HOME') + i = 0 + maxretry = 20 + while i <= maxretry: + if os.path.exists(path1): + break + logger.info("taking a short nap (%d s) to allow the memory monitor to finish writing to the summary file (#%d/#%d)" + % (nap, i, maxretry)) + time.sleep(nap) + i += 1 + + try: + copy(path1, path2) + except Exception as e: + logger.warning('failed to copy memory monitor output: %s' % e) + + def precleanup(): """ Pre-cleanup at the beginning of the job to remove any pre-existing files from previous jobs in the main work dir. 
@@ -18,4 +808,8 @@ def precleanup(): :return: """ - pass + logger.debug('performing pre-cleanup of potentially pre-existing files from earlier job in main work dir') + path = os.path.join(os.environ.get('PILOT_HOME'), get_memory_monitor_summary_filename()) + if os.path.exists(path): + logger.info('removing no longer needed file: %s' % path) + remove(path) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9a0e1932e..3d1754337 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '58' # build number should be reset to '1' for every new development cycle +BUILD = '59' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From ed59c55c1386b3441d8631c3c19720a76e2bf199 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 19 Apr 2023 11:28:45 +0200 Subject: [PATCH 076/154] Flake8 --- pilot/control/job.py | 1 - pilot/util/https.py | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 0eb478a1a..6423ec2b3 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -12,7 +12,6 @@ from __future__ import print_function # Python 2 -import json import os import time import hashlib diff --git a/pilot/util/https.py b/pilot/util/https.py index f724a71f3..5c2a7e29a 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -263,10 +263,7 @@ def update_ctx(): certdir = os.environ.get('X509_CERT_DIR', _ctx.capath) if certdir != _ctx.capath and os.path.exists(certdir): _ctx.capath = certdir - logger.debug(f"X509_CERT_DIR={os.environ.get('X509_CERT_DIR')}") - logger.debug(f"_ctx.capath={_ctx.capath}") - logger.debug(f"certdir={certdir}") 
- logger.debug(f"os.path.exists(certdir)={os.path.exists(certdir)}") + def get_curl_command(plain, dat, ipv): """ From 975bb7cd8f64152a44a5b8dbeeeed9252321f9a4 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 10:24:34 +0200 Subject: [PATCH 077/154] Fixed scan_for_jobs() for no job queues --- PILOTVERSION | 2 +- pilot/util/queuehandling.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/PILOTVERSION b/PILOTVERSION index 54c848536..8127358fb 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.58 \ No newline at end of file +3.6.0.59 \ No newline at end of file diff --git a/pilot/util/queuehandling.py b/pilot/util/queuehandling.py index ea655ab6c..1ab8f796e 100644 --- a/pilot/util/queuehandling.py +++ b/pilot/util/queuehandling.py @@ -52,6 +52,9 @@ def scan_for_jobs(queues): while time.time() - _t0 < 30: for queue in queues._fields: + # ignore queues with no job objects + if queue == 'completed_jobids' or queue == 'messages': + continue _queue = getattr(queues, queue) jobs = list(_queue.queue) if len(jobs) > 0: From 7689c04871c07d75b360970d962d8c55d618a511 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 11:06:57 +0200 Subject: [PATCH 078/154] Updated memory monitoring for sPHENIX --- pilot/user/sphenix/common.py | 151 +++++++++++++++++++++++++++----- pilot/user/sphenix/utilities.py | 23 +++-- pilot/util/constants.py | 2 +- 3 files changed, 145 insertions(+), 31 deletions(-) diff --git a/pilot/user/sphenix/common.py b/pilot/user/sphenix/common.py index 0582a717c..c88e17673 100644 --- a/pilot/user/sphenix/common.py +++ b/pilot/user/sphenix/common.py @@ -5,14 +5,28 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2022 +# - Paul Nilsson, paul.nilsson@cern.ch, 2017-2023 import os +import re from signal import SIGTERM from pilot.common.exception import TrfDownloadFailure from pilot.util.config import config -from pilot.util.constants import 
UTILITY_BEFORE_PAYLOAD, UTILITY_AFTER_PAYLOAD_STARTED +from pilot.util.constants import ( + UTILITY_BEFORE_PAYLOAD, + UTILITY_WITH_PAYLOAD, + UTILITY_AFTER_PAYLOAD_STARTED, + UTILITY_AFTER_PAYLOAD_FINISHED, + UTILITY_AFTER_PAYLOAD_STARTED2, + UTILITY_BEFORE_STAGEIN, + UTILITY_AFTER_PAYLOAD_FINISHED2 +) +from .utilities import ( + get_memory_monitor_setup, + post_memory_monitor_action, + get_memory_monitor_summary_filename +) from pilot.util.filehandling import read_file from .setup import get_analysis_trf @@ -124,35 +138,124 @@ def remove_redundant_files(workdir, outputfiles=None, piloterrors=[], debugmode= def get_utility_commands(order=None, job=None): """ - Return a dictionary of utility commands and arguments to be executed in parallel with the payload. - This could e.g. be memory and network monitor commands. A separate function can be used to determine the - corresponding command setups using the utility command name. - If the optional order parameter is set, the function should return the list of corresponding commands. - E.g. if order=UTILITY_BEFORE_PAYLOAD, the function should return all commands that are to be executed before the - payload. If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be prepended to the payload execution - string. If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be executed after the payload has been started - should be returned. + Return a dictionary of utility commands and arguments to be executed + in parallel with the payload. This could e.g. be memory and network + monitor commands. A separate function can be used to determine the + corresponding command setups using the utility command name. If the + optional order parameter is set, the function should return the list + of corresponding commands. 
- FORMAT: {'command': , 'args': } + For example: - :param order: optional sorting order (see pilot.util.constants) + If order=UTILITY_BEFORE_PAYLOAD, the function should return all + commands that are to be executed before the payload. + + If order=UTILITY_WITH_PAYLOAD, the corresponding commands will be + prepended to the payload execution string. + + If order=UTILITY_AFTER_PAYLOAD_STARTED, the commands that should be + executed after the payload has been started should be returned. + + If order=UTILITY_WITH_STAGEIN, the commands that should be executed + parallel with stage-in will be returned. + + FORMAT: {'command': , 'args': , 'label': , 'ignore_failure': } + + :param order: optional sorting order (see pilot.util.constants). :param job: optional job object. :return: dictionary of utilities to be executed in parallel with the payload. """ - return {} + if order == UTILITY_BEFORE_PAYLOAD and job.preprocess: + return {} + + if order == UTILITY_WITH_PAYLOAD: + return {} + + if order == UTILITY_AFTER_PAYLOAD_STARTED: + return get_utility_after_payload_started() + + if order == UTILITY_AFTER_PAYLOAD_STARTED2 and job.coprocess: + return {} + + if order == UTILITY_AFTER_PAYLOAD_FINISHED: + return {} + + if order == UTILITY_AFTER_PAYLOAD_FINISHED2 and job.postprocess: + return {} + + if order == UTILITY_BEFORE_STAGEIN: + return {} + + return None + + +def get_utility_after_payload_started(): + """ + Return the command dictionary for the utility after the payload has started. + + Command FORMAT: {'command': , 'args': , 'label': } + + :return: command (dictionary). + """ + + com = {} + try: + cmd = config.Pilot.utility_after_payload_started + except Exception: + pass + else: + if cmd: + com = {'command': cmd, 'args': '', 'label': cmd.lower(), 'ignore_failure': True} def get_utility_command_setup(name, job, setup=None): """ Return the proper setup for the given utility command. 
- If a payload setup is specified - :param name: - :param setup: - :return: - """ + If a payload setup is specified, then the utility command string should be prepended to it. - pass + :param name: name of utility (string). + :param job: job object. + :param setup: optional payload setup string. + :return: utility command setup (string). + """ + + if name == 'MemoryMonitor': + # must know if payload is running in a container or not + # (enables search for pid in ps output) + use_container = job.usecontainer or 'runcontainer' in job.transformation + dump_ps = ("PRMON_DEBUG" in job.infosys.queuedata.catchall) + + setup, pid = get_memory_monitor_setup( + job.pid, + job.pgrp, + job.jobid, + job.workdir, + job.command, + use_container=use_container, + transformation=job.transformation, + outdata=job.outdata, + dump_ps=dump_ps + ) + + _pattern = r"([\S]+)\ ." + pattern = re.compile(_pattern) + _name = re.findall(pattern, setup.split(';')[-1]) + if _name: + job.memorymonitor = _name[0] + else: + logger.warning('trf name could not be identified in setup string') + + # update the pgrp if the pid changed + if pid not in (job.pid, -1): + logger.debug('updating pgrp=%d for pid=%d', job.pgrp, pid) + try: + job.pgrp = os.getpgid(pid) + except Exception as exc: + logger.warning('os.getpgid(%d) failed with: %s', pid, exc) + return setup + + return "" def get_utility_command_execution_order(name): @@ -179,7 +282,8 @@ def post_utility_command_action(name, job): :return: """ - pass + if name == 'MemoryMonitor': + post_memory_monitor_action(job) def get_utility_command_kill_signal(name): @@ -202,7 +306,12 @@ def get_utility_command_output_filename(name, selector=None): :return: filename (string). 
""" - return "" + if name == 'MemoryMonitor': + filename = get_memory_monitor_summary_filename(selector=selector) + else: + filename = "" + + return filename def verify_job(job): diff --git a/pilot/user/sphenix/utilities.py b/pilot/user/sphenix/utilities.py index ec52d8a60..5390cbc75 100644 --- a/pilot/user/sphenix/utilities.py +++ b/pilot/user/sphenix/utilities.py @@ -12,7 +12,7 @@ from re import search # from pilot.info import infosys -from .setup import get_asetup +# from .setup import get_asetup from pilot.util.container import execute from pilot.util.filehandling import read_json, copy, write_json, remove from pilot.util.parameters import convert_to_int @@ -73,17 +73,22 @@ def get_memory_monitor_setup(pid, pgrp, jobid, workdir, command, setup="", use_c logger.warning('process id was not identified before payload finished - will not launch memory monitor') return "", pid - if not setup: - setup = get_asetup(asetup=False) - setup += 'lsetup prmon;' - if not setup.endswith(';'): - setup += ';' - - cmd = "prmon" + #if not setup: + # setup = get_asetup(asetup=False) + # setup += 'lsetup prmon;' + #if not setup.endswith(';'): + # setup += ';' + + path = os.environ.get('ATLAS_LOCAL_ROOT', '') + if path: + path = os.path.join(path, 'prmon/current/bin') + path += '/' + cmd = f"{path}prmon" interval = 60 options = " --pid %d --filename %s --json-summary %s --interval %d" %\ (pid, get_memory_monitor_output_filename(), get_memory_monitor_summary_filename(), interval) - cmd = "cd " + workdir + ";" + setup + cmd + options + #cmd = "cd " + workdir + ";" + setup + cmd + options + cmd = "cd " + workdir + ";" + cmd + options return cmd, pid diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3d1754337..de232c0d5 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates 
REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '59' # build number should be reset to '1' for every new development cycle +BUILD = '60' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d3af0af7a0f639a5f4419e48b8b32e8cfd685342 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 11:44:33 +0200 Subject: [PATCH 079/154] Updated cpu_arch --- PILOTVERSION | 2 +- pilot/user/atlas/utilities.py | 28 ++++++++++++++++++++++++++++ pilot/user/generic/utilities.py | 14 +++++++++++++- pilot/user/rubin/utilities.py | 14 +++++++++++++- pilot/user/sphenix/utilities.py | 12 ++++++++++++ pilot/util/constants.py | 2 +- pilot/util/workernode.py | 25 +++++++++++++++++++++++-- 7 files changed, 91 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 8127358fb..f30485b01 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.59 \ No newline at end of file +3.6.0.61 \ No newline at end of file diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index f903d2893..dcc9e871b 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -852,3 +852,31 @@ def precleanup(): if os.path.exists(path): logger.info('removing no longer needed file: %s' % path) remove(path) + + +def get_cpu_arch(): + """ + Return the CPU architecture string. + + The CPU architecture string is determined by a script (cpu_arch.py), run by the pilot but setup with lsetup. + For details about this script, see: https://its.cern.ch/jira/browse/ATLINFR-4844 + + :return: CPU arch (string). 
+ """ + + cpu_arch = '' + + # copy pilot source into container directory, unless it is already there + setup = get_asetup(asetup=False) + script = 'cpu_arch.py --alg gcc' + cmd = setup + '; ' + script + + # CPU arch script has now been copied, time to execute it + ec, stdout, stderr = execute(cmd) + if ec or stderr: + logger.debug(f'ec={ec}, stdout={stdout}, stderr={stderr}') + else: + cpu_arch = stdout + logger.debug(f'CPU arch script returned: {cpu_arch}') + + return cpu_arch diff --git a/pilot/user/generic/utilities.py b/pilot/user/generic/utilities.py index 0963c67c4..38cc1902c 100644 --- a/pilot/user/generic/utilities.py +++ b/pilot/user/generic/utilities.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023 import logging logger = logging.getLogger(__name__) @@ -19,3 +19,15 @@ def precleanup(): """ pass + + +def get_cpu_arch(): + """ + Return the CPU architecture string. + + If not returned by this function, the pilot will resort to use the internal scripts/cpu_arch.py. + + :return: CPU arch (string). + """ + + return "" diff --git a/pilot/user/rubin/utilities.py b/pilot/user/rubin/utilities.py index dbb12707b..023517cd3 100644 --- a/pilot/user/rubin/utilities.py +++ b/pilot/user/rubin/utilities.py @@ -5,7 +5,7 @@ # http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2021 +# - Paul Nilsson, paul.nilsson@cern.ch, 2020-2023 import os import time @@ -537,3 +537,15 @@ def precleanup(): """ pass + + +def get_cpu_arch(): + """ + Return the CPU architecture string. + + If not returned by this function, the pilot will resort to use the internal scripts/cpu_arch.py. + + :return: CPU arch (string). 
+ """ + + return "" diff --git a/pilot/user/sphenix/utilities.py b/pilot/user/sphenix/utilities.py index 5390cbc75..6bcc04094 100644 --- a/pilot/user/sphenix/utilities.py +++ b/pilot/user/sphenix/utilities.py @@ -818,3 +818,15 @@ def precleanup(): if os.path.exists(path): logger.info('removing no longer needed file: %s' % path) remove(path) + + +def get_cpu_arch(): + """ + Return the CPU architecture string. + + If not returned by this function, the pilot will resort to use the internal scripts/cpu_arch.py. + + :return: CPU arch (string). + """ + + return "" diff --git a/pilot/util/constants.py b/pilot/util/constants.py index de232c0d5..4016841a4 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '60' # build number should be reset to '1' for every new development cycle +BUILD = '61' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index 7131da494..a9d57cf9e 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -122,9 +122,9 @@ def get_cpu_flags(sorted=True): return flags -def get_cpu_arch(): +def get_cpu_arch_internal(): """ - Return the CPU architecture string. + Return the CPU architecture string (using internal script). The CPU architecture string is determined by a script (pilot/scripts/cpu_arch.py), run by the pilot. 
For details about this script, see: https://its.cern.ch/jira/browse/ATLINFR-4844 @@ -152,6 +152,27 @@ def get_cpu_arch(): return cpu_arch +# export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase;source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLo\ +# > calSetup.sh --quiet;lsetup cpu_flags; cpu_arch.py --alg gcc + +def get_cpu_arch(): + """ + Return the CPU architecture string. + + The CPU architecture string is determined by a script (cpu_arch.py), run by the pilot but setup with lsetup. + For details about this script, see: https://its.cern.ch/jira/browse/ATLINFR-4844 + + :return: CPU arch (string). + """ + + pilot_user = os.environ.get('PILOT_USER', 'generic').lower() + user = __import__('pilot.user.%s.utilities' % pilot_user, globals(), locals(), [pilot_user], 0) + cpu_arch = user.get_cpu_arch() + if not cpu_arch: + cpu_arch = get_cpu_arch_internal() + + return cpu_arch + def collect_workernode_info(path=None): """ From cd4235f9409d17b2ce9f45d91a6cad957feca6cb Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 12:08:40 +0200 Subject: [PATCH 080/154] Flake8 --- PILOTVERSION | 2 +- pilot/control/job.py | 7 ++++--- pilot/user/sphenix/common.py | 1 + pilot/util/constants.py | 2 +- pilot/util/workernode.py | 2 -- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f30485b01..ca2e72f7c 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.61 \ No newline at end of file +3.6.0.60 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 6423ec2b3..0fadfb11f 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -52,7 +52,7 @@ from pilot.util.queuehandling import scan_for_jobs, put_in_queue, queue_report, purge_queue from pilot.util.realtimelogger import cleanup as rtcleanup from pilot.util.timing import add_to_pilot_timing, timing_report, get_postgetjob_time, get_time_since, time_stamp -from pilot.util.workernode import get_disk_space, 
collect_workernode_info, get_node_name, get_cpu_model, get_cpu_cores #, get_cpu_arch +from pilot.util.workernode import get_disk_space, collect_workernode_info, get_node_name, get_cpu_model, get_cpu_cores, get_cpu_arch logger = logging.getLogger(__name__) errors = ErrorCodes() @@ -657,8 +657,9 @@ def get_data_structure(job, state, args, xml=None, metadata=None, final=False): if product and vendor: logger.debug(f'cpuConsumptionUnit: could have added: product={product}, vendor={vendor}') - #cpu_arch = get_cpu_arch() - #if cpu_arch: + cpu_arch = get_cpu_arch() + if cpu_arch: + logger.debug(f'cpu arch={cpu_arch}') # data['cpu_architecture_level'] = cpu_arch # add memory information if available diff --git a/pilot/user/sphenix/common.py b/pilot/user/sphenix/common.py index c88e17673..732d14732 100644 --- a/pilot/user/sphenix/common.py +++ b/pilot/user/sphenix/common.py @@ -207,6 +207,7 @@ def get_utility_after_payload_started(): else: if cmd: com = {'command': cmd, 'args': '', 'label': cmd.lower(), 'ignore_failure': True} + return com def get_utility_command_setup(name, job, setup=None): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4016841a4..de232c0d5 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '61' # build number should be reset to '1' for every new development cycle +BUILD = '60' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/workernode.py b/pilot/util/workernode.py index a9d57cf9e..99be9ca04 100644 --- a/pilot/util/workernode.py +++ b/pilot/util/workernode.py @@ -152,8 +152,6 @@ def get_cpu_arch_internal(): return cpu_arch -# export 
ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase;source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLo\ -# > calSetup.sh --quiet;lsetup cpu_flags; cpu_arch.py --alg gcc def get_cpu_arch(): """ From 138763e26778a2d2368c631dd89329911cc95381 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 12:16:28 +0200 Subject: [PATCH 081/154] Update --- pilot/control/monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 43fdbca1f..809830097 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -355,8 +355,8 @@ def get_max_running_time(lifetime, queuedata, queues, push, pod): max_running_time = lifetime if not queuedata: - logger.warning(f'queuedata could not be extracted from queues, will use default for max running time ' - f'({max_running_time}s)') + #logger.warning(f'queuedata could not be extracted from queues, will use default for max running time ' + # f'({max_running_time}s)') return max_running_time # for push queues: try to get the walltime from the job object first, in case it exists and is set From 758d7a4d16961557079c1385c65c00c6a851e5b1 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 12:24:14 +0200 Subject: [PATCH 082/154] Update --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ca2e72f7c..f2c62b9bc 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.60 \ No newline at end of file +3.6.0.62 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index de232c0d5..84e82af1a 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for 
every new version release, increased for small updates -BUILD = '60' # build number should be reset to '1' for every new development cycle +BUILD = '62' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 847fcfe86487367eff52b27c3fd62a0363af00d6 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 12:32:08 +0200 Subject: [PATCH 083/154] Added dask_scheduler_ip and jupyter_session_ip --- pilot/info/jobdata.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index 980844e2b..b621b3edb 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -146,6 +146,8 @@ class JobData(BaseData): containeroptions = {} # use_vp = False # True for VP jobs maxwalltime = 0 # maxWalltime in s + dask_scheduler_ip = '' # enhanced job definition for DASK jobs + jupyter_session_ip = '' # enhanced job definition for DASK jobs # home package string with additional payload release information; does not need to be added to # the conversion function since it's already lower case @@ -172,7 +174,7 @@ class JobData(BaseData): 'swrelease', 'zipmap', 'imagename', 'imagename_jobdef', 'accessmode', 'transfertype', 'datasetin', ## TO BE DEPRECATED: moved to FileSpec (job.indata) 'infilesguids', 'memorymonitor', 'allownooutput', 'pandasecrets', 'prodproxy', 'alrbuserplatform', - 'debug_command'], + 'debug_command', 'dask_scheduler_ip', 'jupyter_session_ip'], list: ['piloterrorcodes', 'piloterrordiags', 'workdirsizes', 'zombies', 'corecounts', 'subprocesses', 'logdata', 'outdata', 'indata'], dict: ['status', 'fileinfo', 'metadata', 'utilities', 'overwrite_queuedata', 'sizes', 'preprocess', @@ -486,7 +488,9 @@ def load(self, data, use_kmap=True): 'pandasecrets': 'secrets', 'pilotsecrets': 'pilotSecrets', 'requestid': 'reqID', - 'maxwalltime': 'maxWalltime' + 'maxwalltime': 'maxWalltime', + 'dask_scheduler_ip': 'scheduler_ip', + 'jupyter_session_ip': 'session_ip' } if 
use_kmap else {} self._load_data(data, kmap) From 5f5d08cc2f239ef1bf10c36638a797a3c2bb1f29 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 12:35:07 +0200 Subject: [PATCH 084/154] Added schedulerIP and sessionIP to job metrics for Dask jobs --- pilot/user/atlas/jobmetrics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pilot/user/atlas/jobmetrics.py b/pilot/user/atlas/jobmetrics.py index 26652f1a6..4dc40303d 100644 --- a/pilot/user/atlas/jobmetrics.py +++ b/pilot/user/atlas/jobmetrics.py @@ -81,6 +81,11 @@ def get_job_metrics_string(job): # extract event number from file and add to job metrics if it exists job_metrics = add_event_number(job_metrics, job.workdir) + # add DASK IPs if set + if job.dask_scheduler_ip and job.jupyter_session_ip: + job_metrics += get_job_metrics_entry("schedulerIP", job.dask_scheduler_ip) + job_metrics += get_job_metrics_entry("sessionIP", job.jupyter_session_ip) + return job_metrics From a0b0f681da7f1442a1f632b6de6fb8b5556bdaac Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 12:35:26 +0200 Subject: [PATCH 085/154] Update --- pilot/info/jobdata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index b621b3edb..4925eac99 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -146,8 +146,8 @@ class JobData(BaseData): containeroptions = {} # use_vp = False # True for VP jobs maxwalltime = 0 # maxWalltime in s - dask_scheduler_ip = '' # enhanced job definition for DASK jobs - jupyter_session_ip = '' # enhanced job definition for DASK jobs + dask_scheduler_ip = '' # enhanced job definition for Dask jobs + jupyter_session_ip = '' # enhanced job definition for Dask jobs # home package string with additional payload release information; does not need to be added to # the conversion function since it's already lower case From 4060392310e986fba36d2fe656b5624a6ae88312 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 
Apr 2023 16:03:09 +0200 Subject: [PATCH 086/154] Removed lsetup text from cpu_arch info --- PILOTVERSION | 2 +- pilot/user/atlas/utilities.py | 14 +++++++++++++- pilot/util/constants.py | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f2c62b9bc..92a90009a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.62 \ No newline at end of file +3.6.0.64 \ No newline at end of file diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index dcc9e871b..234effe25 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -866,8 +866,18 @@ def get_cpu_arch(): cpu_arch = '' + def filter_output(stdout): + """ Remove lsetup info """ + if stdout: + if stdout.endswith('\n'): + stdout = stdout[:-1] + tmp = stdout.split('\n') + stdout = tmp[-1] + + return stdout + # copy pilot source into container directory, unless it is already there - setup = get_asetup(asetup=False) + setup = get_asetup(asetup=False) + 'lsetup cpu_flags; ' script = 'cpu_arch.py --alg gcc' cmd = setup + '; ' + script @@ -876,6 +886,8 @@ def get_cpu_arch(): if ec or stderr: logger.debug(f'ec={ec}, stdout={stdout}, stderr={stderr}') else: + logger.debug(stdout) + stdout = filter_output(stdout) cpu_arch = stdout logger.debug(f'CPU arch script returned: {cpu_arch}') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 84e82af1a..1741c82df 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '62' # build number should be reset to '1' for every new development cycle +BUILD = '64' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 
455be95d4c7d9b2b43136c8a57e7d79830927138 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 20 Apr 2023 16:23:36 +0200 Subject: [PATCH 087/154] Corrected prmon setup (wrong pid) --- PILOTVERSION | 2 +- pilot/user/sphenix/common.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 92a90009a..79e41c5b6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.64 \ No newline at end of file +3.6.0.65 \ No newline at end of file diff --git a/pilot/user/sphenix/common.py b/pilot/user/sphenix/common.py index 732d14732..46f7e6c0e 100644 --- a/pilot/user/sphenix/common.py +++ b/pilot/user/sphenix/common.py @@ -224,7 +224,7 @@ def get_utility_command_setup(name, job, setup=None): if name == 'MemoryMonitor': # must know if payload is running in a container or not # (enables search for pid in ps output) - use_container = job.usecontainer or 'runcontainer' in job.transformation + use_container = False #job.usecontainer or 'runcontainer' in job.transformation dump_ps = ("PRMON_DEBUG" in job.infosys.queuedata.catchall) setup, pid = get_memory_monitor_setup( diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 1741c82df..25203624b 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '64' # build number should be reset to '1' for every new development cycle +BUILD = '65' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d680974aedde41d1e8e14bd4d0a5b45a73fe31bb Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Tue, 25 Apr 2023 14:39:44 -0700 Subject: [PATCH 088/154] update FRONTIER_SERVER --- 
pilot/user/atlas/resource/{cori.py => nersc.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename pilot/user/atlas/resource/{cori.py => nersc.py} (89%) diff --git a/pilot/user/atlas/resource/cori.py b/pilot/user/atlas/resource/nersc.py similarity index 89% rename from pilot/user/atlas/resource/cori.py rename to pilot/user/atlas/resource/nersc.py index 597a8ed6a..a606e318c 100644 --- a/pilot/user/atlas/resource/cori.py +++ b/pilot/user/atlas/resource/nersc.py @@ -64,8 +64,8 @@ def get_setup_command(job, prepareasetup): # test if HARVESTER_PYTHONPATH is defined if os.environ.get('HARVESTER_PYTHONPATH', '') != "": cmd += "export PYTHONPATH=$HARVESTER_PYTHONPATH:$PYTHONPATH;" - #unset FRONTIER_SERVER variable - cmd += "unset FRONTIER_SERVER" + #set FRONTIER_SERVER for NERSC + cmd += "export FRONTIER_SERVER=\"(serverurl=http://atlasfrontier-ai.cern.ch:8000/atlr)(serverurl=http://atlasfrontier2-ai.cern.ch:8000/atlr)(serverurl=http://atlasfrontier1-ai.cern.ch:8000/atlr)(proxyurl=http://frontiercache.nersc.gov:3128)\"" logger.debug('get_setup_command return value: {0}'.format(str(cmd))) From 317b6c8fdbab979a11d4d4135dfea69d1d3591c2 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Tue, 25 Apr 2023 14:51:24 -0700 Subject: [PATCH 089/154] flake8 --- pilot/user/atlas/resource/nersc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pilot/user/atlas/resource/nersc.py b/pilot/user/atlas/resource/nersc.py index a606e318c..7c05640ac 100644 --- a/pilot/user/atlas/resource/nersc.py +++ b/pilot/user/atlas/resource/nersc.py @@ -65,7 +65,11 @@ def get_setup_command(job, prepareasetup): if os.environ.get('HARVESTER_PYTHONPATH', '') != "": cmd += "export PYTHONPATH=$HARVESTER_PYTHONPATH:$PYTHONPATH;" #set FRONTIER_SERVER for NERSC - cmd += "export 
FRONTIER_SERVER=\"(serverurl=http://atlasfrontier-ai.cern.ch:8000/atlr)(serverurl=http://atlasfrontier2-ai.cern.ch:8000/atlr)(serverurl=http://atlasfrontier1-ai.cern.ch:8000/atlr)(proxyurl=http://frontiercache.nersc.gov:3128)\"" + cmd += ("export FRONTIER_SERVER=" + "\"(serverurl=http://atlasfrontier-ai.cern.ch:8000/atlr)" + "(serverurl=http://atlasfrontier2-ai.cern.ch:8000/atlr)" + "(serverurl=http://atlasfrontier1-ai.cern.ch:8000/atlr)" + "(proxyurl=http://frontiercache.nersc.gov:3128)\"") logger.debug('get_setup_command return value: {0}'.format(str(cmd))) From ee93877020cb894ece69aeeef7e09752f59503e1 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Tue, 25 Apr 2023 17:25:53 -0700 Subject: [PATCH 090/154] fix bug when appending preExec to existing executable --- pilot/eventservice/esprocess/esprocess.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 7a0e3209c..7c61ab458 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -126,7 +126,11 @@ def init_yampl_socket(self, executable): if "PILOT_EVENTRANGECHANNEL" in executable: executable = "export PILOT_EVENTRANGECHANNEL=\"%s\"; " % (socket_name) + executable elif "--preExec" not in executable: - executable += " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\'" % (socket_name) + pre_exec_param = " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\'" % (socket_name) + executable = executable().strip() + if executable.endswith(";"): + executable = executable[:-1] + executable += pre_exec_param else: if "import jobproperties as jps" in executable: executable = executable.replace("import jobproperties as jps;", From 20e755197f93e91c49f2021b0a31266c2d99bf29 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Tue, 25 Apr 2023 
17:47:26 -0700 Subject: [PATCH 091/154] unneeded temporary --- pilot/eventservice/esprocess/esprocess.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 7c61ab458..7dcfa374a 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -126,11 +126,10 @@ def init_yampl_socket(self, executable): if "PILOT_EVENTRANGECHANNEL" in executable: executable = "export PILOT_EVENTRANGECHANNEL=\"%s\"; " % (socket_name) + executable elif "--preExec" not in executable: - pre_exec_param = " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\'" % (socket_name) executable = executable().strip() if executable.endswith(";"): executable = executable[:-1] - executable += pre_exec_param + executable += " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\'" % (socket_name) else: if "import jobproperties as jps" in executable: executable = executable.replace("import jobproperties as jps;", From ee6515d76873d7907936a64858c951cbf3f291e0 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 26 Apr 2023 11:37:12 +0200 Subject: [PATCH 092/154] Corrected reqID --- PILOTVERSION | 2 +- pilot/info/jobdata.py | 3 ++- pilot/util/constants.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 79e41c5b6..bfd733424 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.65 \ No newline at end of file +3.6.0.66 \ No newline at end of file diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index 4925eac99..2c283f38d 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -314,7 +314,8 @@ def get_kmap(): 'filesize': 'fsize', 'checksum': 'checksum', 'scope': 'scopeIn', ##'??define_internal_key': 'prodDBlocks', 'storage_token': 'prodDBlockToken', - 
'ddmendpoint': 'ddmEndPointIn' + 'ddmendpoint': 'ddmEndPointIn', + 'requestid': 'reqID' } return kmap diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 25203624b..30e9703cc 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '65' # build number should be reset to '1' for every new development cycle +BUILD = '66' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 25ffa0aa6b2c35f3dd0518406cb8a64059e77a53 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 26 Apr 2023 13:42:04 +0200 Subject: [PATCH 093/154] Corrected cpu_flags setup --- PILOTVERSION | 2 +- pilot/user/atlas/utilities.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bfd733424..8e4214b20 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.66 \ No newline at end of file +3.6.0.67 \ No newline at end of file diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index 234effe25..cd8cdd727 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -879,7 +879,7 @@ def filter_output(stdout): # copy pilot source into container directory, unless it is already there setup = get_asetup(asetup=False) + 'lsetup cpu_flags; ' script = 'cpu_arch.py --alg gcc' - cmd = setup + '; ' + script + cmd = setup + script # CPU arch script has now been copied, time to execute it ec, stdout, stderr = execute(cmd) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 30e9703cc..4977fe5b5 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # 
released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '66' # build number should be reset to '1' for every new development cycle +BUILD = '67' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 996b5b40e3c177cf4020ecff7b72059ceae58121 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 26 Apr 2023 14:19:02 +0200 Subject: [PATCH 094/154] Updated PILOT_SOURCE_DIR setting --- PILOTVERSION | 2 +- pilot.py | 3 ++- pilot/util/constants.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 8e4214b20..6c6a50a27 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.67 \ No newline at end of file +3.6.0.68 \ No newline at end of file diff --git a/pilot.py b/pilot.py index eb2516c35..2e9b17c39 100755 --- a/pilot.py +++ b/pilot.py @@ -455,7 +455,8 @@ def set_environment_variables(): environ['PILOT_HOME'] = mainworkdir # TODO: replace with singleton # pilot source directory (e.g. /cluster/home/usatlas1/gram_scratch_hHq4Ns/condorg_oqmHdWxz) - environ['PILOT_SOURCE_DIR'] = args.sourcedir # TODO: replace with singleton + if not environ.get('PILOT_SOURCE_DIR', None): + environ['PILOT_SOURCE_DIR'] = args.sourcedir # TODO: replace with singleton # set the pilot user (e.g. 
ATLAS) environ['PILOT_USER'] = args.pilot_user # TODO: replace with singleton diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4977fe5b5..4c63178ce 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '67' # build number should be reset to '1' for every new development cycle +BUILD = '68' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 37860a871a72d95d7ac30ff0ab57b098c3723b6f Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 27 Apr 2023 11:25:04 +0200 Subject: [PATCH 095/154] Dask test --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 3 +++ pilot/util/constants.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6c6a50a27..ecfed0fcb 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.68 \ No newline at end of file +3.6.0.69 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index d11def620..56a974dce 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -549,6 +549,9 @@ def get_payload_command(job): show_memory_usage() + if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': + cmd = 'python3 -m pip install \"dask[complete]\"; ' + cmd + logger.info('payload run command: %s', cmd) return cmd diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4c63178ce..70ca70504 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # 
revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '68' # build number should be reset to '1' for every new development cycle +BUILD = '69' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d385b9b3b9f78c0a9dda102685b40e18ee214d4e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 27 Apr 2023 15:49:12 +0200 Subject: [PATCH 096/154] Removed test code --- pilot/user/atlas/common.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 56a974dce..6877361f8 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -548,10 +548,6 @@ def get_payload_command(job): cmd = add_athena_proc_number(cmd) show_memory_usage() - - if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': - cmd = 'python3 -m pip install \"dask[complete]\"; ' + cmd - logger.info('payload run command: %s', cmd) return cmd From 54b781c61e32d6a5a5dd2d847bcbe5a2a4d9f753 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 28 Apr 2023 10:59:21 +0200 Subject: [PATCH 097/154] Setting PYTHONPATH for DASK --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 3 +++ pilot/util/constants.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index ecfed0fcb..fe8d6e8a5 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.69 \ No newline at end of file +3.6.0.70 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 6877361f8..f40d920c1 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -547,6 +547,9 @@ def get_payload_command(job): # Explicitly add the ATHENA_PROC_NUMBER (or JOB value) cmd = add_athena_proc_number(cmd) + if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': + cmd = 'export PYTHONPATH=/usr/local/lib/python3.6/site-packages/dask:$PYTHONPATH' + cmd + show_memory_usage() 
logger.info('payload run command: %s', cmd) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 70ca70504..0ca2f9803 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '69' # build number should be reset to '1' for every new development cycle +BUILD = '70' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From b08598db87b37affbed10371ab777fa3d3525d9f Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 28 Apr 2023 11:26:39 +0200 Subject: [PATCH 098/154] Setting PYTHONPATH for DASK --- pilot/user/atlas/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index f40d920c1..5a65e318e 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -548,7 +548,7 @@ def get_payload_command(job): cmd = add_athena_proc_number(cmd) if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': - cmd = 'export PYTHONPATH=/usr/local/lib/python3.6/site-packages/dask:$PYTHONPATH' + cmd + cmd = 'export PYTHONPATH=/usr/lib64/python3.6:/usr/local/lib/python3.6/site-packages/dask:$PYTHONPATH' + cmd show_memory_usage() logger.info('payload run command: %s', cmd) From badff82727ea031776b73be14f3e43b757139d69 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 28 Apr 2023 18:35:22 +0200 Subject: [PATCH 099/154] Removed PYTHONPATH for DASK --- pilot/user/atlas/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index 5a65e318e..b7d7f0491 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -547,8 +547,8 @@ def 
get_payload_command(job): # Explicitly add the ATHENA_PROC_NUMBER (or JOB value) cmd = add_athena_proc_number(cmd) - if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': - cmd = 'export PYTHONPATH=/usr/lib64/python3.6:/usr/local/lib/python3.6/site-packages/dask:$PYTHONPATH' + cmd + #if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': + # cmd = 'export PYTHONPATH=/usr/lib64/python3.6:/usr/local/lib/python3.6/site-packages/dask:$PYTHONPATH' + cmd show_memory_usage() logger.info('payload run command: %s', cmd) From 8931f9105e5fdd5c3cc120390a15b9e9b5671c7e Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 28 Apr 2023 14:59:46 -0700 Subject: [PATCH 100/154] Support CA + eventService --- pilot/eventservice/esprocess/esprocess.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 7dcfa374a..f268f50ba 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -123,21 +123,29 @@ def stop_message_thread(self): def init_yampl_socket(self, executable): socket_name = self.__message_thread.get_yampl_socket_name() + + is_ca = "--CA" in executable + if is_ca: + preexec_socket_config = " --preExec \'ConfigFlags.MP.EventRangeChannel=\"%s\"\'" % (socket_name) + else: + preexec_socket_config = " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\'" % (socket_name) + if "PILOT_EVENTRANGECHANNEL" in executable: executable = "export PILOT_EVENTRANGECHANNEL=\"%s\"; " % (socket_name) + executable elif "--preExec" not in executable: executable = executable().strip() if executable.endswith(";"): executable = executable[:-1] - executable += " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\'" % (socket_name) + executable += preexec_socket_config else: if "import jobproperties as jps" in 
executable: executable = executable.replace("import jobproperties as jps;", "import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\";" % (socket_name)) + if is_ca: + logger.warning("Found jobproperties config in CA job") else: if "--preExec " in executable: - new_str = "--preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\' " % socket_name - executable = executable.replace("--preExec ", new_str) + executable = executable.replace("--preExec ", preexec_socket_config) else: logger.warn("--preExec has an unknown format - expected \'--preExec \"\' or \"--preExec \'\", got: %s" % (executable)) From fce1f4566c1213b30c465149b499f1ecc1da145e Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 28 Apr 2023 15:54:58 -0700 Subject: [PATCH 101/154] space --- pilot/eventservice/esprocess/esprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index f268f50ba..41db7621b 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -126,9 +126,9 @@ def init_yampl_socket(self, executable): is_ca = "--CA" in executable if is_ca: - preexec_socket_config = " --preExec \'ConfigFlags.MP.EventRangeChannel=\"%s\"\'" % (socket_name) + preexec_socket_config = " --preExec \'ConfigFlags.MP.EventRangeChannel=\"%s\"\' " % (socket_name) else: - preexec_socket_config = " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\'" % (socket_name) + preexec_socket_config = " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\' " % (socket_name) if "PILOT_EVENTRANGECHANNEL" in executable: executable = "export PILOT_EVENTRANGECHANNEL=\"%s\"; " % (socket_name) + executable From 3839f324e224a88c51e9c97357427b7bf729e9af Mon Sep 17 00:00:00 2001 From: Julien 
Esseiva Date: Fri, 28 Apr 2023 15:58:20 -0700 Subject: [PATCH 102/154] flake8 --- pilot/eventservice/esprocess/esprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 41db7621b..04800f0d1 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -128,7 +128,8 @@ def init_yampl_socket(self, executable): if is_ca: preexec_socket_config = " --preExec \'ConfigFlags.MP.EventRangeChannel=\"%s\"\' " % (socket_name) else: - preexec_socket_config = " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\' " % (socket_name) + preexec_socket_config = \ + " --preExec \'from AthenaMP.AthenaMPFlags import jobproperties as jps;jps.AthenaMPFlags.EventRangeChannel=\"%s\"\' " % (socket_name) if "PILOT_EVENTRANGECHANNEL" in executable: executable = "export PILOT_EVENTRANGECHANNEL=\"%s\"; " % (socket_name) + executable From d4c6325442a36a0691909ef40ee049a41778acb9 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Sat, 29 Apr 2023 15:14:16 +0200 Subject: [PATCH 103/154] Setting DASK_SCHEDULER_IP --- PILOTVERSION | 2 +- pilot/user/atlas/common.py | 3 +++ pilot/util/constants.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index fe8d6e8a5..6fb3c6c65 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.70 \ No newline at end of file +3.6.0.71 \ No newline at end of file diff --git a/pilot/user/atlas/common.py b/pilot/user/atlas/common.py index b7d7f0491..46cf65d60 100644 --- a/pilot/user/atlas/common.py +++ b/pilot/user/atlas/common.py @@ -550,6 +550,9 @@ def get_payload_command(job): #if os.environ.get('PILOT_QUEUE', '') == 'GOOGLE_DASK': # cmd = 'export PYTHONPATH=/usr/lib64/python3.6:/usr/local/lib/python3.6/site-packages/dask:$PYTHONPATH' + cmd + if job.dask_scheduler_ip: + cmd += f'export 
DASK_SCHEDULER_IP={job.dask_scheduler_ip}; ' + cmd + show_memory_usage() logger.info('payload run command: %s', cmd) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 0ca2f9803..88e28652e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '70' # build number should be reset to '1' for every new development cycle +BUILD = '71' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From e8c5f2fb79ab553cd855d2b82133e14203fb073e Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 2 May 2023 12:33:43 +0200 Subject: [PATCH 104/154] Preparing for list_replicas() --- pilot/api/data.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pilot/api/data.py b/pilot/api/data.py index 81088f0cb..3505e3a30 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -232,6 +232,13 @@ def resolve_replicas(self, files, use_vp=False): if not xfiles: # no files for replica look-up return files + # loop over rucio_client.list_replicas() in case of many input files + # replicas = list_replicas(xfiles) + + + + # def list_replicas(xfiles): + # load replicas from Rucio from rucio.client import Client c = Client() @@ -267,6 +274,10 @@ def resolve_replicas(self, files, use_vp=False): replicas = list(replicas) logger.debug("replicas received from Rucio: %s", replicas) + ### end of list_replicas() function + + + files_lfn = dict(((e.scope, e.lfn), e) for e in xfiles) for replica in replicas: k = replica['scope'], replica['name'] From 030ff729e472322684112a5fe71f4bff5fb9f56b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 2 May 2023 13:57:48 +0200 Subject: [PATCH 105/154] Added waiting time between 
server update --- pilot/util/default.cfg | 3 +++ pilot/util/https.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index cb4ad0e46..da4102229 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -130,6 +130,9 @@ utility_with_stagein: http_connect_timeout: 100 http_maxtime: 120 +# in case of server update failure, how long time should the pilot wait between attempts? (in seconds) +update_sleep: 120 + # Remote file open verification (if not wanted, clear the remotefileverification_log) remotefileverification_dictionary: remotefileverification_dictionary.json remotefileverification_log: remotefileslog.txt diff --git a/pilot/util/https.py b/pilot/util/https.py index 5c2a7e29a..4d042ae6e 100644 --- a/pilot/util/https.py +++ b/pilot/util/https.py @@ -496,6 +496,9 @@ def send_update(update_function, data, url, port, job=None, ipv='IPv6'): res['pilotSecrets'] = pilotsecrets attempt += 1 + if not done: + sleep(config.Pilot.update_sleep) + return res From 3b74c6ed4600829951e949c90713418c613bad63 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 2 May 2023 14:25:25 +0200 Subject: [PATCH 106/154] Added rucio host --- PILOTVERSION | 2 +- pilot.py | 5 ++++- pilot/util/constants.py | 2 +- pilot/util/default.cfg | 3 +++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6fb3c6c65..b4202bbf9 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.71 \ No newline at end of file +3.6.0.72 \ No newline at end of file diff --git a/pilot.py b/pilot.py index 2e9b17c39..b5d8900bb 100755 --- a/pilot.py +++ b/pilot.py @@ -24,6 +24,7 @@ from pilot.common.errorcodes import ErrorCodes from pilot.common.exception import PilotException +from pilot.util.config import config from pilot.info import infosys from pilot.util.auxiliary import pilot_version_banner, shell_exit_code from pilot.util.constants import SUCCESS, FAILURE, ERRNO_NOJOBS, PILOT_START_TIME, 
PILOT_END_TIME, get_pilot_version, \ @@ -70,6 +71,9 @@ def main(): if args.update_server: send_worker_status('started', args.queue, args.url, args.port, logger, 'IPv6') # note: assuming IPv6, fallback in place + if not args.rucio_host: + args.rucio_host = config.Rucio.host + # initialize InfoService try: infosys.init(args.queue) @@ -641,7 +645,6 @@ def set_redirectall(args): # get the args from the arg parser args = get_args() args.last_heartbeat = time.time() - # args.rucio_host = 'https://voatlasrucio-server-prod.cern.ch:443' # Define and set the main harvester control boolean args.harvester = is_harvester_mode(args) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 88e28652e..67cbeab24 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '71' # build number should be reset to '1' for every new development cycle +BUILD = '72' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index da4102229..e238d87d6 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -279,6 +279,9 @@ scratch: /tmp/scratch/ # Rucio server URL for traces url: https://rucio-lb-prod.cern.ch/traces/ +# Rucio host +host: https://voatlasrucio-server-prod.cern.ch:443 + ################################ # Message broker parameters From 0d4d7e4cc9ba0fca06bde59bcd23a85dbf64c6e1 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 2 May 2023 14:59:21 +0200 Subject: [PATCH 107/154] Refactored resolve_replicas() --- pilot/api/data.py | 96 +++++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 45 deletions(-) diff --git 
a/pilot/api/data.py b/pilot/api/data.py index 3505e3a30..fdd590d1d 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -215,7 +215,7 @@ def resolve_replicas(self, files, use_vp=False): :param files: list of `FileSpec` objects. :param use_vp: True for VP jobs (boolean). - :return: `files` + :return: files object. """ logger = self.logger @@ -224,7 +224,7 @@ def resolve_replicas(self, files, use_vp=False): show_memory_usage() for fdat in files: - ## skip fdat if need for further workflow (e.g. to properly handle OS ddms) + # skip fdat if need for further workflow (e.g. to properly handle OS ddms) xfiles.append(fdat) show_memory_usage() @@ -232,51 +232,11 @@ def resolve_replicas(self, files, use_vp=False): if not xfiles: # no files for replica look-up return files - # loop over rucio_client.list_replicas() in case of many input files - # replicas = list_replicas(xfiles) - - - - # def list_replicas(xfiles): - - # load replicas from Rucio - from rucio.client import Client - c = Client() - - show_memory_usage() - - location = self.detect_client_location() - if not location: - raise PilotException("Failed to get client location for Rucio", code=ErrorCodes.RUCIOLOCATIONFAILED) - - query = { - 'schemes': ['srm', 'root', 'davs', 'gsiftp', 'https', 'storm', 'file'], - 'dids': [dict(scope=e.scope, name=e.lfn) for e in xfiles], - } - query.update(sort='geoip', client_location=location) - # reset the schemas for VP jobs - if use_vp: - query['schemes'] = ['root'] - query['rse_expression'] = 'istape=False\\type=SPECIAL' - - # add signature lifetime for signed URL storages - query.update(signature_lifetime=24 * 3600) # note: default is otherwise 1h - - logger.info('calling rucio.list_replicas() with query=%s', query) - + # get the list of replicas try: - replicas = c.list_replicas(**query) + replicas = self.list_replicas(xfiles, use_vp) except Exception as exc: - raise PilotException("Failed to get replicas from Rucio: %s" % exc, code=ErrorCodes.RUCIOLISTREPLICASFAILED) - - 
show_memory_usage() - - replicas = list(replicas) - logger.debug("replicas received from Rucio: %s", replicas) - - ### end of list_replicas() function - - + raise exc files_lfn = dict(((e.scope, e.lfn), e) for e in xfiles) for replica in replicas: @@ -321,6 +281,52 @@ def resolve_replicas(self, files, use_vp=False): return files + def list_replicas(self, xfiles, use_vp): + """ + Wrapper around rucio_client.list_replicas() + + :param xfiles: files object. + :param use_vp: True for VP jobs (boolean). + :return: replicas (list). + """ + + # load replicas from Rucio + from rucio.client import Client + rucio_client = Client() + + show_memory_usage() + + location = self.detect_client_location() + if not location: + raise PilotException("Failed to get client location for Rucio", code=ErrorCodes.RUCIOLOCATIONFAILED) + + query = { + 'schemes': ['srm', 'root', 'davs', 'gsiftp', 'https', 'storm', 'file'], + 'dids': [dict(scope=e.scope, name=e.lfn) for e in xfiles], + } + query.update(sort='geoip', client_location=location) + # reset the schemas for VP jobs + if use_vp: + query['schemes'] = ['root'] + query['rse_expression'] = 'istape=False\\type=SPECIAL' + + # add signature lifetime for signed URL storages + query.update(signature_lifetime=24 * 3600) # note: default is otherwise 1h + + self.logger.info(f'calling rucio.list_replicas() with query={query}') + + try: + replicas = rucio_client.list_replicas(**query) + except Exception as exc: + raise PilotException(f"Failed to get replicas from Rucio: {exc}", code=ErrorCodes.RUCIOLISTREPLICASFAILED) + + show_memory_usage() + + replicas = list(replicas) + self.logger.debug(f"replicas received from Rucio: {replicas}") + + return replicas + def add_replicas(self, fdat, replica): """ Add the replicas to the fdat structure. 
From a88b6dfae81ca7e3d5f3d3e0ccee6c23528d9e40 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 2 May 2023 15:17:14 +0200 Subject: [PATCH 108/154] Update --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index b4202bbf9..6a6d13dd1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.72 \ No newline at end of file +3.6.0.73 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 67cbeab24..5a477e755 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '72' # build number should be reset to '1' for every new development cycle +BUILD = '73' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 09677bdf60d0d796e2fa59d92fb23f87b0ec8817 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 5 May 2023 09:59:19 +0200 Subject: [PATCH 109/154] Now reporting cpu arch --- PILOTVERSION | 2 +- pilot/control/job.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6a6d13dd1..a75d48955 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.73 \ No newline at end of file +3.6.0.74 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 0fadfb11f..c487bc911 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -660,7 +660,7 @@ def get_data_structure(job, state, args, xml=None, metadata=None, final=False): cpu_arch = get_cpu_arch() if cpu_arch: logger.debug(f'cpu arch={cpu_arch}') - # data['cpu_architecture_level'] = cpu_arch + 
data['cpu_architecture_level'] = cpu_arch # add memory information if available add_memory_info(data, job.workdir, name=job.memorymonitor) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 5a477e755..95de3ea78 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '73' # build number should be reset to '1' for every new development cycle +BUILD = '74' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d6b5961781a95a1a1fb3d9154f0d15474b54387a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 5 May 2023 17:08:52 +0200 Subject: [PATCH 110/154] Looking for mv_final_destination in catchall --- pilot/api/data.py | 16 ++++++++++++++++ pilot/copytool/mv.py | 18 +++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/pilot/api/data.py b/pilot/api/data.py index fdd590d1d..a8bd0b4ea 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -823,11 +823,24 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 kwargs['trace_report'] = self.trace_report self.logger.info('ready to transfer (stage-in) files: %s', remain_files) + # is there an override in catchall to allow mv to final destination (relevant for mv copytool only) + kwargs['mvfinaldest'] = self.allow_mvfinaldest(kwargs.get('catchall', '')) + # use bulk downloads if necessary # if kwargs['use_bulk_transfer'] # return copytool.copy_in_bulk(remain_files, **kwargs) return copytool.copy_in(remain_files, **kwargs) + def allow_mvfinaldest(self, catchall): + """ + Is there an override in catchall to allow mv to final destination? 
+ + :param catchall: catchall from queuedata (string) + :return: True if 'mv_final_destination' is present in catchall, otherwise False (Boolean) + """ + + return True if catchall and 'mv_final_destination' in catchall else False + def set_status_for_direct_access(self, files, workdir): """ Update the FileSpec status with 'remote_io' for direct access mode. @@ -1109,6 +1122,9 @@ def transfer_files(self, copytool, files, activity, **kwargs): # add the trace report kwargs['trace_report'] = self.trace_report + # is there an override in catchall to allow mv to final destination (relevant for mv copytool only) + kwargs['mvfinaldest'] = self.allow_mvfinaldest(kwargs.get('catchall', '')) + return copytool.copy_out(files, **kwargs) #class StageInClientAsync(object): diff --git a/pilot/copytool/mv.py b/pilot/copytool/mv.py index 89f175456..356224f5b 100644 --- a/pilot/copytool/mv.py +++ b/pilot/copytool/mv.py @@ -131,7 +131,11 @@ def copy_in(files, copy_type="symlink", **kwargs): logger.debug(f"workdir={kwargs.get('workdir')}") logger.debug(f"jobworkdir={kwargs.get('jobworkdir')}") - exit_code, stdout, stderr = move_all_files(files, copy_type, kwargs.get('workdir'), kwargs.get('jobworkdir')) + exit_code, stdout, stderr = move_all_files(files, + copy_type, + kwargs.get('workdir'), + kwargs.get('jobworkdir'), + mvfinaldest=kwargs.get('mvfinaldest', False)) if exit_code != 0: # raise failure raise StageInFailure(stdout) @@ -153,7 +157,11 @@ def copy_out(files, copy_type="mv", **kwargs): if not kwargs.get('workdir'): raise StageOutFailure("Workdir is not specified") - exit_code, stdout, stderr = move_all_files(files, copy_type, kwargs.get('workdir'), '') + exit_code, stdout, stderr = move_all_files(files, + copy_type, + kwargs.get('workdir'), + '', + mvfinaldest=kwargs.get('mvfinaldest', False)) if exit_code != 0: # raise failure if exit_code == ErrorCodes.MKDIR: @@ -170,7 +178,7 @@ def copy_out(files, copy_type="mv", **kwargs): return files -def move_all_files(files, 
copy_type, workdir, jobworkdir): +def move_all_files(files, copy_type, workdir, jobworkdir, mvfinaldest=False): """ Move all files. @@ -198,7 +206,7 @@ def move_all_files(files, copy_type, workdir, jobworkdir): name = fspec.lfn if fspec.filetype == 'input': - if user.mv_to_final_destination(): + if user.mv_to_final_destination() or mvfinaldest: subpath = user.get_path(fspec.scope, fspec.lfn) logger.debug(f'subpath={subpath}') source = os.path.join(workdir, subpath) @@ -209,7 +217,7 @@ def move_all_files(files, copy_type, workdir, jobworkdir): else: source = os.path.join(workdir, name) # is the copytool allowed to move files to the final destination (not in Nordugrid/ATLAS) - if user.mv_to_final_destination(): + if user.mv_to_final_destination() or mvfinaldest: # create any sub dirs if they don't exist already, and find the final destination path ec, diagnostics, destination = build_final_path(fspec.turl) if ec: From 70444f2367d0ad48acce1f89e03b570ea66c0cb3 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Mon, 8 May 2023 14:59:07 +0200 Subject: [PATCH 111/154] Ignore machine features failure --- pilot/control/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 809830097..5ee43a181 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -165,7 +165,7 @@ def run_shutdowntime_minute_check(time_since_start): try: shutdowntime = int(_shutdowntime) except (TypeError, ValueError) as exc: - logger.debug(f'failed to convert shutdowntime: {exc}') + #logger.debug(f'failed to convert shutdowntime: {exc}') return False # will be ignored else: logger.debug(f'machinefeatures shutdowntime={shutdowntime} - now={now}') From d5b9715470ccfa93bf435b3f798b06c34b466df6 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Mon, 8 May 2023 16:18:13 +0200 Subject: [PATCH 112/154] Removed abort --- PILOTVERSION | 2 +- pilot/control/job.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index a75d48955..05bd89ef8 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.74 \ No newline at end of file +3.6.0.75 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index c487bc911..597f25e39 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2845,7 +2845,7 @@ def job_monitor(queues, traces, args): # noqa: C901 logger.info('monitor loop #%d: job %d:%s is in state \'%s\'', n, i, current_id, jobs[i].state) if jobs[i].state == 'finished' or jobs[i].state == 'failed': logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) - abort = True + # abort = True - do not set abort here as it will abort the entire thread, not just the current monitor loop break # perform the monitoring tasks diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 95de3ea78..4f58b6e10 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '74' # build number should be reset to '1' for every new development cycle +BUILD = '75' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 773d587f0e7f51f2d963581d92792c5f39eea3d5 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 9 May 2023 10:18:31 +0200 Subject: [PATCH 113/154] Added mvfinaldestination support --- PILOTVERSION | 2 +- pilot/api/data.py | 20 ++++++++++---------- pilot/copytool/rucio.py | 3 ++- pilot/util/constants.py | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 05bd89ef8..92c8b014e 100644 --- a/PILOTVERSION +++ 
b/PILOTVERSION @@ -1 +1 @@ -3.6.0.75 \ No newline at end of file +3.6.0.78 \ No newline at end of file diff --git a/pilot/api/data.py b/pilot/api/data.py index a8bd0b4ea..5711929bd 100644 --- a/pilot/api/data.py +++ b/pilot/api/data.py @@ -100,6 +100,16 @@ def __init__(self, infosys_instance=None, acopytools=None, logger=None, default_ raise PilotException("failed to resolve acopytools settings") logger.info('configured copytools per activity: acopytools=%s', self.acopytools) + def allow_mvfinaldest(self, catchall): + """ + Is there an override in catchall to allow mv to final destination? + + :param catchall: catchall from queuedata (string) + :return: True if 'mv_final_destination' is present in catchall, otherwise False (Boolean) + """ + + return True if catchall and 'mv_final_destination' in catchall else False + def set_acopytools(self): """ Set the internal acopytools. @@ -831,16 +841,6 @@ def transfer_files(self, copytool, files, activity=None, **kwargs): # noqa: C90 # return copytool.copy_in_bulk(remain_files, **kwargs) return copytool.copy_in(remain_files, **kwargs) - def allow_mvfinaldest(self, catchall): - """ - Is there an override in catchall to allow mv to final destination? - - :param catchall: catchall from queuedata (string) - :return: True if 'mv_final_destination' is present in catchall, otherwise False (Boolean) - """ - - return True if catchall and 'mv_final_destination' in catchall else False - def set_status_for_direct_access(self, files, workdir): """ Update the FileSpec status with 'remote_io' for direct access mode. 
diff --git a/pilot/copytool/rucio.py b/pilot/copytool/rucio.py index 6728d7a3c..afa014f76 100644 --- a/pilot/copytool/rucio.py +++ b/pilot/copytool/rucio.py @@ -9,6 +9,7 @@ # - Alexey Anisenkov, anisyonk@cern.ch, 2018 # - Paul Nilsson, paul.nilsson@cern.ch, 2018-2021 # - Tomas Javurek, tomas.javurek@cern.ch, 2019 +# - Tomas Javurek, tomas.javurek@cern.ch, 2019 # - David Cameron, david.cameron@cern.ch, 2019 from __future__ import absolute_import # Python 2 (2to3 complains about this) @@ -591,7 +592,7 @@ def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, tra if rucio_host: logger.debug(f'using rucio_host={rucio_host}') rucio_client = Client(rucio_host=rucio_host) - upload_client = UploadClient(client=rucio_client, logger=logger) + upload_client = UploadClient(_client=rucio_client, logger=logger) else: upload_client = UploadClient(logger=logger) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4f58b6e10..fa554b3cc 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '75' # build number should be reset to '1' for every new development cycle +BUILD = '78' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 9ede324e4d0b0266662b74fd69de170453c094a6 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 9 May 2023 11:30:53 +0200 Subject: [PATCH 114/154] Updated run_checks() for graceful_stop and abort_job --- pilot/control/monitor.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 5ee43a181..76f8fe9c2 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -291,12 
+291,12 @@ def run_checks(queues, args): #else: # logger.debug(f'time since last successful heartbeat: {last_heartbeat} s') - if args.abort_job.is_set(): + if args.graceful_stop.is_set(): # find all running jobs and stop them, find all jobs in queues relevant to this module abort_jobs_in_queues(queues, args.signal) t_max = 2 * 60 - logger.warning('pilot monitor received instruction that abort_job has been requested') + logger.warning('pilot monitor received instruction that args.graceful_stop has been set') logger.warning(f'will wait for a maximum of {t_max} s for threads to finish') t_0 = time.time() ret = False @@ -310,10 +310,6 @@ def run_checks(queues, args): if ret: return - if not args.graceful_stop.is_set(): - logger.warning('setting graceful_stop') - args.graceful_stop.set() - if not args.job_aborted.is_set(): t_max = 60 logger.warning(f'will wait for a maximum of {t_max} s for graceful_stop to take effect') From 02c9e7845899d17ec288badff1b82a5faf48a665 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 9 May 2023 11:40:16 +0200 Subject: [PATCH 115/154] Updated functions for graceful_stop and abort_job --- pilot/control/data.py | 15 +++++++++------ pilot/util/constants.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index ff228c828..4b3090d24 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -539,14 +539,16 @@ def copytool_in(queues, traces, args): # noqa: C901 traces.pilot['command'] = 'abort' logger.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)') declare_failed_by_kill(job, queues.failed_data_in, args.signal) - break + if args.graceful_stop.is_set(): + break if _stage_in(args, job): if args.abort_job.is_set(): traces.pilot['command'] = 'abort' logger.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)') declare_failed_by_kill(job, queues.failed_data_in, args.signal) - break + if 
args.graceful_stop.is_set(): + break put_in_queue(job, queues.finished_data_in) # remove the job from the current stage-in queue @@ -604,9 +606,8 @@ def copytool_out(queues, traces, args): """ cont = True - logger.debug('entering copytool_out loop') if args.graceful_stop.is_set(): - logger.debug('graceful_stop already set') + logger.debug('graceful_stop already set - do not start copytool_out thread') processed_jobs = [] while cont: @@ -638,14 +639,16 @@ def copytool_out(queues, traces, args): traces.pilot['command'] = 'abort' logger.warning('copytool_out detected a set abort_job pre stage-out (due to a kill signal)') declare_failed_by_kill(job, queues.failed_data_out, args.signal) - break + if abort: + break if _stage_out_new(job, args): if args.abort_job.is_set(): traces.pilot['command'] = 'abort' logger.warning('copytool_out detected a set abort_job post stage-out (due to a kill signal)') #declare_failed_by_kill(job, queues.failed_data_out, args.signal) - break + if args.graceful_stop.is_set(): + break #queues.finished_data_out.put(job) processed_jobs.append(job.jobid) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index fa554b3cc..249ada36e 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '78' # build number should be reset to '1' for every new development cycle +BUILD = '79' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 5eb4b7f8745126c10889c7174813dc2da515e3fd Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 9 May 2023 14:04:50 +0200 Subject: [PATCH 116/154] Updated job monitor --- PILOTVERSION | 2 +- pilot/control/job.py | 47 
+++++++++++++++++++++++------------------ pilot/util/constants.py | 2 +- pilot/util/processes.py | 2 +- 4 files changed, 30 insertions(+), 23 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 92c8b014e..17a8a4f4d 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.78 \ No newline at end of file +3.6.0.80 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 597f25e39..36b2e9fab 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -519,7 +519,7 @@ def handle_backchannel_command(res, job, args, test_tobekilled=False): if job.pid: logger.debug('killing payload process') kill_process(job.pid) - #args.abort_job.set() + args.abort_job.set() elif 'softkill' in cmd: logger.info(f'pilot received a panda server signal to softkill job {job.jobid} at {time_stamp()}') # event service kill instruction @@ -2564,7 +2564,6 @@ def check_for_abort_job(args, caller=''): abort_job = False if args.abort_job.is_set(): logger.warning('%s detected an abort_job request (signal=%s)', caller, args.signal) - logger.warning('in case pilot is running more than one job, all jobs will be aborted') abort_job = True return abort_job @@ -2756,7 +2755,7 @@ def job_monitor(queues, traces, args): # noqa: C901 Monitoring of job parameters. This function monitors certain job parameters, such as job looping, at various time intervals. The main loop is executed once a minute, while individual verifications may be executed at any time interval (>= 1 minute). E.g. - looping jobs are checked once per ten minutes (default) and the heartbeat is send once per 30 minutes. Memory + looping jobs are checked once every ten minutes (default) and the heartbeat is sent once every 30 minutes. Memory usage is checked once a minute. :param queues: internal queues for job handling. 
@@ -2775,17 +2774,14 @@ def job_monitor(queues, traces, args): # noqa: C901 # overall loop counter (ignoring the fact that more than one job may be running) n = 0 - while not args.graceful_stop.is_set(): + cont = True + while cont: time.sleep(0.5) - # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) - # (abort at the end of the loop) - abort = should_abort(args, label='job:job_monitor') - if traces.pilot.get('command') == 'abort': logger.warning('job monitor received an abort command') - # check for any abort_job requests + # check for any abort_job requests (either kill signal or tobekilled command) abort_job = check_for_abort_job(args, caller='job monitor') if not abort_job: if not queues.current_data_in.empty(): @@ -2816,8 +2812,6 @@ def job_monitor(queues, traces, args): # noqa: C901 time.sleep(1) continue - time.sleep(60) - # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function jobs = queues.monitored_payloads.queue if args.workflow != 'stager' else None if jobs: @@ -2826,26 +2820,37 @@ def job_monitor(queues, traces, args): # noqa: C901 for i in range(len(jobs)): current_id = jobs[i].jobid - # if abort_job and signal was set + error_code = None if abort_job and args.signal: + # if abort_job and a kill signal was set error_code = get_signal_error(args.signal) + elif abort_job: # i.e. 
no kill signal + logger.info('tobekilled seen by job_monitor (error code should already be set) - abort job only') + elif os.environ.get('REACHED_MAXTIME', None): + # the batch system max time has been reached, time to abort (in the next step) + logger.info('REACHED_MAXTIME seen by job monitor - abort everything') + if not args.graceful_stop.is_set(): + logger.info('setting graceful_stop since it was not set already') + args.graceful_stop.set() + error_code = errors.REACHEDMAXTIME + sent_update = False + if error_code: jobs[i].state = 'failed' jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(error_code) jobs[i].completed = True - # update server immediately - send_state(jobs[i], args, jobs[i].state) + status = send_state(jobs[i], args, jobs[i].state) + if status: + sent_update = True if jobs[i].pid: logger.debug('killing payload processes') kill_processes(jobs[i].pid) - if os.environ.get('REACHED_MAXTIME', None): - # the batch system max time has been reached, time to abort (in the next step) - jobs[i].state = 'failed' - logger.info('monitor loop #%d: job %d:%s is in state \'%s\'', n, i, current_id, jobs[i].state) if jobs[i].state == 'finished' or jobs[i].state == 'failed': logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) # abort = True - do not set abort here as it will abort the entire thread, not just the current monitor loop + if not sent_update: # e.g. this is the case for tobekilled (error code not set in this function) + send_state(jobs[i], args, jobs[i].state) break # perform the monitoring tasks @@ -2905,8 +2910,10 @@ def job_monitor(queues, traces, args): # noqa: C901 n += 1 - if abort or abort_job: - break + # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) + abort = should_abort(args, label='job:job_monitor') + if abort: + cont = False # proceed to set the job_aborted flag? 
if threads_aborted(): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 249ada36e..bf54a1573 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '79' # build number should be reset to '1' for every new development cycle +BUILD = '80' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 7465d7769..28043820d 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -129,7 +129,7 @@ def dump_stack_trace(pid): def kill_processes(pid): """ - Kill process beloging to given process group. + Kill process belonging to the process group that the given pid belongs to. :param pid: process id (int). :return: From 0a0e4608ad4912d756e88a39ee6ed05e154a7dec Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 9 May 2023 14:59:56 +0200 Subject: [PATCH 117/154] Avoiding setting graceful_stop --- PILOTVERSION | 2 +- pilot/control/job.py | 10 +++++----- pilot/util/constants.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 17a8a4f4d..65ef5bde6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.80 \ No newline at end of file +3.6.0.81 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 36b2e9fab..25b209ddc 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -3026,11 +3026,11 @@ def check_job_monitor_waiting_time(args, peeking_time, abort_override=False): logger.warning(msg) else: print(msg) - if abort or abort_override: - # do not set graceful stop if pilot has not finished sending the final job update - # i.e. 
wait until SERVER_UPDATE is DONE_FINAL - check_for_final_server_update(args.update_server) - args.graceful_stop.set() + #if abort or abort_override: + # # do not set graceful stop if pilot has not finished sending the final job update + # # i.e. wait until SERVER_UPDATE is DONE_FINAL + # check_for_final_server_update(args.update_server) + # args.graceful_stop.set() def fail_monitored_job(job, exit_code, diagnostics, queues, traces): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index bf54a1573..23476ef89 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '80' # build number should be reset to '1' for every new development cycle +BUILD = '81' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 90c6286f4bc9b1607cdc29e2ec02c384df88e6ff Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 10 May 2023 10:14:47 +0200 Subject: [PATCH 118/154] Fixed problematic job.completed leading to tobekilled --- PILOTVERSION | 2 +- pilot/control/job.py | 12 +++++++++--- pilot/util/constants.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 65ef5bde6..1f5543614 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.81 \ No newline at end of file +3.6.0.82 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 25b209ddc..c61d5eae3 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -314,11 +314,10 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) state = get_proper_state(job, state) if state == 'finished' or state == 'holding' or state == 'failed': 
logger.info(f'this job has now completed (state={state})') - job.completed = True + # job.completed = True - do not set that here (only after the successful final server update) elif args.pod and args.workflow == 'stager': state = 'running' # stager pods should only send 'running' since harvester already has set the 'starting' state job.state = state - # job.completed = True # should the pilot make any server updates? if not args.update_server: @@ -354,6 +353,11 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) if final and os.path.exists(job.workdir): # ignore if workdir doesn't exist - might be a delayed jobUpdate os.environ['SERVER_UPDATE'] = SERVER_UPDATE_FINAL + + if state == 'finished' or state == 'holding' or state == 'failed': + logger.info(f'setting job as completed (state={state})') + job.completed = True + return True if final: @@ -1949,6 +1953,7 @@ def retrieve(queues, traces, args): # noqa: C901 # do not set graceful stop if pilot has not finished sending the final job update # i.e. wait until SERVER_UPDATE is DONE_FINAL check_for_final_server_update(args.update_server) + logger.warning('setting graceful_stop since proceed_with_getjob() returned False (pilot will end)') args.graceful_stop.set() break @@ -1965,13 +1970,14 @@ def retrieve(queues, traces, args): # noqa: C901 # do not set graceful stop if pilot has not finished sending the final job update # i.e. 
wait until SERVER_UPDATE is DONE_FINAL check_for_final_server_update(args.update_server) + logger.warning('setting graceful_stop since no job definition could be received (pilot will end)') args.graceful_stop.set() break if not res: getjob_failures += 1 if getjob_failures >= args.getjob_failures: - logger.warning(f'did not get a job -- max number of job request failures reached: {getjob_failures}') + logger.warning(f'did not get a job -- max number of job request failures reached: {getjob_failures} (setting graceful_stop)') args.graceful_stop.set() break diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 23476ef89..f2e083d03 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '81' # build number should be reset to '1' for every new development cycle +BUILD = '82' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From b370fded891495b829e899dbf8948e289ced2393 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 10 May 2023 10:19:22 +0200 Subject: [PATCH 119/154] Flake8 --- pilot/control/data.py | 2 +- pilot/control/job.py | 14 +++++++------- pilot/control/monitor.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index 4b3090d24..f08240e58 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -594,7 +594,7 @@ def copytool_in(queues, traces, args): # noqa: C901 logger.info('[data] copytool_in thread has finished') -def copytool_out(queues, traces, args): +def copytool_out(queues, traces, args): # noqa: C901 """ Main stage-out thread. 
Perform stage-out as soon as a job object can be extracted from the data_out queue. diff --git a/pilot/control/job.py b/pilot/control/job.py index c61d5eae3..1b64cdd83 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2909,10 +2909,10 @@ def job_monitor(queues, traces, args): # noqa: C901 elif os.environ.get('PILOT_JOB_STATE') == 'stagein': logger.info('job monitoring is waiting for stage-in to finish') - else: - # check the waiting time in the job monitor. set global graceful_stop if necessary - if args.workflow != 'stager': - check_job_monitor_waiting_time(args, peeking_time, abort_override=abort_job) + #else: + # # check the waiting time in the job monitor. set global graceful_stop if necessary + # if args.workflow != 'stager': + # check_job_monitor_waiting_time(args, peeking_time, abort_override=abort_job) n += 1 @@ -3024,10 +3024,10 @@ def check_job_monitor_waiting_time(args, peeking_time, abort_override=False): waiting_time = int(time.time()) - peeking_time msg = 'no jobs in monitored_payloads queue (waited for %d s)' % waiting_time if waiting_time > 60 * 60: - abort = True + # abort = True msg += ' - aborting' - else: - abort = False + #else: + # abort = False if logger: logger.warning(msg) else: diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 76f8fe9c2..65e606a96 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -164,7 +164,7 @@ def run_shutdowntime_minute_check(time_since_start): if _shutdowntime: try: shutdowntime = int(_shutdowntime) - except (TypeError, ValueError) as exc: + except (TypeError, ValueError): # as exc: #logger.debug(f'failed to convert shutdowntime: {exc}') return False # will be ignored else: From f4a682fb31a8f0093eba5dab56d93db884b0094c Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 10 May 2023 10:20:40 +0200 Subject: [PATCH 120/154] Flake8 --- pilot/control/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py 
index 1b64cdd83..9903a95c1 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -3024,8 +3024,8 @@ def check_job_monitor_waiting_time(args, peeking_time, abort_override=False): waiting_time = int(time.time()) - peeking_time msg = 'no jobs in monitored_payloads queue (waited for %d s)' % waiting_time if waiting_time > 60 * 60: - # abort = True msg += ' - aborting' + # abort = True #else: # abort = False if logger: From ff247bfa114af3931f26948775d308a65a6c5d11 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 10 May 2023 10:30:37 +0200 Subject: [PATCH 121/154] Code merge --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 1f5543614..f616117a6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.82 \ No newline at end of file +3.6.0.83 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index f2e083d03..85dd18fff 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '82' # build number should be reset to '1' for every new development cycle +BUILD = '83' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d26054ddb15ee9ca5465fb3389037f8338804caa Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 10 May 2023 10:37:52 +0200 Subject: [PATCH 122/154] Fixed missing job.completed --- pilot/control/job.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index 9903a95c1..0f7f46eb8 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2839,14 +2839,15 @@ def 
job_monitor(queues, traces, args): # noqa: C901 logger.info('setting graceful_stop since it was not set already') args.graceful_stop.set() error_code = errors.REACHEDMAXTIME - sent_update = False + sent_update = job.completed # job.completed gets set to True after a successful final server update if error_code: jobs[i].state = 'failed' jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(error_code) jobs[i].completed = True - status = send_state(jobs[i], args, jobs[i].state) - if status: - sent_update = True + if not sent_update: + status = send_state(jobs[i], args, jobs[i].state) + if status: + sent_update = True if jobs[i].pid: logger.debug('killing payload processes') kill_processes(jobs[i].pid) From 5ee1b66e9ff45ebbec9c3f956f5d8c74c9f8cfdc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 10 May 2023 11:57:47 +0200 Subject: [PATCH 123/154] job.completed update --- PILOTVERSION | 2 +- pilot/control/job.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f616117a6..e6ef3308a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.83 \ No newline at end of file +3.6.0.84 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 0f7f46eb8..31a4cdd7a 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2839,7 +2839,7 @@ def job_monitor(queues, traces, args): # noqa: C901 logger.info('setting graceful_stop since it was not set already') args.graceful_stop.set() error_code = errors.REACHEDMAXTIME - sent_update = job.completed # job.completed gets set to True after a successful final server update + sent_update = jobs[i].completed # job.completed gets set to True after a successful final server update if error_code: jobs[i].state = 'failed' jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(error_code) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 
85dd18fff..be868f465 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '83' # build number should be reset to '1' for every new development cycle +BUILD = '84' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 7ae3eb05f408ffdb6ade0896de3df41b3213d425 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 10 May 2023 16:14:51 +0200 Subject: [PATCH 124/154] Updated job.completed --- PILOTVERSION | 2 +- pilot/control/job.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e6ef3308a..6c2f2d3e1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.84 \ No newline at end of file +3.6.0.85 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 31a4cdd7a..f169346df 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2856,7 +2856,7 @@ def job_monitor(queues, traces, args): # noqa: C901 if jobs[i].state == 'finished' or jobs[i].state == 'failed': logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) # abort = True - do not set abort here as it will abort the entire thread, not just the current monitor loop - if not sent_update: # e.g. this is the case for tobekilled (error code not set in this function) + if not jobs[i].completed: # e.g. 
this is the case for tobekilled (error code not set in this function) send_state(jobs[i], args, jobs[i].state) break diff --git a/pilot/util/constants.py b/pilot/util/constants.py index be868f465..b7030ffa4 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '84' # build number should be reset to '1' for every new development cycle +BUILD = '85' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 5d56c8b6fb50efeca0dbae70685dff99a0bf463d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 10 May 2023 17:23:00 +0200 Subject: [PATCH 125/154] Updating threads_aborted() --- pilot/util/processes.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 28043820d..edf827c2d 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -619,6 +619,7 @@ def threads_aborted(): thread_count = threading.activeCount() pilot_thread_count = 0 daemon_threads = 0 + main_thread_count = 0 # count all threads still alive for thread in threading.enumerate(): @@ -626,12 +627,16 @@ def threads_aborted(): daemon_threads += 1 #tag = 'daemon' elif thread == threading.main_thread(): + main_thread_count += 1 #tag = 'main' - pass else: # only count threads spawned by the main thread, no the main thread itself or any daemon threads #tag = 'pilot?' 
pilot_thread_count += 1 - #logger.debug(f'thread={thread}, pilot_thread_count={pilot_thread_count}, daemon_thread_count={daemon_threads}, tag={tag}') + #logger.debug(f'thread={thread},' + # f'pilot_thread_count={pilot_thread_count}, ' + # f'daemon_thread_count={daemon_threads}, ' + # f'main_thread_count={main_thread_count}, ' + # f'tag={tag}') if pilot_thread_count == 0: logger.debug(f'aborting since only the main Pilot thread is still running ' f'(total thread count={thread_count} with {daemon_threads} daemon thread(s)') From 7650a900dd88652a604bce0a2265d03ac79a6933 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 10 May 2023 18:29:49 +0200 Subject: [PATCH 126/154] Updating threads_aborted() --- pilot/control/job.py | 1 + pilot/util/processes.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pilot/control/job.py b/pilot/control/job.py index f169346df..52f19b928 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -1955,6 +1955,7 @@ def retrieve(queues, traces, args): # noqa: C901 check_for_final_server_update(args.update_server) logger.warning('setting graceful_stop since proceed_with_getjob() returned False (pilot will end)') args.graceful_stop.set() + args.abort_job.set() break # store time stamp diff --git a/pilot/util/processes.py b/pilot/util/processes.py index edf827c2d..4be0b4d81 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -622,6 +622,7 @@ def threads_aborted(): main_thread_count = 0 # count all threads still alive + names = [] for thread in threading.enumerate(): if thread.isDaemon(): # ignore any daemon threads, they will be aborted when python ends daemon_threads += 1 @@ -632,6 +633,7 @@ def threads_aborted(): else: # only count threads spawned by the main thread, no the main thread itself or any daemon threads #tag = 'pilot?' 
pilot_thread_count += 1 + names.append(f'{thread}') #logger.debug(f'thread={thread},' # f'pilot_thread_count={pilot_thread_count}, ' # f'daemon_thread_count={daemon_threads}, ' From ef4a182acce80c26cbeda51427e64f94bbd5ad29 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 10 May 2023 19:09:23 +0200 Subject: [PATCH 127/154] Updating threads_aborted() --- pilot/control/data.py | 8 ++++---- pilot/control/job.py | 18 +++++++++--------- pilot/control/payload.py | 12 ++++++------ pilot/util/processes.py | 6 +++++- pilot/workflow/generic.py | 2 +- pilot/workflow/stager.py | 2 +- 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index f08240e58..13c407e31 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -81,7 +81,7 @@ def control(queues, traces, args): #abort_jobs_in_queues(queues, args.signal) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='control'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -585,7 +585,7 @@ def copytool_in(queues, traces, args): # noqa: C901 continue # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='copytool_in'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -671,7 +671,7 @@ def copytool_out(queues, traces, args): # noqa: C901 break # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='copytool_out'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -1083,7 +1083,7 @@ def queue_monitoring(queues, traces, args): break # proceed to set the job_aborted flag? 
- if threads_aborted(): + if threads_aborted(caller='queue_monitoring'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: diff --git a/pilot/control/job.py b/pilot/control/job.py index 52f19b928..8eb9a2058 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -108,7 +108,7 @@ def control(queues, traces, args): #abort_jobs_in_queues(queues, args.signal) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='control'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -1075,7 +1075,7 @@ def validate(queues, traces, args): put_in_queue(job, queues.failed_jobs) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='validate'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -1248,7 +1248,7 @@ def create_data_payload(queues, traces, args): put_in_queue(job, queues.payloads) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='create_data_payload'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -2059,7 +2059,7 @@ def retrieve(queues, traces, args): # noqa: C901 time.sleep(0.5) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='retrieve'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -2458,7 +2458,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 break # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='queue_monitor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -2611,7 +2611,7 @@ def interceptor(queues, traces, args): break # proceed to set the job_aborted flag? 
- if threads_aborted(): + if threads_aborted(caller='interceptor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -2676,7 +2676,7 @@ def message_listener(queues, traces, args): # proceed to set the job_aborted flag? if args.subscribe_to_msgsvc: - if threads_aborted(): + if threads_aborted(caller='message_listener'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -2748,7 +2748,7 @@ def fast_job_monitor(queues, traces, args): logger.debug('fast monitoring reported an error: %d', exit_code) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='fast_job_monitor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -2924,7 +2924,7 @@ def job_monitor(queues, traces, args): # noqa: C901 cont = False # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='job_monitor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: diff --git a/pilot/control/payload.py b/pilot/control/payload.py index 5d15a520a..b777d7054 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -85,7 +85,7 @@ def control(queues, traces, args): #abort_jobs_in_queues(queues, args.signal) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='control'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -120,7 +120,7 @@ def validate_pre(queues, traces, args): put_in_queue(job, queues.failed_payloads) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='validate_pre'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -300,7 +300,7 @@ def execute_payloads(queues, traces, args): # noqa: C901 time.sleep(5) # proceed to set the job_aborted flag? 
- if threads_aborted(): + if threads_aborted(caller='execute_payloads'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -534,7 +534,7 @@ def run_realtimelog(queues, traces, args): # noqa: C901 realtime_logger.sending_logs(args, job) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='run_realtimelog'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -729,7 +729,7 @@ def validate_post(queues, traces, args): put_in_queue(job, queues.data_out) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='validate_post'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: @@ -766,7 +766,7 @@ def failed_post(queues, traces, args): put_in_queue(job, queues.data_out) # proceed to set the job_aborted flag? - if threads_aborted(): + if threads_aborted(caller='failed_post'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() else: diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 4be0b4d81..e0c75aa5a 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -605,7 +605,7 @@ def threads_aborted_deprecated(abort_at=2): return aborted -def threads_aborted(): +def threads_aborted(caller=''): """ Have the Pilot threads been aborted? 
This function will count all the threads still running, but will only return True if all @@ -643,6 +643,10 @@ def threads_aborted(): logger.debug(f'aborting since only the main Pilot thread is still running ' f'(total thread count={thread_count} with {daemon_threads} daemon thread(s)') abort = True + elif pilot_thread_count == 1 and caller: + if caller in names[0]: + logger.info(f'caller={caller} is remaining thread - safe to abort') + abort = True return abort diff --git a/pilot/workflow/generic.py b/pilot/workflow/generic.py index 20ce7d095..5396a0cbc 100644 --- a/pilot/workflow/generic.py +++ b/pilot/workflow/generic.py @@ -194,7 +194,7 @@ def run(args): thread.join(0.1) # have all threads finished? - abort = threads_aborted() + abort = threads_aborted(caller='run') if abort: logger.debug(f'all relevant threads have aborted (thread count={threading.activeCount()})') break diff --git a/pilot/workflow/stager.py b/pilot/workflow/stager.py index 1ae4d103a..8e8c931a6 100644 --- a/pilot/workflow/stager.py +++ b/pilot/workflow/stager.py @@ -151,7 +151,7 @@ def run(args): if thread_count != threading.activeCount(): # has all threads finished? 
#abort = threads_aborted(abort_at=1) - abort = threads_aborted() + abort = threads_aborted(caller='run') if abort: break From d7db6f1fc30052485e01fb5f83f351c55a59d46b Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 11 May 2023 13:29:09 +0200 Subject: [PATCH 128/154] Updates to job_monitor and threads_aborted --- PILOTVERSION | 2 +- pilot/control/job.py | 21 ++++++++++++++------- pilot/util/constants.py | 2 +- pilot/util/processes.py | 20 +++++++++++--------- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 6c2f2d3e1..525dd9097 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.85 \ No newline at end of file +3.6.0.90 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 8eb9a2058..65c80ef9a 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2449,7 +2449,6 @@ def queue_monitor(queues, traces, args): # noqa: C901 else: # now ready for the next job (or quit) put_in_queue(job.jobid, queues.completed_jobids) - put_in_queue(job, queues.completed_jobs) if _job: del _job @@ -2476,6 +2475,10 @@ def update_server(job, args): :return: """ + if job.completed: + logger.warning('job has already completed - cannot send another final update') + return + # user specific actions pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) @@ -2840,15 +2843,13 @@ def job_monitor(queues, traces, args): # noqa: C901 logger.info('setting graceful_stop since it was not set already') args.graceful_stop.set() error_code = errors.REACHEDMAXTIME - sent_update = jobs[i].completed # job.completed gets set to True after a successful final server update + logger.debug(f'(1) jobs[i].completed={jobs[i].completed}') if error_code: jobs[i].state = 'failed' jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(error_code) jobs[i].completed = True - if not sent_update: - 
status = send_state(jobs[i], args, jobs[i].state) - if status: - sent_update = True + if not jobs[i].completed: # job.completed gets set to True after a successful final server update: + send_state(jobs[i], args, jobs[i].state) if jobs[i].pid: logger.debug('killing payload processes') kill_processes(jobs[i].pid) @@ -2861,6 +2862,8 @@ def job_monitor(queues, traces, args): # noqa: C901 send_state(jobs[i], args, jobs[i].state) break + logger.debug(f'(2) jobs[i].completed={jobs[i].completed}') + # perform the monitoring tasks exit_code, diagnostics = job_monitor_tasks(jobs[i], mt, args) logger.debug(f'job_monitor_tasks returned {exit_code}, {diagnostics}') @@ -2898,7 +2901,7 @@ def job_monitor(queues, traces, args): # noqa: C901 try: update_time = send_heartbeat_if_time(_job, args, update_time) except Exception as error: - logger.warning('(2) exception caught: %s (job id=%s)', error, current_id) + logger.warning('exception caught: %s (job id=%s)', error, current_id) break else: # note: when sending a state change to the server, the server might respond with 'tobekilled' @@ -3001,6 +3004,10 @@ def send_heartbeat_if_time(job, args, update_time): :return: possibly updated update_time (from time.time()). 
""" + if job.completed: + logger.info('job already completed - will not send any further updates') + return update_time + if int(time.time()) - update_time >= get_heartbeat_period(job.debug and job.debug_command): # check for state==running here, and send explicit 'running' in send_state, rather than sending job.state # since the job state can actually change in the meantime by another thread diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b7030ffa4..5b47b6fd0 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '85' # build number should be reset to '1' for every new development cycle +BUILD = '90' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index e0c75aa5a..9f0fccad6 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -626,24 +626,26 @@ def threads_aborted(caller=''): for thread in threading.enumerate(): if thread.isDaemon(): # ignore any daemon threads, they will be aborted when python ends daemon_threads += 1 - #tag = 'daemon' + tag = 'daemon' elif thread == threading.main_thread(): main_thread_count += 1 - #tag = 'main' + tag = 'main' else: # only count threads spawned by the main thread, no the main thread itself or any daemon threads - #tag = 'pilot?' + tag = 'pilot?' 
pilot_thread_count += 1 names.append(f'{thread}') - #logger.debug(f'thread={thread},' - # f'pilot_thread_count={pilot_thread_count}, ' - # f'daemon_thread_count={daemon_threads}, ' - # f'main_thread_count={main_thread_count}, ' - # f'tag={tag}') + logger.debug(f'thread={thread},' + f'caller={caller}, ' + f'pilot_thread_count={pilot_thread_count}, ' + f'daemon_thread_count={daemon_threads}, ' + f'main_thread_count={main_thread_count}, ' + f'names={names}, ' + f'tag={tag}') if pilot_thread_count == 0: logger.debug(f'aborting since only the main Pilot thread is still running ' f'(total thread count={thread_count} with {daemon_threads} daemon thread(s)') abort = True - elif pilot_thread_count == 1 and caller: + elif pilot_thread_count == 1 and caller and caller != 'run': if caller in names[0]: logger.info(f'caller={caller} is remaining thread - safe to abort') abort = True From d4b3476fdd5a8e401ab512f53467e05161455840 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 11 May 2023 16:56:28 +0200 Subject: [PATCH 129/154] Added debug info --- PILOTVERSION | 2 +- pilot/control/job.py | 17 ++++++++++++----- pilot/util/constants.py | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 525dd9097..f46a3edac 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.90 \ No newline at end of file +3.6.0.91 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 65c80ef9a..0766c66ff 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2438,7 +2438,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 # send final server update update_server(job, args) - + logger.debug(f'job.completed={job.completed}') # we can now stop monitoring this job, so remove it from the monitored_payloads queue and add it to the # completed_jobs queue which will tell retrieve() that it can download another job try: @@ -2793,6 +2793,7 @@ def job_monitor(queues, traces, args): # noqa: C901 # check 
for any abort_job requests (either kill signal or tobekilled command) abort_job = check_for_abort_job(args, caller='job monitor') + logger.debug(f'abort_job={abort_job}') if not abort_job: if not queues.current_data_in.empty(): # make sure to send heartbeat regularly if stage-in takes a long time @@ -2821,9 +2822,10 @@ def job_monitor(queues, traces, args): # noqa: C901 # sleep for a while if stage-in has not completed time.sleep(1) continue - + logger.debug('back here') # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function jobs = queues.monitored_payloads.queue if args.workflow != 'stager' else None + logger.debug(f'back here (jobs={jobs})') if jobs: # update the peeking time peeking_time = int(time.time()) @@ -2858,8 +2860,11 @@ def job_monitor(queues, traces, args): # noqa: C901 if jobs[i].state == 'finished' or jobs[i].state == 'failed': logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) # abort = True - do not set abort here as it will abort the entire thread, not just the current monitor loop + logger.debug(f'(10) jobs[i].completed={jobs[i].completed}') if not jobs[i].completed: # e.g. this is the case for tobekilled (error code not set in this function) + logger.debug(f'(11) jobs[i].completed={jobs[i].completed}') send_state(jobs[i], args, jobs[i].state) + logger.debug(f'(12) jobs[i].completed={jobs[i].completed}') break logger.debug(f'(2) jobs[i].completed={jobs[i].completed}') @@ -2911,22 +2916,24 @@ def job_monitor(queues, traces, args): # noqa: C901 put_in_queue(_job, queues.data_out) #abort = True break - + logger.debug('now here') elif os.environ.get('PILOT_JOB_STATE') == 'stagein': logger.info('job monitoring is waiting for stage-in to finish') #else: # # check the waiting time in the job monitor. 
set global graceful_stop if necessary # if args.workflow != 'stager': # check_job_monitor_waiting_time(args, peeking_time, abort_override=abort_job) - + logger.debug('and now here') n += 1 # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) abort = should_abort(args, label='job:job_monitor') + logger.debug(f'abort={abort}') if abort: cont = False - + logger.debug(f'cont={cont}') # proceed to set the job_aborted flag? + logger.debug('will check threads') if threads_aborted(caller='job_monitor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 5b47b6fd0..10dbdc611 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '90' # build number should be reset to '1' for every new development cycle +BUILD = '91' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 47f437676fccd993c8609bcc41fb1a4ddf4a4b49 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 11 May 2023 17:10:26 +0200 Subject: [PATCH 130/154] Cleanup --- pilot/util/processes.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 9f0fccad6..1988f6ac9 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -626,21 +626,21 @@ def threads_aborted(caller=''): for thread in threading.enumerate(): if thread.isDaemon(): # ignore any daemon threads, they will be aborted when python ends daemon_threads += 1 - tag = 'daemon' + #tag = 'daemon' elif thread == threading.main_thread(): main_thread_count += 1 - 
tag = 'main' + #tag = 'main' else: # only count threads spawned by the main thread, no the main thread itself or any daemon threads - tag = 'pilot?' pilot_thread_count += 1 names.append(f'{thread}') - logger.debug(f'thread={thread},' - f'caller={caller}, ' - f'pilot_thread_count={pilot_thread_count}, ' - f'daemon_thread_count={daemon_threads}, ' - f'main_thread_count={main_thread_count}, ' - f'names={names}, ' - f'tag={tag}') + #tag = 'pilot?' + #logger.debug(f'thread={thread},' + # f'caller={caller}, ' + # f'pilot_thread_count={pilot_thread_count}, ' + # f'daemon_thread_count={daemon_threads}, ' + # f'main_thread_count={main_thread_count}, ' + # f'names={names}, ' + # f'tag={tag}') if pilot_thread_count == 0: logger.debug(f'aborting since only the main Pilot thread is still running ' f'(total thread count={thread_count} with {daemon_threads} daemon thread(s)') From 22c5cf59a5c3bf4a8a3afd19eef337d39e3362de Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 12 May 2023 09:30:38 +0200 Subject: [PATCH 131/154] Removed send_state from job_monitor --- PILOTVERSION | 2 +- pilot/control/job.py | 21 +++++---------------- pilot/util/constants.py | 2 +- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index f46a3edac..c3bdfa99f 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.91 \ No newline at end of file +3.6.0.93 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 0766c66ff..ba6bc5d0f 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2413,7 +2413,7 @@ def queue_monitor(queues, traces, args): # noqa: C901 while i < imax and os.environ.get('PILOT_WRAP_UP', '') == 'NORMAL': job = get_finished_or_failed_job(args, queues) if job: - logger.debug('returned job has state=%s', job.state) + logger.debug(f'returned job has job.state={job.state} and job.completed={job.completed}') #if job.state == 'failed': # logger.warning('will abort failed job (should prepare for final 
server update)') break @@ -2822,10 +2822,9 @@ def job_monitor(queues, traces, args): # noqa: C901 # sleep for a while if stage-in has not completed time.sleep(1) continue - logger.debug('back here') + # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function jobs = queues.monitored_payloads.queue if args.workflow != 'stager' else None - logger.debug(f'back here (jobs={jobs})') if jobs: # update the peeking time peeking_time = int(time.time()) @@ -2845,7 +2844,6 @@ def job_monitor(queues, traces, args): # noqa: C901 logger.info('setting graceful_stop since it was not set already') args.graceful_stop.set() error_code = errors.REACHEDMAXTIME - logger.debug(f'(1) jobs[i].completed={jobs[i].completed}') if error_code: jobs[i].state = 'failed' jobs[i].piloterrorcodes, jobs[i].piloterrordiags = errors.add_error_code(error_code) @@ -2860,15 +2858,8 @@ def job_monitor(queues, traces, args): # noqa: C901 if jobs[i].state == 'finished' or jobs[i].state == 'failed': logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) # abort = True - do not set abort here as it will abort the entire thread, not just the current monitor loop - logger.debug(f'(10) jobs[i].completed={jobs[i].completed}') - if not jobs[i].completed: # e.g. 
this is the case for tobekilled (error code not set in this function) - logger.debug(f'(11) jobs[i].completed={jobs[i].completed}') - send_state(jobs[i], args, jobs[i].state) - logger.debug(f'(12) jobs[i].completed={jobs[i].completed}') break - logger.debug(f'(2) jobs[i].completed={jobs[i].completed}') - # perform the monitoring tasks exit_code, diagnostics = job_monitor_tasks(jobs[i], mt, args) logger.debug(f'job_monitor_tasks returned {exit_code}, {diagnostics}') @@ -2916,24 +2907,22 @@ def job_monitor(queues, traces, args): # noqa: C901 put_in_queue(_job, queues.data_out) #abort = True break - logger.debug('now here') + elif os.environ.get('PILOT_JOB_STATE') == 'stagein': logger.info('job monitoring is waiting for stage-in to finish') #else: # # check the waiting time in the job monitor. set global graceful_stop if necessary # if args.workflow != 'stager': # check_job_monitor_waiting_time(args, peeking_time, abort_override=abort_job) - logger.debug('and now here') + n += 1 # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) abort = should_abort(args, label='job:job_monitor') - logger.debug(f'abort={abort}') if abort: cont = False - logger.debug(f'cont={cont}') + # proceed to set the job_aborted flag? 
- logger.debug('will check threads') if threads_aborted(caller='job_monitor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 10dbdc611..d1aa3f542 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '91' # build number should be reset to '1' for every new development cycle +BUILD = '93' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From b0e22df4afe277e358ad25f19ffab919615bbfbd Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 12 May 2023 09:34:13 +0200 Subject: [PATCH 132/154] Added checks --- pilot/util/default.cfg | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index e238d87d6..280c1592b 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -143,6 +143,9 @@ base_trace_report: base_trace_report.json # JSON file for keeping job secrets (if any) pandasecrets: panda_secrets.json +# Pilot will run the following job independent checks (see also Payload section below) +checks: proxy,space + ################################ # Information service parameters @@ -194,6 +197,9 @@ metadata: metadata.xml payloadstdout: payload.stdout payloadstderr: payload.stderr +# Pilot will run the following job dependent checks (see also Pilot section above) +checks: looping + ################################ # Container parameters From 17f9d3457b5b928e624db525f846492a98cb6379 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 12 May 2023 09:38:09 +0200 Subject: [PATCH 133/154] Added checks --- pilot/util/default.cfg | 6 +++++- 1 file changed, 5 insertions(+), 
1 deletion(-) diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index 280c1592b..ef85ca3bb 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -144,7 +144,10 @@ base_trace_report: base_trace_report.json pandasecrets: panda_secrets.json # Pilot will run the following job independent checks (see also Payload section below) -checks: proxy,space +# proxy = standard proxy validation +# space = remaining disk space +# last_heartbeat = time since last successful heartbeat +checks: proxy,space,last_heartbeat ################################ # Information service parameters @@ -198,6 +201,7 @@ payloadstdout: payload.stdout payloadstderr: payload.stderr # Pilot will run the following job dependent checks (see also Pilot section above) +# looping = looping job check checks: looping ################################ From 37348b846224324570c54a588f623cf155d67801 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 12 May 2023 09:52:33 +0200 Subject: [PATCH 134/154] Added pilot checks --- pilot/control/monitor.py | 22 +++++++++++----------- pilot/util/common.py | 25 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 65e606a96..c81357362 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -20,6 +20,7 @@ from pilot.common.exception import PilotException, ExceededMaxWaitTime from pilot.util.auxiliary import check_for_final_server_update, set_pilot_state +from pilot.util.common import is_pilot_check from pilot.util.config import config from pilot.util.constants import MAX_KILL_WAIT_TIME # from pilot.util.container import execute @@ -279,17 +280,16 @@ def run_checks(queues, args): """ # check how long time has passed since last successful heartbeat - last_heartbeat = time.time() - args.last_heartbeat - if last_heartbeat > config.Pilot.lost_heartbeat and args.update_server: - diagnostics = f'too much time has passed since last successful 
heartbeat ({last_heartbeat} s)' - logger.warning(diagnostics) - logger.warning('aborting pilot - no need to wait for job to finish - kill everything') - args.job_aborted.set() - args.graceful_stop.set() - args.abort_job.clear() - raise ExceededMaxWaitTime(diagnostics) - #else: - # logger.debug(f'time since last successful heartbeat: {last_heartbeat} s') + if is_pilot_check(check='last_heartbeat'): + last_heartbeat = time.time() - args.last_heartbeat + if last_heartbeat > config.Pilot.lost_heartbeat and args.update_server: + diagnostics = f'too much time has passed since last successful heartbeat ({last_heartbeat} s)' + logger.warning(diagnostics) + logger.warning('aborting pilot - no need to wait for job to finish - kill everything') + args.job_aborted.set() + args.graceful_stop.set() + args.abort_job.clear() + raise ExceededMaxWaitTime(diagnostics) if args.graceful_stop.is_set(): # find all running jobs and stop them, find all jobs in queues relevant to this module diff --git a/pilot/util/common.py b/pilot/util/common.py index aad7b635a..da5af2a97 100644 --- a/pilot/util/common.py +++ b/pilot/util/common.py @@ -10,6 +10,7 @@ import os import logging +from pilot.util.config import config from pilot.util.constants import PILOT_KILL_SIGNAL from pilot.util.timing import get_time_since @@ -56,3 +57,27 @@ def was_pilot_killed(timing): if PILOT_KILL_SIGNAL in timing[i]: was_killed = True return was_killed + + +def is_pilot_check(check=''): + """ + Should the given pilot check be run? + + Consult config.Pilot.checks if the given check is listed. 
+ + :param check: name of check (string) + :return: True if check is present in config.Pilot.checks (or if config is outdated), False otherwise (Boolean) + """ + + status = False + if not check: + return status + + try: + if check in config.Pilot.checks: + status = True + except AttributeError as exc: + logger.warning(f'attribute Pilot.checks not present in config file - please update: exc={exc}') + status = True # to allow check to proceed when config file is outdated + + return status From d39da587e9a298f10fb26f6fb17324c8984f1c3c Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Fri, 12 May 2023 10:03:13 +0200 Subject: [PATCH 135/154] Added pilot checks --- PILOTVERSION | 2 +- pilot/control/monitor.py | 45 +++++++++++++++++++++------------------- pilot/util/constants.py | 2 +- pilot/util/default.cfg | 4 +++- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c3bdfa99f..9f560d4b8 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.93 \ No newline at end of file +3.6.0.94 \ No newline at end of file diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index c81357362..1653fee7b 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -100,37 +100,40 @@ def control(queues, traces, args): # noqa: C901 logger.info(f'{time_since_start}s have passed since pilot start') # every minute run the following check - if time.time() - last_minute_check > 60: - reached_maxtime = run_shutdowntime_minute_check(time_since_start) - if reached_maxtime: - reached_maxtime_abort(args) - break - last_minute_check = time.time() + if is_pilot_check(check='machinefeatures'): + if time.time() - last_minute_check > 60: + reached_maxtime = run_shutdowntime_minute_check(time_since_start) + if reached_maxtime: + reached_maxtime_abort(args) + break + last_minute_check = time.time() # take a nap time.sleep(1) # time to check the CPU usage? 
- if int(time.time() - tcpu) > cpuchecktime and False: # for testing only - processes = get_process_info('python3 pilot3/pilot.py', pid=getpid()) - if processes: - logger.info(f'PID={getpid()} has CPU usage={processes[0]}% CMD={processes[2]}') - nproc = processes[3] - if nproc > 1: - logger.info(f'.. there are {nproc} such processes running') - tcpu = time.time() + if is_pilot_check(check='cpu_usage'): + if int(time.time() - tcpu) > cpuchecktime and False: # for testing only + processes = get_process_info('python3 pilot3/pilot.py', pid=getpid()) + if processes: + logger.info(f'PID={getpid()} has CPU usage={processes[0]}% CMD={processes[2]}') + nproc = processes[3] + if nproc > 1: + logger.info(f'.. there are {nproc} such processes running') + tcpu = time.time() # proceed with running the other checks run_checks(queues, args) # thread monitoring - if int(time.time() - traces.pilot['lifetime_start']) % threadchecktime == 0: - # get all threads - for thread in threading.enumerate(): - # logger.info('thread name: %s', thread.name) - if not thread.is_alive(): - logger.fatal(f'thread \'{thread.name}\' is not alive') - # args.graceful_stop.set() + if is_pilot_check(check='threads'): + if int(time.time() - traces.pilot['lifetime_start']) % threadchecktime == 0: + # get all threads + for thread in threading.enumerate(): + # logger.info('thread name: %s', thread.name) + if not thread.is_alive(): + logger.fatal(f'thread \'{thread.name}\' is not alive') + # args.graceful_stop.set() niter += 1 diff --git a/pilot/util/constants.py b/pilot/util/constants.py index d1aa3f542..4a0307375 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '93' # build number should 
be reset to '1' for every new development cycle +BUILD = '94' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg index ef85ca3bb..8b923c420 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -147,7 +147,9 @@ pandasecrets: panda_secrets.json # proxy = standard proxy validation # space = remaining disk space # last_heartbeat = time since last successful heartbeat -checks: proxy,space,last_heartbeat +# machinefeatures = look for machinefeatures +# jobfeatures = look for jobfeatures +checks: proxy,space,last_heartbeat,machinefeatures,jobfeatures,cpu_usage,threads ################################ # Information service parameters From a25fb2bc4b7ab8574c014bf5d01cd426540b8f48 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 12 May 2023 14:31:16 +0200 Subject: [PATCH 136/154] Added thread names, increased waiting time --- PILOTVERSION | 2 +- pilot/control/monitor.py | 2 +- pilot/util/constants.py | 2 +- pilot/util/processes.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 9f560d4b8..5a05adff6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.94 \ No newline at end of file +3.6.0.96 \ No newline at end of file diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index 1653fee7b..d102438b7 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -314,7 +314,7 @@ def run_checks(queues, args): return if not args.job_aborted.is_set(): - t_max = 60 + t_max = 180 logger.warning(f'will wait for a maximum of {t_max} s for graceful_stop to take effect') t_0 = time.time() ret = False diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4a0307375..6752a7103 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for 
first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '94' # build number should be reset to '1' for every new development cycle +BUILD = '96' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 1988f6ac9..7080e64f6 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -643,7 +643,7 @@ def threads_aborted(caller=''): # f'tag={tag}') if pilot_thread_count == 0: logger.debug(f'aborting since only the main Pilot thread is still running ' - f'(total thread count={thread_count} with {daemon_threads} daemon thread(s)') + f'(total thread count={thread_count} with {daemon_threads} daemon thread(s): names={names}') abort = True elif pilot_thread_count == 1 and caller and caller != 'run': if caller in names[0]: From 5f280ad17d8cce8d8a09966008c1f92b57a82f3c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 12 May 2023 19:26:37 +0200 Subject: [PATCH 137/154] Removed useless comments. 
Always writing out all thread names --- pilot/control/data.py | 8 -------- pilot/control/job.py | 18 ------------------ pilot/control/payload.py | 12 ------------ pilot/util/constants.py | 2 +- pilot/util/processes.py | 2 +- 5 files changed, 2 insertions(+), 40 deletions(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index 13c407e31..eb0a4dab8 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -84,8 +84,6 @@ def control(queues, traces, args): if threads_aborted(caller='control'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[data] control thread has finished') @@ -588,8 +586,6 @@ def copytool_in(queues, traces, args): # noqa: C901 if threads_aborted(caller='copytool_in'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[data] copytool_in thread has finished') @@ -674,8 +670,6 @@ def copytool_out(queues, traces, args): # noqa: C901 if threads_aborted(caller='copytool_out'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[data] copytool_out thread has finished') @@ -1086,7 +1080,5 @@ def queue_monitoring(queues, traces, args): if threads_aborted(caller='queue_monitoring'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[data] queue_monitor thread has finished') diff --git a/pilot/control/job.py b/pilot/control/job.py index ba6bc5d0f..fe6282b4b 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -111,8 +111,6 @@ def control(queues, traces, args): if threads_aborted(caller='control'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[job] control thread has finished') # test 
kill signal during end of generic workflow @@ -1078,8 +1076,6 @@ def validate(queues, traces, args): if threads_aborted(caller='validate'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[job] validate thread has finished') @@ -1251,8 +1247,6 @@ def create_data_payload(queues, traces, args): if threads_aborted(caller='create_data_payload'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[job] create_data_payload thread has finished') @@ -2062,8 +2056,6 @@ def retrieve(queues, traces, args): # noqa: C901 if threads_aborted(caller='retrieve'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[job] retrieve thread has finished') @@ -2460,8 +2452,6 @@ def queue_monitor(queues, traces, args): # noqa: C901 if threads_aborted(caller='queue_monitor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[job] queue monitor thread has finished') @@ -2617,8 +2607,6 @@ def interceptor(queues, traces, args): if threads_aborted(caller='interceptor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[job] interceptor thread has finished') @@ -2682,8 +2670,6 @@ def message_listener(queues, traces, args): if threads_aborted(caller='message_listener'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') if args.amq: logger.debug('closing ActiveMQ connections') @@ -2754,8 +2740,6 @@ def fast_job_monitor(queues, traces, args): if threads_aborted(caller='fast_job_monitor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - 
logger.debug('will not set job_aborted yet') logger.info('[job] fast job monitor thread has finished') @@ -2926,8 +2910,6 @@ def job_monitor(queues, traces, args): # noqa: C901 if threads_aborted(caller='job_monitor'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[job] job monitor thread has finished') diff --git a/pilot/control/payload.py b/pilot/control/payload.py index b777d7054..08a77c811 100644 --- a/pilot/control/payload.py +++ b/pilot/control/payload.py @@ -88,8 +88,6 @@ def control(queues, traces, args): if threads_aborted(caller='control'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[payload] control thread has finished') @@ -123,8 +121,6 @@ def validate_pre(queues, traces, args): if threads_aborted(caller='validate_pre'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[payload] validate_pre thread has finished') @@ -303,8 +299,6 @@ def execute_payloads(queues, traces, args): # noqa: C901 if threads_aborted(caller='execute_payloads'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[payload] execute_payloads thread has finished') @@ -537,8 +531,6 @@ def run_realtimelog(queues, traces, args): # noqa: C901 if threads_aborted(caller='run_realtimelog'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[payload] run_realtimelog thread has finished') @@ -732,8 +724,6 @@ def validate_post(queues, traces, args): if threads_aborted(caller='validate_post'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[payload] 
validate_post thread has finished') @@ -769,7 +759,5 @@ def failed_post(queues, traces, args): if threads_aborted(caller='failed_post'): logger.debug('will proceed to set job_aborted') args.job_aborted.set() - else: - logger.debug('will not set job_aborted yet') logger.info('[payload] failed_post thread has finished') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 6752a7103..9440886ea 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '96' # build number should be reset to '1' for every new development cycle +BUILD = '97' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 7080e64f6..d72876c35 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -632,8 +632,8 @@ def threads_aborted(caller=''): #tag = 'main' else: # only count threads spawned by the main thread, no the main thread itself or any daemon threads pilot_thread_count += 1 - names.append(f'{thread}') #tag = 'pilot?' 
+ names.append(f'{thread}') #logger.debug(f'thread={thread},' # f'caller={caller}, ' # f'pilot_thread_count={pilot_thread_count}, ' From f4bed866e0ecbb3a4144564aceb5b13d58225990 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 12 May 2023 23:37:19 +0200 Subject: [PATCH 138/154] Corrections to threads --- PILOTVERSION | 2 +- pilot/control/monitor.py | 47 ++++++++++++++++++++++------------------ pilot/util/constants.py | 2 +- pilot/util/processes.py | 7 +++--- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 5a05adff6..c4aefcdb6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.96 \ No newline at end of file +3.6.0.98 \ No newline at end of file diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index d102438b7..a67cf7b1d 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -298,7 +298,7 @@ def run_checks(queues, args): # find all running jobs and stop them, find all jobs in queues relevant to this module abort_jobs_in_queues(queues, args.signal) - t_max = 2 * 60 + t_max = 5 * 60 logger.warning('pilot monitor received instruction that args.graceful_stop has been set') logger.warning(f'will wait for a maximum of {t_max} s for threads to finish') t_0 = time.time() @@ -306,32 +306,37 @@ def run_checks(queues, args): while time.time() - t_0 < t_max: if args.job_aborted.is_set(): logger.warning('job_aborted has been set - aborting pilot monitoring') - args.abort_job.clear() + #args.abort_job.clear() ret = True break time.sleep(1) if ret: return - if not args.job_aborted.is_set(): - t_max = 180 - logger.warning(f'will wait for a maximum of {t_max} s for graceful_stop to take effect') - t_0 = time.time() - ret = False - while time.time() - t_0 < t_max: - if args.job_aborted.is_set(): - logger.warning('job_aborted has been set - aborting pilot monitoring') - args.abort_job.clear() - ret = True - break - time.sleep(1) - if ret: - return - - diagnostics = 'reached maximum 
waiting time - threads should have finished' - args.abort_job.clear() - args.job_aborted.set() - raise ExceededMaxWaitTime(diagnostics) + diagnostics = 'reached maximum waiting time - threads should have finished (ignore exception)' + #args.abort_job.clear() + args.job_aborted.set() + raise ExceededMaxWaitTime(diagnostics) + +# if not args.job_aborted.is_set(): +# t_max = 180 +# logger.warning(f'will wait for a maximum of {t_max} s for graceful_stop to take effect') +# t_0 = time.time() +# ret = False +# while time.time() - t_0 < t_max: +# if args.job_aborted.is_set(): +# logger.warning('job_aborted has been set - aborting pilot monitoring') +# #args.abort_job.clear() +# ret = True +# break +# time.sleep(1) +# if ret: +# return + +# diagnostics = 'reached maximum waiting time - threads should have finished' +# args.abort_job.clear() +# args.job_aborted.set() +# raise ExceededMaxWaitTime(diagnostics) def get_max_running_time(lifetime, queuedata, queues, push, pod): diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 9440886ea..8a464a4bb 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '97' # build number should be reset to '1' for every new development cycle +BUILD = '98' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index d72876c35..3dea4d0d8 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -642,12 +642,13 @@ def threads_aborted(caller=''): # f'names={names}, ' # f'tag={tag}') if pilot_thread_count == 0: + logger.debug(f'caller={caller}, main_thread_count={main_thread_count}') 
logger.debug(f'aborting since only the main Pilot thread is still running ' f'(total thread count={thread_count} with {daemon_threads} daemon thread(s): names={names}') abort = True - elif pilot_thread_count == 1 and caller and caller != 'run': - if caller in names[0]: - logger.info(f'caller={caller} is remaining thread - safe to abort') + elif pilot_thread_count == 1 and caller: # and caller != 'run': + if caller in names[0] or caller == 'run': + logger.info(f'caller={caller} is remaining thread - safe to abort (names={names})') abort = True return abort From 14090a8535b85272c7350e94db83a21e080e7cee Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Sat, 13 May 2023 00:12:49 +0200 Subject: [PATCH 139/154] Cleanup --- pilot/control/job.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index fe6282b4b..c38825200 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2777,7 +2777,6 @@ def job_monitor(queues, traces, args): # noqa: C901 # check for any abort_job requests (either kill signal or tobekilled command) abort_job = check_for_abort_job(args, caller='job monitor') - logger.debug(f'abort_job={abort_job}') if not abort_job: if not queues.current_data_in.empty(): # make sure to send heartbeat regularly if stage-in takes a long time From c0534bffd372106174eedf49cba9ccd24e57d51a Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Sat, 13 May 2023 17:24:27 +0200 Subject: [PATCH 140/154] Setting job_aborted --- pilot/util/constants.py | 2 +- pilot/workflow/generic.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 8a464a4bb..bf926890f 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' 
for every new version release, increased for small updates -BUILD = '98' # build number should be reset to '1' for every new development cycle +BUILD = '99' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/workflow/generic.py b/pilot/workflow/generic.py index 5396a0cbc..792a0ebfc 100644 --- a/pilot/workflow/generic.py +++ b/pilot/workflow/generic.py @@ -196,6 +196,9 @@ def run(args): # have all threads finished? abort = threads_aborted(caller='run') if abort: + logger.debug('will proceed to set job_aborted') + args.job_aborted.set() + sleep(5) # allow monitor thread to finish (should pick up job_aborted within 1 second) logger.debug(f'all relevant threads have aborted (thread count={threading.activeCount()})') break From 32b36ac5d23c9f4a32732c169500e2c4e2fb2ddc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Sat, 13 May 2023 18:58:18 +0200 Subject: [PATCH 141/154] Testing later finishing of threads --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/processes.py | 16 +++++++++------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c4aefcdb6..bb976d117 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.98 \ No newline at end of file +3.6.0.100 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index bf926890f..27d7d90f5 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '99' # build number should be reset to '1' for every new development cycle +BUILD = '100' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git 
a/pilot/util/processes.py b/pilot/util/processes.py index 3dea4d0d8..8a10d0046 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -641,16 +641,18 @@ def threads_aborted(caller=''): # f'main_thread_count={main_thread_count}, ' # f'names={names}, ' # f'tag={tag}') - if pilot_thread_count == 0: - logger.debug(f'caller={caller}, main_thread_count={main_thread_count}') - logger.debug(f'aborting since only the main Pilot thread is still running ' - f'(total thread count={thread_count} with {daemon_threads} daemon thread(s): names={names}') - abort = True - elif pilot_thread_count == 1 and caller: # and caller != 'run': + #if pilot_thread_count == 0: + # logger.debug(f'caller={caller}, main_thread_count={main_thread_count}') + # logger.debug(f'aborting since only the main Pilot thread is still running ' + # f'(total thread count={thread_count} with {daemon_threads} daemon thread(s): names={names}') + # abort = True + if pilot_thread_count == 0 and caller: # and caller != 'run': if caller in names[0] or caller == 'run': logger.info(f'caller={caller} is remaining thread - safe to abort (names={names})') abort = True - + elif pilot_thread_count == 0: + logger.info(f'safe to abort? 
(names={names})') + abort = True return abort From 31eeb15c7e8610011c7a53cae112a6b0387a599a Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Tue, 16 May 2023 10:35:51 +0200 Subject: [PATCH 142/154] Added debug info --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/processes.py | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bb976d117..8662288f6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.100 \ No newline at end of file +3.6.0.101 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 27d7d90f5..c2499a791 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '100' # build number should be reset to '1' for every new development cycle +BUILD = '101' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 8a10d0046..fa7bd934c 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -653,6 +653,11 @@ def threads_aborted(caller=''): elif pilot_thread_count == 0: logger.info(f'safe to abort? 
(names={names})') abort = True + elif pilot_thread_count < 3: + logger.info(f'waiting for threads to finish: {names}' + f'(pilot_thread_count={pilot_thread_count}' + f'main_thread_count={main_thread_count}' + f'daemon_threads={daemon_threads})') return abort From 094292e51d88e9af9a0c3c9fa6ea539a28b4c4d3 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Tue, 16 May 2023 14:37:27 +0200 Subject: [PATCH 143/154] Update --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/processes.py | 13 ++++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 8662288f6..dae6097e6 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.101 \ No newline at end of file +3.6.0.103 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index c2499a791..1403332b8 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '101' # build number should be reset to '1' for every new development cycle +BUILD = '103' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index fa7bd934c..a754c89ab 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -653,11 +653,14 @@ def threads_aborted(caller=''): elif pilot_thread_count == 0: logger.info(f'safe to abort? 
(names={names})') abort = True - elif pilot_thread_count < 3: - logger.info(f'waiting for threads to finish: {names}' - f'(pilot_thread_count={pilot_thread_count}' - f'main_thread_count={main_thread_count}' - f'daemon_threads={daemon_threads})') + elif pilot_thread_count == 1: + mon = [thread for thread in names if ('monitor' in thread and '_monitor' not in thread)] # exclude job_monitor and queue_monitor(ing) + if mon: + logger.info(f'only monitor.control thread still running - safe to abort: {names}') + abort = True + else: + logger.info(f'waiting for thread to finish: {names}') + return abort From 8f69cfce7b70907b06d3392e328eac8d53c2e999 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 18 May 2023 12:40:59 +0200 Subject: [PATCH 144/154] Out-commented ps output --- PILOTVERSION | 2 +- pilot/user/atlas/utilities.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index dae6097e6..c2556f929 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.103 \ No newline at end of file +3.6.0.104 \ No newline at end of file diff --git a/pilot/user/atlas/utilities.py b/pilot/user/atlas/utilities.py index cd8cdd727..63b9ccb51 100644 --- a/pilot/user/atlas/utilities.py +++ b/pilot/user/atlas/utilities.py @@ -173,7 +173,7 @@ def get_proper_pid(pid, pgrp, jobid, command="", transformation="", outdata="", return -1 ps = get_ps_info(pgrp) - logger.debug('ps:\n%s' % ps) + #logger.debug('ps:\n%s' % ps) # lookup the process id using ps aux logger.debug(f'attempting to identify pid from job id ({jobid})') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 1403332b8..d2b2de558 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to 
'0' for every new version release, increased for small updates -BUILD = '103' # build number should be reset to '1' for every new development cycle +BUILD = '104' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 2fdd3b33f0b024ebc90bd5a0eddf3b945bce3ab2 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 18 May 2023 12:41:40 +0200 Subject: [PATCH 145/154] Cleanup --- pilot/control/job.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pilot/control/job.py b/pilot/control/job.py index c38825200..2b4465403 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2078,8 +2078,6 @@ def htcondor_envvar(jobid, processingtype): logger.info(f'set env var HTCondor_JOB_ID={globaljobid}') except Exception as exc: logger.warning(f'caught exception: {exc}') - else: - logger.debug('not a condor batch system - will not set HTCondor_JOB_ID') # REMOVE ME def handle_proxy(job): From 50ab8625c66c53af1d008abb182881d7f9a14b49 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 19 May 2023 11:33:02 +0200 Subject: [PATCH 146/154] ssl_enable and ssl_verify is now configurable --- pilot/util/constants.py | 2 +- pilot/util/default.cfg | 2 ++ pilot/util/realtimelogger.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pilot/util/constants.py b/pilot/util/constants.py index d2b2de558..2b5c1529a 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '104' # build number should be reset to '1' for every new development cycle +BUILD = '105' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/default.cfg b/pilot/util/default.cfg 
index 8b923c420..cb59a63ad 100644 --- a/pilot/util/default.cfg +++ b/pilot/util/default.cfg @@ -46,6 +46,8 @@ iddsserver: https://pandaserver.cern.ch:25443 # The log type and URL for the real-time logging server (format: ;) rtlogging:logstash;http://aipanda020.cern.ch:8443 +ssl_enable: True +ssl_verify: False # The heartbeat period in seconds (30*60 = 1800 s in normal mode, 5*60 = 300 s in debug mode) # A lost heartbeat is 60*60*3 s, i.e. 3h diff --git a/pilot/util/realtimelogger.py b/pilot/util/realtimelogger.py index b084ef0b8..e6a011226 100644 --- a/pilot/util/realtimelogger.py +++ b/pilot/util/realtimelogger.py @@ -148,7 +148,8 @@ def __init__(self, args, info_dic, workdir, secrets, level=INFO): transport = HttpTransport( server, port, - ssl_verify=False, + ssl_enable=config.Pilot.ssl_enable, + ssl_verify=config.Pilot.ssl_verify, timeout=5.0, username=secrets.get('logstash_login', 'unknown_login'), password=secrets.get('logstash_password', 'unknown_password') From 750d3cc7fe97357b1a9f32fd5d1f7377fa1a39e4 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 19 May 2023 17:40:56 +0200 Subject: [PATCH 147/154] Do not include daemon threads in log message --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/processes.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index c2556f929..e7e03e9c1 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.104 \ No newline at end of file +3.6.0.105 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 2b5c1529a..adb6649d6 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '105' # build 
number should be reset to '1' for every new development cycle +BUILD = '106' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/processes.py b/pilot/util/processes.py index a754c89ab..8d76a5cd4 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -630,10 +630,11 @@ def threads_aborted(caller=''): elif thread == threading.main_thread(): main_thread_count += 1 #tag = 'main' + names.append(f'{thread}') else: # only count threads spawned by the main thread, no the main thread itself or any daemon threads pilot_thread_count += 1 #tag = 'pilot?' - names.append(f'{thread}') + names.append(f'{thread}') #logger.debug(f'thread={thread},' # f'caller={caller}, ' # f'pilot_thread_count={pilot_thread_count}, ' From 2847e8b2f896de8f982593d741f9f49f69447a52 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 19 May 2023 17:58:48 +0200 Subject: [PATCH 148/154] Timing out threads if monitor thread will end --- pilot/control/monitor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pilot/control/monitor.py b/pilot/control/monitor.py index a67cf7b1d..9e58f1353 100644 --- a/pilot/control/monitor.py +++ b/pilot/control/monitor.py @@ -64,7 +64,8 @@ def control(queues, traces, args): # noqa: C901 while not args.graceful_stop.is_set(): # every few seconds, run the monitoring checks if args.graceful_stop.wait(1) or args.graceful_stop.is_set(): - logger.warning('aborting monitor loop since graceful_stop has been set') + logger.warning('aborting monitor loop since graceful_stop has been set (timing out remaining threads)') + run_checks(queues, args) break # abort if kill signal arrived too long time ago, ie loop is stuck From 23b48cab1dc8595b4173b476ee5c063f19d9fb6d Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Fri, 19 May 2023 18:18:41 +0200 Subject: [PATCH 149/154] Aborting loop if graceful_stop is set - for continue cases --- PILOTVERSION | 2 +- pilot/control/job.py | 9 +++++++++ 
2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/PILOTVERSION b/PILOTVERSION index e7e03e9c1..0c84bbd06 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.105 \ No newline at end of file +3.6.0.106 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 2b4465403..fc57dee92 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2768,6 +2768,14 @@ def job_monitor(queues, traces, args): # noqa: C901 n = 0 cont = True while cont: + + # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) + abort = should_abort(args, label='job:job_monitor') + if abort: + logger.info('aborting loop') + cont = False + break + time.sleep(0.5) if traces.pilot.get('command') == 'abort': @@ -2901,6 +2909,7 @@ def job_monitor(queues, traces, args): # noqa: C901 # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) abort = should_abort(args, label='job:job_monitor') if abort: + logger.info('will abort loop') cont = False # proceed to set the job_aborted flag? 
From 62f0438b10facf7edaf47e5b47869f428303b3c2 Mon Sep 17 00:00:00 2001 From: Julien Esseiva Date: Fri, 19 May 2023 11:09:37 -0700 Subject: [PATCH 150/154] Fix type in esprocess and added `ATLAS_POOLCOND_PATH` in NERSC resource --- pilot/eventservice/esprocess/esprocess.py | 2 +- pilot/user/atlas/resource/nersc.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pilot/eventservice/esprocess/esprocess.py b/pilot/eventservice/esprocess/esprocess.py index 04800f0d1..b1151a63f 100644 --- a/pilot/eventservice/esprocess/esprocess.py +++ b/pilot/eventservice/esprocess/esprocess.py @@ -134,7 +134,7 @@ def init_yampl_socket(self, executable): if "PILOT_EVENTRANGECHANNEL" in executable: executable = "export PILOT_EVENTRANGECHANNEL=\"%s\"; " % (socket_name) + executable elif "--preExec" not in executable: - executable = executable().strip() + executable = executable.strip() if executable.endswith(";"): executable = executable[:-1] executable += preexec_socket_config diff --git a/pilot/user/atlas/resource/nersc.py b/pilot/user/atlas/resource/nersc.py index 7c05640ac..4d704abe1 100644 --- a/pilot/user/atlas/resource/nersc.py +++ b/pilot/user/atlas/resource/nersc.py @@ -64,7 +64,8 @@ def get_setup_command(job, prepareasetup): # test if HARVESTER_PYTHONPATH is defined if os.environ.get('HARVESTER_PYTHONPATH', '') != "": cmd += "export PYTHONPATH=$HARVESTER_PYTHONPATH:$PYTHONPATH;" - #set FRONTIER_SERVER for NERSC + #set FRONTIER_SERVER and ATLAS_POOLCOND_PATH for NERSC + cmd += "export ATLAS_POOLCOND_PATH=/cvmfs/atlas-condb.cern.ch/repo/conditions;" cmd += ("export FRONTIER_SERVER=" "\"(serverurl=http://atlasfrontier-ai.cern.ch:8000/atlr)" "(serverurl=http://atlasfrontier2-ai.cern.ch:8000/atlr)" From f5da24acd94b896078102f788cb329642bb999ac Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Mon, 22 May 2023 09:41:05 +0200 Subject: [PATCH 151/154] Merged with Julien's PR --- pilot/util/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pilot/util/constants.py b/pilot/util/constants.py index adb6649d6..4352d353c 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '106' # build number should be reset to '1' for every new development cycle +BUILD = '107' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From d23ff96e9085c5e7c30e164b04bedbd7978ccf4b Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 24 May 2023 11:16:15 +0200 Subject: [PATCH 152/154] Flake8 --- pilot/util/processes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pilot/util/processes.py b/pilot/util/processes.py index 8d76a5cd4..5591df6a8 100644 --- a/pilot/util/processes.py +++ b/pilot/util/processes.py @@ -616,7 +616,7 @@ def threads_aborted(caller=''): """ abort = False - thread_count = threading.activeCount() + #thread_count = threading.activeCount() pilot_thread_count = 0 daemon_threads = 0 main_thread_count = 0 From 49118c770aa01372ec17bda0366b7c46b08bc186 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 24 May 2023 11:42:56 +0200 Subject: [PATCH 153/154] Merged with Julien's PR --- PILOTVERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PILOTVERSION b/PILOTVERSION index 0c84bbd06..514a660e7 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.106 \ No newline at end of file +3.6.0.107 \ No newline at end of file From e87fcd16777669758ff600b1132380be6ae4325c Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 24 May 2023 11:45:22 +0200 Subject: [PATCH 154/154] Merged with Julien's PR --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/PILOTVERSION b/PILOTVERSION index 514a660e7..ecc7664c9 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.0.107 \ No newline at end of file +3.6.0.108 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4352d353c..b4d22629c 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '0' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '107' # build number should be reset to '1' for every new development cycle +BUILD = '108' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1