From 1d275b86aa0df1f6b19835a5539943da16aa6258 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Jul 2023 15:53:37 +0200 Subject: [PATCH 1/8] New version --- PILOTVERSION | 2 +- pilot/util/constants.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index aeb99147..7297d6c0 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.3.8 \ No newline at end of file +3.6.4.1 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 4963987f..dd6d5898 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -13,8 +13,8 @@ # Pilot version RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates -REVISION = '3' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '8' # build number should be reset to '1' for every new development cycle +REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates +BUILD = '1' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 215e2f0bbb47c805f5bf244afdce3a5baa933aa3 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Jul 2023 16:16:19 +0200 Subject: [PATCH 2/8] No stage-out immediately after stage-in in stager mode --- pilot/control/data.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pilot/control/data.py b/pilot/control/data.py index 20180503..b92ac042 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -584,13 +584,13 @@ def copytool_in(queues, traces, args): # noqa: C901 logger.warning(f'path does not exist: {path}') # stage-out log file - job.stageout = "log" - if not _stage_out_new(job, args): - logger.info(f"job {job.jobid} failed during stage-out of log, adding job object to failed_data_outs queue") - put_in_queue(job, queues.failed_data_out) - else: - logger.info(f"job {job.jobid} has finished") - put_in_queue(job, queues.finished_jobs) + #job.stageout = "log" + #if not _stage_out_new(job, args): + # logger.info(f"job {job.jobid} failed during stage-out of log, adding job object to failed_data_outs queue") + # put_in_queue(job, queues.failed_data_out) + #else: + # logger.info(f"job {job.jobid} has finished") + # put_in_queue(job, queues.finished_jobs) logger.info('stage-in thread is no longer needed - terminating') abort = True From 89ea31a09d5c8746c6908638ea47300216092c59 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Jul 2023 16:53:13 +0200 Subject: [PATCH 3/8] Only stopping stage-in thread in stager mode --- PILOTVERSION | 2 +- pilot/control/data.py | 2 +- pilot/util/constants.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 7297d6c0..92fdfe2f 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.4.1 \ No newline at end of file +3.6.4.2 \ No newline at end of file diff --git a/pilot/control/data.py b/pilot/control/data.py index b92ac042..2441e77a 100644 --- a/pilot/control/data.py +++ b/pilot/control/data.py @@ -619,7 +619,7 @@ def copytool_in(queues, traces, args): # noqa: C901 logger.debug('an abort was received - finishing stage-in thread') # proceed to set the job_aborted flag? - if threads_aborted(caller='copytool_in'): + if threads_aborted(caller='copytool_in') and args.workflow != 'stager': # only finish this thread in stager mode logger.debug('will proceed to set job_aborted') args.job_aborted.set() diff --git a/pilot/util/constants.py b/pilot/util/constants.py index dd6d5898..a4f3a889 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '1' # build number should be reset to '1' for every new development cycle +BUILD = '2' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 487aafe9dd931ca0decc8bda114db70bfdeeb9d6 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Jul 2023 17:18:03 +0200 Subject: [PATCH 4/8] Update --- PILOTVERSION | 2 +- pilot/control/job.py | 5 +++++ pilot/util/constants.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 92fdfe2f..e1880a31 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.4.2 \ No newline at end of file +3.6.4.3 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index a6bf1629..2a9dc1a1 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2817,13 +2817,18 @@ def job_monitor(queues, traces, args): # noqa: C901 elif not queues.finished_data_in.empty(): # stage-in has finished, or there were no input files to begin with, job object ends up in finished_data_in queue if args.workflow == 'stager': + logger.debug('stage-in finished - waiting for lease time to finish') if args.pod: # wait maximum args.leasetime seconds, then abort time.sleep(10) time_now = int(time.time()) if time_now - start_time >= args.leasetime: logger.warning(f'lease time is up: {time_now - start_time} s has passed since start - abort stager pilot') + else: + logger.debug('tick-tock') + continue else: + logger.debug('continuing') continue else: logger.debug('stage-in has finished - no need for job_monitor to continue') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index a4f3a889..3d1382cb 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '2' # build number should be reset to '1' for every new development cycle +BUILD = '3' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 68c20477e4e727d9816b8c41a604054e46cd78dc Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Jul 2023 17:47:30 +0200 Subject: [PATCH 5/8] Update --- PILOTVERSION | 2 +- pilot/control/job.py | 8 +++++--- pilot/util/constants.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index e1880a31..4eaef03a 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.4.3 \ No newline at end of file +3.6.4.4 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 2a9dc1a1..ca803cc3 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2770,6 +2770,7 @@ def job_monitor(queues, traces, args): # noqa: C901 # overall loop counter (ignoring the fact that more than one job may be running) n = 0 cont = True + first = True while cont: # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set) @@ -2817,18 +2818,19 @@ def job_monitor(queues, traces, args): # noqa: C901 elif not queues.finished_data_in.empty(): # stage-in has finished, or there were no input files to begin with, job object ends up in finished_data_in queue if args.workflow == 'stager': - logger.debug('stage-in finished - waiting for lease time to finish') + if first: + logger.debug('stage-in finished - waiting for lease time to finish') + first = False if args.pod: # wait maximum args.leasetime seconds, then abort time.sleep(10) time_now = int(time.time()) if time_now - start_time >= args.leasetime: logger.warning(f'lease time is up: {time_now - start_time} s has passed since start - abort stager pilot') + args.graceful_stop.set() else: - logger.debug('tick-tock') continue else: - logger.debug('continuing') continue else: logger.debug('stage-in has finished - no need for job_monitor to continue') diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 3d1382cb..b16aae46 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '3' # build number should be reset to '1' for every new development cycle +BUILD = '4' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 0c01c0c08e235d3963aaf485a2575479b3e083fb Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Wed, 12 Jul 2023 18:32:09 +0200 Subject: [PATCH 6/8] Update --- PILOTVERSION | 2 +- pilot/control/job.py | 4 +++- pilot/util/constants.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 4eaef03a..691fa558 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.4.4 \ No newline at end of file +3.6.4.5 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index ca803cc3..19b24632 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2827,7 +2827,9 @@ def job_monitor(queues, traces, args): # noqa: C901 time_now = int(time.time()) if time_now - start_time >= args.leasetime: logger.warning(f'lease time is up: {time_now - start_time} s has passed since start - abort stager pilot') - args.graceful_stop.set() + jobs[i].stageout = 'log' # only stage-out log file + put_in_queue(jobs[i], queues.data_out) + #args.graceful_stop.set() else: continue else: diff --git a/pilot/util/constants.py b/pilot/util/constants.py index b16aae46..8ef74a9a 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '4' # build number should be reset to '1' for every new development cycle +BUILD = '5' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 From 7f1684fa692b9bf343829305089e3aae7725f295 Mon Sep 17 00:00:00 2001 From: Paul Nilsson Date: Thu, 13 Jul 2023 12:33:16 +0200 Subject: [PATCH 7/8] Reverted disk size check function, Corrected missing queuedata --- PILOTVERSION | 2 +- pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 58 +++++++++++++++++++++------------------- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 691fa558..bb69760d 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.4.5 \ No newline at end of file +3.6.4.6 \ No newline at end of file diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 8ef74a9a..216f8f76 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '5' # build number should be reset to '1' for every new development cycle +BUILD = '6' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 652519c3..7a127d6a 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -28,6 +28,7 @@ get_subprocesses from pilot.util.timing import get_time_since from pilot.util.workernode import get_local_disk_space, check_hz +from pilot.info import infosys import logging logger = logging.getLogger(__name__) @@ -652,36 +653,40 @@ def check_work_dir(job): if os.path.exists(job.workdir): # get the limit of the workdir maxwdirsize = get_max_allowed_work_dir_size() - workdirsize = get_disk_usage(job.workdir) - # is user dir within allowed size limit? - if workdirsize > maxwdirsize: - exit_code = errors.USERDIRTOOLARGE - diagnostics = f'work directory ({job.workdir}) is too large: {workdirsize} B (must be < {maxwdirsize} B)' - logger.fatal(diagnostics) + if os.path.exists(job.workdir): + workdirsize = get_disk_usage(job.workdir) - cmd = 'ls -altrR %s' % job.workdir - _ec, stdout, stderr = execute(cmd, mute=True) - logger.info(f'{cmd}:\n{stdout}') + # is user dir within allowed size limit? + if workdirsize > maxwdirsize: + exit_code = errors.USERDIRTOOLARGE + diagnostics = f'work directory ({job.workdir}) is too large: {workdirsize} B (must be < {maxwdirsize} B)' + logger.fatal(diagnostics) - # kill the job - set_pilot_state(job=job, state="failed") - job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code) - kill_processes(job.pid) + cmd = 'ls -altrR %s' % job.workdir + _ec, stdout, stderr = execute(cmd, mute=True) + logger.info(f'{cmd}:\n{stdout}') - # remove any lingering input files from the work dir - lfns, guids = job.get_lfns_and_guids() - if lfns: - remove_files(lfns, workdir=job.workdir) + # kill the job + set_pilot_state(job=job, state="failed") + job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(exit_code) + kill_processes(job.pid) - # re-measure the size of the workdir at this point since the value is stored below - workdirsize = get_disk_usage(job.workdir) - else: - logger.info(f'size of work directory {job.workdir}: {workdirsize} B (within {maxwdirsize} B limit)') + # remove any lingering input files from the work dir + lfns, guids = job.get_lfns_and_guids() + if lfns: + remove_files(lfns, workdir=job.workdir) - # Store the measured disk space (the max value will later be sent with the job metrics) - if workdirsize > 0: - job.add_workdir_size(workdirsize) + # remeasure the size of the workdir at this point since the value is stored below + workdirsize = get_disk_usage(job.workdir) + else: + logger.info(f'size of work directory {job.workdir}: {workdirsize} B (within {maxwdirsize} B limit)') + + # Store the measured disk space (the max value will later be sent with the job metrics) + if workdirsize > 0: + job.add_workdir_size(workdirsize) + else: + logger.warning(f'job work dir does not exist: {job.workdir}') else: logger.warning('skipping size check of workdir since it has not been created yet') @@ -713,16 +718,15 @@ def get_max_allowed_work_dir_size(): return maxwdirsize -def get_max_input_size(queuedata, megabyte=False): +def get_max_input_size(megabyte=False): """ Return a proper maxinputsize value. - :param queuedata: job.infosys.queuedata object. :param megabyte: return results in MB (Boolean). :return: max input size (int). """ - _maxinputsize = queuedata.maxwdir # normally 14336+2000 MB + _maxinputsize = infosys.queuedata.maxwdir # normally 14336+2000 MB max_input_file_sizes = 14 * 1024 * 1024 * 1024 # 14 GB, 14336 MB (pilot default) max_input_file_sizes_mb = 14 * 1024 # 14336 MB (pilot default) if _maxinputsize != "": From 9b1ed8d58f04226c1c29d7be57950dd82f8b87e0 Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Thu, 13 Jul 2023 16:24:29 +0200 Subject: [PATCH 8/8] Patch for lost heartbeat in job monitor loop --- PILOTVERSION | 2 +- pilot/control/job.py | 5 +++-- pilot/util/constants.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index bb69760d..7d1be157 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.6.4.6 \ No newline at end of file +3.6.4.7 \ No newline at end of file diff --git a/pilot/control/job.py b/pilot/control/job.py index 19b24632..6de5cdef 100644 --- a/pilot/control/job.py +++ b/pilot/control/job.py @@ -2834,8 +2834,9 @@ def job_monitor(queues, traces, args): # noqa: C901 continue else: continue - else: - logger.debug('stage-in has finished - no need for job_monitor to continue') + + if args.workflow == 'stager': + logger.debug('stage-in has finished - no need for job_monitor to continue') break # peek at the jobs in the validated_jobs queue and send the running ones to the heartbeat function diff --git a/pilot/util/constants.py b/pilot/util/constants.py index 216f8f76..a2dd8b14 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -14,7 +14,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '6' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '4' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '6' # build number should be reset to '1' for every new development cycle +BUILD = '7' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1