From 61670c5b4a8a7b8316b4da1dc49fdc8f2782fdcd Mon Sep 17 00:00:00 2001 From: PalNilsson Date: Wed, 4 Dec 2024 09:44:46 +0100 Subject: [PATCH] Delayed first CPU consumption time --- PILOTVERSION | 2 +- pilot/info/jobdata.py | 1 + pilot/util/constants.py | 2 +- pilot/util/monitoring.py | 15 +++++++++++---- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/PILOTVERSION b/PILOTVERSION index 4fb479ab..8c5d7627 100644 --- a/PILOTVERSION +++ b/PILOTVERSION @@ -1 +1 @@ -3.9.2.38 \ No newline at end of file +3.9.2.39 \ No newline at end of file diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py index b769f831..c9e41ad6 100644 --- a/pilot/info/jobdata.py +++ b/pilot/info/jobdata.py @@ -133,6 +133,7 @@ class JobData(BaseData): prodproxy = "" # to keep track of production proxy on unified queues completed = False # True when job has finished or failed, used by https::send_update() lsetuptime = 0 # payload setup time (lsetup) + runningstart = None # time when the payload started running (only for internal monitoring purposes, not the actual start time) # time variable used for on-the-fly cpu consumption time measurements done by job monitoring t0 = None # payload startup time diff --git a/pilot/util/constants.py b/pilot/util/constants.py index e7454438..4cc40074 100644 --- a/pilot/util/constants.py +++ b/pilot/util/constants.py @@ -28,7 +28,7 @@ RELEASE = '3' # released number should be fixed at 3 for Pilot 3 VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates -BUILD = '38' # build number should be reset to '1' for every new development cycle +BUILD = '39' # build number should be reset to '1' for every new development cycle SUCCESS = 0 FAILURE = 1 diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py index 8471904b..87a131ee 100644 --- a/pilot/util/monitoring.py +++ b/pilot/util/monitoring.py @@ -100,7 +100,11 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i # update timing info for running jobs (to avoid an update after the job has finished) if job.state == 'running': + # keep track of the time since the job started running (approximate since it is set here, move later) + if not job.runningstart: + job.runningstart = current_time + # check the disk space # make sure that any utility commands are still running (and determine pid of memory monitor- as early as possible) if job.utilities != {}: utility_monitor(job) @@ -108,10 +112,13 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i # confirm that the worker node has a proper SC_CLK_TCK (problems seen on MPPMU) check_hz() - # set the CPU consumption time for the job - exit_code, diagnostics = set_cpu_consumption_time(job) - if exit_code: - return exit_code, diagnostics + # set the CPU consumption time for the job (if it has been running for > 10s) + if job.runningstart and (current_time - job.runningstart) > 10: + exit_code, diagnostics = set_cpu_consumption_time(job) + if exit_code: + return exit_code, diagnostics + else: + logger.debug('skipping CPU consumption time check since job has not been running for long enough') # keep track of the subprocesses running (store payload subprocess PIDs) store_subprocess_pids(job)