Skip to content

Commit

Permalink
Delayed first CPU consumption time measurement
Browse files Browse the repository at this point in the history
  • Loading branch information
PalNilsson committed Dec 4, 2024
1 parent e8d5b8a commit 61670c5
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 6 deletions.
2 changes: 1 addition & 1 deletion PILOTVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.9.2.38
3.9.2.39
1 change: 1 addition & 0 deletions pilot/info/jobdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ class JobData(BaseData):
prodproxy = "" # to keep track of production proxy on unified queues
completed = False # True when job has finished or failed, used by https::send_update()
lsetuptime = 0 # payload setup time (lsetup)
runningstart = None # time when the payload started running (only for internal monitoring purposes, not the actual start time)

# time variable used for on-the-fly cpu consumption time measurements done by job monitoring
t0 = None # payload startup time
Expand Down
2 changes: 1 addition & 1 deletion pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
RELEASE = '3' # released number should be fixed at 3 for Pilot 3
VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '38' # build number should be reset to '1' for every new development cycle
BUILD = '39' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down
15 changes: 11 additions & 4 deletions pilot/util/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,25 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i

# update timing info for running jobs (to avoid an update after the job has finished)
if job.state == 'running':
# keep track of the time since the job started running (approximate since it is set here, move later)
if not job.runningstart:
job.runningstart = current_time

# check the disk space
# make sure that any utility commands are still running (and determine pid of memory monitor- as early as possible)
if job.utilities != {}:
utility_monitor(job)

# confirm that the worker node has a proper SC_CLK_TCK (problems seen on MPPMU)
check_hz()

# set the CPU consumption time for the job
exit_code, diagnostics = set_cpu_consumption_time(job)
if exit_code:
return exit_code, diagnostics
# set the CPU consumption time for the job (if it has been running for > 10s)
if job.runningstart and (current_time - job.runningstart) > 10:
exit_code, diagnostics = set_cpu_consumption_time(job)
if exit_code:
return exit_code, diagnostics
else:
logger.debug('skipping CPU consumption time check since job has not been running for long enough')

# keep track of the subprocesses running (store payload subprocess PIDs)
store_subprocess_pids(job)
Expand Down

0 comments on commit 61670c5

Please sign in to comment.