From 61670c5b4a8a7b8316b4da1dc49fdc8f2782fdcd Mon Sep 17 00:00:00 2001
From: PalNilsson <Paul.Nilsson@cern.ch>
Date: Wed, 4 Dec 2024 09:44:46 +0100
Subject: [PATCH] Delayed first CPU consumption time

---
 PILOTVERSION             |  2 +-
 pilot/info/jobdata.py    |  1 +
 pilot/util/constants.py  |  2 +-
 pilot/util/monitoring.py | 15 +++++++++++----
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/PILOTVERSION b/PILOTVERSION
index 4fb479ab..8c5d7627 100644
--- a/PILOTVERSION
+++ b/PILOTVERSION
@@ -1 +1 @@
-3.9.2.38
\ No newline at end of file
+3.9.2.39
\ No newline at end of file
diff --git a/pilot/info/jobdata.py b/pilot/info/jobdata.py
index b769f831..c9e41ad6 100644
--- a/pilot/info/jobdata.py
+++ b/pilot/info/jobdata.py
@@ -133,6 +133,7 @@ class JobData(BaseData):
     prodproxy = ""                 # to keep track of production proxy on unified queues
     completed = False              # True when job has finished or failed, used by https::send_update()
     lsetuptime = 0                 # payload setup time (lsetup)
+    runningstart = None            # time when the payload started running (only for internal monitoring purposes, not the actual start time)
 
     # time variable used for on-the-fly cpu consumption time measurements done by job monitoring
     t0 = None                      # payload startup time
diff --git a/pilot/util/constants.py b/pilot/util/constants.py
index e7454438..4cc40074 100644
--- a/pilot/util/constants.py
+++ b/pilot/util/constants.py
@@ -28,7 +28,7 @@
 RELEASE = '3'   # released number should be fixed at 3 for Pilot 3
 VERSION = '9'   # version number is '1' for first release, '0' until then, increased for bigger updates
 REVISION = '2'  # revision number should be reset to '0' for every new version release, increased for small updates
-BUILD = '38'     # build number should be reset to '1' for every new development cycle
+BUILD = '39'     # build number should be reset to '1' for every new development cycle
 
 SUCCESS = 0
 FAILURE = 1
diff --git a/pilot/util/monitoring.py b/pilot/util/monitoring.py
index 8471904b..87a131ee 100644
--- a/pilot/util/monitoring.py
+++ b/pilot/util/monitoring.py
@@ -100,7 +100,11 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
 
     # update timing info for running jobs (to avoid an update after the job has finished)
     if job.state == 'running':
+        # record when the job was first seen in the running state (approximate, since it is set on the first monitoring pass - TODO: set it where the payload actually starts)
+        if not job.runningstart:
+            job.runningstart = current_time
 
+        # NOTE(review): no disk space check is performed at this point - confirm whether one was intended here
         # make sure that any utility commands are still running (and determine pid of memory monitor- as early as possible)
         if job.utilities != {}:
             utility_monitor(job)
@@ -108,10 +112,13 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i
         # confirm that the worker node has a proper SC_CLK_TCK (problems seen on MPPMU)
         check_hz()
 
-        # set the CPU consumption time for the job
-        exit_code, diagnostics = set_cpu_consumption_time(job)
-        if exit_code:
-            return exit_code, diagnostics
+        # set the CPU consumption time for the job (if it has been running for > 10s)
+        if job.runningstart and (current_time - job.runningstart) > 10:
+            exit_code, diagnostics = set_cpu_consumption_time(job)
+            if exit_code:
+                return exit_code, diagnostics
+        else:
+            logger.debug('skipping CPU consumption time check since job has not been running for long enough')
 
         # keep track of the subprocesses running (store payload subprocess PIDs)
         store_subprocess_pids(job)