Skip to content

Commit

Permalink
Delayed first CPU consumption time measurement
Browse files Browse the repository at this point in the history
  • Loading branch information
PalNilsson committed Dec 4, 2024
1 parent e8d5b8a commit 61670c5
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 6 deletions.
2 changes: 1 addition & 1 deletion PILOTVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.9.2.38
3.9.2.39
1 change: 1 addition & 0 deletions pilot/info/jobdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ class JobData(BaseData):
prodproxy = "" # to keep track of production proxy on unified queues
completed = False # True when job has finished or failed, used by https::send_update()
lsetuptime = 0 # payload setup time (lsetup)
runningstart = None # time when the payload started running (only for internal monitoring purposes, not the actual start time)

# time variable used for on-the-fly cpu consumption time measurements done by job monitoring
t0 = None # payload startup time
Expand Down
2 changes: 1 addition & 1 deletion pilot/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
RELEASE = '3' # released number should be fixed at 3 for Pilot 3
VERSION = '9' # version number is '1' for first release, '0' until then, increased for bigger updates
REVISION = '2' # revision number should be reset to '0' for every new version release, increased for small updates
BUILD = '38' # build number should be reset to '1' for every new development cycle
BUILD = '39' # build number should be reset to '1' for every new development cycle

SUCCESS = 0
FAILURE = 1
Expand Down
15 changes: 11 additions & 4 deletions pilot/util/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,25 @@ def job_monitor_tasks(job: JobData, mt: MonitoringTime, args: object) -> tuple[i

# update timing info for running jobs (to avoid an update after the job has finished)
if job.state == 'running':
# keep track of the time since the job started running (approximate since it is set here, move later)
if not job.runningstart:
job.runningstart = current_time

# check the disk space
# make sure that any utility commands are still running (and determine pid of memory monitor- as early as possible)
if job.utilities != {}:
utility_monitor(job)

# confirm that the worker node has a proper SC_CLK_TCK (problems seen on MPPMU)
check_hz()

# set the CPU consumption time for the job
exit_code, diagnostics = set_cpu_consumption_time(job)
if exit_code:
return exit_code, diagnostics
# set the CPU consumption time for the job (if it has been running for > 10s)
if job.runningstart and (current_time - job.runningstart) > 10:
exit_code, diagnostics = set_cpu_consumption_time(job)
if exit_code:
return exit_code, diagnostics
else:
logger.debug('skipping CPU consumption time check since job has not been running for long enough')

# keep track of the subprocesses running (store payload subprocess PIDs)
store_subprocess_pids(job)
Expand Down

0 comments on commit 61670c5

Please sign in to comment.