-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #37 from PanDAWMS/next
3.3.1.9
- Loading branch information
Showing 19 changed files with 290 additions and 215 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
3.3.0.39 | ||
3.3.1.9 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,7 @@ | |
# Authors: | ||
# - Mario Lassnig, [email protected], 2016-2017 | ||
# - Daniel Drizhuk, [email protected], 2017 | ||
# - Paul Nilsson, [email protected], 2017-2021 | ||
# - Paul Nilsson, [email protected], 2017-2022 | ||
# - Wen Guan, [email protected], 2018 | ||
|
||
from __future__ import print_function # Python 2 | ||
|
@@ -300,8 +300,17 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) | |
:return: boolean (True if successful, False otherwise). | ||
""" | ||
|
||
state = get_proper_state(job, state) | ||
# insert out of batch time error code if MAXTIME has been reached | ||
logger.debug(f"REACHED_MAXTIME={os.environ.get('REACHED_MAXTIME', None)}") | ||
if os.environ.get('REACHED_MAXTIME', None): | ||
msg = 'the max batch system time limit has been reached' | ||
logger.warning(msg) | ||
job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.REACHEDMAXTIME, msg=msg) | ||
state = 'failed' | ||
job.state = state | ||
|
||
state = get_proper_state(job, state) | ||
logger.debug(f'state={state}') | ||
# should the pilot make any server updates? | ||
if not args.update_server: | ||
logger.info('pilot will not update the server (heartbeat message will be written to file)') | ||
|
@@ -311,7 +320,7 @@ def send_state(job, args, state, xml=None, metadata=None, test_tobekilled=False) | |
|
||
# build the data structure needed for updateJob | ||
data = get_data_structure(job, state, args, xml=xml, metadata=metadata) | ||
|
||
logger.debug(f'data={data}') | ||
# write the heartbeat message to file if the server is not to be updated by the pilot (Nordugrid mode) | ||
if not args.update_server: | ||
# if in harvester mode write to files required by harvester | ||
|
@@ -578,6 +587,7 @@ def get_data_structure(job, state, args, xml=None, metadata=None): | |
:return: data structure (dictionary). | ||
""" | ||
|
||
logger.debug(f'state={state}') | ||
data = {'jobId': job.jobid, | ||
'state': state, | ||
'timestamp': time_stamp(), | ||
|
@@ -642,7 +652,7 @@ def get_data_structure(job, state, args, xml=None, metadata=None): | |
add_memory_info(data, job.workdir, name=job.memorymonitor) | ||
if state == 'finished' or state == 'failed': | ||
add_timing_and_extracts(data, job, state, args) | ||
add_error_codes(data, job) | ||
https.add_error_codes(data, job) | ||
|
||
return data | ||
|
||
|
@@ -805,37 +815,6 @@ def get_requested_log_tail(debug_command, workdir): | |
return _tail | ||
|
||
|
||
def add_error_codes(data, job): | ||
""" | ||
Add error codes to data structure. | ||
:param data: data dictionary. | ||
:param job: job object. | ||
:return: | ||
""" | ||
|
||
# error codes | ||
pilot_error_code = job.piloterrorcode | ||
pilot_error_codes = job.piloterrorcodes | ||
if pilot_error_codes != []: | ||
logger.warning(f'pilotErrorCodes = {pilot_error_codes} (will report primary/first error code)') | ||
data['pilotErrorCode'] = pilot_error_codes[0] | ||
else: | ||
data['pilotErrorCode'] = pilot_error_code | ||
|
||
# add error info | ||
pilot_error_diag = job.piloterrordiag | ||
pilot_error_diags = job.piloterrordiags | ||
if pilot_error_diags != []: | ||
logger.warning(f'pilotErrorDiags = {pilot_error_diags} (will report primary/first error diag)') | ||
data['pilotErrorDiag'] = pilot_error_diags[0] | ||
else: | ||
data['pilotErrorDiag'] = pilot_error_diag | ||
data['transExitCode'] = job.transexitcode | ||
data['exeErrorCode'] = job.exeerrorcode | ||
data['exeErrorDiag'] = job.exeerrordiag | ||
|
||
|
||
def get_cpu_consumption_time(cpuconsumptiontime): | ||
""" | ||
Get the CPU consumption time. | ||
|
@@ -1277,6 +1256,8 @@ def get_job_label(args): | |
elif status == 'test' and args.job_label != 'ptest': | ||
logger.warning('PQ status set to test - will use job label / prodSourceLabel test') | ||
job_label = 'test' | ||
elif infosys.queuedata.type == 'unified': | ||
job_label = 'unified' | ||
else: | ||
job_label = args.job_label | ||
|
||
|
@@ -2587,6 +2568,11 @@ def job_monitor(queues, traces, args): # noqa: C901 | |
peeking_time = int(time.time()) | ||
for i in range(len(jobs)): | ||
current_id = jobs[i].jobid | ||
|
||
if os.environ.get('REACHED_MAXTIME', None): | ||
# the batch system max time has been reached, time to abort (in the next step) | ||
jobs[i].state = 'failed' | ||
|
||
logger.info('monitor loop #%d: job %d:%s is in state \'%s\'', n, i, current_id, jobs[i].state) | ||
if jobs[i].state == 'finished' or jobs[i].state == 'failed': | ||
logger.info('will abort job monitoring soon since job state=%s (job is still in queue)', jobs[i].state) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ | |
# | ||
# Authors: | ||
# - Daniel Drizhuk, [email protected], 2017 | ||
# - Paul Nilsson, [email protected], 2017-2021 | ||
# - Paul Nilsson, [email protected], 2017-2022 | ||
|
||
# NOTE: this module should deal with non-job related monitoring, such as thread monitoring. Job monitoring is | ||
# a task for the job_monitor thread in the Job component. | ||
|
@@ -77,6 +77,7 @@ def control(queues, traces, args): | |
logger.fatal(f'max running time ({max_running_time}s) minus grace time ({grace_time}s) has been exceeded - must abort pilot') | ||
logger.info('setting REACHED_MAXTIME and graceful stop') | ||
environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME' # TODO: use singleton instead | ||
logger.debug(f"REACHED_MAXTIME={environ.get('REACHED_MAXTIME', None)}") | ||
# do not set graceful stop if pilot has not finished sending the final job update | ||
# i.e. wait until SERVER_UPDATE is FINAL_DONE | ||
check_for_final_server_update(args.update_server) | ||
|
@@ -197,7 +198,7 @@ def run_checks(queues, args): | |
|
||
t_max = 2 * 60 | ||
logger.warning('pilot monitor received instruction that abort_job has been requested') | ||
logger.warning('will wait for a maximum of %d seconds for threads to finish', t_max) | ||
logger.warning(f'will wait for a maximum of {t_max} s for threads to finish') | ||
t_0 = time.time() | ||
while time.time() - t_0 < t_max: | ||
if args.job_aborted.is_set(): | ||
|
@@ -211,7 +212,7 @@ def run_checks(queues, args): | |
args.graceful_stop.set() | ||
|
||
if not args.job_aborted.is_set(): | ||
logger.warning('will wait for a maximum of %d seconds for graceful_stop to take effect', t_max) | ||
logger.warning(f'will wait for a maximum of {t_max} s for graceful_stop to take effect') | ||
t_max = 10 | ||
t_0 = time.time() | ||
while time.time() - t_0 < t_max: | ||
|
@@ -241,21 +242,21 @@ def get_max_running_time(lifetime, queuedata): | |
|
||
# use the schedconfig value if set, otherwise use the pilot option lifetime value | ||
if not queuedata: | ||
logger.warning('queuedata could not be extracted from queues, will use default for max running time ' | ||
'(%d s)', max_running_time) | ||
logger.warning(f'queuedata could not be extracted from queues, will use default for max running time ' | ||
f'({max_running_time} s)') | ||
else: | ||
if queuedata.maxtime: | ||
try: | ||
max_running_time = int(queuedata.maxtime) | ||
except Exception as error: | ||
logger.warning('exception caught: %s', error) | ||
logger.warning('failed to convert maxtime from queuedata, will use default value for max running time ' | ||
'(%d s)', max_running_time) | ||
logger.warning(f'exception caught: {error}') | ||
logger.warning(f'failed to convert maxtime from queuedata, will use default value for max running time ' | ||
f'({max_running_time} s)') | ||
else: | ||
if max_running_time == 0: | ||
max_running_time = lifetime # fallback to default value | ||
logger.info('will use default value for max running time: %d s', max_running_time) | ||
logger.info(f'will use default value for max running time: {max_running_time} s') | ||
else: | ||
logger.info('will use queuedata.maxtime value for max running time: %d s', max_running_time) | ||
logger.info(f'will use queuedata.maxtime value for max running time: {max_running_time} s') | ||
|
||
return max_running_time |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ | |
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Authors: | ||
# - Paul Nilsson, [email protected], 2017-2021 | ||
# - Paul Nilsson, [email protected], 2017-2022 | ||
|
||
import os | ||
import re | ||
|
@@ -83,6 +83,7 @@ def display_architecture_info(): | |
Display OS/architecture information. | ||
The function attempts to use the lsb_release -a command if available. If that is not available, | ||
it will dump the contents of | ||
WARNING: lsb_release will not be available on CentOS Stream 9 | ||
:return: | ||
""" | ||
|
Oops, something went wrong.