Commit

Merge pull request #185 from HSF/flin
v2.0.30 ; htcondor: default to arc (REST) for ARC CE
mightqxc authored Apr 27, 2023
2 parents 8102df5 + 7665bf3 commit dbaea7c
Showing 4 changed files with 20 additions and 9 deletions.
2 changes: 1 addition & 1 deletion pandaharvester/commit_timestamp.py
@@ -1 +1 @@
timestamp = "19-04-2023 13:49:56 on flin (by mightqxc)"
timestamp = "27-04-2023 13:02:05 on flin (by mightqxc)"
17 changes: 14 additions & 3 deletions pandaharvester/harvestermonitor/htcondor_monitor.py
@@ -31,6 +31,13 @@
 }
 
 
+# Condor jobs held with these reasons should be killed
+TO_KILL_HOLD_REASONS = [
+    'Job not found',
+    'Failed to start GAHP',
+]
+
+
 # pilot error object
 PILOT_ERRORS = PilotErrors()
 
@@ -95,13 +102,17 @@ def _check_one_worker(workspec, job_ads_all_dict, cancel_unknown=False, held_tim
                 newStatus = WorkSpec.ST_cancelled
             elif batchStatus in ['5']:
                 # 5 held
-                errStr = 'Condor HoldReason: {0} '.format(job_ads_dict.get('HoldReason'))
+                hold_reason = job_ads_dict.get('HoldReason')
+                errStr = 'Condor HoldReason: {0} '.format(hold_reason)
                 if (
-                    job_ads_dict.get('HoldReason') == 'Job not found'
+                    hold_reason in TO_KILL_HOLD_REASONS
                     or int(time.time()) - int(job_ads_dict.get('EnteredCurrentStatus', 0)) > held_timeout
                 ):
                     # Kill the job if held too long or other reasons
-                    tmpLog.debug('trying to kill job submissionHost={0} batchID={1} due to held too long or not found'.format(workspec.submissionHost, workspec.batchID))
+                    if hold_reason in TO_KILL_HOLD_REASONS:
+                        tmpLog.debug('trying to kill job submissionHost={0} batchID={1} due to HoldReason: {2}'.format(workspec.submissionHost, workspec.batchID, hold_reason))
+                    else:
+                        tmpLog.debug('trying to kill job submissionHost={0} batchID={1} due to held too long'.format(workspec.submissionHost, workspec.batchID))
                     for submissionHost, batchIDs_list in six.iteritems(get_host_batchid_map([workspec])):
                         condor_job_manage = CondorJobManage(id=workspec.submissionHost)
                         try:
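Net effect of the monitor change: a held Condor job is now killed either when its HoldReason matches an entry in TO_KILL_HOLD_REASONS or when it has stayed held longer than held_timeout, and the debug message states which condition triggered the kill. Below is a minimal standalone sketch of that decision, using the same job-ad fields as the diff (HoldReason, EnteredCurrentStatus); the helper name, the held_timeout default, and the sample input are illustrative, not part of the commit.

import time

# Mirrors TO_KILL_HOLD_REASONS introduced above
TO_KILL_HOLD_REASONS = [
    'Job not found',
    'Failed to start GAHP',
]

def should_kill_held_job(job_ads_dict, held_timeout=3600):
    # Decide whether a held Condor job should be removed, and why.
    # held_timeout default here is a placeholder value for illustration.
    hold_reason = job_ads_dict.get('HoldReason')
    held_for = int(time.time()) - int(job_ads_dict.get('EnteredCurrentStatus', 0))
    if hold_reason in TO_KILL_HOLD_REASONS:
        return True, 'HoldReason: {0}'.format(hold_reason)
    if held_for > held_timeout:
        return True, 'held too long ({0}s)'.format(held_for)
    return False, None

# Example with hypothetical job ads:
# should_kill_held_job({'HoldReason': 'Failed to start GAHP',
#                       'EnteredCurrentStatus': int(time.time())})
# -> (True, 'HoldReason: Failed to start GAHP')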
8 changes: 4 additions & 4 deletions pandaharvester/harvestersubmitter/htcondor_submitter.py
@@ -253,7 +253,7 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e
         'pilotArgs': pilot_args,
         'submissionHost': workspec.submissionHost,
         'submissionHostShort': workspec.submissionHost.split('.')[0],
-        'ceARCGridType': ce_info_dict.get('ce_grid_type', 'nordugrid'),
+        'ceARCGridType': ce_info_dict.get('ce_grid_type', 'arc'),
         'tokenDir': token_dir,
         'tokenFilename': token_filename,
         'tokenPath': token_path,
@@ -443,16 +443,16 @@ def __init__(self, **kwarg):
         except AttributeError:
             self.rcPilotRandomWeightPermille = 0
         # submission to ARC CE's with nordugrid (gridftp) or arc (REST) grid type
-        self.submit_arc_grid_type = 'nordugrid'
+        self.submit_arc_grid_type = 'arc'
         try:
             extra_plugin_configs = harvester_config.master.extraPluginConfigs['HTCondorSubmitter']
         except AttributeError:
             pass
         except KeyError:
             pass
         else:
-            if extra_plugin_configs.get('submit_arc_grid_type') == 'arc':
-                self.submit_arc_grid_type = 'arc'
+            if extra_plugin_configs.get('submit_arc_grid_type') == 'nordugrid':
+                self.submit_arc_grid_type = 'nordugrid'
         # record of information of CE statistics
         self.ceStatsLock = threading.Lock()
         self.ceStats = dict()
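On the submitter side, the default grid type for ARC CEs flips from 'nordugrid' (gridftp) to 'arc' (REST), both in the ceARCGridType JDL placeholder and in HTCondorSubmitter itself; sites that still need gridftp submission can opt out through master.extraPluginConfigs. Below is a minimal sketch of the new default/override logic, with a hypothetical helper name and an in-memory dict standing in for harvester_config.master.extraPluginConfigs.

def resolve_arc_grid_type(extra_plugin_configs_all):
    # New default: submit to ARC CEs via the REST interface
    submit_arc_grid_type = 'arc'
    try:
        extra_plugin_configs = extra_plugin_configs_all['HTCondorSubmitter']
    except (AttributeError, KeyError):
        pass
    else:
        # Explicitly opt back into gridftp submission
        if extra_plugin_configs.get('submit_arc_grid_type') == 'nordugrid':
            submit_arc_grid_type = 'nordugrid'
    return submit_arc_grid_type

# Examples:
# resolve_arc_grid_type({})                                                            -> 'arc'
# resolve_arc_grid_type({'HTCondorSubmitter': {'submit_arc_grid_type': 'nordugrid'}})  -> 'nordugrid'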
2 changes: 1 addition & 1 deletion pandaharvester/panda_pkg_info.py
@@ -1 +1 @@
-release_version = "0.2.29"
+release_version = "0.2.30"
