v0.1.5
mightqxc committed Oct 4, 2019
2 parents 5a000d3 + 743becb commit d2f88af
Showing 23 changed files with 523 additions and 135 deletions.
2 changes: 1 addition & 1 deletion pandaharvester/commit_timestamp.py
@@ -1 +1 @@
timestamp = "10-09-2019 13:14:16 on release (by fahui)"
timestamp = "04-10-2019 11:57:38 on release (by fahui)"
2 changes: 1 addition & 1 deletion pandaharvester/harvesterbody/monitor.py
@@ -694,7 +694,7 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log,
else:
newStatus = WorkSpec.ST_idle
elif not workSpec.is_post_processed():
if not queue_config.is_no_heartbeat_status(newStatus):
if not queue_config.is_no_heartbeat_status(newStatus) and not queue_config.truePilot:
# post processing unless heartbeat is suppressed
jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID,
None, True,
2 changes: 1 addition & 1 deletion pandaharvester/harvesterbody/submitter.py
@@ -57,7 +57,7 @@ def run(self):
# get commands
comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr))
mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr))
for commandSpec in commandSpecs:
newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
for tmpResource, tmpNewVal in iteritems(newLimits):
11 changes: 11 additions & 0 deletions pandaharvester/harvesterbody/sweeper.py
@@ -4,6 +4,7 @@
from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
from pandaharvester.harvestercore.plugin_factory import PluginFactory
from pandaharvester.harvesterbody.agent_base import AgentBase
from pandaharvester.harvestercore.command_spec import CommandSpec

# logger
_logger = core_utils.setup_logger('sweeper')
@@ -25,6 +26,16 @@ def run(self):
while True:
sw_main = core_utils.get_stopwatch()
mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
# get commands to kill
sw_getcomm = core_utils.get_stopwatch()
mainLog.debug('try to get commands')
comStr = CommandSpec.COM_killWorkers
commandSpecs = self.dbProxy.get_commands_for_receiver('sweeper', comStr)
mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr))
for commandSpec in commandSpecs:
n_to_kill = self.dbProxy.kill_workers_by_query(commandSpec.params)
mainLog.debug('will kill {0} workers with {1}'.format(n_to_kill, commandSpec.params))
mainLog.debug('done handling commands' + sw_getcomm.get_elapsed_time())
# killing stage
sw_kill = core_utils.get_stopwatch()
mainLog.debug('try to get workers to kill')
4 changes: 3 additions & 1 deletion pandaharvester/harvestercore/command_spec.py
@@ -17,10 +17,12 @@ class CommandSpec(SpecBase):
# commands
COM_reportWorkerStats = 'REPORT_WORKER_STATS'
COM_setNWorkers = 'SET_N_WORKERS'
COM_killWorkers = 'KILL_WORKERS'
# mapping between command and receiver
receiver_map = {
COM_reportWorkerStats: 'propagator',
COM_setNWorkers: 'submitter'
COM_setNWorkers: 'submitter',
COM_killWorkers: 'sweeper',
}

# constructor
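Note: with the new entry, KILL_WORKERS commands are routed to the sweeper agent, which fetches them via get_commands_for_receiver('sweeper', ...) as shown in the sweeper.py hunk above. A minimal sketch of the lookup, using only the constants in this diff:

    from pandaharvester.harvestercore.command_spec import CommandSpec

    # the new command string and the agent that consumes it
    command = CommandSpec.COM_killWorkers            # 'KILL_WORKERS'
    receiver = CommandSpec.receiver_map[command]     # 'sweeper'
    print('{0} commands are handled by the {1} agent'.format(command, receiver))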
2 changes: 1 addition & 1 deletion pandaharvester/harvestercore/core_utils.py
@@ -277,7 +277,7 @@ def get_output_file_report(jobspec):
chksum = fileSpec.chksum.split(':')[-1]
else:
chksum = fileSpec.chksum
xml += """"<File ID="{guid}">
xml += """<File ID="{guid}">
<logical>
<lfn name="{lfn}"/>
</logical>
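Note: the removed line opened the per-file element with an extra quotation mark ("""" instead of """), so every <File> block in the generated output-file report started with a stray '"' and the XML was malformed; the new line drops it. A rough sketch of the corrected rendering, with illustrative values and an assumed closing tag (the rest of the template is outside this hunk):

    # illustrative only; the guid/lfn values and the closing </File> are assumptions
    xml = """<File ID="{guid}">
    <logical>
    <lfn name="{lfn}"/>
    </logical>
    </File>
    """.format(guid='0EA25B43-6DC0-4A24-9E7E-A60E0E3E0A11', lfn='panda.output.file.root')
    print(xml)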
65 changes: 63 additions & 2 deletions pandaharvester/harvestercore/db_proxy.py
@@ -3032,7 +3032,7 @@ def refresh_cache(self, main_key, sub_key, new_info):
return False

# get a cached info
def get_cache(self, main_key, sub_key=None):
def get_cache(self, main_key, sub_key=None, from_local_cache=True):
useDB = False
try:
# get logger
@@ -3045,7 +3045,7 @@ def get_cache(self, main_key, sub_key=None):
# lock dict
globalDict.acquire()
# found
if cacheKey in globalDict:
if from_local_cache and cacheKey in globalDict:
# release dict
globalDict.release()
# make spec
@@ -5370,3 +5370,64 @@ def get_workers_from_ids(self, ids):
core_utils.dump_error_message(_logger)
# return
return {}

# send kill command to workers by query
def kill_workers_by_query(self, params):
try:
# get logger
tmpLog = core_utils.make_logger(_logger, method_name='kill_workers_by_query')
tmpLog.debug('start')
# sql to set killTime
sqlL = "UPDATE {0} SET killTime=:setTime ".format(workTableName)
sqlL += "WHERE workerID=:workerID AND killTime IS NULL AND NOT status IN (:st1,:st2,:st3) "
# sql to get workers
constraints_query_string_list = []
tmp_varMap = {}
constraint_map = {'status': params.get('status', [WorkSpec.ST_submitted]),
'computingSite': params.get('computingSite', []),
'computingElement': params.get('computingElement', []),
'submissionHost': params.get('submissionHost', [])}
tmpLog.debug('query {0}'.format(constraint_map))
for attribute, match_list in iteritems(constraint_map):
if match_list == 'ALL':
pass
elif not match_list:
tmpLog.debug('{0} constraint is not specified in the query. Skipped'.format(attribute))
return 0
else:
one_param_list = [ ':param_{0}_{1}'.format(attribute, v_i) for v_i in range(len(match_list)) ]
tmp_varMap.update(zip(one_param_list, match_list))
params_string = '(' + ','.join(one_param_list) + ')'
constraints_query_string_list.append('{0} IN {1}'.format(attribute, params_string))
constranits_query_string = ' AND '.join(constraints_query_string_list)
sqlW = "SELECT workerID FROM {0} ".format(workTableName)
sqlW += "WHERE {0} ".format(constranits_query_string)
# set an older time to trigger sweeper
setTime = datetime.datetime.utcnow() - datetime.timedelta(hours=6)
# get workers
varMap = dict()
varMap.update(tmp_varMap)
self.execute(sqlW, varMap)
resW = self.cur.fetchall()
nRow = 0
for workerID, in resW:
# set killTime
varMap = dict()
varMap[':workerID'] = workerID
varMap[':setTime'] = setTime
varMap[':st1'] = WorkSpec.ST_finished
varMap[':st2'] = WorkSpec.ST_failed
varMap[':st3'] = WorkSpec.ST_cancelled
self.execute(sqlL, varMap)
nRow += self.cur.rowcount
# commit
self.commit()
tmpLog.debug('set killTime to {0} workers'.format(nRow))
return nRow
except Exception:
# roll back
self.rollback()
# dump error
core_utils.dump_error_message(_logger)
# return
return None
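Note: the params dict passed to kill_workers_by_query comes from the KILL_WORKERS command handled by the sweeper above. Each constraint is a list of values to match, or the string 'ALL' to skip that filter; if any constraint is left empty the method returns 0 without killing anything (status alone defaults to submitted workers when omitted), and killTime is only set for workers that are not already finished, failed, or cancelled. A hypothetical usage sketch (the status, site, and host values are made up):

    from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy

    # requires a configured harvester instance
    proxy = DBProxy()
    params = {
        'status': ['submitted', 'running'],   # example worker statuses to match
        'computingSite': ['EXAMPLE_SITE'],    # hypothetical site name
        'computingElement': 'ALL',            # 'ALL' skips this filter
        'submissionHost': 'ALL',              # 'ALL' skips this filter
    }
    n_killed = proxy.kill_workers_by_query(params)
    print('killTime set for {0} workers'.format(n_killed))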
2 changes: 1 addition & 1 deletion pandaharvester/harvestercore/fifos.py
@@ -27,7 +27,7 @@
_attribute_list = ['id', 'item', 'score']

# fifo object spec
FifoObject = collections.namedtuple('FifoObject', _attribute_list, verbose=False, rename=False)
FifoObject = collections.namedtuple('FifoObject', _attribute_list, rename=False)

# logger
_logger = core_utils.setup_logger('fifos')
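Note: the verbose keyword was removed from collections.namedtuple in Python 3.7, so the old call breaks on current Python; only rename is kept. A quick check:

    import collections

    # collections.namedtuple('FifoObject', ['id', 'item', 'score'], verbose=False)
    # -> TypeError on Python 3.7+: unexpected keyword argument 'verbose'
    FifoObject = collections.namedtuple('FifoObject', ['id', 'item', 'score'], rename=False)
    print(FifoObject(id=1, item=b'payload', score=0.5))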
38 changes: 34 additions & 4 deletions pandaharvester/harvestercore/queue_config_mapper.py
@@ -1,6 +1,7 @@
import os
import json
import copy
import time
import datetime
import threading
import importlib
@@ -178,6 +179,7 @@ def __init__(self, update_db=True):
self.configFromCacher = harvester_config.qconf.configFromCacher
except AttributeError:
self.configFromCacher = False
self.updateInterval = 600

# load config from DB cache of URL with validation
def _load_config_from_cache(self):
@@ -242,15 +244,43 @@ def _get_resolver():
resolver = None
return resolver

# update last reload time
def _update_last_reload_time(self):
new_info = '{0:.3f}'.format(time.time())
return self.dbProxy.refresh_cache('_qconf_last_reload', '_universal', new_info)

# get last reload time
def _get_last_reload_time(self):
cacheSpec = self.dbProxy.get_cache('_qconf_last_reload', '_universal', from_local_cache=False)
if cacheSpec is None:
return None
timestamp = float(cacheSpec.data)
return timestamp

# load data
def load_data(self):
mainLog = _make_logger(method_name='QueueConfigMapper.load_data')
# check interval
timeNow = datetime.datetime.utcnow()
if self.lastUpdate is not None and timeNow - self.lastUpdate < datetime.timedelta(minutes=10):
return
with self.lock:
# check if to update
timeNow_timestamp = time.time()
if self.lastUpdate is not None:
last_reload_timestamp = self._get_last_reload_time()
if (last_reload_timestamp is not None and self.lastUpdate is not None
and datetime.datetime.utcfromtimestamp(last_reload_timestamp) < self.lastUpdate
and timeNow_timestamp - last_reload_timestamp < self.updateInterval):
return
# start
with self.lock:
# update timesatmp of last reload, lock with check interval
got_timesatmp_update_lock = self.dbProxy.get_process_lock('qconf_reload', 'qconf_universal', self.updateInterval)
if got_timesatmp_update_lock:
retVal = self._update_last_reload_time()
if retVal:
mainLog.debug('updated last reload timestamp')
else:
mainLog.warning('failed to update last reload timestamp. Skipped')
else:
mainLog.debug('did not get qconf_reload timestamp lock. Skipped to update last reload timestamp')
# init
newQueueConfig = dict()
localTemplatesDict = dict()
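Note: queue configuration reloads are now throttled across harvester processes: the time of the last reload is kept in the cache table under _qconf_last_reload (read with from_local_cache=False so the shared DB value is always consulted), a qconf_reload process lock guards the update, and updateInterval is 600 seconds. A simplified sketch of the new skip condition in load_data (not the actual method):

    import datetime
    import time

    UPDATE_INTERVAL = 600  # seconds, matches self.updateInterval above

    def need_reload(last_update_local, last_reload_shared, now=None):
        # last_update_local: datetime of this process's previous reload, or None
        # last_reload_shared: epoch seconds of the last reload recorded in the shared cache, or None
        now = time.time() if now is None else now
        if last_update_local is None or last_reload_shared is None:
            return True
        recently_reloaded = (
            datetime.datetime.utcfromtimestamp(last_reload_shared) < last_update_local
            and now - last_reload_shared < UPDATE_INTERVAL)
        return not recently_reloaded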
4 changes: 3 additions & 1 deletion pandaharvester/harvestermiddleware/direct_ssh_herder.py
@@ -119,6 +119,7 @@ def __init__(self, **kwarg):
self.sockDir = getattr(self, 'sockDir', '/tmp')
self.numMasters = getattr(self, 'numMasters', 1)
self.execStr = getattr(self, 'execStr', '')
self.connectionLifetime = getattr(self, 'connectionLifetime', None)
try:
self._get_connection()
except Exception as e:
@@ -151,7 +152,8 @@ def _get_connection(self):
sshMasterPool.make_control_master(self.remoteHost, self.remotePort, self.numMasters,
ssh_username=self.sshUserName, ssh_password=self.sshPassword,
private_key=self.privateKey, pass_phrase=self.passPhrase,
jump_host=self.jumpHost, jump_port=self.jumpPort, sock_dir=self.sockDir)
jump_host=self.jumpHost, jump_port=self.jumpPort, sock_dir=self.sockDir,
connection_lifetime=self.connectionLifetime)
conn = sshMasterPool.get_connection(self.remoteHost, self.remotePort, self.execStr)
if conn is not None:
tmpLog.debug('connected successfully')
26 changes: 19 additions & 7 deletions pandaharvester/harvestermiddleware/ssh_master_pool.py
@@ -2,6 +2,7 @@
import threading
import uuid
import os
import time

import six
import pexpect
@@ -39,7 +40,7 @@ def make_dict_key(self, host, port):
def make_control_master(self, remote_host, remote_port, num_masters=1,
ssh_username=None, ssh_password=None, private_key=None, pass_phrase=None,
jump_host=None, jump_port=None, login_timeout=60, reconnect=False,
with_lock=True, sock_dir=None):
with_lock=True, sock_dir=None, connection_lifetime=None):
dict_key = self.make_dict_key(remote_host, remote_port)
if with_lock:
self.lock.acquire()
@@ -56,7 +57,8 @@ def make_control_master(self, remote_host, remote_port, num_masters=1,
'jump_host': jump_host,
'jump_port': jump_port,
'login_timeout': login_timeout,
'sock_dir': sock_dir
'sock_dir': sock_dir,
'connection_lifetime': connection_lifetime,
}
else:
num_masters = self.params[dict_key]['num_masters']
@@ -68,6 +70,7 @@ def make_control_master(self, remote_host, remote_port, num_masters=1,
jump_port = self.params[dict_key]['jump_port']
login_timeout = self.params[dict_key]['login_timeout']
sock_dir = self.params[dict_key]['sock_dir']
connection_lifetime = self.params[dict_key]['connection_lifetime']
# make a master
for i in range(num_masters - len(self.pool[dict_key])):
# make a socket file
@@ -94,6 +97,7 @@ def make_control_master(self, remote_host, remote_port, num_masters=1,
loginString,
]
c = pexpect_spawn(com, echo=False)
baseLogger.debug('pexpect_spawn')
c.logfile_read = baseLogger.handlers[0].stream
isOK = False
for iTry in range(3):
@@ -132,27 +136,35 @@ def make_control_master(self, remote_host, remote_port, num_masters=1,
# exec to confirm login
c.sendline('echo {0}'.format(loginString))
if isOK:
self.pool[dict_key].append((sock_file, c))
conn_exp_time = (time.time() + connection_lifetime) if connection_lifetime is not None else None
self.pool[dict_key].append((sock_file, c, conn_exp_time))
if with_lock:
self.lock.release()

# get a connection
def get_connection(self, remote_host, remote_port, exec_string):
baseLogger.debug('get_connection start')
dict_key = self.make_dict_key(remote_host, remote_port)
self.lock.acquire()
active_masters = []
someClosed = False
for sock_file, child in self.pool[dict_key]:
if child.isalive():
active_masters.append((sock_file, child))
for sock_file, child, conn_exp_time in list(self.pool[dict_key]):
if child.isalive() and time.time() <= conn_exp_time:
active_masters.append((sock_file, child, conn_exp_time))
else:
child.close()
self.pool[dict_key].remove((sock_file, child, conn_exp_time))
someClosed = True
if child.isalive():
baseLogger.debug('a connection process is dead')
else:
baseLogger.debug('a connection is expired')
if someClosed:
self.make_control_master(remote_host, remote_port, reconnect=True, with_lock=False)
active_masters = [item for item in self.pool[dict_key] if os.path.exists(item[0])]
baseLogger.debug('reconnected; now {0} active connections'.format(len(active_masters)))
if len(active_masters) > 0:
sock_file, child = random.choice(active_masters)
sock_file, child, conn_exp_time = random.choice(active_masters)
con = subprocess.Popen(['ssh', 'dummy', '-S', sock_file, exec_string],
shell=False,
stdin=subprocess.PIPE,
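Note: pool entries are now (sock_file, child, conn_exp_time) triples, where conn_exp_time is the epoch time after which the master should be discarded, or None when no connectionLifetime is configured. get_connection closes dead or expired masters and rebuilds them before picking one at random. A minimal sketch of the liveness/expiry test, under the assumption that an unset lifetime means the connection never expires:

    import time

    def is_master_usable(child_is_alive, conn_exp_time, now=None):
        now = time.time() if now is None else now
        if not child_is_alive:
            return False               # ssh master process died
        if conn_exp_time is None:
            return True                # no connectionLifetime configured (assumption)
        return now <= conn_exp_time    # still within its lifetime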
27 changes: 22 additions & 5 deletions pandaharvester/harvestermisc/htcondor_utils.py
@@ -246,18 +246,33 @@ def wrapper(self, *args, **kwargs):
# Make logger
tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorClient.renew_session_if_error')
func_name = func.__name__
try:
self.schedd
except AttributeError:
if self.lock.acquire(False):
is_renewed = self.renew_session()
self.lock.release()
if not is_renewed:
errStr = 'failed to communicate with {0}'.format(self.submissionHost)
tmpLog.error(errStr)
tmpLog.debug('got RuntimeError: {0}'.format(e))
raise Exception(errStr)
try:
ret = func(self, *args, **kwargs)
except RuntimeError as e:
tmpLog.debug('got RuntimeError: {0}'.format(e))
if self.lock.acquire(False):
self.renew_session()
is_renewed = self.renew_session()
self.lock.release()
if to_retry:
tmpLog.debug('condor session renewed. Retrying {0}'.format(func_name))
ret = func(self, *args, **kwargs)
if is_renewed:
if to_retry:
tmpLog.debug('condor session renewed. Retrying {0}'.format(func_name))
ret = func(self, *args, **kwargs)
else:
tmpLog.debug('condor session renewed')
raise
else:
tmpLog.debug('condor session renewed')
tmpLog.error('failed to renew condor session')
raise
else:
tmpLog.debug('another thread is renewing condor session; skipped...')
@@ -324,11 +339,13 @@ def renew_session(self, retry=3, init=False):
tmpLog.warning('Failed. Retry...')
else:
tmpLog.warning('Retry {0} times. Still failed. Skipped'.format(i_try))
return False
i_try += 1
self.secman.invalidateAllSessions()
time.sleep(3)
# Sleep
time.sleep(3)
return True


# Condor job query
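Note: the wrapper now checks that self.schedd exists before calling the wrapped function (renewing the session if it does not), and renew_session reports success with True/False instead of always falling through, so a retry only happens after a successful renewal. A simplified sketch of the renew-and-retry pattern (locking, logging, and the schedd pre-check are omitted):

    import functools

    def retry_after_renew(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except RuntimeError:
                # renew_session() now returns True only when the condor session was rebuilt
                if self.renew_session():
                    return func(self, *args, **kwargs)   # retry once with the fresh session
                raise
        return wrapper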
(diffs for the remaining changed files were not loaded in this view)