diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/k8s/job_cern.yaml b/examples/k8s/job_cern.yaml new file mode 100644 index 00000000..c5a08fe1 --- /dev/null +++ b/examples/k8s/job_cern.yaml @@ -0,0 +1,80 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: grid-job +spec: + ttlSecondsAfterFinished: 172800 + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: atlas-grid-centos7 + image: atlasadc/atlas-grid-centos7 + env: + - name: computingSite + value: "$computingSite" + - name: pandaQueueName + value: "$pandaQueueName" + - name: proxySecretPath + value: "$proxySecretPath" + - name: proxyContent + value: "$proxyContent" + - name: workerID + value: "$workerID" + - name: logs_frontend_w + value: "$logs_frontend_w" + - name: logs_frontend_r + value: "$logs_frontend_r" + - name: resourceType + value: "$resourceType" + - name: HARVESTER_WORKER_ID + value: "$HARVESTER_WORKER_ID" + - name: HARVESTER_ID + value: "$HARVESTER_ID" + - name: PANDA_JSID + value: "$PANDA_JSID" + - name: TMPDIR + value: "/root" + - name: PILOT_NOKILL + value: "True" + command: ["/usr/bin/bash"] + args: ["-c", "cd; wget https://raw.githubusercontent.com/HSF/harvester/k8s_analysis/pandaharvester/harvestercloud/pilots_starter.py; chmod 755 pilots_starter.py; ./pilots_starter.py || true"] + volumeMounts: + - name: atlas + mountPath: /cvmfs/atlas.cern.ch + - name: atlas-condb + mountPath: /cvmfs/atlas-condb.cern.ch + - name: atlas-nightlies + mountPath: /cvmfs/atlas-nightlies.cern.ch + - name: sft + mountPath: /cvmfs/sft.cern.ch + - name: grid + mountPath: /cvmfs/grid.cern.ch + - name: proxy-secret + mountPath: /proxy + volumes: + - name: atlas + persistentVolumeClaim: + claimName: csi-cvmfs-atlas-pvc + readOnly: true + - name: atlas-condb + persistentVolumeClaim: + claimName: csi-cvmfs-atlas-condb-pvc + readOnly: true + - name: atlas-nightlies + persistentVolumeClaim: + claimName: csi-cvmfs-atlas-nightlies-pvc + readOnly: true + - name: sft + persistentVolumeClaim: + claimName: csi-cvmfs-sft-pvc + readOnly: true + - name: grid + persistentVolumeClaim: + claimName: csi-cvmfs-grid-pvc + readOnly: true + - name: proxy-secret + secret: + secretName: proxy-secret \ No newline at end of file diff --git a/examples/k8s/k8s_cvmfs.yaml b/examples/k8s/k8s_cvmfs_1.13.yaml similarity index 100% rename from examples/k8s/k8s_cvmfs.yaml rename to examples/k8s/k8s_cvmfs_1.13.yaml diff --git a/examples/k8s/k8s_cvmfs_1.15.yaml b/examples/k8s/k8s_cvmfs_1.15.yaml new file mode 100644 index 00000000..64fe7690 --- /dev/null +++ b/examples/k8s/k8s_cvmfs_1.15.yaml @@ -0,0 +1,101 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-atlas +provisioner: cvmfs.csi.cern.ch +parameters: + repository: atlas.cern.ch +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-sft +provisioner: cvmfs.csi.cern.ch +parameters: + repository: sft.cern.ch +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-grid +provisioner: cvmfs.csi.cern.ch +parameters: + repository: grid.cern.ch +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-atlas-condb +provisioner: cvmfs.csi.cern.ch +parameters: + repository: atlas-condb.cern.ch +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-atlas-nightlies +provisioner: cvmfs.csi.cern.ch +parameters: + repository: atlas-nightlies.cern.ch +--- +--- +apiVersion: v1 +kind: 
PersistentVolumeClaim +metadata: + name: csi-cvmfs-atlas-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-atlas +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-sft-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-sft +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-grid-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-grid + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-atlas-condb-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-atlas-condb +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-atlas-nightlies-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-atlas-nightlies \ No newline at end of file diff --git a/pandaharvester/README.md b/pandaharvester/README.md index 46194ca1..1ef2b4d4 100644 --- a/pandaharvester/README.md +++ b/pandaharvester/README.md @@ -13,4 +13,3 @@ * **Submitter**: Classes to submit jobs to the batch system * **Test**: Test scripts * **Worker Maker**: Makes workers - diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 6c142a49..86b15de7 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "10-03-2020 15:13:51 on release (by fahui)" +timestamp = "16-06-2020 14:57:09 on release (by fahui)" diff --git a/pandaharvester/harvesterbody/cred_manager.py b/pandaharvester/harvesterbody/cred_manager.py index 72c51c21..75774123 100644 --- a/pandaharvester/harvesterbody/cred_manager.py +++ b/pandaharvester/harvesterbody/cred_manager.py @@ -42,8 +42,13 @@ def __init__(self, single_mode=False): pluginPar['inCertFile'] = inCertFile pluginPar['outCertFile'] = outCertFile pluginPar['voms'] = voms - exeCore = self.pluginFactory.get_plugin(pluginPar) - self.exeCores.append(exeCore) + try: + exeCore = self.pluginFactory.get_plugin(pluginPar) + self.exeCores.append(exeCore) + except Exception as e: + _logger.error('Problem instantiating cred manager for {0}'.format(pluginPar)) + _logger.error('Exception {0}'.format(e)) + # get list def get_list(self, data): @@ -74,8 +79,11 @@ def execute(self): # do nothing if exeCore is None: continue - # make logger - mainLog = self.make_logger(_logger, "{0} {1}".format(exeCore.__class__.__name__, exeCore.outCertFile), + + # make logger + mainLog = self.make_logger(_logger, "{0} {1} {2}".format(exeCore.__class__.__name__, + exeCore.inCertFile, + exeCore.outCertFile), method_name='execute') try: # check credential diff --git a/pandaharvester/harvesterbody/job_fetcher.py b/pandaharvester/harvesterbody/job_fetcher.py index b0d7823b..b8eecfa4 100644 --- a/pandaharvester/harvesterbody/job_fetcher.py +++ b/pandaharvester/harvesterbody/job_fetcher.py @@ -10,6 +10,7 @@ from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy from pandaharvester.harvesterbody.agent_base import AgentBase from pandaharvester.harvestercore.plugin_factory import PluginFactory +from pandaharvester.harvestermisc.info_utils import PandaQueuesDict # logger _logger = core_utils.setup_logger('job_fetcher') @@ -35,6 +36,10 @@ def run(self): nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues, 
harvester_config.jobfetcher.lookupTime) mainLog.debug('got {0} queues'.format(len(nJobsPerQueue))) + + # get up to date queue configuration + pandaQueueDict = PandaQueuesDict() + # loop over all queues for queueName, nJobs in iteritems(nJobsPerQueue): # check queue @@ -44,17 +49,24 @@ def run(self): method_name='run') # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) + siteName = queueConfig.siteName # upper limit if nJobs > harvester_config.jobfetcher.maxJobs: nJobs = harvester_config.jobfetcher.maxJobs + # get jobs - default_prodSourceLabel = queueConfig.get_source_label() + try: + is_grandly_unified_queue = pandaQueueDict.is_grandly_unified_queue(siteName) + except Exception: + is_grandly_unified_queue = False + + default_prodSourceLabel = queueConfig.get_source_label(is_gu=is_grandly_unified_queue) + pdpm = getattr(queueConfig, 'prodSourceLabelRandomWeightsPermille', {}) choice_list = core_utils.make_choice_list(pdpm=pdpm, default=default_prodSourceLabel) prodSourceLabel = random.choice(choice_list) tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(nJobs, prodSourceLabel)) sw = core_utils.get_stopwatch() - siteName = queueConfig.siteName jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, prodSourceLabel, self.nodeName, nJobs, @@ -89,11 +101,6 @@ def run(self): fileGroupDictList.append(extractorCore.get_aux_inputs(jobSpec)) for fileGroupDict in fileGroupDictList: for tmpLFN, fileAttrs in iteritems(fileGroupDict): - # check file status - if tmpLFN not in fileStatMap: - fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, 'input', - queueConfig.ddmEndpointIn, - 'starting') # make file spec fileSpec = FileSpec() fileSpec.PandaID = jobSpec.PandaID @@ -101,28 +108,31 @@ def run(self): fileSpec.lfn = tmpLFN fileSpec.endpoint = queueConfig.ddmEndpointIn fileSpec.scope = fileAttrs['scope'] - # set preparing to skip stage-in if the file is (being) taken care of by another job - if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \ - or 'to_prepare' in fileStatMap[tmpLFN]: - fileSpec.status = 'preparing' - else: - fileSpec.status = 'to_prepare' - if fileSpec.status not in fileStatMap[tmpLFN]: - fileStatMap[tmpLFN][fileSpec.status] = 0 - fileStatMap[tmpLFN][fileSpec.status] += 1 if 'INTERNAL_FileType' in fileAttrs: fileSpec.fileType = fileAttrs['INTERNAL_FileType'] jobSpec.auxInput = JobSpec.AUX_hasAuxInput else: fileSpec.fileType = 'input' + # check file status + if tmpLFN not in fileStatMap: + fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, fileSpec.fileType, + queueConfig.ddmEndpointIn, + 'starting') + # set preparing to skip stage-in if the file is (being) taken care of by another job + if [x for x in ['ready', 'preparing', 'to_prepare', 'triggered'] + if x in fileStatMap[tmpLFN]]: + fileSpec.status = 'preparing' + else: + fileSpec.status = 'to_prepare' + fileStatMap[tmpLFN].setdefault(fileSpec.status, None) if 'INTERNAL_URL' in fileAttrs: fileSpec.url = fileAttrs['INTERNAL_URL'] jobSpec.add_in_file(fileSpec) jobSpec.trigger_propagation() jobSpecs.append(jobSpec) # insert to DB - tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs),sw_startconvert.get_elapsed_time())) - sw_insertdb =core_utils.get_stopwatch() + tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs), sw_startconvert.get_elapsed_time())) + sw_insertdb = core_utils.get_stopwatch() self.dbProxy.insert_jobs(jobSpecs) tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time())) mainLog.debug('done') diff 
--git a/pandaharvester/harvesterbody/monitor.py b/pandaharvester/harvesterbody/monitor.py index 1028bc2f..a1400452 100644 --- a/pandaharvester/harvesterbody/monitor.py +++ b/pandaharvester/harvesterbody/monitor.py @@ -85,7 +85,12 @@ def run(self): # loop over all workers for queueName, configIdWorkSpecs in iteritems(workSpecsPerQueue): for configID, workSpecsList in iteritems(configIdWorkSpecs): - retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID, check_source='DB') + try: + retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID, check_source='DB') + except Exception as e: + mainLog.error('monitor_agent_core raised an exception: {0}'.format(e)) + retVal = None # skip further processing + if monitor_fifo.enabled and retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal if workSpecsToEnqueue: @@ -192,8 +197,13 @@ def run(self): else: workSpec.pandaid_list = [] workSpec.force_update('pandaid_list') - retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, from_fifo=True, - config_id=configID, check_source='FIFO') + try: + retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, from_fifo=True, + config_id=configID, check_source='FIFO') + except Exception as e: + mainLog.error('monitor_agent_core raised an exception: {0}'.format(e)) + retVal = None # skip further processing + if retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal qc_key = (queueName, configID) @@ -644,8 +654,10 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, tmp_log.debug('kill workerID={0} due to queuing longer than {1} seconds'.format( workerID, workerQueueTimeLimit)) self.dbProxy.kill_worker(workSpec.workerID) - diagMessage = 'Killed by Harvester due to worker queuing too long' + diagMessage + diagMessage = 'Killed by Harvester due to worker queuing too long. 
' + diagMessage workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage) + # set closed + workSpec.set_pilot_closed() # expired heartbeat - only when requested in the configuration try: # check if the queue configuration requires checking for worker heartbeat @@ -694,7 +706,8 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, else: newStatus = WorkSpec.ST_idle elif not workSpec.is_post_processed(): - if not queue_config.is_no_heartbeat_status(newStatus) and not queue_config.truePilot: + if (not queue_config.is_no_heartbeat_status(newStatus) and not queue_config.truePilot) \ + or (hasattr(messenger, 'forcePostProcessing') and messenger.forcePostProcessing): # post processing unless heartbeat is suppressed jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, None, True, @@ -764,10 +777,16 @@ def monitor_event_digester(self, locked_by, max_events): for configID, workSpecsList in iteritems(_val): qc_key = (queueName, configID) tmpLog.debug('checking workers of queueName={0} configID={1}'.format(*qc_key)) - retVal = self.monitor_agent_core(locked_by, queueName, workSpecsList, - from_fifo=True, config_id=configID, - check_source='Event') - retMap[qc_key] = retVal + try: + retVal = self.monitor_agent_core(locked_by, queueName, workSpecsList, + from_fifo=True, config_id=configID, + check_source='Event') + except Exception as e: + tmpLog.error('monitor_agent_core raised an exception: {0}'.format(e)) + retVal = None # skip further processing + + if retVal: + retMap[qc_key] = retVal tmpLog.debug('done') return retMap diff --git a/pandaharvester/harvesterbody/preparator.py b/pandaharvester/harvesterbody/preparator.py index 3cc94956..b49c27b5 100644 --- a/pandaharvester/harvesterbody/preparator.py +++ b/pandaharvester/harvesterbody/preparator.py @@ -7,6 +7,7 @@ from pandaharvester.harvesterbody.agent_base import AgentBase from pandaharvester.harvestercore.pilot_errors import PilotErrors from pandaharvester.harvestercore.job_spec import JobSpec +from pandaharvester.harvestercore.file_spec import FileSpec # logger _logger = core_utils.setup_logger('preparator') @@ -43,7 +44,8 @@ def run(self): harvester_config.preparator.checkInterval, harvester_config.preparator.lockInterval, lockedBy, - max_files_per_job=maxFilesPerJob) + max_files_per_job=maxFilesPerJob, + ng_file_status_list=['ready']) mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: @@ -152,7 +154,8 @@ def run(self): lockedBy, 'preparing', max_files_per_job=maxFilesPerJob, - ng_file_status_list=['triggered']) + ng_file_status_list=['triggered', + 'ready']) mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger))) # loop over all jobs fileStatMap = dict() @@ -175,8 +178,10 @@ def run(self): # get plugin if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput]: preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator) + fileType = 'input' else: preparatorCore = self.pluginFactory.get_plugin(queueConfig.aux_preparator) + fileType = FileSpec.AUX_INPUT if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) @@ -190,43 +195,54 @@ def run(self): # check file status if queueConfig.ddmEndpointIn not in fileStatMap: fileStatMap[queueConfig.ddmEndpointIn] = dict() + # check if has to_prepare + hasToPrepare = False + for fileSpec in jobSpec.inFiles: + if fileSpec.status == 'to_prepare': + hasToPrepare = True + break newFileStatusData = [] toWait = False newInFiles = [] for fileSpec 
in jobSpec.inFiles: if fileSpec.status in ['preparing', 'to_prepare']: newInFiles.append(fileSpec) - if fileSpec.status == 'preparing': updateStatus = False if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \ - = self.dbProxy.get_file_status(fileSpec.lfn, 'input', queueConfig.ddmEndpointIn, + = self.dbProxy.get_file_status(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn, 'starting') if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready fileSpec.status = 'ready' + if fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['ready']['path']: + fileSpec.path = list( + fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['ready']['path'])[0] # set group info if any - groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, 'input', + groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn) if groupInfo is not None: fileSpec.groupID = groupInfo['groupID'] fileSpec.groupStatus = groupInfo['groupStatus'] fileSpec.groupUpdateTime = groupInfo['groupUpdateTime'] updateStatus = True - elif 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] or \ - 'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: + elif (not hasToPrepare and + 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or \ + 'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is being prepared by another toWait = True + if fileSpec.status != 'preparing': + fileSpec.status = 'preparing' + updateStatus = True else: # change file status if the file is not prepared by another - fileSpec.status = 'to_prepare' - updateStatus = True + if fileSpec.status != 'to_prepare': + fileSpec.status = 'to_prepare' + updateStatus = True # set new status if updateStatus: newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status)) - if fileSpec.status not in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: - fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] = 0 - fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] += 1 + fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn].setdefault(fileSpec.status, None) if len(newFileStatusData) > 0: self.dbProxy.change_file_status(jobSpec.PandaID, newFileStatusData, lockedBy) # wait since files are being prepared by another diff --git a/pandaharvester/harvesterbody/propagator.py b/pandaharvester/harvesterbody/propagator.py index 5f4eeb18..22f7d7b1 100644 --- a/pandaharvester/harvesterbody/propagator.py +++ b/pandaharvester/harvesterbody/propagator.py @@ -53,7 +53,10 @@ def run(self): if tmpJobSpec.computingSite not in hbSuppressMap: queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite, tmpJobSpec.configID) - hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status() + if queueConfig: + hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status() + else: # assume truepilot + hbSuppressMap[tmpJobSpec.computingSite] = ['running', 'transferring', 'finished', 'failed'] # heartbeat is suppressed if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \ not tmpJobSpec.not_suppress_heartbeat(): diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index d69d07aa..53bc4719 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -26,386 +26,392 @@ class Submitter(AgentBase): # constructor def 
__init__(self, queue_config_mapper, single_mode=False): AgentBase.__init__(self, single_mode) - self.queueConfigMapper = queue_config_mapper + self.queue_configMapper = queue_config_mapper self.dbProxy = DBProxy() self.workerMaker = WorkerMaker() self.workerAdjuster = WorkerAdjuster(queue_config_mapper) self.pluginFactory = PluginFactory() self.monitor_fifo = MonitorFIFO() - self.apfmon = Apfmon(self.queueConfigMapper) + self.apfmon = Apfmon(self.queue_configMapper) # main loop def run(self): - lockedBy = 'submitter-{0}'.format(self.get_pid()) + locked_by = 'submitter-{0}'.format(self.get_pid()) monitor_fifo = self.monitor_fifo - queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval', + queue_lock_interval = getattr(harvester_config.submitter, 'queueLockInterval', harvester_config.submitter.lockInterval) while True: sw_main = core_utils.get_stopwatch() - mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') - mainLog.debug('getting queues to submit workers') + main_log = self.make_logger(_logger, 'id={0}'.format(locked_by), method_name='run') + main_log.debug('getting queues to submit workers') # get queues associated to a site to submit workers - curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues, + current_workers, site_name, res_map = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime, harvester_config.submitter.lockInterval, - lockedBy, queueLockInterval) + locked_by, queue_lock_interval) submitted = False - if siteName is not None: - mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName)) + if site_name is not None: + main_log.debug('got {0} queues for site {1}'.format(len(current_workers), site_name)) - # get commands - comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) - commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr) - mainLog.debug('got {0} {1} commands'.format(len(commandSpecs), comStr)) - for commandSpec in commandSpecs: - newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params) - for tmpResource, tmpNewVal in iteritems(newLimits): - # if available, overwrite new worker value with the command from panda server - if tmpResource in resMap: - tmpQueueName = resMap[tmpResource] - if tmpQueueName in curWorkers: - curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal + # get commands from panda server + com_str = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, site_name) + command_specs = self.dbProxy.get_commands_for_receiver('submitter', com_str) + main_log.debug('got {0} {1} commands'.format(len(command_specs), com_str)) + for command_spec in command_specs: + new_limits = self.dbProxy.set_queue_limit(site_name, command_spec.params) + for tmp_job_type, tmp_jt_vals in iteritems(new_limits): + res_map.setdefault(tmp_job_type, {}) + for tmp_resource_type, tmp_new_val in iteritems(tmp_jt_vals): + # if available, overwrite new worker value with the command from panda server + if tmp_resource_type in res_map[tmp_job_type]: + tmp_queue_name = res_map[tmp_job_type][tmp_resource_type] + if tmp_queue_name in current_workers: + current_workers[tmp_queue_name][tmp_job_type][tmp_resource_type]['nNewWorkers'] = tmp_new_val # define number of new workers - if len(curWorkers) == 0: - n_workers_per_queue_and_rt = dict() + if len(current_workers) == 0: + n_workers_per_queue_jt_rt = dict() else: - n_workers_per_queue_and_rt = 
self.workerAdjuster.define_num_workers(curWorkers, siteName) + n_workers_per_queue_jt_rt = self.workerAdjuster.define_num_workers(current_workers, site_name) - if n_workers_per_queue_and_rt is None: - mainLog.error('WorkerAdjuster failed to define the number of workers') - elif len(n_workers_per_queue_and_rt) == 0: + if n_workers_per_queue_jt_rt is None: + main_log.error('WorkerAdjuster failed to define the number of workers') + elif len(n_workers_per_queue_jt_rt) == 0: pass else: # loop over all queues and resource types - for queueName in n_workers_per_queue_and_rt: - for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]): + for queue_name in n_workers_per_queue_jt_rt: + for job_type in n_workers_per_queue_jt_rt[queue_name]: + for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type]: + tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][resource_type] + tmp_log = self.make_logger(_logger, 'id={0} queue={1} jtype={2} rtype={3}'.format( + locked_by, queue_name, job_type, resource_type), method_name='run') + try: + tmp_log.debug('start') + tmp_log.debug('workers status: %s' % tmp_val) + nWorkers = tmp_val['nNewWorkers'] + tmp_val['nReady'] + nReady = tmp_val['nReady'] - tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy, - queueName, - resource_type), - method_name='run') - try: - tmpLog.debug('start') - tmpLog.debug('workers status: %s' % tmpVal) - nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady'] - nReady = tmpVal['nReady'] - - # check queue - if not self.queueConfigMapper.has_queue(queueName): - tmpLog.error('config not found') - continue + # check queue + if not self.queue_configMapper.has_queue(queue_name): + tmp_log.error('config not found') + continue - # no new workers - if nWorkers == 0: - tmpLog.debug('skipped since no new worker is needed based on current stats') - continue - # get queue - queueConfig = self.queueConfigMapper.get_queue(queueName) - workerMakerCore = self.workerMaker.get_plugin(queueConfig) - # check if resource is ready - if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True: - numReadyResources = self.workerMaker.num_ready_resources(queueConfig, - resource_type, - workerMakerCore) - tmpLog.debug('numReadyResources: %s' % numReadyResources) - if not numReadyResources: - if hasattr(workerMakerCore, 'staticWorkers'): - nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning'] - tmpLog.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' % - (workerMakerCore.staticWorkers, nQRWorkers)) - if nQRWorkers >= workerMakerCore.staticWorkers: - tmpLog.debug('No left static workers, skip') - continue + # no new workers + if nWorkers == 0: + tmp_log.debug('skipped since no new worker is needed based on current stats') + continue + # get queue + queue_config = self.queue_configMapper.get_queue(queue_name) + workerMakerCore = self.workerMaker.get_plugin(queue_config) + # check if resource is ready + if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True: + numReadyResources = self.workerMaker.num_ready_resources(queue_config, + job_type, + resource_type, + workerMakerCore) + tmp_log.debug('numReadyResources: %s' % numReadyResources) + if not numReadyResources: + if hasattr(workerMakerCore, 'staticWorkers'): + nQRWorkers = tmp_val['nQueue'] + tmp_val['nRunning'] + tmp_log.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' % + (workerMakerCore.staticWorkers, nQRWorkers)) + if nQRWorkers >= workerMakerCore.staticWorkers: + tmp_log.debug('No 
left static workers, skip') + continue + else: + nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) + tmp_log.debug('staticWorkers: %s, nWorkers: %s' % + (workerMakerCore.staticWorkers, nWorkers)) else: - nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) - tmpLog.debug('staticWorkers: %s, nWorkers: %s' % - (workerMakerCore.staticWorkers, nWorkers)) + tmp_log.debug('skip since no resources are ready') + continue else: - tmpLog.debug('skip since no resources are ready') - continue + nWorkers = min(nWorkers, numReadyResources) + # post action of worker maker + if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True: + skipOnFail = True else: - nWorkers = min(nWorkers, numReadyResources) - # post action of worker maker - if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True: - skipOnFail = True - else: - skipOnFail = False - # actions based on mapping type - if queueConfig.mapType == WorkSpec.MT_NoJob: - # workers without jobs - jobChunks = [] - for i in range(nWorkers): - jobChunks.append([]) - elif queueConfig.mapType == WorkSpec.MT_OneToOne: - # one worker per one job - jobChunks = self.dbProxy.get_job_chunks_for_workers( - queueName, - nWorkers, nReady, 1, None, - queueConfig.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - lockedBy) - elif queueConfig.mapType == WorkSpec.MT_MultiJobs: - # one worker for multiple jobs - nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig, - nWorkers, - resource_type, - maker=workerMakerCore) - tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker)) - jobChunks = self.dbProxy.get_job_chunks_for_workers( - queueName, - nWorkers, nReady, nJobsPerWorker, None, - queueConfig.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - lockedBy, - queueConfig.allowJobMixture) - elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: - # multiple workers for one job - nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig, - nWorkers, - resource_type, - maker=workerMakerCore) - maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( - queueConfig, resource_type, maker=workerMakerCore) - maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( - queueConfig, resource_type, maker=workerMakerCore) - tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob)) - jobChunks = self.dbProxy.get_job_chunks_for_workers( - queueName, - nWorkers, nReady, None, nWorkersPerJob, - queueConfig.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - lockedBy, max_workers_per_job_in_total=maxWorkersPerJob, - max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle) - else: - tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType)) - continue - - tmpLog.debug('got {0} job chunks'.format(len(jobChunks))) - if len(jobChunks) == 0: - continue - # make workers - okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig, - nReady, resource_type, - maker=workerMakerCore) - if len(ngChunks) == 0: - tmpLog.debug('successfully made {0} workers'.format(len(okChunks))) - else: - tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks), - len(ngChunks))) - timeNow = datetime.datetime.utcnow() - timeNow_timestamp = time.time() - pandaIDs = set() - # NG (=not good) - for ngJobs in ngChunks: - for jobSpec in ngJobs: - if skipOnFail: - # release 
jobs when workers are not made - pandaIDs.add(jobSpec.PandaID) - else: - jobSpec.status = 'failed' - jobSpec.subStatus = 'failed_to_make' - jobSpec.stateChangeTime = timeNow - jobSpec.lockedBy = None - errStr = 'failed to make a worker' - jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) - jobSpec.trigger_propagation() - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': 'prepared'}) - # OK - workSpecList = [] - if len(okChunks) > 0: - for workSpec, okJobs in okChunks: - # has job - if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ - or queueConfig.mapType == WorkSpec.MT_NoJob: - workSpec.hasJob = 0 - else: - workSpec.hasJob = 1 - if workSpec.nJobsToReFill in [None, 0]: - workSpec.set_jobspec_list(okJobs) - else: - # refill free slots during the worker is running - workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill]) - workSpec.nJobsToReFill = None - for jobSpec in okJobs[workSpec.nJobsToReFill:]: - pandaIDs.add(jobSpec.PandaID) - workSpec.set_num_jobs_with_list() - # map type - workSpec.mapType = queueConfig.mapType - # queue name - workSpec.computingSite = queueConfig.queueName - # set access point - workSpec.accessPoint = queueConfig.messenger['accessPoint'] - # sync level - workSpec.syncLevel = queueConfig.get_synchronization_level() - # events - if len(okJobs) > 0 and \ - ('eventService' in okJobs[0].jobParams or - 'cloneJob' in okJobs[0].jobParams): - workSpec.eventsRequest = WorkSpec.EV_useEvents - workSpecList.append(workSpec) - if len(workSpecList) > 0: - sw = core_utils.get_stopwatch() - # get plugin for submitter - submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter) - if submitterCore is None: - # not found - tmpLog.error( - 'submitter plugin for {0} not found'.format(jobSpec.computingSite)) + skipOnFail = False + # actions based on mapping type + if queue_config.mapType == WorkSpec.MT_NoJob: + # workers without jobs + jobChunks = [] + for i in range(nWorkers): + jobChunks.append([]) + elif queue_config.mapType == WorkSpec.MT_OneToOne: + # one worker per one job + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, nReady, 1, None, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by) + elif queue_config.mapType == WorkSpec.MT_MultiJobs: + # one worker for multiple jobs + nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queue_config, + nWorkers, + job_type, + resource_type, + maker=workerMakerCore) + tmp_log.debug('nJobsPerWorker={0}'.format(nJobsPerWorker)) + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, nReady, nJobsPerWorker, None, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by, + queue_config.allowJobMixture) + elif queue_config.mapType == WorkSpec.MT_MultiWorkers: + # multiple workers for one job + nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queue_config, + nWorkers, + job_type, + resource_type, + maker=workerMakerCore) + maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( + queue_config, job_type, resource_type, maker=workerMakerCore) + maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( + queue_config, job_type, resource_type, maker=workerMakerCore) + tmp_log.debug('nWorkersPerJob={0}'.format(nWorkersPerJob)) + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, nReady, None, 
nWorkersPerJob, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by, max_workers_per_job_in_total=maxWorkersPerJob, + max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle) + else: + tmp_log.error('unknown mapType={0}'.format(queue_config.mapType)) continue - # get plugin for messenger - messenger = self.pluginFactory.get_plugin(queueConfig.messenger) - if messenger is None: - # not found - tmpLog.error( - 'messenger plugin for {0} not found'.format(jobSpec.computingSite)) + + tmp_log.debug('got {0} job chunks'.format(len(jobChunks))) + if len(jobChunks) == 0: continue - # setup access points - messenger.setup_access_points(workSpecList) - # feed jobs - for workSpec in workSpecList: - if workSpec.hasJob == 1: - tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list()) - if tmpStat is False: - tmpLog.error( - 'failed to send jobs to workerID={0}'.format(workSpec.workerID)) + # make workers + okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queue_config, + nReady, job_type, resource_type, + maker=workerMakerCore) + + if len(ngChunks) == 0: + tmp_log.debug('successfully made {0} workers'.format(len(okChunks))) + else: + tmp_log.debug('made {0} workers, while {1} workers failed'.format(len(okChunks), + len(ngChunks))) + timeNow = datetime.datetime.utcnow() + timeNow_timestamp = time.time() + pandaIDs = set() + # NG (=not good) + for ngJobs in ngChunks: + for job_spec in ngJobs: + if skipOnFail: + # release jobs when workers are not made + pandaIDs.add(job_spec.PandaID) else: - tmpLog.debug( - 'sent jobs to workerID={0} with {1}'.format(workSpec.workerID, - tmpStat)) - # insert workers - self.dbProxy.insert_workers(workSpecList, lockedBy) - # submit - sw.reset() - tmpLog.info('submitting {0} workers'.format(len(workSpecList))) - workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, - workSpecList) - tmpLog.debug('done submitting {0} workers'.format(len(workSpecList)) - + sw.get_elapsed_time()) - # collect successful jobs - okPandaIDs = set() - for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): - if tmpRet: - workSpec, jobList = okChunks[iWorker] - jobList = workSpec.get_jobspec_list() - if jobList is not None: - for jobSpec in jobList: - okPandaIDs.add(jobSpec.PandaID) - # loop over all workers - for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): - workSpec, jobList = okChunks[iWorker] - # set harvesterHost - workSpec.harvesterHost = socket.gethostname() - # use associated job list since it can be truncated for re-filling - jobList = workSpec.get_jobspec_list() - # set status - if not tmpRet: - # failed submission - errStr = 'failed to submit a workerID={0} with {1}'.format( - workSpec.workerID, - tmpStr) - tmpLog.error(errStr) - workSpec.set_status(WorkSpec.ST_missed) - workSpec.set_dialog_message(tmpStr) - workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) + job_spec.status = 'failed' + job_spec.subStatus = 'failed_to_make' + job_spec.stateChangeTime = timeNow + job_spec.lockedBy = None + errStr = 'failed to make a worker' + job_spec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) + job_spec.trigger_propagation() + self.dbProxy.update_job(job_spec, {'lockedBy': locked_by, + 'subStatus': 'prepared'}) + # OK + work_specList = [] + if len(okChunks) > 0: + for work_spec, okJobs in okChunks: + # has job + if (queue_config.useJobLateBinding and work_spec.workerID is None) \ + or queue_config.mapType == 
WorkSpec.MT_NoJob: + work_spec.hasJob = 0 + else: + work_spec.hasJob = 1 + if work_spec.nJobsToReFill in [None, 0]: + work_spec.set_jobspec_list(okJobs) + else: + # refill free slots during the worker is running + work_spec.set_jobspec_list(okJobs[:work_spec.nJobsToReFill]) + work_spec.nJobsToReFill = None + for job_spec in okJobs[work_spec.nJobsToReFill:]: + pandaIDs.add(job_spec.PandaID) + work_spec.set_num_jobs_with_list() + # map type + work_spec.mapType = queue_config.mapType + # queue name + work_spec.computingSite = queue_config.queueName + # set access point + work_spec.accessPoint = queue_config.messenger['accessPoint'] + # sync level + work_spec.syncLevel = queue_config.get_synchronization_level() + # events + if len(okJobs) > 0 and \ + ('eventService' in okJobs[0].jobParams or + 'cloneJob' in okJobs[0].jobParams): + work_spec.eventsRequest = WorkSpec.EV_useEvents + work_specList.append(work_spec) + if len(work_specList) > 0: + sw = core_utils.get_stopwatch() + # get plugin for submitter + submitterCore = self.pluginFactory.get_plugin(queue_config.submitter) + if submitterCore is None: + # not found + tmp_log.error( + 'submitter plugin for {0} not found'.format(job_spec.computingSite)) + continue + # get plugin for messenger + messenger = self.pluginFactory.get_plugin(queue_config.messenger) + if messenger is None: + # not found + tmp_log.error( + 'messenger plugin for {0} not found'.format(job_spec.computingSite)) + continue + # setup access points + messenger.setup_access_points(work_specList) + # feed jobs + for work_spec in work_specList: + if work_spec.hasJob == 1: + tmpStat = messenger.feed_jobs(work_spec, work_spec.get_jobspec_list()) + if tmpStat is False: + tmp_log.error( + 'failed to send jobs to workerID={0}'.format(work_spec.workerID)) + else: + tmp_log.debug( + 'sent jobs to workerID={0} with {1}'.format(work_spec.workerID, + tmpStat)) + # insert workers + self.dbProxy.insert_workers(work_specList, locked_by) + # submit + sw.reset() + tmp_log.info('submitting {0} workers'.format(len(work_specList))) + work_specList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, + work_specList) + tmp_log.debug('done submitting {0} workers'.format(len(work_specList)) + + sw.get_elapsed_time()) + # collect successful jobs + okPandaIDs = set() + for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): + if tmpRet: + work_spec, jobList = okChunks[iWorker] + jobList = work_spec.get_jobspec_list() + if jobList is not None: + for job_spec in jobList: + okPandaIDs.add(job_spec.PandaID) + # loop over all workers + for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): + work_spec, jobList = okChunks[iWorker] + # set harvesterHost + work_spec.harvesterHost = socket.gethostname() + # use associated job list since it can be truncated for re-filling + jobList = work_spec.get_jobspec_list() + # set status + if not tmpRet: + # failed submission + errStr = 'failed to submit a workerID={0} with {1}'.format( + work_spec.workerID, + tmpStr) + tmp_log.error(errStr) + work_spec.set_status(WorkSpec.ST_missed) + work_spec.set_dialog_message(tmpStr) + work_spec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) + work_spec.set_pilot_closed() + if jobList is not None: + # increment attempt number + newJobList = [] + for job_spec in jobList: + # skip if successful with another worker + if job_spec.PandaID in okPandaIDs: + continue + if job_spec.submissionAttempts is None: + job_spec.submissionAttempts = 0 + job_spec.submissionAttempts += 1 + # max attempt or 
permanent error + if tmpRet is False or \ + job_spec.submissionAttempts >= \ + queue_config.maxSubmissionAttempts: + newJobList.append(job_spec) + else: + self.dbProxy.increment_submission_attempt( + job_spec.PandaID, + job_spec.submissionAttempts) + jobList = newJobList + elif queue_config.useJobLateBinding and work_spec.hasJob == 1: + # directly go to running after feeding jobs for late biding + work_spec.set_status(WorkSpec.ST_running) + else: + # normal successful submission + work_spec.set_status(WorkSpec.ST_submitted) + work_spec.submitTime = timeNow + work_spec.modificationTime = timeNow + work_spec.checkTime = timeNow + if self.monitor_fifo.enabled: + work_spec.set_work_params({'lastCheckAt': timeNow_timestamp}) + # prefetch events + if tmpRet and work_spec.hasJob == 1 and \ + work_spec.eventsRequest == WorkSpec.EV_useEvents and \ + queue_config.prefetchEvents: + work_spec.eventsRequest = WorkSpec.EV_requestEvents + eventsRequestParams = dict() + for job_spec in jobList: + eventsRequestParams[job_spec.PandaID] = \ + {'pandaID': job_spec.PandaID, + 'taskID': job_spec.taskID, + 'jobsetID': job_spec.jobParams['jobsetID'], + 'nRanges': max(int(math.ceil(work_spec.nCore / len(jobList))), + job_spec.jobParams['coreCount']), + } + work_spec.eventsRequestParams = eventsRequestParams + # register worker + tmpStat = self.dbProxy.register_worker(work_spec, jobList, locked_by) if jobList is not None: - # increment attempt number - newJobList = [] - for jobSpec in jobList: - # skip if successful with another worker - if jobSpec.PandaID in okPandaIDs: - continue - if jobSpec.submissionAttempts is None: - jobSpec.submissionAttempts = 0 - jobSpec.submissionAttempts += 1 - # max attempt or permanent error - if tmpRet is False or \ - jobSpec.submissionAttempts >= \ - queueConfig.maxSubmissionAttempts: - newJobList.append(jobSpec) + for job_spec in jobList: + pandaIDs.add(job_spec.PandaID) + if tmpStat: + if tmpRet: + tmpStr = \ + 'submitted a workerID={0} for PandaID={1} with batchID={2}' + tmp_log.info(tmpStr.format(work_spec.workerID, + job_spec.PandaID, + work_spec.batchID)) + else: + tmpStr = 'failed to submit a workerID={0} for PandaID={1}' + tmp_log.error(tmpStr.format(work_spec.workerID, + job_spec.PandaID)) else: - self.dbProxy.increment_submission_attempt( - jobSpec.PandaID, - jobSpec.submissionAttempts) - jobList = newJobList - elif queueConfig.useJobLateBinding and workSpec.hasJob == 1: - # directly go to running after feeding jobs for late biding - workSpec.set_status(WorkSpec.ST_running) - else: - # normal successful submission - workSpec.set_status(WorkSpec.ST_submitted) - workSpec.submitTime = timeNow - workSpec.modificationTime = timeNow - workSpec.checkTime = timeNow - if self.monitor_fifo.enabled: - workSpec.set_work_params({'lastCheckAt': timeNow_timestamp}) - # prefetch events - if tmpRet and workSpec.hasJob == 1 and \ - workSpec.eventsRequest == WorkSpec.EV_useEvents and \ - queueConfig.prefetchEvents: - workSpec.eventsRequest = WorkSpec.EV_requestEvents - eventsRequestParams = dict() - for jobSpec in jobList: - eventsRequestParams[jobSpec.PandaID] = \ - {'pandaID': jobSpec.PandaID, - 'taskID': jobSpec.taskID, - 'jobsetID': jobSpec.jobParams['jobsetID'], - 'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))), - jobSpec.jobParams['coreCount']), - } - workSpec.eventsRequestParams = eventsRequestParams - # register worker - tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy) - if jobList is not None: - for jobSpec in jobList: - pandaIDs.add(jobSpec.PandaID) 
- if tmpStat: - if tmpRet: tmpStr = \ - 'submitted a workerID={0} for PandaID={1} with batchID={2}' - tmpLog.info(tmpStr.format(workSpec.workerID, - jobSpec.PandaID, - workSpec.batchID)) - else: - tmpStr = 'failed to submit a workerID={0} for PandaID={1}' - tmpLog.error(tmpStr.format(workSpec.workerID, - jobSpec.PandaID)) - else: - tmpStr = \ - 'failed to register a worker for PandaID={0} with batchID={1}' - tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID)) - # enqueue to monitor fifo - if self.monitor_fifo.enabled \ - and queueConfig.mapType != WorkSpec.MT_MultiWorkers: - workSpecsToEnqueue = \ - [[w] for w in workSpecList if w.status - in (WorkSpec.ST_submitted, WorkSpec.ST_running)] - check_delay = min( - getattr(harvester_config.monitor, 'eventBasedCheckInterval', - harvester_config.monitor.checkInterval), - getattr(harvester_config.monitor, 'fifoCheckInterval', - harvester_config.monitor.checkInterval)) - monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay) - mainLog.debug('put workers to monitor FIFO') - submitted = True - # release jobs - self.dbProxy.release_jobs(pandaIDs, lockedBy) - tmpLog.info('done') - except Exception: - core_utils.dump_error_message(tmpLog) + 'failed to register a worker for PandaID={0} with batchID={1}' + tmp_log.error(tmpStr.format(job_spec.PandaID, work_spec.batchID)) + # enqueue to monitor fifo + if self.monitor_fifo.enabled \ + and queue_config.mapType != WorkSpec.MT_MultiWorkers: + work_specsToEnqueue = \ + [[w] for w in work_specList if w.status + in (WorkSpec.ST_submitted, WorkSpec.ST_running)] + check_delay = min( + getattr(harvester_config.monitor, 'eventBasedCheckInterval', + harvester_config.monitor.checkInterval), + getattr(harvester_config.monitor, 'fifoCheckInterval', + harvester_config.monitor.checkInterval)) + monitor_fifo.put((queue_name, work_specsToEnqueue), time.time() + check_delay) + main_log.debug('put workers to monitor FIFO') + submitted = True + # release jobs + self.dbProxy.release_jobs(pandaIDs, locked_by) + tmp_log.info('done') + except Exception: + core_utils.dump_error_message(tmp_log) # release the site - self.dbProxy.release_site(siteName, lockedBy) - if sw_main.get_elapsed_time_in_sec() > queueLockInterval: - mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval) + self.dbProxy.release_site(site_name, locked_by) + if sw_main.get_elapsed_time_in_sec() > queue_lock_interval: + main_log.warning('a submitter cycle was longer than queue_lock_interval {0} sec'.format(queue_lock_interval) + sw_main.get_elapsed_time()) - mainLog.debug('done') + main_log.debug('done') # define sleep interval - if siteName is None: + if site_name is None: sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 @@ -413,13 +419,13 @@ def run(self): interval = harvester_config.submitter.minSubmissionInterval if interval > 0: newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval) - self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName) + self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=site_name) # time the cycle - mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time()) + main_log.debug('done a submitter cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(sleepTime): - mainLog.debug('terminated') + main_log.debug('terminated') return # wrapper for submitWorkers to skip ready workers @@ -428,13 +434,13 @@ def submit_workers(self, 
submitter_core, workspec_list): strList = [] newSpecList = [] workersToSubmit = [] - for workSpec in workspec_list: - if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]: - newSpecList.append(workSpec) + for work_spec in workspec_list: + if work_spec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]: + newSpecList.append(work_spec) retList.append(True) strList.append('') else: - workersToSubmit.append(workSpec) + workersToSubmit.append(work_spec) tmpRetList = submitter_core.submit_workers(workersToSubmit) # submit the workers to the monitoring diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 22fd057e..bc27a116 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -15,11 +15,11 @@ class WorkerAdjuster(object): # constructor def __init__(self, queue_config_mapper): - self.queueConfigMapper = queue_config_mapper + self.queue_configMapper = queue_config_mapper self.pluginFactory = PluginFactory() self.dbProxy = DBProxy() self.throttlerMap = dict() - self.apf_mon = Apfmon(self.queueConfigMapper) + self.apf_mon = Apfmon(self.queue_configMapper) try: self.maxNewWorkers = harvester_config.submitter.maxNewWorkers except AttributeError: @@ -27,17 +27,17 @@ def __init__(self, queue_config_mapper): # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name): - tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') - tmpLog.debug('start') - tmpLog.debug('static_num_workers: {0}'.format(static_num_workers)) + tmp_log = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') + tmp_log.debug('start') + tmp_log.debug('static_num_workers: {0}'.format(static_num_workers)) dyn_num_workers = copy.deepcopy(static_num_workers) try: # get queue status - queueStat = self.dbProxy.get_cache("panda_queues.json", None) - if queueStat is None: - queueStat = dict() + queue_stat = self.dbProxy.get_cache("panda_queues.json", None) + if queue_stat is None: + queue_stat = dict() else: - queueStat = queueStat.data + queue_stat = queue_stat.data # get job statistics job_stats = self.dbProxy.get_cache("job_statistics.json", None) @@ -47,196 +47,211 @@ def define_num_workers(self, static_num_workers, site_name): job_stats = job_stats.data # define num of new workers - for queueName in static_num_workers: + for queue_name in static_num_workers: # get queue - queueConfig = self.queueConfigMapper.get_queue(queueName) - workerLimits_dict = self.dbProxy.get_worker_limits(queueName) - maxWorkers = workerLimits_dict.get('maxWorkers', 0) - nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0) - nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT'] - nQueue_total, nReady_total, nRunning_total = 0, 0, 0 + queue_config = self.queue_configMapper.get_queue(queue_name) + worker_limits_dict = self.dbProxy.get_worker_limits(queue_name) + max_workers = worker_limits_dict.get('maxWorkers', 0) + n_queue_limit = worker_limits_dict.get('nQueueLimitWorker', 0) + n_queue_limit_per_rt = worker_limits_dict['nQueueLimitWorkerPerRT'] + n_queue_total, n_ready_total, n_running_total = 0, 0, 0 apf_msg = None apf_data = None - for resource_type, tmpVal in iteritems(static_num_workers[queueName]): - tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'. 
- format(queueName, resource_type, tmpVal)) - - # set 0 to num of new workers when the queue is disabled - if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby', - 'maintenance']: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status']) - tmpLog.debug(retMsg) - apf_msg = 'Not submitting workers since queue status = {0}'.format(queueStat[queueName]['status']) - continue - - # protection against not-up-to-date queue config - if queueConfig is None: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - retMsg = 'set nNewWorkers=0 due to missing queueConfig' - tmpLog.debug(retMsg) - apf_msg = 'Not submitting workers because of missing queueConfig' - continue - - # get throttler - if queueName not in self.throttlerMap: - if hasattr(queueConfig, 'throttler'): - throttler = self.pluginFactory.get_plugin(queueConfig.throttler) - else: - throttler = None - self.throttlerMap[queueName] = throttler - - # check throttler - throttler = self.throttlerMap[queueName] - if throttler is not None: - toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig) - if toThrottle: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg) - tmpLog.debug(retMsg) + for job_type, jt_values in iteritems(static_num_workers[queue_name]): + for resource_type, tmp_val in iteritems(jt_values): + tmp_log.debug('Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}'. + format(queue_name, job_type, resource_type, tmp_val)) + + # set 0 to num of new workers when the queue is disabled + if queue_name in queue_stat and queue_stat[queue_name]['status'] in ['offline', 'standby', + 'maintenance']: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + ret_msg = 'set n_new_workers=0 since status={0}'.format(queue_stat[queue_name]['status']) + tmp_log.debug(ret_msg) + apf_msg = 'Not submitting workers since queue status = {0}'.format(queue_stat[queue_name]['status']) continue - # check stats - nQueue = tmpVal['nQueue'] - nReady = tmpVal['nReady'] - nRunning = tmpVal['nRunning'] - if resource_type != 'ANY': - nQueue_total += nQueue - nReady_total += nReady - nRunning_total += nRunning - if queueConfig.runMode == 'slave': - nNewWorkersDef = tmpVal['nNewWorkers'] - if nNewWorkersDef == 0: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - retMsg = 'set nNewWorkers=0 by panda in slave mode' - tmpLog.debug(retMsg) + # protection against not-up-to-date queue config + if queue_config is None: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + ret_msg = 'set n_new_workers=0 due to missing queue_config' + tmp_log.debug(ret_msg) + apf_msg = 'Not submitting workers because of missing queue_config' continue - else: - nNewWorkersDef = None - - # define num of new workers based on static site config - nNewWorkers = 0 - if nQueue >= nQueueLimitPerRT > 0: - # enough queued workers - retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(nQueue, nQueueLimitPerRT) - tmpLog.debug(retMsg) - pass - elif (nQueue + nReady + nRunning) >= maxWorkers > 0: - # enough workers in the system - retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue, - nReady, - nRunning) - retMsg += '>= maxWorkers({0})'.format(maxWorkers) - tmpLog.debug(retMsg) - pass - else: - - maxQueuedWorkers = None - - if 
nQueueLimitPerRT > 0: # there is a limit set for the queue - maxQueuedWorkers = nQueueLimitPerRT - - # Reset the maxQueueWorkers according to particular - if nNewWorkersDef is not None: # don't surpass limits given centrally - maxQueuedWorkers_slave = nNewWorkersDef + nQueue - if maxQueuedWorkers is not None: - maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers) + + # get throttler + if queue_name not in self.throttlerMap: + if hasattr(queue_config, 'throttler'): + throttler = self.pluginFactory.get_plugin(queue_config.throttler) else: - maxQueuedWorkers = maxQueuedWorkers_slave - - elif queueConfig.mapType == 'NoJob': # for pull mode, limit to activated jobs - # limit the queue to the number of activated jobs to avoid empty pilots - try: - n_activated = max(job_stats[queueName]['activated'], 1) # avoid no activity queues - queue_limit = maxQueuedWorkers - maxQueuedWorkers = min(n_activated, maxQueuedWorkers) - tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'. - format(n_activated, queue_limit)) - except KeyError: - tmpLog.warning('n_activated not defined, defaulting to configured queue limits') - pass - - if maxQueuedWorkers is None: # no value found, use default value - maxQueuedWorkers = 1 - - # new workers - nNewWorkers = max(maxQueuedWorkers - nQueue, 0) - tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation' - .format(nNewWorkers)) - if maxWorkers > 0: - nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0)) - tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers' - .format(nNewWorkers)) - if queueConfig.maxNewWorkersPerCycle > 0: - nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle) - tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle' - .format(nNewWorkers)) - if self.maxNewWorkers is not None and self.maxNewWorkers > 0: - nNewWorkers = min(nNewWorkers, self.maxNewWorkers) - tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers' - .format(nNewWorkers)) - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers - - # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers - if queueConfig is None: - maxNewWorkersPerCycle = 0 - retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig' - tmpLog.debug(retMsg) + throttler = None + self.throttlerMap[queue_name] = throttler + + # check throttler + throttler = self.throttlerMap[queue_name] + if throttler is not None: + to_throttle, tmp_msg = throttler.to_be_throttled(queue_config) + if to_throttle: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + ret_msg = 'set n_new_workers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmp_msg) + tmp_log.debug(ret_msg) + continue + + # check stats + n_queue = tmp_val['nQueue'] + n_ready = tmp_val['nReady'] + n_running = tmp_val['nRunning'] + if resource_type != 'ANY' and job_type != 'ANY' and job_type is not None: + n_queue_total += n_queue + n_ready_total += n_ready + n_running_total += n_running + if queue_config.runMode == 'slave': + n_new_workers_def = tmp_val['nNewWorkers'] + if n_new_workers_def == 0: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + ret_msg = 'set n_new_workers=0 by panda in slave mode' + tmp_log.debug(ret_msg) + continue + else: + n_new_workers_def = None + + # define num of new workers based on static site config + n_new_workers = 0 + if n_queue >= 
n_queue_limit_per_rt > 0: + # enough queued workers + ret_msg = 'No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})'.format(n_queue, + n_queue_limit_per_rt) + tmp_log.debug(ret_msg) + pass + elif (n_queue + n_ready + n_running) >= max_workers > 0: + # enough workers in the system + ret_msg = 'No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) '.format(n_queue, + n_ready, + n_running) + ret_msg += '>= max_workers({0})'.format(max_workers) + tmp_log.debug(ret_msg) + pass + else: + + max_queued_workers = None + + if n_queue_limit_per_rt > 0: # there is a limit set for the queue + max_queued_workers = n_queue_limit_per_rt + + # Reset the maxQueueWorkers according to particular + if n_new_workers_def is not None: # don't surpass limits given centrally + maxQueuedWorkers_slave = n_new_workers_def + n_queue + if max_queued_workers is not None: + max_queued_workers = min(maxQueuedWorkers_slave, max_queued_workers) + else: + max_queued_workers = maxQueuedWorkers_slave + + elif queue_config.mapType == 'NoJob': # for pull mode, limit to activated jobs + # limit the queue to the number of activated jobs to avoid empty pilots + try: + n_activated = max(job_stats[queue_name]['activated'], 1) # avoid no activity queues + queue_limit = max_queued_workers + max_queued_workers = min(n_activated, max_queued_workers) + tmp_log.debug('limiting max_queued_workers to min(n_activated={0}, queue_limit={1})'. + format(n_activated, queue_limit)) + except KeyError: + tmp_log.warning('n_activated not defined, defaulting to configured queue limits') + pass + + if max_queued_workers is None: # no value found, use default value + max_queued_workers = 1 + + # new workers + n_new_workers = max(max_queued_workers - n_queue, 0) + tmp_log.debug('setting n_new_workers to {0} in max_queued_workers calculation' + .format(n_new_workers)) + if max_workers > 0: + n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0)) + tmp_log.debug('setting n_new_workers to {0} to respect max_workers' + .format(n_new_workers)) + if queue_config.maxNewWorkersPerCycle > 0: + n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle) + tmp_log.debug('setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle' + .format(n_new_workers)) + if self.maxNewWorkers is not None and self.maxNewWorkers > 0: + n_new_workers = min(n_new_workers, self.maxNewWorkers) + tmp_log.debug('setting n_new_workers to {0} in order to respect universal maxNewWorkers' + .format(n_new_workers)) + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers + + # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers + if queue_config is None: + max_new_workers_per_cycle = 0 + ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config' + tmp_log.debug(ret_msg) else: - maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle - if len(dyn_num_workers[queueName]) > 1: - total_new_workers_rts = sum( dyn_num_workers[queueName][_rt]['nNewWorkers'] - if _rt != 'ANY' else 0 - for _rt in dyn_num_workers[queueName] ) - nNewWorkers_max_agg = min( - max(nQueueLimit - nQueue_total, 0), - max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0), - ) - if maxNewWorkersPerCycle >= 0: - nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle) + max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle + if len(dyn_num_workers[queue_name]) > 1: + total_new_workers_rts = 0 + for _jt 
in dyn_num_workers[queue_name]: + for _rt in dyn_num_workers[queue_name][_jt]: + if _jt != 'ANY' and _rt != 'ANY': + total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt]['nNewWorkers'] + n_new_workers_max_agg = min(max(n_queue_limit - n_queue_total, 0), + max(max_workers - n_queue_total - n_ready_total - n_running_total, 0)) + if max_new_workers_per_cycle >= 0: + n_new_workers_max_agg = min(n_new_workers_max_agg, max_new_workers_per_cycle) if self.maxNewWorkers is not None and self.maxNewWorkers > 0: - nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers) + n_new_workers_max_agg = min(n_new_workers_max_agg, self.maxNewWorkers) + # exceeded max, to adjust - if total_new_workers_rts > nNewWorkers_max_agg: - if nNewWorkers_max_agg == 0: - for resource_type in dyn_num_workers[queueName]: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE') + if total_new_workers_rts > n_new_workers_max_agg: + if n_new_workers_max_agg == 0: + for job_type in dyn_num_workers[queue_name]: + for resource_type in dyn_num_workers[queue_name][job_type]: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + tmp_log.debug('No n_new_workers since n_new_workers_max_agg=0 for UCORE') else: - tmpLog.debug('nNewWorkers_max_agg={0} for UCORE'.format(nNewWorkers_max_agg)) - _d = dyn_num_workers[queueName].copy() + tmp_log.debug('n_new_workers_max_agg={0} for UCORE'.format(n_new_workers_max_agg)) + _d = dyn_num_workers[queue_name].copy() del _d['ANY'] - simple_rt_nw_list = [ [_rt, _d[_rt].get('nNewWorkers', 0), 0] for _rt in _d ] - _countdown = nNewWorkers_max_agg + + # TODO: needs to be recalculated + simple_rt_nw_list = [] + for job_type in _d: # jt: job type + for resource_type in _d[job_type]: # rt: resource type + simple_rt_nw_list.append([resource_type, _d[job_type][resource_type].get('nNewWorkers', 0), 0]) + + _countdown = n_new_workers_max_agg for _rt_list in simple_rt_nw_list: - resource_type, nNewWorkers_orig, _r = _rt_list - nNewWorkers, remainder = divmod(nNewWorkers_orig*nNewWorkers_max_agg, total_new_workers_rts) - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers + resource_type, n_new_workers_orig, _r = _rt_list + n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, + total_new_workers_rts) + dyn_num_workers[queue_name][job_type].setdefault(resource_type, + {'nReady': 0, 'nRunning': 0, + 'nQueue': 0, 'nNewWorkers': 0}) + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers _rt_list[2] = remainder - _countdown -= nNewWorkers + _countdown -= n_new_workers _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) - for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list: + for resource_type, n_new_workers_orig, remainder in sorted_rt_nw_list: if _countdown <= 0: break - dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1 + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] += 1 _countdown -= 1 - for resource_type in dyn_num_workers[queueName]: - if resource_type == 'ANY': - continue - nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers'] - tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE' - .format(nNewWorkers, resource_type)) + for job_type in dyn_num_workers[queue_name]: + for resource_type in 
dyn_num_workers[queue_name][job_type]: + if job_type == 'ANY' or resource_type == 'ANY': + continue + n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] + tmp_log.debug('setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE' + .format(n_new_workers, job_type, resource_type)) if not apf_msg: - apf_data = copy.deepcopy(dyn_num_workers[queueName]) + apf_data = copy.deepcopy(dyn_num_workers[queue_name]) - self.apf_mon.update_label(queueName, apf_msg, apf_data) + self.apf_mon.update_label(queue_name, apf_msg, apf_data) # dump - tmpLog.debug('defined {0}'.format(str(dyn_num_workers))) + tmp_log.debug('defined {0}'.format(str(dyn_num_workers))) return dyn_num_workers except Exception: # dump error - errMsg = core_utils.dump_error_message(tmpLog) + err_msg = core_utils.dump_error_message(tmp_log) return None diff --git a/pandaharvester/harvesterbody/worker_maker.py b/pandaharvester/harvesterbody/worker_maker.py index 7557ea5f..8aae3440 100644 --- a/pandaharvester/harvesterbody/worker_maker.py +++ b/pandaharvester/harvesterbody/worker_maker.py @@ -18,8 +18,8 @@ def get_plugin(self, queue_config): return self.pluginFactory.get_plugin(queue_config.workerMaker) # make workers - def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None): - tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName, resource_type), + def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, maker=None): + tmpLog = core_utils.make_logger(_logger, 'queue={0} jtype={1} rtype={2}'.format(queue_config.queueName, job_type, resource_type), method_name='make_workers') tmpLog.debug('start') try: @@ -38,7 +38,7 @@ def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, make for iChunk, jobChunk in enumerate(jobchunk_list): # make a worker if iChunk >= n_ready: - workSpec = maker.make_worker(jobChunk, queue_config, resource_type) + workSpec = maker.make_worker(jobChunk, queue_config, job_type, resource_type) else: # use ready worker if iChunk < len(readyWorkers): @@ -65,35 +65,35 @@ def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, make return [], jobchunk_list # get number of jobs per worker - def get_num_jobs_per_worker(self, queue_config, n_workers, resource_type, maker=None): + def get_num_jobs_per_worker(self, queue_config, n_workers, job_type, resource_type, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_num_jobs_per_worker(n_workers) # get number of workers per job - def get_num_workers_per_job(self, queue_config, n_workers, resource_type, maker=None): + def get_num_workers_per_job(self, queue_config, n_workers, job_type, resource_type, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_num_workers_per_job(n_workers) # check number of ready resources - def num_ready_resources(self, queue_config, resource_type, maker=None): + def num_ready_resources(self, queue_config, job_type, resource_type, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.num_ready_resources() # get upper limit on the cumulative total of workers per job - def get_max_workers_per_job_in_total(self, queue_config, resource_type, maker=None): + def get_max_workers_per_job_in_total(self, queue_config, job_type, 
resource_type, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_max_workers_per_job_in_total() # get upper limit on the number of new workers per job in a cycle - def get_max_workers_per_job_per_cycle(self, queue_config, resource_type, maker=None): + def get_max_workers_per_job_per_cycle(self, queue_config, job_type, resource_type, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) diff --git a/pandaharvester/harvestercloud/k8s_startup_script.py b/pandaharvester/harvestercloud/k8s_startup_script.py index c9c14487..be5bd8cd 100644 --- a/pandaharvester/harvestercloud/k8s_startup_script.py +++ b/pandaharvester/harvestercloud/k8s_startup_script.py @@ -1,13 +1,19 @@ #!/usr/bin/env python + +######################################################## +# OBSOLETE!!! USE PILOTS_STARTER.PY +######################################################## + """ -This script will be executed at the VM startup time. -- It will download the proxy and panda queue from Google instance metadata +This script will be executed at container startup +- It will retrieve the proxy and panda queue from the environment - It will download the pilot wrapper from github and execute it -- It will upload the pilot logs to panda cache +- It will upload the pilot logs to panda cache at the end + +post-multipart code was taken from: https://github.com/haiwen/webapi-examples/blob/master/python/upload-file.py """ -import requests try: import subprocess32 as subprocess except Exception: @@ -15,30 +21,74 @@ import os import sys import logging -import time import traceback -from threading import Thread +import httplib +import mimetypes +import ssl +import urlparse +import urllib2 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', filename='/tmp/vm_script.log', filemode='w') -global loop -loop = True +def post_multipart(host, port, selector, files, proxy_cert): + """ + Post files to an http host as multipart/form-data. + files is a sequence of (name, filename, value) elements for data to be uploaded as files + Return the server's response page. 
+ """ + content_type, body = encode_multipart_formdata(files) -def upload_logs(url, log_file_name, destination_name, proxy_path): - try: - # open and compress the content of the file - with open(log_file_name, 'rb') as log_file_object: - files = {'file': (destination_name, log_file_object.read())} + context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + context.load_cert_chain(certfile=proxy_cert, keyfile=proxy_cert) + + h = httplib.HTTPSConnection(host, port, context=context, timeout=180) - cert = [proxy_path, proxy_path] - # verify = '/etc/grid-security/certificates' # not supported in CernVM - requests.exceptions.SSLError: [Errno 21] Is a directory + h.putrequest('POST', selector) + h.putheader('content-type', content_type) + h.putheader('content-length', str(len(body))) + h.endheaders() + h.send(body) + response = h.getresponse() + return response.status, response.reason + + +def encode_multipart_formdata(files): + """ + files is a sequence of (name, filename, value) elements for data to be uploaded as files + Return (content_type, body) ready for httplib.HTTP instance + """ + BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' + CRLF = '\r\n' + L = [] + for (key, filename, value) in files: + L.append('--' + BOUNDARY) + L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) + L.append('Content-Type: %s' % get_content_type(filename)) + L.append('') + L.append(value) + L.append('--' + BOUNDARY + '--') + L.append('') + body = CRLF.join(L) + content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + return content_type, body + + +def get_content_type(filename): + return mimetypes.guess_type(filename)[0] or 'application/octet-stream' + + +def upload_logs(url, log_file_name, destination_name, proxy_cert): + try: + full_url = url + '/putFile' + urlparts = urlparse.urlsplit(full_url) logging.debug('[upload_logs] start') - res = requests.post(url + '/putFile', files=files, timeout=180, verify=False, cert=cert) - logging.debug('[upload_logs] finished with code={0} msg={1}'.format(res.status_code, res.text)) - if res.status_code == 200: + files = [('file', destination_name, open(log_file_name).read())] + status, reason = post_multipart(urlparts.hostname, urlparts.port, urlparts.path, files, proxy_cert) + logging.debug('[upload_logs] finished with code={0} msg={1}'.format(status, reason)) + if status == 200: return True except Exception: err_type, err_value = sys.exc_info()[:2] @@ -49,51 +99,14 @@ def upload_logs(url, log_file_name, destination_name, proxy_path): return False -def contact_harvester(harvester_frontend, data, auth_token, proxy_path): - try: - headers = {'Content-Type': 'application/json', - 'Authorization': 'Bearer {0}'.format(auth_token)} - cert = [proxy_path, proxy_path] - #verify = '/etc/grid-security/certificates' # not supported in CernVM - requests.exceptions.SSLError: [Errno 21] Is a directory - verify = False - resp = requests.post(harvester_frontend, json=data, headers=headers, cert=cert, verify=verify) - logging.debug('[contact_harvester] harvester returned: {0}'.format(resp.text)) - except Exception as e: - # message could not be sent - logging.debug('[contact_harvester] failed to send message to harvester: {0}'.format(e)) - pass - - -def heartbeat(harvester_frontend, worker_id, auth_token, proxy_path): - data = {'methodName': 'heartbeat', 'workerID': worker_id, 'data': None} - logging.debug('[heartbeat] sending heartbeat to harvester: {0}'.format(data)) - return contact_harvester(harvester_frontend, data, auth_token, proxy_path) - - 
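# ---------------------------------------------------------------------------
# Illustrative sketch of the log-upload path introduced above: the patch
# replaces the requests-based upload with a hand-built multipart/form-data
# POST over HTTPS, authenticated with the VOMS proxy file. The condensed,
# self-contained form below follows the same pattern; the cache URL, file
# names and proxy path are hypothetical placeholders, not values from the
# patch.
import httplib
import ssl
import urlparse


def upload_file_with_proxy(cache_url, local_path, remote_name, proxy_cert):
    # cache_url is something like 'https://<panda-cache-host>:25443/cache' (hypothetical)
    parts = urlparse.urlsplit(cache_url + '/putFile')

    # build the multipart/form-data body by hand, as encode_multipart_formdata does
    boundary = '----------sketch_boundary_$'
    body = '\r\n'.join([
        '--' + boundary,
        'Content-Disposition: form-data; name="file"; filename="%s"' % remote_name,
        'Content-Type: application/octet-stream',
        '',
        open(local_path).read(),
        '--' + boundary + '--',
        '',
    ])

    # authenticate the HTTPS connection with the proxy used as both cert and key
    context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
    context.load_cert_chain(certfile=proxy_cert, keyfile=proxy_cert)
    conn = httplib.HTTPSConnection(parts.hostname, parts.port, context=context, timeout=180)

    conn.putrequest('POST', parts.path)
    conn.putheader('content-type', 'multipart/form-data; boundary=%s' % boundary)
    conn.putheader('content-length', str(len(body)))
    conn.endheaders()
    conn.send(body)
    response = conn.getresponse()
    return response.status, response.reason

# usage (hypothetical values):
#   status, reason = upload_file_with_proxy('https://pandacache.example.cern.ch:25443/cache',
#                                           '/tmp/wrapper-wid.log', 'harvester_123_456.out',
#                                           '/tmp/x509up')
# ---------------------------------------------------------------------------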
-def suicide(harvester_frontend, worker_id, auth_token, proxy_path): - data = {'methodName': 'killWorker', 'workerID': worker_id, 'data': None} - logging.debug('[suicide] sending suicide message to harvester: {0}'.format(data)) - return contact_harvester(harvester_frontend, data, auth_token, proxy_path) - - -def heartbeat_loop(harvester_frontend, worker_id, auth_token, proxy_path): - while loop: - heartbeat(harvester_frontend, worker_id, auth_token, proxy_path) - time.sleep(300) - - def get_url(url, headers=None): """ get content from specified URL + TODO: error handling """ - - reply = requests.get(url, headers=headers) - if reply.status_code != 200: - logging.debug('[get_attribute] Failed to open {0}'.format(url)) - return None - else: - return reply.content - + response = urllib2.urlopen(wrapper_url) + content = response.read() + return content def get_configuration(): @@ -126,20 +139,10 @@ def get_configuration(): resource_type = os.environ.get('resourceType') logging.debug('[main] got resource type: {0}'.format(resource_type)) - # get the harvester frontend URL, where we'll send heartbeats - # harvester_frontend_url = METADATA_URL.format("harvester_frontend") - harvester_frontend = None - # logging.debug('[main] got harvester frontend: {0}'.format(harvester_frontend)) - # get the worker id worker_id = os.environ.get('workerID') logging.debug('[main] got worker id: {0}'.format(worker_id)) - # get the authentication token - # auth_token_url = METADATA_URL.format("auth_token") - auth_token = None - # logging.debug('[main] got authentication token') - # get the URL (e.g. panda cache) to upload logs logs_frontend_w = os.environ.get('logs_frontend_w') logging.debug('[main] got url to upload logs') @@ -148,17 +151,13 @@ def get_configuration(): logs_frontend_r = os.environ.get('logs_frontend_r') logging.debug('[main] got url to download logs') - return proxy_path, panda_site, panda_queue, resource_type, harvester_frontend, worker_id, auth_token, logs_frontend_w, logs_frontend_r + return proxy_path, panda_site, panda_queue, resource_type, worker_id, logs_frontend_w, logs_frontend_r if __name__ == "__main__": - # get all the configuration from the GCE metadata server - proxy_path, panda_site, panda_queue, resource_type, harvester_frontend, worker_id, auth_token, logs_frontend_w, logs_frontend_r = get_configuration() - - # start a separate thread that will send a heartbeat to harvester every 5 minutes - # heartbeat_thread = Thread(target=heartbeat_loop, args=(harvester_frontend, worker_id, auth_token, proxy_path)) - # heartbeat_thread.start() + # get all the configuration from the environment + proxy_path, panda_site, panda_queue, resource_type, worker_id, logs_frontend_w, logs_frontend_r = get_configuration() # the pilot should propagate the download link via the pilotId field in the job table destination_name = '{0}.out'.format(worker_id) @@ -166,8 +165,8 @@ def get_configuration(): os.environ['GTAG'] = log_download_url # GTAG env variable is read by pilot # get the pilot wrapper - wrapper_path = "/tmp/runpilot3-wrapper.sh" - wrapper_url = "https://raw.githubusercontent.com/fbarreir/adc/master/runpilot3-wrapper.sh" + wrapper_path = "/tmp/runpilot2-wrapper.sh" + wrapper_url = "https://raw.githubusercontent.com/PanDAWMS/pilot-wrapper/master/runpilot2-wrapper.sh" wrapper_string = get_url(wrapper_url) with open(wrapper_path, "w") as wrapper_file: wrapper_file.write(wrapper_string) @@ -178,21 +177,16 @@ def get_configuration(): logging.debug('[main] starting pilot wrapper...') resource_type_option 
= '' if resource_type: - resource_type_option = '-R {0}'.format(resource_type) - wrapper_params = '-s {0} -h {1} {2}'.format(panda_site, panda_queue, resource_type_option) + resource_type_option = '--resource-type {0}'.format(resource_type) + wrapper_params = '-s {0} -r {1} -q {2} {3}'.format(panda_site, panda_queue, panda_queue, resource_type_option) if 'ANALY' in panda_queue: - wrapper_params = '{0} -u user'.format(wrapper_params) + wrapper_params = '{0} -j user'.format(wrapper_params) else: - wrapper_params = '{0} -u managed'.format(wrapper_params) - command = "/tmp/runpilot3-wrapper.sh {0} -p 25443 -w https://pandaserver.cern.ch >& /tmp/wrapper-wid.log".\ + wrapper_params = '{0} -j managed'.format(wrapper_params) + command = "/tmp/runpilot2-wrapper.sh {0} -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode=PULL --allow-same-user=False >& /tmp/wrapper-wid.log".\ format(wrapper_params, worker_id) subprocess.call(command, shell=True) logging.debug('[main] pilot wrapper done...') # upload logs to e.g. panda cache or similar upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) - - # ask harvester to kill the VM and stop the heartbeat - # suicide(harvester_frontend, worker_id, auth_token, proxy_path) - loop = False - # heartbeat_thread.join() diff --git a/pandaharvester/harvestercloud/pilots_starter.py b/pandaharvester/harvestercloud/pilots_starter.py new file mode 100644 index 00000000..ab93ea77 --- /dev/null +++ b/pandaharvester/harvestercloud/pilots_starter.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python + +""" +This script will be executed at container startup +- It will retrieve the proxy and panda queue from the environment +- It will download the pilot wrapper from github and execute it +- It will upload the pilot logs to panda cache at the end + +post-multipart code was taken from: https://github.com/haiwen/webapi-examples/blob/master/python/upload-file.py +""" + +try: + import subprocess32 as subprocess +except Exception: + import subprocess +import os +import sys +import shutil +import logging +import httplib +import mimetypes +import ssl +import urlparse +import urllib2 +import traceback + +WORK_DIR = '/scratch' +CONFIG_DIR = '/scratch/jobconfig' +PJD = 'pandaJobData.out' +PFC = 'PoolFileCatalog_H.xml' +CONFIG_FILES = [PJD, PFC] + +logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', stream=sys.stdout) + + +# handlers=[logging.FileHandler('/tmp/vm_script.log'), logging.StreamHandler(sys.stdout)]) +# filename='/tmp/vm_script.log', filemode='w') + + +def post_multipart(host, port, selector, files, proxy_cert): + """ + Post files to an http host as multipart/form-data. + files is a sequence of (name, filename, value) elements for data to be uploaded as files + Return the server's response page. 
+ """ + content_type, body = encode_multipart_formdata(files) + + context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + context.load_cert_chain(certfile=proxy_cert, keyfile=proxy_cert) + + h = httplib.HTTPSConnection(host, port, context=context, timeout=180) + + h.putrequest('POST', selector) + h.putheader('content-type', content_type) + h.putheader('content-length', str(len(body))) + h.endheaders() + h.send(body) + response = h.getresponse() + return response.status, response.reason + + +def encode_multipart_formdata(files): + """ + files is a sequence of (name, filename, value) elements for data to be uploaded as files + Return (content_type, body) ready for httplib.HTTP instance + """ + BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' + CRLF = '\r\n' + L = [] + for (key, filename, value) in files: + L.append('--' + BOUNDARY) + L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) + L.append('Content-Type: %s' % get_content_type(filename)) + L.append('') + L.append(value) + L.append('--' + BOUNDARY + '--') + L.append('') + body = CRLF.join(L) + content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + return content_type, body + + +def get_content_type(filename): + return mimetypes.guess_type(filename)[0] or 'application/octet-stream' + + +def upload_logs(url, log_file_name, destination_name, proxy_cert): + try: + full_url = url + '/putFile' + urlparts = urlparse.urlsplit(full_url) + + logging.debug('[upload_logs] start') + files = [('file', destination_name, open(log_file_name).read())] + status, reason = post_multipart(urlparts.hostname, urlparts.port, urlparts.path, files, proxy_cert) + logging.debug('[upload_logs] finished with code={0} msg={1}'.format(status, reason)) + if status == 200: + return True + except Exception: + err_type, err_value = sys.exc_info()[:2] + err_messsage = "failed to put with {0}:{1} ".format(err_type, err_value) + err_messsage += traceback.format_exc() + logging.debug('[upload_logs] excepted with:\n {0}'.format(err_messsage)) + + return False + + +def get_url(url, headers=None): + """ + get content from specified URL + TODO: error handling + """ + response = urllib2.urlopen(wrapper_url) + content = response.read() + return content + + +def copy_files_in_dir(src_dir, dst_dir): + # src_files = os.listdir(src_dir) + for file_name in CONFIG_FILES: + full_file_name = os.path.join(src_dir, file_name) + shutil.copy(full_file_name, dst_dir) + + +def get_configuration(): + # get the proxy certificate and save it + if os.environ.get('proxySecretPath'): + # os.symlink(os.environ.get('proxySecretPath'), proxy_path) + proxy_path = os.environ.get('proxySecretPath') + elif os.environ.get('proxyContent'): + proxy_path = "/tmp/x509up" + proxy_string = os.environ.get('proxyContent').replace(",", "\n") + with open(proxy_path, "w") as proxy_file: + proxy_file.write(proxy_string) + del os.environ['proxyContent'] + os.chmod(proxy_path, 0o600) + else: + logging.debug('[main] no proxy specified in env var $proxySecretPath nor $proxyContent') + raise Exception('Found no voms proxy specified') + os.environ['X509_USER_PROXY'] = proxy_path + logging.debug('[main] initialized proxy') + + # get the panda site name + panda_site = os.environ.get('computingSite') + logging.debug('[main] got panda site: {0}'.format(panda_site)) + + # get the panda queue name + panda_queue = os.environ.get('pandaQueueName') + logging.debug('[main] got panda queue: {0}'.format(panda_queue)) + + # get the resource type of the worker + resource_type = 
os.environ.get('resourceType') + logging.debug('[main] got resource type: {0}'.format(resource_type)) + + prodSourceLabel = os.environ.get('prodSourceLabel') + logging.debug('[main] got prodSourceLabel: {0}'.format(prodSourceLabel)) + + job_type = os.environ.get('jobType') + logging.debug('[main] got job type: {0}'.format(job_type)) + + # get the Harvester ID + harvester_id = os.environ.get('HARVESTER_ID') + logging.debug('[main] got Harvester ID: {0}'.format(harvester_id)) + + # get the worker id + worker_id = os.environ.get('workerID') + logging.debug('[main] got worker ID: {0}'.format(worker_id)) + + # get the URL (e.g. panda cache) to upload logs + logs_frontend_w = os.environ.get('logs_frontend_w') + logging.debug('[main] got url to upload logs') + + # get the URL (e.g. panda cache) where the logs can be downloaded afterwards + logs_frontend_r = os.environ.get('logs_frontend_r') + logging.debug('[main] got url to download logs') + + # get the filename to use for the stdout log + stdout_name = os.environ.get('stdout_name') + if not stdout_name: + stdout_name = '{0}_{1}.out'.format(harvester_id, worker_id) + + logging.debug('[main] got filename for the stdout log') + + # get the submission mode (push/pull) for the pilot + submit_mode = os.environ.get('submit_mode') + if not submit_mode: + submit_mode = 'PULL' + + # see if there is a work directory specified + tmpdir = os.environ.get('TMPDIR') + if tmpdir: + global WORK_DIR + WORK_DIR = tmpdir + global CONFIG_DIR + CONFIG_DIR = tmpdir + '/jobconfig' + + return proxy_path, panda_site, panda_queue, resource_type, prodSourceLabel, job_type, harvester_id, \ + worker_id, logs_frontend_w, logs_frontend_r, stdout_name, submit_mode + + +if __name__ == "__main__": + + # get all the configuration from environment + proxy_path, panda_site, panda_queue, resource_type, prodSourceLabel, job_type, harvester_id, worker_id, \ + logs_frontend_w, logs_frontend_r, destination_name, submit_mode = get_configuration() + + # the pilot should propagate the download link via the pilotId field in the job table + log_download_url = '{0}/{1}'.format(logs_frontend_r, destination_name) + os.environ['GTAG'] = log_download_url # GTAG env variable is read by pilot + + # get the pilot wrapper + wrapper_path = "/tmp/runpilot2-wrapper.sh" + wrapper_url = "https://raw.githubusercontent.com/PanDAWMS/pilot-wrapper/master/runpilot2-wrapper.sh" + wrapper_string = get_url(wrapper_url) + with open(wrapper_path, "w") as wrapper_file: + wrapper_file.write(wrapper_string) + os.chmod(wrapper_path, 0o544) # make pilot wrapper executable + logging.debug('[main] downloaded pilot wrapper') + + # execute the pilot wrapper + logging.debug('[main] starting pilot wrapper...') + resource_type_option = '' + if resource_type: + resource_type_option = '--resource-type {0}'.format(resource_type) + + psl_option = '' + if prodSourceLabel: + psl_option = '-j {0}'.format(prodSourceLabel) + + job_type_option = '' + if job_type: + job_type_option = '-i {0}'.format(job_type) + + wrapper_params = '-a {0} -s {1} -r {2} -q {3} {4} {5} {6}'.format(WORK_DIR, panda_site, panda_queue, panda_queue, + resource_type_option, psl_option, job_type_option) + + # TODO: This should be removed once we start using prodSourceLabel + if not psl_option: + if 'ANALY' in panda_queue: + wrapper_params = '{0} -j user'.format(wrapper_params) + else: + wrapper_params = '{0} -j managed'.format(wrapper_params) + + if submit_mode == 'PUSH': + # job configuration files need to be copied, because k8s configmap mounts as read-only file 
system + # and therefore the pilot cannot execute in the same directory + copy_files_in_dir(CONFIG_DIR, WORK_DIR) + + command = "/tmp/runpilot2-wrapper.sh {0} -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={1} --allow-same-user=False -t | tee /tmp/wrapper-wid.log". \ + format(wrapper_params, submit_mode) + try: + subprocess.call(command, shell=True) + except: + logging.error(traceback.format_exc()) + logging.debug('[main] pilot wrapper done...') + + # upload logs to e.g. panda cache or similar + upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) + logging.debug('[main] FINISHED') \ No newline at end of file diff --git a/pandaharvester/harvestercommunicator/panda_communicator.py b/pandaharvester/harvestercommunicator/panda_communicator.py index c1402971..6b56000d 100644 --- a/pandaharvester/harvestercommunicator/panda_communicator.py +++ b/pandaharvester/harvestercommunicator/panda_communicator.py @@ -15,7 +15,6 @@ import uuid import inspect import datetime -import requests import traceback from future.utils import iteritems # TO BE REMOVED for python2.7 @@ -26,6 +25,7 @@ pass from pandaharvester.harvestercore import core_utils from pandaharvester.harvesterconfig import harvester_config +from pandacommon.pandautils.net_utils import get_http_adapter_with_random_dns_resolution from .base_communicator import BaseCommunicator @@ -56,11 +56,12 @@ def post(self, path, data): url = '{0}/{1}'.format(harvester_config.pandacon.pandaURL, path) if self.verbose: tmpLog.debug('exec={0} URL={1} data={2}'.format(tmpExec, url, str(data))) - res = requests.post(url, - data=data, - headers={"Accept": "application/json", - "Connection": "close"}, - timeout=harvester_config.pandacon.timeout) + session = get_http_adapter_with_random_dns_resolution() + res = session.post(url, + data=data, + headers={"Accept": "application/json", + "Connection": "close"}, + timeout=harvester_config.pandacon.timeout) if self.verbose: tmpLog.debug('exec={0} code={1} return={2}'.format(tmpExec, res.status_code, res.text)) if res.status_code == 200: @@ -90,14 +91,15 @@ def post_ssl(self, path, data, cert=None): if cert is None: cert = (harvester_config.pandacon.cert_file, harvester_config.pandacon.key_file) + session = get_http_adapter_with_random_dns_resolution() sw = core_utils.get_stopwatch() - res = requests.post(url, - data=data, - headers={"Accept": "application/json", - "Connection": "close"}, - timeout=harvester_config.pandacon.timeout, - verify=harvester_config.pandacon.ca_cert, - cert=cert) + res = session.post(url, + data=data, + headers={"Accept": "application/json", + "Connection": "close"}, + timeout=harvester_config.pandacon.timeout, + verify=harvester_config.pandacon.ca_cert, + cert=cert) if self.verbose: tmpLog.debug('exec={0} code={1} {3}. 
return={2}'.format(tmpExec, res.status_code, res.text, sw.get_elapsed_time())) @@ -129,11 +131,12 @@ def put_ssl(self, path, files, cert=None): if cert is None: cert = (harvester_config.pandacon.cert_file, harvester_config.pandacon.key_file) - res = requests.post(url, - files=files, - timeout=harvester_config.pandacon.timeout, - verify=harvester_config.pandacon.ca_cert, - cert=cert) + session = get_http_adapter_with_random_dns_resolution() + res = session.post(url, + files=files, + timeout=harvester_config.pandacon.timeout, + verify=harvester_config.pandacon.ca_cert, + cert=cert) if self.verbose: tmpLog.debug('exec={0} code={1} return={2}'.format(tmpExec, res.status_code, res.text)) if res.status_code == 200: @@ -530,7 +533,7 @@ def update_worker_stats(self, site_name, stats): data['siteName'] = site_name data['paramsList'] = json.dumps(stats) tmpLog.debug('update stats for {0}, stats: {1}'.format(site_name, stats)) - tmpStat, tmpRes = self.post_ssl('reportWorkerStats', data) + tmpStat, tmpRes = self.post_ssl('reportWorkerStats_jobtype', data) errStr = 'OK' if tmpStat is False: errStr = core_utils.dump_error_message(tmpLog, tmpRes) diff --git a/pandaharvester/harvestercore/command_spec.py b/pandaharvester/harvestercore/command_spec.py index a1e674f5..ac18f30a 100644 --- a/pandaharvester/harvestercore/command_spec.py +++ b/pandaharvester/harvestercore/command_spec.py @@ -16,7 +16,7 @@ class CommandSpec(SpecBase): ) # commands COM_reportWorkerStats = 'REPORT_WORKER_STATS' - COM_setNWorkers = 'SET_N_WORKERS' + COM_setNWorkers = 'SET_N_WORKERS_JOBTYPE' COM_killWorkers = 'KILL_WORKERS' # mapping between command and receiver receiver_map = { diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py index a6dcea42..148eca12 100644 --- a/pandaharvester/harvestercore/core_utils.py +++ b/pandaharvester/harvestercore/core_utils.py @@ -322,6 +322,8 @@ def update_job_attributes_with_workers(map_type, jobspec_list, workspec_list, fi # set start and end times if workSpec.status in [WorkSpec.ST_running]: jobSpec.set_start_time() + elif workSpec.pilot_closed: + jobSpec.reset_start_end_time() elif workSpec.is_final_status(): jobSpec.set_end_time() # core count @@ -592,8 +594,8 @@ def get_queues_config_url(): # get unique queue name -def get_unique_queue_name(queue_name, resource_type): - return '{0}:{1}'.format(queue_name, resource_type) +def get_unique_queue_name(queue_name, resource_type, job_type): + return '{0}:{1}:{2}'.format(queue_name, resource_type, job_type) # capability to dynamically change plugins @@ -613,10 +615,10 @@ def _asdict(self): return dict(zip(self.attributes, self)) -# Make a list of choice candidates accroding to permille weight +# Make a list of choice candidates according to permille weight def make_choice_list(pdpm={}, default=None): weight_sum = sum(pdpm.values()) - weight_defualt = 1000 + weight_default = 1000 ret_list = [] for candidate, weight in iteritems(pdpm): if weight_sum > 1000: @@ -624,8 +626,8 @@ def make_choice_list(pdpm={}, default=None): else: real_weight = int(weight) ret_list.extend([candidate]*real_weight) - weight_defualt -= real_weight - ret_list.extend([default]*weight_defualt) + weight_default -= real_weight + ret_list.extend([default]*weight_default) return ret_list @@ -637,3 +639,16 @@ def pickle_to_text(data): # unpickle from text def unpickle_from_text(text): return pickle.loads(codecs.decode(text.encode(), 'base64')) + + +# increasing retry period after timeout or failure +def retry_period_sec(nth_retry, 
increment=1, max_retries=None, max_seconds=None, min_seconds=1): + nth = max(nth_retry, 1) + ret_period = max(min_seconds, 1) + if max_retries and nth_retry > max_retries: + return False + else: + ret_period += (nth - 1)*increment + if max_seconds: + ret_period = min(ret_period, max_seconds) + return ret_period diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 95ec3679..84785e01 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -60,6 +60,9 @@ def __init__(self, thr_name=None, read_only=False): self.thrName = thr_name self.verbLog = None self.useInspect = False + self.reconnectTimeout = 300 + if hasattr(harvester_config.db, 'reconnectTimeout'): + self.reconnectTimeout = harvester_config.db.reconnectTimeout if harvester_config.db.verbose: self.verbLog = core_utils.make_logger(_logger, method_name='execute') if self.thrName is None: @@ -104,7 +107,7 @@ def fetchall(self): self.con = MySQLdb.connect(user=harvester_config.db.user, passwd=harvester_config.db.password, db=harvester_config.db.schema, host=host, port=port, - cursorclass=MyCursor) + cursorclass=MyCursor, charset='utf8') self.cur = self.con.cursor() else: import mysql.connector @@ -139,7 +142,7 @@ def fetchall(self): self.usingAppLock = True # exception handler for type of DBs - def _handle_exception(self, exc, retry_time=30): + def _handle_exception(self, exc): tmpLog = core_utils.make_logger(_logger, 'thr={0}'.format(self.thrName), method_name='_handle_exception') if harvester_config.db.engine == 'mariadb': tmpLog.warning('exception of mysql {0} occurred'.format(exc.__class__.__name__)) @@ -155,14 +158,20 @@ def _handle_exception(self, exc, retry_time=30): isOperationalError = True if isOperationalError: try_timestamp = time.time() - while time.time() - try_timestamp < retry_time: + n_retry = 1 + while time.time() - try_timestamp < self.reconnectTimeout: try: self.__init__() tmpLog.info('renewed connection') break except Exception as e: - tmpLog.error('failed to renew connection; {0}'.format(e)) - time.sleep(1) + tmpLog.error('failed to renew connection ({0} retries); {1}'.format(n_retry, e)) + sleep_time = core_utils.retry_period_sec(n_retry, increment=2, max_seconds=300, min_seconds=1) + if not sleep_time: + break + else: + time.sleep(sleep_time) + n_retry += 1 # convert param dict to list def convert_params(self, sql, varmap): @@ -361,6 +370,20 @@ def need_index(self, attr): isUnique = True return isIndex, isUnique + def initialize_jobType(self, table_name): + # initialize old NULL entries to ANY in pq_table and work_table + # get logger + tmp_log = core_utils.make_logger(_logger, method_name='initialize_jobType') + + sql_update = "UPDATE {0} SET jobType = 'ANY' WHERE jobType is NULL ".format(table_name) + try: + self.execute(sql_update) + # commit + self.commit() + tmp_log.debug('initialized entries in {0}'.format(table_name)) + except Exception: + core_utils.dump_error_message(tmp_log) + # make table def make_table(self, cls, table_name): try: @@ -430,6 +453,12 @@ def make_table(self, cls, table_name): tmpLog.debug('added {0} to {1}'.format(attr, table_name)) except Exception: core_utils.dump_error_message(tmpLog) + + # if we just added the jobType, old entries need to be initialized + if (table_name == pandaQueueTableName and attrName == 'jobType') \ + or (table_name == pandaQueueTableName and attrName == 'jobType'): + self.initialize_jobType(table_name) + # make indexes for index in indexes: indexName = 
'idx_{0}_{1}'.format(index, table_name) @@ -479,6 +508,7 @@ def make_tables(self, queue_config_mapper): for outStr in outStrs: print (outStr) sys.exit(1) + # add sequential numbers self.add_seq_number('SEQ_workerID', 1) self.add_seq_number('SEQ_configID', 1) @@ -1416,7 +1446,7 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ sqlS += "OR (submitTime<:lookupTimeLimit AND lockedBy IS NULL) " sqlS += "ORDER BY submitTime " # sql to get queues - sqlQ = "SELECT queueName,resourceType,nNewWorkers FROM {0} ".format(pandaQueueTableName) + sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) sqlQ += "WHERE siteName=:siteName " # sql to get orphaned workers sqlO = "SELECT workerID FROM {0} ".format(workTableName) @@ -1426,7 +1456,7 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ sqlD = "DELETE FROM {0} ".format(workTableName) sqlD += "WHERE workerID=:workerID " # sql to count nQueue - sqlN = "SELECT status,COUNT(*) cnt FROM {0} ".format(workTableName) + sqlN = "SELECT status, COUNT(*) cnt FROM {0} ".format(workTableName) sqlN += "WHERE computingSite=:computingSite " # sql to count re-fillers sqlR = "SELECT COUNT(*) cnt FROM {0} ".format(workTableName) @@ -1462,13 +1492,17 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ varMap[':siteName'] = siteName self.execute(sqlQ, varMap) resQ = self.cur.fetchall() - for queueName, resourceType, nNewWorkers in resQ: + for queueName, jobType, resourceType, nNewWorkers in resQ: + # delete orphaned workers varMap = dict() varMap[':computingSite'] = queueName varMap[':status'] = WorkSpec.ST_pending varMap[':timeLimit'] = timeNow - datetime.timedelta(seconds=lock_interval) sqlO_tmp = sqlO + if jobType != 'ANY': + varMap[':jobType'] = jobType + sqlO_tmp += "AND jobType=:jobType " if resourceType != 'ANY': varMap[':resourceType'] = resourceType sqlO_tmp += "AND resourceType=:resourceType " @@ -1480,11 +1514,15 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ self.execute(sqlD, varMap) # commit self.commit() + # count nQueue varMap = dict() varMap[':computingSite'] = queueName varMap[':resourceType'] = resourceType sqlN_tmp = sqlN + if jobType != 'ANY': + varMap[':jobType'] = jobType + sqlN_tmp += "AND jobType=:jobType " if resourceType != 'ANY': varMap[':resourceType'] = resourceType sqlN_tmp += "AND resourceType=:resourceType " @@ -1500,11 +1538,15 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ nReady += tmpNum elif workerStatus in [WorkSpec.ST_running]: nRunning += tmpNum + # count nFillers varMap = dict() varMap[':computingSite'] = queueName varMap[':status'] = WorkSpec.ST_running sqlR_tmp = sqlR + if jobType != 'ANY': + varMap[':jobType'] = jobType + sqlR_tmp += "AND jobType=:jobType " if resourceType != 'ANY': varMap[':resourceType'] = resourceType sqlR_tmp += "AND resourceType=:resourceType " @@ -1513,11 +1555,13 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ nReady += nReFill # add retMap.setdefault(queueName, {}) - retMap[queueName][resourceType] = {'nReady': nReady, - 'nRunning': nRunning, - 'nQueue': nQueue, - 'nNewWorkers': nNewWorkers} - resourceMap[resourceType] = queueName + retMap[queueName].setdefault(jobType, {}) + retMap[queueName][jobType][resourceType] = {'nReady': nReady, + 'nRunning': nRunning, + 'nQueue': nQueue, + 'nNewWorkers': nNewWorkers} + resourceMap.setdefault(jobType, {}) + 
resourceMap[jobType][resourceType] = queueName # enough queues if len(retMap) >= 0: break @@ -3297,7 +3341,7 @@ def get_worker_stats(self, site_name): tmpLog = core_utils.make_logger(_logger, method_name='get_worker_stats') tmpLog.debug('start') # sql to get nQueueLimit - sqlQ = "SELECT queueName,resourceType,nNewWorkers FROM {0} ".format(pandaQueueTableName) + sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) sqlQ += "WHERE siteName=:siteName " # get nQueueLimit varMap = dict() @@ -3305,18 +3349,18 @@ def get_worker_stats(self, site_name): self.execute(sqlQ, varMap) resQ = self.cur.fetchall() retMap = dict() - for computingSite, resourceType, nNewWorkers in resQ: - if resourceType not in retMap: - retMap[resourceType] = { - 'running': 0, - 'submitted': 0, - 'to_submit': nNewWorkers - } + for computingSite, jobType, resourceType, nNewWorkers in resQ: + retMap.setdefault(jobType, {}) + if resourceType not in retMap[jobType]: + retMap[jobType][resourceType] = {'running': 0, + 'submitted': 0, + 'to_submit': nNewWorkers} + # get worker stats - sqlW = "SELECT wt.status, wt.computingSite, pq.resourceType, COUNT(*) cnt " + sqlW = "SELECT wt.status, wt.computingSite, pq.jobType, pq.resourceType, COUNT(*) cnt " sqlW += "FROM {0} wt, {1} pq ".format(workTableName, pandaQueueTableName) sqlW += "WHERE pq.siteName=:siteName AND wt.computingSite=pq.queueName AND wt.status IN (:st1,:st2) " - sqlW += "GROUP BY wt.status, wt.computingSite, pq.resourceType " + sqlW += "GROUP BY wt.status, wt.computingSite, pq.jobType, pq.resourceType " # get worker stats varMap = dict() varMap[':siteName'] = site_name @@ -3324,14 +3368,14 @@ def get_worker_stats(self, site_name): varMap[':st2'] = 'submitted' self.execute(sqlW, varMap) resW = self.cur.fetchall() - for workerStatus, computingSite, resourceType, cnt in resW: + for workerStatus, computingSite, jobType, resourceType, cnt in resW: + retMap.setdefault(jobType, {}) if resourceType not in retMap: - retMap[resourceType] = { - 'running': 0, - 'submitted': 0, - 'to_submit': 0 - } - retMap[resourceType][workerStatus] = cnt + retMap[jobType][resourceType] = {'running': 0, + 'submitted': 0, + 'to_submit': 0 + } + retMap[jobType][resourceType][workerStatus] = cnt # commit self.commit() tmpLog.debug('got {0}'.format(str(retMap))) @@ -3351,40 +3395,46 @@ def get_worker_stats_bulk(self, active_ups_queues): tmpLog = core_utils.make_logger(_logger, method_name='get_worker_stats_bulk') tmpLog.debug('start') # sql to get nQueueLimit - sqlQ = "SELECT queueName, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) + sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) # get nQueueLimit self.execute(sqlQ) resQ = self.cur.fetchall() retMap = dict() - for computingSite, resourceType, nNewWorkers in resQ: + for computingSite, jobType, resourceType, nNewWorkers in resQ: retMap.setdefault(computingSite, {}) - if resourceType and resourceType != 'ANY' and resourceType not in retMap[computingSite]: - retMap[computingSite][resourceType] = {'running': 0, 'submitted': 0, 'to_submit': nNewWorkers} + retMap[computingSite].setdefault(jobType, {}) + if resourceType and resourceType != 'ANY' and resourceType not in retMap[computingSite][jobType]: + retMap[computingSite][jobType][resourceType] = {'running': 0, + 'submitted': 0, + 'to_submit': nNewWorkers} # get worker stats - sqlW = "SELECT wt.status, wt.computingSite, wt.resourceType, COUNT(*) cnt " + sqlW = "SELECT wt.status, 
wt.computingSite, wt.jobType, wt.resourceType, COUNT(*) cnt " sqlW += "FROM {0} wt ".format(workTableName) sqlW += "WHERE wt.status IN (:st1,:st2) " - sqlW += "GROUP BY wt.status,wt.computingSite, wt.resourceType " + sqlW += "GROUP BY wt.status,wt.computingSite, wt.jobType, wt.resourceType " # get worker stats varMap = dict() varMap[':st1'] = 'running' varMap[':st2'] = 'submitted' self.execute(sqlW, varMap) resW = self.cur.fetchall() - for workerStatus, computingSite, resourceType, cnt in resW: + for workerStatus, computingSite, jobType, resourceType, cnt in resW: if resourceType and resourceType != 'ANY': retMap.setdefault(computingSite, {}) - retMap[computingSite].setdefault(resourceType, {'running': 0, 'submitted': 0, 'to_submit': 0}) - retMap[computingSite][resourceType][workerStatus] = cnt + retMap[computingSite].setdefault(jobType, {}) + retMap[computingSite][jobType].setdefault(resourceType, {'running': 0, + 'submitted': 0, + 'to_submit': 0}) + retMap[computingSite][jobType][resourceType][workerStatus] = cnt # if there are no jobs for an active UPS queue, it needs to be initialized so that the pilot streaming # on panda server starts processing the queue if active_ups_queues: for ups_queue in active_ups_queues: - if ups_queue not in retMap or not retMap[ups_queue]: - retMap[ups_queue] = {'SCORE': {'running': 0, 'submitted': 0, 'to_submit': 0}} + if ups_queue not in retMap or not retMap[ups_queue] or retMap[ups_queue] == {'ANY': {}}: + retMap[ups_queue] = {'managed': {'SCORE': {'running': 0, 'submitted': 0, 'to_submit': 0}}} # commit self.commit() @@ -3698,11 +3748,11 @@ def release_jobs(self, panda_ids, locked_by): return False # clone queue - def clone_queue_with_new_resource_type(self, site_name, queue_name, resource_type, new_workers): + def clone_queue_with_new_job_and_resource_type(self, site_name, queue_name, job_type, resource_type, new_workers): try: # get logger tmpLog = core_utils.make_logger(_logger, 'site_name={0} queue_name={1}'.format(site_name, queue_name), - method_name='clone_queue_with_new_resource_type') + method_name='clone_queue_with_new_job_and_resource_type') tmpLog.debug('start') # get the values from one of the existing queues @@ -3721,10 +3771,12 @@ def clone_queue_with_new_resource_type(self, site_name, queue_name, resource_typ attr_binding = ':{0}'.format(attribute) if attribute == 'resourceType': var_map[attr_binding] = resource_type + elif attribute == 'jobType': + var_map[attr_binding] = job_type elif attribute == 'nNewWorkers': var_map[attr_binding] = new_workers elif attribute == 'uniqueName': - var_map[attr_binding] = core_utils.get_unique_queue_name(queue_name, resource_type) + var_map[attr_binding] = core_utils.get_unique_queue_name(queue_name, resource_type, job_type) else: var_map[attr_binding] = value attribute_list.append(attribute) @@ -3754,85 +3806,87 @@ def set_queue_limit(self, site_name, params): sql_reset += "SET nNewWorkers=:zero WHERE siteName=:siteName " # sql to get resource types - sql_get_resource = "SELECT resourceType FROM {0} ".format(pandaQueueTableName) - sql_get_resource += "WHERE siteName=:siteName " - sql_get_resource += "FOR UPDATE " + sql_get_job_resource = "SELECT jobType, resourceType FROM {0} ".format(pandaQueueTableName) + sql_get_job_resource += "WHERE siteName=:siteName " + sql_get_job_resource += "FOR UPDATE " # sql to update nQueueLimit sql_update_queue = "UPDATE {0} ".format(pandaQueueTableName) - sql_update_queue += "SET nNewWorkers=:nQueue WHERE siteName=:siteName AND resourceType=:resourceType " + 
sql_update_queue += "SET nNewWorkers=:nQueue " + sql_update_queue += "WHERE siteName=:siteName AND jobType=:jobType AND resourceType=:resourceType " # sql to get num of submitted workers sql_count_workers = "SELECT COUNT(*) cnt " sql_count_workers += "FROM {0} wt, {1} pq ".format(workTableName, pandaQueueTableName) sql_count_workers += "WHERE pq.siteName=:siteName AND wt.computingSite=pq.queueName AND wt.status=:status " - sql_count_workers += "ANd pq.resourceType=:resourceType " + sql_count_workers += "AND pq.jobType=:jobType AND pq.resourceType=:resourceType " - # reset nqueued for all resource types + # reset nqueued for all job & resource types varMap = dict() varMap[':zero'] = 0 varMap[':siteName'] = site_name self.execute(sql_reset, varMap) - # get resource types + # get job & resource types varMap = dict() varMap[':siteName'] = site_name - self.execute(sql_get_resource, varMap) - resRes = self.cur.fetchall() - resource_type_list = set() - for tmpRes, in resRes: - resource_type_list.add(tmpRes) + self.execute(sql_get_job_resource, varMap) + results = self.cur.fetchall() + job_resource_type_list = set() + for tmp_job_type, tmp_resource_type in results: + job_resource_type_list.add((tmp_job_type, tmp_resource_type)) # set all queues nUp = 0 - retMap = dict() + ret_map = dict() queue_name = site_name - for resource_type, value in iteritems(params): - tmpLog.debug('Processing rt {0} -> {1}'.format(resource_type, value)) + for job_type, job_values in iteritems(params): + ret_map.setdefault(job_type, {}) + for resource_type, value in iteritems(job_values): + tmpLog.debug('Processing rt {0} -> {1}'.format(resource_type, value)) - # get num of submitted workers - varMap = dict() - varMap[':siteName'] = site_name - varMap[':resourceType'] = resource_type - varMap[':status'] = 'submitted' - self.execute(sql_count_workers, varMap) - res = self.cur.fetchone() - tmpLog.debug('{0} has {1} submitted workers'.format(resource_type, res)) - nSubmittedWorkers = 0 - if res is not None: - nSubmittedWorkers, = res - - # set new value - # value = max(value - nSubmittedWorkers, 0) - if value is None: - value = 0 - varMap = dict() - varMap[':nQueue'] = value - varMap[':siteName'] = site_name - varMap[':resourceType'] = resource_type - self.execute(sql_update_queue, varMap) - iUp = self.cur.rowcount - - # iUp is 0 when nQueue is not changed - if iUp > 0 or resource_type in resource_type_list: - # a queue was updated, add the values to the map - retMap[resource_type] = value - else: - # no queue was updated, we need to create a new one for the resource type - cloned = self.clone_queue_with_new_resource_type(site_name, queue_name, resource_type, value) - if cloned: - retMap[resource_type] = value - iUp = 1 + # get num of submitted workers + varMap = dict() + varMap[':siteName'] = site_name + varMap[':jobType'] = job_type + varMap[':resourceType'] = resource_type + varMap[':status'] = 'submitted' + self.execute(sql_count_workers, varMap) + res = self.cur.fetchone() + tmpLog.debug('{0} has {1} submitted workers'.format(resource_type, res)) + + if value is None: + value = 0 + varMap = dict() + varMap[':nQueue'] = value + varMap[':siteName'] = site_name + varMap[':jobType'] = job_type + varMap[':resourceType'] = resource_type + self.execute(sql_update_queue, varMap) + iUp = self.cur.rowcount + + # iUp is 0 when nQueue is not changed + if iUp > 0 or (job_type, resource_type) in job_resource_type_list: + # a queue was updated, add the values to the map + ret_map[job_type][resource_type] = value + else: + # no queue 
was updated, we need to create a new one for the resource type + cloned = self.clone_queue_with_new_job_and_resource_type(site_name, queue_name, job_type, + resource_type, value) + if cloned: + ret_map[job_type][resource_type] = value + iUp = 1 - nUp += iUp - tmpLog.debug('set nNewWorkers={0} to {1}:{2} with {3}'.format(value, queue_name, resource_type, iUp)) + nUp += iUp + tmpLog.debug('set nNewWorkers={0} to {1}:{2}:{3} with {4}'.format(value, queue_name, job_type, + resource_type, iUp)) # commit self.commit() tmpLog.debug('updated {0} queues'.format(nUp)) - return retMap + return ret_map except Exception: # roll back self.rollback() @@ -4042,12 +4096,12 @@ def get_file_status(self, lfn, file_type, endpoint, job_status): method_name='get_file_status') tmpLog.debug('start') # sql to get files - sqlF = "SELECT f.status, COUNT(*) cnt FROM {0} f, {1} j ".format(fileTableName, jobTableName) + sqlF = "SELECT f.status, f.path, COUNT(*) cnt FROM {0} f, {1} j ".format(fileTableName, jobTableName) sqlF += "WHERE j.PandaID=f.PandaID AND j.status=:jobStatus " sqlF += "AND f.lfn=:lfn AND f.fileType=:type " if endpoint is not None: sqlF += "AND f.endpoint=:endpoint " - sqlF += "GROUP BY f.status " + sqlF += "GROUP BY f.status, f.path " # get files varMap = dict() varMap[':lfn'] = lfn @@ -4057,8 +4111,10 @@ def get_file_status(self, lfn, file_type, endpoint, job_status): varMap[':endpoint'] = endpoint self.execute(sqlF, varMap) retMap = dict() - for status, cnt in self.cur.fetchall(): - retMap[status] = cnt + for status, path, cnt in self.cur.fetchall(): + retMap.setdefault(status, {'cnt': 0, 'path': set()}) + retMap[status]['cnt'] += cnt + retMap[status]['path'].add(path) # commit self.commit() tmpLog.debug('got {0}'.format(str(retMap))) @@ -4383,18 +4439,22 @@ def increment_submission_attempt(self, panda_id, new_number): def get_worker_limits(self, site_name): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_worker_limits') + tmpLog = core_utils.make_logger(_logger, token='site_name={0}'.format(site_name), method_name='get_worker_limits') tmpLog.debug('start') - # sql to get - sqlQ = "SELECT maxWorkers,nQueueLimitWorker,nQueueLimitWorkerRatio," + + # sql to get queue limits + sqlQ = "SELECT maxWorkers, nQueueLimitWorker, nQueueLimitWorkerRatio," sqlQ += "nQueueLimitWorkerMax,nQueueLimitWorkerMin FROM {0} ".format(pandaQueueTableName) - sqlQ += "WHERE siteName=:siteName AND resourceType='ANY'" + sqlQ += "WHERE siteName=:siteName AND resourceType='ANY' AND (jobType='ANY' OR jobType IS NULL) " + # sql to count resource types sqlNT = "SELECT COUNT(*) cnt FROM {0} ".format(pandaQueueTableName) sqlNT += "WHERE siteName=:siteName AND resourceType!='ANY'" + # sql to count running workers sqlNR = "SELECT COUNT(*) cnt FROM {0} ".format(workTableName) sqlNR += "WHERE computingSite=:computingSite AND status IN (:status1)" + # get varMap = dict() varMap[':siteName'] = site_name @@ -4412,6 +4472,7 @@ def get_worker_limits(self, site_name): varMap[':status1'] = 'running' self.execute(sqlNR, varMap) resNR = self.cur.fetchall() + # dynamic nQueueLimitWorker retMap = dict() nRunning = 0 @@ -4744,6 +4805,10 @@ def get_active_workers(self, n_workers, seconds_ago=0): sqlW += "WHERE status IN (:st_submitted,:st_running,:st_idle) " sqlW += "AND modificationTime<:timeLimit " sqlW += "ORDER BY modificationTime,computingSite LIMIT {0} ".format(n_workers) + # sql to get jobs + sqlJ = "SELECT j.{columns} FROM {jobWorkerTableName} jw, {jobTableName} j ".format(columns=JobSpec.column_names(), 
jobTableName=jobTableName, jobWorkerTableName=jobWorkerTableName) + sqlJ += "WHERE j.PandaID=jw.PandaID AND jw.workerID=:workerID " + # parameter map varMap = dict() varMap[':timeLimit'] = datetime.datetime.utcnow() - datetime.timedelta(seconds=seconds_ago) varMap[':st_submitted'] = WorkSpec.ST_submitted @@ -4754,7 +4819,18 @@ def get_active_workers(self, n_workers, seconds_ago=0): def _get_workspec_from_record(rec): workspec = WorkSpec() workspec.pack(rec) + jobspec_list = [] workspec.pandaid_list = [] + varMap = dict() + varMap[':workerID'] = workspec.workerID + self.execute(sqlJ, varMap) + resJ = self.cur.fetchall() + for one_job in resJ: + jobspec = JobSpec() + jobspec.pack(one_job) + jobspec_list.append(jobspec) + workspec.pandaid_list.append(jobspec.PandaID) + workspec.set_jobspec_list(jobspec_list) return workspec retVal = map(_get_workspec_from_record, resW) tmpLog.debug('got {0} workers'.format(len(resW))) diff --git a/pandaharvester/harvestercore/job_spec.py b/pandaharvester/harvestercore/job_spec.py index b212d718..65a6d38e 100644 --- a/pandaharvester/harvestercore/job_spec.py +++ b/pandaharvester/harvestercore/job_spec.py @@ -334,7 +334,9 @@ def set_input_file_paths(self, in_files): lfns = self.get_input_file_attributes().keys() paths = [] for lfn in lfns: - paths.append(in_files[lfn]['path']) + # check for consistency + if lfn in in_files: + paths.append(in_files[lfn]['path']) self.jobParams['inFilePaths'] = ','.join(paths) # trigger updating self.force_update('jobParams') @@ -388,6 +390,11 @@ def set_end_time(self, force=False): if self.endTime is None or force is True: self.endTime = datetime.datetime.utcnow() + # reset start and end time + def reset_start_end_time(self): + self.startTime = datetime.datetime.utcnow() + self.endTime = self.startTime + # add work spec list def add_workspec_list(self, workspec_list): self.workspec_list = workspec_list diff --git a/pandaharvester/harvestercore/panda_queue_spec.py b/pandaharvester/harvestercore/panda_queue_spec.py index 1ba06b5d..210e29d7 100644 --- a/pandaharvester/harvestercore/panda_queue_spec.py +++ b/pandaharvester/harvestercore/panda_queue_spec.py @@ -16,6 +16,7 @@ class PandaQueueSpec(SpecBase): 'submitTime:timestamp / index', 'lockedBy:text', 'siteName:text / index', + 'jobType:text', 'resourceType:text', 'nNewWorkers:integer', 'uniqueName:text / unique', @@ -29,7 +30,7 @@ class PandaQueueSpec(SpecBase): # catchall resource type RT_catchall = 'ANY' - + JT_catchall = 'ANY' # constructor def __init__(self): SpecBase.__init__(self) diff --git a/pandaharvester/harvestercore/queue_config_mapper.py b/pandaharvester/harvestercore/queue_config_mapper.py index c197ee25..7466d7a1 100644 --- a/pandaharvester/harvestercore/queue_config_mapper.py +++ b/pandaharvester/harvestercore/queue_config_mapper.py @@ -57,6 +57,7 @@ def __init__(self, queue_name): self.noHeartbeat = '' self.runMode = 'self' self.resourceType = PandaQueueSpec.RT_catchall + self.jobType = PandaQueueSpec.JT_catchall self.getJobCriteria = None self.ddmEndpointIn = None self.allowJobMixture = False @@ -76,14 +77,24 @@ def is_no_heartbeat_status(self, status): return status in self.get_no_heartbeat_status() # get prodSourceLabel - def get_source_label(self): + def get_source_label(self, job_type=None, is_gu=None): + # if queue is in test status, only submit workers for HC jobs if self.queueStatus == 'test': return 'test' + + # grandly unified queues: prodsourcelabel in job has precedence over queue prodsourcelabel + if job_type in ('user', 'panda'): + return 'user' + + 
# grandly unified queues: call to getJobs should not request for a particular prodSourceLabel + if is_gu: + return 'unified' + return self.prodSourceLabel # set unique name def set_unique_name(self): - self.uniqueName = core_utils.get_unique_queue_name(self.queueName, self.resourceType) + self.uniqueName = core_utils.get_unique_queue_name(self.queueName, self.resourceType, self.prodSourceLabel) # update attributes def update_attributes(self, data): @@ -576,7 +587,8 @@ def load_data(self): continue # filter for pilot version if hasattr(harvester_config.qconf, 'pilotVersion') and \ - pandaQueueDict[queueConfig.siteName].get('pilot_version') != str(harvester_config.qconf.pilotVersion): + pandaQueueDict.get(queueConfig.siteName) is not None and \ + pandaQueueDict.get(queueConfig.siteName).get('pilot_version') != str(harvester_config.qconf.pilotVersion): continue if 'ALL' not in harvester_config.qconf.queueList and \ 'DYNAMIC' not in harvester_config.qconf.queueList and \ diff --git a/pandaharvester/harvestercore/work_spec.py b/pandaharvester/harvestercore/work_spec.py index 2f411c4c..2e16680d 100644 --- a/pandaharvester/harvestercore/work_spec.py +++ b/pandaharvester/harvestercore/work_spec.py @@ -79,6 +79,7 @@ class WorkSpec(SpecBase): 'computingElement:text', 'nJobsToReFill:integer / index', 'logFilesToUpload:blob', + 'jobType:text', 'resourceType:text', 'nativeExitCode:integer', 'nativeStatus:text', @@ -236,6 +237,7 @@ def convert_to_propagate(self): 'submitTime', 'startTime', 'endTime', + 'jobType', 'resourceType', 'nativeExitCode', 'nativeStatus', diff --git a/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py b/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py index e93c9dda..99467926 100644 --- a/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py +++ b/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py @@ -1,8 +1,5 @@ import re -try: - import subprocess32 as subprocess -except: - import subprocess +import subprocess from pandaharvester.harvestercore.plugin_base import PluginBase from pandaharvester.harvestercore import core_utils @@ -25,21 +22,22 @@ def check_credential(self): comStr = "arcproxy -i vomsACvalidityLeft -P {0}".format(self.outCertFile) mainLog.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdOut, stdErr = p.communicate() + p = subprocess.run(comStr.split(), + encoding='utf-8', + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdOut = p.stdout.strip() + stdErr = p.stderr retCode = p.returncode except: core_utils.dump_error_message(mainLog) return False mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) - if retCode != 0 or not re.match(r'\d+', stdOut.strip()): + if retCode != 0 or not re.match(r'\d+', stdOut): mainLog.error('Unexpected output from arcproxy: {0}'.format(stdOut)) return False # return whether lifetime is greater than three days - return int(stdOut.strip()) > 3600 * 72 + return int(stdOut) > 3600 * 72 # renew proxy def renew_credential(self): @@ -50,11 +48,12 @@ def renew_credential(self): self.inCertFile) mainLog.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdOut, stdErr = p.communicate() + p = subprocess.run(comStr.split(), + encoding='utf-8', + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdOut = p.stdout + stdErr = p.stderr retCode = p.returncode mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, 
stdOut, stdErr)) except: diff --git a/pandaharvester/harvestercredmanager/dummy_cred_manager.py b/pandaharvester/harvestercredmanager/dummy_cred_manager.py new file mode 100644 index 00000000..c4657d61 --- /dev/null +++ b/pandaharvester/harvestercredmanager/dummy_cred_manager.py @@ -0,0 +1,17 @@ +from pandaharvester.harvestercore.plugin_base import PluginBase + + +# dummy credential manager +class DummyCredManager(PluginBase): + + # constructor + def __init__(self, **kwarg): + PluginBase.__init__(self, **kwarg) + + # check proxy + def check_credential(self): + return True + + # renew proxy + def renew_credential(self): + return True, '' diff --git a/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py b/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py index ddcfa370..aeee36ef 100644 --- a/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py +++ b/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py @@ -46,7 +46,11 @@ def __init__(self, **kwarg): e.__class__.__name__, e)) raise # k8s client - self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + try: + self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + except Exception as e: + mainLog.error('Problem instantiating k8s client for {0}'.format(self.k8s_config_file)) + raise # check proxy def check_credential(self): diff --git a/pandaharvester/harvesterextractor/analysis_extractor.py b/pandaharvester/harvesterextractor/analysis_extractor.py index 6ab43ecb..f37174fd 100644 --- a/pandaharvester/harvesterextractor/analysis_extractor.py +++ b/pandaharvester/harvesterextractor/analysis_extractor.py @@ -2,7 +2,7 @@ from .base_extractor import BaseExtractor -# extractor for analysis +# OBSOLETE - use aux_extractor class AnalysisExtractor(BaseExtractor): # constructor def __init__(self, **kwarg): diff --git a/pandaharvester/harvesterextractor/aux_extractor.py b/pandaharvester/harvesterextractor/aux_extractor.py new file mode 100644 index 00000000..67253806 --- /dev/null +++ b/pandaharvester/harvesterextractor/aux_extractor.py @@ -0,0 +1,39 @@ +import re +from .base_extractor import BaseExtractor + + +# extractor for auxiliary input files +class AuxExtractor(BaseExtractor): + # constructor + def __init__(self, **kwarg): + self.containerPrefix = None + BaseExtractor.__init__(self, **kwarg) + + # get auxiliary input files + def get_aux_inputs(self, jobspec): + url_list = [] + jobPars = jobspec.jobParams['jobPars'] + # transformation + trf = jobspec.jobParams['transformation'] + if trf is not None and trf.startswith('http'): + url_list.append(trf) + # extract source URL + tmpM = re.search(' --sourceURL\s+([^\s]+)', jobPars) + if tmpM is not None: + sourceURL = tmpM.group(1) + # extract sandbox + if jobspec.jobParams['prodSourceLabel'] == 'user': + tmpM = re.search('-a\s+([^\s]+)', jobPars) + else: + tmpM = re.search('-i\s+([^\s]+)', jobPars) + if tmpM is not None: + lfn = tmpM.group(1) + url = '{0}/cache/{1}'.format(sourceURL, lfn) + url_list.append(url) + # extract container image + if 'container_name' in jobspec.jobParams: + url = jobspec.jobParams['container_name'] + if self.containerPrefix is not None and not url.startswith(self.containerPrefix): + url = self.containerPrefix + url + url_list.append(url) + return self.make_aux_inputs(url_list) diff --git a/pandaharvester/harvesterfifo/mysql_fifo.py b/pandaharvester/harvesterfifo/mysql_fifo.py index 7405d081..e53976c4 100644 --- a/pandaharvester/harvesterfifo/mysql_fifo.py +++ 
b/pandaharvester/harvesterfifo/mysql_fifo.py @@ -13,6 +13,11 @@ class MysqlFifo(PluginBase): # constructor def __init__(self, **kwarg): + self.reconnectTimeout = 300 + if hasattr(harvester_config, 'fifo') and hasattr(harvester_config.fifo, 'reconnectTimeout'): + self.reconnectTimeout = harvester_config.db.reconnectTimeout + elif hasattr(harvester_config.db, 'reconnectTimeout'): + self.reconnectTimeout = harvester_config.db.reconnectTimeout PluginBase.__init__(self, **kwarg) self.tableName = '{title}_FIFO'.format(title=self.titleName) # DB access attribues @@ -52,8 +57,8 @@ def __init__(self, **kwarg): except ImportError: raise Exception('No available MySQL DB API installed. Please pip install mysqlclient or mysql-connection-python') else: - self.con = mysql.connector.connect(user=db_user, passwd=db_password, - db=db_schema, host=db_host, port=db_port) + self.con = mysql.connector.connect(user=db_user, passwd=db_password, db=db_schema, + host=db_host, port=db_port, charset='utf8') self.cur = self.con.cursor(buffered=True) self.OperationalError = mysql.connector.errors.OperationalError else: @@ -81,7 +86,7 @@ def fetchall(self): raise _e # decorator exception handler for type of DBs - def _handle_exception(method, retry_time=30): + def _handle_exception(method): def _decorator(_method, *args, **kwargs): @functools.wraps(_method) def _wrapped_method(self, *args, **kwargs): @@ -94,13 +99,19 @@ def _wrapped_method(self, *args, **kwargs): isOperationalError = True if isOperationalError: try_timestamp = time.time() - while time.time() - try_timestamp < retry_time: + n_retry = 1 + while time.time() - try_timestamp < self.reconnectTimeout: try: self.__init__() return except Exception as _e: exc = _e - time.sleep(1) + sleep_time = core_utils.retry_period_sec(n_retry, increment=2, max_seconds=300, min_seconds=1) + if not sleep_time: + break + else: + time.sleep(sleep_time) + n_retry += 1 raise exc else: raise exc diff --git a/pandaharvester/harvestermessenger/k8s_messenger.py b/pandaharvester/harvestermessenger/k8s_messenger.py new file mode 100644 index 00000000..232f756d --- /dev/null +++ b/pandaharvester/harvestermessenger/k8s_messenger.py @@ -0,0 +1,55 @@ +import os + +from pandaharvester.harvestercore import core_utils +from .base_messenger import BaseMessenger +from pandaharvester.harvesterconfig import harvester_config +from pandaharvester.harvestermisc.k8s_utils import k8s_Client +# from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper +# from pandaharvester.harvestercore.work_spec import WorkSpec + + +# logger +_logger = core_utils.setup_logger('k8s_messenger') + + +# Messenger for generic Kubernetes clusters +class K8sMessenger(BaseMessenger): + + def __init__(self, **kwargs): + BaseMessenger.__init__(self, **kwargs) + try: + self.logDir + except AttributeError: + print('K8sMessenger: Missing attribute logDir') + raise + self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + self._all_pods_list = self.k8s_client.get_pods_info() + + def post_processing(self, workspec, jobspec_list, map_type): + """ + Do the following in post_processing, i.e. 
when workers terminate (finished/failed/cancelled) + - Fetch logs of the pod from k8s + - Store or upload logs + """ + # get logger + tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), + method_name='post_processing') + tmpLog.debug('start') + try: + # fetch and store logs + job_id = workspec.batchID + pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) + pod_name_list = [ pods_info['name'] for pods_info in pods_list ] + outlog_filename = os.path.join(self.logDir, 'gridK8S.{0}.{1}.out'.format(workspec.workerID, workspec.batchID)) + with open(outlog_filename, 'w') as f: + for pod_name in pod_name_list: + current_log_str = self.k8s_client.get_pod_logs(pod_name) + f.write(current_log_str) + # upload logs + pass + # return + tmpLog.debug('done') + return True + except Exception: + core_utils.dump_error_message(tmpLog) + return None diff --git a/pandaharvester/harvestermisc/apfmon.py b/pandaharvester/harvestermisc/apfmon.py index 42625e99..2f7a6675 100644 --- a/pandaharvester/harvestermisc/apfmon.py +++ b/pandaharvester/harvestermisc/apfmon.py @@ -8,9 +8,8 @@ import traceback from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore import core_utils from pandaharvester import panda_pkg_info -from pandaharvester.harvestermisc import generic_utils +from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.work_spec import WorkSpec from pandaharvester.harvestermisc.info_utils import PandaQueuesDict @@ -117,7 +116,7 @@ def create_labels(self): panda_queues_dict = PandaQueuesDict() # publish the active queues to APF mon in shards - for sites in generic_utils.create_shards(all_sites, 20): + for sites in core_utils.create_shards(all_sites, 20): labels = [] for site in sites: try: @@ -271,7 +270,7 @@ def create_workers(self, worker_spec_list): url = '{0}/jobs'.format(self.base_url) - for worker_spec_shard in generic_utils.create_shards(worker_spec_list, 20): + for worker_spec_shard in core_utils.create_shards(worker_spec_list, 20): apfmon_workers = [] for worker_spec in worker_spec_shard: batch_id = worker_spec.batchID diff --git a/pandaharvester/harvestermisc/generic_utils.py b/pandaharvester/harvestermisc/generic_utils.py deleted file mode 100644 index 4db657da..00000000 --- a/pandaharvester/harvestermisc/generic_utils.py +++ /dev/null @@ -1,14 +0,0 @@ -def create_shards(input_list, size): - """ - Creates shards of size n from the input list. 
- """ - shard, i = [], 0 - for element in input_list: - shard.append(element) - i += 1 - if i == size: - yield shard - shard, i = [], 0 - - if i > 0: - yield shard \ No newline at end of file diff --git a/pandaharvester/harvestermisc/htcondor_utils.py b/pandaharvester/harvestermisc/htcondor_utils.py index c3778a12..cd267276 100644 --- a/pandaharvester/harvestermisc/htcondor_utils.py +++ b/pandaharvester/harvestermisc/htcondor_utils.py @@ -3,7 +3,6 @@ import re import time -import datetime import threading import random import multiprocessing @@ -27,7 +26,6 @@ from pandaharvester.harvestercore import core_utils from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore.core_utils import SingletonWithID -from pandaharvester.harvestercore.work_spec import WorkSpec from pandaharvester.harvestercore.fifos import SpecialFIFOBase # condor python or command api diff --git a/pandaharvester/harvestermisc/info_utils.py b/pandaharvester/harvestermisc/info_utils.py index cb22af4c..dca3615b 100644 --- a/pandaharvester/harvestermisc/info_utils.py +++ b/pandaharvester/harvestermisc/info_utils.py @@ -69,7 +69,7 @@ def get_all_queue_names(self): names = set() for queue_name, queue_dict in iteritems(self): if queue_dict.get('pilot_manager') in ['Harvester'] \ - and queue_dict.get('harvester') == harvesterID: + and queue_dict.get('harvester') == harvesterID: names.add(queue_name) return names @@ -83,6 +83,19 @@ def is_ups_queue(self, panda_resource): return True return False + # is grandly unified queue, i.e. runs analysis and production + def is_grandly_unified_queue(self, panda_resource): + panda_queue_dict = self.get(panda_resource) + if panda_queue_dict is None: + return False + + # initial, temporary nomenclature + if 'grandly_unified' in panda_queue_dict.get('catchall') \ + or panda_queue_dict.get('type') == 'unified': + return True + + return False + # get harvester params def get_harvester_params(self, panda_resource): panda_queue_dict = self.get(panda_resource) @@ -107,5 +120,7 @@ def get_type_workflow(self, panda_resource): workflow = None else: pq_type = panda_queue_dict.get('type') + if pq_type == 'unified': # use production templates + pq_type = 'production' workflow = panda_queue_dict.get('workflow') return pq_type, workflow diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index b197f5d9..d9e80235 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -12,11 +12,18 @@ from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestermisc.info_utils import PandaQueuesDict +from pandaharvester.harvestercore import core_utils + +base_logger = core_utils.setup_logger('k8s_utils') + +CONFIG_DIR = '/scratch/jobconfig' class k8s_Client(object): def __init__(self, namespace, config_file=None): + if not os.path.isfile(config_file): + raise RuntimeError('Cannot find k8s config file: {0}'.format(config_file)) config.load_kube_config(config_file=config_file) self.namespace = namespace if namespace else 'default' self.corev1 = client.CoreV1Api() @@ -29,114 +36,227 @@ def read_yaml_file(self, yaml_file): return yaml_content - def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=True, cpuadjustratio=100, memoryadjustratio=100): + def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, container_image, executable, args, + cert, cert_in_secret=True, cpu_adjust_ratio=100, memory_adjust_ratio=100, max_time=None): + 
+ tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') + + # consider PULL mode as default, unless specified + submit_mode = 'PULL' + + # create the configmap in push mode + worker_id = None + if work_spec.mapType != 'NoJob': + submit_mode = 'PUSH' + worker_id = str(work_spec.workerID) + res = self.create_configmap(work_spec) + if not res: # if the configmap creation failed, don't submit a job because the pod creation will hang + return res, 'Failed to create a configmap' + + # retrieve panda queue information panda_queues_dict = PandaQueuesDict() queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite) + # set the worker name yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID) + # set the resource type and other metadata to filter the pods yaml_content['spec']['template'].setdefault('metadata', {}) - yaml_content['spec']['template']['metadata'].update({ - 'labels': {'resourceType': str(work_spec.resourceType)}}) + yaml_content['spec']['template']['metadata'].update({'labels': + {'resourceType': str(work_spec.resourceType)} + }) + # fill the container details. we can only handle one container (take the first, delete the rest) yaml_containers = yaml_content['spec']['template']['spec']['containers'] - del(yaml_containers[1:len(yaml_containers)]) + del (yaml_containers[1:len(yaml_containers)]) container_env = yaml_containers[0] container_env.setdefault('resources', {}) + # set the container image + if 'image' not in container_env: + container_env['image'] = container_image + if 'command' not in container_env: + container_env['command'] = executable + container_env['args'] = args + + # set the resources (CPU and memory) we need for the container # note that predefined values in the yaml template will NOT be overwritten + # Be familiar with QoS classes: https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod + # The CPU & memory settings will affect the QoS for the pod + container_env.setdefault('resources', {}) if work_spec.nCore > 0: - container_env['resources'].setdefault('limits', { - 'cpu': str(work_spec.nCore)}) - container_env['resources'].setdefault('requests', { - 'cpu': str(work_spec.nCore*cpuadjustratio/100.0)}) - - if work_spec.minRamCount > 4: - # K8S minimum memory limit = 4 MB - container_env['resources'].setdefault('limits', { - 'memory': str(work_spec.minRamCount) + 'M'}) - container_env['resources'].setdefault('requests', { - 'memory': str(work_spec.minRamCount*memoryadjustratio/100.0) + 'M'}) + + # CPU limits + container_env['resources'].setdefault('limits', {}) + if 'cpu' not in container_env['resources']['limits']: + container_env['resources']['limits']['cpu'] = str(work_spec.nCore) + # CPU requests + container_env['resources'].setdefault('requests', {}) + if 'cpu' not in container_env['resources']['requests']: + container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpu_adjust_ratio / 100.0) + + if work_spec.minRamCount > 4: # K8S minimum memory limit = 4 MB + # memory limits + # container_env['resources'].setdefault('limits', {}) + # if 'memory' not in container_env['resources']['limits']: + # container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M' + # memory requests + container_env['resources'].setdefault('requests', {}) + if 'memory' not in container_env['resources']['requests']: + container_env['resources']['requests']['memory'] = str( + work_spec.minRamCount * memory_adjust_ratio / 100.0) + 'M' 
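+        # pass worker and harvester metadata to the job container through environment variables;
+        # exactly one of proxySecretPath / proxyContent is non-None, depending on whether the
+        # proxy certificate is mounted from a Kubernetes secret (cert_in_secret)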
container_env.setdefault('env', []) + # try to retrieve the stdout log file name + try: + log_file_name = work_spec.workAttributes['stdout'] + except (KeyError, AttributeError): + tmp_log.debug('work_spec does not have stdout workAttribute, using default') + log_file_name = '' container_env['env'].extend([ {'name': 'computingSite', 'value': work_spec.computingSite}, {'name': 'pandaQueueName', 'value': queue_name}, {'name': 'resourceType', 'value': work_spec.resourceType}, + {'name': 'prodSourceLabel', 'value': prod_source_label}, + {'name': 'jobType', 'value': work_spec.jobType}, {'name': 'proxySecretPath', 'value': cert if cert_in_secret else None}, {'name': 'proxyContent', 'value': None if cert_in_secret else self.set_proxy(cert)}, {'name': 'workerID', 'value': str(work_spec.workerID)}, {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W}, {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R}, + {'name': 'stdout_name', 'value': log_file_name}, {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id}, {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)}, - {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id} - ]) - + {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id}, + {'name': 'submit_mode', 'value': submit_mode} + ]) + + # in push mode, add the configmap as a volume to the pod + if submit_mode == 'PUSH' and worker_id: + yaml_content['spec']['template']['spec'].setdefault('volumes', []) + yaml_volumes = yaml_content['spec']['template']['spec']['volumes'] + yaml_volumes.append({'name': 'job-config', 'configMap': {'name': worker_id}}) + # mount the volume to the filesystem + container_env.setdefault('volumeMounts', []) + container_env['volumeMounts'].append({'name': 'job-config', 'mountPath': CONFIG_DIR}) + + # set the affinity if 'affinity' not in yaml_content['spec']['template']['spec']: yaml_content = self.set_affinity(yaml_content) + # set max_time to avoid having a pod running forever + if 'activeDeadlineSeconds' not in yaml_content['spec']['template']['spec']: + if not max_time: # 4 days + max_time = 4 * 24 * 23600 + yaml_content['spec']['template']['spec']['activeDeadlineSeconds'] = max_time + + tmp_log.debug('creating job {0}'.format(yaml_content)) + rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace) - return rsp + return rsp, yaml_content + + def generate_ls_from_wsl(self, workspec_list=[]): + if workspec_list: + batch_ids_list = [workspec.batchID for workspec in workspec_list if workspec.batchID] + batch_ids_concat = ','.join(batch_ids_list) + label_selector = 'job-name in ({0})'.format(batch_ids_concat) + else: + label_selector = '' + + return label_selector - def get_pods_info(self): + def get_pods_info(self, workspec_list=[]): + + tmp_log = core_utils.make_logger(base_logger, method_name='get_pods_info') pods_list = list() - ret = self.corev1.list_namespaced_pod(namespace=self.namespace) + label_selector = self.generate_ls_from_wsl(workspec_list) + # tmp_log.debug('label_selector: {0}'.format(label_selector)) - for i in ret.items: - pod_info = {} - pod_info['name'] = i.metadata.name - pod_info['start_time'] = i.status.start_time.replace(tzinfo=None) if i.status.start_time else i.status.start_time - pod_info['status'] = i.status.phase - pod_info['status_reason'] = i.status.conditions[0].reason if i.status.conditions else None - pod_info['status_message'] = i.status.conditions[0].message if i.status.conditions 
else None - pod_info['job_name'] = i.metadata.labels['job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None - pods_list.append(pod_info) + try: + ret = self.corev1.list_namespaced_pod(namespace=self.namespace, label_selector=label_selector) + except Exception as _e: + tmp_log.error('Failed call to list_namespaced_pod with: {0}'.format(_e)) + else: + for i in ret.items: + pod_info = { + 'name': i.metadata.name, + 'start_time': i.status.start_time.replace(tzinfo=None) if i.status.start_time else i.status.start_time, + 'status': i.status.phase, + 'status_conditions': i.status.conditions, + 'job_name': i.metadata.labels['job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None, + 'containers_state': [] + } + if i.status.container_statuses: + for cs in i.status.container_statuses: + if cs.state: + pod_info['containers_state'].append(cs.state) + pods_list.append(pod_info) return pods_list def filter_pods_info(self, pods_list, job_name=None): if job_name: - pods_list = [ i for i in pods_list if i['job_name'] == job_name] + pods_list = [i for i in pods_list if i['job_name'] == job_name] return pods_list - def get_jobs_info(self, job_name=None): + def get_jobs_info(self, workspec_list=[]): + + tmp_log = core_utils.make_logger(base_logger, method_name='get_jobs_info') + jobs_list = list() - field_selector = 'metadata.name=' + job_name if job_name else '' - ret = self.batchv1.list_namespaced_job(namespace=self.namespace, field_selector=field_selector) + label_selector = self.generate_ls_from_wsl(workspec_list) + # tmp_log.debug('label_selector: {0}'.format(label_selector)) + + try: + ret = self.batchv1.list_namespaced_job(namespace=self.namespace, label_selector=label_selector) + + for i in ret.items: + job_info = { + 'name': i.metadata.name, + 'status': i.status.conditions[0].type, + 'status_reason': i.status.conditions[0].reason, + 'status_message': i.status.conditions[0].message + } + jobs_list.append(job_info) + except Exception as _e: + tmp_log.error('Failed call to list_namespaced_job with: {0}'.format(_e)) - for i in ret.items: - job_info = {} - job_info['name'] = i.metadata.name - job_info['status'] = i.status.conditions[0].type - job_info['status_reason'] = i.status.conditions[0].reason - job_info['status_message'] = i.status.conditions[0].message - jobs_list.append(job_info) return jobs_list def delete_pods(self, pod_name_list): - retList = list() + ret_list = list() for pod_name in pod_name_list: - rsp = {} - rsp['name'] = pod_name + rsp = {'name': pod_name} try: - self.corev1.delete_namespaced_pod(name=pod_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) + self.corev1.delete_namespaced_pod(name=pod_name, namespace=self.namespace, body=self.deletev1, + grace_period_seconds=0) except ApiException as _e: rsp['errMsg'] = '' if _e.status == 404 else _e.reason + except Exception as _e: + rsp['errMsg'] = _e.reason else: rsp['errMsg'] = '' - retList.append(rsp) + ret_list.append(rsp) - return retList + return ret_list def delete_job(self, job_name): - self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) + tmp_log = core_utils.make_logger(base_logger, 'job_name={0}'.format(job_name), method_name='delete_job') + try: + self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, + grace_period_seconds=0) + except Exception as _e: + tmp_log.error('Failed call to delete_namespaced_job with: {0}'.format(_e)) + + def 
delete_config_map(self, config_map_name): + self.corev1.delete_namespaced_config_map(name=config_map_name, namespace=self.namespace, body=self.deletev1, + grace_period_seconds=0) def set_proxy(self, proxy_path): with open(proxy_path) as f: @@ -154,16 +274,18 @@ def set_affinity(self, yaml_content): 'labelSelector': {'matchExpressions': [ {'key': 'resourceType', 'operator': 'In', 'values': ['SCORE']}]}, 'topologyKey': 'kubernetes.io/hostname'} - }]} + }]} - resourceType = yaml_content['spec']['template']['metadata']['labels']['resourceType'] + resource_type = yaml_content['spec']['template']['metadata']['labels']['resourceType'] - if resourceType == 'SCORE': + if resource_type == 'SCORE': yaml_affinity['podAffinity'] = copy.deepcopy(affinity_spec) - yaml_affinity['podAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm']['labelSelector']['matchExpressions'][0]['values'][0] = resourceType + yaml_affinity['podAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm'][ + 'labelSelector']['matchExpressions'][0]['values'][0] = resource_type yaml_affinity['podAntiAffinity'] = copy.deepcopy(affinity_spec) - yaml_affinity['podAntiAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm']['labelSelector']['matchExpressions'][0]['values'][0] = res_element.difference({resourceType}).pop() + yaml_affinity['podAntiAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm'][ + 'labelSelector']['matchExpressions'][0]['values'][0] = res_element.difference({resource_type}).pop() return yaml_content @@ -171,17 +293,72 @@ def create_or_patch_secret(self, file_list, secret_name): # api_version = 'v1' # kind = 'Secret' # type='kubernetes.io/tls' + rsp = None + tmp_log = core_utils.make_logger(base_logger, method_name='create_or_patch_secret') + metadata = {'name': secret_name, 'namespace': self.namespace} data = {} - for file in file_list: - filename = os.path.basename(file) - with open(file, 'rb') as f: - str = f.read() - data[filename] = base64.b64encode(str).decode() + for file_name in file_list: + filename = os.path.basename(file_name) + with open(file_name, 'rb') as f: + content = f.read() + data[filename] = base64.b64encode(content).decode() body = client.V1Secret(data=data, metadata=metadata) try: - rsp = self.corev1.patch_namespaced_secret(name=secret_name, body=body, namespace=self.namespace) - except ApiException as e: - print('Exception when patch secret: {0} . Try to create secret instead...'.format(e)) - rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace) + try: + rsp = self.corev1.patch_namespaced_secret(name=secret_name, body=body, namespace=self.namespace) + tmp_log.debug('Patched secret') + except ApiException as e: + tmp_log.debug('Exception when patching secret: {0} . Try to create secret instead...'.format(e)) + rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace) + tmp_log.debug('Created secret') + except Exception as e: + tmp_log.error('Exception when patching or creating secret: {0}.'.format(e)) return rsp + + def create_configmap(self, work_spec): + # useful guide: https://matthewpalmer.net/kubernetes-app-developer/articles/ultimate-configmap-guide-kubernetes.html + + tmp_log = core_utils.make_logger(base_logger, method_name='create_configmap') + + try: + worker_id = str(work_spec.workerID) + + # Get the access point. 
The messenger should have dropped the input files for the pilot here + access_point = work_spec.get_access_point() + pjd = 'pandaJobData.out' + job_data_file = os.path.join(access_point, pjd) + with open(job_data_file) as f: + job_data_contents = f.read() + + pfc = 'PoolFileCatalog_H.xml' + pool_file_catalog_file = os.path.join(access_point, pfc) + with open(pool_file_catalog_file) as f: + pool_file_catalog_contents = f.read() + + # put the job data and PFC into a dictionary + data = {pjd: job_data_contents, pfc: pool_file_catalog_contents} + + # instantiate the configmap object + metadata = {'name': worker_id, 'namespace': self.namespace} + config_map = client.V1ConfigMap(api_version="v1", kind="ConfigMap", data=data, metadata=metadata) + + # create the configmap object in K8s + api_response = self.corev1.create_namespaced_config_map(namespace=self.namespace, body=config_map) + tmp_log.debug('Created configmap for worker id: {0}'.format(worker_id)) + return True + + except (ApiException, TypeError) as e: + tmp_log.error('Could not create configmap with: {0}'.format(e)) + return False + + def get_pod_logs(self, pod_name, previous=False): + tmp_log = core_utils.make_logger(base_logger, method_name='get_pod_logs') + try: + rsp = self.corev1.read_namespaced_pod_log(name=pod_name, namespace=self.namespace, previous=previous) + tmp_log.debug('Log file retrieved for {0}'.format(pod_name)) + except Exception as e: + tmp_log.debug('Exception when getting logs for pod {0} : {1}. Skipped'.format(pod_name, e)) + raise + else: + return rsp diff --git a/pandaharvester/harvestermisc/rucio_utils.py b/pandaharvester/harvestermisc/rucio_utils.py index 7dc45d68..92c026ad 100644 --- a/pandaharvester/harvestermisc/rucio_utils.py +++ b/pandaharvester/harvestermisc/rucio_utils.py @@ -2,30 +2,12 @@ utilities routines associated with Rucio CLI access """ -from future.utils import iteritems - try: import subprocess32 as subprocess except: import subprocess from pandaharvester.harvestercore import core_utils -from pandalogger.PandaLogger import PandaLogger -from pandalogger.LogWrapper import LogWrapper - -import time -import datetime -import uuid -import os -import sys -import stat -import os.path -import threading -import tarfile -import hashlib -import string -import shutil -import errno def rucio_create_dataset(tmpLog,datasetScope,datasetName): @@ -35,58 +17,53 @@ def rucio_create_dataset(tmpLog,datasetScope,datasetName): lifetime = 7*24*60*60 tmpLog.debug('register {0}:{1} lifetime = {2}' .format(datasetScope, datasetName,lifetime)) - try: - executable = ['/usr/bin/env', - 'rucio', 'add-dataset'] - executable += [ '--lifetime',('%d' %lifetime)] - executable += [datasetName] - - #print executable - - tmpLog.debug('rucio add-dataset command: {0} '.format(executable)) - tmpLog.debug('rucio add-dataset command (for human): %s ' % ' '.join(executable)) - - process = subprocess.Popen(executable, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - - stdout,stderr = process.communicate() - - if process.returncode == 0: - tmpLog.debug(stdout) - return True,'' + executable = ['/usr/bin/env', + 'rucio', 'add-dataset'] + executable += [ '--lifetime',('%d' %lifetime)] + executable += [datasetName] + tmpLog.debug('rucio add-dataset command: {0} '.format(executable)) + tmpLog.debug('rucio add-dataset command (for human): %s ' % ' '.join(executable)) + process = subprocess.Popen(executable, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + stdout,stderr = process.communicate() + if 
process.returncode == 0: + tmpLog.debug(stdout) + return True,'' + else: + # check what failed + dataset_exists = False + rucio_sessions_limit_error = False + for line in stdout.split('\n'): + if 'Data Identifier Already Exists' in line: + dataset_exists = True + break + elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line: + rucio_sessions_limit_error = True + break + if dataset_exists: + errMsg = 'dataset {0}:{1} already exists'.format(datasetScope, + datasetName) + tmpLog.debug(errMsg) + return True,errMsg + elif rucio_sessions_limit_error: + # do nothing + errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout) + tmpLog.warning(errStr) + return None,errStr else: - # check what failed - dataset_exists = False - rucio_sessions_limit_error = False - for line in stdout.split('\n'): - if 'Data Identifier Already Exists' in line: - dataset_exists = True - break - elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line: - rucio_sessions_limit_error = True - break - if dataset_exists: - errMsg = 'dataset {0}:{1} already exists'.format(datasetScope, - datasetName) - tmpLog.debug(errMsg) - return True,errMsg - elif rucio_sessions_limit_error: - # do nothing - errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout) - tmpLog.warning(errStr) - return None,errStr - else: - # some other Rucio error - errStr = 'Rucio returned error : stdout: {0}'.format(stdout) - tmpLog.error(errStr) - return False,errStr - except Exception: - errMsg = 'Could not create dataset {0}:{1}'.format(datasetScope, - datasetName) - core_utils.dump_error_message(tmpLog) - tmpLog.error(errMsg) - return False,errMsg + # some other Rucio error + errStr = 'Rucio returned error : stdout: {0}'.format(stdout) + tmpLog.error(errStr) + return False,errStr + except Exception as e: + errMsg = 'Could not create dataset {0}:{1} with {2}'.format(datasetScope, + datasetName, + str(e)) + core_utils.dump_error_message(tmpLog) + tmpLog.error(errMsg) + return False,errMsg def rucio_add_files_to_dataset(tmpLog,datasetScope,datasetName,fileList): # add files to dataset diff --git a/pandaharvester/harvestermonitor/k8s_monitor.py b/pandaharvester/harvestermonitor/k8s_monitor.py index f79224c5..31162711 100644 --- a/pandaharvester/harvestermonitor/k8s_monitor.py +++ b/pandaharvester/harvestermonitor/k8s_monitor.py @@ -1,18 +1,16 @@ -import os -import time import datetime -import re from concurrent.futures import ThreadPoolExecutor from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.work_spec import WorkSpec +from pandaharvester.harvestercore.worker_errors import WorkerErrors from pandaharvester.harvestercore.plugin_base import PluginBase from pandaharvester.harvestermisc.k8s_utils import k8s_Client # logger -baseLogger = core_utils.setup_logger('k8s_monitor') +base_logger = core_utils.setup_logger('k8s_monitor') # monitor for K8S @@ -40,100 +38,129 @@ def __init__(self, **kwarg): self._all_pods_list = [] - def check_pods_status(self, pods_status_list): - newStatus = '' + def check_pods_status(self, pods_status_list, containers_state_list): + sub_msg = '' if 'Unknown' in pods_status_list: if all(item == 'Unknown' for item in pods_status_list): - newStatus = None + new_status = None elif 'Running' in pods_status_list: - newStatus = WorkSpec.ST_running + new_status = WorkSpec.ST_running else: - newStatus = WorkSpec.ST_idle + new_status = WorkSpec.ST_idle else: if all(item == 'Pending' for item in pods_status_list): - newStatus = WorkSpec.ST_submitted + new_status = 
WorkSpec.ST_submitted # elif all(item == 'Succeeded' for item in pods_status_list): - # newStatus = WorkSpec.ST_finished + # new_status = WorkSpec.ST_finished elif 'Succeeded' in pods_status_list: - newStatus = WorkSpec.ST_finished + if all((item.terminated is not None and item.terminated.reason == 'Completed') for item in containers_state_list): + new_status = WorkSpec.ST_finished + else: + sub_mesg_list = [] + for item in containers_state_list: + msg_str = '' + if item.terminated is None: + state = 'UNKNOWN' + if item.running is not None: + state = 'running' + elif item.waiting is not None: + state = 'waiting' + msg_str = 'container not terminated yet ({0}) while pod Succeeded'.format(state) + elif item.terminated.reason != 'Completed': + msg_str = 'container termiated by k8s for reason {0}'.format(item.terminated.reason) + sub_mesg_list.append(msg_str) + sub_msg = ';'.join(sub_mesg_list) + new_status = WorkSpec.ST_cancelled elif 'Running' in pods_status_list: - newStatus = WorkSpec.ST_running + new_status = WorkSpec.ST_running elif 'Failed' in pods_status_list: - newStatus = WorkSpec.ST_failed + new_status = WorkSpec.ST_failed else: - newStatus = WorkSpec.ST_idle + new_status = WorkSpec.ST_idle - return newStatus + return new_status, sub_msg - def check_a_job(self, workspec): + def check_a_worker(self, workspec): # set logger - tmpLog = self.make_logger(baseLogger, 'workerID={0} batchID={1}'.format(workspec.workerID, workspec.batchID), - method_name='check_a_job') + tmp_log = self.make_logger(base_logger, 'workerID={0} batchID={1}'.format(workspec.workerID, workspec.batchID), + method_name='check_a_worker') - ## initialization + # initialization job_id = workspec.batchID - newStatus = workspec.status - errStr = '' + err_str = '' + time_now = datetime.datetime.utcnow() + pods_status_list = [] + pods_name_to_delete_list = [] try: pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) - timeNow = datetime.datetime.utcnow() - pods_status_list = [] - pods_name_to_delete_list = [] + containers_state_list = [] + pods_sup_diag_list = [] for pods_info in pods_list: + # make a list of pods that have been queued too long if pods_info['status'] in ['Pending', 'Unknown'] and pods_info['start_time'] \ - and timeNow - pods_info['start_time'] > datetime.timedelta(seconds=self.podQueueTimeLimit): + and time_now - pods_info['start_time'] > datetime.timedelta(seconds=self.podQueueTimeLimit): # fetch queuing too long pods pods_name_to_delete_list.append(pods_info['name']) + # make list of status of the pods belonging to our job pods_status_list.append(pods_info['status']) + containers_state_list.extend(pods_info['containers_state']) + pods_sup_diag_list.append(pods_info['name']) except Exception as _e: - errStr = 'Failed to get POD status of JOB id={0} ; {1}'.format(job_id, _e) - tmpLog.error(errStr) - newStatus = None + err_str = 'Failed to get POD status of JOB id={0} ; {1}'.format(job_id, _e) + tmp_log.error(err_str) + new_status = None else: if not pods_status_list: - errStr = 'JOB id={0} not found'.format(job_id) - tmpLog.error(errStr) - tmpLog.info('Force to cancel the worker due to JOB not found') - newStatus = WorkSpec.ST_cancelled + # there were no pods found belonging to our job + err_str = 'JOB id={0} not found'.format(job_id) + tmp_log.error(err_str) + tmp_log.info('Force to cancel the worker due to JOB not found') + new_status = WorkSpec.ST_cancelled else: - tmpLog.debug('pods_status_list={0}'.format(pods_status_list)) - newStatus = 
self.check_pods_status(pods_status_list) - tmpLog.debug('new_status={0}'.format(newStatus)) - # delete queuing too long pods + # we found pods belonging to our job. Obtain the final status + tmp_log.debug('pods_status_list={0}'.format(pods_status_list)) + new_status, sub_msg = self.check_pods_status(pods_status_list, containers_state_list) + if sub_msg: + err_str += sub_msg + tmp_log.debug('new_status={0}'.format(new_status)) + + # delete pods that have been queueing too long if pods_name_to_delete_list: - tmpLog.debug('Deleting pods queuing too long') - retList = self.k8s_client.delete_pods(pods_name_to_delete_list) + tmp_log.debug('Deleting pods queuing too long') + ret_list = self.k8s_client.delete_pods(pods_name_to_delete_list) deleted_pods_list = [] - for item in retList: + for item in ret_list: if item['errMsg'] == '': deleted_pods_list.append(item['name']) - tmpLog.debug('Deleted pods queuing too long: {0}'.format( + tmp_log.debug('Deleted pods queuing too long: {0}'.format( ','.join(deleted_pods_list))) + # supplemental diag messages + sup_error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') if err_str else WorkerErrors.error_codes.get('SUCCEEDED') + sup_error_diag = 'PODs=' + ','.join(pods_sup_diag_list) + ' ; ' + err_str + workspec.set_supplemental_error(error_code=sup_error_code, error_diag=sup_error_diag) - return (newStatus, errStr) + return new_status, err_str - - # check workers def check_workers(self, workspec_list): - tmpLog = self.make_logger(baseLogger, 'k8s query', method_name='check_workers') - tmpLog.debug('start') + tmp_log = self.make_logger(base_logger, 'k8s query', method_name='check_workers') + tmp_log.debug('start') - retList = list() + ret_list = list() if not workspec_list: - errStr = 'empty workspec_list' - tmpLog.debug(errStr) - retList.append(('', errStr)) - return False, retList + err_str = 'empty workspec_list' + tmp_log.debug(err_str) + ret_list.append(('', err_str)) + return False, ret_list - self._all_pods_list = self.k8s_client.get_pods_info() + self._all_pods_list = self.k8s_client.get_pods_info(workspec_list=workspec_list) + # resolve status requested workers with ThreadPoolExecutor(self.nProcesses) as thread_pool: - retIterator = thread_pool.map(self.check_a_job, workspec_list) - - retList = list(retIterator) + ret_iterator = thread_pool.map(self.check_a_worker, workspec_list) - tmpLog.debug('done') + ret_list = list(ret_iterator) - return True, retList + tmp_log.debug('done') + return True, ret_list diff --git a/pandaharvester/harvestermonitor/lsf_monitor.py b/pandaharvester/harvestermonitor/lsf_monitor.py new file mode 100644 index 00000000..ea226f60 --- /dev/null +++ b/pandaharvester/harvestermonitor/lsf_monitor.py @@ -0,0 +1,90 @@ +import re +from shlex import quote +from shlex import split + +try: + import subprocess32 as subprocess +except: + import subprocess + +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore.work_spec import WorkSpec +from pandaharvester.harvestercore.plugin_base import PluginBase + +# logger +baseLogger = core_utils.setup_logger('lsf_monitor') + + +# monitor for LSF batch system +class LSFMonitor(PluginBase): + # constructor + def __init__(self, **kwarg): + PluginBase.__init__(self, **kwarg) + + # check workers + def check_workers(self, workspec_list): + retList = [] + for workSpec in workspec_list: + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), + method_name='check_workers') + # command + comStr = 'bjobs -a -noheader -o {0} {1} 
'.format(quote("jobid:10 stat:10"),workSpec.batchID) + comStr_split = split(comStr) + # check + p = subprocess.Popen(comStr_split, + shell=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + newStatus = workSpec.status + # check return code + stdOut, stdErr = p.communicate() + retCode = p.returncode + tmpLog.debug('len(stdOut) = {0} stdOut={1}'.format(len(str(stdOut)),stdOut)) + tmpLog.debug('len(stdErr) = {0} stdErr={1}'.format(len(str(stdErr)),stdErr)) + tmpLog.debug('retCode={0}'.format(retCode)) + errStr = '' + if retCode == 0: + # check if any came back on stdOut otherwise check stdErr + tempresponse = "" + if len(str(stdOut)) >= len(str(stdErr)): + tempresponse = str(stdOut) + else: + tempresponse = str(stdErr) + #tmpLog.debug('tempresponse = {0}'.format(tempresponse)) + # parse + for tmpLine in tempresponse.split('\n'): + tmpMatch = re.search('{0}'.format(workSpec.batchID), tmpLine) + tmpLog.debug('tmpLine = {0} tmpMatch = {1}'.format(tmpLine,tmpMatch)) + if tmpMatch is not None: + errStr = tmpLine + # search for phrase is not found + tmpMatch = re.search('is not found', tmpLine) + if tmpMatch is not None: + batchStatus = 'Job {0} is not found'.format(workSpec.batchID) + newStatus = WorkSpec.ST_failed + tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, + retCode)) + else: + batchStatus = tmpLine.split()[-2] + if batchStatus in ['RUN']: + newStatus = WorkSpec.ST_running + elif batchStatus in ['DONE']: + newStatus = WorkSpec.ST_finished + elif batchStatus in ['PEND', 'PROV','WAIT']: + newStatus = WorkSpec.ST_submitted + else: + newStatus = WorkSpec.ST_failed + tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, + newStatus)) + break + retList.append((newStatus, errStr)) + else: + # failed + errStr = stdOut + ' ' + stdErr + tmpLog.error(errStr) + if 'Unknown Job Id Error' in errStr: + tmpLog.info("Mark job as finished.") + newStatus = WorkSpec.ST_finished + retList.append((newStatus, errStr)) + return True, retList diff --git a/pandaharvester/harvestermonitor/slurm_squeue_monitor.py b/pandaharvester/harvestermonitor/slurm_squeue_monitor.py new file mode 100644 index 00000000..d9a68e01 --- /dev/null +++ b/pandaharvester/harvestermonitor/slurm_squeue_monitor.py @@ -0,0 +1,102 @@ +import re +try: + import subprocess32 as subprocess +except ImportError: + import subprocess + +import json +import os + +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore.work_spec import WorkSpec +from pandaharvester.harvestercore.plugin_base import PluginBase + +# logger +baseLogger = core_utils.setup_logger('slurm_squeue_monitor') + + +# monitor for SLURM batch system with squeue +class SlurmSqueueMonitor(PluginBase): + _HARVESTER_POSTMORTEM_FILENAME="FINISHED" + + # constructor + def __init__(self, **kwarg): + PluginBase.__init__(self, **kwarg) + + # check workers + def check_workers(self, workspec_list): + retList = [] + for workSpec in workspec_list: + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), + method_name='check_workers') + # here try to load file + current_postmortem_fname = '%s/%s' %(workSpec.accessPoint, SlurmSqueueMonitor._HARVESTER_POSTMORTEM_FILENAME) + + if os.path.exists(current_postmortem_fname): + with open(current_postmortem_fname) as postmortem: + try: + worker_status_json = json.load(postmortem) + if 'worker_status' in worker_status_json: + worker_status = None + if worker_status_json['worker_status']=='finished': + worker_status = WorkSpec.ST_finished 
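+                            # a 'failed' postmortem status is mapped below; any other value leaves
+                            # worker_status as None and the worker falls through to the squeue check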
+                            if worker_status_json['worker_status']=='failed':
+                                worker_status = WorkSpec.ST_failed
+                            if worker_status is not None:
+                                retList.append((worker_status, ''))
+                                continue
+                    except json.JSONDecodeError:
+                        tmpLog.debug('Not able to parse JSON in postmortem for a worker: %s, continuing with SLURM CLI' % current_postmortem_fname)
+
+            # command
+            comStr = "squeue -j {0}".format(workSpec.batchID)
+            # check
+            tmpLog.debug('check with {0}'.format(comStr))
+            p = subprocess.Popen(comStr.split(),
+                                 shell=False,
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+            newStatus = workSpec.status
+            # check return code
+            stdOut, stdErr = p.communicate()
+            retCode = p.returncode
+            tmpLog.debug('retCode={0}'.format(retCode))
+            errStr = ''
+            stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode()
+            stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode()
+            if retCode == 0:
+                for tmpLine in stdOut_str.split('\n'):
+                    tmpMatch = re.search('{0} '.format(workSpec.batchID), tmpLine)
+                    if tmpMatch is not None:
+                        errStr = tmpLine
+                        batchStatus = tmpLine.split()[4]
+                        if batchStatus in ['R', 'RUNNING', 'COMPLETING', 'STOPPED', 'SUSPENDED']:
+                            newStatus = WorkSpec.ST_running
+                        elif batchStatus in ['COMPLETED', 'PREEMPTED', 'TIMEOUT']:
+                            newStatus = WorkSpec.ST_finished
+                        elif batchStatus in ['CANCELLED']:
+                            newStatus = WorkSpec.ST_cancelled
+                        elif batchStatus in ['PD', 'CONFIGURING', 'PENDING']:
+                            newStatus = WorkSpec.ST_submitted
+                        else:
+                            newStatus = WorkSpec.ST_failed
+                        tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus,
+                                                                                  newStatus))
+                        break
+                retList.append((newStatus, errStr))
+            else:
+                # squeue does not show finished jobs, gives return code 1
+                # Assume finished for now. Maybe look in workdir.
+ newStatus = WorkSpec.ST_finished + errStr = '{0} {1}'.format(stdOut_str, stdErr_str) + tmpLog.error(errStr) + #if 'slurm_load_jobs error: Invalid job id specified' in errStr: + # newStatus = WorkSpec.ST_failed + retList.append((newStatus, errStr)) + return True, retList + + + def _get_worker_completion_details(): + # try to open FINISHED file + pass diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index d791fefc..99c177aa 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -11,7 +11,6 @@ from pandaharvester.harvestercore.plugin_base import PluginBase from pandaharvester.harvestercore import core_utils from pandaharvester.harvestermover import mover_utils -from pandaharvester.harvesterconfig import harvester_config # logger baseLogger = core_utils.setup_logger('analysis_aux_preparator') @@ -21,7 +20,8 @@ class AnalysisAuxPreparator(PluginBase): # constructor def __init__(self, **kwarg): - self.gulOpts = None + self.containerRuntime = None + self.externalCommand = {} self.maxAttempts = 3 PluginBase.__init__(self, **kwarg) @@ -33,25 +33,44 @@ def trigger_preparation(self, jobspec): tmpLog.debug('start') # loop over all inputs allDone = True + bulkExtCommand = {} + tmpLog.debug('number of inFiles : {0}'.format(len(jobspec.inFiles))) for tmpFileSpec in jobspec.inFiles: # local access path url = tmpFileSpec.url accPath = self.make_local_access_path(tmpFileSpec.scope, tmpFileSpec.lfn) + accPathTmp = accPath + '.tmp' + tmpLog.debug('url : {0} accPath : {1}'.format(url,accPath)) # check if already exits if os.path.exists(accPath): - continue + continue # make directories if needed if not os.path.isdir(os.path.dirname(accPath)): os.makedirs(os.path.dirname(accPath)) - # get + # check if use an external command + extCommand = None + for protocol in self.externalCommand: + if url.startswith(protocol): + extCommand = self.externalCommand[protocol] + # collect file info to execute the command later + bulkExtCommand.setdefault(protocol, {'command': extCommand, 'url': [], 'dst': [], 'lfn': []}) + bulkExtCommand[protocol]['url'].append(url) + bulkExtCommand[protocol]['dst'].append(accPath) + bulkExtCommand[protocol]['lfn'].append(tmpFileSpec.lfn) + break + # execute the command later + if extCommand is not None: + continue + # execute return_code = 1 if url.startswith('http'): try: - tmpLog.debug('getting via http from {0} to {1}'.format(url, accPath)) + tmpLog.debug('getting via http from {0} to {1}'.format(url, accPathTmp)) res = requests.get(url, timeout=180, verify=False) if res.status_code == 200: - with open(accPath, 'w') as f: + with open(accPathTmp, 'wb') as f: f.write(res.content) + tmpLog.debug('Successfully fetched file - {0}'.format(accPathTmp)) return_code = 0 else: errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(url, res.status_code, res.text) @@ -61,30 +80,85 @@ def trigger_preparation(self, jobspec): except Exception: core_utils.dump_error_message(tmpLog) elif url.startswith('docker'): - args = ['docker', 'save', '-o', accPath, url.split('://')[-1]] - try: - tmpLog.debug('executing ' + ' '.join(args)) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - return_code = p.returncode - if stdout is not None: - stdout = stdout.replace('\n', ' ') - if stderr is not None: - stderr = stderr.replace('\n', ' ') - tmpLog.debug("stdout: %s" % 
stdout) - tmpLog.debug("stderr: %s" % stderr) - except Exception: - core_utils.dump_error_message(tmpLog) + if self.containerRuntime is None: + tmpLog.debug('container downloading is disabled') + continue + if self.containerRuntime == 'docker': + args = ['docker', 'save', '-o', accPathTmp, url.split('://')[-1]] + return_code = self.make_image(jobspec,args) + elif self.containerRuntime == 'singularity': + args = ['singularity', 'build', '--sandbox', accPathTmp, url ] + return_code = self.make_image(jobspec,args) + else: + tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) elif url.startswith('/'): try: - shutil.copyfile(url, accPath) + shutil.copyfile(url, accPathTmp) return_code = 0 except Exception: core_utils.dump_error_message(tmpLog) else: tmpLog.error('unsupported protocol in {0}'.format(url)) + # remove empty files + if os.path.exists(accPathTmp) and os.path.getsize(accPathTmp) == 0: + return_code = 1 + tmpLog.debug('remove empty file - {0}'.format(accPathTmp)) + try: + os.remove(accPathTmp) + except Exception: + core_utils.dump_error_message(tmpLog) + # rename + if return_code == 0: + try: + os.rename(accPathTmp, accPath) + except Exception: + return_code = 1 + core_utils.dump_error_message(tmpLog) if return_code != 0: allDone = False + # execute external command + execIdMap = {} + tmpLog.debug('bulkExtCommand : {0}'.format(bulkExtCommand)) + for protocol in bulkExtCommand: + args = [] + for arg in bulkExtCommand[protocol]['command']['trigger']['args']: + if arg == '{src}': + arg = ','.join(bulkExtCommand[protocol]['url']) + elif arg == '{dst}': + arg = ','.join(bulkExtCommand[protocol]['dst']) + args.append(arg) + # execute + try: + tmpLog.debug('executing external command: ' + ' '.join(args)) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + stdout, stderr = p.communicate() + return_code = p.returncode + if stdout is None: + stdout = '' + if stderr is None: + stderr = '' + # get ID of command execution such as transfer ID and batch job ID + executionID = None + if return_code == 0 and 'check' in bulkExtCommand[protocol]['command']: + executionID = [s for s in stdout.split('\n') if s][-1] + dst = ','.join(bulkExtCommand[protocol]['dst']) + executionID = '{0}:{1}:{2}'.format(protocol, executionID, dst) + tmpLog.debug('executionID - {0}'.format(executionID)) + execIdMap[executionID] = {'lfns': bulkExtCommand[protocol]['lfn'], 'groupStatus': 'active'} + stdout = stdout.replace('\n', ' ') + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: {0}".format(stdout)) + tmpLog.debug("stderr: {0}".format(stderr)) + if executionID is not None: + tmpLog.debug("execution ID: {0}".format(executionID)) + except Exception: + core_utils.dump_error_message(tmpLog) + allDone = False + # keep execution ID to check later + tmpLog.debug('execIdMap : {0}'.format(execIdMap)) + if execIdMap: + jobspec.set_groups_to_files(execIdMap) + # done if allDone: tmpLog.debug('succeeded') return True, '' @@ -101,18 +175,93 @@ def trigger_preparation(self, jobspec): # check status def check_stage_in_status(self, jobspec): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='check_stage_in_status') + tmpLog.debug('start') + allDone = True + errMsg = '' + transferGroups = jobspec.get_groups_of_input_files(skip_ready=True) + for tmpGroupID in transferGroups: + if tmpGroupID is None: + continue + tmpGroupID_parts = tmpGroupID.split(':',maxsplit=2) + tmpLog.debug('transfer 
group ID : {0} components: {1}'.format(tmpGroupID, tmpGroupID_parts)) + protocol, executionID, dst = tmpGroupID.split(':',maxsplit=2) + args = [] + for arg in self.externalCommand[protocol]['check']['args']: + if arg == '{id}': + arg = executionID + elif arg == '{dst}': + arg = dst + args.append(arg) + # execute + try: + tmpLog.debug('executing external command: ' + ' '.join(args)) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + stdout, stderr = p.communicate() + return_code = p.returncode + if stdout is None: + stdout = '' + if stderr is None: + stderr = '' + stdout = stdout.replace('\n', ' ') + stderr = stderr.replace('\n', ' ') + tmpLog.debug("return_code: {0}".format(return_code)) + tmpLog.debug("stdout: {0}".format(stdout)) + tmpLog.debug("stderr: {0}".format(stderr)) + if return_code != 0: + errMsg = '{0} is not ready'.format(tmpGroupID) + allDone = False + break + except Exception: + errMsg = core_utils.dump_error_message(tmpLog) + allDone = False + break + if not allDone: + tmpLog.debug("check_stage_in_status: Return : None errMsg : {0}".format(errMsg)) + return None, errMsg + tmpLog.debug("check_stage_in_status: Return : True") return True, '' # resolve input file paths def resolve_input_paths(self, jobspec): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='resolve_input_paths') pathInfo = dict() for tmpFileSpec in jobspec.inFiles: url = tmpFileSpec.lfn accPath = self.make_local_access_path(tmpFileSpec.scope, tmpFileSpec.lfn) pathInfo[tmpFileSpec.lfn] = {'path': accPath} + tmpLog.debug('lfn: {0} scope : {1} accPath : {2} pathInfo : {3}'.format(url, tmpFileSpec.scope, accPath, pathInfo)) jobspec.set_input_file_paths(pathInfo) return True, '' # make local access path def make_local_access_path(self, scope, lfn): return mover_utils.construct_file_path(self.localBasePath, scope, lfn) + + # run the command to create the image + def make_image(self, jobspec, args): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='make_image') + tmpLog.debug('start') + return_code = 1 + try: + tmpLog.debug('executing ' + ' '.join(args)) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + stdout, stderr = p.communicate() + return_code = p.returncode + if stdout is not None: + stdout = stdout.replace('\n', ' ') + if stderr is not None: + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: {0}".format(stdout)) + tmpLog.debug("stderr: {0}".format(stderr)) + except Exception: + core_utils.dump_error_message(tmpLog) + tmpLog.debug('end with return code {0}'.format(return_code)) + return return_code + diff --git a/pandaharvester/harvesterpreparator/aux_preparator.py b/pandaharvester/harvesterpreparator/aux_preparator.py new file mode 100644 index 00000000..4fd306cb --- /dev/null +++ b/pandaharvester/harvesterpreparator/aux_preparator.py @@ -0,0 +1,13 @@ +from . 
import analysis_aux_preparator +from .analysis_aux_preparator import AnalysisAuxPreparator +from pandaharvester.harvestercore import core_utils + +# logger +baseLogger = core_utils.setup_logger('aux_preparator') + +analysis_aux_preparator.baseLogger = baseLogger + + +# preparator plugin for auxiliary inputs +class AuxPreparator (AnalysisAuxPreparator): + pass \ No newline at end of file diff --git a/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py b/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py index a949105b..0bef1361 100644 --- a/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py +++ b/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py @@ -133,7 +133,7 @@ def trigger_preparation(self, jobspec): ErrMsg = 'These files failed to download : ' if files: threads = [] - n_files_per_thread = (len(files) + self.n_threads - 1) / self.n_threads + n_files_per_thread = (len(files) + self.n_threads - 1) // self.n_threads tmpLog.debug('num files per thread: %s' % n_files_per_thread) for i in range(0, len(files), n_files_per_thread): sub_files = files[i:i + n_files_per_thread] diff --git a/pandaharvester/harvesterpreparator/xrdcp_preparator.py b/pandaharvester/harvesterpreparator/xrdcp_preparator.py new file mode 100644 index 00000000..cf17b1e0 --- /dev/null +++ b/pandaharvester/harvesterpreparator/xrdcp_preparator.py @@ -0,0 +1,149 @@ +import os +import tempfile +try: + import subprocess32 as subprocess +except Exception: + import subprocess + +from pandaharvester.harvestercore.plugin_base import PluginBase +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestermover import mover_utils + +# logger +baseLogger = core_utils.setup_logger('xrdcp_preparator') + + +# preparator plugin with https://xrootd.slac.stanford.edu/ xrdcp +""" + -- Example of plugin config + "preparator": { + "name": "XrdcpPreparator", + "module": "pandaharvester.harvesterpreparator.xrdcp_preparator", + # base path for source xrdcp server + "srcBasePath": " root://dcgftp.usatlas.bnl.gov:1096//pnfs/usatlas.bnl.gov/BNLT0D1/rucio", + # base path for local access to the copied files + "localBasePath": "/hpcgpfs01/scratch/benjamin/harvester/rucio-data-area", + # max number of attempts + "maxAttempts": 3, + # check paths under localBasePath. 
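+    # (when enabled, files already present locally with a matching adler32 checksum are skipped)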
+ "checkLocalPath": true, + # options for xrdcp + "xrdcpOpts": "--retry 3 --cksum adler32 --debug 1" + } +""" +class XrdcpPreparator(PluginBase): + # constructor + def __init__(self, **kwarg): + self.xrdcpOpts = None + self.maxAttempts = 3 + self.timeout = None + self.checkLocalPath = True + PluginBase.__init__(self, **kwarg) + + # trigger preparation + def trigger_preparation(self, jobspec): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='trigger_preparation') + tmpLog.debug('start') + # get the environment + harvester_env = os.environ.copy() + #tmpLog.debug('Harvester environment : {}'.format(harvester_env)) + # loop over all inputs + inFileInfo = jobspec.get_input_file_attributes() + xrdcpInput = None + allfiles_transfered = True + overall_errMsg = "" + for tmpFileSpec in jobspec.inFiles: + # construct source and destination paths + srcPath = mover_utils.construct_file_path(self.srcBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], + tmpFileSpec.lfn) + # local path + localPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], + tmpFileSpec.lfn) + if self.checkLocalPath: + # check if already exists + if os.path.exists(localPath): + # calculate checksum + checksum = core_utils.calc_adler32(localPath) + checksum = 'ad:{0}'.format(checksum) + if checksum == inFileInfo[tmpFileSpec.lfn]['checksum']: + continue + # make directories if needed + if not os.path.isdir(os.path.dirname(localPath)): + os.makedirs(os.path.dirname(localPath)) + tmpLog.debug('Make directory - {0}'.format(os.path.dirname(localPath))) + # collect list of input files + if xrdcpInput is None: + xrdcpInput = [srcPath] + else: + xrdcpInput.append(srcPath) + # transfer using xrdcp one file at a time + tmpLog.debug('execute xrdcp') + args = ['xrdcp', '--nopbar', '--force'] + args_files = [srcPath,localPath] + if self.xrdcpOpts is not None: + args += self.xrdcpOpts.split() + args += args_files + tmpFileSpec.attemptNr += 1 + try: + xrdcp_cmd = ' '.join(args) + tmpLog.debug('execute: {0}'.format(xrdcp_cmd)) + p = subprocess.Popen(xrdcp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=harvester_env, shell=True) + try: + stdout, stderr = p.communicate(timeout=self.timeout) + except subprocess.TimeoutExpired: + p.kill() + stdout, stderr = p.communicate() + tmpLog.warning('command timeout') + return_code = p.returncode + if stdout is not None: + if not isinstance(stdout, str): + stdout = stdout.decode() + stdout = stdout.replace('\n', ' ') + if stderr is not None: + if not isinstance(stderr, str): + stderr = stderr.decode() + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: %s" % stdout) + tmpLog.debug("stderr: %s" % stderr) + except Exception: + core_utils.dump_error_message(tmpLog) + return_code = 1 + if return_code != 0: + overall_errMsg += "file - {0} did not transfer error code {1} ".format(localPath,return_code) + allfiles_transfered = False + errMsg = 'failed with {0}'.format(return_code) + tmpLog.error(errMsg) + # check attemptNr + if tmpFileSpec.attemptNr >= self.maxAttempts: + errMsg = 'gave up due to max attempts' + tmpLog.error(errMsg) + return (False, errMsg) + # end loop over input files + # nothing to transfer + if xrdcpInput is None: + tmpLog.debug('done with no transfers') + return True, '' + # check if all files were transferred + if allfiles_transfered : + return True, '' + else: + return None, overall_errMsg + + + # check status + def check_stage_in_status(self, jobspec): + return True, '' + + # 
resolve input file paths + def resolve_input_paths(self, jobspec): + # input files + inFileInfo = jobspec.get_input_file_attributes() + pathInfo = dict() + for tmpFileSpec in jobspec.inFiles: + localPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], + tmpFileSpec.lfn) + pathInfo[tmpFileSpec.lfn] = {'path': localPath} + jobspec.set_input_file_paths(pathInfo) + return True, '' diff --git a/pandaharvester/harvesterstager/rucio_stager_hpc.py b/pandaharvester/harvesterstager/rucio_stager_hpc.py index 7426d9a3..95c49a0b 100644 --- a/pandaharvester/harvesterstager/rucio_stager_hpc.py +++ b/pandaharvester/harvesterstager/rucio_stager_hpc.py @@ -136,7 +136,7 @@ def trigger_stage_out(self, jobspec): stdout, stderr = process.communicate() fileSpec.attemptNr += 1 - stdout = stdout + " attemptNr: %s" % fileSpec.attemptNr + stdout = stdout.decode() + " attemptNr: %s" % fileSpec.attemptNr tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) if process.returncode == 0: diff --git a/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py b/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py index 3133c7cc..0abb90bd 100644 --- a/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py +++ b/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py @@ -151,7 +151,7 @@ def _stage_one_file(fileSpec): stdout, stderr = process.communicate() fileSpec.attemptNr += 1 - stdout = stdout + " attemptNr: %s" % fileSpec.attemptNr + stdout = stdout.decode() + " attemptNr: %s" % fileSpec.attemptNr tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) if process.returncode == 0: diff --git a/pandaharvester/harvesterstager/xrdcp_stager.py b/pandaharvester/harvesterstager/xrdcp_stager.py new file mode 100644 index 00000000..0905bdfe --- /dev/null +++ b/pandaharvester/harvesterstager/xrdcp_stager.py @@ -0,0 +1,264 @@ +import os +import tempfile +import gc + +try: + import subprocess32 as subprocess +except Exception: + import subprocess + +from pandaharvester.harvestermover import mover_utils +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvesterstager.base_stager import BaseStager + +import uuid + +# logger +_logger = core_utils.setup_logger('xrdcp_stager') + +# stager plugin with https://xrootd.slac.stanford.edu/ xrdcp +""" + -- Example of plugin config + "stager": { + "name": "XrdcpStager", + "module": "pandaharvester.harvesterstager.xrdcp_stager", + # base path for destinattion xrdcp server + "dstBasePath": " root://dcgftp.usatlas.bnl.gov:1096//pnfs/usatlas.bnl.gov/BNLT0D1/rucio", + # base path for local access to the copied files + "localBasePath": "/hpcgpfs01/scratch/benjamin/harvester/rucio-data-area", + # max number of attempts + "maxAttempts": 3, + # check paths under localBasePath. + "checkLocalPath": true, + # options for xrdcp + "xrdcpOpts": "--retry 3 --cksum adler32 --debug 1" + } +""" + + +# dummy plugin for stager +class XrdcpStager(BaseStager): + # constructor + def __init__(self, **kwarg): + BaseStager.__init__(self, **kwarg) + if not hasattr(self, 'xrdcpOpts'): + self.xrdcpOpts = None + if not hasattr(self, 'maxAttempts'): + self.maxAttempts = 3 + if not hasattr(self, 'timeout'): + self.timeout = None + if not hasattr(self, 'checkLocalPath'): + self.checkLocalPath = True + + # check status + def check_stage_out_status(self, jobspec): + """Check the status of stage-out procedure. If staging-out is done synchronously in trigger_stage_out + this method should always return True. 
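+        In this plugin the xrdcp transfer is performed synchronously in trigger_stage_out, so this check simply marks the remaining output files as 'finished'.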
+ Output files are available through jobspec.get_outfile_specs(skip_done=False) which gives + a list of FileSpecs not yet done. + FileSpec.attemptNr shows how many times the transfer was checked for the file. + If the file was successfully transferred, status should be set to 'finished'. + Or 'failed', if the file failed to be transferred. Once files are set to 'finished' or 'failed', + jobspec.get_outfile_specs(skip_done=False) ignores them. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True: transfer success, False: fatal transfer failure, + None: on-going or temporary failure) and error dialog + :rtype: (bool, string) + """ + for fileSpec in jobspec.get_output_file_specs(skip_done=True): + fileSpec.status = 'finished' + return True, '' + + # trigger stage out + def trigger_stage_out(self, jobspec): + """Trigger the stage-out procedure for the job. + Output files are available through jobspec.get_outfile_specs(skip_done=False) which gives + a list of FileSpecs not yet done. + FileSpec.attemptNr shows how many times transfer was tried for the file so far. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True: success, False: fatal failure, None: temporary failure) + and error dialog + :rtype: (bool, string) + """ + + # let gc clean up memory + gc.collect() + + # make logger + tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='trigger_stage_out') + tmpLog.debug('start') + # get the environment + harvester_env = os.environ.copy() + #tmpLog.debug('Harvester environment : {}'.format(harvester_env)) + + xrdcpOutput = None + allfiles_transfered = True + overall_errMsg = "" + fileAttrs = jobspec.get_output_file_attributes() + # loop over all output files + for fileSpec in jobspec.get_output_file_specs(skip_done=True): + # fileSpec.objstoreID = 123 + # fileSpec.fileAttributes['guid'] + # construct source and destination paths + dstPath = mover_utils.construct_file_path(self.dstBasePath, fileAttrs[fileSpec.lfn]['scope'], + fileSpec.lfn) + # local path + localPath = mover_utils.construct_file_path(self.localBasePath, fileAttrs[fileSpec.lfn]['scope'], + fileSpec.lfn) + tmpLog.debug('fileSpec.path - {0} fileSpec.lfn = {1}'.format(fileSpec.path,fileSpec.lfn)) + localPath = fileSpec.path + if self.checkLocalPath: + # check if already exits + if os.path.exists(localPath): + # calculate checksum + checksum = core_utils.calc_adler32(localPath) + checksum = 'ad:{0}'.format(checksum) + if checksum == fileAttrs[fileSpec.lfn]['checksum']: + continue + # collect list of output files + if xrdcpOutput is None: + xrdcpOutput = [dstPath] + else: + if dstPath not in xrdcpOutput : + xrdcpOutput.append(dstPath) + # transfer using xrdcp one file at a time + tmpLog.debug('execute xrdcp') + args = ['xrdcp', '--nopbar', '--force'] + args_files = [localPath,dstPath] + if self.xrdcpOpts is not None: + args += self.xrdcpOpts.split() + args += args_files + fileSpec.attemptNr += 1 + try: + xrdcp_cmd = ' '.join(args) + tmpLog.debug('execute: {0}'.format(xrdcp_cmd)) + process = subprocess.Popen(xrdcp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=harvester_env, shell=True) + try: + stdout, stderr = process.communicate(timeout=self.timeout) + except subprocess.TimeoutExpired: + process.kill() + stdout, stderr = process.communicate() + tmpLog.warning('command timeout') + return_code = process.returncode + if stdout is not None: + if not isinstance(stdout, str): + stdout = 
stdout.decode() + stdout = stdout.replace('\n', ' ') + if stderr is not None: + if not isinstance(stderr, str): + stderr = stderr.decode() + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: %s" % stdout) + tmpLog.debug("stderr: %s" % stderr) + except Exception: + core_utils.dump_error_message(tmpLog) + return_code = 1 + if return_code == 0: + fileSpec.status = 'finished' + else: + overall_errMsg += "file - {0} did not transfer error code {1} ".format(localPath,return_code) + allfiles_transfered = False + errMsg = 'failed with {0}'.format(return_code) + tmpLog.error(errMsg) + # check attemptNr + if fileSpec.attemptNr >= self.maxAttempts: + tmpLog.error('reached maxattempts: {0}, marked it as failed'.format(self.maxAttempts)) + fileSpec.status = 'failed' + + # force update + fileSpec.force_update('status') + tmpLog.debug('file: {0} status: {1}'.format(fileSpec.lfn, fileSpec.status)) + del process, stdout, stderr + + # end loop over output files + + # nothing to transfer + if xrdcpOutput is None: + tmpLog.debug('done with no transfers') + return True, '' + # check if all files were transfered + tmpLog.debug('done') + if allfiles_transfered : + return True, '' + else: + return None, overall_errMsg + + + # zip output files + def zip_output(self, jobspec): + """OBSOLETE : zip functions should be implemented in zipper plugins. + Zip output files. This method loops over jobspec.outFiles, which is a list of zip file's FileSpecs, + to make a zip file for each zip file's FileSpec. FileSpec.associatedFiles is a list of FileSpecs of + associated files to be zipped. The path of each associated file is available in associated + file's FileSpec.path. Once zip files are made, their FileSpec.path, FileSpec.fsize and + FileSpec.chksum need to be set. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True for success, False otherwise) and error dialog + :rtype: (bool, string) + """ + # make logger + tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='zip_output') + return self.simple_zip_output(jobspec, tmpLog) + + # asynchronous zip output + def async_zip_output(self, jobspec): + """OBSOLETE : zip functions should be implemented in zipper plugins. + Zip output files asynchronously. This method is followed by post_zip_output(), + which is typically useful to trigger an asynchronous zipping mechanism such as batch job. + This method loops over jobspec.outFiles, which is a list of zip file's FileSpecs, to make + a zip file for each zip file's FileSpec. FileSpec.associatedFiles is a list of FileSpecs + of associated files to be zipped. The path of each associated file is available in associated + file's FileSpec.path. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True for success, False otherwise) and error dialog + :rtype: (bool, string) + """ + # make logger + tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='zip_output') + # set some ID which can be used for lookup in post_zip_output() + groupID = str(uuid.uuid4()) + lfns = [] + for fileSpec in jobspec.outFiles: + lfns.append(fileSpec.lfn) + jobspec.set_groups_to_files({groupID: {'lfns': lfns, + 'groupStatus': 'zipping'} + } + ) + return True, '' + + # post zipping + def post_zip_output(self, jobspec): + """OBSOLETE : zip functions should be implemented in zipper plugins. + This method is executed after async_zip_output(), to do post-processing for zipping. 
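+        (For instance, an implementation could poll the batch system or look for the zip files produced by async_zip_output().)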
+ Once zip files are made, this method needs to look over jobspec.outFiles to set their + FileSpec.path, FileSpec.fsize, and FileSpec.chksum. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True for success, False otherwise) and error dialog + :rtype: (bool, string) + """ + # make logger + tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='zip_output') + # get groups for lookup + groups = jobspec.get_groups_of_output_files() + # do something with groupIDs + pass + # update file attributes + for fileSpec in jobspec.outFiles: + fileSpec.path = '/path/to/zip' + fileSpec.fsize = 12345 + fileSpec.chksum = '66bb0985' + return True, '' diff --git a/pandaharvester/harvestersubmitter/act_submitter.py b/pandaharvester/harvestersubmitter/act_submitter.py index 177372f3..981997a0 100644 --- a/pandaharvester/harvestersubmitter/act_submitter.py +++ b/pandaharvester/harvestersubmitter/act_submitter.py @@ -2,7 +2,7 @@ import json import socket import time -import urllib +import urllib.parse from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase @@ -40,7 +40,6 @@ def __init__(self, **kwarg): uc.ProxyPath(str(proxy)) cred = arc.Credential(uc) dn = cred.GetIdentityName() - self.log.info("Proxy {0} with DN {1} and role {2}".format(proxy, dn, role)) actp = aCTProxy(self.log) attr = '/atlas/Role='+role @@ -69,6 +68,8 @@ def submit_workers(self, workspec_list): if jobSpec: jobSpec = jobSpec[0] tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map())) + # Unified queues: take prodsourcelabel from job + prodSourceLabel = jobSpec.jobParams.get('prodSourceLabel', prodSourceLabel) desc = {} # If we need to prefetch events, set aCT status waiting. 
@@ -81,7 +82,8 @@ def submit_workers(self, workspec_list): desc['pandastatus'] = 'sent' desc['actpandastatus'] = 'sent' desc['siteName'] = workSpec.computingSite - desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel == 'user' else 'production'] + desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel in ['user', 'panda'] else 'production'] + desc['prodSourceLabel'] = prodSourceLabel desc['sendhb'] = 0 metadata = {'harvesteraccesspoint': workSpec.get_access_point(), 'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)} @@ -90,7 +92,7 @@ def submit_workers(self, workspec_list): if jobSpec: # push mode: aCT takes the url-encoded job description (like it gets from panda server) pandaid = jobSpec.PandaID - actjobdesc = urllib.urlencode(jobSpec.jobParams) + actjobdesc = urllib.parse.urlencode(jobSpec.jobParams) else: # pull mode: just set pandaid (to workerid) and prodsourcelabel pandaid = workSpec.workerID diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 16179feb..01e63604 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -4,10 +4,11 @@ import tempfile import threading import random +import json from concurrent.futures import ThreadPoolExecutor import re -from math import sqrt, log1p +from math import log1p from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper @@ -43,7 +44,8 @@ def _get_ce_weighting(ce_endpoint_list=[], worker_ce_all_tuple=None): for _ce in worker_ce_backend_throughput_dict)) thruput_avg = (log1p(Q_good_init) - log1p(Q_good_fin)) n_new_workers = float(n_new_workers) - def _get_thruput(_ce_endpoint): + + def _get_thruput(_ce_endpoint): # inner function if _ce_endpoint not in worker_ce_backend_throughput_dict: q_good_init = 0. q_good_fin = 0. @@ -54,7 +56,8 @@ def _get_thruput(_ce_endpoint): for _st in ('submitted',))) thruput = (log1p(q_good_init) - log1p(q_good_fin)) return thruput - def _get_thruput_adj_ratio(thruput): + + def _get_thruput_adj_ratio(thruput): # inner function try: thruput_adj_ratio = thruput/thruput_avg + 1/N except ZeroDivisionError: @@ -65,7 +68,8 @@ def _get_thruput_adj_ratio(thruput): return thruput_adj_ratio ce_base_weight_sum = sum((_get_thruput_adj_ratio(_get_thruput(_ce)) for _ce in ce_endpoint_list)) - def _get_init_weight(_ce_endpoint): + + def _get_init_weight(_ce_endpoint): # inner function if _ce_endpoint not in worker_ce_stats_dict: q = 0. r = 0. 
@@ -174,16 +178,13 @@ def _condor_macro_replace(string, **kwarg): # Parse resource type from string for Unified PanDA Queue -def _get_resource_type(string, is_unified_queue, is_pilot_option=False, pilot_version='1'): +def _get_resource_type(string, is_unified_queue, is_pilot_option=False): string = str(string) if not is_unified_queue: ret = '' elif string in set(['SCORE', 'MCORE', 'SCORE_HIMEM', 'MCORE_HIMEM']): if is_pilot_option: - if pilot_version == '2': - ret = '--resource-type {0}'.format(string) - else: - ret = '-R {0}'.format(string) + ret = '--resource-type {0}'.format(string) else: ret = string else: @@ -192,25 +193,29 @@ def _get_resource_type(string, is_unified_queue, is_pilot_option=False, pilot_ve # Map "pilotType" (defined in harvester) to prodSourceLabel and pilotType option (defined in pilot, -i option) -# and piloturl (pilot option --piloturl) -# Depending on pilot version 1 or 2 -def _get_prodsourcelabel_pilotypeopt_piloturlstr(pilot_type, pilot_version='1'): - if pilot_version == '2': - # pilot 2 - pt_psl_map = { - 'RC': ('rc_test2', 'RC', '--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev.tar.gz'), - 'ALRB': ('rc_alrb', 'ALRB', ''), - 'PT': ('ptest', 'PR', ''), +# and piloturl (pilot option --piloturl) for pilot 2 +def _get_complicated_pilot_options(pilot_type, pilot_url=None): + pt_psl_map = { + 'RC': { + 'prod_source_label': 'rc_test2', + 'pilot_type_opt': 'RC', + 'pilot_url_str': '--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev.tar.gz', + }, + 'ALRB': { + 'prod_source_label': 'rc_alrb', + 'pilot_type_opt': 'ALRB', + 'pilot_url_str': '', + }, + 'PT': { + 'prod_source_label': 'ptest', + 'pilot_type_opt': 'PR', + 'pilot_url_str': '', + }, } - else: - # pilot 1, need not piloturl since wrapper covers it - pt_psl_map = { - 'RC': ('rc_test', 'RC', ''), - 'ALRB': ('rc_alrb', 'ALRB', ''), - 'PT': ('ptest', 'PR', ''), - } - pilot_opt_tuple = pt_psl_map.get(pilot_type, None) - return pilot_opt_tuple + pilot_opt_dict = pt_psl_map.get(pilot_type, None) + if pilot_url and pilot_opt_dict: + pilot_opt_dict['pilot_url_str'] = '--piloturl {0}'.format(pilot_url) + return pilot_opt_dict # submit a bag of workers @@ -238,8 +243,6 @@ def submit_bag_of_workers(data_list): worker_retval_map[workerID] = (tmpRetVal, workspec.get_changed_attributes()) # attributes try: - ce_info_dict = data['ce_info_dict'] - batch_log_dict = data['batch_log_dict'] use_spool = data['use_spool'] except KeyError: errStr = '{0} not submitted due to incomplete data of the worker'.format(workerID) @@ -326,8 +329,8 @@ def submit_bag_of_workers(data_list): # make a condor jdl for a worker def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, executable_file, - x509_user_proxy, log_subdir=None, ce_info_dict=dict(), batch_log_dict=dict(), - special_par='', harvester_queue_config=None, is_unified_queue=False, pilot_version='1', **kwarg): + x509_user_proxy, log_subdir=None, ce_info_dict=dict(), batch_log_dict=dict(), pilot_url=None, + special_par='', harvester_queue_config=None, is_unified_queue=False, pilot_version='unknown', **kwarg): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='make_a_jdl') @@ -362,13 +365,15 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e request_walltime_minute = _div_round_up(request_walltime, 60) request_cputime_minute = _div_round_up(request_cputime, 60) # decide prodSourceLabel - pilot_opt_tuple = 
_get_prodsourcelabel_pilotypeopt_piloturlstr(workspec.pilotType, pilot_version) - if pilot_opt_tuple is None: - prod_source_label = harvester_queue_config.get_source_label() + pilot_opt_dict = _get_complicated_pilot_options(workspec.pilotType, pilot_url=pilot_url) + if pilot_opt_dict is None: + prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) pilot_type_opt = workspec.pilotType - pilot_url_str = '' + pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else '' else: - prod_source_label, pilot_type_opt, pilot_url_str = pilot_opt_tuple + prod_source_label = pilot_opt_dict['prod_source_label'] + pilot_type_opt = pilot_opt_dict['pilot_type_opt'] + pilot_url_str = pilot_opt_dict['pilot_url_str'] # open tmpfile as submit description file tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sdf', dir=workspec.get_access_point()) # fill in template string @@ -401,11 +406,13 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e logSubdir=log_subdir, gtag=batch_log_dict.get('gtag', 'fake_GTAG_string'), prodSourceLabel=prod_source_label, + jobType=workspec.jobType, resourceType=_get_resource_type(workspec.resourceType, is_unified_queue), - pilotResourceTypeOption=_get_resource_type(workspec.resourceType, is_unified_queue, True, pilot_version), + pilotResourceTypeOption=_get_resource_type(workspec.resourceType, is_unified_queue, True), ioIntensity=io_intensity, pilotType=pilot_type_opt, pilotUrlOption=pilot_url_str, + pilotVersion=pilot_version, ) # save jdl to submit description file tmpFile.write(jdl_str) @@ -436,7 +443,9 @@ def parse_batch_job_filename(value_str, file_dir, batchID, guess=False): class HTCondorSubmitter(PluginBase): # constructor def __init__(self, **kwarg): + tmpLog = core_utils.make_logger(baseLogger, method_name='__init__') self.logBaseURL = None + self.templateFile = None PluginBase.__init__(self, **kwarg) # number of processes try: @@ -456,11 +465,16 @@ def __init__(self, **kwarg): self.logDir except AttributeError: self.logDir = os.getenv('TMPDIR') or '/tmp' - # x509 proxy + # Default x509 proxy for a queue try: self.x509UserProxy except AttributeError: self.x509UserProxy = os.getenv('X509_USER_PROXY') + # x509 proxy for analysis jobs in grandly unified queues + try: + self.x509UserProxyAnalysis + except AttributeError: + self.x509UserProxyAnalysis = os.getenv('X509_USER_PROXY_ANAL') # ATLAS AGIS try: self.useAtlasAGIS = bool(self.useAtlasAGIS) @@ -473,12 +487,12 @@ def __init__(self, **kwarg): self.useAtlasGridCE = False finally: self.useAtlasAGIS = self.useAtlasAGIS or self.useAtlasGridCE - # sdf template directories of CEs + # sdf template directories of CEs; ignored if templateFile is set try: self.CEtemplateDir except AttributeError: self.CEtemplateDir = '' - # remote condor schedd and pool name (collector), and spool option + # remote condor schedd and pool name (collector) try: self.condorSchedd except AttributeError: @@ -487,6 +501,32 @@ def __init__(self, **kwarg): self.condorPool except AttributeError: self.condorPool = None + # json config file of remote condor host: schedd/pool and weighting. 
If set, condorSchedd and condorPool are overwritten + try: + self.condorHostConfig + except AttributeError: + self.condorHostConfig = False + if self.condorHostConfig: + try: + self.condorSchedd = [] + self.condorPool = [] + self.condorHostWeight = [] + with open(self.condorHostConfig, 'r') as f: + condor_host_config_map = json.load(f) + for _schedd, _cm in condor_host_config_map.items(): + _pool = _cm['pool'] + _weight = int(_cm['weight']) + self.condorSchedd.append(_schedd) + self.condorPool.append(_pool) + self.condorHostWeight.append(_weight) + except Exception as e: + tmpLog.error('error when parsing condorHostConfig json file; {0}: {1}'.format(e.__class__.__name__, e)) + raise + else: + if isinstance(self.condorSchedd, list): + self.condorHostWeight = [1] * len(self.condorSchedd) + else: + self.condorHostWeight = [1] # condor spool mechanism. If False, need shared FS across remote schedd try: self.useSpool @@ -500,6 +540,10 @@ def __init__(self, **kwarg): # record of information of CE statistics self.ceStatsLock = threading.Lock() self.ceStats = dict() + # allowed associated parameters from AGIS + self._allowed_agis_attrs = ( + 'pilot_url', + ) # get CE statistics of a site def get_ce_statistics(self, site_name, n_new_workers, time_window=21600): @@ -541,12 +585,21 @@ def submit_workers(self, workspec_list): _queueConfigMapper = QueueConfigMapper() harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) + # associated parameters dict + associated_params_dict = {} + + is_grandly_unified_queue = False # get queue info from AGIS by cacher in db if self.useAtlasAGIS: panda_queues_dict = PandaQueuesDict() panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName) this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict()) + is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName) # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName])) + # associated params on AGIS + for key, val in panda_queues_dict.get_harvester_params(self.queueName).items(): + if key in self._allowed_agis_attrs: + associated_params_dict[key] = val else: panda_queues_dict = dict() panda_queue_name = self.queueName @@ -555,8 +608,9 @@ def submit_workers(self, workspec_list): # get default information from queue info n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1 is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore' - pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', '')) - pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else '' + pilot_url = associated_params_dict.get('pilot_url') + pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current')) + sdf_suffix_str = '_pilot2' # get override requirements from queue configured try: @@ -567,10 +621,13 @@ def submit_workers(self, workspec_list): # deal with Condor schedd and central managers; make a random list the choose n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd) if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0: + orig_list = [] if isinstance(self.condorPool, list) and len(self.condorPool) > 0: - orig_list = list(zip(self.condorSchedd, self.condorPool)) + for _schedd, _pool, _weight in zip(self.condorSchedd, self.condorPool, self.condorHostWeight): + orig_list.extend([(_schedd, _pool)] * _weight) else: - orig_list = [ (_schedd, self.condorPool) for _schedd in 
self.condorSchedd ] + for _schedd, _weight in zip(self.condorSchedd, self.condorHostWeight): + orig_list.extend([(_schedd, self.condorPool)] * _weight) if n_bulks < len(orig_list): schedd_pool_choice_list = random.sample(orig_list, n_bulks) else: @@ -613,8 +670,6 @@ def submit_workers(self, workspec_list): tmpLog.error('No valid CE endpoint found') to_submit_any = False - - def _handle_one_worker(workspec, to_submit=to_submit_any): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), @@ -633,8 +688,8 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): except KeyError: tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint') ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy() - # go on info of the CE - ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '') + # go on info of the CE; ignore protocol prefix in ce_endpoint + ce_endpoint_from_queue = re.sub('^\w+://', '', ce_info_dict.get('ce_endpoint', '')) ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower() ce_version_str = str(ce_info_dict.get('ce_version', '')).lower() ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue) @@ -649,10 +704,10 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): default_port = default_port_map[ce_flavour_str] ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port) tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format( - self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str)) - if os.path.isdir(self.CEtemplateDir) and ce_flavour_str: - sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format( - ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str) + self.queueName, pilot_version, ce_endpoint_from_queue, ce_flavour_str)) + if not self.templateFile and os.path.isdir(self.CEtemplateDir) and ce_flavour_str: + sdf_template_filename = '{ce_flavour_str}{sdf_suffix_str}.sdf'.format( + ce_flavour_str=ce_flavour_str, sdf_suffix_str=sdf_suffix_str) self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename) else: try: @@ -668,6 +723,12 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): ce_info_dict['ce_endpoint'] = self.ceEndpoint except AttributeError: pass + try: + # Manually define ceQueueName + if self.ceQueueName: + ce_info_dict['ce_queue_name'] = self.ceQueueName + except AttributeError: + pass # template for batch script try: tmpFile = open(self.templateFile) @@ -736,6 +797,10 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): batch_log_dict['gtag'] = workspec.workAttributes['stdOut'] tmpLog.debug('Done set_log_file before submission') tmpLog.debug('Done jobspec attribute setting') + + # choose the x509 certificate based on the type of job (analysis or production) + proxy = _choose_proxy(workspec) + # set data dict data.update({ 'workspec': workspec, @@ -746,7 +811,7 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): 'log_subdir': log_subdir, 'n_core_per_node': n_core_per_node, 'panda_queue_name': panda_queue_name, - 'x509_user_proxy': self.x509UserProxy, + 'x509_user_proxy': proxy, 'ce_info_dict': ce_info_dict, 'batch_log_dict': batch_log_dict, 'special_par': special_par, @@ -755,7 +820,8 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): 'condor_schedd': condor_schedd, 'condor_pool': condor_pool, 'use_spool': self.useSpool, - 'pilot_version': pilot_version_orig, + 
'pilot_url': pilot_url, + 'pilot_version': pilot_version, }) return data @@ -768,6 +834,20 @@ def _propagate_attributes(workspec, tmpVal): tmpLog.debug('Done workspec attributes propagation') return retVal + def _choose_proxy(workspec): + """ + Choose the proxy based on the job type + """ + job_type = workspec.jobType + proxy = self.x509UserProxy + if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis') and self.x509UserProxyAnalysis: + tmpLog.debug('Taking analysis proxy') + proxy = self.x509UserProxyAnalysis + else: + tmpLog.debug('Taking default proxy') + + return proxy + tmpLog.debug('finished preparing worker attributes') # map(_handle_one_worker, workspec_list) diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index 663f3fd8..a36997e1 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -1,14 +1,31 @@ import os +import argparse +import traceback +try: + from urllib import unquote # Python 2.X +except ImportError: + from urllib.parse import unquote # Python 3+ from concurrent.futures import ThreadPoolExecutor from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase from pandaharvester.harvestermisc.k8s_utils import k8s_Client - +from pandaharvester.harvesterconfig import harvester_config +from pandaharvester.harvestermisc.info_utils import PandaQueuesDict +from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper # logger -baseLogger = core_utils.setup_logger('k8s_submitter') +base_logger = core_utils.setup_logger('k8s_submitter') + +# image defaults +DEF_SLC6_IMAGE = 'atlasadc/atlas-grid-slc6' +DEF_CENTOS7_IMAGE = 'atlasadc/atlas-grid-centos7' +DEF_IMAGE = DEF_CENTOS7_IMAGE + +# command defaults +DEF_COMMAND = ["/usr/bin/bash"] +DEF_ARGS = ["-c", "cd; wget https://raw.githubusercontent.com/HSF/harvester/master/pandaharvester/harvestercloud/pilots_starter.py; chmod 755 pilots_starter.py; ./pilots_starter.py || true"] # submitter for K8S @@ -20,6 +37,11 @@ def __init__(self, **kwarg): self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + # required for parsing jobParams + self.parser = argparse.ArgumentParser() + self.parser.add_argument('-p', dest='executable', type=unquote) + self.parser.add_argument('--containerImage', dest='container_image') + # number of processes try: self.nProcesses @@ -28,13 +50,33 @@ else: if (not self.nProcesses) or (self.nProcesses < 1): self.nProcesses = 1 - # x509 proxy + # x509 proxy: obsolete mode try: self.x509UserProxy except AttributeError: if os.getenv('X509_USER_PROXY'): self.x509UserProxy = os.getenv('X509_USER_PROXY') + # x509 proxy for analysis jobs in grandly unified queues + try: + self.x509UserProxyAnalysis + except AttributeError: + self.x509UserProxyAnalysis = os.getenv('X509_USER_PROXY_ANAL') + + # x509 proxy through k8s secrets: preferred way + try: + self.proxySecretPath + except AttributeError: + if os.getenv('PROXY_SECRET_PATH'): + self.proxySecretPath = os.getenv('PROXY_SECRET_PATH') + + # analysis x509 proxy through k8s secrets: on GU queues + try: + self.proxySecretPathAnalysis + except AttributeError: + if os.getenv('PROXY_SECRET_PATH_ANAL'): + self.proxySecretPathAnalysis = os.getenv('PROXY_SECRET_PATH_ANAL') + # CPU adjust ratio try: self.cpuAdjustRatio @@ -47,53 +89,186 @@ except AttributeError: 
self.memoryAdjustRatio = 100 - def submit_a_job(self, work_spec): - tmp_log = self.make_logger(baseLogger, method_name='submit_a_job') - tmpRetVal = (None, 'Nothing done') + def parse_params(self, job_params): + tmp_log = self.make_logger(base_logger, method_name='parse_params') - yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file) + job_params_list = job_params.split(' ') + args, unknown = self.parser.parse_known_args(job_params_list) + + tmp_log.info('Parsed params: {0}'.format(args)) + return args + + def read_job_configuration(self, work_spec): + + try: + job_spec_list = work_spec.get_jobspec_list() + if job_spec_list: + job_spec = job_spec_list[0] + job_fields = job_spec.jobParams + job_pars_parsed = self.parse_params(job_fields['jobPars']) + return job_fields, job_pars_parsed + except (KeyError, AttributeError): + return None, None + + return None, None + + def decide_container_image(self, job_fields, job_pars_parsed): + """ + Decide container image: + - job defined image: if we are running in push mode and the job specified an image, use it + - production images: take SLC6 or CentOS7 + - otherwise take default image specified for the queue + """ + tmp_log = self.make_logger(base_logger, method_name='decide_container_image') + try: + container_image = job_pars_parsed.container_image + if container_image: + tmp_log.debug('Taking container image from job params: {0}'.format(container_image)) + return container_image + except AttributeError: + pass try: - if hasattr(self, 'proxySecretPath'): - rsp = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, self.proxySecretPath, True, self.cpuAdjustRatio, self.memoryAdjustRatio) - elif hasattr(self, 'x509UserProxy'): - rsp = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, self.x509UserProxy, False, self.cpuAdjustRatio, self.memoryAdjustRatio) + cmt_config = job_fields['cmtconfig'] + requested_os = cmt_config.split('@')[1] + if 'slc6' in requested_os.lower(): + container_image = DEF_SLC6_IMAGE else: - errStr = 'No proxy specified in proxySecretPath or x509UserProxy; not submitted' - tmpRetVal = (False, errStr) - except Exception as _e: - errStr = 'Failed to create a JOB; {0}'.format(_e) - tmpRetVal = (False, errStr) + container_image = DEF_CENTOS7_IMAGE + tmp_log.debug('Taking container image from cmtconfig: {0}'.format(container_image)) + return container_image + except (KeyError, TypeError): + pass + + container_image = DEF_IMAGE + tmp_log.debug('Taking default container image: {0}'.format(container_image)) + return container_image + + def build_executable(self, job_fields, job_pars_parsed): + executable = DEF_COMMAND + args = DEF_ARGS + try: + if 'runcontainer' in job_fields['transformation']: + # remove any quotes + exec_list = job_pars_parsed.executable.strip('"\'').split(' ') + # take first word as executable + executable = [exec_list[0]] + # take rest as arguments + if len(exec_list) > 1: + args = [' '.join(exec_list[1:])] + except (AttributeError, TypeError): + pass + + return executable, args + + def _choose_proxy(self, workspec, is_grandly_unified_queue): + """ + Choose the proxy based on the job type and whether k8s secrets are enabled + """ + cert = None + use_secret = False + job_type = workspec.jobType + + if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis'): + if self.proxySecretPathAnalysis: + cert = self.proxySecretPathAnalysis + use_secret = True + elif self.proxySecretPath: + cert = self.proxySecretPath + use_secret = True + elif self.x509UserProxyAnalysis: + cert = 
self.x509UserProxyAnalysis + use_secret = False + elif self.x509UserProxy: + cert = self.x509UserProxy + use_secret = False else: - work_spec.batchID = yaml_content['metadata']['name'] + if self.proxySecretPath: + cert = self.proxySecretPath + use_secret = True + elif self.x509UserProxy: + cert = self.x509UserProxy + use_secret = False - # set the log files - work_spec.set_log_file('stdout', '{0}/{1}.out'.format(self.logBaseURL, work_spec.workerID)) + return cert, use_secret - tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID)) - tmpRetVal = (True, '') + def submit_k8s_worker(self, work_spec): + tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker') + + # get info from harvester queue config + _queueConfigMapper = QueueConfigMapper() + harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) + prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType) + + # set the stdout log file + log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID) + work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name)) + # TODO: consider if we want to upload the yaml file to PanDA cache - return tmpRetVal + yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file) + try: + + # read the job configuration (if available, only push model) + job_fields, job_pars_parsed = self.read_job_configuration(work_spec) + + # decide container image and executable to run. In pull mode, defaults are provided + container_image = self.decide_container_image(job_fields, job_pars_parsed) + executable, args = self.build_executable(job_fields, job_pars_parsed) + tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, + args)) + + # choose the appropriate proxy + panda_queues_dict = PandaQueuesDict() + is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName) + cert, use_secret = self._choose_proxy(work_spec, is_grandly_unified_queue) + if not cert: + err_str = 'No proxy specified in proxySecretPath or x509UserProxy. 
Not submitted' + tmp_return_value = (False, err_str) + return tmp_return_value + + # get the walltime limit + try: + max_time = panda_queues_dict.get(self.queueName)['maxtime'] + except Exception as e: + tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName)) + max_time = None + + # submit the worker + rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label, + container_image, executable, args, + cert, cert_in_secret=use_secret, + cpu_adjust_ratio=self.cpuAdjustRatio, + memory_adjust_ratio=self.memoryAdjustRatio, + max_time=max_time) + except Exception as _e: + tmp_log.error(traceback.format_exc()) + err_str = 'Failed to create a JOB; {0}'.format(_e) + tmp_return_value = (False, err_str) + else: + work_spec.batchID = yaml_content['metadata']['name'] + tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID)) + tmp_return_value = (True, '') + return tmp_return_value # submit workers def submit_workers(self, workspec_list): - tmp_log = self.make_logger(baseLogger, method_name='submit_workers') + tmp_log = self.make_logger(base_logger, method_name='submit_workers') - nWorkers = len(workspec_list) - tmp_log.debug('start, nWorkers={0}'.format(nWorkers)) + n_workers = len(workspec_list) + tmp_log.debug('start, n_workers={0}'.format(n_workers)) - retList = list() + ret_list = list() if not workspec_list: tmp_log.debug('empty workspec_list') - return retList + return ret_list with ThreadPoolExecutor(self.nProcesses) as thread_pool: - retValList = thread_pool.map(self.submit_a_job, workspec_list) - tmp_log.debug('{0} workers submitted'.format(nWorkers)) + ret_val_list = thread_pool.map(self.submit_k8s_worker, workspec_list) + tmp_log.debug('{0} workers submitted'.format(n_workers)) - retList = list(retValList) + ret_list = list(ret_val_list) tmp_log.debug('done') - return retList + return ret_list diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py new file mode 100644 index 00000000..479d150c --- /dev/null +++ b/pandaharvester/harvestersubmitter/lsf_submitter.py @@ -0,0 +1,129 @@ +import datetime +import tempfile +import re +try: + import subprocess32 as subprocess +except: + import subprocess + +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore.plugin_base import PluginBase + +# logger +baseLogger = core_utils.setup_logger('lsf_submitter') + + +# submitter for LSF batch system +class LSFSubmitter(PluginBase): + # constructor + def __init__(self, **kwarg): + self.uploadLog = False + self.logBaseURL = None + PluginBase.__init__(self, **kwarg) + # template for batch script + tmpFile = open(self.templateFile) + self.template = tmpFile.read() + tmpFile.close() + + # submit workers + def submit_workers(self, workspec_list): + retList = [] + for workSpec in workspec_list: + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), + method_name='submit_workers') + # make batch script + batchFile = self.make_batch_script(workSpec) + # command + comStr = "bsub -L /bin/sh" + # submit + tmpLog.debug('submit with {0} and LSF options file {1}'.format(comStr,batchFile)) + p = subprocess.Popen(comStr.split(), + shell=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + stdin=open(batchFile,'r')) + # check return code + stdOut, stdErr = p.communicate() + retCode = p.returncode + tmpLog.debug('retCode={0}'.format(retCode)) + 
tmpLog.debug('stdOut={0}'.format(stdOut)) + tmpLog.debug('stdErr={0}'.format(stdErr)) + if retCode == 0: + # extract batchID + batchID = str(stdOut.split()[1],'utf-8') + result = re.sub('[^0-9]','', batchID) + tmpLog.debug('strip out non-numeric characters from {0} - result {1}'.format(batchID,result)) + workSpec.batchID = result + tmpLog.debug('batchID={0}'.format(workSpec.batchID)) + # set log files + if self.uploadLog: + if self.logBaseURL is None: + baseDir = workSpec.get_access_point() + else: + baseDir = self.logBaseURL + stdOut, stdErr = self.get_log_file_names(batchFile, workSpec.batchID) + if stdOut is not None: + workSpec.set_log_file('stdout', '{0}/{1}'.format(baseDir, stdOut)) + if stdErr is not None: + workSpec.set_log_file('stderr', '{0}/{1}'.format(baseDir, stdErr)) + tmpRetVal = (True, '') + else: + # failed + errStr = stdOut + ' ' + stdErr + tmpLog.error(errStr) + tmpRetVal = (False, errStr) + retList.append(tmpRetVal) + return retList + + # make batch script + def make_batch_script(self, workspec): + #if hasattr(self, 'dynamicSizing') and self.dynamicSizing is True: + # maxWalltime = str(datetime.timedelta(seconds=workspec.maxWalltime)) + # yodaWallClockLimit = workspec.maxWalltime / 60 + #else: + # workspec.nCore = self.nCore + # maxWalltime = str(datetime.timedelta(seconds=self.maxWalltime)) + # yodaWallClockLimit = self.maxWalltime / 60 + + # set number of nodes - Note Ultimately will need to something more sophisticated + if hasattr(self,'nGpuPerNode'): + if int(self.nGpuPerNode) > 0: + numnodes = int(workspec.nJobs/self.nGpuPerNode) + if numnodes <= 0: + numnodes = 1 + else: + if (workspec.nJobs % self.nGpuPerNode) != 0 : + numnodes += 1 + else: + numnodes = workspec.nCore // self.nCorePerNode + + tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) + tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode, + #localQueue=self.localQueue, + #projectName=self.projectName, + nNode=numnodes, + accessPoint=workspec.accessPoint, + #walltime=maxWalltime, + #yodaWallClockLimit=yodaWallClockLimit, + workerID=workspec.workerID) + ) + tmpFile.close() + return tmpFile.name + + # get log file names + def get_log_file_names(self, batch_script, batch_id): + stdOut = None + stdErr = None + with open(batch_script) as f: + for line in f: + if not line.startswith('#BSUB'): + continue + items = line.split() + if '-o' in items: + #stdOut = items[-1].replace('$LSB_BATCH_JID', batch_id) + stdOut = items[-1].replace('%J', batch_id) + elif '-e' in items: + #stdErr = items[-1].replace('$LSB_BATCH_JID', batch_id) + stdErr = items[-1].replace('%J', batch_id) + return stdOut, stdErr diff --git a/pandaharvester/harvestersubmitter/slurm_submitter.py b/pandaharvester/harvestersubmitter/slurm_submitter.py index 42e91468..feddc8ea 100644 --- a/pandaharvester/harvestersubmitter/slurm_submitter.py +++ b/pandaharvester/harvestersubmitter/slurm_submitter.py @@ -51,7 +51,7 @@ def submit_workers(self, workspec_list): stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() if retCode == 0: # extract batchID - workSpec.batchID = re.search('[^0-9]*([0-9]+)[^0-9]*', '{0}'.format(stdOut_str)).group(1) + workSpec.batchID = re.search('[^0-9]*([0-9]+)[^0-9]*$', '{0}'.format(stdOut_str)).group(1) tmpLog.debug('batchID={0}'.format(workSpec.batchID)) # set log files if self.uploadLog: diff --git a/pandaharvester/harvestersweeper/k8s_sweeper.py b/pandaharvester/harvestersweeper/k8s_sweeper.py index a3b2556b..9486364c 
100644 --- a/pandaharvester/harvestersweeper/k8s_sweeper.py +++ b/pandaharvester/harvestersweeper/k8s_sweeper.py @@ -1,12 +1,9 @@ -import os - from pandaharvester.harvestercore import core_utils from pandaharvester.harvestersweeper.base_sweeper import BaseSweeper from pandaharvester.harvestermisc.k8s_utils import k8s_Client - # logger -baseLogger = core_utils.setup_logger('k8s_sweeper') +base_logger = core_utils.setup_logger('k8s_sweeper') # sweeper for K8S @@ -15,92 +12,115 @@ class K8sSweeper(BaseSweeper): def __init__(self, **kwarg): BaseSweeper.__init__(self, **kwarg) - self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + self.k8s_client = k8s_Client(self.k8s_namespace, config_file=self.k8s_config_file) self._all_pods_list = [] # # kill a worker - # def kill_worker(self, workspec): - # tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), + # def kill_worker(self, work_spec): + # tmp_log = self.make_logger(base_logger, 'workerID={0}'.format(work_spec.workerID), # method_name='kill_worker') # - # tmpRetVal = (None, 'Nothing done') + # tmp_ret_val = (None, 'Nothing done') # - # job_id = workspec.batchID + # batch_id = work_spec.batchID # try: - # self.k8s_client.delete_job(job_id) + # self.k8s_client.delete_job(batch_id) # except Exception as _e: - # errStr = 'Failed to delete a JOB with id={0} ; {1}'.format(job_id, _e) - # tmpLog.error(errStr) - # tmpRetVal = (False, errStr) + # err_str = 'Failed to delete a JOB with id={0} ; {1}'.format(batch_id, _e) + # tmp_log.error(err_str) + # tmp_ret_val = (False, err_str) # # self._all_pods_list = self.k8s_client.get_pods_info() - # pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) + # pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=batch_id) # pods_name = [ pods_info['name'] for pods_info in pods_list ] - # job_info = self.k8s_client.get_jobs_info(job_id) + # job_info = self.k8s_client.get_jobs_info(batch_id) # # if not job_info: - # retList = self.k8s_client.delete_pods(pods_name) - # if all(item['errMsg'] == '' for item in retList): - # tmpLog.info('Deleted a JOB & POD with id={0}'.format(job_id)) - # tmpRetVal = (True, '') + # ret_list = self.k8s_client.delete_pods(pods_name) + # if all(item['errMsg'] == '' for item in ret_list): + # tmp_log.info('Deleted a JOB & POD with id={0}'.format(batch_id)) + # tmp_ret_val = (True, '') # else: - # errStrList = list() - # for item in retList: + # err_str_list = list() + # for item in ret_list: # if item['errMsg']: - # errStr = 'Failed to delete a POD with id={0} ; {1}'.format(item['name'], item['errMsg']) - # tmpLog.error(errStr) - # errStrList.append(errStr) - # tmpRetVal = (False, ','.join(errStrList)) + # err_str = 'Failed to delete a POD with id={0} ; {1}'.format(item['name'], item['errMsg']) + # tmp_log.error(err_str) + # err_str_list.append(err_str) + # tmp_ret_val = (False, ','.join(err_str_list)) # - # return tmpRetVal + # return tmp_ret_val # kill workers - def kill_workers(self, workspec_list): - tmpLog = self.make_logger(baseLogger, method_name='kill_workers') - - self._all_pods_list = self.k8s_client.get_pods_info() - - retList = [] - for workspec in workspec_list: - tmpRetVal = (None, 'Nothing done') - - job_id = workspec.batchID - try: - self.k8s_client.delete_job(job_id) - except Exception as _e: - errStr = 'Failed to delete a JOB with id={0} ; {1}'.format(job_id, _e) - tmpLog.error(errStr) - tmpRetVal = (False, errStr) - - pods_list = 
self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) - pods_name = [ pods_info['name'] for pods_info in pods_list ] - job_info = self.k8s_client.get_jobs_info(job_id) - - if not job_info: - retList = self.k8s_client.delete_pods(pods_name) - if all(item['errMsg'] == '' for item in retList): - tmpLog.info('Deleted a JOB & POD with id={0}'.format(job_id)) - tmpRetVal = (True, '') + def kill_workers(self, work_spec_list): + tmp_log = self.make_logger(base_logger, method_name='kill_workers') + + self._all_pods_list = self.k8s_client.get_pods_info(workspec_list=work_spec_list) + + ret_list = [] + for work_spec in work_spec_list: + tmp_ret_val = (None, 'Nothing done') + + batch_id = work_spec.batchID + worker_id = str(work_spec.workerID) + if batch_id: # sometimes there are missed workers that were not submitted + + # if push mode, delete the configmap + if work_spec.mapType != 'NoJob': + try: + self.k8s_client.delete_config_map(worker_id) + tmp_log.debug('Deleted configmap {0}'.format(worker_id)) + except Exception as _e: + err_str = 'Failed to delete a CONFIGMAP with id={0} ; {1}'.format(worker_id, _e) + tmp_log.error(err_str) + tmp_ret_val = (False, err_str) else: - errStrList = list() - for item in retList: - if item['errMsg']: - errStr = 'Failed to delete a POD with id={0} ; {1}'.format(item['name'], item['errMsg']) - tmpLog.error(errStr) - errStrList.append(errStr) - tmpRetVal = (False, ','.join(errStrList)) - - retList.append(tmpRetVal) - - return retList - - - # cleanup for a worker - def sweep_worker(self, workspec): - ## Make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + tmp_log.debug('No pandajob/configmap associated to worker {0}'.format(work_spec.workerID)) + + # delete the job + try: + self.k8s_client.delete_job(batch_id) + tmp_log.debug('Deleted JOB {0}'.format(batch_id)) + except Exception as _e: + err_str = 'Failed to delete a JOB with id={0} ; {1}'.format(batch_id, _e) + tmp_log.error(err_str) + tmp_ret_val = (False, err_str) + + """ + # retrieve the associated pods + pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=batch_id) + pods_name = [pods_info['name'] for pods_info in pods_list] + job_info = self.k8s_client.get_jobs_info(workspec_list=[work_spec]) + # retrieve the associated pods + if not job_info: + ret_list = self.k8s_client.delete_pods(pods_name) + if all(item['errMsg'] == '' for item in ret_list): + tmp_log.info('Deleted a JOB & POD with id={0}'.format(batch_id)) + tmp_ret_val = (True, '') + else: + err_str_list = list() + for item in ret_list: + if item['errMsg']: + err_str = 'Failed to delete a POD with id={0} ; {1}'.format(item['name'], item['errMsg']) + tmp_log.error(err_str) + err_str_list.append(err_str) + tmp_ret_val = (False, ','.join(err_str_list)) + """ + else: # the worker cannot be cleaned + tmp_ret_val = (True, '') + + ret_list.append(tmp_ret_val) + + return ret_list + + def sweep_worker(self, work_spec): + # cleanup for a worker + tmp_log = self.make_logger(base_logger, 'workerID={0}'.format(work_spec.workerID), method_name='sweep_worker') + + # retrieve and upload the logs to panda cache + # batch_id = work_spec.batchID + # log_content = self.k8s_client.retrieve_pod_log(batch_id) # nothing to do - return True, '' + return True, '' \ No newline at end of file diff --git a/pandaharvester/harvestersweeper/lsf_sweeper.py b/pandaharvester/harvestersweeper/lsf_sweeper.py new file mode 100644 index 00000000..7c0c29d4 --- /dev/null +++ 
b/pandaharvester/harvestersweeper/lsf_sweeper.py @@ -0,0 +1,68 @@ +import os +import shutil +try: + import subprocess32 as subprocess +except: + import subprocess + +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore.plugin_base import PluginBase + +# logger +baseLogger = core_utils.setup_logger('lsf_sweeper') + + +# plugin for sweeper with LSF +class LSFSweeper(PluginBase): + # constructor + def __init__(self, **kwarg): + PluginBase.__init__(self, **kwarg) + + # kill a worker + def kill_worker(self, workspec): + """Kill a worker in a scheduling system like batch systems and computing elements. + + :param workspec: worker specification + :type workspec: WorkSpec + :return: A tuple of return code (True for success, False otherwise) and error dialog + :rtype: (bool, string) + """ + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), + method_name='kill_worker') + # kill command + comStr = 'bkill {0}'.format(workspec.batchID) + # execute + p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdOut, stdErr = p.communicate() + retCode = p.returncode + if retCode != 0: + # failed + errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(comStr, retCode, stdOut, stdErr) + tmpLog.error(errStr) + return False, errStr + else: + tmpLog.info('Succeeded to kill workerID={0} batchID={1}'.format(workspec.workerID, workspec.batchID)) + # return + return True, '' + + # cleanup for a worker + def sweep_worker(self, workspec): + """Perform cleanup procedures for a worker, such as deletion of work directory. + + :param workspec: worker specification + :type workspec: WorkSpec + :return: A tuple of return code (True for success, False otherwise) and error dialog + :rtype: (bool, string) + """ + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), + method_name='sweep_worker') + # clean up worker directory + if os.path.exists(workspec.accessPoint): + shutil.rmtree(workspec.accessPoint) + tmpLog.info('removed {0}'.format(workspec.accessPoint)) + else: + tmpLog.info('access point already removed.') + # return + return True, '' diff --git a/pandaharvester/harvestertest/container_auxpreparator_test.py b/pandaharvester/harvestertest/container_auxpreparator_test.py new file mode 100644 index 00000000..372d49bd --- /dev/null +++ b/pandaharvester/harvestertest/container_auxpreparator_test.py @@ -0,0 +1,23 @@ +import json +import sys +import time +from pprint import pprint + +#from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper +from pandaharvester.harvestercore.job_spec import JobSpec +from pandaharvester.harvesterextractor.aux_extractor import AuxExtractor + +job_data_json = """{"container_name": "atlas.athena:21.0.15_DBRelease-100.0.2_Patched", "PandaID": 4731765799, "jobsetID": 1,"taskID": 1, "transformation": "Sim_tf.py", "attemptNr": 1,"currentPriority": 1, "outFiles": "", "realDatasets": "", "ddmEndPointOut": "", "scopeOut": "", "scopeLog": "", "logFile": "", "logGUID": "", "files": [{"row_id": 30133581516, "pandaid": 4731765799, "modificationtime": "2020-05-18T10:29:21", "guid": "ADCF2DEC-3412-C64B-B8FB-E8629680AA4D", "lfn": "EVNT.21265061._000036.pool.root.1", "type": "input", "dataset": "mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "status": "ready", "proddblock": "mc16_13TeV:mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "proddblocktoken": "",
"dispatchdblock": "", "dispatchdblocktoken": "", "destinationdblock": "", "destinationdblocktoken": "", "destinationse": "", "fsize": 307204932, "md5sum": "", "checksum": "ad:698afd11", "scope": "mc16_13TeV", "jeditaskid": 21265064, "datasetid": 311267780, "fileid": 21006488431, "attemptnr": 22, "destination": " ", "fsizemb": "292.97", "ruciodatasetname": "mc16_13TeV:mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "datasetname": "mc16_13TeV:mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "ddmsite": "LRZ-LMU", "creationdate": "2020-05-18T10:29:21", "oldfiletable": 1, "destinationdblock_vis": "", "maxattempt": 30}, {"row_id": 30133581517, "pandaid": 4731765799, "modificationtime": "2020-05-18T10:29:21", "guid": "819a609a-1650-455b-8980-9c190cb77064", "lfn": "352", "type": "pseudo_input", "dataset": "seq_number", "status": "unknown", "proddblock": "seq_number", "proddblocktoken": "", "dispatchdblock": "", "dispatchdblocktoken": "", "destinationdblock": "", "destinationdblocktoken": "", "destinationse": "", "fsize": 0, "md5sum": "", "checksum": "", "scope": "", "jeditaskid": 21265064, "datasetid": 311267781, "fileid": 21006489271, "attemptnr": 21, "destination": " ", "fsizemb": "0.00", "ruciodatasetname": "seq_number", "datasetname": "seq_number", "ddmsite": "LRZ-LMU", "creationdate": "2020-05-18T10:29:21", "oldfiletable": 1, "destinationdblock_vis": ""}, {"row_id": 30133581519, "pandaid": 4731765799, "modificationtime": "2020-05-18T10:29:21", "guid": "", "lfn": "HITS.21265064._002580.pool.root.1", "type": "output", "dataset": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.HITS.e7954_e7400_s3126_tid21265064_00", "status": "failed", "proddblock": "", "proddblocktoken": "", "dispatchdblock": "", "dispatchdblocktoken": "", "destinationdblock": "mc16_13TeV.830011.NA.simul.HITS.21265064_sub2290908", "destinationdblocktoken": "dst:TOKYO-LCG2_DATADISK", "destinationse": "nucleus:TOKYO-LCG2", "fsize": 0, "md5sum": "", "checksum": "", "scope": "mc16_13TeV", "jeditaskid": 21265064, "datasetid": 311267782, "fileid": 21092403937, "attemptnr": 0, "destination": " ", "fsizemb": "0.00", "ruciodatasetname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.HITS.e7954_e7400_s3126_tid21265064_00", "datasetname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.HITS.e7954_e7400_s3126_tid21265064_00", "ddmsite": "TOKYO-LCG2", "dsttoken": "DATADISK", "creationdate": "2020-05-18T10:29:21", "oldfiletable": 1, "destinationdblock_vis": "sub2290908"}, {"row_id": 30133581518, "pandaid": 4731765799, "modificationtime": "2020-05-18T10:29:21", "guid": "73a868bd-acd5-4fb4-ade9-a66badd0e5a9", "lfn": "log.21265064._002580.job.log.tgz.1", "type": "log", "dataset": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.log.e7954_e7400_s3126_tid21265064_00", "status": "failed", "proddblock": "", "proddblocktoken": "", "dispatchdblock": "", "dispatchdblocktoken": "", "destinationdblock": "mc16_13TeV.830011.NA.simul.log.21265064_sub2290905", "destinationdblocktoken": "ddd:LRZ-LMU_DATADISK", "destinationse": "LRZ-LMU_MUC", "fsize": 0, "md5sum": "", "checksum": "", "scope": "mc16_13TeV", "jeditaskid": 21265064, "datasetid": 311267783, "fileid": 21092403936, "attemptnr": 0, "destination": " ", "fsizemb": "0.00", "ruciodatasetname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.log.e7954_e7400_s3126_tid21265064_00", "datasetname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.log.e7954_e7400_s3126_tid21265064_00", "ddmsite": "LRZ-LMU", "creationdate": "2020-05-18T10:29:21", "oldfiletable": 1, "destinationdblock_vis": 
"sub2290905"}], "job": {"pandaid": 4731765799, "jobdefinitionid": 0, "schedulerid": "", "pilotid": "", "creationtime": "2020-05-18 03:42:54", "creationhost": "", "modificationtime": "2020-05-18 10:29:21", "modificationhost": "aipanda058.cern.ch", "atlasrelease": "Atlas-21.0.15", "transformation": "Sim_tf.py", "homepackage": "AtlasOffline/21.0.15", "prodserieslabel": "pandatest", "prodsourcelabel": "managed", "produserid": "dhirsch", "gshare": "Validation", "assignedpriority": 888, "currentpriority": 888, "attemptnr": 22, "maxattempt": 22, "jobname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.e7954_e7400_s3126.4727254713", "maxcpucount": 4611, "maxcpuunit": "kSI2kseconds", "maxdiskcount": 5255, "maxdiskunit": "MB ", "ipconnectivity": "yes ", "minramcount": 44100, "minramunit": "MB", "starttime": null, "endtime": "2020-05-18T10:29:21", "cpuconsumptiontime": 0, "cpuconsumptionunit": "", "commandtopilot": "tobekilled", "transexitcode": "", "piloterrorcode": 0, "piloterrordiag": "", "exeerrorcode": 0, "exeerrordiag": "", "superrorcode": 0, "superrordiag": "", "ddmerrorcode": 0, "ddmerrordiag": "", "brokerageerrorcode": 0, "brokerageerrordiag": "", "jobdispatchererrorcode": 0, "jobdispatchererrordiag": "", "taskbuffererrorcode": 100, "taskbuffererrordiag": "reassigned by JEDI", "computingsite": "LRZ-LMU_MUC", "computingelement": "", "jobparameters": "", "metadata": "", "proddblock": "mc16_13TeV:mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "dispatchdblock": "", "destinationdblock": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.log.e7954_e7400_s3126_tid21265064_00", "destinationse": "dst:TOKYO-LCG2_DATADISK/ATLASDATADISK", "nevents": 0, "grid": "", "cloud": "WORLD", "cpuconversion": null, "sourcesite": "", "destinationsite": "", "transfertype": "", "taskid": 21265064, "cmtconfig": "x86_64-slc6-gcc49-opt", "statechangetime": "2020-05-18 10:29:21", "proddbupdatetime": "2020-05-18T03:42:54", "lockedby": "jedi", "relocationflag": 1, "jobexecutionid": 0, "vo": "atlas", "pilottiming": "", "workinggroup": "AP_MCGN", "processingtype": "simul", "produsername": "dhirsch", "ninputfiles": null, "countrygroup": "", "batchid": "", "parentid": null, "specialhandling": "ddm:rucio,hc:DE,de", "jobsetid": 30864, "corecount": 96, "ninputdatafiles": 1, "inputfiletype": "EVNT", "inputfileproject": "mc16_13TeV", "inputfilebytes": 307204932, "noutputdatafiles": 0, "outputfilebytes": 0, "jobmetrics": "", "workqueue_id": 16, "jeditaskid": 21265064, "jobstatus": "closed", "actualcorecount": null, "reqid": 30864, "nucleus": "TOKYO-LCG2", "jobsubstatus": "toreassign", "eventservice": "ordinary", "hs06": 1920, "hs06sec": null, "maxrss": null, "maxvmem": null, "maxswap": null, "maxpss": null, "avgrss": null, "avgvmem": null, "avgswap": null, "avgpss": null, "maxwalltime": 4611, "resourcetype": "MCORE", "failedattempt": 4, "totrchar": null, "totwchar": null, "totrbytes": null, "totwbytes": null, "raterchar": null, "ratewchar": null, "raterbytes": null, "ratewbytes": null, "diskio": null, "memoryleak": null, "memoryleakx2": null, "container_name": "atlas.athena:21.0.15_DBRelease-100.0.2_Patched", "outputfiletype": "log", "homecloud": "DE", "errorinfo": "", "jobinfo": "", "duration": "", "durationsec": 0, "durationmin": 0, "waittime": "0:6:46:27", "priorityrange": "800:899", "jobsetrange": "30800:30899"}, "dsfiles": []}""" + +job_data = json.loads(job_data_json) + +job_data["jobPars"] = '--inputEVNTFile=EVNT.21265061._000036.pool.root.1 --maxEvents=1000 --postInclude "default:RecJobTransforms/UseFrontier.py" 
--preExec "EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)" "EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True" --preInclude "EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py" --skipEvents=1000 --firstEvent=331001 --outputHITSFile=HITS.21265064._002580.pool.root.1 --physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=352 --DBRelease="all:current" --conditionsTag "default:OFLCOND-MC16-SDR-14" --geometryVersion="default:ATLAS-R2-2016-01-00-01_VALIDATION" --runNumber=830011 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus' + + +jobSpec = JobSpec() +jobSpec.convert_job_json(job_data) + +#pprint(jobSpec.jobParams) + +ae = AuxExtractor() +print(ae.get_aux_inputs(jobSpec)) diff --git a/pandaharvester/harvestertest/stageInTest_GlobusOnline.py b/pandaharvester/harvestertest/stageInTest_GlobusOnline.py new file mode 100644 index 00000000..f3fb4b89 --- /dev/null +++ b/pandaharvester/harvestertest/stageInTest_GlobusOnline.py @@ -0,0 +1,77 @@ +import sys +import time +from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper +from pandaharvester.harvestercore.job_spec import JobSpec +from pilot.info.filespec import FileSpec + +queueName = sys.argv[1] + +queueConfigMapper = QueueConfigMapper() + +queueConfig = queueConfigMapper.get_queue(queueName) + +jobSpec = JobSpec() +new_file_data = {'scope': 'test', + 'lfn': 'TXT.19772875._044894.tar.gz.1', 'attemptNr': 0 } +new_file_spec = FileSpec(filetype='input', **new_file_data) +new_file_spec.attemptNr = 0 +new_file_spec.path = '/home/psvirin/harvester3' + +jobSpec.inFiles = {new_file_spec} +jobSpec.outFiles = {} +jobSpec.jobParams = { + 'inFiles': 'TXT.19772875._044894.tar.gz.1', + 'scopeIn': 'mc15_13TeV', + 'fsize': '658906675', + 'GUID': '7e3776f9bb0af341b03e59d3de895a13', + 'checksum': 'ad:3734bdd9', + 'ddmEndPointIn': 'BNL-OSG2_DATADISK', + 'realDatasetsIn': 'mc15_13TeV.363638.MGPy8EG_N30NLO_Wmunu_Ht500_700_BFilter.merge.DAOD_STDM4.e4944_s2726_r7772_r7676_p2842_tid09596175_00', + } +jobSpec.computingSite = queueName +jobSpec.PandaID='11111' + +from pandaharvester.harvestercore.plugin_factory import PluginFactory + +pluginFactory = PluginFactory() + +# get plugin +preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) +print ("plugin={0}".format(preparatorCore.__class__.__name__)) + +print(jobSpec) + +print ("testing stagein:") +print ("BasePath from preparator configuration: %s " % preparatorCore.basePath) +preparatorCore.basePath = preparatorCore.basePath + "/testdata/" +print ("basePath redifuned for test data: %s " % preparatorCore.basePath) + +tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) +if tmpStat: + print (" OK") +else: + print (" NG {0}".format(tmpOut)) + +print + +print ("testing status check") +while True: + tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) + if tmpStat is True: + print (" OK") + break + elif tmpStat is False: + print (" NG {0}".format(tmpOut)) + sys.exit(1) + else: + print (" still running. 
sleep 1 min") + time.sleep(60) + +print + +print ("checking path resolution") +tmpStat, tmpOut = preparatorCore.resolve_input_paths(jobSpec) +if tmpStat: + print (" OK {0}".format(jobSpec.jobParams['inFilePaths'])) +else: + print (" NG {0}".format(tmpOut)) diff --git a/pandaharvester/harvestertest/stageInTest_globus.py b/pandaharvester/harvestertest/stageInTest_globus.py index 168ee496..2957a70e 100644 --- a/pandaharvester/harvestertest/stageInTest_globus.py +++ b/pandaharvester/harvestertest/stageInTest_globus.py @@ -29,9 +29,9 @@ def dump(obj): for attr in dir(obj): if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) + print("obj.%s = %s" % (attr, getattr(obj, attr))) -print len(sys.argv) +print(len(sys.argv)) queueName = 'ALCF_Theta' job_id = 1111 globus_sleep_time = 15 @@ -60,7 +60,7 @@ def dump(obj): tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_preparator') tmpLog.debug('start') -for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): +for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict): #print "loggerName - {}".format(loggerName) if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: @@ -180,7 +180,7 @@ def dump(obj): assFileSpec.fsize = random.randint(10, 100) # create source file hash = hashlib.md5() - hash.update('%s:%s' % (fileSpec.scope, fileSpec.lfn)) + hash.update(('%s:%s' % (fileSpec.scope, fileSpec.lfn)).encode('utf-8')) hash_hex = hash.hexdigest() correctedscope = "/".join(scope.split('.')) fileSpec.path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=queueConfig.preparator['Globus_dstPath'], @@ -249,7 +249,7 @@ def dump(obj): tmpLog.error('Failed to send intial files') sys.exit(3) -print "sleep {0} seconds".format(globus_sleep_time) +print("sleep {0} seconds".format(globus_sleep_time)) time.sleep(globus_sleep_time) # enter polling loop to see if the intial files have transfered @@ -282,7 +282,7 @@ def dump(obj): tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status']) tmpLog.debug(tmpStr) if NotFound : - print "sleep {0} seconds".format(globus_sleep_time) + print("sleep {0} seconds".format(globus_sleep_time)) time.sleep(globus_sleep_time) ++iloop @@ -293,39 +293,37 @@ def dump(obj): #dump(queueConfig) -print "plugin={0}".format(preparatorCore.__class__.__name__) +print("plugin={0}".format(preparatorCore.__class__.__name__)) -print "testing stagein:" -print "BasePath from preparator configuration: %s " % preparatorCore.basePath +print("testing stagein:") +print("BasePath from preparator configuration: %s " % preparatorCore.basePath) tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) if tmpStat: - print " OK" + print(" OK") else: - print " NG {0}".format(tmpOut) + print(" NG {0}".format(tmpOut)) -print "sleep {0} seconds".format(globus_sleep_time) +print("sleep {0} seconds".format(globus_sleep_time)) time.sleep(globus_sleep_time) -print "testing status check" +print("testing status check") while True: tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) if tmpStat == True: - print " OK" + print(" OK") break elif tmpStat == False: - print " NG {0}".format(tmpOut) + print(" NG {0}".format(tmpOut)) sys.exit(1) else: - print " still running. sleep 1 min" + print(" still running. 
sleep 1 min") time.sleep(60) -print - -print "checking path resolution" +print("checking path resolution") tmpStat, tmpOut = preparatorCore.resolve_input_paths(jobSpec) if tmpStat: - print " OK {0}".format(jobSpec.jobParams['inFilePaths']) + print(" OK {0}".format(jobSpec.jobParams['inFilePaths'])) else: - print " NG {0}".format(tmpOut) + print(" NG {0}".format(tmpOut)) diff --git a/pandaharvester/harvestertest/submitterTest.py b/pandaharvester/harvestertest/submitterTest.py index 9e995f54..200cd3f7 100644 --- a/pandaharvester/harvestertest/submitterTest.py +++ b/pandaharvester/harvestertest/submitterTest.py @@ -14,10 +14,39 @@ signal_utils.set_suicide_handler(None) os.wait() else: + + if len(sys.argv) not in (2, 4): + print("Wrong number of parameters. You can either:") + print(" - specify the queue name") + print(" - specify the queue name, jobType (managed, user) and resourceType (SCORE, SCORE_HIMEM, MCORE, MCORE_HIMEM)") + sys.exit(0) + queueName = sys.argv[1] queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) + if queueConfig.prodSourceLabel in ('user', 'managed'): + jobType = queueConfig.prodSourceLabel + else: + jobType = 'managed' # default, can be overwritten by parameters + + resourceType = 'SCORE' # default, can be overwritten by parameters + + if len(sys.argv) == 4: + # jobType should be 'managed' or 'user'. If not specified will default to a production job + if sys.argv[2] in ('user', 'managed'): + jobType = sys.argv[2] + else: + print ('value for jobType not valid, defaulted to {0}'.format(jobType)) + + # resourceType should be 'SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'. If not specified defaults to single core + if sys.argv[3] in ('SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'): + resourceType = sys.argv[3] + else: + print ('value for resourceType not valid, defaulted to {0}'.format(resourceType)) + + print ('Running with queueName:{0}, jobType:{1}, resourceType:{2}'.format(queueName, jobType, resourceType)) + pluginFactory = PluginFactory() com = CommunicatorPool() @@ -28,7 +57,7 @@ jobs, errStr = com.get_jobs(queueConfig.queueName, 'nodeName', queueConfig.prodSourceLabel, 'computingElement', 1, None) if len(jobs) == 0: - print ("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr)) + print("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr)) sys.exit(0) jobSpec = JobSpec() @@ -42,7 +71,7 @@ jobSpecList.append(jobSpec) maker = pluginFactory.get_plugin(queueConfig.workerMaker) - workSpec = maker.make_worker(jobSpecList, queueConfig, 'SCORE') # TODO: needs to be thought + workSpec = maker.make_worker(jobSpecList, queueConfig, jobType, resourceType) workSpec.accessPoint = queueConfig.messenger['accessPoint'] workSpec.mapType = queueConfig.mapType diff --git a/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py b/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py index f202eece..beb29294 100644 --- a/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py @@ -11,7 +11,7 @@ def __init__(self, **kwarg): BaseWorkerMaker.__init__(self, **kwarg) # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type): workSpec = WorkSpec() workSpec.resourceType = resource_type if len(jobspec_list) > 0: diff --git a/pandaharvester/harvesterworkermaker/multijob_worker_maker.py 
b/pandaharvester/harvesterworkermaker/multijob_worker_maker.py index 7b56c235..7de1f4f5 100644 --- a/pandaharvester/harvesterworkermaker/multijob_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/multijob_worker_maker.py @@ -45,7 +45,7 @@ def _get_executable(self, queue_config): return exe_str # make a worker from a job with a disk access point - def make_worker(self, jobspec_list, queue_config, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type): tmpLog = self.make_logger(baseLogger, method_name='make_worker') workSpec = WorkSpec() self.nJobsPerWorker = len(jobspec_list) diff --git a/pandaharvester/harvesterworkermaker/multinode_worker_maker.py b/pandaharvester/harvesterworkermaker/multinode_worker_maker.py index c59b81a5..c03c44cf 100644 --- a/pandaharvester/harvesterworkermaker/multinode_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/multinode_worker_maker.py @@ -58,7 +58,7 @@ def _get_executable(self): return exe_str # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type): tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName), method_name='make_worker') diff --git a/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py index 2ac53375..bc2d7dad 100644 --- a/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py @@ -28,7 +28,7 @@ def __init__(self, **kwarg): self.dyn_resources = None # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type): tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName), method_name='make_worker') @@ -201,7 +201,7 @@ def adjust_resources(self, resources): tmpLog.info("Available backfill resources after adjusting: %s" % ret_resources) return ret_resources - def get_dynamic_resource(self, queue_name, resource_type): + def get_dynamic_resource(self, queue_name, job_type, resource_type): resources = self.get_bf_resources() if resources: resources = self.adjust_resources(resources) diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index 3e723117..4a1088d4 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -41,9 +41,38 @@ def get_job_core_and_memory(self, queue_dict, job_spec): return job_corecount, job_memory + def get_job_type(self, job_spec, job_type, queue_dict, tmp_prodsourcelabel=None): + + queue_type = queue_dict.get('type', None) + + # 1. get prodSourceLabel from job (PUSH) + if job_spec and 'prodSourceLabel' in job_spec.jobParams: + job_type_final = job_spec.jobParams['prodSourceLabel'] + + # 2. get prodSourceLabel from the specified job_type (PULL UPS) + elif job_type: + job_type_final = job_type + if tmp_prodsourcelabel: + if queue_type != 'analysis' and tmp_prodsourcelabel not in ('user', 'panda', 'managed'): + # for production, unified or other types of queues we need to run neutral prodsourcelabels + # with production proxy since they can't be distinguished and can fail + job_type_final = 'managed' + + # 3. 
convert the prodSourcelabel from the queue configuration or leave it empty (PULL) + else: + # map AGIS types to PanDA types + if queue_type == 'analysis': + job_type_final = 'user' + elif queue_type == 'production': + job_type_final = 'managed' + else: + job_type_final = None + + return job_type_final + # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, resource_type): - tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName), + def make_worker(self, jobspec_list, queue_config, job_type, resource_type): + tmpLog = self.make_logger(_logger, 'queue={0}:{1}:{2}'.format(queue_config.queueName, job_type, resource_type), method_name='make_worker') tmpLog.debug('jobspec_list: {0}'.format(jobspec_list)) @@ -61,7 +90,7 @@ def make_worker(self, jobspec_list, queue_config, resource_type): workSpec.nCore = queue_dict.get('corecount', 1) or 1 workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1 - # case of unified queue: look at the resource type and queue configuration + # case of unified queue: look at the job & resource type and queue configuration else: catchall = queue_dict.get('catchall', '') if 'useMaxRam' in catchall or queue_config.queueName in ('Taiwan-LCG2-HPC2_Unified', @@ -113,11 +142,9 @@ def make_worker(self, jobspec_list, queue_config, resource_type): except Exception: pass - if (nCore > 0 and 'nCore' in self.jobAttributesToUse) \ - or unified_queue: + if (nCore > 0 and 'nCore' in self.jobAttributesToUse) or unified_queue: workSpec.nCore = nCore - if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) \ - or unified_queue: + if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) or unified_queue: workSpec.minRamCount = minRamCount if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse: workSpec.maxDiskCount = maxDiskCount @@ -125,7 +152,10 @@ def make_worker(self, jobspec_list, queue_config, resource_type): workSpec.maxWalltime = maxWalltime if ioIntensity > 0 and 'ioIntensity' in self.jobAttributesToUse: workSpec.ioIntensity = ioIntensity + workSpec.pilotType = jobspec_list[0].get_pilot_type() + workSpec.jobType = self.get_job_type(jobspec_list[0], job_type, queue_dict) + else: # when no job # randomize pilot type with weighting @@ -139,7 +169,11 @@ def make_worker(self, jobspec_list, queue_config, resource_type): del fake_job if workSpec.pilotType in ['RC', 'ALRB', 'PT']: tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType)) - # TODO: this needs to be improved with real resource types + + workSpec.jobType = self.get_job_type(None, job_type, queue_dict, tmp_prodsourcelabel) + tmpLog.debug('get_job_type decided for job_type: {0} (input job_type: {1}, queue_type: {2}, tmp_prodsourcelabel: {3})' + .format(workSpec.jobType, job_type, queue_dict.get('type', None), tmp_prodsourcelabel)) + if resource_type and resource_type != 'ANY': workSpec.resourceType = resource_type elif workSpec.nCore == 1: diff --git a/pandaharvester/panda_pkg_info.py b/pandaharvester/panda_pkg_info.py index 1997a788..fa0695ef 100644 --- a/pandaharvester/panda_pkg_info.py +++ b/pandaharvester/panda_pkg_info.py @@ -1 +1 @@ -release_version = "0.2.1" +release_version = "0.2.2" diff --git a/setup.py b/setup.py index 6281fa4f..95aebfc7 100644 --- a/setup.py +++ b/setup.py @@ -4,9 +4,7 @@ # # import sys - from setuptools import setup, find_packages - from pandaharvester import panda_pkg_info sys.path.insert(0, '.') @@ -30,7 +28,7 @@ 'future', 'futures; python_version == "2.*"', 'pycryptodomex', - 'panda-common-s >= 0.0.11', + 
'panda-common', 'pyjwt', 'subprocess32; python_version == "2.*"', 'rpyc', @@ -39,6 +37,13 @@ 'psutil >= 5.4.8', 'scandir; python_version < "3.5"' ], + + # optional pip dependencies + extras_require={ + 'kubernetes': ['kubernetes', 'pyyaml'], + 'mysql': ['mysqlclient'] + }, + data_files=[ # config and cron files ('etc/panda', ['templates/panda_harvester.cfg.rpmnew.template', @@ -63,6 +68,7 @@ ] ), ], + scripts=['templates/panda_jedi-renice', 'templates/panda_harvester-sqlite3backup', ] diff --git a/templates/init.d/panda_harvester-uwsgi.rpmnew.template b/templates/init.d/panda_harvester-uwsgi.rpmnew.template index 1652c445..16b476b7 100755 --- a/templates/init.d/panda_harvester-uwsgi.rpmnew.template +++ b/templates/init.d/panda_harvester-uwsgi.rpmnew.template @@ -172,6 +172,13 @@ start) fi fi ;; +runfg) + echo Run harvester prescript + ${PRESCRIPT} + echo Run Harvester in foreground + ${PROGNAME} >> ${HSTDOUT} 2>> ${HSTDERR} + rm -f ${PIDFILE} + ;; stop) if [ ! -f ${PIDFILE} ]; then echo "WANRING: pidfile:${PIDFILE} does not exist. Nothing done" diff --git a/templates/panda_harvester.cfg.rpmnew.template b/templates/panda_harvester.cfg.rpmnew.template index fce0977a..7e49c3a4 100644 --- a/templates/panda_harvester.cfg.rpmnew.template +++ b/templates/panda_harvester.cfg.rpmnew.template @@ -66,6 +66,8 @@ host = localhost # port number for MariaDB. N/A for sqlite port = 3306 +# max time in seconds to keep trying to reconnect DB before timeout +reconnectTimeout = 300 @@ -182,7 +184,7 @@ resolverClass = PandaQueuesDict autoBlacklist = False # restrict to a certain pilot version (optional) -pilotVersion = 1 +#pilotVersion = 2
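
A note on the slurm_submitter.py change above: adding the trailing '$' anchor makes the batch-ID extraction take the last number in the sbatch output instead of the first one it encounters. Below is a minimal, self-contained sketch of the effect; the sample stdout text is an assumption for illustration, not taken from the patch.

    import re

    # hypothetical sbatch output that happens to contain a number before the real job ID
    stdout_str = 'sbatch: INFO using partition gpu2020\nSubmitted batch job 987654'

    old_pattern = r'[^0-9]*([0-9]+)[^0-9]*'    # pre-patch pattern
    new_pattern = r'[^0-9]*([0-9]+)[^0-9]*$'   # patched pattern, anchored to the end of the output

    print(re.search(old_pattern, stdout_str).group(1))  # '2020'   - picks up the stray number
    print(re.search(new_pattern, stdout_str).group(1))  # '987654' - the actual batch ID

Anchoring to the end of stdout keeps workSpec.batchID correct even when sbatch prints extra numeric text before the final "Submitted batch job N" line.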