diff --git a/atlas/lib/idds/atlas/notifier/messaging.py b/atlas/lib/idds/atlas/notifier/messaging.py index 760ef301..a9b61333 100644 --- a/atlas/lib/idds/atlas/notifier/messaging.py +++ b/atlas/lib/idds/atlas/notifier/messaging.py @@ -41,6 +41,10 @@ def on_error(self, headers, body): ''' self.logger.error('[broker] [%s]: %s', self.__broker, body) + def on_message(self, headers, body): + # self.logger.info('[broker] [%s]: %s', self.__broker, body) + pass + class MessagingSender(PluginBase, threading.Thread): def __init__(self, **kwargs): @@ -126,3 +130,62 @@ def run(self): def __call__(self): self.run() + + +class MessagingReceiver(MessagingSender): + def __init__(self, **kwargs): + super(MessagingReceiver, self).__init__(**kwargs) + + def subscribe(self, listener=MessagingListener): + self.conns = [] + + broker_addresses = [] + for b in self.brokers: + try: + addrinfos = socket.getaddrinfo(b, 0, socket.AF_INET, 0, socket.IPPROTO_TCP) + for addrinfo in addrinfos: + b_addr = addrinfo[4][0] + broker_addresses.append(b_addr) + except socket.gaierror as error: + self.logger.error('Cannot resolve hostname %s: %s' % (b, str(error))) + + self.logger.info("Resolved broker addresses: %s" % broker_addresses) + + for broker in broker_addresses: + conn = stomp.Connection12(host_and_ports=[(broker, self.port)], + vhost=self.vhost, + keepalive=True) + conn.set_listener('message-receiver', listener(conn.transport._Transport__host_and_ports[0])) + conn.connect(self.username, self.password, wait=True) + conn.subscribe(destination=self.destination, id='atlas-idds-messaging', ack='auto') + self.conns.append(conn) + + while not self.graceful_stop.is_set(): + try: + for conn in self.conns: + if not conn.is_connected(): + self.logger.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) + conn.set_listener('message-receiver', listener(conn.transport._Transport__host_and_ports[0])) + # conn.start() + conn.connect(self.username, self.password, wait=True) + conn.subscribe(destination=self.destination, id='atlas-idds-messaging', ack='auto') + time.sleep(1) + except Exception as error: + self.logger.error("Messaging receiver throws an exception: %s, %s" % (error, traceback.format_exc())) + + self.logger.info('receiver graceful stop requested') + + for conn in self.conns: + try: + conn.disconnect() + except Exception: + pass + + def run(self): + try: + self.subscribe() + except Exception as error: + self.logger.error("Messaging receiver throws an exception: %s, %s" % (error, traceback.format_exc())) + + def __call__(self): + self.run() diff --git a/atlas/lib/idds/atlas/version.py b/atlas/lib/idds/atlas/version.py index 3b9ff3e9..d9044964 100644 --- a/atlas/lib/idds/atlas/version.py +++ b/atlas/lib/idds/atlas/version.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2021 -release_version = "0.0.5" +release_version = "0.1.0" diff --git a/atlas/lib/idds/atlas/workflow/atlasactuatorwork.py b/atlas/lib/idds/atlas/workflow/atlasactuatorwork.py new file mode 100644 index 00000000..06d2f060 --- /dev/null +++ b/atlas/lib/idds/atlas/workflow/atlasactuatorwork.py @@ -0,0 +1,453 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0OA
+#
+# Authors:
+# - Wen Guan, , 2020 - 2021
+
+import copy
+import json
+import os
+import traceback
+import uuid
+
+from rucio.client.client import Client as RucioClient
+from rucio.common.exception import (CannotAuthenticate as RucioCannotAuthenticate)
+
+from idds.common import exceptions
+from idds.common.constants import (TransformType, CollectionType, CollectionStatus,
+                                   ContentStatus, ContentType,
+                                   ProcessingStatus, WorkStatus)
+from idds.common.utils import run_command
+# from idds.workflow.work import Work
+from idds.atlas.workflow.atlascondorwork import ATLASCondorWork
+
+
+class ATLASActuatorWork(ATLASCondorWork):
+    def __init__(self, executable=None, arguments=None, parameters=None, setup=None,
+                 work_tag='actuating', exec_type='local', sandbox=None, work_id=None,
+                 name=None,
+                 primary_input_collection=None, other_input_collections=None,
+                 output_collections=None, log_collections=None,
+                 logger=None,
+                 workload_id=None,
+                 agent_attributes=None,
+                 output_json=None):
+        """
+        Init a work/task/transformation.
+
+        :param setup: A string to setup the executable environment, it can be None.
+        :param executable: The executable.
+        :param arguments: The arguments.
+        :param parameters: A dict with arguments needed to be replaced.
+        :param work_type: The work type like data carousel, hyperparameter optimization and so on.
+        :param exec_type: The exec type like 'local', 'remote'(with remote_package set), 'docker' and so on.
+        :param sandbox: The sandbox.
+        :param work_id: The work/task id.
+        :param primary_input_collection: The primary input collection.
+        :param other_input_collections: List of the input collections.
+        :param output_collections: List of the output collections.
+        # :param workflow: The workflow the current work belongs to.
+        :param sandbox: The sandbox to be uploaded or the container path.
+        :param executable: The executable command.
+        :param arguments: The arguments for the executable.
+ """ + + super(ATLASActuatorWork, self).__init__(executable=executable, arguments=arguments, work_tag=work_tag, + parameters=parameters, setup=setup, work_type=TransformType.Actuating, + exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + output_collections=output_collections, + log_collections=log_collections, + logger=logger, + agent_attributes=agent_attributes) + + self.output_json = output_json + + self.terminated = False + self.tocancel = False + + # if self.agent_attributes and 'atlashpowork' in self.agent_attributes: + # self.agent_attributes = self.agent_attributes['atlashpowork'] + # self.logger.info("agent_attributes: %s" % self.agent_attributes) + + # if self.agent_attributes and 'workdir' in self.agent_attributes and self.agent_attributes['workdir']: + # self.set_workdir(self.agent_attributes['workdir']) + # self.logger.info("workdir: %s" % self.get_workdir()) + + if agent_attributes: + self.set_agent_attributes(agent_attributes) + + def set_agent_attributes(self, attrs, req_attributes=None): + super(ATLASActuatorWork, self).set_agent_attributes(attrs) + + if self.agent_attributes and 'workdir' in self.agent_attributes and self.agent_attributes['workdir']: + if req_attributes and 'request_id' in req_attributes and 'workload_id' in req_attributes and 'transform_id' in req_attributes: + req_dir = 'request_%s_%s/transform_%s' % (req_attributes['request_id'], + req_attributes['workload_id'], + req_attributes['transform_id']) + self.set_workdir(os.path.join(self.agent_attributes['workdir'], req_dir)) + self.logger.info("workdir: %s" % self.get_workdir()) + + ########################################## # noqa E266 + def generate_new_task(self): + self.logger.info("Work %s parameters for next task: %s" % (self.get_internal_id(), str(self.get_parameters_for_next_task()))) + if self.get_parameters_for_next_task(): + return True + else: + return False + + ####### functions for transformer ######## # noqa E266 + ###################################### # noqa E266 + + def set_output_data(self, data): + # overwrite to transfer the output of current task to next task + super(ATLASActuatorWork, self).set_output_data(data) + super(ATLASActuatorWork, self).set_parameters_for_next_task(data) + + def get_rucio_client(self): + try: + client = RucioClient() + except RucioCannotAuthenticate as error: + self.logger.error(error) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(error), traceback.format_exc())) + return client + + def poll_external_collection(self, coll): + try: + if 'status' in coll and coll['status'] in [CollectionStatus.Closed]: + return coll + else: + client = self.get_rucio_client() + did_meta = client.get_metadata(scope=coll['scope'], name=coll['name']) + if 'coll_metadata' not in coll: + coll['coll_metadata'] = {} + coll['coll_metadata']['bytes'] = did_meta['bytes'] + coll['coll_metadata']['total_files'] = did_meta['length'] + coll['coll_metadata']['availability'] = did_meta['availability'] + coll['coll_metadata']['events'] = did_meta['events'] + coll['coll_metadata']['is_open'] = did_meta['is_open'] + coll['coll_metadata']['run_number'] = did_meta['run_number'] + coll['coll_metadata']['did_type'] = did_meta['did_type'] + coll['coll_metadata']['list_all_files'] = False + + if (('is_open' in coll['coll_metadata'] and not coll['coll_metadata']['is_open']) + or ('force_close' in coll['coll_metadata'] and 
coll['coll_metadata']['force_close'])): # noqa: W503 + coll_status = CollectionStatus.Closed + else: + coll_status = CollectionStatus.Open + coll['status'] = coll_status + + if 'did_type' in coll['coll_metadata']: + if coll['coll_metadata']['did_type'] == 'DATASET': + coll_type = CollectionType.Dataset + elif coll['coll_metadata']['did_type'] == 'CONTAINER': + coll_type = CollectionType.Container + else: + coll_type = CollectionType.File + else: + coll_type = CollectionType.Dataset + coll['coll_type'] = coll_type + + return coll + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def get_input_collections(self): + # return [self.primary_input_collection] + self.other_input_collections + colls = [self.primary_input_collection] + self.other_input_collections + for coll_int_id in colls: + coll = self.collections[coll_int_id] + coll = self.poll_external_collection(coll) + self.collections[coll_int_id] = coll + return super(ATLASActuatorWork, self).get_input_collections() + + def get_input_contents(self): + """ + Get all input contents from DDM. + """ + try: + ret_files = [] + coll = self.collections[self.primary_input_collection] + ret_file = {'coll_id': coll['coll_id'], + 'scope': coll['scope'], + 'name': coll['name'], + 'bytes': coll['coll_metadata']['bytes'], + 'adler32': None, + 'min_id': 0, + 'max_id': coll['coll_metadata']['total_files'], + 'content_type': ContentType.File, + 'content_metadata': {'total_files': coll['coll_metadata']['total_files']} + } + ret_files.append(ret_file) + return ret_files + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def get_mapped_inputs(self, mapped_input_output_maps): + ret = [] + for map_id in mapped_input_output_maps: + inputs = mapped_input_output_maps[map_id]['inputs'] + + # if 'primary' is not set, the first one is the primary input. + primary_input = inputs[0] + for ip in inputs: + if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: + primary_input = ip + ret.append(primary_input) + return ret + + def get_new_input_output_maps(self, mapped_input_output_maps={}): + """ + New inputs which are not yet mapped to outputs. + + :param mapped_input_output_maps: Inputs that are already mapped. 
+ """ + inputs = self.get_input_contents() + mapped_inputs = self.get_mapped_inputs(mapped_input_output_maps) + mapped_inputs_scope_name = [ip['scope'] + ":" + ip['name'] for ip in mapped_inputs] + + new_inputs = [] + new_input_output_maps = {} + for ip in inputs: + ip_scope_name = ip['scope'] + ":" + ip['name'] + if ip_scope_name not in mapped_inputs_scope_name: + new_inputs.append(ip) + + # to avoid cheking new inputs if there are no new inputs anymore + if (not new_inputs and 'status' in self.collections[self.primary_input_collection] + and self.collections[self.primary_input_collection]['status'] in [CollectionStatus.Closed]): # noqa: W503 + self.set_has_new_inputs(False) + else: + mapped_keys = mapped_input_output_maps.keys() + if mapped_keys: + next_key = max(mapped_keys) + 1 + else: + next_key = 1 + for ip in new_inputs: + out_ip = copy.deepcopy(ip) + out_ip['coll_id'] = self.collections[self.output_collections[0]]['coll_id'] + new_input_output_maps[next_key] = {'inputs': [ip], + 'outputs': [out_ip]} + next_key += 1 + + self.unfinished_points = 1 + + return new_input_output_maps + + def get_processing(self, input_output_maps): + if self.active_processings: + return self.processings[self.active_processings[0]] + else: + return self.create_processing(input_output_maps) + + def create_processing(self, input_output_maps): + proc = {'processing_metadata': {'internal_id': str(uuid.uuid1())}} + self.add_processing_to_processings(proc) + self.active_processings.append(proc['processing_metadata']['internal_id']) + return proc + + def get_status_statistics(self, registered_input_output_maps): + status_statistics = {} + + self.total_output_files = 0 + self.processed_output_file = 0 + + for map_id in registered_input_output_maps: + outputs = registered_input_output_maps[map_id]['outputs'] + + self.total_output_files += 1 + + for content in outputs: + if content['status'].name not in status_statistics: + status_statistics[content['status'].name] = 0 + status_statistics[content['status'].name] += 1 + + if content['status'] == ContentStatus.Available: + self.processed_output_file += 1 + + self.status_statistics = status_statistics + return status_statistics + + def syn_collection_status(self): + input_collections = self.get_input_collections() + output_collections = self.get_output_collections() + # log_collections = self.get_log_collections() + + for input_collection in input_collections: + input_collection['total_files'] = 1 + input_collection['processed_files'] = 1 + + for output_collection in output_collections: + output_collection['total_files'] = self.total_output_files + output_collection['processed_files'] = self.processed_output_file + + def syn_work_status(self, registered_input_output_maps): + self.get_status_statistics(registered_input_output_maps) + + self.syn_collection_status() + + if self.is_processings_terminated() and not self.has_new_inputs(): + if self.is_processings_finished(): + self.status = WorkStatus.Finished + elif self.is_processings_failed(): + self.status = WorkStatus.Failed + elif self.is_processings_subfinished(): + self.status = WorkStatus.SubFinished + + ####### functions for carrier ######## # noqa E266 + ###################################### # noqa E266 + + def get_rucio_setup_env(self): + script = "export ATLAS_LOCAL_ROOT_BASE=/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase\n" + script += "source ${ATLAS_LOCAL_ROOT_BASE}/user/atlasLocalSetup.sh\n" + script += "export RUCIO_ACCOUNT=pilot\n" + script += "localSetupRucioClients\n" + return script + + def 
generate_processing_script_sandbox(self, processing): + arguments = self.parse_arguments() + + script = "#!/bin/bash\n\n" + script += self.get_rucio_setup_env() + script += "\n" + + script += "sandbox=%s\n" % str(self.sandbox) + script += "executable=%s\n" % str(self.executable) + script += "arguments=%s\n" % str(arguments) + script += "output_json=%s\n" % str(self.output_json) + script += "\n" + + script += "env\n" + script += "echo $X509_USER_PROXY\n" + script += "\n" + + script += "echo 'user id:'\n" + script += "id\n" + script += "\n" + + script += "wget $sandbox\n" + script += 'base_sandbox="$(basename -- $sandbox)"\n' + script += 'tar xzf $base_sandbox\n' + + dataset = self.collections[self.primary_input_collection] + script += 'rucio download %s:%s\n' % (dataset['scope'], dataset['name']) + script += 'chmod +x %s\n' % str(self.executable) + script += "echo '%s' '%s'\n" % (str(self.executable), str(arguments)) + script += '%s %s\n' % (str(self.executable), str(arguments)) + + script += 'ls\n\n' + + long_id = self.get_long_id(processing) + script_name = 'processing_%s.sh' % long_id + script_name = os.path.join(self.get_working_dir(processing), script_name) + with open(script_name, 'w') as f: + f.write(script) + run_command("chmod +x %s" % script_name) + return script_name + + def get_output_json(self, processing): + # job_dir = self.get_working_dir(processing) + if self.output_json: + return self.output_json + elif 'output_json' in self.agent_attributes and self.agent_attributes['output_json']: + output_json = self.agent_attributes['output_json'] + else: + output_json = 'idds_output.json' + return output_json + + def generate_processing_script(self, processing): + self.output_json = self.get_output_json(processing) + + script_name = self.generate_processing_script_sandbox(processing) + return script_name, None + + def get_output_files(self, processing): + return [self.output_json] + + def submit_processing(self, processing): + if 'job_id' in processing['processing_metadata']: + pass + else: + job_id, errors = self.submit_condor_processing(processing) + if errors: + self.add_errors(errors) + processing['processing_metadata']['job_id'] = job_id + processing['processing_metadata']['errors'] = str(self.get_errors()) + + def abort_processing(self, processing): + self.tocancel = True + + def parse_processing_outputs(self, processing): + request_id = processing['request_id'] + workload_id = processing['workload_id'] + processing_id = processing['processing_id'] + + if not self.output_json: + return None, 'Request(%s)_workload(%s)_processing(%s) output_json(%s) is not defined' % (request_id, workload_id, + processing_id, self.output_json) + + job_dir = self.get_working_dir(processing) + full_output_json = os.path.join(job_dir, self.output_json) + if not os.path.exists(full_output_json): + return None, '%s is not created' % str(full_output_json) + else: + try: + with open(full_output_json, 'r') as f: + data = f.read() + outputs = json.loads(data) + if not outputs: + return outputs, "No points generated: the outputs is empty" + return outputs, None + except Exception as ex: + return None, 'Failed to load the content of %s: %s' % (str(full_output_json), str(ex)) + + def poll_processing(self, processing): + job_status, job_err_msg = self.poll_condor_job_status(processing, processing['processing_metadata']['job_id']) + processing_outputs = None + if job_status in [ProcessingStatus.Finished]: + job_outputs, parser_errors = self.parse_processing_outputs(processing) + if job_outputs: + 
processing_status = ProcessingStatus.Finished + processing_err = None + processing_outputs = job_outputs + else: + processing_status = ProcessingStatus.Failed + processing_err = parser_errors + elif self.tocancel: + processing_status = ProcessingStatus.Cancelled + processing_outputs = None + processing_err = None + else: + processing_status = job_status + processing_err = job_err_msg + return processing_status, processing_outputs, processing_err + + def poll_processing_updates(self, processing, input_output_maps): + processing_status, processing_outputs, processing_err = self.poll_processing(processing) + + processing_metadata = processing['processing_metadata'] + if not processing_metadata: + processing_metadata = {} + if processing_err: + processing_err = processing_err.strip() + if processing_err: + self.add_errors(processing_err) + processing_metadata['errors'] = str(self.get_errors()) + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': processing_status, + 'processing_metadata': processing_metadata, + 'output_metadata': processing_outputs}} + + updated_contents = [] + return update_processing, updated_contents diff --git a/atlas/lib/idds/atlas/workflow/atlascondorwork.py b/atlas/lib/idds/atlas/workflow/atlascondorwork.py new file mode 100644 index 00000000..30d3523d --- /dev/null +++ b/atlas/lib/idds/atlas/workflow/atlascondorwork.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 + +import os + +from idds.common.constants import (ProcessingStatus) +from idds.common.utils import run_command +from idds.workflow.work import Work + + +class ATLASCondorWork(Work): + def __init__(self, executable=None, arguments=None, parameters=None, setup=None, + work_type=None, work_tag='hpo', exec_type='local', sandbox=None, work_id=None, + primary_input_collection=None, other_input_collections=None, + output_collections=None, log_collections=None, + agent_attributes=None, + logger=None): + """ + Init a work/task/transformation. + + :param setup: A string to setup the executable enviroment, it can be None. + :param executable: The executable. + :param arguments: The arguments. + :param parameters: A dict with arguments needed to be replaced. + :param work_type: The work type like data carousel, hyperparameteroptimization and so on. + :param exec_type: The exec type like 'local', 'remote'(with remote_package set), 'docker' and so on. + :param sandbox: The sandbox. + :param work_id: The work/task id. + :param primary_input_collection: The primary input collection. + :param other_input_collections: List of the input collections. + :param output_collections: List of the output collections. + # :param workflow: The workflow the current work belongs to. 
+ """ + super(ATLASCondorWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, work_type=work_type, + exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + output_collections=output_collections, + log_collections=log_collections, + agent_attributes=agent_attributes, + logger=logger) + + def get_long_id(self, processing): + request_id = processing['request_id'] + workload_id = processing['workload_id'] + processing_id = processing['processing_id'] + long_id = '%s_%s_%s' % (request_id, workload_id, processing_id) + return long_id + + def get_working_dir(self, processing): + # request_id = processing['request_id'] + # workload_id = processing['workload_id'] + processing_id = processing['processing_id'] + + job_dir = 'processing_%s' % (processing_id) + job_dir = os.path.join(self.get_workdir(), job_dir) + if not os.path.exists(job_dir): + os.makedirs(job_dir) + return job_dir + + def generate_processing_submit_file(self, processing): + script_name, err_msg = self.generate_processing_script(processing) + if not script_name: + return None, err_msg + + input_files = self.get_input_files(processing) + output_files = self.get_output_files(processing) + # self.logger.info("input_files: %s, output_files: %s" % (str(input_files), str(output_files))) + + long_id = self.get_long_id(processing) + + jdl = "#Agent jdl file\n" + jdl += "Universe = vanilla\n" + jdl += "Notification = Never\n" + jdl += "initialdir = %s\n" % self.get_working_dir(processing) + jdl += "Executable = %s\n" % script_name + # jdl += "Arguments = %s\na" % (self.get_job_dir(processing_id)) + jdl += "GetEnv = False\n" + jdl += "Output = " + 'processing_%s' % long_id + ".$(ClusterId).$(ProcId).out\n" + jdl += "Error = " + 'processing_%s' % long_id + ".$(ClusterId).$(ProcId).err\n" + jdl += "Log = " + 'processing_%s' % long_id + ".$(ClusterId).$(ProcId).log\n" + jdl += "stream_output = False\n" + jdl += "stream_error = False\n" + # jdl += 'Requirements = ((Arch == "X86_64") && (regexp("SLC",OpSysLongName)))\n' + # jdl += 'Requirements = ((Arch == "X86_64") && (regexp("CentOS",OpSysLongName)))\n' + # jdl += "transfer_input_files = file1, file2\n" + jdl += "should_transfer_files = yes\n" + + tf_inputs = [script_name] + if input_files: + tf_inputs = tf_inputs + input_files + tf_outputs = output_files + + # self.logger.info("tf_inputs: %s, tf_outputs: %s" % (str(tf_inputs), str(tf_outputs))) + + if tf_inputs: + jdl += "transfer_input_files = %s\n" % (str(','.join(tf_inputs))) + if tf_outputs: + jdl += "transfer_output_files = %s\n" % (str(','.join(tf_outputs))) + + jdl += "WhenToTransferOutput = ON_EXIT_OR_EVICT\n" + jdl += "OnExitRemove = TRUE\n" + # jdl += '+JobFlavour = "espresso"\n' + # jdl += '+JobFlavour = "tomorrow"\n' + # jdl += '+JobFlavour = "testmatch"\n' + # jdl += '+JobFlavour = "nextweek"\n' + jdl += '+JobType="ActiveLearning"\n' + # jdl += '+AccountingGroup ="group_u_ATLASWISC.all"\n' + jdl += '+Processing_id = "%s"\n' % long_id + jdl += "RequestCpus = 1\n" + if 'X509_USER_PROXY' in os.environ and os.environ['X509_USER_PROXY']: + jdl += "x509userproxy = %s\n" % str(os.environ['X509_USER_PROXY']) + jdl += "Queue 1\n" + + submit_file = 'processing_%s.jdl' % long_id + submit_file = os.path.join(self.get_working_dir(processing), submit_file) + with open(submit_file, 'w') as f: + f.write(jdl) + return submit_file, None + + def get_input_files(self, processing): + return [] + + 
+    def get_output_files(self, processing):
+        return []
+
+    def submit_condor_processing(self, processing):
+        jdl_file, err_msg = self.generate_processing_submit_file(processing)
+        if not jdl_file:
+            return None, err_msg
+
+        cmd = "condor_submit " + jdl_file
+        status, output, error = run_command(cmd)
+        jobid = None
+        self.logger.info("submitting the job to cluster: %s" % cmd)
+        self.logger.info("status: %s, output: %s, error: %s " % (status, output, error))
+        if status == 0 or str(status) == '0':
+            if output and 'submitted to cluster' in output:
+                for line in output.split('\n'):
+                    if 'submitted to cluster' in line:
+                        jobid = line.split(' ')[-1].replace('.', '')
+                        return jobid, None
+        return None, output + error
+
+    def get_job_err_message(self, job_workdir, job_err):
+        try:
+            if not job_err:
+                return ''
+            if not job_err.startswith("/") and job_workdir:
+                job_err = os.path.join(job_workdir, job_err)
+            if not os.path.exists(job_err):
+                return ''
+            with open(job_err, "r") as myfile:
+                data = myfile.readlines()
+                data = str(data)
+                data = data[-1000:]
+                return data
+        except Exception as e:
+            self.logger.error("Failed to read job error file(workdir: %s, error file: %s): %s" % (job_workdir, job_err, e))
+            return ''
+
+    def poll_condor_job_status(self, processing, job_id):
+        # HTCondor JobStatus codes:
+        # 0 Unexpanded U
+        # 1 Idle I
+        # 2 Running R
+        # 3 Removed X
+        # 4 Completed C
+        # 5 Held H
+        # 6 Submission_err E
+        cmd = "condor_q -format '%s' ClusterId -format ' %s' Processing_id -format ' %s' JobStatus -format ' %s' Iwd -format ' %s' Cmd -format ' %s' Err " + str(job_id)
+        status, output, error = run_command(cmd)
+        self.logger.info("poll job status: %s" % cmd)
+        self.logger.info("status: %s, output: %s, error: %s" % (status, output, error))
+        if status == 0 and len(output) == 0:
+            cmd = "condor_history -format '%s' ClusterId -format ' %s' Processing_id -format ' %s' JobStatus -format ' %s' Iwd -format ' %s' Cmd -format ' %s' Err " + str(job_id)
+            status, output, error = run_command(cmd)
+            self.logger.info("poll job status: %s" % cmd)
+            self.logger.info("status: %s, output: %s, error: %s" % (status, output, error))
+
+        ret_err = ''
+        job_cmd_msg, job_err_msg = '', ''
+        if status == 0:
+            lines = output.split('\n')
+            for line in lines:
+                c_job_id, c_processing_id, c_job_status, job_workdir, job_cmd, job_err = line.split(' ')
+                if str(c_job_id) != str(job_id):
+                    continue
+
+                processing_id = self.get_long_id(processing)
+                c_job_status = int(c_job_status)
+                if c_processing_id != processing_id:
+                    final_job_status = ProcessingStatus.Failed
+                    ret_err = 'jobid and the processing_id mismatched'
+                else:
+                    # map the HTCondor status codes above to iDDS processing status
+                    job_status = c_job_status
+                    if job_status < 2:
+                        final_job_status = ProcessingStatus.Submitted
+                    elif job_status == 2:
+                        final_job_status = ProcessingStatus.Running
+                    elif job_status == 3:
+                        final_job_status = ProcessingStatus.Cancel
+                    elif job_status == 4:
+                        final_job_status = ProcessingStatus.Finished
+                    else:
+                        final_job_status = ProcessingStatus.Failed
+
+                if final_job_status in [ProcessingStatus.Failed]:
+                    job_cmd_msg = self.get_job_err_message(job_workdir, job_cmd)
+                    job_cmd_msg = job_cmd_msg[-500:]
+                    job_err_msg = self.get_job_err_message(job_workdir, job_err)
+        else:
+            # the query command failed; keep the processing as Submitted so it is polled again
+            final_job_status = ProcessingStatus.Submitted
+
+        # if output:
+        #     ret_err += output
+        if error:
+            ret_err += error
+        if job_cmd_msg:
+            ret_err += "Command output: " + job_cmd_msg
+        if job_err_msg:
+            ret_err += "Stderr: " + job_err_msg
+
+        return final_job_status, ret_err
diff --git a/atlas/lib/idds/atlas/workflow/atlashpowork.py b/atlas/lib/idds/atlas/workflow/atlashpowork.py
new file mode 100644
index 00000000..89085260
--- /dev/null
+++ b/atlas/lib/idds/atlas/workflow/atlashpowork.py
@@ -0,0 +1,663 @@
+#!/usr/bin/env python
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0OA
+#
+# Authors:
+# - Wen Guan, , 2020
+
+import copy
+import datetime
+import json
+import random
+import os
+import traceback
+import uuid
+
+from idds.common import exceptions
+from idds.common.constants import (TransformType, CollectionType, CollectionStatus,
+                                   ContentStatus, ContentType,
+                                   ProcessingStatus, WorkStatus)
+from idds.common.utils import run_command
+from idds.common.utils import replace_parameters_with_values
+# from idds.workflow.work import Work
+from idds.atlas.workflow.atlascondorwork import ATLASCondorWork
+from idds.core import (catalog as core_catalog)
+
+
+class ATLASHPOWork(ATLASCondorWork):
+    def __init__(self, executable=None, arguments=None, parameters=None, setup=None,
+                 work_tag='hpo', exec_type='local', sandbox=None, work_id=None,
+                 name=None,
+                 # primary_input_collection=None, other_input_collections=None,
+                 # output_collections=None, log_collections=None,
+                 logger=None,
+                 workload_id=None,
+                 agent_attributes=None,
+                 method=None,
+                 container_workdir=None,
+                 output_json=None,
+                 opt_space=None, initial_points=None,
+                 max_points=None, num_points_per_iteration=10):
+        """
+        Init a work/task/transformation.
+
+        :param setup: A string to setup the executable environment, it can be None.
+        :param executable: The executable.
+        :param arguments: The arguments.
+        :param parameters: A dict with arguments needed to be replaced.
+        :param work_type: The work type like data carousel, hyperparameter optimization and so on.
+        :param exec_type: The exec type like 'local', 'remote'(with remote_package set), 'docker' and so on.
+        :param sandbox: The sandbox.
+        :param work_id: The work/task id.
+        :param primary_input_collection: The primary input collection.
+        :param other_input_collections: List of the input collections.
+        :param output_collections: List of the output collections.
+        # :param workflow: The workflow the current work belongs to.
+        :param method: The HPO method to use. It can be 'nevergrad', 'container' or 'sandbox'.
+        :param sandbox: The sandbox to be uploaded or the container path.
+        :param executable: The executable command.
+        :param arguments: The arguments for the executable.
+        :param container_workdir: The working directory for container.
+        :param opt_space: The optimization space.
+        :param initial_points: The initial points.
+        :param max_points: The maximum number of points.
+        :param num_points_per_iteration: The number of points to be generated per iteration.
+        """
+        if not name:
+            if workload_id:
+                name = 'hpo.' + str(workload_id) + "." + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000))
+            else:
+                name = 'hpo.'
+ datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000)) + + primary_input_collection = {'scope': 'HPO', 'name': name} + other_input_collections = None + output_collections = [{'scope': 'HPO', 'name': name + 'output'}] + log_collections = None + + super(ATLASHPOWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, work_type=TransformType.HyperParameterOpt, + exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + output_collections=output_collections, + log_collections=log_collections, + logger=logger, + agent_attributes=agent_attributes) + self.method = method + self.sandbox = sandbox + self.executable = executable + self.arguments = arguments + self.container_workdir = container_workdir + self.opt_space = opt_space + self.initial_points = initial_points + self.max_points = max_points + self.num_points_per_iteration = num_points_per_iteration + self.unfinished_points = 0 + + self.input_json = None + self.output_json = output_json + + self.finished_points = 0 + self.points_to_generate = self.num_points_per_iteration + self.point_index = 0 + self.terminated = False + self.tocancel = False + + if not self.num_points_per_iteration or self.num_points_per_iteration < 0: + raise exceptions.IDDSException("num_points_per_iteration must be integer bigger than 0") + self.num_points_per_iteration = int(self.num_points_per_iteration) + + if not self.method and self.executable and 'docker' in self.executable: + self.method = 'docker' + + # if self.agent_attributes and 'atlashpowork' in self.agent_attributes: + # self.agent_attributes = self.agent_attributes['atlashpowork'] + # self.logger.info("agent_attributes: %s" % self.agent_attributes) + + # if self.agent_attributes and 'workdir' in self.agent_attributes and self.agent_attributes['workdir']: + # self.set_workdir(self.agent_attributes['workdir']) + # self.logger.info("workdir: %s" % self.get_workdir()) + + if agent_attributes: + self.set_agent_attributes(agent_attributes) + + def set_agent_attributes(self, attrs, req_attributes=None): + self.agent_attributes = attrs + + if self.agent_attributes and 'atlashpowork' in self.agent_attributes: + self.agent_attributes = self.agent_attributes['atlashpowork'] + self.logger.info("agent_attributes: %s" % self.agent_attributes) + + if self.agent_attributes and 'workdir' in self.agent_attributes and self.agent_attributes['workdir']: + if req_attributes and 'request_id' in req_attributes and 'workload_id' in req_attributes and 'transform_id' in req_attributes: + req_dir = 'request_%s_%s/transform_%s' % (req_attributes['request_id'], + req_attributes['workload_id'], + req_attributes['transform_id']) + self.set_workdir(os.path.join(self.agent_attributes['workdir'], req_dir)) + self.logger.info("workdir: %s" % self.get_workdir()) + + ####### functions for transformer ######## # noqa E266 + ###################################### # noqa E266 + + def poll_external_collection(self, coll): + try: + if 'status' in coll and coll['status'] in [CollectionStatus.Closed]: + return coll + else: + if 'coll_metadata' not in coll: + coll['coll_metadata'] = {} + coll['coll_metadata']['bytes'] = 0 + coll['coll_metadata']['total_files'] = 0 + coll['coll_metadata']['availability'] = True + coll['coll_metadata']['events'] = 0 + coll['coll_metadata']['is_open'] = True + coll['coll_metadata']['run_number'] = None + coll['coll_metadata']['did_type'] 
= 'DATASET' + coll['coll_metadata']['list_all_files'] = False + + if self.terminated: + self.logger.info("Work is terminated. Closing input dataset.") + coll['coll_metadata']['is_open'] = False + + if self.points_to_generate <= 0: + self.logger.info("points_to_generate(%s) is equal or smaller than 0. Closing input dataset." % self.points_to_generate) + coll['coll_metadata']['is_open'] = False + + if 'is_open' in coll['coll_metadata'] and not coll['coll_metadata']['is_open']: + coll_status = CollectionStatus.Closed + else: + coll_status = CollectionStatus.Open + coll['status'] = coll_status + + if 'did_type' in coll['coll_metadata']: + if coll['coll_metadata']['did_type'] == 'DATASET': + coll_type = CollectionType.Dataset + elif coll['coll_metadata']['did_type'] == 'CONTAINER': + coll_type = CollectionType.Container + else: + coll_type = CollectionType.File + else: + coll_type = CollectionType.Dataset + coll['coll_type'] = coll_type + + return coll + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def get_input_collections(self): + # return [self.primary_input_collection] + self.other_input_collections + colls = [self.primary_input_collection] + self.other_input_collections + for coll_int_id in colls: + coll = self.collections[coll_int_id] + coll = self.poll_external_collection(coll) + self.collections[coll_int_id] = coll + return super(ATLASHPOWork, self).get_input_collections() + + def get_input_contents(self): + """ + Get all input contents from DDM. + """ + try: + if self.terminated: + return [] + + if self.unfinished_points > 0: + return [] + + ret_files = [] + coll = self.collections[self.primary_input_collection] + + if self.max_points and (self.max_points - self.finished_points < self.num_points_per_iteration): + self.points_to_generate = self.max_points - self.finished_points + + # call external processing to generate points + points = self.generate_points() + self.logger.info("points generated: %s" % str(points)) + + loss = None + for point in points: + ret_file = {'coll_id': coll['coll_id'], + 'scope': coll['scope'], + 'name': str(self.point_index), + 'bytes': 0, + 'adler32': None, + 'min_id': 0, + 'max_id': 0, + 'path': json.dumps((point, loss)), + 'content_type': ContentType.File, + 'content_metadata': {'events': 0}} + ret_files.append(ret_file) + self.point_index += 1 + return ret_files + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def get_mapped_inputs(self, mapped_input_output_maps): + ret = [] + for map_id in mapped_input_output_maps: + inputs = mapped_input_output_maps[map_id]['inputs'] + + # if 'primary' is not set, the first one is the primary input. 
+ primary_input = inputs[0] + for ip in inputs: + if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: + primary_input = ip + ret.append(primary_input) + return ret + + def get_unfinished_points(self, mapped_input_output_maps): + counts = 0 + count_finished = 0 + for map_id in mapped_input_output_maps: + outputs = mapped_input_output_maps[map_id]['outputs'] + + for op in outputs: + if op['status'] in [ContentStatus.New]: + counts += 1 + if op['status'] in [ContentStatus.Available]: + count_finished += 1 + self.finished_points = count_finished + return counts + + def get_new_input_output_maps(self, mapped_input_output_maps={}): + """ + New inputs which are not yet mapped to outputs. + + :param mapped_input_output_maps: Inputs that are already mapped. + """ + unfinished_mapped = self.get_unfinished_points(mapped_input_output_maps) + self.unfinished_points = unfinished_mapped + + inputs = self.get_input_contents() + mapped_inputs = self.get_mapped_inputs(mapped_input_output_maps) + mapped_inputs_scope_name = [ip['scope'] + ":" + ip['name'] for ip in mapped_inputs] + + new_inputs = [] + new_input_output_maps = {} + for ip in inputs: + ip_scope_name = ip['scope'] + ":" + ip['name'] + if ip_scope_name not in mapped_inputs_scope_name: + new_inputs.append(ip) + + # to avoid cheking new inputs if there are no new inputs anymore + if (not new_inputs and 'status' in self.collections[self.primary_input_collection] + and self.collections[self.primary_input_collection]['status'] in [CollectionStatus.Closed]): # noqa: W503 + self.set_has_new_inputs(False) + else: + mapped_keys = mapped_input_output_maps.keys() + if mapped_keys: + next_key = max(mapped_keys) + 1 + else: + next_key = 1 + for ip in new_inputs: + out_ip = copy.deepcopy(ip) + out_ip['coll_id'] = self.collections[self.output_collections[0]]['coll_id'] + new_input_output_maps[next_key] = {'inputs': [ip], + 'outputs': [out_ip]} + next_key += 1 + + self.unfinished_points = self.unfinished_points + len(new_inputs) + + return new_input_output_maps + + def generate_points(self): + active_processing = self.get_processing(None) + if not active_processing: + if self.points_to_generate > 0: + active_processing = self.create_processing(None) + log_str = "max_points: %s, finished_points: %s, points_to_generate: %s, new processing: %s" % (self.max_points, + self.finished_points, + self.points_to_generate, + active_processing) + self.logger.info(log_str) + if active_processing: + return [] + else: + self.terminated = True + self.set_terminated_msg("Failed to create processing") + return [] + else: + self.terminated = True + self.set_terminated_msg("Number of points is enough(points_to_generate: %s)" % self.points_to_generate) + return [] + + if self.is_processing_terminated(active_processing): + self.logger.info("processing terminated: %s" % active_processing) + self.reap_processing(active_processing) + output_metadata = active_processing['output_metadata'] + if output_metadata: + return output_metadata + else: + self.terminated = True + processing_metadata = active_processing['processing_metadata'] + errors = None + if 'errors' in processing_metadata: + errors = processing_metadata['errors'] + self.set_terminated_msg("No points generated. Terminating the Work/Transformation. 
Detailed errors: %s" % errors) + return [] + return [] + + def get_processing(self, input_output_maps): + if self.active_processings: + return self.processings[self.active_processings[0]] + else: + return self.create_processing(input_output_maps) + + def create_processing(self, input_output_maps): + proc = {'processing_metadata': {'internal_id': str(uuid.uuid1()), + 'points_to_generate': self.points_to_generate}} + self.add_processing_to_processings(proc) + self.active_processings.append(proc['processing_metadata']['internal_id']) + return proc + + def get_status_statistics(self, registered_input_output_maps): + status_statistics = {} + for map_id in registered_input_output_maps: + outputs = registered_input_output_maps[map_id]['outputs'] + + for content in outputs: + if content['status'].name not in status_statistics: + status_statistics[content['status'].name] = 0 + status_statistics[content['status'].name] += 1 + self.status_statistics = status_statistics + return status_statistics + + def syn_collection_status(self): + input_collections = self.get_input_collections() + output_collections = self.get_output_collections() + # log_collections = self.get_log_collections() + + for input_collection in input_collections: + input_collection['total_files'] = self.finished_points + self.unfinished_points + input_collection['processed_files'] = self.finished_points + self.unfinished_points + + for output_collection in output_collections: + output_collection['total_files'] = self.finished_points + self.unfinished_points + output_collection['processed_files'] = self.finished_points + + def syn_work_status(self, registered_input_output_maps): + self.get_status_statistics(registered_input_output_maps) + + self.syn_collection_status() + + if self.is_processings_terminated() and not self.has_new_inputs(): + keys = self.status_statistics.keys() + if ContentStatus.New.name in keys or ContentStatus.Processing.name in keys: + pass + else: + if len(keys) == 1: + if ContentStatus.Available.name in keys: + self.status = WorkStatus.Finished + else: + self.status = WorkStatus.Failed + else: + self.status = WorkStatus.SubFinished + + ####### functions for carrier ######## # noqa E266 + ###################################### # noqa E266 + + def generate_processing_script_nevergrad(self, processing): + executable = self.agent_attributes['nevergrad']['executable'] + arguments = self.agent_attributes['nevergrad']['arguments'] + + param_values = {'MAX_POINTS': self.max_points, + 'NUM_POINTS': self.points_to_generate, + 'IN': self.input_json, + 'OUT': self.output_json} + arguments = replace_parameters_with_values(arguments, param_values) + + script = "#!/bin/bash\n\n" + script += "executable=%s\n" % os.path.basename(executable) + script += "arguments='%s'\n" % str(arguments) + script += "input_json=%s\n" % str(self.input_json) + script += "output_json=%s\n" % str(self.output_json) + script += "\n" + + script += "env\n" + script += "echo $X509_USER_PROXY\n" + script += "\n" + + script += "echo 'user id:'\n" + script += "id\n" + script += "\n" + + script += "echo '%s' '%s'\n" % (os.path.basename(executable), str(arguments)) + script += '%s %s\n' % (os.path.basename(executable), str(arguments)) + + script += '\n' + + long_id = self.get_long_id(processing) + script_name = 'processing_%s.sh' % long_id + script_name = os.path.join(self.get_working_dir(processing), script_name) + with open(script_name, 'w') as f: + f.write(script) + run_command("chmod +x %s" % script_name) + return script_name + + def 
generate_processing_script_container(self, processing): + param_values = {'MAX_POINTS': self.max_points, + 'NUM_POINTS': self.points_to_generate, + 'IN': self.input_json, + 'OUT': self.output_json} + executable = replace_parameters_with_values(self.executable, param_values) + arguments = replace_parameters_with_values(self.arguments, param_values) + + script = "#!/bin/bash\n\n" + script += "executable=%s\n" % str(executable) + script += "arguments=%s\n" % str(arguments) + script += "input_json=%s\n" % str(self.input_json) + script += "output_json=%s\n" % str(self.output_json) + script += "\n" + + script += "env\n" + script += "echo $X509_USER_PROXY\n" + script += "\n" + + script += "echo 'user id:'\n" + script += "id\n" + script += "\n" + + if self.sandbox and 'docker' in executable: + arguments = 'run --rm -v $(pwd):%s %s ' % (self.container_workdir, self.sandbox) + arguments + + script += "echo '%s' '%s'\n" % (str(executable), str(arguments)) + script += '%s %s\n' % (str(executable), str(arguments)) + + if self.sandbox and 'docker' in executable: + script += 'docker image rm -f %s\n' % self.sandbox + + script += '\n' + + long_id = self.get_long_id(processing) + script_name = 'processing_%s.sh' % long_id + script_name = os.path.join(self.get_working_dir(processing), script_name) + with open(script_name, 'w') as f: + f.write(script) + run_command("chmod +x %s" % script_name) + return script_name + + def generate_processing_script_sandbox(self, processing): + param_values = {'MAX_POINTS': self.max_points, + 'NUM_POINTS': self.points_to_generate, + 'IN': self.input_json, + 'OUT': self.output_json} + executable = replace_parameters_with_values(self.executable, param_values) + arguments = replace_parameters_with_values(self.arguments, param_values) + + script = "#!/bin/bash\n\n" + script += "sandbox=%s\n" % str(self.sandbox) + script += "executable=%s\n" % str(executable) + script += "arguments=%s\n" % str(arguments) + script += "input_json=%s\n" % str(self.input_json) + script += "output_json=%s\n" % str(self.output_json) + script += "\n" + + script += "env\n" + script += "echo $X509_USER_PROXY\n" + script += "\n" + + script += "echo 'user id:'\n" + script += "id\n" + script += "\n" + + script += "wget $sandbox\n" + script += 'base_sandbox="$(basename -- $sandbox)"\n' + script += 'tar xzf $base_sandbox\n' + + script += 'chmod +x %s\n' % str(executable) + script += "echo '%s' '%s'\n" % (str(executable), str(arguments)) + script += '%s %s\n' % (str(executable), str(arguments)) + + script += '\n' + + long_id = self.get_long_id(processing) + script_name = 'processing_%s.sh' % long_id + script_name = os.path.join(self.get_working_dir(processing), script_name) + with open(script_name, 'w') as f: + f.write(script) + run_command("chmod +x %s" % script_name) + return script_name + + def generate_input_json(self, processing): + try: + output_collection = self.get_output_collections()[0] + contents = core_catalog.get_contents_by_coll_id_status(coll_id=output_collection['coll_id']) + points = [] + for content in contents: + # point = content['content_metadata']['point'] + point = json.loads(content['path']) + points.append(point) + + job_dir = self.get_working_dir(processing) + if 'input_json' in self.agent_attributes and self.agent_attributes['input_json']: + input_json = self.agent_attributes['input_json'] + else: + input_json = 'idds_input.json' + opt_points = {'points': points, 'opt_space': self.opt_space} + with open(os.path.join(job_dir, input_json), 'w') as f: + json.dump(opt_points, f) + return 
input_json + except Exception as e: + raise Exception("Failed to generate idds inputs for HPO: %s" % str(e)) + + def get_output_json(self, processing): + # job_dir = self.get_working_dir(processing) + if self.output_json: + return self.output_json + elif 'output_json' in self.agent_attributes and self.agent_attributes['output_json']: + output_json = self.agent_attributes['output_json'] + else: + output_json = 'idds_output.json' + return output_json + + def generate_processing_script(self, processing): + if not self.method: + err_msg = "Processing %s HPO method(%s) is not defined" % (processing['processing_id'], self.method) + self.logger.error(err_msg) + self.set_terminated_msg(err_msg) + self.terminated = True + return None, err_msg + + self.input_json = self.generate_input_json(processing) + self.output_json = self.get_output_json(processing) + + if self.method == 'nevergrad': + script_name = self.generate_processing_script_nevergrad(processing) + return script_name, None + elif self.method in ['container', 'docker']: + script_name = self.generate_processing_script_container(processing) + return script_name, None + elif self.method == 'sandbox': + script_name = self.generate_processing_script_sandbox(processing) + return script_name, None + else: + err_msg = "Processing %s not supported HPO method: %s" % (processing['processing_id'], self.method) + self.logger.error(err_msg) + self.set_terminated_msg(err_msg) + self.terminated = True + return None, err_msg + + def get_input_files(self, processing): + return [self.input_json] + + def get_output_files(self, processing): + return [self.output_json] + + def submit_processing(self, processing): + if 'job_id' in processing['processing_metadata']: + pass + else: + job_id, errors = self.submit_condor_processing(processing) + processing['processing_metadata']['job_id'] = job_id + processing['processing_metadata']['errors'] = errors + + def abort_processing(self, processing): + self.tocancel = True + + def parse_processing_outputs(self, processing): + request_id = processing['request_id'] + workload_id = processing['workload_id'] + processing_id = processing['processing_id'] + + if not self.output_json: + return None, 'Request(%s)_workload(%s)_processing(%s) output_json(%s) is not defined' % (request_id, workload_id, + processing_id, self.output_json) + + job_dir = self.get_working_dir(processing) + full_output_json = os.path.join(job_dir, self.output_json) + if not os.path.exists(full_output_json): + return None, '%s is not created' % str(full_output_json) + else: + try: + with open(full_output_json, 'r') as f: + data = f.read() + outputs = json.loads(data) + if not outputs: + return outputs, "No points generated: the outputs is empty" + return outputs, None + except Exception as ex: + return None, 'Failed to load the content of %s: %s' % (str(full_output_json), str(ex)) + + def poll_processing(self, processing): + job_status, job_err_msg = self.poll_condor_job_status(processing, processing['processing_metadata']['job_id']) + processing_outputs = None + if job_status in [ProcessingStatus.Finished]: + job_outputs, parser_errors = self.parse_processing_outputs(processing) + if job_outputs: + processing_status = ProcessingStatus.Finished + processing_err = None + processing_outputs = job_outputs + else: + processing_status = ProcessingStatus.Failed + processing_err = parser_errors + elif self.tocancel: + processing_status = ProcessingStatus.Cancelled + processing_outputs = None + processing_err = None + else: + processing_status = job_status + 
processing_err = job_err_msg + return processing_status, processing_outputs, processing_err + + def poll_processing_updates(self, processing, input_output_maps): + processing_status, processing_outputs, processing_err = self.poll_processing(processing) + + processing_metadata = processing['processing_metadata'] + if not processing_metadata: + processing_metadata = {} + processing_metadata['errors'] = processing_err + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': processing_status, + 'processing_metadata': processing_metadata, + 'output_metadata': processing_outputs}} + + updated_contents = [] + return update_processing, updated_contents diff --git a/atlas/lib/idds/atlas/workflow/atlaspandawork.py b/atlas/lib/idds/atlas/workflow/atlaspandawork.py new file mode 100644 index 00000000..615f4570 --- /dev/null +++ b/atlas/lib/idds/atlas/workflow/atlaspandawork.py @@ -0,0 +1,451 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 - 2021 + +try: + import ConfigParser +except ImportError: + import configparser as ConfigParser + +try: + from urllib import quote +except ImportError: + from urllib.parse import quote + +import copy +import json +import os +import re +import traceback +import uuid + +from pandatools import Client + +from idds.common import exceptions +from idds.common.constants import (TransformType, CollectionType, CollectionStatus, + ProcessingStatus, WorkStatus) +from idds.workflow.work import Work +from idds.workflow.workflow import Condition + + +class PandaCondition(Condition): + def __init__(self, cond=None, current_work=None, true_work=None, false_work=None): + super(PandaCondition, self).__init__(cond=cond, current_work=current_work, + true_work=true_work, false_work=false_work) + + +class ATLASPandaWork(Work): + def __init__(self, executable=None, arguments=None, parameters=None, setup=None, + work_tag='activelearning', exec_type='panda', sandbox=None, work_id=None, + primary_input_collection=None, other_input_collections=None, + output_collections=None, log_collections=None, + logger=None, dependency_map=None, task_name="", + panda_task_id=None, cmd_to_arguments=None): + """ + Init a work/task/transformation. + + :param setup: A string to setup the executable enviroment, it can be None. + :param executable: The executable. + :param arguments: The arguments. + :param parameters: A dict with arguments needed to be replaced. + :param work_type: The work type like data carousel, hyperparameteroptimization and so on. + :param exec_type: The exec type like 'local', 'remote'(with remote_package set), 'docker' and so on. + :param sandbox: The sandbox. + :param work_id: The work/task id. + :param primary_input_collection: The primary input collection. + :param other_input_collections: List of the input collections. + :param output_collections: List of the output collections. + # :param workflow: The workflow the current work belongs to. 
+ """ + self.panda_task_id = panda_task_id + self.cmd_to_arguments = cmd_to_arguments + self.panda_task_paramsmap = None + super(ATLASPandaWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, work_type=TransformType.Processing, + work_tag=work_tag, exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + output_collections=output_collections, + log_collections=log_collections, + logger=logger) + self.panda_task_id = panda_task_id + self.panda_task_paramsmap = None + self.panda = None + self.pandassl = None + self.pandamonitor = None + + # from pandatools import Client + # Client.getTaskParamsMap(23752996) + # (0, '{"buildSpec": {"jobParameters": "-i ${IN} -o ${OUT} --sourceURL ${SURL} -r . ", "archiveName": "sources.0ca6a2fb-4ad0-42d0-979d-aa7c284f1ff7.tar.gz", "prodSourceLabel": "panda"}, "sourceURL": "https://aipanda048.cern.ch:25443", "cliParams": "prun --exec \\"python simplescript.py 0.5 0.5 200 output.json\\" --outDS user.wguan.altest1234 --outputs output.json --nJobs=10", "site": null, "vo": "atlas", "respectSplitRule": true, "osInfo": "Linux-3.10.0-1127.19.1.el7.x86_64-x86_64-with-centos-7.9.2009-Core", "log": {"type": "template", "param_type": "log", "container": "user.wguan.altest1234.log/", "value": "user.wguan.altest1234.log.$JEDITASKID.${SN}.log.tgz", "dataset": "user.wguan.altest1234.log/"}, "transUses": "", "excludedSite": [], "nMaxFilesPerJob": 200, "uniqueTaskName": true, "noInput": true, "taskName": "user.wguan.altest1234/", "transHome": null, "includedSite": null, "nEvents": 10, "nEventsPerJob": 1, "jobParameters": [{"type": "constant", "value": "-j \\"\\" --sourceURL ${SURL}"}, {"type": "constant", "value": "-r ."}, {"padding": false, "type": "constant", "value": "-p \\""}, {"padding": false, "type": "constant", "value": "python%20simplescript.py%200.5%200.5%20200%20output.json"}, {"type": "constant", "value": "\\""}, {"type": "constant", "value": "-l ${LIB}"}, {"container": "user.wguan.altest1234_output.json/", "value": "user.wguan.$JEDITASKID._${SN/P}.output.json", "dataset": "user.wguan.altest1234_output.json/", "param_type": "output", "hidden": true, "type": "template"}, {"type": "constant", "value": "-o \\"{\'output.json\': \'user.wguan.$JEDITASKID._${SN/P}.output.json\'}\\""}], "prodSourceLabel": "user", "processingType": "panda-client-1.4.47-jedi-run", "architecture": "@centos7", "userName": "Wen Guan", "taskType": "anal", "taskPriority": 1000, "countryGroup": "us"}') # noqa E501 + + def initialize_work(self): + if not self.is_initialized(): + if self.panda_task_id is not None: + self.init_panda_task_info() + else: + self.init_new_panda_task_info() + super(ATLASPandaWork, self).initialize_work() + + def get_scope_name(self, dataset): + if dataset.startswith("user"): + scope = "user." + dataset.split('.')[1] + elif dataset.startswith("group"): + scope = "group." 
+ dataset.split('.')[1] + else: + scope = dataset.split('.')[0] + return scope + + def init_panda_task_info(self): + status, task_param_map = Client.getTaskParamsMap(self.panda_task_id) + if status == 0: + task_param_map = json.loads(task_param_map) + self.panda_task_paramsmap = task_param_map + self.sandbox = os.path.join(task_param_map['sourceURL'], 'cache/' + task_param_map['buildSpec']['archiveName']) + for p in task_param_map["jobParameters"]: + if 'param_type' in p and p['param_type'] == 'output': + output_dataset = p['dataset'] + output_dataset = output_dataset.replace("/", "") + scope = self.get_scope_name(output_dataset) + primary_input_collection = {'scope': scope, 'name': output_dataset} + output_collection = {'scope': scope, 'name': output_dataset} + self.set_primary_input_collection(primary_input_collection) + self.add_output_collections([output_collection]) + if 'log' in p: + log_dataset = p['dataset'] + log_dataset = log_dataset.replace("/", "") + scope = self.get_scope_name(log_dataset) + log_collection = {'scope': scope, 'name': log_dataset} + self.add_log_collections([log_collection]) + + def init_new_panda_task_info(self): + if not self.panda_task_paramsmap: + return + + # generate new dataset name + # self.padding = self.sequence_in_workflow + new_dataset_name = self.cmd_to_arguments['outDS'] + "_" + str(self.get_sequence_id()) + for coll_id in self.collections: + coll = self.collections[coll_id] + coll['name'] = coll['name'].replace(self.cmd_to_arguments['outDS'], new_dataset_name) + + self.panda_task_paramsmap['cliParams'] = \ + self.panda_task_paramsmap['cliParams'].replace(self.cmd_to_arguments['outDS'], new_dataset_name) + + self.panda_task_paramsmap['taskName'] = \ + self.panda_task_paramsmap['taskName'].replace(self.cmd_to_arguments['outDS'], new_dataset_name) + + jobParameters = self.panda_task_paramsmap['jobParameters'] + for p in jobParameters: + if 'container' in p: + p['container'] = p['container'].replace(self.cmd_to_arguments['outDS'], new_dataset_name) + if 'dataset' in p: + p['dataset'] = p['dataset'].replace(self.cmd_to_arguments['outDS'], new_dataset_name) + + log = self.panda_task_paramsmap['log'] + if 'value' in log: + log['value'] = log['value'].replace(self.cmd_to_arguments['outDS'], new_dataset_name) + if 'container' in log: + log['container'] = log['container'].replace(self.cmd_to_arguments['outDS'], new_dataset_name) + if 'dataset' in log: + log['dataset'] = log['dataset'].replace(self.cmd_to_arguments['outDS'], new_dataset_name) + + def parse_arguments(self): + try: + # arguments = self.get_arguments() + # parameters = self.get_parameters() + arguments = self.cmd_to_arguments['arguments'] if 'arguments' in self.cmd_to_arguments else None + parameters = self.cmd_to_arguments['parameters'] if 'parameters' in self.cmd_to_arguments else None + new_parameters = self.get_parameters() + + if parameters and new_parameters: + new_arguments = parameters.format(**new_parameters) + + cliParams = self.panda_task_paramsmap['cliParams'] + cliParams = cliParams.replace(arguments, new_arguments) + self.panda_task_paramsmap['cliParams'] = cliParams + + jobParameters = self.panda_task_paramsmap['jobParameters'] + for p in jobParameters: + if 'value' in p: + p['value'] = p['value'].replace(quote(arguments), quote(new_arguments)) + + return new_arguments + except Exception as ex: + self.add_errors(str(ex)) + + def generate_work_from_template(self): + new_work = super(ATLASPandaWork, self).generate_work_from_template() + # new_work.unset_initialized() + # 
new_work.panda_task_id = None + return new_work + + def set_parameters(self, parameters): + self.parameters = parameters + # trigger to submit new tasks + self.unset_initialized() + self.panda_task_id = None + + def my_condition(self): + if self.is_finished(): + return True + return False + + def load_panda_configuration(self): + panda_config = ConfigParser.SafeConfigParser() + if os.environ.get('IDDS_PANDA_CONFIG', None): + configfile = os.environ['IDDS_PANDA_CONFIG'] + if panda_config.read(configfile) == [configfile]: + return panda_config + + configfiles = ['%s/etc/panda/panda.cfg' % os.environ.get('IDDS_HOME', ''), + '/etc/panda/panda.cfg', '/opt/idds/etc/panda/panda.cfg', + '%s/etc/panda/panda.cfg' % os.environ.get('VIRTUAL_ENV', '')] + for configfile in configfiles: + if panda_config.read(configfile) == [configfile]: + return panda_config + return panda_config + + def load_panda_config(self): + panda_config = self.load_panda_configuration() + self.logger.info("panda config: %s" % panda_config) + if panda_config.has_section('panda'): + if panda_config.has_option('panda', 'pandamonitor'): + pandamonitor = panda_config.get('panda', 'pandamonitor') + self.pandamonitor = pandamonitor + if panda_config.has_option('panda', 'panda'): + panda = panda_config.get('panda', 'panda') + self.panda = panda + if panda_config.has_option('panda', 'pandassl'): + pandassl = panda_config.get('panda', 'pandassl') + self.pandassl = pandassl + + def poll_external_collection(self, coll): + try: + # if 'coll_metadata' in coll and 'is_open' in coll['coll_metadata'] and not coll['coll_metadata']['is_open']: + if 'status' in coll and coll['status'] in [CollectionStatus.Closed]: + return coll + else: + # client = self.get_rucio_client() + # did_meta = client.get_metadata(scope=coll['scope'], name=coll['name']) + if 'coll_metadata' not in coll: + coll['coll_metadata'] = {} + coll['coll_metadata']['bytes'] = 0 + coll['coll_metadata']['total_files'] = 0 + coll['coll_metadata']['availability'] = True + coll['coll_metadata']['events'] = 0 + coll['coll_metadata']['is_open'] = False + coll['coll_metadata']['run_number'] = None + coll['coll_metadata']['did_type'] = 'DATASET' + coll['coll_metadata']['list_all_files'] = False + + if 'is_open' in coll['coll_metadata'] and not coll['coll_metadata']['is_open']: + coll_status = CollectionStatus.Closed + else: + coll_status = CollectionStatus.Open + coll['status'] = coll_status + + if 'did_type' in coll['coll_metadata']: + if coll['coll_metadata']['did_type'] == 'DATASET': + coll_type = CollectionType.Dataset + elif coll['coll_metadata']['did_type'] == 'CONTAINER': + coll_type = CollectionType.Container + else: + coll_type = CollectionType.File + else: + coll_type = CollectionType.Dataset + coll['coll_type'] = coll_type + + return coll + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def get_input_collections(self): + """ + *** Function called by Transformer agent. + """ + colls = [self.primary_input_collection] + self.other_input_collections + for coll_int_id in colls: + coll = self.collections[coll_int_id] + coll = self.poll_external_collection(coll) + self.collections[coll_int_id] = coll + return super(ATLASPandaWork, self).get_input_collections() + + def get_input_contents(self): + """ + Get all input contents from DDM. 
+ """ + try: + ret_files = [] + return ret_files + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def get_mapped_inputs(self, mapped_input_output_maps): + ret = [] + for map_id in mapped_input_output_maps: + inputs = mapped_input_output_maps[map_id]['inputs'] + + # if 'primary' is not set, the first one is the primary input. + primary_input = inputs[0] + for ip in inputs: + if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: + primary_input = ip + ret.append(primary_input) + return ret + + def get_new_input_output_maps(self, mapped_input_output_maps={}): + """ + New inputs which are not yet mapped to outputs. + + :param mapped_input_output_maps: Inputs that are already mapped. + """ + inputs = self.get_input_contents() + mapped_inputs = self.get_mapped_inputs(mapped_input_output_maps) + mapped_inputs_scope_name = [ip['scope'] + ":" + ip['name'] for ip in mapped_inputs] + + new_inputs = [] + new_input_output_maps = {} + for ip in inputs: + ip_scope_name = ip['scope'] + ":" + ip['name'] + if ip_scope_name not in mapped_inputs_scope_name: + new_inputs.append(ip) + + # to avoid cheking new inputs if there are no new inputs anymore + if (not new_inputs and 'status' in self.collections[self.primary_input_collection] + and self.collections[self.primary_input_collection]['status'] in [CollectionStatus.Closed]): # noqa: W503 + self.set_has_new_inputs(False) + else: + mapped_keys = mapped_input_output_maps.keys() + if mapped_keys: + next_key = max(mapped_keys) + 1 + else: + next_key = 1 + for ip in new_inputs: + out_ip = copy.deepcopy(ip) + out_ip['coll_id'] = self.collections[self.output_collections[0]]['coll_id'] + new_input_output_maps[next_key] = {'inputs': [ip], + 'outputs': [out_ip]} + next_key += 1 + + return new_input_output_maps + + def get_processing(self, input_output_maps): + """ + *** Function called by Transformer agent. + + If there is already an active processing for this work, will do nothing. + If there is no active processings, create_processing will be called. + """ + if self.active_processings: + return self.processings[self.active_processings[0]] + else: + return self.create_processing(input_output_maps) + + def create_processing(self, input_output_maps): + """ + *** Function called by Transformer agent. + + :param input_output_maps: new maps from inputs to outputs. + """ + proc = {'processing_metadata': {'internal_id': str(uuid.uuid1()), + 'panda_task_id': self.panda_task_id}} + self.add_processing_to_processings(proc) + self.active_processings.append(proc['processing_metadata']['internal_id']) + return proc + + def submit_panda_task(self, processing): + try: + status, tmpOut = Client.insertTaskParams(self.panda_task_paramsmap, False, True) + if status == 0: + tmp_status, tmp_output = tmpOut + m = re.search("jediTaskID=(\d+)", tmp_output) # noqa W605 + task_id = int(m.group(1)) + processing['processing_metadata']['panda_task_id'] = task_id + else: + self.add_errors(tmpOut) + raise Exception(tmpOut) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def submit_processing(self, processing): + """ + *** Function called by Carrier agent. 
+ """ + if 'panda_task_id' in processing['processing_metadata'] and processing['processing_metadata']['panda_task_id']: + pass + else: + self.set_user_proxy() + self.submit_panda_task(processing) + self.unset_user_proxy() + + def poll_panda_task(self, processing): + if 'panda_task_id' in processing['processing_metadata']: + status, task_status = Client.getTaskStatus(processing['processing_metadata']['panda_task_id']) + if status == 0: + return task_status + else: + return 'failed' + return None + + def poll_processing_updates(self, processing, input_output_maps): + """ + *** Function called by Carrier agent. + """ + updated_contents = [] + update_processing = {} + + if processing: + task_status = self.poll_panda_task(processing) + if task_status: + if task_status in ['registered', 'defined']: + processing_status = ProcessingStatus.Submitted + elif task_status in ['assigning', 'ready', 'pending', 'scouting', 'scouted', 'running', 'prepared']: + processing_status = ProcessingStatus.Running + elif task_status in ['done']: + # finished, finishing, waiting it to be done + processing_status = ProcessingStatus.Finished + elif task_status in ['failed', 'aborted', 'broken', 'exhausted']: + processing_status = ProcessingStatus.Failed + else: + # finished, finishing, aborting, topreprocess, preprocessing, tobroken + # toretry, toincexec, rerefine, paused, throttled, passed + processing_status = ProcessingStatus.Running + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': processing_status}} + return update_processing, updated_contents + + def syn_collection_status(self): + input_collections = self.get_input_collections() + output_collections = self.get_output_collections() + # log_collections = self.get_log_collections() + + for input_collection in input_collections: + input_collection['processed_files'] = 0 + + for output_collection in output_collections: + output_collection['total_files'] = 0 + output_collection['processed_files'] = 0 + + def syn_work_status(self, registered_input_output_maps): + self.syn_collection_status() + + if self.is_processings_terminated() and not self.has_new_inputs(): + if self.is_processings_finished(): + self.status = WorkStatus.Finished + elif self.is_processings_failed(): + self.status = WorkStatus.Failed + elif self.is_processings_subfinished(): + self.status = WorkStatus.SubFinished diff --git a/atlas/lib/idds/atlas/workflow/atlasstageinwork.py b/atlas/lib/idds/atlas/workflow/atlasstageinwork.py index b2556bc6..ca00d68a 100644 --- a/atlas/lib/idds/atlas/workflow/atlasstageinwork.py +++ b/atlas/lib/idds/atlas/workflow/atlasstageinwork.py @@ -18,7 +18,8 @@ RuleNotFound as RucioRuleNotFound) from idds.common import exceptions -from idds.common.constants import (TransformType, CollectionStatus, ContentStatus, ContentType, +from idds.common.constants import (TransformType, CollectionType, CollectionStatus, + ContentStatus, ContentType, ProcessingStatus, WorkStatus) from idds.workflow.work import Work @@ -28,7 +29,8 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_tag='stagein', exec_type='local', sandbox=None, work_id=None, primary_input_collection=None, other_input_collections=None, output_collections=None, log_collections=None, - workflow=None, logger=None, + agent_attributes=None, + logger=None, max_waiting_time=3600 * 7 * 24, src_rse=None, dest_rse=None, rule_id=None): """ Init a work/task/transformation. 
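Referring back to ATLASPandaWork.poll_processing_updates above: the PanDA/JEDI task state is reduced to an iDDS ProcessingStatus. A standalone restatement of that mapping, using only the status names that appear in the code (illustrative, not part of the patch itself):

.. code-block:: python

    from idds.common.constants import ProcessingStatus

    def map_panda_task_status(task_status):
        """Map a PanDA/JEDI task status string to an iDDS ProcessingStatus."""
        if task_status in ['registered', 'defined']:
            return ProcessingStatus.Submitted
        if task_status in ['assigning', 'ready', 'pending', 'scouting',
                           'scouted', 'running', 'prepared']:
            return ProcessingStatus.Running
        if task_status in ['done']:
            return ProcessingStatus.Finished
        if task_status in ['failed', 'aborted', 'broken', 'exhausted']:
            return ProcessingStatus.Failed
        # aborting, topreprocess, preprocessing, tobroken, toretry, toincexec,
        # rerefine, paused, throttled, passed: still treated as running
        return ProcessingStatus.Running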
@@ -44,7 +46,7 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, :param primary_input_collection: The primary input collection. :param other_input_collections: List of the input collections. :param output_collections: List of the output collections. - :param workflow: The workflow the current work belongs to. + # :param workflow: The workflow the current work belongs to. :param max_waiting_time: The max waiting time to terminate the work. :param src_rse: The source rse. :param dest_rse: The destination rse. @@ -57,7 +59,7 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, other_input_collections=other_input_collections, output_collections=output_collections, log_collections=log_collections, - workflow=workflow, + agent_attributes=agent_attributes, logger=logger) self.max_waiting_time = max_waiting_time self.src_rse = src_rse @@ -65,6 +67,13 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.life_time = max_waiting_time self.rule_id = rule_id + self.num_mapped_inputs = 0 + self.total_output_files = 0 + self.processed_output_files = 0 + self.status_statistics = {} + + self.tocancel = False + def get_rucio_client(self): try: client = RucioClient() @@ -76,24 +85,41 @@ def get_rucio_client(self): def poll_external_collection(self, coll): try: - if 'coll_metadata' in coll and 'is_open' in coll['coll_metadata'] and not coll['coll_metadata']['is_open']: + # if 'coll_metadata' in coll and 'is_open' in coll['coll_metadata'] and not coll['coll_metadata']['is_open']: + if 'status' in coll and coll['status'] in [CollectionStatus.Closed]: return coll else: - meta = {} - did_meta = self.client.get_metadata(scope=coll['scope'], name=coll['name']) - meta = {'scope': coll['scope'], - 'name': coll['name'], - 'coll_metadata': { - 'bytes': did_meta['bytes'], - 'total_files': did_meta['length'], - 'availability': did_meta['availability'], - 'events': did_meta['events'], - 'is_open': did_meta['is_open'], - 'run_number': did_meta['run_number'], - 'did_type': did_meta['did_type'], - 'list_all_files': False} - } - return meta + client = self.get_rucio_client() + did_meta = client.get_metadata(scope=coll['scope'], name=coll['name']) + if 'coll_metadata' not in coll: + coll['coll_metadata'] = {} + coll['coll_metadata']['bytes'] = did_meta['bytes'] + coll['coll_metadata']['total_files'] = did_meta['length'] + coll['coll_metadata']['availability'] = did_meta['availability'] + coll['coll_metadata']['events'] = did_meta['events'] + coll['coll_metadata']['is_open'] = did_meta['is_open'] + coll['coll_metadata']['run_number'] = did_meta['run_number'] + coll['coll_metadata']['did_type'] = did_meta['did_type'] + coll['coll_metadata']['list_all_files'] = False + + if 'is_open' in coll['coll_metadata'] and not coll['coll_metadata']['is_open']: + coll_status = CollectionStatus.Closed + else: + coll_status = CollectionStatus.Open + coll['status'] = coll_status + + if 'did_type' in coll['coll_metadata']: + if coll['coll_metadata']['did_type'] == 'DATASET': + coll_type = CollectionType.Dataset + elif coll['coll_metadata']['did_type'] == 'CONTAINER': + coll_type = CollectionType.Container + else: + coll_type = CollectionType.File + else: + coll_type = CollectionType.Dataset + coll['coll_type'] = coll_type + + return coll except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) @@ -106,7 +132,7 @@ def get_input_collections(self): coll = self.collections[coll_int_id] coll = 
self.poll_external_collection(coll) self.collections[coll_int_id] = coll - return super(self).get_input_collections() + return super(ATLASStageinWork, self).get_input_collections() def get_input_contents(self): """ @@ -165,7 +191,8 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): new_inputs.append(ip) # to avoid cheking new inputs if there are no new inputs anymore - if not new_inputs and self.collections[self.primary_input_collection]['status'] in [CollectionStatus.Closed]: + if (not new_inputs and 'status' in self.collections[self.primary_input_collection] + and self.collections[self.primary_input_collection]['status'] in [CollectionStatus.Closed]): # noqa: W503 self.set_has_new_inputs(False) else: mapped_keys = mapped_input_output_maps.keys() @@ -174,6 +201,7 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): else: next_key = 1 for ip in new_inputs: + self.num_mapped_inputs += 1 out_ip = copy.deepcopy(ip) out_ip['coll_id'] = self.collections[self.output_collections[0]]['coll_id'] new_input_output_maps[next_key] = {'inputs': [ip], @@ -186,7 +214,7 @@ def get_processing(self, input_output_maps): if self.active_processings: return self.processings[self.active_processings[0]] else: - return None + return self.create_processing(input_output_maps) def create_processing(self, input_output_maps): proc = {'processing_metadata': {'internal_id': str(uuid.uuid1()), @@ -196,8 +224,9 @@ def create_processing(self, input_output_maps): 'rule_id': self.rule_id}} self.add_processing_to_processings(proc) self.active_processings.append(proc['processing_metadata']['internal_id']) + return proc - def create_rule(self): + def create_rule(self, processing): try: rucio_client = self.get_rucio_client() ds_did = {'scope': self.collections[self.primary_input_collection]['scope'], @@ -225,41 +254,47 @@ def create_rule(self): # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc())) return None - def submit_processing(self): - if self.active_processings: - p = self.processings[self.active_processings[0]] - if 'rule_id' in p['processing_metadata']: - pass - else: - rule_id = self.create_rule() - p['processing_metadata']['rule_id'] = rule_id + def submit_processing(self, processing): + if 'rule_id' in processing['processing_metadata']: + pass + else: + rule_id = self.create_rule(processing) + processing['processing_metadata']['rule_id'] = rule_id + + def abort_processing(self, processing): + self.tocancel = True - def poll_rule(self): + def poll_rule(self, processing): try: - p = self.processings[self.active_processings[0]] + p = processing rule_id = p['processing_metadata']['rule_id'] - rucio_client = self.get_rucio_client() - rule = rucio_client.get_replication_rule(rule_id=rule_id) - # rule['state'] - replicases_status = {} - if rule['locks_ok_cnt'] > 0: - locks = rucio_client.list_replica_locks(rule_id=rule_id) - for lock in locks: - scope_name = '%s:%s' % (lock['scope'], lock['name']) - if lock['state'] == 'OK': - replicases_status[scope_name] = ContentStatus.Available # 'OK' + if rule_id: + if not isinstance(rule_id, (tuple, list)): + rule_id = [rule_id] + + rucio_client = self.get_rucio_client() + for rule_id_item in rule_id: + rule = rucio_client.get_replication_rule(rule_id=rule_id_item) + # rule['state'] + + if rule['locks_ok_cnt'] > 0: + locks = rucio_client.list_replica_locks(rule_id=rule_id_item) + for lock in locks: + scope_name = '%s:%s' % (lock['scope'], lock['name']) + if lock['state'] == 'OK': + replicases_status[scope_name] = 
ContentStatus.Available # 'OK' return p, rule['state'], replicases_status except RucioRuleNotFound as ex: msg = "rule(%s) not found: %s" % (str(rule_id), str(ex)) raise exceptions.ProcessNotFound(msg) - def poll_processing(self): - return self.poll_rule() + def poll_processing(self, processing): + return self.poll_rule(processing) - def poll_processing_updates(self, input_output_maps): - processing, rule_state, rep_status = self.poll_processing() + def poll_processing_updates(self, processing, input_output_maps): + processing, rule_state, rep_status = self.poll_processing(processing) updated_contents = [] content_substatus = {'finished': 0, 'unfinished': 0} @@ -268,11 +303,11 @@ def poll_processing_updates(self, input_output_maps): for content in outputs: key = '%s:%s' % (content['scope'], content['name']) if key in rep_status: - if content['substatus'] != rep_status[key]['substatus']: + if content['substatus'] != rep_status[key]: updated_content = {'content_id': content['content_id'], - 'substatus': rep_status[key]['substatus']} + 'substatus': rep_status[key]} updated_contents.append(updated_content) - content['substatus'] = rep_status[key]['substatus'] + content['substatus'] = rep_status[key] if content['substatus'] == ContentStatus.Available: content_substatus['finished'] += 1 else: @@ -282,24 +317,52 @@ def poll_processing_updates(self, input_output_maps): if rule_state == 'OK' and content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: update_processing = {'processing_id': processing['processing_id'], 'parameters': {'status': ProcessingStatus.Finished}} + elif self.tocancel: + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': ProcessingStatus.Cancelled}} return update_processing, updated_contents def get_status_statistics(self, registered_input_output_maps): status_statistics = {} + + self.total_output_files = 0 + self.processed_output_file = 0 + for map_id in registered_input_output_maps: - outputs = registered_input_output_maps['map_id']['outputs'] + # inputs = registered_input_output_maps[map_id]['inputs'] + outputs = registered_input_output_maps[map_id]['outputs'] + + self.total_output_files += 1 for content in outputs: if content['status'].name not in status_statistics: status_statistics[content['status'].name] = 0 status_statistics[content['status'].name] += 1 + + if content['status'] == ContentStatus.Available: + self.processed_output_file += 1 + self.status_statistics = status_statistics return status_statistics + def syn_collection_status(self): + input_collections = self.get_input_collections() + output_collections = self.get_output_collections() + # log_collections = self.get_log_collections() + + for input_collection in input_collections: + input_collection['processed_files'] = self.num_mapped_inputs + + for output_collection in output_collections: + output_collection['total_files'] = self.total_output_files + output_collection['processed_files'] = self.processed_output_file + def syn_work_status(self, registered_input_output_maps): self.get_status_statistics(registered_input_output_maps) - if not self.active_processings and not self.has_new_inputs(): + self.syn_collection_status() + + if self.is_processings_terminated() and not self.has_new_inputs(): keys = self.status_statistics.keys() if ContentStatus.New.name in keys or ContentStatus.Processing.name in keys: pass diff --git a/atlas/tools/env/environment.yml b/atlas/tools/env/environment.yml index f4c65379..979f8f09 100644 --- 
a/atlas/tools/env/environment.yml +++ b/atlas/tools/env/environment.yml @@ -11,3 +11,5 @@ dependencies: - nose # nose test tools - rucio-clients - rucio-clients-atlas + - idds-common + - idds-workflow diff --git a/client/bin/idds b/client/bin/idds new file mode 100755 index 00000000..d6293650 --- /dev/null +++ b/client/bin/idds @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 + +""" +iDDS CLI +""" + +from __future__ import print_function + +import argparse +import argcomplete +# import json +import logging +import os +import sys +import time + +from idds.client.version import release_version +from idds.client.clientmanager import ClientManager + + +def get_requests_status(args): + wm = ClientManager(host=args.host) + wm.get_status(request_id=args.request_id, workload_id=args.workload_id, with_detail=args.with_detail) + + +def abort_requests(args): + wm = ClientManager(host=args.host) + wm.abort(request_id=args.request_id, workload_id=args.workload_id) + + +def download_logs(args): + wm = ClientManager(host=args.host) + wm.download_logs(request_id=args.request_id, workload_id=args.workload_id, dest_dir=args.dest_dir, filename=args.dest_filename) + + +def upload_to_cacher(args): + wm = ClientManager(host=args.host) + wm.upload_to_cacher(args.filename) + + +def download_from_cacher(args): + wm = ClientManager(host=args.host) + wm.download_from_cacher(args.filename) + + +def get_hyperparameters(args): + wm = ClientManager(host=args.host) + ret = wm.get_hyperparameters(workload_id=args.workload_id, request_id=args.request_id, id=args.id, status=args.status, limit=args.limit) + # print(json.dumps(ret, sort_keys=True, indent=4)) + for k in ret: + print(k) + +def update_hyperparameter(args): + wm = ClientManager(host=args.host) + ret = wm.update_hyperparameter(workload_id=args.workload_id, request_id=args.request_id, id=args.id, loss=args.loss) + print(ret) + +def get_parser(): + """ + Return the argparse parser. + """ + oparser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), add_help=True) + subparsers = oparser.add_subparsers() + + # common items + oparser.add_argument('--version', action='version', version='%(prog)s ' + release_version) + oparser.add_argument('--config', dest="config", help="The iDDS configuration file to use.") + oparser.add_argument('--verbose', '-v', default=False, action='store_true', help="Print more verbose output.") + oparser.add_argument('-H', '--host', dest="host", metavar="ADDRESS", help="The iDDS Rest host. 
For example: https://iddsserver.cern.ch:443/idds") + + # get request status + req_status_parser = subparsers.add_parser('get_requests_status', help='Get the requests status') + req_status_parser.set_defaults(function=get_requests_status) + req_status_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request id') + req_status_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') + req_status_parser.add_argument('--with_detail', dest='with_detail', default=False, action='store_true', help='To show detail status') + + # abort requests + abort_parser = subparsers.add_parser('abort_requests', help='Abort requests') + abort_parser.set_defaults(function=abort_requests) + abort_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request id') + abort_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') + + # download logs + log_parser = subparsers.add_parser('download_logs', help='Download logs') + log_parser.set_defaults(function=download_logs) + log_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request id') + log_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') + log_parser.add_argument('--dest_dir', dest='dest_dir', action='store', default='./', help='The destination directory') + log_parser.add_argument('--dest_filename', dest='dest_filename', action='store', default=None, help='The destination filename') + + # upload a file to the cacher + upload_parser = subparsers.add_parser('upload_to_cacher', help='Upload a file to the iDDS cacher on the server') + upload_parser.set_defaults(function=upload_to_cacher) + upload_parser.add_argument('--filename', dest='filename', action='store', default=None, help='The source filename. The destination filename on the server will be the base name of the file') + + # download a file from the cacher + download_parser = subparsers.add_parser('download_from_cacher', help='Download a file from the iDDS cacher on the server') + download_parser.set_defaults(function=download_from_cacher) + download_parser.add_argument('--filename', dest='filename', action='store', default=None, help='The destination filename. 
The source filename on the server will be the base name of the file') + + # get hyperparameters + hp_get_parser = subparsers.add_parser('get_hyperparameters', help='Get hyperparameters') + hp_get_parser.set_defaults(function=get_hyperparameters) + hp_get_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request id') + hp_get_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') + hp_get_parser.add_argument('--id', dest='id', action='store', type=int, help='The id of the hyperparameter') + hp_get_parser.add_argument('--status', dest='status', action='store', help='Retrieve hyperparameters with defined status') + hp_get_parser.add_argument('--limit', dest='limit', action='store', type=int, help='Limit number of hyperparameters') + + # update hyperparameter + hp_update_parser = subparsers.add_parser('update_hyperparameter', help='Update the hyperparameter result') + hp_update_parser.set_defaults(function=update_hyperparameter) + hp_update_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request id') + hp_update_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') + hp_update_parser.add_argument('--id', dest='id', action='store', type=int, help='The id of the hyperparameter') + hp_update_parser.add_argument('--loss', dest='loss', action='store', type=float, help='The loss result to be updated') + + return oparser + + +if __name__ == '__main__': + arguments = sys.argv[1:] + # set the configuration before anything else, if the config parameter is present + for argi in range(len(arguments)): + if arguments[argi] == '--config' and (argi + 1) < len(arguments): + os.environ['IDDS_CONFIG'] = arguments[argi + 1] + + oparser = get_parser() + argcomplete.autocomplete(oparser) + + if len(sys.argv) == 1: + oparser.print_help() + sys.exit(-1) + + args = oparser.parse_args(arguments) + + try: + if args.verbose: + logging.setLevel(logging.DEBUG) + start_time = time.time() + result = args.function(args) + end_time = time.time() + if args.verbose: + print("Completed in %-0.4f sec." % (end_time - start_time)) + sys.exit(0) + except Exception as error: + logging.error("Strange error: {0}".format(error)) + sys.exit(-1) diff --git a/client/etc/idds/idds.cfg.client.template b/client/etc/idds/idds.cfg.client.template new file mode 100755 index 00000000..ec05d26d --- /dev/null +++ b/client/etc/idds/idds.cfg.client.template @@ -0,0 +1,19 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Wen Guan, , 2020 + +[common] +# if logdir is configured, idds will write to idds.log in this directory. +# else idds will go to stdout/stderr. +# With supervisord, it's good to write to stdout/stderr, then supervisord can manage and rotate logs. 
+# logdir = /var/log/idds +loglevel = INFO + +[rest] +host = https://iddsserver.cern.ch:443/idds +#url_prefix = /idds +#cacher_dir = /tmp +cacher_dir = /data/idds diff --git a/client/lib/idds/client/base.py b/client/lib/idds/client/base.py index 8bda4119..d6b973b5 100644 --- a/client/lib/idds/client/base.py +++ b/client/lib/idds/client/base.py @@ -101,6 +101,7 @@ def get_request_response(self, url, type='GET', data=None, headers=None): raise exceptions.ConnectionException('ConnectionError: ' + str(error)) if result is not None: + # print(result.text) if result.status_code == HTTP_STATUS_CODE.OK: # print(result.text) if result.text: diff --git a/client/lib/idds/client/catalogclient.py b/client/lib/idds/client/catalogclient.py index 19344c3a..e4c13118 100644 --- a/client/lib/idds/client/catalogclient.py +++ b/client/lib/idds/client/catalogclient.py @@ -6,11 +6,11 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2020 """ -Request Rest client to access IDDS system. +Request Rest client to access IDDS catalog system. """ import os @@ -35,7 +35,7 @@ def __init__(self, host=None, client_proxy=None, timeout=None): """ super(CatalogClient, self).__init__(host=host, client_proxy=client_proxy, timeout=timeout) - def get_collections(self, scope=None, name=None, request_id=None, workload_id=None): + def get_collections(self, scope=None, name=None, request_id=None, workload_id=None, relation_type=None): """ Get collections from the Head service. @@ -43,7 +43,7 @@ def get_collections(self, scope=None, name=None, request_id=None, workload_id=No :param name: the collection name, can be wildcard. :param request_id: the request id. :param workload_id: the workload id. - + :param relation_type: The relation_type of the request (input/output/log). :raise exceptions if it's not got successfully. """ path = os.path.join(self.CATALOG_BASEURL, 'collections') @@ -55,27 +55,19 @@ def get_collections(self, scope=None, name=None, request_id=None, workload_id=No request_id = 'null' if workload_id is None: workload_id = 'null' - url = self.build_url(self.host, path=os.path.join(path, scope, name, str(request_id), str(workload_id))) + if relation_type is None: + relation_type = 'null' + elif isinstance(relation_type, Enum): + relation_type = relation_type.value + + url = self.build_url(self.host, path=os.path.join(path, scope, name, str(request_id), str(workload_id), + str(relation_type))) collections = self.get_request_response(url, type='GET') + return collections - # print(collections) - # json dumps will change integer key to string and json.loads will not change it back. fix it. - new_collections = {} - for req_id in collections: - if req_id is not None: - if req_id == 'null': - new_req_id = None - else: - new_req_id = int(req_id) - else: - new_req_id = req_id - new_collections[new_req_id] = {} - for trans_id in collections[req_id]: - new_collections[new_req_id][int(trans_id)] = collections[req_id][trans_id] - return new_collections - - def get_contents(self, coll_scope=None, coll_name=None, request_id=None, workload_id=None, relation_type=None): + def get_contents(self, coll_scope=None, coll_name=None, request_id=None, workload_id=None, + relation_type=None, status=None): """ Get contents from the Head service. @@ -84,6 +76,7 @@ def get_contents(self, coll_scope=None, coll_name=None, request_id=None, workloa :param request_id: the request id. :param workload_id: the workload id. 
:param relation_type: the relation between the collection and the transform(input, output, log) + :param status: The content status. :raise exceptions if it's not got successfully. """ @@ -100,24 +93,16 @@ def get_contents(self, coll_scope=None, coll_name=None, request_id=None, workloa relation_type = 'null' elif isinstance(relation_type, Enum): relation_type = relation_type.value + if status is None: + status = 'null' + elif isinstance(status, Enum): + status = status.value - url = self.build_url(self.host, path=os.path.join(path, coll_scope, coll_name, str(request_id), str(workload_id), str(relation_type))) + url = self.build_url(self.host, path=os.path.join(path, coll_scope, coll_name, str(request_id), + str(workload_id), str(relation_type), str(status))) contents = self.get_request_response(url, type='GET') - new_contents = {} - for req_id in contents: - if req_id is not None: - if req_id == 'null': - new_req_id = None - else: - new_req_id = int(req_id) - else: - new_req_id = req_id - new_contents[new_req_id] = {} - for trans_id in contents[req_id]: - new_contents[new_req_id][int(trans_id)] = contents[req_id][trans_id] - - return new_contents + return contents def get_match_contents(self, coll_scope=None, coll_name=None, scope=None, name=None, min_id=None, max_id=None, request_id=None, workload_id=None, only_return_best_match=None): """ diff --git a/client/lib/idds/client/clientmanager.py b/client/lib/idds/client/clientmanager.py new file mode 100644 index 00000000..73aea272 --- /dev/null +++ b/client/lib/idds/client/clientmanager.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 + + +""" +Workflow manager. +""" +import logging +import tabulate + +from idds.common.utils import setup_logging + +from idds.client.client import Client +from idds.common.constants import RequestType, RequestStatus +from idds.common.utils import get_rest_host, exception_handler + +# from idds.workflow.work import Work, Parameter, WorkStatus +# from idds.workflow.workflow import Condition, Workflow + + +setup_logging(__name__) + + +class ClientManager: + def __init__(self, host=None): + self.host = host + if self.host is None: + self.host = get_rest_host() + self.client = Client(host=self.host) + + @exception_handler + def submit(self, workflow): + """ + Submit the workflow as a request to iDDS server. + + :param workflow: The workflow to be submitted. + """ + props = { + 'scope': 'workflow', + 'name': workflow.get_name(), + 'requester': 'panda', + 'request_type': RequestType.Workflow, + 'transform_tag': 'workflow', + 'status': RequestStatus.New, + 'priority': 0, + 'lifetime': 30, + 'workload_id': workflow.get_workload_id(), + 'request_metadata': {'workload_id': workflow.get_workload_id(), 'workflow': workflow} + } + workflow.add_proxy() + primary_init_work = workflow.get_primary_initial_collection() + if primary_init_work: + props['scope'] = primary_init_work['scope'] + props['name'] = primary_init_work['name'] + + # print(props) + request_id = self.client.add_request(**props) + return request_id + + @exception_handler + def abort(self, request_id=None, workload_id=None): + """ + Abort requests. + + :param workload_id: the workload id. + :param request_id: the request. 
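For example, status inspection and abort go through the same manager (the request id below is made up; the host comes from the client cfg):

.. code-block:: python

    from idds.client.clientmanager import ClientManager
    from idds.common.utils import get_rest_host

    cm = ClientManager(host=get_rest_host())
    cm.get_status(request_id=456, with_detail=True)  # prints a tabulated progress report
    cm.abort(request_id=456)                         # marks the request ToCancel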
+ """ + if request_id is None and workload_id is None: + logging.error("Both request_id and workload_id are None. One of them should not be None") + return + reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id) + for req in reqs: + logging.info("Aborting request: %s" % req['request_id']) + self.client.update_request(request_id=req['request_id'], parameters={'status': RequestStatus.ToCancel}) + + @exception_handler + def get_status(self, request_id=None, workload_id=None, with_detail=False): + """ + Get the status progress report of requests. + + :param workload_id: the workload id. + :param request_id: the request. + :param with_detail: Whether to show detail info. + """ + reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id, with_detail=with_detail) + if with_detail: + table = [] + for req in reqs: + table.append([req['request_id'], req['workload_id'], "%s:%s" % (req['scope'], req['name']), + "%s[%s/%s/%s]" % (req['status'], req['total_contents'], req['processed_contents'], req['processing_contents']), + req['errors']]) + print(tabulate.tabulate(table, tablefmt='simple', headers=['request_id', 'workload_id', 'scope:name', 'status[Total/OK/Processing]', 'errors'])) + else: + table = [] + for req in reqs: + table.append([req['request_id'], req['workload_id'], "%s:%s" % (req['scope'], req['name']), req['status'], req['errors']]) + print(tabulate.tabulate(table, tablefmt='simple', headers=['request_id', 'workload_id', 'scope:name', 'status', 'errors'])) + + @exception_handler + def download_logs(self, request_id=None, workload_id=None, dest_dir='./', filename=None): + """ + Download logs for a request. + + :param workload_id: the workload id. + :param request_id: the request. + :param dest_dir: The destination directory. + :param filename: The destination filename to be saved. If it's None, default filename will be saved. + """ + filename = self.client.download_logs(request_id=request_id, workload_id=workload_id, dest_dir=dest_dir, filename=filename) + if filename: + logging.info("Logs are downloaded to %s" % filename) + else: + logging.info("Failed to download logs for workload_id(%s) and request_id(%s)" % (workload_id, request_id)) + + @exception_handler + def upload_to_cacher(self, filename): + """ + Upload file to iDDS cacher: On the cacher, the filename will be the basename of the file. + """ + return self.client.upload(filename) + + @exception_handler + def download_from_cacher(self, filename): + """ + Download file from iDDS cacher: On the cacher, the filename will be the basename of the file. + """ + return self.client.download(filename) + + @exception_handler + def get_hyperparameters(self, workload_id, request_id, id=None, status=None, limit=None): + """ + Get hyperparameters from the Head service. + + :param workload_id: the workload id. + :param request_id: the request id. + :param status: the status of the hyperparameters. + :param limit: limit number of hyperparameters + + :raise exceptions if it's not got successfully. + """ + return self.client.get_hyperparameters(workload_id=workload_id, request_id=request_id, id=id, status=status, limit=limit) + + @exception_handler + def update_hyperparameter(self, workload_id, request_id, id, loss): + """ + Update hyperparameter to the Head service. + + :param workload_id: the workload id. + :param request_id: the request. + :param id: id of the hyper parameter. + :param loss: the loss. + + :raise exceptions if it's not updated successfully. 
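A minimal client-side loop built on the two calls above (request id, hyperparameter id and loss values are placeholders):

.. code-block:: python

    from idds.client.clientmanager import ClientManager
    from idds.common.utils import get_rest_host

    cm = ClientManager(host=get_rest_host())
    # fetch up to 10 hyperparameter points of request 456
    points = cm.get_hyperparameters(workload_id=None, request_id=456, status=None, limit=10)
    for point in points:
        print(point)
    # after evaluating a point externally, report its loss back by id
    cm.update_hyperparameter(workload_id=None, request_id=456, id=0, loss=0.3)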
+ """ + return self.client.update_hyperparameter(workload_id=workload_id, request_id=request_id, id=id, loss=loss) diff --git a/client/lib/idds/client/requestclient.py b/client/lib/idds/client/requestclient.py index 1462d4e8..f5c945d1 100644 --- a/client/lib/idds/client/requestclient.py +++ b/client/lib/idds/client/requestclient.py @@ -48,6 +48,7 @@ def add_request(self, **kwargs): url = self.build_url(self.host, path=path) data = kwargs + # if 'request_type' in data and data['request_type'] and isinstance(data['request_type'], RequestType): # data['request_type'] = data['request_type'].value # if 'status' in data and data['status'] and isinstance(data['status'], RequestStatus): @@ -79,7 +80,7 @@ def update_request(self, request_id, parameters): r = self.get_request_response(url, type='PUT', data=data) return r - def get_requests(self, request_id=None, workload_id=None): + def get_requests(self, request_id=None, workload_id=None, with_detail=False): """ Get request from the Head service. @@ -93,7 +94,7 @@ def get_requests(self, request_id=None, workload_id=None): request_id = 'null' if workload_id is None: workload_id = 'null' - url = self.build_url(self.host, path=os.path.join(path, str(request_id), str(workload_id))) + url = self.build_url(self.host, path=os.path.join(path, str(request_id), str(workload_id), str(with_detail))) requests = self.get_request_response(url, type='GET') diff --git a/client/lib/idds/client/version.py b/client/lib/idds/client/version.py index 3b9ff3e9..d9044964 100644 --- a/client/lib/idds/client/version.py +++ b/client/lib/idds/client/version.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2021 -release_version = "0.0.5" +release_version = "0.1.0" diff --git a/client/lib/idds/client/workflowmanager.py b/client/lib/idds/client/workflowmanager.py deleted file mode 100644 index 501d35be..00000000 --- a/client/lib/idds/client/workflowmanager.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0OA -# -# Authors: -# - Wen Guan, , 2020 - - -""" -Workflow manager. 
-""" - -from idds.common.utils import setup_logging - -from idds.client.client import Client -from idds.common.constants import RequestType, RequestStatus -from idds.common.utils import get_rest_host - -# from idds.workflow.work import Work, Parameter, WorkStatus -# from idds.workflow.workflow import Condition, Workflow - - -setup_logging(__name__) - - -class WorkflowManager: - def __init__(self, host=None): - self.host = host - if self.host is None: - self.host = get_rest_host() - - def submit(self, workflow): - props = { - 'scope': 'workflow', - 'name': workflow.get_name(), - 'requester': 'panda', - 'request_type': RequestType.Workflow, - 'transform_tag': 'workflow', - 'status': RequestStatus.New, - 'priority': 0, - 'lifetime': 30, - 'workload_id': workflow.get_workload_id(), - 'request_metadata': {'workload_id': workflow.get_workload_id(), 'workflow': workflow} - } - - print(props) - client = Client(host=self.host) - request_id = client.add_request(**props) - return request_id diff --git a/client/tools/env/environment.yml b/client/tools/env/environment.yml index 90d2fcd2..db59f941 100644 --- a/client/tools/env/environment.yml +++ b/client/tools/env/environment.yml @@ -1,4 +1,4 @@ -name: iDDS +name: idds-client dependencies: - python==3.6 - pip @@ -13,4 +13,6 @@ dependencies: - flake8 # Wrapper around PyFlakes&pep8 - pytest # python testing tool - nose # nose test tools - + - tabulate + - idds-common + - idds-workflow diff --git a/common/lib/idds/common/constants.py b/common/lib/idds/common/constants.py index 6fed34a1..0d0883fc 100644 --- a/common/lib/idds/common/constants.py +++ b/common/lib/idds/common/constants.py @@ -28,6 +28,7 @@ class Sections: Transporter = 'transporter' Carrier = 'carrier' Conductor = 'conductor' + Consumer = 'consumer' class HTTP_STATUS_CODE: @@ -157,6 +158,8 @@ class TransformType(IDDSEnum): ActiveLearning = 3 HyperParameterOpt = 4 Derivation = 5 + Processing = 6 + Actuating = 7 Other = 99 @@ -200,6 +203,7 @@ class CollectionStatus(IDDSEnum): SubClosed = 5 Failed = 6 Deleted = 7 + Cancelled = 8 class CollectionLocking(IDDSEnum): @@ -247,6 +251,9 @@ class ProcessingStatus(IDDSEnum): FinishedOnExec = 9 TimeOut = 10 FinishedTerm = 11 + ToCancel = 12 + Cancelling = 13 + Cancelled = 14 class ProcessingLocking(IDDSEnum): @@ -257,12 +264,39 @@ class ProcessingLocking(IDDSEnum): class MessageType(IDDSEnum): StageInFile = 0 StageInCollection = 1 - ActiveLearningFile = 2 - ActiveLearningCollection = 3 - HyperParameterOptFile = 4 - HyperParameterOptCollection = 5 - UnknownFile = 98 - UnknownCollection = 99 + StageInWork = 2 + ActiveLearningFile = 3 + ActiveLearningCollection = 4 + ActiveLearningWork = 5 + HyperParameterOptFile = 6 + HyperParameterOptCollection = 7 + HyperParameterOptWork = 8 + ProcessingFile = 9 + ProcessingCollection = 10 + ProcessingWork = 11 + HealthHeartbeat = 12 + UnknownFile = 97 + UnknownCollection = 98 + UnknownWork = 99 + + +class MessageTypeStr(IDDSEnum): + StageInFile = 'file_stagein' + StageInCollection = 'collection_stagein' + StageInWork = 'work_stagein' + ActiveLearningFile = 'file_activelearning' + ActiveLearningCollection = 'collection_activelearning' + ActiveLearningWork = 'work_activelearning' + HyperParameterOptFile = 'file_hyperparameteropt' + HyperParameterOptCollection = 'collection_hyperparameteropt' + HyperParameterOptWork = 'work_hyperparameteropt' + ProcessingFile = 'file_processing' + ProcessingCollection = 'collection_processing' + ProcessingWork = 'work_processing' + HealthHeartbeat = 'health_heartbeat' + UnknownFile = 
'file_unknown' + UnknownCollection = 'collection_unknown' + UnknownWork = 'work_unknown' class MessageStatus(IDDSEnum): diff --git a/common/lib/idds/common/dict_class.py b/common/lib/idds/common/dict_class.py index e78a9570..f780ad9a 100644 --- a/common/lib/idds/common/dict_class.py +++ b/common/lib/idds/common/dict_class.py @@ -12,6 +12,7 @@ Dict class. """ +import inspect from enum import Enum @@ -32,6 +33,8 @@ def to_dict_l(self, d): for k in d: new_d.append(self.to_dict_l(k)) return new_d + elif inspect.ismethod(d): + return {'idds_method': d.__name__, 'idds_method_class_id': d.__self__.get_internal_id()} return d def to_dict(self): @@ -56,6 +59,12 @@ def is_class(d): return True return False + @staticmethod + def is_class_method(d): + if d and isinstance(d, dict) and 'idds_method' in d and 'idds_method_class_id' in d: + return True + return False + @staticmethod def load_instance(d): module = __import__(d['module'], fromlist=[None]) @@ -66,6 +75,11 @@ def load_instance(d): impl = cls() return impl + @staticmethod + def load_instance_method(d): + # not do anything. Will load the method in Workflow class. + return d + @staticmethod def from_dict(d): if not d: @@ -80,6 +94,9 @@ def from_dict(d): value = DictClass.from_dict(value) setattr(impl, key, value) return impl + elif DictClass.is_class_method(d): + impl = DictClass.load_instance_method(d) + return impl elif isinstance(d, dict): for k, v in d.items(): d[k] = DictClass.from_dict(v) diff --git a/common/lib/idds/common/status_utils.py b/common/lib/idds/common/status_utils.py new file mode 100644 index 00000000..fca4734b --- /dev/null +++ b/common/lib/idds/common/status_utils.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2020 + + +from idds.common.constants import (WorkprogressStatus) +from idds.common.utils import is_sub + + +def get_workprogresses_status(wp_status): + if not wp_status: + return False + + if not isinstance(wp_status, (list, tuple)): + wp_status = [wp_status] + if len(wp_status) == 1: + return wp_status[0] + elif is_sub(wp_status, [WorkprogressStatus.Finished]): + return WorkprogressStatus.Finished + elif is_sub(wp_status, [WorkprogressStatus.Finished, WorkprogressStatus.SubFinished]): + return WorkprogressStatus.SubFinished + elif is_sub(wp_status, [WorkprogressStatus.Finished, WorkprogressStatus.SubFinished, + WorkprogressStatus.Failed]): + return WorkprogressStatus.Failed + elif is_sub(wp_status, [WorkprogressStatus.Finished, WorkprogressStatus.SubFinished, + WorkprogressStatus.Failed, WorkprogressStatus.Cancelled]): + return WorkprogressStatus.Cancelled + elif is_sub(wp_status, [WorkprogressStatus.Finished, WorkprogressStatus.SubFinished, + WorkprogressStatus.Failed, WorkprogressStatus.Cancelled, + WorkprogressStatus.Transforming]): + return WorkprogressStatus.Transforming + elif is_sub(wp_status, [WorkprogressStatus.Finished, WorkprogressStatus.SubFinished, + WorkprogressStatus.Failed, WorkprogressStatus.Cancelled, + WorkprogressStatus.Transforming, WorkprogressStatus.Cancelling]): + return WorkprogressStatus.Cancelling + else: + return WorkprogressStatus.Transforming diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py index e95b82ca..c8be14c8 100644 --- a/common/lib/idds/common/utils.py +++ b/common/lib/idds/common/utils.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2020 import datetime @@ -20,6 +20,7 @@ import tarfile from enum import Enum +from functools import wraps from idds.common.config import (config_has_section, config_has_option, config_get, config_get_bool) @@ -29,6 +30,7 @@ ContentType, ContentStatus, GranularityType, ProcessingStatus) from idds.common.dict_class import DictClass +from idds.common.exceptions import IDDSException # RFC 1123 @@ -334,10 +336,11 @@ def convert_request_type_to_transform_type(request_type): class DictClassEncoder(json.JSONEncoder): def default(self, obj): + # print(obj) if isinstance(obj, IDDSEnum) or isinstance(obj, DictClass): return obj.to_dict() - # elif isinstance(obj, datetime.datetime): - # return date_to_str(obj) + elif isinstance(obj, datetime.datetime): + return date_to_str(obj) # elif isinstance(obj, (datetime.time, datetime.date)): # return obj.isoformat() # elif isinstance(obj, datetime.timedelta): @@ -353,8 +356,8 @@ def as_has_dict(dct): return dct -def json_dumps(obj): - return json.dumps(obj, cls=DictClassEncoder) +def json_dumps(obj, indent=None, sort_keys=False): + return json.dumps(obj, indent=indent, sort_keys=sort_keys, cls=DictClassEncoder) def json_loads(obj): @@ -392,3 +395,33 @@ def tar_zip_files(output_dir, output_filename, files): with tarfile.open(output_filename, "w:gz") as tar: for file in files: tar.add(file, arcname=os.path.basename(file)) + + +def exception_handler(function): + @wraps(function) + def new_funct(*args, **kwargs): + try: + return function(*args, **kwargs) + except IDDSException as ex: + logging.error(ex) + except Exception as ex: + logging.error(ex) + return new_funct + + +def is_sub(a, b): + if not a: + return True + + for i in a: + if i not in b: + return False + 
return True + + +def get_proxy(): + if 'X509_USER_PROXY' in os.environ: + with open(os.environ['X509_USER_PROXY'], 'r') as fp: + proxy = fp.read() + return proxy + return None diff --git a/common/lib/idds/common/version.py b/common/lib/idds/common/version.py index 3b9ff3e9..d9044964 100644 --- a/common/lib/idds/common/version.py +++ b/common/lib/idds/common/version.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2021 -release_version = "0.0.5" +release_version = "0.1.0" diff --git a/common/tools/env/environment.yml b/common/tools/env/environment.yml index 776c514d..753f56cf 100644 --- a/common/tools/env/environment.yml +++ b/common/tools/env/environment.yml @@ -1,4 +1,4 @@ -name: iDDS +name: idds-common dependencies: - python==3.6 - pip @@ -8,4 +8,3 @@ dependencies: - flake8 # Wrapper around PyFlakes&pep8 - pytest # python testing tool - nose # nose test tools - diff --git a/docs/source/index.rst b/docs/source/index.rst index c2277a5b..17d14e75 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -10,7 +10,7 @@ iDDS(intelligent Data Delivery Service) is an intelligent Data Delivery Service. to intelligently transform and deliver the needed data to the processing workflow in a fine-grained approach for High Energy Physics workloads. -iDDS is developed with modular and is highly scalable with plugin structure. +iDDS is developed with modular and is highly scalable with a serializable Work/Workflow structure. This documentation is generated by the `Sphinx toolkit`_. and lives in the `source tree`_. @@ -53,6 +53,7 @@ User Documentation users/installing_client users/cli_examples + users/contributing Source Codes ============= diff --git a/docs/source/usecases/active_learning.rst b/docs/source/usecases/active_learning.rst index c2635e2a..309082cd 100644 --- a/docs/source/usecases/active_learning.rst +++ b/docs/source/usecases/active_learning.rst @@ -6,18 +6,13 @@ Active Learning is an usecase of iDDS. The purpose of iDDS AL is to use iDDS to iDDS AL workflow ^^^^^^^^^^^^^^^^^ -1. User creates an AL request with executable 'AL' process. -2. iDDS runs the 'AL' process which should generates an output json file. -3. iDDS parses the output json file and sends the output content to consumers. +1. User creates a panda processing task, say task1. +2. User defines a learning task and submitted it to iDDS (run in iDDS local cluster), say task2. +3. User defines the contion between task1 -> task2: When task1 is terminated, the condition function will be called. If it returns True, the next task task2 will be started. +4. User defines the contion between task2 -> task1: When task2 is terminated, this condition function will be called. +5. To trigger next task, if the current task returns parameters, these parameters will be used as inputs to trigger the next task. - -The AL process +The AL example -------------- -1. The AL process is an executable which runs in iDDS. It can be some executable shipped in a sandbox(through https) or some other extensions with new plugins. -2. The AL process should create an output json file. The json file name is defined in the 'output_json' in the request_metadata. Below is one example of the request. - - main/lib/idds/tests/activelearning_test.py - -3. iDDS parses the output json file and sends the output content to consumers through ActiveMQ. -4. The consumers can define how to parse the outputs. 
For example, the consumer can stop further processing when the output contents is None or []. +See examples in "User Documents" -> "iDDS RESTful client: Examples" diff --git a/docs/source/usecases/hyperparemeter_optimization.rst b/docs/source/usecases/hyperparemeter_optimization.rst index ac480f47..4fc5daf6 100644 --- a/docs/source/usecases/hyperparemeter_optimization.rst +++ b/docs/source/usecases/hyperparemeter_optimization.rst @@ -27,36 +27,25 @@ Optimization and Sampling -------------------------- Currently iDDS support several different ways to generate hyperparameter points. - 1. bayesian: This is a pre-defied method in iDDS. - - a. The process to generate new hyperparameters: atlas/lib/idds/atlas/processing/hyperparameteropt_bayesian.py - b. The example code to generate requests: main/lib/idds/tests/hyperparameteropt_bayesian_test.py - - 2. `nevergrad `_: This is another pre-defined method in iDDS. - - a. The process to generate new hyperparameters: atlas/lib/idds/atlas/processing/hyperparameteropt_nevergrad.py - b. The example code to generate requests: main/lib/idds/tests/hyperparameteropt_nevergrad_test.py - - 3. Steering container: Users can also provide their own container images to generate hyperparameter points. See `User-defined Steering Container`_ for the details. - - b. Here is a docker example: main/lib/idds/tests/hyperparameteropt_docker_test.py + See examples in "User Documents" -> "iDDS RESTful client: Examples" RESTful Service ---------------- 1. To retrieve hyperparameters. - client.get_hyperparameters(workload_id, request_id, id=None, status=None, limit=None) + See examples in "User Documents" -> "iDDS RESTful client: Examples" + clientmanager.get_hyperparameters(workload_id, request_id, id=None, status=None, limit=None) examples: - client.get_hyperparameters(workload_id=123, request_id=None) - client.get_hyperparameters(workload_id=None, request_id=456) - client.get_hyperparameters(workload_id=None, request_id=456, id=0) + clientmanager.get_hyperparameters(workload_id=123, request_id=None) + clientmanager.get_hyperparameters(workload_id=None, request_id=456) + clientmanager.get_hyperparameters(workload_id=None, request_id=456, id=0) 2. To register loss of a group of hyperparameters. - client.update_hyperparameter(request_id, id, loss) + clientmanager.update_hyperparameter(request_id, id, loss) 3. Example code: main/lib/idds/tests/hyperparameteropt_client_test.py diff --git a/docs/source/users/cli_examples.rst b/docs/source/users/cli_examples.rst index 885ad3cf..f7a3c502 100644 --- a/docs/source/users/cli_examples.rst +++ b/docs/source/users/cli_examples.rst @@ -3,83 +3,274 @@ iDDS RESTful client: Examples iDDS provides RESTful services and the client is used to access the RESTful service. -iDDS request client -~~~~~~~~~~~~~~~~~~~ +iDDS workflow manager +~~~~~~~~~~~~~~~~~~~~~~~~ -1. request properties +1. submit a workflow to the idds server -Below is one example for data carousel request. +Below is one example for submitting a workflow. .. 
code-block:: python - req_properties = { - 'scope': 'data16_13TeV', - 'name': 'data16_13TeV.00298862.physics_Main.daq.RAW', - 'requester': 'panda', - 'request_type': RequestType.StageIn, - 'transform_tag': 's2395', - 'status': RequestStatus.New, - 'priority': 0, - 'lifetime': 30, - 'request_metadata': {'workload_id': '20776840', 'src_rse': 'NDGF-T1_DATATAPE', 'dest_rse': 'NDGF-T1_DATADISK', 'rule_id': '236e4bf87e11490291e3259b14724e30'} - } + from idds.client.clientmanager import ClientManager + from idds.common.utils import get_rest_host -2. add request + # get the host from the client cfg + host = get_rest_host() + # or for example, host = https://iddsserver.cern.ch:443/idds + + # get a workflow + workflow = get_workflow() + + cm = ClientManager(host=host) + request_id = cm.submit(workflow) + +Below is an example for data carousel + +.. code-block:: python + + def get_workflow(): + from idds.workflow.workflow import Workflow + from idds.atlas.workflow.atlasstageinwork import ATLASStageinWork + + scope = 'data16_13TeV' + name = 'data16_13TeV.00298862.physics_Main.daq.RAW' + src_rse = 'NDGF-T1_DATATAPE' + dest_rse = 'NDGF-T1_DATADISK' + rule_id = '*****' + workload_id = + work = ATLASStageinWork(primary_input_collection={'scope': scope, 'name': name}, + output_collections={'scope': scope, 'name': name + '.idds.stagein'}, + max_waiting_time=max_waiting_time, + src_rse=src_rse, + dest_rse=dest_rse, + rule_id=rule_id) + wf = Workflow() + wf.set_workload_id(workload_id) + wf.add_work(work) + return wf + +Below is an example for hyperparameter optimization + +.. code-block:: python + + def get_workflow(): + from idds.workflow.workflow import Workflow + from idds.atlas.workflow.atlashpowork import ATLASHPOWork + + # request_metadata for predefined method 'nevergrad' + request_metadata = {'workload_id': '20525135', 'sandbox': None, 'method': 'nevergrad', 'opt_space': {"A": {"type": "Choice", "params": {"choices": [1, 4]}}, "B": {"type": "Scalar", "bounds": [0, 5]}}, 'initial_points': [({'A': 1, 'B': 2}, 0.3), ({'A': 1, 'B': 3}, None)], 'max_points': 20, 'num_points_per_generation': 10} + + # request_metadata for docker method + request_metadata = {'workload_id': '20525134', 'sandbox': 'wguanicedew/idds_hpo_nevergrad', 'workdir': '/data', 'executable': 'docker', 'arguments': 'python /opt/hyperparameteropt_nevergrad.py --max_points=%MAX_POINTS --num_points=%NUM_POINTS --input=/data/%IN --output=/data/%OUT', 'output_json': 'output.json', 'opt_space': {"A": {"type": "Choice", "params": {"choices": [1, 4]}}, "B": {"type": "Scalar", "bounds": [0, 5]}}, 'initial_points': [({'A': 1, 'B': 2}, 0.3), ({'A': 1, 'B': 3}, None)], 'max_points': 20, 'num_points_per_generation': 10} + + work = ATLASHPOWork(executable=request_metadata.get('executable', None), + arguments=request_metadata.get('arguments', None), + parameters=request_metadata.get('parameters', None), + setup=None, exec_type='local', + sandbox=request_metadata.get('sandbox', None), + method=request_metadata.get('method', None), + container_workdir=request_metadata.get('workdir', None), + output_json=request_metadata.get('output_json', None), + opt_space=request_metadata.get('opt_space', None), + initial_points=request_metadata.get('initial_points', None), + max_points=request_metadata.get('max_points', None), + num_points_per_iteration=request_metadata.get('num_points_per_iteration', 10)) + wf = Workflow() + wf.set_workload_id(request_metadata.get('workload_id', None)) + wf.add_work(work) + return wf + +Below is an example for active learning + +.. 
code-block:: python + + from idds.client.clientmanager import ClientManager + from idds.common.utils import get_rest_host, run_command + from idds.workflow.workflow import Condition, Workflow + from idds.atlas.workflow.atlaspandawork import ATLASPandaWork + from idds.atlas.workflow.atlasactuatorwork import ATLASActuatorWork + + def get_task_id(output, error): + m = re.search('jediTaskID=(\d+)', output + error) # noqa W605 + task_id = int(m.group(1)) + return task_id + + def submit_processing_task(): + cmd = "cd /afs/cern.ch/user/w/wguan/workdisk/iDDS/test/activelearning/hepexcursion/grid; prun --exec 'python simplescript.py 0.5 0.5 200 output.json' --outDS user.wguan.altest123456 --outputs output.json --nJobs=10" + status, output, error = run_command(cmd) + if status == 0: + task_id = get_task_id(output, error) + return task_id + else: + raise Exception(output + error) + + def get_workflow(panda_task_id): + ####################################### + # Current workflow: + # 1. a processing task is submitted to panda with prun or pathena(When prun or pathena provides some interface '--generate_task_parameter_map', this part will be changed). It will be the first ATLASPandaWork (the 'work' below). The ATLASPandaWork will use panda API to get the Panda task_parameter_map and keep this task_parameter_map for later task submission. + # 2. A learning task is defined ATLASActuatorWork (the 'actuator' below). This task will be executed in iDDS local condor cluster. + # 3. A DAG condition is defined between 'work' -> 'actuator'. In the example below, when the work 'is_finished' return True, 'actuator' will be triggerred to start. (Note: The condition will be checked only when the current work is terminated. The condtion function can be any function in the current work.) + # 4. A DAG condition is defined between 'actuator' -> 'work'. In the example below, when the 'actuator' is terminated, the condtion function generate_new_task will be called. + # In ATLASActuatorWork, one parameter "output_json='merge.json'" is defined. When ATLASActuatorWork finished, iDDS will read this 'output_json' file and use its contents as the input parameter to clone another ATLASPandaWork and submit this new task to Panda. In this example, if 'output_json' is empty, generate_new_task will return False. No new tasks will be triggered. + ####################################### + + # For ATLASPandaWork, there is a function set_parameters. + # If set_parameters is not called. ATLASPandaWork will just use the current panda_task_id as its task. + # If set_parameters is called, ATLASPandaWork will use the new parameter to clone a task from the current panda_task_id. + # for example, when set_paramter({'m1': 0.1, 'm2': 0.2, 'nevents': 300}), new arguments will be generated based on cmd_to_arguments['parameters']. Which will be 'python simplescript.py 0.1 0.2 300'. It will be used to replace the original task arguments cmd_to_arguments['arguments'] + # The current cmd_to_arguments['outDS'] is also required. Because when generating new tasks, iDDS will generate new dataset name to replace this 'outDS'. + cmd_to_arguments = {'arguments': 'python simplescript.py 0.5 0.5 200', + 'parameters': 'python simplescript.py {m1} {m2} {nevents}', + 'outDS': 'user.wguan.altest123456'} + work = ATLASPandaWork(panda_task_id=panda_task_id, cmd_to_arguments=cmd_to_arguments) + + # initialize_work will be executed only one time. iDDS will called it automatically. 
+ # However, because here we need to get the output dataset name(work.get_output_collections called below). + # If not calling this function, work.get_output_collections will return None. + work.initialize_work() + + work_output_coll = work.get_output_collections()[0] + + input_coll = {'scope': work_output_coll['scope'], + 'name': work_output_coll['name'], + 'coll_metadata': {'force_close': True}} + output_coll = {'scope': work_output_coll['scope'], + 'name': work_output_coll['name'] + "." + str(int(time.time()))} + + # How to generate arguments: + # arguments = arguments.format(parameters) # you can call set_parameters to set different parameters. + # acutator = ATLASActuatorWork(executable='python', arguments='merge.py {output_json} {events} {dataset}/{filename}', + acutator = ATLASActuatorWork(executable='python', arguments='merge.py {output_json} {events} {dataset}', + parameters={'output_json': 'merge.json', + 'events': 200, + 'dataset': '{scope}:{name}'.format(**input_coll), + 'filename': 'output*.json'}, + sandbox=work.sandbox, primary_input_collection=input_coll, + output_collections=output_coll, output_json='merge.json') + wf = Workflow() + wf.add_work(work) + wf.add_work(acutator) + cond = Condition(work.is_finished, current_work=work, true_work=acutator, false_work=None) + wf.add_condition(cond) + cond1 = Condition(acutator.generate_new_task, current_work=acutator, true_work=work, false_work=None) + wf.add_condition(cond1) + + # because the two works are in a loop, they are not independent. This call is needed to tell which one to start. + # otherwise idds will use the first one to start. + wf.add_initial_works(work) + + return wf + +2. Abort a request + +.. code-block:: python + + # One of workload_id or request_id can be None + clientmanager.abort(request_id=, workload_id=) + +3. Get progress report + +.. code-block:: python + + # One of workload_id or request_id can be None + clientmanager.get_status(request_id=, workload_id=, with_detail=False/True) + +4. Download logs for a request + +.. code-block:: python + + # One of workload_id or request_id can be None + clientmanager.download_logs(request_id=, workload_id=, dest_dir='./', filename=None) + +5. Upload a file to the iDDS cacher .. code-block:: python - request_id = client.add_request(**props) + # filename is the source filename or full path of the source file. + # Upload file to iDDS cacher: On the cacher, the filename will be the basename of the file. + clientmanager.upload_to_cacher(filename) -3. Example codes: +6. Download a file from the iDDS cacher .. code-block:: python + + # filename is the destination filename or full path of the destination file. + # Download file from iDDS cacher: On the cacher, the filename will be the basename of the file. + clientmanager.download_from_cacher(filename) - main/lib/idds/tests/datacarousel_test.py, - main/lib/idds/tests/activelearning_test.py - main/lib/idds/tests/hyperparameteropt_bayesian_test.py - main/lib/idds/tests/hyperparameteropt_docker_local_test.py - main/lib/idds/tests/hyperparameteropt_docker_test.py - main/lib/idds/tests/hyperparameteropt_nevergrad_test.py +7. Get hyperparameters -iDDS HPO(HyperParameterClient) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
code-block:: python + + clientmanager.get_hyperparameters(request_id=, workload_id=, + id=, status=, limit=) + + clientmanager.get_hyperparameters(workload_id=123, request_id=None) + clientmanager.get_hyperparameters(workload_id=None, request_id=456) + clientmanager.get_hyperparameters(workload_id=None, request_id=456, id=0) -1. To retrieve hyperparameters. +8. Update hyperparameter .. code-block:: python - client.get_hyperparameters(workload_id, request_id, id=None, status=None, limit=None) + clientmanager.update_hyperparameter(request_id=, workload_id=, + id=, loss=) - examples: - client.get_hyperparameters(workload_id=123, request_id=None) - client.get_hyperparameters(workload_id=None, request_id=456) - client.get_hyperparameters(workload_id=None, request_id=456, id=0) +iDDS Command Line Interface (CLI) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -2. To register loss of a group of hyperparameters. +1. Abort a request .. code-block:: python - client.update_hyperparameter(request_id, id, loss) + # One of workload_id or request_id can be None + idds abort-requests --request_id= --workload_id= -3. Example code: +2. Get progress report .. code-block:: python - main/lib/idds/tests/hyperparameteropt_client_test.py + # One of workload_id or request_id can be None + idds get_requests_status --request_id= --workload_id= --with_detail=False/True -iDDS logs client -~~~~~~~~~~~~~~~~ +3. Download logs for a request + +.. code-block:: python -iDDS also provides a rest service for users to download logs for tasks running on iDDS, for example ActiveLearning and HyperParameterOptimization. + # One of workload_id or request_id can be None + idds download_logs --request_id= --workload_id= --dest_dir='./' --filename= -1. Download logs: +4. Upload a file to the iDDS cacher .. code-block:: python - client.download_logs(workload_id=workload_id, request_id=request_id, dest_dir='/tmp') + # filename is the source filename or full path of the source file. + # Upload file to iDDS cacher: On the cacher, the filename will be the basename of the file. + idds upload_to_cacher --filename= + +5. Download a file from the iDDS cacher + +.. code-block:: python + + # filename is the destination filename or full path of the destination file. + # Download file from iDDS cacher: On the cacher, the filename will be the basename of the file. + idds download_from_cacher --filename= + +6. Get hyperparameters + +.. code-block:: python + + idds get_hyperparameters --request_id= --workload_id= + --id= --status= --limit=) + + idds get_hyperparameters --workload_id=123 + idds get_hyperparameters --request_id=456 + idds get_hyperparameters --request_id=456 --id=0 -2. Example codes: +7. Update hyperparameter .. code-block:: python - main/lib/idds/tests/logs_test.py + idds update_hyperparameter --request_id= --workload_id=, + --id= --loss= diff --git a/docs/source/users/contributing.rst b/docs/source/users/contributing.rst new file mode 100644 index 00000000..c32dddfe --- /dev/null +++ b/docs/source/users/contributing.rst @@ -0,0 +1,91 @@ +================= +Contributor Guide +================= + +* Thank you for participating! +* Below are the guids about how to contribute to it. + +The `repository `_ consists of different branches: + * the **master** branch includes the main stable developments for the next major version. + * the **dev** branch includes latest developments. A contribution should be based on the dev branch. + * the **** branch includes deployments for different versions. 
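+
+For example, once the 'upstream' remote from the Getting started section below is configured, a new contribution typically starts from a local branch created off the **dev** branch (an illustrative sketch; any branch name works)::
+
+    $ git fetch upstream
+    $ git checkout -b my-feature upstream/dev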
+ + +Generally all `pull requests `_ are to be created against the iDDS **dev** branch. Contributions will end up in the upstream **dev** when merged. + + +Getting started +--------------- + +**Step 1**: Fork the `repository `_ on Github. + +**Step 2**: Clone the repository to your development machine and configure it:: + + $ git clone https://github.com//iDDS/ + $ cd iDDS + $ git remote add upstream https://github.com/HSF/iDDS.git + +**Step 3**: Setup local dev environment(The virtual environment is based on conda):: + + $ # If you are not in the idds top directory. + $ # cd iDDS + $ CurrentDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + $ CondaDir=${CurrentDir}/.conda/iDDS + $ mkdir -p $CondaDir + + $ # For your local development, you may do not need all packages. In this case, you may need to comment out some packages, for example cx_Oracle. + $ echo conda env create --prefix=$CondaDir -f=main/tools/env/environment.yml + $ conda env create --prefix=$CondaDir -f=main/tools/env/environment.yml + +**Step 4**: Configure local environment:: + + $ # If you are not in the idds top directory. + $ # cd iDDS + $ RootDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + $ CondaDir=${RootDir}/.conda/iDDS + $ export IDDS_HOME=$RootDir + $ conda activate $CondaDir + +Contributing +------------ + +**Step 1**: Developing based on your personal repository(if you already have some codes and want to keep them, you need to use 'git rebase'. In this case, you may need to fix some conflicts; If you start from scratch, you can directly use 'git reset'. This method will overwrite local changes. Becareful to backup if you are not confident about using it):: + + $ git checkout dev + $ # echo "Updating dev" + $ git pull --all --prune --progress + $ echo "Rebasing dev" + $ # git rebase upstream/dev dev + $ git reset --hard upstream/dev + +**Step 2**: Check your codes and fix the codes(flake8 is installed when you configure your virtual environments):: + + $ flake8 yourcodes.py + $ flake8 */lib/idds/ + +**Step 3**: Commit your change. The commit command must include a specific message format:: + + $ git commit -m "...." + $ git push origin dev + +**Step 4**: Create the pull request to the **dev** branch of HSF/iDDS repository. + +While using the `github interface `_ is the default interface to create pull requests, you could also use GitHub's command-line wrapper `hub `_ or the `GitHub CLI `_. + +**Step 5**: Watch the pull request for comments and reviews. If there are some conflicts, you may need to rebase your codes and fix the conflicts. For any pull requests update, please try to squash/amend your commits to avoid "in-between" commits:: + + $ git rebase upstream/dev dev + + +Human Review +------------ + +Anyone is welcome to review merge requests and make comments! + +The development team can approve, request changes, or close pull requests. Merging of approved pull requests is done by the iDDS development lead. + + +Coding Style +------------ + +We use flake8 to sanitize our code. Please do the same before submitting a pull request. diff --git a/doma/LICENSE.rst b/doma/LICENSE.rst new file mode 100644 index 00000000..9dab48b1 --- /dev/null +++ b/doma/LICENSE.rst @@ -0,0 +1,14 @@ +Copyright 2019 + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/doma/README.md b/doma/README.md new file mode 100644 index 00000000..eba688fa --- /dev/null +++ b/doma/README.md @@ -0,0 +1,5 @@ +idds-doma +==== + +idds-doma subpackage is for DOMA hep specific functions and plugins. +With it, iDDS can support DOMA LSST workflows. diff --git a/doma/etc/panda/panda.cfg b/doma/etc/panda/panda.cfg new file mode 100644 index 00000000..dc475011 --- /dev/null +++ b/doma/etc/panda/panda.cfg @@ -0,0 +1,9 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Wen Guan, , 2020 + +[panda] +pandaserver = https://ai-idds-03.cern.ch/jobs/ diff --git a/doma/lib/idds/__init__.py b/doma/lib/idds/__init__.py new file mode 100644 index 00000000..865b774e --- /dev/null +++ b/doma/lib/idds/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 diff --git a/doma/lib/idds/doma/__init__.py b/doma/lib/idds/doma/__init__.py new file mode 100644 index 00000000..1bb8ee59 --- /dev/null +++ b/doma/lib/idds/doma/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 diff --git a/doma/lib/idds/doma/version.py b/doma/lib/idds/doma/version.py new file mode 100644 index 00000000..fcdcd3e8 --- /dev/null +++ b/doma/lib/idds/doma/version.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 - 2021 + + +release_version = "0.1.0" diff --git a/doma/lib/idds/doma/workflow/__init__.py b/doma/lib/idds/doma/workflow/__init__.py new file mode 100644 index 00000000..1bb8ee59 --- /dev/null +++ b/doma/lib/idds/doma/workflow/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 diff --git a/doma/lib/idds/doma/workflow/domalsstwork.py b/doma/lib/idds/doma/workflow/domalsstwork.py new file mode 100644 index 00000000..7d08ba24 --- /dev/null +++ b/doma/lib/idds/doma/workflow/domalsstwork.py @@ -0,0 +1,563 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 +# - Sergey Padolski, , 2020 + + +try: + import ConfigParser +except ImportError: + import configparser as ConfigParser + +import copy +import json +import os +import traceback +import uuid +import urllib +import ssl +import datetime + +from pandatools import Client + +from idds.common import exceptions +from idds.common.constants import (TransformType, CollectionType, CollectionStatus, + ContentStatus, ContentType, + ProcessingStatus, WorkStatus) +from idds.workflow.work import Work +from idds.workflow.workflow import Condition +import logging + +DEBUG = False +CACHE_TIMEOUT = 30*60 + + +class DomaCondition(Condition): + def __init__(self, cond=None, current_work=None, true_work=None, false_work=None): + super(DomaCondition, self).__init__(cond=cond, current_work=current_work, + true_work=true_work, false_work=false_work) + + +class DomaLSSTWork(Work): + def __init__(self, executable=None, arguments=None, parameters=None, setup=None, + work_tag='lsst', exec_type='panda', sandbox=None, work_id=None, + primary_input_collection=None, other_input_collections=None, + output_collections=None, log_collections=None, + logger=None, dependency_map=None, task_name="", task_queue=None, processing_type=None, + maxwalltime=90000, maxattempt=5, core_count=1): + + super(DomaLSSTWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, work_type=TransformType.Processing, + work_tag=work_tag, exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + output_collections=output_collections, + log_collections=log_collections, + release_inputs_after_submitting=True, + logger=logger) + self.pandamonitor = None + self.dependency_map = dependency_map + self.logger.setLevel(logging.DEBUG) + self.task_name = task_name + self.queue = task_queue + self.dep_tasks_id_names_map = {} + self.executable = executable + self.jobs_cache = {} + self.processingType = processing_type + self.maxWalltime = maxwalltime + self.maxAttempt = maxattempt + self.core_count = core_count + + def my_condition(self): + if self.is_finished(): + return True + return False + + def put_into_cache(self, input_index, status, job_id): + if status == ContentStatus.Available: + validity = datetime.datetime.now(tz=None) + datetime.timedelta(days=365) + else: + validity = datetime.datetime.now(tz=None) + datetime.timedelta(seconds=CACHE_TIMEOUT) + self.jobs_cache[job_id] = {'status': status, 'validity': validity, 'input_index': input_index} + + def get_from_cache(self, job_id): + obj = self.jobs_cache.get(job_id, None) + if obj: + if obj['validity'] > datetime.datetime.now(tz=None): + return obj['input_index'], obj['status'] + return None + + def jobs_to_idd_ds_status(self, jobstatus): + if jobstatus == 'finished': + return ContentStatus.Available + elif jobstatus == 'failed': + return ContentStatus.Failed + else: + return ContentStatus.Processing + + def load_panda_config(self): + panda_config = ConfigParser.SafeConfigParser() + if os.environ.get('IDDS_PANDA_CONFIG', None): + configfile = os.environ['IDDS_PANDA_CONFIG'] + if panda_config.read(configfile) == [configfile]: + return panda_config + + configfiles = ['%s/etc/panda/panda.cfg' % os.environ.get('IDDS_HOME', ''), + '/etc/panda/panda.cfg', '/opt/idds/etc/panda/panda.cfg', + '%s/etc/panda/panda.cfg' % 
os.environ.get('VIRTUAL_ENV', '')] + for configfile in configfiles: + if panda_config.read(configfile) == [configfile]: + return panda_config + return panda_config + + def load_panda_monitor(self): + panda_config = self.load_panda_config() + self.logger.info("panda config: %s" % panda_config) + if panda_config.has_section('panda'): + if panda_config.has_option('panda', 'pandamonitor'): + pandamonitor = panda_config.get('panda', 'pandamonitor') + return pandamonitor + return None + + def poll_external_collection(self, coll): + try: + if 'status' in coll and coll['status'] in [CollectionStatus.Closed]: + return coll + else: + if 'coll_metadata' not in coll: + coll['coll_metadata'] = {} + coll['coll_metadata']['bytes'] = 1 + coll['coll_metadata']['availability'] = 1 + coll['coll_metadata']['events'] = 1 + coll['coll_metadata']['is_open'] = False + coll['coll_metadata']['run_number'] = 1 + coll['coll_metadata']['did_type'] = 'DATASET' + coll['coll_metadata']['list_all_files'] = False + + if 'is_open' in coll['coll_metadata'] and not coll['coll_metadata']['is_open']: + coll_status = CollectionStatus.Closed + else: + coll_status = CollectionStatus.Open + coll['status'] = coll_status + coll['coll_type'] = CollectionType.Dataset + + return coll + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def get_input_collections(self): + """ + *** Function called by Transformer agent. + """ + colls = [self.primary_input_collection] + self.other_input_collections + for coll_int_id in colls: + coll = self.collections[coll_int_id] + coll = self.poll_external_collection(coll) + self.collections[coll_int_id] = coll + return super(DomaLSSTWork, self).get_input_collections() + + def get_unsubmitted_inputs(self): + not_submitted_inputs = filter(lambda t: not t["submitted"], self.dependency_map) + tasks_to_check = [] + for job in not_submitted_inputs: + tasks_to_check.extend([(input["task"], input["inputname"]) + for input in job["dependencies"] if not input["available"]]) + tasks_to_check_compact = {} + for task in tasks_to_check: + tasks_to_check_compact.setdefault(task[0], set()).add(task[1]) + return tasks_to_check_compact + + def set_dependency_input_available(self, taskname, available_inputs): + for job in self.dependency_map: + for dependency in job["dependencies"]: + if dependency["task"] == taskname and dependency["inputname"] in available_inputs: + dependency["available"] = True + + def update_dependencies(self): + tasks_to_check = self.get_unsubmitted_inputs() + for task, inputs in tasks_to_check.items(): + _, outputs = self.poll_panda_task(task_name=task) + available_inputs = set(list(dict(filter(lambda t: t[1] == ContentStatus.Available, + outputs.items())).keys())) + self.set_dependency_input_available(task, available_inputs) + + def get_ready_inputs(self): + not_submitted_inputs = filter(lambda j: not j["submitted"], self.dependency_map) + files_to_submit = [] + for job in not_submitted_inputs: + unresolved_deps = [input for input in job["dependencies"] if not input["available"]] + if len(unresolved_deps) == 0: + files_to_submit.append(job["name"]) + return files_to_submit + + def check_dependencies(self): + self.update_dependencies() + return self.get_ready_inputs() + + def can_close(self): + not_submitted_inputs = list(filter(lambda t: not t["submitted"], self.dependency_map)) + if len(not_submitted_inputs) == 0: + return True + else: + return False + + def 
get_input_contents(self): + """ + Get all input contents from DDM. + """ + try: + ret_files = [] + coll = self.collections[self.primary_input_collection] + for file in self.dependency_map: + ret_file = {'coll_id': coll['coll_id'] if not DEBUG else None, + 'scope': coll['scope'], + 'name': file['name'], # or a different file name from the dataset name + 'bytes': 1, + 'adler32': '12345678', + 'min_id': 0, + 'max_id': 1, + 'content_type': ContentType.File, + # here events is all events for eventservice, not used here. + 'content_metadata': {'events': 1}} + ret_files.append(ret_file) + return ret_files + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + + def get_mapped_inputs(self, mapped_input_output_maps): + ret = [] + for map_id in mapped_input_output_maps: + inputs = mapped_input_output_maps[map_id]['inputs'] + + # if 'primary' is not set, the first one is the primary input. + primary_input = inputs[0] + for ip in inputs: + if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: + primary_input = ip + ret.append(primary_input) + return ret + + def get_new_input_output_maps(self, mapped_input_output_maps={}): + """ + *** Function called by Transformer agent. + New inputs which are not yet mapped to outputs. + + :param mapped_input_output_maps: Inputs that are already mapped. + """ + inputs = self.get_input_contents() + mapped_inputs = self.get_mapped_inputs(mapped_input_output_maps) + mapped_inputs_scope_name = [ip['name'] for ip in mapped_inputs] + + new_inputs = [] + new_input_output_maps = {} + for ip in inputs: + ip_scope_name = ip['name'] + if ip_scope_name not in mapped_inputs_scope_name: + new_inputs.append(ip) + + # to avoid cheking new inputs if there are no new inputs anymore + if not new_inputs and self.collections[self.primary_input_collection]['status'] in [CollectionStatus.Closed]: + self.set_has_new_inputs(False) + else: + mapped_keys = mapped_input_output_maps.keys() + if mapped_keys: + next_key = max(mapped_keys) + 1 + else: + next_key = 1 + for ip in new_inputs: + out_ip = copy.deepcopy(ip) + if not DEBUG: + out_ip['coll_id'] = self.collections[self.output_collections[0]]['coll_id'] + new_input_output_maps[next_key] = {'inputs': [ip], + 'outputs': [out_ip]} + next_key += 1 + self.logger.debug("get_new_input_output_maps, new_input_output_maps: %s" % str(new_input_output_maps)) + self.collections[self.primary_input_collection]['coll_metadata']['is_open'] = False + self.collections[self.primary_input_collection]['status'] = CollectionStatus.Closed + return new_input_output_maps + + def get_processing(self, input_output_maps): + """ + *** Function called by Transformer agent. + + If there is already an active processing for this work, will do nothing. + If there is no active processings, create_processing will be called. + """ + if self.active_processings: + return self.processings[self.active_processings[0]] + else: + return None + + def create_processing(self, input_output_maps): + """ + *** Function called by Transformer agent. + + :param input_output_maps: new maps from inputs to outputs. + """ + in_files = [] + for map_id in input_output_maps: + # one map is a job which transform the inputs to outputs. 
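+            # For illustration, each entry built by get_new_input_output_maps() above
+            # looks roughly like:
+            #   {1: {'inputs': [{'name': 'file_0', 'scope': ..., 'coll_id': ..., ...}],
+            #        'outputs': [{...}]}}
+            # Only the input names are gathered here; they become the task's pfnList
+            # below, with nFilesPerJob=1 so PanDA creates one job per input file.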
+ inputs = input_output_maps[map_id]['inputs'] + # outputs = input_output_maps[map_id]['outputs'] + for ip in inputs: + in_files.append(ip['name']) + + task_param_map = {} + task_param_map['vo'] = 'wlcg' + task_param_map['site'] = self.queue + task_param_map['workingGroup'] = 'lsst' + task_param_map['nFilesPerJob'] = 1 + task_param_map['nFiles'] = len(in_files) + task_param_map['noInput'] = True + task_param_map['pfnList'] = in_files + task_param_map['taskName'] = self.task_name + task_param_map['userName'] = 'Siarhei Padolski' + task_param_map['taskPriority'] = 900 + task_param_map['architecture'] = '' + task_param_map['transUses'] = '' + task_param_map['transHome'] = None + task_param_map['transPath'] = 'https://atlpan.web.cern.ch/atlpan/bash-c' + task_param_map['processingType'] = self.processingType + task_param_map['prodSourceLabel'] = 'test' + task_param_map['taskType'] = 'test' + task_param_map['coreCount'] = self.core_count + task_param_map['skipScout'] = True + task_param_map['cloud'] = 'US' + task_param_map['inputPreStaging'] = True + task_param_map['prestagingRuleID'] = 123 + task_param_map['nChunksToWait'] = 1 + task_param_map['maxCpuCount'] = self.maxWalltime + task_param_map['maxWalltime'] = self.maxWalltime + task_param_map['maxFailure'] = self.maxAttempt + task_param_map['maxAttempt'] = self.maxAttempt + task_param_map['jobParameters'] = [ + {'type': 'constant', + 'value': self.executable, # noqa: E501 + }, + ] + + proc = {'processing_metadata': {'internal_id': str(uuid.uuid1()), + 'task_id': None, + 'task_param': task_param_map}} + self.add_processing_to_processings(proc) + self.active_processings.append(proc['processing_metadata']['internal_id']) + return proc + + def submit_panda_task(self, processing): + try: + task_param = processing['processing_metadata']['task_param'] + return_code = Client.insertTaskParams(task_param, verbose=True) + if return_code[0] == 0: + return return_code[1][1] + else: + self.logger.error("submit_panda_task, return_code: %s" % str(return_code)) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc())) + return None + + def submit_processing(self, processing): + """ + *** Function called by Carrier agent. 
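+        The task is submitted to PanDA only when no 'task_id' has been recorded yet for
+        this processing; the task id returned by submit_panda_task is then stored in
+        processing_metadata as both 'task_id' and 'workload_id'.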
+ """ + if 'task_id' in processing['processing_metadata'] and processing['processing_metadata']['task_id']: + pass + else: + task_id = self.submit_panda_task(processing) + processing['processing_metadata']['task_id'] = task_id + processing['processing_metadata']['workload_id'] = task_id + + def download_payload_json(self, task_url): + try: + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + req = urllib.request.Request(task_url) + response = urllib.request.urlopen(req, timeout=180, context=ctx).read() + response = json.loads(response) + except Exception or urllib.request.error as e: + raise e + return response + + def jedi_semaphore(self, processing): + task_id = processing['processing_metadata']['task_id'] + task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True) + if task_info[0] != 0: + return False + task_info = task_info[1] + jobs_statistics = task_info.get('statistics', "") + if len(jobs_statistics) == 0: + return False + return True + + def poll_panda_task(self, processing=None, task_name=None): + task_id = None + try: + if not self.pandamonitor: + self.pandamonitor = self.load_panda_monitor() + self.logger.info("panda server: %s" % self.pandamonitor) + task_info = None + jobs_ids = None + if processing: + task_id = processing['processing_metadata']['task_id'] + elif task_name: + task_id = self.dep_tasks_id_names_map.get(task_name, None) + + if task_id: + task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True) + if task_info[0] != 0: + self.logger.error("poll_panda_task, error getting task status, task_info: %s" % str(task_info)) + return "No status", {} + task_info = task_info[1] + jobs_ids = task_info['PandaID'] + elif task_name: + task_url = self.pandamonitor + '/tasks/?taskname=' + str(task_name) + "&status=!aborted&json&days=180" + self.logger.debug("poll_panda_task, task_url: %s" % str(task_url)) + task_json = self.download_payload_json(task_url) + if len(task_json) > 0: + task_info = task_json[0] + self.dep_tasks_id_names_map[task_name] = task_info.get('jeditaskid', None) + else: + return "No status", {} + task_id = task_info.get('jeditaskid', None) + if not task_id: + return "No status", {} + if not jobs_ids: + jobs_ids = Client.getPandaIDsWithTaskID(task_id)[1] + + outputs_status = {} + if len(jobs_ids) > 0: + not_cached_jobs_ids = [] + + for job_id in jobs_ids: + obj = self.get_from_cache(job_id) + if obj: + if obj[0] not in outputs_status or outputs_status[obj[0]] != ContentStatus.Available: + outputs_status[obj[0]] = obj[1] + else: + not_cached_jobs_ids.append(job_id) + + self.logger.debug("poll_panda_task, not_cached_jobs_ids: %s" % str(not_cached_jobs_ids)) + chunksize = 2000 + chunks = [not_cached_jobs_ids[i:i + chunksize] for i in range(0, len(not_cached_jobs_ids), chunksize)] + for chunk in chunks: + jobs_list = Client.getJobStatus(chunk, verbose=0)[1] + for job_info in jobs_list: + if job_info.Files and len(job_info.Files) > 0: + output_index = job_info.Files[0].lfn.split(':')[1] + status = self.jobs_to_idd_ds_status(job_info.jobStatus) + if output_index not in outputs_status or \ + outputs_status[output_index] != ContentStatus.Available: + outputs_status[output_index] = status + self.put_into_cache(output_index, status, job_info.PandaID) + + self.logger.debug("poll_panda_task, task_info: %s" % str(task_info)) + task_status = task_info["status"] + return task_status, outputs_status + except Exception as ex: + if task_id: + msg = "Failed to check the panda task (%s) status: %s" % 
(str(task_id), str(ex)) + else: + msg = "Failed to check the panda task (%s) status: %s" % (task_name, str(ex)) + raise exceptions.IDDSException(msg) + + def poll_processing_updates(self, processing, input_output_maps): + """ + *** Function called by Carrier agent. + """ + updated_contents = [] + update_processing = {} + self.logger.debug("poll_processing_updates, input_output_maps: %s" % str(input_output_maps)) + + if processing: + task_status, outputs_status = self.poll_panda_task(processing=processing) + just_resolved_deps = self.check_dependencies() + self.logger.debug("poll_processing_updates, outputs_status: %s" % str(outputs_status)) + self.logger.debug("poll_processing_updates, task_status: %s" % str(task_status)) + self.logger.debug("poll_processing_updates, files: %s" % str(just_resolved_deps)) + + content_substatus = {'finished': 0, 'unfinished': 0} + for map_id in input_output_maps: + outputs = input_output_maps[map_id]['outputs'] + for content in outputs: + key = content['name'] + if key in outputs_status: + if content.get('substatus', ContentStatus.New) != outputs_status[key]: + updated_content = {'content_id': content['content_id'] if not DEBUG else None, + 'substatus': outputs_status[key]} + updated_contents.append(updated_content) + content['substatus'] = outputs_status[key] + if not DEBUG: + if content['substatus'] == ContentStatus.Available: + content_substatus['finished'] += 1 + else: + content_substatus['unfinished'] += 1 + + inputs = input_output_maps[map_id]['inputs'] + if self.jedi_semaphore(processing): + if inputs[0]['name'] in just_resolved_deps and \ + task_status and task_status in ('pending', 'running', 'finished', 'defined'): + + inputs[0]['substatus'] = ContentStatus.Available + updated_content = { + 'content_id': inputs[0]['content_id'] if not DEBUG else inputs[0]['name'], + 'substatus': ContentStatus.Available + } + updated_contents.append(updated_content) + + if task_status and task_status == 'done' and \ + content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': ProcessingStatus.Finished}} + + self.logger.debug("poll_processing_updates, task: %i, update_processing: %s" % + (processing['processing_metadata']['task_id'], str(update_processing))) + self.logger.debug("poll_processing_updates, task: %i, updated_contents: %s" % + (processing['processing_metadata']['task_id'], str(updated_contents))) + return update_processing, updated_contents + + def get_status_statistics(self, registered_input_output_maps): + status_statistics = {} + for map_id in registered_input_output_maps: + outputs = registered_input_output_maps[map_id]['outputs'] + + for content in outputs: + if content['status'].name not in status_statistics: + status_statistics[content['status'].name] = 0 + status_statistics[content['status'].name] += 1 + self.status_statistics = status_statistics + self.logger.debug("registered_input_output_maps, status_statistics: %s" % str(status_statistics)) + return status_statistics + + def syn_work_status(self, registered_input_output_maps): + self.get_status_statistics(registered_input_output_maps) + self.logger.debug("syn_work_status, self.active_processings: %s" % str(self.active_processings)) + self.logger.debug("syn_work_status, self.has_new_inputs(): %s" % str(self.has_new_inputs())) + self.logger.debug("syn_work_status, coll_metadata_is_open: %s" % + str(self.collections[self.primary_input_collection]['coll_metadata']['is_open'])) + 
self.logger.debug("syn_work_status, primary_input_collection_status: %s" % + str(self.collections[self.primary_input_collection]['status'])) + + if self.is_processings_terminated() and not self.has_new_inputs(): + keys = self.status_statistics.keys() + if ContentStatus.New.name in keys or ContentStatus.Processing.name in keys: + pass + else: + if len(keys) == 1: + if ContentStatus.Available.name in keys: + self.status = WorkStatus.Finished + else: + self.status = WorkStatus.Failed + else: + self.status = WorkStatus.SubFinished diff --git a/doma/setup.cfg b/doma/setup.cfg new file mode 100644 index 00000000..bc0f2d78 --- /dev/null +++ b/doma/setup.cfg @@ -0,0 +1,6 @@ +[metadata] +license_file = LICENSE.rst +description-file = README.md + +[bdist_wheel] +universal = true diff --git a/doma/setup.py b/doma/setup.py new file mode 100644 index 00000000..f333fa72 --- /dev/null +++ b/doma/setup.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 + + +import glob +import io +import os +import re +import sys +from distutils.sysconfig import get_python_lib +from setuptools import setup, find_packages, Distribution +from setuptools.command.install import install + + +current_dir = os.getcwd() +working_dir = os.path.dirname(os.path.realpath(__file__)) +os.chdir(working_dir) + + +with io.open('lib/idds/doma/version.py', "rt", encoding="utf8") as f: + version = re.search(r'release_version = "(.*?)"', f.read()).group(1) + + +with io.open('README.md', "rt", encoding="utf8") as f: + readme = f.read() + + +class OnlyGetScriptPath(install): + def run(self): + self.distribution.install_scripts = self.install_scripts + + +def get_python_bin_path(): + " Get the directory setuptools installs scripts to for current python " + dist = Distribution({'cmdclass': {'install': OnlyGetScriptPath}}) + dist.dry_run = True # not sure if necessary + dist.parse_config_files() + command = dist.get_command_obj('install') + command.ensure_finalized() + command.run() + return dist.install_scripts + + +def get_python_home(): + return sys.exec_prefix + + +def get_data_path(): + return sys.prefix + + +def get_reqs_from_file(requirements_file): + if os.path.exists(requirements_file): + return open(requirements_file, 'r').read().split('\n') + return [] + + +def parse_requirements(requirements_files): + requirements = [] + for requirements_file in requirements_files: + for line in get_reqs_from_file(requirements_file): + line = line.split('#')[0] + line = line.strip() + if line.startswith('- ') and not line.endswith(':'): + line = line.replace('- ', '').strip() + if len(line) and 'python==' not in line: + requirements.append(line) + return requirements + + +install_lib_path = get_python_lib() +install_bin_path = get_python_bin_path() +install_home_path = get_python_home() +install_data_path = get_data_path() + +requirements_files = ['tools/env/environment.yml'] +install_requires = parse_requirements(requirements_files=requirements_files) +data_files = [] +scripts = glob.glob('bin/*') + +setup( + name="idds-doma", + version=version, + description='intelligent Data Delivery Service(iDDS) Package', + long_description=readme, + long_description_content_type='text/markdown', + license='GPL', + author='IRIS-HEP Team', + author_email='atlas-adc-panda@cern.ch', + python_requires='>=3.6', + 
packages=find_packages('lib/'), + package_dir={'': 'lib'}, + install_requires=install_requires, + include_package_data=True, + data_files=data_files, + scripts=scripts, + project_urls={ + 'Documentation': 'https://github.com/HSF/iDDS/wiki', + 'Source': 'https://github.com/HSF/iDDS', + }, + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'Natural Language :: English', + 'Programming Language :: Python :: 3.6', + ], +) + +os.chdir(current_dir) diff --git a/doma/tools/env/environment.yml b/doma/tools/env/environment.yml new file mode 100644 index 00000000..e9b99529 --- /dev/null +++ b/doma/tools/env/environment.yml @@ -0,0 +1,14 @@ +name: iDDS-doma +dependencies: +- python==3.6 +- pip +- pip: + - futures # multiple process/threads + - unittest2 # unit test tool + - pep8 # checks for PEP8 code style compliance + - flake8 # Wrapper around PyFlakes&pep8 + - pytest # python testing tool + - nose # nose test tools + - panda-client # panda client + - idds-common + - idds-workflow diff --git a/main/etc/idds/idds.cfg.template b/main/etc/idds/idds.cfg.template index 8f9e0803..e6e4f124 100755 --- a/main/etc/idds/idds.cfg.template +++ b/main/etc/idds/idds.cfg.template @@ -19,7 +19,8 @@ loglevel = DEBUG # aipanda180 condor # aipanda187 monitor(can be reused) # aipanda160, 161, 162 (new vms) - +# doma aipanda015, aipanda016, and aipanda017 +# [database] #default = mysql://idds:idds@pcuwvirt5.cern.ch/idds #default = mysql://idds:idds_passwd@aipanda182.cern.ch/idds @@ -46,22 +47,23 @@ cacher_dir = /data/idds #plugin.. = [main] -agents = clerk, transporter, transformer, carrier, conductor +agents = clerk, marshaller, transformer, carrier, conductor [clerk] num_threads = 1 poll_time_period = 5 retrieve_bulk_size = 10 #plugin_sequence = collection_lister -plugin.collection_lister = idds.atlas.rucio.collection_lister.CollectionLister + +[marshaller] +num_threads = 1 +poll_time_period = 5 +retrieve_bulk_size = 10 [transformer] num_threads = 1 poll_time_period = 5 retrieve_bulk_size = 10 -plugin.stagein_transformer = idds.atlas.transformer.stagein_transformer.StageInTransformer -plugin.activelearning_transformer = idds.atlas.transformer.activelearning_transformer.ActiveLearningTransformer -plugin.hyperparameteropt_transformer = idds.atlas.transformer.hyperparameteropt_transformer.HyperParameterOptTransformer [transporter] num_threads = 1 @@ -70,56 +72,19 @@ poll_time_period = 5 poll_input_time_period = 600 poll_output_time_period = 5 retrieve_bulk_size = 10 -plugin.collection_metadata_reader = idds.atlas.rucio.collection_metadata_reader.CollectionMetadataReader -plugin.contents_lister = idds.atlas.rucio.contents_lister.ContentsLister -plugin.contents_register = idds.atlas.rucio.contents_register.ContentsRegister [carrier] num_threads = 1 poll_time_period = 5 retrieve_bulk_size = 10 message_bulk_size = 2000 -plugin.stagein_submitter = idds.atlas.processing.stagein_submitter.StageInSubmitter -plugin.stagein_submitter.poll_time_period = 5 -plugin.stagein_submitter.plugin.rule_submitter = idds.atlas.rucio.rule_submitter.RuleSubmitter -#7 * 24 * 3600 -plugin.stagein_submitter.plugin.rule_submitter.lifetime = 604800 -plugin.stagein_poller = idds.atlas.processing.stagein_poller.StageInPoller -plugin.stagein_poller.poll_time_period = 1800 -plugin.stagein_poller.plugin.rule_poller = idds.atlas.rucio.rule_poller.RulePoller -# 4 * 24 * 3600 -plugin.stagein_poller.plugin.rule_poller.default_max_waiting_time = 345600 
-plugin.stagein_poller.plugin.rule_poller.check_all_rules_for_new_rule = True -plugin.stagein_poller.plugin.rule_poller.new_rule_lifetime = 604800 -plugin.stagein_poller.plugin.rule_creator = idds.atlas.rucio.rule_creator.RuleCreator -# new_rule_lifetime will overwrite this one -plugin.stagein_poller.plugin.rule_creator.lifetime = 604800 - -plugin.activelearning_submitter = idds.atlas.processing.activelearning_condor_submitter.ActiveLearningCondorSubmitter -plugin.activelearning_submitter.workdir = /data/idds_processing -plugin.activelearning_poller = idds.atlas.processing.activelearning_condor_poller.ActiveLearningCondorPoller -plugin.activelearning_poller.workdir = /data/idds_processing - -plugin.hyperparameteropt_submitter = idds.atlas.processing.hyperparameteropt_condor_submitter.HyperParameterOptCondorSubmitter -plugin.hyperparameteropt_submitter.workdir = /data/idds_processing -plugin.hyperparameteropt_submitter.max_unevaluated_points = 10 -plugin.hyperparameteropt_submitter.min_unevaluated_points = 2 - -plugin.hyperparameteropt_submitter.nevergrad.executable = docker -plugin.hyperparameteropt_submitter.nevergrad.arguments = run -v $(pwd):/data wguanicedew/idds_hpo_nevergrad python /opt/hyperparameteropt_nevergrad.py --max_points=%%MAX_POINTS --num_points=%%NUM_POINTS --input=/data/%%IN --output=/data/%%OUT -plugin.hyperparameteropt_submitter.nevergrad.output_json = output.json -plugin.hyperparameteropt_submitter.nevergrad.should_transfer_executable = False -plugin.hyperparameteropt_submitter.bayesian.executable = /opt/idds/lib/python3.6/site-packages/idds/atlas/processing/hyperparameteropt_bayesian.py -plugin.hyperparameteropt_submitter.bayesian.arguments = --max_points %%MAX_POINTS --num_points %%NUM_POINTS --input %%IN --output %%OUT -plugin.hyperparameteropt_submitter.bayesian.output_json = output.json -plugin.hyperparameteropt_submitter.bayesian.should_transfer_executable = True +atlashpowork.workdir = /data/idds_processing +atlashpowork.input_json = idds_input.json +atlashpowork.output_json = idds_output.json -plugin.hyperparameteropt_poller = idds.atlas.processing.hyperparameteropt_condor_poller.HyperParameterOptCondorPoller -plugin.hyperparameteropt_poller.workdir = /data/idds_processing -plugin.hyperparameteropt_poller.max_unevaluated_points = 20 -plugin.hyperparameteropt_poller.min_unevaluated_points = 2 -plugin.hyperparameteropt_poller.max_life_time = 1209600 +atlashpowork.nevergrad.executable = docker +atlashpowork.nevergrad.arguments = run -v $(pwd):/data wguanicedew/idds_hpo_nevergrad python /opt/hyperparameteropt_nevergrad.py --max_points=%%MAX_POINTS --num_points=%%NUM_POINTS --input=/data/%%IN --output=/data/%%OUT [conductor] retrieve_bulk_size = 10 diff --git a/main/etc/idds/supervisord.d/idds.ini b/main/etc/idds/supervisord.d/idds.ini index 42ef9a4d..77d3ac2b 100644 --- a/main/etc/idds/supervisord.d/idds.ini +++ b/main/etc/idds/supervisord.d/idds.ini @@ -2,6 +2,7 @@ environment = RUCIO_HOME=/opt/idds/, RUCIO_ACCOUNT=pilot, + RUCIO_AUTH_TYPE=x509_proxy, X509_USER_PROXY=/data/atlpilo1/x509up ;command=/opt/idds/bin/run-idds command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" diff --git a/main/etc/sql/oracle_11.sql b/main/etc/sql/oracle_11.sql index 56e8ab23..09694f1b 100644 --- a/main/etc/sql/oracle_11.sql +++ b/main/etc/sql/oracle_11.sql @@ -6,7 +6,19 @@ DROP SEQUENCE PROCESSING_ID_SEQ; DROP SEQUENCE COLLECTION_ID_SEQ; DROP SEQUENCE CONTENT_ID_SEQ; - +delete from HEALTH; +delete from MESSAGES; +delete from CONTENTS; 
+delete from REQ2WORKLOAD; +delete from REQ2TRANSFORMS; +delete from WP2TRANSFORMS; +delete from WORKPROGRESSES; +delete from PROCESSINGS; +delete from COLLECTIONS; +delete from TRANSFORMS; +delete from REQUESTS; + +Drop table HEALTH purge; DROP table MESSAGES purge; DROP table CONTENTS purge; DROP table REQ2WORKLOAD purge; @@ -70,22 +82,23 @@ CREATE TABLE WORKPROGRESSES ( workprogress_id NUMBER(12), request_id NUMBER(12) constraint WORKPROGRESS__REQ_ID_NN NOT NULL, + workload_id NUMBER(10), scope VARCHAR2(25) constraint WORKPROGRESS_SCOPE_NN NOT NULL, name VARCHAR2(255) constraint WORKPROGRESS_NAME_NN NOT NULL, priority NUMBER(7), status NUMBER(2) constraint WORKPROGRESS_STATUS_ID_NN NOT NULL, substatus NUMBER(2), locking NUMBER(2), - created_at DATE DEFAULT ON NULL SYS_EXTRACT_UTC(systimestamp(0)) constraint WORKPROGRESS_CREATED_NN NOT NULL, - updated_at DATE DEFAULT ON NULL SYS_EXTRACT_UTC(systimestamp(0)) constraint WORKPROGRESS_UPDATED_NN NOT NULL, - next_poll_at DATE DEFAULT ON NULL SYS_EXTRACT_UTC(systimestamp(0)) constraint WORKPROGRESS_NEXT_POLL_NN NOT NULL, + created_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)) constraint WORKPROGRESS_CREATED_NN NOT NULL, + updated_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)) constraint WORKPROGRESS_UPDATED_NN NOT NULL, + next_poll_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)) constraint WORKPROGRESS_NEXT_POLL_NN NOT NULL, accessed_at DATE, expired_at DATE, errors VARCHAR2(1024), - workprogress_metadata CLOB constraint WORKPROGRESS_REQUEST_METADATA_ENSURE_JSON CHECK(workprogress_metadata IS JSON(LAX)), - processing_metadata CLOB constraint WORKPROGRESS_PROCESSING_METADATA_ENSURE_JSON CHECK(processing_metadata IS JSON(LAX)), - CONSTRAINT WORKPROGRESS_PK PRIMARY KEY (workprogress_id) USING INDEX LOCAL, - CONSTRAINT WORKPROGRESS_REQ_ID_FK FOREIGN KEY(request_id) REFERENCES REQUESTS(request_id), + workprogress_metadata CLOB, + processing_metadata CLOB, + CONSTRAINT WORKPROGRESS_PK PRIMARY KEY (workprogress_id), --- USING INDEX LOCAL, + CONSTRAINT WORKPROGRESS_REQ_ID_FK FOREIGN KEY(request_id) REFERENCES REQUESTS(request_id) --- CONSTRAINT REQUESTS_NAME_SCOPE_UQ UNIQUE (name, scope, requester, request_type, transform_tag, workload_id) -- USING INDEX LOCAL, ) PCTFREE 3 @@ -104,7 +117,7 @@ CREATE OR REPLACE TRIGGER TRIG_WORKPROGRESS_ID CREATE INDEX WORKPROGRESS_SCOPE_NAME_IDX ON WORKPROGRESSES (name, scope, workprogress_id) LOCAL; --- drop index REQUESTS_STATUS_PRIORITY_IDX -CREATE INDEX WORKPROGRESS_STATUS_PRIORITY_IDX ON WORKPROGRESSES (status, priority, workprogress_id, locking, updated_at, next_poll_at, created_at) LOCAL COMPRESS 1; +CREATE INDEX WORKPROGRESS_STATUS_PRI_IDX ON WORKPROGRESSES (status, priority, workprogress_id, locking, updated_at, next_poll_at, created_at) LOCAL COMPRESS 1; --- transforms @@ -113,6 +126,8 @@ CREATE SEQUENCE TRANSFORM_ID_SEQ MINVALUE 1 INCREMENT BY 1 NOCACHE; CREATE TABLE TRANSFORMS ( transform_id NUMBER(12), + request_id NUMBER(12), + workload_id NUMBER(10), transform_type NUMBER(2) constraint TRANSFORM_TYPE_NN NOT NULL, transform_tag VARCHAR2(20), priority NUMBER(7), @@ -201,6 +216,8 @@ CREATE TABLE PROCESSINGS ( processing_id NUMBER(12), transform_id NUMBER(12) constraint PROCESSINGS_TRANSFORM_ID_NN NOT NULL, + request_id NUMBER(12), + workload_id NUMBER(10), status NUMBER(2) constraint PROCESSINGS_STATUS_ID_NN NOT NULL, substatus NUMBER(2), locking NUMBER(2), @@ -248,6 +265,8 @@ CREATE TABLE COLLECTIONS coll_id NUMBER(14), coll_type NUMBER(2), transform_id NUMBER(12) constraint 
COLLECTION_TRANSFORM_ID_NN NOT NULL, + request_id NUMBER(12), + workload_id NUMBER(10), relation_type NUMBER(2), -- input, output or log of the transform, scope VARCHAR2(25) constraint COLLECTION_SCOPE_NN NOT NULL, name VARCHAR2(255) constraint COLLECTION_NAME_NN NOT NULL, @@ -302,6 +321,8 @@ CREATE TABLE CONTENTS content_id NUMBER(12), transform_id NUMBER(12) constraint CONTENT_TRANSFORM_ID_NN NOT NULL, coll_id NUMBER(14) constraint CONTENT_COLL_ID_NN NOT NULL, + request_id NUMBER(12), + workload_id NUMBER(10), map_id NUMBER(12) DEFAULT 0, scope VARCHAR2(25) constraint CONTENT_SCOPE_NN NOT NULL, name VARCHAR2(255) constraint CONTENT_NAME_NN NOT NULL, @@ -333,7 +354,7 @@ CREATE TABLE CONTENTS ) PCTFREE 3 PARTITION BY RANGE(TRANSFORM_ID) -INTERVAL ( 10000 ) +INTERVAL ( 1000000 ) ( PARTITION initial_part VALUES LESS THAN (1) ); ---PCTFREE 0 @@ -363,6 +384,8 @@ CREATE TABLE MESSAGES substatus NUMBER(2), locking NUMBER(2), source NUMBER(2), + request_id NUMBER(12), + workload_id NUMBER(10), transform_id NUMBER(12), num_contents NUMBER(7), created_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), @@ -381,16 +404,53 @@ CREATE OR REPLACE TRIGGER TRIG_MESSAGE_ID / +--- health +CREATE SEQUENCE HEALTH_ID_SEQ MINVALUE 1 INCREMENT BY 1 START WITH 1 NOCACHE NOORDER NOCYCLE; +CREATE TABLE HEALTH +( + health_id NUMBER(12), + agent VARCHAR2(30), + hostname VARCHAR2(127), + pid Number(12), + thread_id Number(20), + thread_name VARCHAR2(255), + payload VARCHAR2(255), + created_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + updated_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + CONSTRAINT HEALTH_PK PRIMARY KEY (health_id), -- USING INDEX LOCAL, + CONSTRAINT HEALTH_UQ UNIQUE (agent, hostname, pid, thread_id) -- USING INDEX LOCAL +); + +CREATE OR REPLACE TRIGGER TRIG_HEALTH_ID + BEFORE INSERT + ON HEALTH + FOR EACH ROW + BEGIN + :NEW.health_id := HEALTH_ID_SEQ.NEXTVAL ; + END; + / + + +SELECT cols.table_name, cols.column_name, cols.position, cons.status, cons.owner +FROM all_constraints cons, all_cons_columns cols +WHERE cols.table_name = 'HEALTH' +AND cons.constraint_type = 'P' +AND cons.constraint_name = cols.constraint_name +AND cons.owner = cols.owner +ORDER BY cols.table_name, cols.position; select r.request_id, r.scope, r.name, r.status, tr.transform_id, tr.transform_status, tr.in_status, tr.in_total_files, tr.in_processed_files, tr.out_status, tr.out_total_files, tr.out_processed_files from requests r - full outer join req2transforms rt on (r.request_id=rt.request_id) + full outer join ( + select request_id, workprogress_id from workprogresses + ) wp on (r.request_id=wp.request_id) + full outer join wp2transforms wt on (wp.workprogress_id=wt.workprogress_id) full outer join ( select t.transform_id, t.status transform_status, in_coll.status in_status, in_coll.total_files in_total_files, in_coll.processed_files in_processed_files, out_coll.status out_status, out_coll.total_files out_total_files, out_coll.processed_files out_processed_files from transforms t full outer join (select coll_id , transform_id, status, total_files, processed_files from collections where relation_type = 0) in_coll on (t.transform_id = in_coll.transform_id) full outer join (select coll_id , transform_id, status, total_files, processed_files from collections where relation_type = 1) out_coll on (t.transform_id = out_coll.transform_id) - ) tr on (rt.transform_id=tr.transform_id) - + ) tr on (wt.transform_id=tr.transform_id) +order by r.request_id diff --git a/main/etc/sql/oracle_19.sql b/main/etc/sql/oracle_19.sql index 
eca31848..b6045880 100644 --- a/main/etc/sql/oracle_19.sql +++ b/main/etc/sql/oracle_19.sql @@ -6,7 +6,19 @@ DROP SEQUENCE PROCESSING_ID_SEQ; DROP SEQUENCE COLLECTION_ID_SEQ; DROP SEQUENCE CONTENT_ID_SEQ; +delete from HEALTH; +delete from MESSAGES; +delete from CONTENTS; +delete from REQ2WORKLOAD; +delete from REQ2TRANSFORMS; +delete from WP2TRANSFORMS; +delete from WORKPROGRESSES; +delete from PROCESSINGS; +delete from COLLECTIONS; +delete from TRANSFORMS; +delete from REQUESTS; +Drop table HEALTH purge; DROP table MESSAGES purge; DROP table CONTENTS purge; DROP table REQ2WORKLOAD purge; @@ -60,6 +72,7 @@ CREATE TABLE WORKPROGRESSES ( workprogress_id NUMBER(12) DEFAULT ON NULL WORKPROGRESS_ID_SEQ.NEXTVAL constraint WORKPROGRESS_ID_NN NOT NULL, request_id NUMBER(12) constraint WORKPROGRESS__REQ_ID_NN NOT NULL, + workload_id NUMBER(10), scope VARCHAR2(25) constraint WORKPROGRESS_SCOPE_NN NOT NULL, name VARCHAR2(255) constraint WORKPROGRESS_NAME_NN NOT NULL, priority NUMBER(7), @@ -93,6 +106,8 @@ CREATE SEQUENCE TRANSFORM_ID_SEQ MINVALUE 1 INCREMENT BY 1 ORDER NOCACHE NOCYCLE CREATE TABLE TRANSFORMS ( transform_id NUMBER(12) DEFAULT ON NULL TRANSFORM_ID_SEQ.NEXTVAL constraint TRANSFORM_ID_NN NOT NULL, + request_id NUMBER(12), + workload_id NUMBER(10), transform_type NUMBER(2) constraint TRANSFORM_TYPE_NN NOT NULL, transform_tag VARCHAR2(20), priority NUMBER(7), @@ -162,6 +177,8 @@ CREATE TABLE PROCESSINGS ( processing_id NUMBER(12) DEFAULT ON NULL PROCESSING_ID_SEQ.NEXTVAL constraint PROCESSING_ID_NN NOT NULL, transform_id NUMBER(12) constraint PROCESSINGS_TRANSFORM_ID_NN NOT NULL, + request_id NUMBER(12), + workload_id NUMBER(10), status NUMBER(2) constraint PROCESSINGS_STATUS_ID_NN NOT NULL, substatus NUMBER(2), locking NUMBER(2), @@ -193,6 +210,8 @@ CREATE TABLE COLLECTIONS coll_id NUMBER(14) DEFAULT ON NULL COLLECTION_ID_SEQ.NEXTVAL constraint COLLECTIONS_ID_NN NOT NULL, coll_type NUMBER(2), transform_id NUMBER(12) constraint COLLECTION_TRANSFORM_ID_NN NOT NULL, + request_id NUMBER(12), + workload_id NUMBER(10), relation_type NUMBER(2), -- input, output or log of the transform, scope VARCHAR2(25) constraint COLLECTION_SCOPE_NN NOT NULL, name VARCHAR2(255) constraint COLLECTION_NAME_NN NOT NULL, @@ -232,6 +251,8 @@ CREATE TABLE CONTENTS content_id NUMBER(12) DEFAULT ON NULL CONTENT_ID_SEQ.NEXTVAL constraint CONTENT_ID_NN NOT NULL, transform_id NUMBER(12) constraint CONTENT_TRANSFORM_ID_NN NOT NULL, coll_id NUMBER(14) constraint CONTENT_COLL_ID_NN NOT NULL, + request_id NUMBER(12), + workload_id NUMBER(10), map_id NUMBER(12) DEFAULT 0, scope VARCHAR2(25) constraint CONTENT_SCOPE_NN NOT NULL, name VARCHAR2(255) constraint CONTENT_NAME_NN NOT NULL, @@ -277,6 +298,8 @@ CREATE TABLE MESSAGES substatus NUMBER(2), locking NUMBER(2), source NUMBER(2), + request_id NUMBER(12), + workload_id NUMBER(10), transform_id NUMBER(12), num_contents NUMBER(7), created_at DATE DEFAULT ON NULL SYS_EXTRACT_UTC(systimestamp(0)), @@ -284,3 +307,21 @@ CREATE TABLE MESSAGES msg_content CLOB constraint MSG_CONTENT_ENSURE_JSON CHECK(msg_content IS JSON(LAX)), CONSTRAINT MESSAGES_PK PRIMARY KEY (msg_id) -- USING INDEX LOCAL, ); + + +--- health +CREATE SEQUENCE HEALTH_ID_SEQ MINVALUE 1 INCREMENT BY 1 START WITH 1 NOCACHE ORDER NOCYCLE GLOBAL; +CREATE TABLE HEALTH +( + health_id NUMBER(12) DEFAULT ON NULL HEALTH_ID_SEQ.NEXTVAL constraint HEALTH_ID_NN NOT NULL, + agent VARCHAR2(30), + hostname VARCHAR2(127), + pid Number(12), + thread_id Number(20), + thread_name VARCHAR2(255), + payload VARCHAR2(255), + created_at DATE 
DEFAULT ON NULL SYS_EXTRACT_UTC(systimestamp(0)), + updated_at DATE DEFAULT ON NULL SYS_EXTRACT_UTC(systimestamp(0)), + CONSTRAINT HEALTH_PK PRIMARY KEY (health_id), -- USING INDEX LOCAL, + CONSTRAINT HEALTH_UQ UNIQUE (agent, hostname, pid, thread_id) +); diff --git a/main/lib/idds/agents/carrier/carrier.py b/main/lib/idds/agents/carrier/carrier.py index 3d372442..d834164a 100644 --- a/main/lib/idds/agents/carrier/carrier.py +++ b/main/lib/idds/agents/carrier/carrier.py @@ -62,12 +62,19 @@ def get_new_processings(self): return processings def process_new_processing(self, processing): + # transform_id = processing['transform_id'] + # transform = core_transforms.get_transform(transform_id=transform_id) + # work = transform['transform_metadata']['work'] work = processing['processing_metadata']['work'] - work.submit_processing() - return {'processing_id': processing['processing_id'], - 'status': ProcessingStatus.Submitted, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period), - 'processing_metadata': processing['processing_metadata']} + # work.set_agent_attributes(self.agent_attributes) + work.submit_processing(processing) + ret = {'processing_id': processing['processing_id'], + 'status': ProcessingStatus.Submitted, + 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period), + 'processing_metadata': processing['processing_metadata']} + if processing['processing_metadata'] and 'workload_id' in processing['processing_metadata']: + ret['workload_id'] = processing['processing_metadata']['workload_id'] + return ret def process_new_processings(self): ret = [] @@ -100,7 +107,8 @@ def get_running_processings(self): """ Get running processing """ - processing_status = [ProcessingStatus.Submitting, ProcessingStatus.Submitted, ProcessingStatus.Running, ProcessingStatus.FinishedOnExec] + processing_status = [ProcessingStatus.Submitting, ProcessingStatus.Submitted, ProcessingStatus.Running, ProcessingStatus.FinishedOnExec, + ProcessingStatus.ToCancel, ProcessingStatus.Cancelling] processings = core_processings.get_processings_by_status(status=processing_status, # time_period=self.poll_time_period, locking=True, @@ -110,12 +118,37 @@ def get_running_processings(self): self.logger.info("Main thread get %s [submitting + submitted + running] processings to process: %s" % (len(processings), str([processing['processing_id'] for processing in processings]))) return processings + def get_collection_ids(self, collections): + coll_ids = [] + for coll in collections: + coll_ids.append(coll['coll_id']) + return coll_ids + def process_running_processing(self, processing): transform_id = processing['transform_id'] - input_output_maps = core_transforms.get_transform_input_output_maps(transform_id) + # transform = core_transforms.get_transform(transform_id=transform_id) + # work = transform['transform_metadata']['work'] work = processing['processing_metadata']['work'] + + input_collections = work.get_input_collections() + output_collections = work.get_output_collections() + log_collections = work.get_log_collections() + + input_coll_ids = self.get_collection_ids(input_collections) + output_coll_ids = self.get_collection_ids(output_collections) + log_coll_ids = self.get_collection_ids(log_collections) + + input_output_maps = core_transforms.get_transform_input_output_maps(transform_id, + input_coll_ids=input_coll_ids, + output_coll_ids=output_coll_ids, + log_coll_ids=log_coll_ids) + + if processing['status'] in [ProcessingStatus.ToCancel]: + 
work.abort_processing(processing) + + # work = processing['processing_metadata']['work'] # outputs = work.poll_processing() - processing_update, content_updates = work.poll_processing_updates(input_output_maps) + processing_update, content_updates = work.poll_processing_updates(processing, input_output_maps) if processing_update: processing_update['parameters']['locking'] = ProcessingLocking.Idle @@ -146,10 +179,10 @@ def finish_running_processings(self): while not self.running_output_queue.empty(): processing = self.running_output_queue.get() if processing: - self.logger.info("Main thread processing(processing_id: %s) status changed to %s" % (processing['processing_updates']['processing_id'], - processing['processing_updates']['parameters']['status'])) + self.logger.info("Main thread processing(processing_id: %s) updates: %s" % (processing['processing_update']['processing_id'], + processing['processing_update']['parameters'])) - self.logger.info("Main thread finishing running processing %s" % str(processing)) + # self.logger.info("Main thread finishing running processing %s" % str(processing)) core_processings.update_processing_contents(processing_update=processing['processing_update'], content_updates=processing['content_updates']) @@ -167,6 +200,8 @@ def run(self): self.load_plugins() self.init() + self.add_default_tasks() + task = self.create_task(task_func=self.get_new_processings, task_output_queue=self.new_task_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) self.add_task(task) task = self.create_task(task_func=self.process_new_processings, task_output_queue=self.new_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index 3cd16380..f316ba85 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -17,10 +17,10 @@ from Queue import Queue from idds.common.constants import (Sections, RequestStatus, RequestLocking, - WorkprogressStatus) + TransformStatus, WorkStatus) from idds.common.utils import setup_logging from idds.core import (requests as core_requests, - workprogress as core_workprogress) + transforms as core_transforms) from idds.agents.common.baseagent import BaseAgent setup_logging(__name__) @@ -62,43 +62,43 @@ def get_new_requests(self): def process_new_request(self, req): try: - req_id = req['request_id'] - # workload_id = req['workload_id'] workflow = req['request_metadata']['workflow'] - workflows = workflow.get_exact_workflows() - existed_wps = core_workprogress.get_workprogresses(req_id) - existed_workflows = [wp['workprogress_metadata']['workflow'] for wp in existed_wps] - new_workflows = [] - for wf in workflows: - if wf not in existed_workflows: - new_workflows.append(wf) - - wps = [] - for wf in new_workflows: - primary_init_collection = wf.get_primary_initial_collection() - workprogress = {'request_id': req_id, - 'scope': primary_init_collection['scope'], - 'name': primary_init_collection['name'], - 'priority': req['priority'], - 'status': req['status'], - 'locking': req['locking'], - 'expired_at': req['expired_at'], - 'errors': None, - 'workprogress_metadata': {'workflow': wf}, - 'processing_metadata': None} - wps.append(workprogress) + + wf = workflow.copy() + works = wf.get_new_works() + transforms = [] + for work in works: + new_work = work.copy() + new_work.add_proxy(wf.get_proxy()) + transform = {'request_id': req['request_id'], + 'workload_id': req['workload_id'], + 'transform_type': 
work.get_work_type(), + 'transform_tag': work.get_work_tag(), + 'priority': req['priority'], + 'status': TransformStatus.New, + 'retries': 0, + 'expired_at': req['expired_at'], + 'transform_metadata': {'orginal_work': work, 'work': new_work} + # 'collections': related_collections + } + transforms.append(transform) + self.logger.info("Processing request(%s): new transforms: %s" % (req['request_id'], + str(transforms))) + processing_metadata = req['processing_metadata'] + processing_metadata = {'workflow': wf} ret_req = {'request_id': req['request_id'], - 'status': RequestStatus.Transforming, - 'processing_metadata': {'total_workprogresses': len(workflows), - 'new_workprogresses': len(new_workflows)}, - 'new_workprogresses': wps} + 'parameters': {'status': RequestStatus.Transforming, + 'locking': RequestLocking.Idle, + 'processing_metadata': processing_metadata}, + 'new_transforms': transforms} except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) ret_req = {'request_id': req['request_id'], - 'status': RequestStatus.Failed, - 'errors': {'msg': '%s: %s' % (ex, traceback.format_exc())}} + 'parameters': {'status': RequestStatus.Failed, + 'locking': RequestLocking.Idle, + 'errors': {'msg': '%s: %s' % (ex, traceback.format_exc())}}} return ret_req def process_new_requests(self): @@ -124,23 +124,21 @@ def finish_new_requests(self): try: req = self.new_output_queue.get() self.logger.info("Main thread finished processing requst: %s" % req) - parameter = {'status': req['status'], 'locking': RequestLocking.Idle} + req['parameters']['locking'] = RequestLocking.Idle - if 'processing_metadata' not in req or not req['processing_metadata']: - processing_metadata = {} + if 'new_transforms' in req: + new_transforms = req['new_transforms'] else: - processing_metadata = req['processing_metadata'] - - parameter['processing_metadata'] = processing_metadata - - if 'errors' in req: - parameter['errors'] = req['errors'] + new_transforms = [] - if 'new_workprogresses' in req: - new_workprogresses = req['new_workprogresses'] + if 'update_transforms' in req: + update_transforms = req['update_transforms'] else: - new_workprogresses = [] - core_requests.update_request_with_workprogresses(req['request_id'], parameter, new_workprogresses) + update_transforms = [] + + core_requests.update_request_with_transforms(req['request_id'], req['parameters'], + new_transforms=new_transforms, + update_transforms=update_transforms) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) @@ -149,7 +147,7 @@ def get_running_requests(self): """ Get running requests """ - req_status = [RequestStatus.Transforming] + req_status = [RequestStatus.Transforming, RequestStatus.ToCancel, RequestStatus.Cancelling] reqs = core_requests.get_requests_by_status_type(status=req_status, time_period=self.poll_time_period, locking=True, bulk_size=self.retrieve_bulk_size) @@ -162,43 +160,97 @@ def process_running_request(self, req): """ process running request """ - wps = core_workprogress.get_workprogresses(request_id=req['request_id']) - wps_status = {} - for wp in wps: - status_name = wp['status'].name - if status_name not in wps_status: - wps_status[status_name] = 1 - else: - wps_status[status_name] += 1 + self.logger.info("process_running_request: request_id: %s" % req['request_id']) processing_metadata = req['processing_metadata'] - - processing_metadata['workprogresses_status'] = wps_status - - wps_status_keys = list(wps_status.keys()) - if len(wps_status_keys) == 0: - ret_req = 
{'request_id': req['request_id'], - 'status': RequestStatus.Failed, - 'processing_metadata': processing_metadata, - 'errors': {'msg': 'No transforms founded(no collections founded)'} - } - elif len(wps_status_keys) == 1: - if wps_status_keys[0] in [WorkprogressStatus.New, WorkprogressStatus.New.name, - WorkprogressStatus.Transforming, WorkprogressStatus.Transforming.name, - WorkprogressStatus.Extend, WorkprogressStatus.Extend.name]: - ret_req = {'request_id': req['request_id'], - 'status': RequestStatus.Transforming, - 'processing_metadata': processing_metadata - } + wf = processing_metadata['workflow'] + + new_transforms = [] + if req['status'] in [RequestStatus.Transforming]: + # new works + works = wf.get_new_works() + for work in works: + new_work = work.copy() + new_work.add_proxy(wf.get_proxy()) + new_transform = {'request_id': req['request_id'], + 'workload_id': req['workload_id'], + 'transform_type': work.get_work_type(), + 'transform_tag': work.get_work_tag(), + 'priority': req['priority'], + 'status': TransformStatus.New, + 'retries': 0, + 'expired_at': req['expired_at'], + 'transform_metadata': {'orginal_work': work, 'work': new_work} + # 'collections': related_collections + } + new_transforms.append(new_transform) + self.logger.info("Processing request(%s): new transforms: %s" % (req['request_id'], + str(new_transforms))) + + update_transforms = {} + if req['status'] in [RequestStatus.ToCancel]: + # current works + works = wf.get_current_works() + # print(works) + for work in works: + if work.get_status() not in [WorkStatus.Finished, WorkStatus.SubFinished, + WorkStatus.Failed, WorkStatus.Cancelling, + WorkStatus.Cancelled]: + update_transforms[work.get_work_id()] = {'status': TransformStatus.ToCancel} + + # current works + works = wf.get_current_works() + # print(works) + for work in works: + # print(work.get_work_id()) + tf = core_transforms.get_transform(transform_id=work.get_work_id()) + transform_work = tf['transform_metadata']['work'] + # work_status = WorkStatus(tf['status'].value) + # work.set_status(work_status) + work.sync_work_data(transform_work) + + if wf.is_terminated(): + if wf.is_finished(): + req_status = RequestStatus.Finished + elif wf.is_subfinished(): + req_status = RequestStatus.SubFinished + elif wf.is_failed(): + req_status = RequestStatus.Failed + elif wf.is_cancelled(): + req_status = RequestStatus.Cancelled else: - ret_req = {'request_id': req['request_id'], - 'status': dict(RequestStatus.__members__)[wps_status_keys[0]], - 'processing_metadata': processing_metadata - } + req_status = RequestStatus.Failed + req_msg = wf.get_terminated_msg() else: - ret_req = {'request_id': req['request_id'], - 'status': RequestStatus.Transforming, - 'processing_metadata': processing_metadata - } + req_status = RequestStatus.Transforming + req_msg = None + + parameters = {'status': req_status, + 'locking': RequestLocking.Idle, + 'processing_metadata': processing_metadata, + 'errors': {'msg': req_msg}} + ret = {'request_id': req['request_id'], + 'parameters': parameters, + 'new_transforms': new_transforms, + 'update_transforms': update_transforms} + return ret + + def process_tocancel_request(self, req): + """ + process ToCancel request + """ + tfs = core_transforms.get_transforms(request_id=req['request_id']) + tfs_status = {} + for tf in tfs: + if tf['status'] not in [RequestStatus.Finished, RequestStatus.SubFinished, + RequestStatus.Failed, RequestStatus.Cancelling, + RequestStatus.Cancelled]: + tfs_status[tf['request_id']] = {'status': RequestStatus.ToCancel} + + 
ret_req = {'request_id': req['request_id'], + 'parameters': {'status': RequestStatus.Cancelling, + 'locking': RequestLocking.Idle}, + 'update_transforms': tfs_status + } return ret_req def process_running_requests(self): @@ -210,8 +262,13 @@ def process_running_requests(self): try: req = self.running_task_queue.get() if req: - self.logger.info("Main thread processing running requst: %s" % req) - ret_req = self.process_running_request(req) + if req['status'] in [RequestStatus.Transforming, RequestStatus.Cancelling]: + self.logger.info("Main thread processing running requst: %s" % req) + ret_req = self.process_running_request(req) + elif req['status'] in [RequestStatus.ToCancel]: + self.logger.info("Main thread processing ToCancel requst: %s" % req) + ret_req = self.process_tocancel_request(req) + if ret_req: ret.append(ret_req) except Exception as ex: @@ -223,11 +280,21 @@ def finish_running_requests(self): while not self.running_output_queue.empty(): req = self.running_output_queue.get() self.logger.info("finish_running_requests: req: %s" % req) - parameter = {'locking': RequestLocking.Idle} - for key in ['status', 'errors', 'request_metadata', 'processing_metadata']: - if key in req: - parameter[key] = req[key] - core_requests.update_request(req['request_id'], parameter) + req['parameters']['locking'] = RequestLocking.Idle + + if 'new_transforms' in req: + new_transforms = req['new_transforms'] + else: + new_transforms = [] + + if 'update_transforms' in req: + update_transforms = req['update_transforms'] + else: + update_transforms = [] + + core_requests.update_request_with_transforms(req['request_id'], req['parameters'], + new_transforms=new_transforms, + update_transforms=update_transforms) def clean_locks(self): self.logger.info("clean locking") @@ -242,6 +309,8 @@ def run(self): self.load_plugins() + self.add_default_tasks() + task = self.create_task(task_func=self.get_new_requests, task_output_queue=self.new_task_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) self.add_task(task) task = self.create_task(task_func=self.process_new_requests, task_output_queue=self.new_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) diff --git a/main/lib/idds/agents/common/baseagent.py b/main/lib/idds/agents/common/baseagent.py index 832ba7a8..cbf83ad8 100644 --- a/main/lib/idds/agents/common/baseagent.py +++ b/main/lib/idds/agents/common/baseagent.py @@ -8,12 +8,17 @@ # Authors: # - Wen Guan, , 2019 +import os +import socket +import threading -from idds.common.constants import (Sections, TransformType, - MessageType, MessageStatus, MessageSource) +from idds.common.constants import Sections +from idds.common.constants import (MessageType, MessageTypeStr, + MessageStatus, MessageSource) from idds.common.plugin.plugin_base import PluginBase from idds.common.plugin.plugin_utils import load_plugins, load_plugin_sequence from idds.common.utils import setup_logging +from idds.core import health as core_health, messages as core_messages from idds.agents.common.timerscheduler import TimerScheduler @@ -37,9 +42,34 @@ def __init__(self, num_threads=1, **kwargs): for key in kwargs: setattr(self, key, kwargs[key]) + if not hasattr(self, 'heartbeat_delay'): + self.heartbeat_delay = 600 + self.plugins = {} self.plugin_sequence = [] + self.agent_attributes = self.load_agent_attributes(kwargs) + + self.logger.info("agent_attributes: %s" % self.agent_attributes) + + def get_name(self): + return self.name + + def load_agent_attributes(self, kwargs): + rets = {} + for key in 
kwargs: + if '.' not in key: + continue + key_items = key.split('.') + + ret_items = rets + for item in key_items[:-1]: + if item not in ret_items: + ret_items[item] = {} + ret_items = ret_items[item] + ret_items[key_items[-1]] = kwargs[key] + return rets + def load_plugin_sequence(self): self.plugin_sequence = load_plugin_sequence(self.config_section) @@ -73,45 +103,50 @@ def __call__(self): def terminate(self): self.stop() - def generate_file_message(self, transform, files): - if not files: - return [] - - updated_files_message = [] - for file in files: - updated_file_message = {'scope': file['scope'], - 'name': file['name'], - 'path': file['path'], - 'status': file['status'].name} - updated_files_message.append(updated_file_message) - - workload_id = None - if 'workload_id' in transform['transform_metadata']: - workload_id = transform['transform_metadata']['workload_id'] - - if transform['transform_type'] in [TransformType.StageIn, TransformType.StageIn.value]: - msg_type = 'file_stagein' - msg_type_c = MessageType.StageInFile - elif transform['transform_type'] in [TransformType.ActiveLearning, TransformType.ActiveLearning.value]: - msg_type = 'file_activelearning' - msg_type_c = MessageType.ActiveLearningFile - elif transform['transform_type'] in [TransformType.HyperParameterOpt, TransformType.HyperParameterOpt.value]: - msg_type = 'file_hyperparameteropt' - msg_type_c = MessageType.HyperParameterOptFile - else: - msg_type = 'file_unknown' - msg_type_c = MessageType.UnknownFile - - msg_content = {'msg_type': msg_type, - 'workload_id': workload_id, - 'files': updated_files_message} - file_msg_content = {'msg_type': msg_type_c, - 'status': MessageStatus.New, - 'source': MessageSource.Carrier, - 'transform_id': transform['transform_id'], - 'num_contents': len(updated_files_message), - 'msg_content': msg_content} - return file_msg_content + def health_heartbeat(self): + hostname = socket.getfqdn() + pid = os.getpid() + hb_thread = threading.current_thread() + thread_id = hb_thread.ident + thread_name = hb_thread.name + core_health.add_health_item(agent=self.get_name(), hostname=hostname, pid=pid, + thread_id=thread_id, thread_name=thread_name, payload=None) + + def add_default_tasks(self): + task = self.create_task(task_func=self.health_heartbeat, task_output_queue=None, + task_args=tuple(), task_kwargs={}, delay_time=self.heartbeat_delay, + priority=1) + self.add_task(task) + + def generate_health_messages(self): + core_health.clean_health(older_than=self.heartbeat_delay * 2) + items = core_health.retrieve_health_items() + msg_content = {'msg_type': MessageTypeStr.HealthHeartbeat.value, + 'agents': items} + num_msg_content = len(items) + + message = {'msg_type': MessageType.HealthHeartbeat, + 'status': MessageStatus.New, + 'source': MessageSource.Conductor, + 'request_id': None, + 'workload_id': None, + 'transform_id': None, + 'num_contents': num_msg_content, + 'msg_content': msg_content} + core_messages.add_message(msg_type=message['msg_type'], + status=message['status'], + source=message['source'], + request_id=message['request_id'], + workload_id=message['workload_id'], + transform_id=message['transform_id'], + num_contents=message['num_contents'], + msg_content=message['msg_content']) + + def add_health_message_task(self): + task = self.create_task(task_func=self.generate_health_messages, task_output_queue=None, + task_args=tuple(), task_kwargs={}, delay_time=self.heartbeat_delay, + priority=1) + self.add_task(task) if __name__ == '__main__': diff --git 
a/main/lib/idds/agents/common/timerscheduler.py b/main/lib/idds/agents/common/timerscheduler.py index cfe975bd..ef630e1d 100644 --- a/main/lib/idds/agents/common/timerscheduler.py +++ b/main/lib/idds/agents/common/timerscheduler.py @@ -80,3 +80,11 @@ def execute(self): self.graceful_stop.wait(1) except Exception as error: self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + + def execute_once(self): + try: + task = self.get_ready_task() + if task: + self.executors.submit(self.execute_task, task) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index 7d461285..270378fe 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -84,7 +84,13 @@ def run(self): self.load_plugins() self.start_notifier() + + self.add_health_message_task() + while not self.graceful_stop.is_set(): + # execute timer task + self.execute_once() + try: messages = self.get_messages() for message in messages: diff --git a/main/lib/idds/agents/conductor/consumer.py b/main/lib/idds/agents/conductor/consumer.py new file mode 100644 index 00000000..1883cbdb --- /dev/null +++ b/main/lib/idds/agents/conductor/consumer.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 + +import time +import traceback +try: + # python 3 + from queue import Queue +except ImportError: + # Python 2 + from Queue import Queue + +from idds.common.constants import (Sections, MessageStatus) +from idds.common.exceptions import AgentPluginError, IDDSException +from idds.common.utils import setup_logging +from idds.core import messages as core_messages +from idds.agents.common.baseagent import BaseAgent + + +setup_logging(__name__) + + +class Consumer(BaseAgent): + """ + Consumer works to notify workload management that the data is available. 
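The conductor change above wires the new health-message timer task into the agent's main loop through execute_once(). The toy below only illustrates that scheduling pattern (register a task with a delay, poll it once per loop iteration); it is not the real TimerScheduler, which hands ready tasks to a thread pool executor.

import time

class ToyScheduler:
    """Stripped-down stand-in for the timer-task pattern used by the agents."""

    def __init__(self):
        self.tasks = []  # each task: {'func': callable, 'delay': seconds, 'last_run': timestamp}

    def create_task(self, task_func, delay_time):
        return {'func': task_func, 'delay': delay_time, 'last_run': 0}

    def add_task(self, task):
        self.tasks.append(task)

    def execute_once(self):
        # Run every task whose delay has elapsed since its last execution.
        now = time.time()
        for task in self.tasks:
            if now - task['last_run'] >= task['delay']:
                task['last_run'] = now
                task['func']()

scheduler = ToyScheduler()
scheduler.add_task(scheduler.create_task(lambda: print('heartbeat message'), delay_time=5))
for _ in range(3):      # stands in for the "while not graceful_stop" loop in Conductor.run()
    scheduler.execute_once()
    time.sleep(1)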
+ """ + + def __init__(self, num_threads=1, retrieve_bulk_size=None, **kwargs): + super(Consumer, self).__init__(num_threads=num_threads, **kwargs) + self.config_section = Sections.Consumer + self.retrieve_bulk_size = int(retrieve_bulk_size) + self.message_queue = Queue() + + def __del__(self): + self.stop_receiver() + + def get_messages(self): + """ + Get messages + """ + messages = core_messages.retrieve_messages(status=MessageStatus.New, bulk_size=self.retrieve_bulk_size) + + self.logger.debug("Main thread get %s new messages" % len(messages)) + if messages: + self.logger.info("Main thread get %s new messages" % len(messages)) + + return messages + + def clean_messages(self, msgs): + # core_messages.delete_messages(msgs) + to_updates = [] + for msg in msgs: + to_updates.append({'msg_id': msg['msg_id'], + 'status': MessageStatus.Delivered}) + core_messages.update_messages(to_updates) + + def start_receiver(self): + if 'receiver' not in self.plugins: + raise AgentPluginError('Plugin receiver is required') + self.receiver = self.plugins['receiver'] + + self.logger.info("Starting receiver: %s" % self.receiver) + self.receiver.set_request_queue(self.message_queue) + self.receiver.start() + + def stop_receiver(self): + if hasattr(self, 'receiver') and self.receiver: + self.logger.info("Stopping receiver: %s" % self.receiver) + self.receiver.stop() + + def run(self): + """ + Main run function. + """ + try: + self.logger.info("Starting main thread") + self.load_plugins() + + self.start_receiver() + + while not self.graceful_stop.is_set(): + try: + pass + except IDDSException as error: + self.logger.error("Main thread IDDSException: %s" % str(error)) + except Exception as error: + self.logger.critical("Main thread exception: %s\n%s" % (str(error), traceback.format_exc())) + time.sleep(5) + self.stop() + except KeyboardInterrupt: + self.stop() + + def stop(self): + super(Consumer, self).stop() + self.stop_receiver() + + +if __name__ == '__main__': + agent = Consumer() + agent() diff --git a/main/lib/idds/agents/main.py b/main/lib/idds/agents/main.py index c4d7480f..b386bb5f 100755 --- a/main/lib/idds/agents/main.py +++ b/main/lib/idds/agents/main.py @@ -33,8 +33,10 @@ 'transformer': ['idds.agents.transformer.transformer.Transformer', Sections.Transformer], 'transporter': ['idds.agents.transporter.transporter.Transporter', Sections.Transporter], 'carrier': ['idds.agents.carrier.carrier.Carrier', Sections.Carrier], - 'conductor': ['idds.agents.conductor.conductor.Conductor', Sections.Conductor] + 'conductor': ['idds.agents.conductor.conductor.Conductor', Sections.Conductor], + 'consumer': ['idds.agents.conductor.consumer.Consumer', Sections.Consumer] } + RUNNING_AGENTS = [] diff --git a/main/lib/idds/agents/marshaller/marshaller.py b/main/lib/idds/agents/marshaller/marshaller.py index af1ff21e..3d263c82 100644 --- a/main/lib/idds/agents/marshaller/marshaller.py +++ b/main/lib/idds/agents/marshaller/marshaller.py @@ -66,20 +66,24 @@ def process_new_workprogress(self, workprogress): transforms = [] for work in works: + new_work = work.copy() + new_work.add_proxy(wf.get_proxy()) transform = {'workprogress_id': workprogress['workprogress_id'], + 'request_id': workprogress['request_id'], + 'workload_id': workprogress['workload_id'], 'transform_type': work.get_work_type(), 'transform_tag': work.get_work_tag(), 'priority': workprogress['priority'], 'status': TransformStatus.New, 'retries': 0, 'expired_at': workprogress['expired_at'], - 'transform_metadata': {'work': work} + 'transform_metadata': 
{'orginal_work': work, 'work': new_work} # 'collections': related_collections } transforms.append(transform) - self.logger.info("Processing workprogress(%s): transforms: %s" % (workprogress['workprogress_id'], - transforms)) + self.logger.info("Processing workprogress(%s): new transforms: %s" % (workprogress['workprogress_id'], + transforms)) workprogress['locking'] = WorkprogressLocking.Idle workprogress['status'] = WorkprogressStatus.Transforming @@ -105,11 +109,13 @@ def finish_new_workprogresses(self): while not self.new_output_queue.empty(): try: ret = self.new_output_queue.get() - self.logger.info("Main thread finishing processing workprogress: %s" % ret['workprogress']) + self.logger.info("Main thread finishing new workprogress: %s" % ret['workprogress']) if ret: wp = ret['workprogress'] tfs = ret['new_transforms'] - wp_parameters = {'status': wp['status'], 'locking': wp['locking'], 'workprogress_metadata': wp['workprogress_metadata']} + wp_parameters = {'status': wp['status'], + 'locking': wp['locking'], + 'workprogress_metadata': wp['workprogress_metadata']} core_workprogress.update_workprogress(workprogress_id=wp['workprogress_id'], parameters=wp_parameters, new_transforms=tfs) @@ -121,15 +127,16 @@ def get_running_workprogresses(self): """ Get workprogresses to running """ - workprogress_status = [WorkprogressStatus.Transforming] + workprogress_status = [WorkprogressStatus.Transforming, WorkprogressStatus.ToCancel, + WorkprogressStatus.Cancelling] workprogresses = core_workprogress.get_workprogresses_by_status(status=workprogress_status, period=self.poll_time_period, locking=True, bulk_size=self.retrieve_bulk_size) - self.logger.debug("Main thread get %s workprogressing workprogresses to process" % len(workprogresses)) + self.logger.debug("Main thread get %s progressing workprogresses to process" % len(workprogresses)) if workprogresses: - self.logger.info("Main thread get %s workprogressing workprogresses to process" % len(workprogresses)) + self.logger.info("Main thread get %s progressing workprogresses to process" % len(workprogresses)) return workprogresses def process_running_workprogress(self, workprogress): @@ -139,8 +146,46 @@ def process_running_workprogress(self, workprogress): self.logger.info("process_running_workprogress: workprogress_id: %s" % workprogress['workprogress_id']) workprogress_metadata = workprogress['workprogress_metadata'] wf = workprogress_metadata['workflow'] + + new_transforms = [] + if workprogress['status'] in [WorkprogressStatus.Transforming]: + # new works + works = wf.get_new_works() + for work in works: + new_work = work.copy() + new_work.add_proxy(wf.get_proxy()) + new_transform = {'workprogress_id': workprogress['workprogress_id'], + 'request_id': workprogress['request_id'], + 'workload_id': workprogress['workload_id'], + 'transform_type': work.get_work_type(), + 'transform_tag': work.get_work_tag(), + 'priority': workprogress['priority'], + 'status': TransformStatus.New, + 'retries': 0, + 'expired_at': workprogress['expired_at'], + 'transform_metadata': {'orginal_work': work, 'work': new_work} + # 'collections': related_collections + } + new_transforms.append(new_transform) + self.logger.info("Processing workprogress(%s): new transforms: %s" % (workprogress['workprogress_id'], + new_transforms)) + + update_transforms = {} + if workprogress['status'] in [WorkprogressStatus.ToCancel]: + # current works + works = wf.get_current_works() + # print(works) + for work in works: + if work.get_status() not in [WorkStatus.Finished, 
WorkStatus.SubFinished, + WorkStatus.Failed, WorkStatus.Cancelling, + WorkStatus.Cancelled]: + update_transforms[work.get_work_id()] = {'status': TransformStatus.ToCancel} + + # current works works = wf.get_current_works() + # print(works) for work in works: + # print(work.get_work_id()) tf = core_transforms.get_transform(transform_id=work.get_work_id()) work_status = WorkStatus(tf['status'].value) work.set_status(work_status) @@ -153,14 +198,22 @@ def process_running_workprogress(self, workprogress): wp_status = WorkprogressStatus.SubFinished elif wf.is_failed(): wp_status = WorkprogressStatus.Failed + elif wf.is_cancelled(): + wp_status = WorkprogressStatus.Cancelled else: wp_status = WorkprogressStatus.Failed wp_msg = wf.get_terminated_msg() else: wp_status = WorkprogressStatus.Transforming wp_msg = None - parameters = {'status': wp_status, 'locking': WorkprogressLocking.Idle, 'errors': {'msg': wp_msg}} - ret = {'workprogress_id': workprogress['workprogress_id'], 'parameters': parameters} + parameters = {'status': wp_status, + 'locking': WorkprogressLocking.Idle, + 'workprogress_metadata': workprogress_metadata, + 'errors': {'msg': wp_msg}} + ret = {'workprogress_id': workprogress['workprogress_id'], + 'parameters': parameters, + 'new_transforms': new_transforms, + 'update_transforms': update_transforms} return ret def process_running_workprogresses(self): @@ -180,10 +233,21 @@ def process_running_workprogresses(self): def finish_running_workprogresses(self): while not self.running_output_queue.empty(): - ret = self.running_output_queue.get() - wp_id = ret['workprogress_id'] - parameters = ret['parameters'] - core_workprogress.update_workprogress(workprogress_id=wp_id, parameters=parameters) + try: + ret = self.running_output_queue.get() + self.logger.info("Main thread finishing processing workprogress: %s" % ret) + + wp_id = ret['workprogress_id'] + parameters = ret['parameters'] + new_transforms = ret['new_transforms'] + update_transforms = ret['update_transforms'] + core_workprogress.update_workprogress(workprogress_id=wp_id, + parameters=parameters, + new_transforms=new_transforms, + update_transforms=update_transforms) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) def clean_locks(self): self.logger.info("clean locking") @@ -198,6 +262,8 @@ def run(self): self.load_plugins() + self.add_default_tasks() + task = self.create_task(task_func=self.get_new_workprogresses, task_output_queue=self.new_task_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) self.add_task(task) task = self.create_task(task_func=self.process_new_workprogresses, task_output_queue=self.new_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) diff --git a/main/lib/idds/agents/transformer/transformer.py b/main/lib/idds/agents/transformer/transformer.py index f064af24..cb04533a 100644 --- a/main/lib/idds/agents/transformer/transformer.py +++ b/main/lib/idds/agents/transformer/transformer.py @@ -6,8 +6,9 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2021 +import copy import traceback try: # python 3 @@ -17,13 +18,13 @@ from Queue import Queue -from idds.common.constants import (Sections, TransformStatus, TransformLocking, +from idds.common.constants import (Sections, TransformStatus, TransformLocking, TransformType, CollectionRelationType, CollectionStatus, CollectionType, ContentType, ContentStatus, - ProcessingStatus) + ProcessingStatus, MessageType, 
MessageTypeStr, + MessageStatus, MessageSource) from idds.common.utils import setup_logging -# from idds.core import (transforms as core_transforms, processings as core_processings) -from idds.core import (transforms as core_transforms) +from idds.core import (transforms as core_transforms, processings as core_processings) from idds.agents.common.baseagent import BaseAgent setup_logging(__name__) @@ -60,28 +61,27 @@ def get_new_transforms(self): self.logger.info("Main thread get %s New+Ready+Extend transforms to process" % len(transforms_new)) return transforms_new - def generate_collection_model(transform, collection, relation_type=CollectionRelationType.Input): + def generate_collection_model(self, transform, collection, relation_type=CollectionRelationType.Input): if 'coll_metadata' in collection: coll_metadata = collection['coll_metadata'] else: coll_metadata = {} - if 'did_type' in coll_metadata: - if coll_metadata['did_type'] == 'DATASET': - coll_type = CollectionType.Dataset - elif coll_metadata['did_type'] == 'CONTAINER': - coll_type = CollectionType.Container - else: - coll_type = CollectionType.File + if 'coll_type' in collection: + coll_type = collection['coll_type'] else: coll_type = CollectionType.Dataset - if 'is_open' in coll_metadata and not coll_metadata['is_open']: - coll_status = CollectionStatus.Closed + if 'status' in collection: + coll_status = collection['status'] else: coll_status = CollectionStatus.Open + # collection['status'] = coll_status + coll = {'transform_id': transform['transform_id'], + 'request_id': transform['request_id'], + 'workload_id': transform['workload_id'], 'coll_type': coll_type, 'scope': collection['scope'], 'name': collection['name'], @@ -97,65 +97,73 @@ def generate_collection_model(transform, collection, relation_type=CollectionRel return coll def get_new_contents(self, transform, new_input_output_maps): - new_contents = [] + new_input_contents, new_output_contents = [], [] for map_id in new_input_output_maps: - inputs = new_input_output_maps['map_id']['inputs'] - outputs = new_input_output_maps['map_id']['outputs'] + inputs = new_input_output_maps[map_id]['inputs'] + outputs = new_input_output_maps[map_id]['outputs'] for input_content in inputs: content = {'transform_id': transform['transform_id'], 'coll_id': input_content['coll_id'], + 'request_id': transform['request_id'], + 'workload_id': transform['workload_id'], 'map_id': map_id, 'scope': input_content['scope'], 'name': input_content['name'], 'min_id': input_content['min_id'] if 'min_id' in input_content else 0, 'max_id': input_content['max_id'] if 'max_id' in input_content else 0, 'status': ContentStatus.New, - 'path': None, + 'substatus': ContentStatus.New, + 'path': input_content['path'] if 'path' in input_content else None, 'content_type': input_content['content_type'] if 'content_type' in input_content else ContentType.File, 'bytes': input_content['bytes'], 'adler32': input_content['adler32'], 'content_metadata': input_content['content_metadata']} - new_contents.append(content) + new_input_contents.append(content) for output_content in outputs: content = {'transform_id': transform['transform_id'], 'coll_id': output_content['coll_id'], + 'request_id': transform['request_id'], + 'workload_id': transform['workload_id'], 'map_id': map_id, 'scope': output_content['scope'], 'name': output_content['name'], 'min_id': output_content['min_id'] if 'min_id' in output_content else 0, 'max_id': output_content['max_id'] if 'max_id' in output_content else 0, 'status': ContentStatus.New, - 'path': 
None, + 'substatus': ContentStatus.New, + 'path': output_content['path'] if 'path' in output_content else None, 'content_type': output_content['content_type'] if 'content_type' in output_content else ContentType.File, 'bytes': output_content['bytes'], 'adler32': output_content['adler32'], 'content_metadata': input_content['content_metadata']} - new_contents.append(content) - return new_contents + new_output_contents.append(content) + return new_input_contents, new_output_contents def get_updated_contents(self, transform, registered_input_output_maps): updated_contents = [] + updated_input_contents_full, updated_output_contents_full = [], [] + for map_id in registered_input_output_maps: - outputs = registered_input_output_maps['map_id']['outputs'] + inputs = registered_input_output_maps[map_id]['inputs'] + outputs = registered_input_output_maps[map_id]['outputs'] - for content in outputs: + for content in inputs: if content['status'] != content['substatus']: updated_content = {'content_id': content['content_id'], 'status': content['substatus']} + content['status'] = content['substatus'] updated_contents.append(updated_content) - return updated_contents + updated_input_contents_full.append(content) - """ - def get_processing(self, transform, input_colls, output_colls, log_colls, input_output_maps): - work = transform['transform_metadata']['work'] - processing = work.get_processing(input_output_maps) - if process: - processing = {'transform_id': transform['transform_id'], - 'status': ProcessingStatus.New, - 'processing_metadata': {'work': work}} - return processing - """ + for content in outputs: + if content['status'] != content['substatus']: + updated_content = {'content_id': content['content_id'], + 'status': content['substatus']} + content['status'] = content['substatus'] + updated_contents.append(updated_content) + updated_output_contents_full.append(content) + return updated_contents, updated_input_contents_full, updated_output_contents_full def process_new_transform(self, transform): """ @@ -163,6 +171,11 @@ def process_new_transform(self, transform): """ # self.logger.info("process_new_transform: transform_id: %s" % transform['transform_id']) work = transform['transform_metadata']['work'] + req_attributes = {'request_id': transform['request_id'], + 'workload_id': transform['workload_id'], + 'transform_id': transform['transform_id']} + work.set_agent_attributes(self.agent_attributes, req_attributes) + input_collections = work.get_input_collections() output_collections = work.get_output_collections() log_collections = work.get_log_collections() @@ -194,7 +207,7 @@ def process_new_transform(self, transform): # 'log_collections': log_colls, 'new_input_output_maps': input_output_maps, 'messages': file_msgs, # 'new_processing': processing} ret = {'transform': transform, 'input_collections': input_colls, 'output_collections': output_colls, - 'log_collections': log_coll} + 'log_collections': log_colls} return ret def process_new_transforms(self): @@ -239,7 +252,7 @@ def get_running_transforms(self): """ Get running transforms """ - transform_status = [TransformStatus.Transforming] + transform_status = [TransformStatus.Transforming, TransformStatus.ToCancel, TransformStatus.Cancelling] transforms = core_transforms.get_transforms_by_status(status=transform_status, period=self.poll_time_period, locking=True, @@ -250,32 +263,127 @@ def get_running_transforms(self): self.logger.info("Main thread get %s transforming transforms to process" % len(transforms)) return transforms - def 
process_transform_outputs(self, transform, output_collection): - transform_metadata = transform['transform_metadata'] - if not transform_metadata: - transform_metadata = {} - transform_metadata['output_collection_meta'] = output_collection['coll_metadata'] - if output_collection['status'] == CollectionStatus.Closed: - ret = {'transform_id': transform['transform_id'], - 'status': TransformStatus.Finished, - 'transform_metadata': transform_metadata} - elif output_collection['status'] == CollectionStatus.SubClosed: - ret = {'transform_id': transform['transform_id'], - 'status': TransformStatus.SubFinished, - 'transform_metadata': transform_metadata} - elif output_collection['status'] == CollectionStatus.Failed: - ret = {'transform_id': transform['transform_id'], - 'status': TransformStatus.Failed, - 'transform_metadata': transform_metadata} - elif output_collection['status'] == CollectionStatus.Deleted: - ret = {'transform_id': transform['transform_id'], - 'status': TransformStatus.Deleted, - 'transform_metadata': transform_metadata} + def get_collection_ids(self, collections): + coll_ids = [] + for coll in collections: + coll_ids.append(coll['coll_id']) + return coll_ids + + def get_message_type(self, transform_type, input_type='file'): + if transform_type in [TransformType.StageIn, TransformType.StageIn.value]: + if input_type == 'work': + msg_type_str = MessageTypeStr.StageInWork + msg_type = MessageType.StageInWork + elif input_type == 'collection': + msg_type_str = MessageTypeStr.StageInCollection + msg_type = MessageType.StageInCollection + else: + msg_type_str = MessageTypeStr.StageInFile + msg_type = MessageType.StageInFile + elif transform_type in [TransformType.ActiveLearning, TransformType.ActiveLearning.value]: + if input_type == 'work': + msg_type_str = MessageTypeStr.ActiveLearningWork + msg_type = MessageType.ActiveLearningWork + elif input_type == 'collection': + msg_type_str = MessageTypeStr.ActiveLearningCollection + msg_type = MessageType.ActiveLearningCollection + else: + msg_type_str = MessageTypeStr.ActiveLearningFile + msg_type = MessageType.ActiveLearningFile + elif transform_type in [TransformType.HyperParameterOpt, TransformType.HyperParameterOpt.value]: + if input_type == 'work': + msg_type_str = MessageTypeStr.HyperParameterOptWork + msg_type = MessageType.HyperParameterOptWork + elif input_type == 'collection': + msg_type_str = MessageTypeStr.HyperParameterOptCollection + msg_type = MessageType.HyperParameterOptCollection + else: + msg_type_str = MessageTypeStr.HyperParameterOptFile + msg_type = MessageType.HyperParameterOptFile + elif transform_type in [TransformType.Processing, TransformType.Processing.value]: + if input_type == 'work': + msg_type_str = MessageTypeStr.ProcessingWork + msg_type = MessageType.ProcessingWork + elif input_type == 'collection': + msg_type_str = MessageTypeStr.ProcessingCollection + msg_type = MessageType.ProcessingCollection + else: + msg_type_str = MessageTypeStr.ProcessingFile + msg_type = MessageType.ProcessingFile else: - ret = {'transform_id': transform['transform_id'], - 'status': TransformStatus.Transforming, - 'transform_metadata': transform_metadata} - return ret + if input_type == 'work': + msg_type_str = MessageTypeStr.UnknownWork + msg_type = MessageType.UnknownWork + elif input_type == 'collection': + msg_type_str = MessageTypeStr.UnknownCollection + msg_type = MessageType.UnknownCollection + else: + msg_type_str = MessageTypeStr.UnknownFile + msg_type = MessageType.UnknownFile + return msg_type, msg_type_str.value + + 
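get_message_type() above maps the transform type and input granularity ('work', 'collection' or 'file') to the MessageType/MessageTypeStr pair that generate_message() below puts into the message record. For reference, a file-level stage-in message assembled this way has roughly the following shape; the ids, scope and file name are invented placeholders.

from idds.common.constants import (MessageType, MessageTypeStr, MessageStatus,
                                   MessageSource, ContentStatus)

# Illustrative payload only: the structure produced by generate_message() for new
# input files of a stage-in transform. All concrete values are made up.
example_msg = {'msg_type': MessageType.StageInFile,
               'status': MessageStatus.New,
               'source': MessageSource.Transformer,
               'request_id': 123,
               'workload_id': 456,
               'transform_id': 789,
               'num_contents': 1,
               'msg_content': {'msg_type': MessageTypeStr.StageInFile.value,
                               'request_id': 123,
                               'workload_id': 456,
                               'relation_type': 'input',
                               'files': [{'scope': 'data_scope',
                                          'name': 'file_0001.root',
                                          'path': None,
                                          'status': ContentStatus.New.name}]}}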
def generate_message(self, transform, work=None, collection=None, files=None, msg_type='file', relation_type='input'): + if msg_type == 'work': + if not work: + return None + elif msg_type == 'collection': + if not collection: + return None + if not work: + work = transform['transform_metadata']['work'] + else: + if not files: + return None + + request_id = transform['request_id'] + workload_id = transform['workload_id'] + i_msg_type, i_msg_type_str = None, None + + if msg_type == 'work': + i_msg_type, i_msg_type_str = self.get_message_type(transform['transform_type'], input_type='work') + msg_content = {'msg_type': i_msg_type_str, + 'request_id': request_id, + 'workload_id': workload_id, + 'status': transform['status'].name, + 'output': work.get_output_data(), + 'error': work.get_terminated_msg()} + num_msg_content = 1 + elif msg_type == 'collection': + i_msg_type, i_msg_type_str = self.get_message_type(transform['transform_type'], input_type='collection') + msg_content = {'msg_type': i_msg_type_str, + 'request_id': request_id, + 'workload_id': workload_id, + 'collections': [{'scope': collection['scope'], + 'name': collection['name'], + 'status': collection['status'].name}], + 'output': work.get_output_data(), + 'error': work.get_terminated_msg()} + num_msg_content = 1 + else: + i_msg_type, i_msg_type_str = self.get_message_type(transform['transform_type'], input_type='file') + files_message = [] + for file in files: + file_message = {'scope': file['scope'], + 'name': file['name'], + 'path': file['path'], + 'status': file['status'].name} + files_message.append(file_message) + msg_content = {'msg_type': i_msg_type_str, + 'request_id': request_id, + 'workload_id': workload_id, + 'relation_type': relation_type, + 'files': files_message} + num_msg_content = len(files_message) + + msg = {'msg_type': i_msg_type, + 'status': MessageStatus.New, + 'source': MessageSource.Transformer, + 'request_id': request_id, + 'workload_id': workload_id, + 'transform_id': transform['transform_id'], + 'num_contents': num_msg_content, + 'msg_content': msg_content} + return msg def process_running_transform(self, transform): """ @@ -288,57 +396,146 @@ def process_running_transform(self, transform): output_collections = work.get_output_collections() log_collections = work.get_log_collections() - registered_input_output_maps = core_transforms.get_transform_input_output_maps(transform['transform_id']) + input_coll_ids = self.get_collection_ids(input_collections) + output_coll_ids = self.get_collection_ids(output_collections) + log_coll_ids = self.get_collection_ids(log_collections) + + registered_input_output_maps = core_transforms.get_transform_input_output_maps(transform['transform_id'], + input_coll_ids=input_coll_ids, + output_coll_ids=output_coll_ids, + log_coll_ids=log_coll_ids) # update_input_output_maps = self.get_update_input_output_maps(registered_input_output_maps) - update_contents = self.get_updated_contents(transform, registered_input_output_maps) + # update_contents, updated_contents_full = self.get_updated_contents(transform, registered_input_output_maps) + # updated_contents, updated_input_contents_full, updated_output_contents_full = self.get_updated_contents(transform, registered_input_output_maps) + if work.has_new_inputs(): new_input_output_maps = work.get_new_input_output_maps(registered_input_output_maps) else: new_input_output_maps = {} - new_contents = self.get_new_contents(transform, new_input_output_maps) + new_input_contents, new_output_contents = self.get_new_contents(transform, 
new_input_output_maps) + new_contents = [] + if new_input_contents: + new_contents = new_contents + new_input_contents + if new_output_contents: + new_contents = new_contents + new_output_contents # new_input_output_maps = work.get_new_input_output_maps() # new_contents = self.get_new_contents(new_input_output_maps) - file_msgs = [] - """ - if new_contents: - file_msg = self.generate_file_message(transform, new_contents) - file_msgs.append(file_msg) - if updated_contents: - file_msg = self.generate_file_message(transform, updated_contents) - file_msgs.append(file_msg) - """ - # processing = self.get_processing(transform, input_colls, output_colls, log_colls, new_input_output_maps) processing = work.get_processing(new_input_output_maps) - new_processing = None - if not processing: - new_processing = work.create_processing(new_input_output_maps) - new_processing = {'transform_id': transform['transform_id'], - 'status': ProcessingStatus.New} - new_processing['processing_metadata']['work'] = work + self.logger.info("work get_processing: %s" % processing) + + new_processing_model, processing_model, update_processing_model = None, None, {} + if processing: + if 'processing_id' not in processing: + # new_processing = work.create_processing(new_input_output_maps) + new_processing_model = copy.deepcopy(processing) + new_processing_model['transform_id'] = transform['transform_id'] + new_processing_model['request_id'] = transform['request_id'] + new_processing_model['workload_id'] = transform['workload_id'] + new_processing_model['status'] = ProcessingStatus.New + if 'processing_metadata' not in processing: + processing['processing_metadata'] = {} + if 'processing_metadata' not in new_processing_model: + new_processing_model['processing_metadata'] = {} + new_processing_model['processing_metadata']['work'] = work + else: + processing_model = core_processings.get_processing(processing_id=processing['processing_id']) + work.set_processing_status(processing, processing_model['status']) + processing_metadata = processing_model['processing_metadata'] + if 'errors' in processing_metadata: + work.set_terminated_msg(processing_metadata['errors']) + work.set_processing_output_metadata(processing, processing_model['output_metadata']) + transform['workload_id'] = processing_model['workload_id'] + + if transform['status'] in [TransformStatus.ToCancel]: + if processing_model and processing_model['status'] in [ProcessingStatus.New, ProcessingStatus.Submitting, ProcessingStatus.Submitted, + ProcessingStatus.Running]: + update_processing_model[processing_model['processing_id']] = {'status': ProcessingStatus.ToCancel} + + updated_contents, updated_input_contents_full, updated_output_contents_full = [], [], [] + if work.should_release_inputs(processing_model): + updated_contents, updated_input_contents_full, updated_output_contents_full = self.get_updated_contents(transform, registered_input_output_maps) + + msgs = [] + if new_input_contents: + msg = self.generate_message(transform, files=new_input_contents, msg_type='file', relation_type='input') + msgs.append(msg) + if new_output_contents: + msg = self.generate_message(transform, files=new_output_contents, msg_type='file', relation_type='output') + msgs.append(msg) + if updated_input_contents_full: + msg = self.generate_message(transform, files=updated_input_contents_full, msg_type='file', relation_type='input') + msgs.append(msg) + if updated_output_contents_full: + msg = self.generate_message(transform, files=updated_output_contents_full, msg_type='file', 
relation_type='output') + msgs.append(msg) transform['locking'] = TransformLocking.Idle # status_statistics = work.get_status_statistics(registered_input_output_maps) work.syn_work_status(registered_input_output_maps) if work.is_finished(): transform['status'] = TransformStatus.Finished + msg = self.generate_message(transform, work=work, msg_type='work') + msgs.append(msg) + for coll in output_collections: + coll['status'] = CollectionStatus.Closed + msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection') + msgs.append(msg) + for coll in log_collections: + coll['status'] = CollectionStatus.Closed + msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection') + msgs.append(msg) elif work.is_subfinished(): transform['status'] = TransformStatus.SubFinished + msg = self.generate_message(transform, work=work, msg_type='work') + msgs.append(msg) + for coll in output_collections: + coll['status'] = CollectionStatus.SubClosed + msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection') + msgs.append(msg) + for coll in log_collections: + coll['status'] = CollectionStatus.SubClosed + msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection') + msgs.append(msg) elif work.is_failed(): transform['status'] = TransformStatus.Failed + msg = self.generate_message(transform, work=work, msg_type='work') + msgs.append(msg) + for coll in output_collections: + coll['status'] = CollectionStatus.Failed + msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection') + msgs.append(msg) + for coll in log_collections: + coll['status'] = CollectionStatus.Failed + msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection') + msgs.append(msg) + elif work.is_cancelled(): + transform['status'] = TransformStatus.Cancelled + msg = self.generate_message(transform, work=work, msg_type='work') + msgs.append(msg) + for coll in output_collections: + coll['status'] = CollectionStatus.Cancelled + msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection') + msgs.append(msg) + for coll in log_collections: + coll['status'] = CollectionStatus.Cancelled + msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection') + msgs.append(msg) else: transform['status'] = TransformStatus.Transforming + # print(input_collections) ret = {'transform': transform, - 'update_input_collections': input_collections, - 'update_output_collections': output_collections, - 'update_log_collections': log_collections, + 'update_input_collections': copy.deepcopy(input_collections) if input_collections else input_collections, + 'update_output_collections': copy.deepcopy(output_collections) if output_collections else output_collections, + 'update_log_collections': copy.deepcopy(log_collections) if log_collections else log_collections, 'new_contents': new_contents, - 'update_contents': update_contents, - 'messages': file_msgs, - 'new_processing': new_processing} + 'update_contents': updated_contents, + 'messages': msgs, + 'new_processing': new_processing_model, + 'update_processing': update_processing_model} return ret def process_running_transforms(self): @@ -357,36 +554,26 @@ def process_running_transforms(self): return ret def finish_running_transforms(self): - while not self.running_output_queue.empty(): - ret = self.running_output_queue.get() - core_transforms.add_transform_outputs(transform=ret['transform'], - 
input_collections=ret.get('input_collections', None), - output_collections=ret.get('output_collections', None), - log_collections=ret.get('log_collections', None), - new_contents=ret.get('new_contents', None), - update_input_collections=ret.get('update_input_collections', None), - update_output_collections=ret.get('update_output_collections', None), - update_log_collections=ret.get('update_log_collections', None), - update_contents=ret.get('update_contents', None), - messages=ret.get('messages', None), - new_processing=ret.get('new_processing', None), - message_bulk_size=self.message_bulk_size) - while not self.running_output_queue.empty(): try: - ret = self.new_output_queue.get() - self.logger.info("Main thread finishing processing transform: %s" % ret['transform']) + ret = self.running_output_queue.get() + self.logger.info("Main thread finishing running transform: %s" % ret['transform']) if ret: # self.logger.debug("wen: %s" % str(ret['output_contents'])) core_transforms.add_transform_outputs(transform=ret['transform'], - update_input_collections=ret['update_input_collections'], - update_output_collections=ret['update_output_collections'], - update_log_collections=ret['update_log_collections'], - new_input_output_maps=ret['new_input_output_maps'], - update_input_output_maps=ret['update_input_output_maps'], - processing=ret['processing'], - messages=ret['messages'], + input_collections=ret.get('input_collections', None), + output_collections=ret.get('output_collections', None), + log_collections=ret.get('log_collections', None), + new_contents=ret.get('new_contents', None), + update_input_collections=ret.get('update_input_collections', None), + update_output_collections=ret.get('update_output_collections', None), + update_log_collections=ret.get('update_log_collections', None), + update_contents=ret.get('update_contents', None), + messages=ret.get('messages', None), + new_processing=ret.get('new_processing', None), + update_processing=ret.get('update_processing', None), message_bulk_size=self.message_bulk_size) + except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) @@ -404,6 +591,8 @@ def run(self): self.load_plugins() + self.add_default_tasks() + task = self.create_task(task_func=self.get_new_transforms, task_output_queue=self.new_task_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) self.add_task(task) task = self.create_task(task_func=self.process_new_transforms, task_output_queue=self.new_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) diff --git a/main/lib/idds/agents/transporter/transporter.py b/main/lib/idds/agents/transporter/transporter.py index 53231bb9..070a6174 100644 --- a/main/lib/idds/agents/transporter/transporter.py +++ b/main/lib/idds/agents/transporter/transporter.py @@ -398,6 +398,8 @@ def run(self): self.load_plugins() self.init() + self.add_default_tasks() + task = self.create_task(task_func=self.get_new_input_collections, task_output_queue=self.new_input_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) self.add_task(task) task = self.create_task(task_func=self.process_input_collections, task_output_queue=self.processed_input_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) diff --git a/main/lib/idds/core/catalog.py b/main/lib/idds/core/catalog.py index 86cf54f1..7d71e1f8 100644 --- a/main/lib/idds/core/catalog.py +++ b/main/lib/idds/core/catalog.py @@ -71,41 +71,26 @@ def get_collections(scope=None, name=None, request_id=None, workload_id=None, tr 
:returns: dict of collections """ - if request_id or workload_id or transform_id: - transform_ids = orm_transforms.get_transform_ids(request_id=request_id, - workload_id=workload_id, - transform_id=transform_id, session=session) - - if transform_ids: - collections = orm_collections.get_collections(scope=scope, name=name, transform_id=transform_ids, - relation_type=relation_type, to_json=to_json, - session=session) - else: - collections = [] - else: - collections = orm_collections.get_collections(scope=scope, name=name, to_json=to_json, - relation_type=relation_type, session=session) - rets = {} - for collection in collections: - if request_id not in rets: - rets[request_id] = {} - transform_id = collection['transform_id'] - if transform_id not in rets[request_id]: - rets[request_id][transform_id] = [] - rets[request_id][transform_id].append(collection) - return rets + collections = orm_collections.get_collections(scope=scope, name=name, request_id=request_id, + workload_id=workload_id, transform_id=transform_id, + to_json=to_json, + relation_type=relation_type, session=session) + return collections @transactional_session -def add_collection(scope, name, coll_type=CollectionType.Dataset, transform_id=None, +def add_collection(request_id, workload_id, scope, name, coll_type=CollectionType.Dataset, transform_id=None, relation_type=CollectionRelationType.Input, coll_size=0, status=CollectionStatus.New, - total_files=0, retries=0, expired_at=None, coll_metadata=None, session=None): + total_files=0, new_files=0, processing_files=0, processed_files=0, retries=0, + expired_at=None, coll_metadata=None, session=None): """ Add a collection. :param scope: The scope of the request data. :param name: The name of the request data. :param coll_type: The type of dataset as dataset or container. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_id: The transform id related to this collection. :param relation_type: The relation between this collection and its transform, such as Input, Output, Log and so on. @@ -121,11 +106,13 @@ def add_collection(scope, name, coll_type=CollectionType.Dataset, transform_id=N :returns: collection id. """ - orm_collections.add_collection(scope=scope, name=name, coll_type=coll_type, + orm_collections.add_collection(request_id=request_id, workload_id=workload_id, + scope=scope, name=name, coll_type=coll_type, transform_id=transform_id, relation_type=relation_type, coll_size=coll_size, status=status, total_files=total_files, - retries=retries, expired_at=expired_at, coll_metadata=coll_metadata, - session=session) + new_files=new_files, processing_files=processing_files, + processed_files=processed_files, retries=retries, expired_at=expired_at, + coll_metadata=coll_metadata, session=session) @transactional_session @@ -294,7 +281,7 @@ def update_content(content_id, parameters, session=None): @read_session def get_contents(coll_scope=None, coll_name=None, request_id=None, workload_id=None, transform_id=None, - relation_type=None, to_json=False, session=None): + relation_type=None, status=None, to_json=False, session=None): """ Get contents with collection scope, collection name, request id, workload id and relation type. 
@@ -309,25 +296,15 @@ def get_contents(coll_scope=None, coll_name=None, request_id=None, workload_id=N :returns: dict of contents """ - req_transfomr_collections = get_collections(scope=coll_scope, name=coll_name, request_id=request_id, - workload_id=workload_id, transform_id=transform_id, - relation_type=relation_type, to_json=to_json, session=session) - - rets = {} - for request_id in req_transfomr_collections: - rets[request_id] = {} - for transform_id in req_transfomr_collections[request_id]: - rets[request_id][transform_id] = {} - for collection in req_transfomr_collections[request_id][transform_id]: - scope = collection['scope'] - name = collection['name'] - coll_id = collection['coll_id'] - coll_relation_type = collection['relation_type'] - scope_name = '%s:%s' % (scope, name) - contents = orm_contents.get_contents(coll_id=coll_id, to_json=to_json, session=session) - rets[request_id][transform_id][scope_name] = {'collection': collection, - 'relation_type': coll_relation_type, - 'contents': contents} + collections = get_collections(scope=coll_scope, name=coll_name, request_id=request_id, + workload_id=workload_id, transform_id=transform_id, + relation_type=relation_type, to_json=to_json, session=session) + + coll_ids = [coll['coll_id'] for coll in collections] + if coll_ids: + rets = orm_contents.get_contents(coll_id=coll_ids, status=status, to_json=to_json, session=session) + else: + rets = [] return rets diff --git a/main/lib/idds/core/health.py b/main/lib/idds/core/health.py new file mode 100644 index 00000000..9f630a71 --- /dev/null +++ b/main/lib/idds/core/health.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 + + +""" +operations related to Health. +""" + + +from idds.orm import health as orm_health + + +def add_health_item(agent, hostname, pid, thread_id, thread_name, payload): + """ + Add a health item. + + :param agent: The agent name. + :param hostname: The hostname. + :param pid: The pid. + :param thread_id: The thread id. + :param thread_name: The thread name. + :param payload: The payload. + """ + return orm_health.add_health_item(agent=agent, hostname=hostname, pid=pid, + thread_id=thread_id, + thread_name=thread_name, + payload=payload) + + +def retrieve_health_items(): + """ + Retrieve health items. + + :returns healths: List of dictionaries + """ + return orm_health.retrieve_health_items() + + +def clean_health(older_than=3600): + """ + Clearn items which is older than the time. + + :param older_than in seconds + """ + orm_health.clean_health(older_than=older_than) diff --git a/main/lib/idds/core/messages.py b/main/lib/idds/core/messages.py index f5ab72a6..35836dea 100644 --- a/main/lib/idds/core/messages.py +++ b/main/lib/idds/core/messages.py @@ -19,7 +19,8 @@ @transactional_session -def add_message(msg_type, status, source, transform_id, num_contents, msg_content, bulk_size=None, session=None): +def add_message(msg_type, status, source, request_id, workload_id, transform_id, + num_contents, msg_content, bulk_size=None, session=None): """ Add a message to be submitted asynchronously to a message broker. @@ -30,6 +31,7 @@ def add_message(msg_type, status, source, transform_id, num_contents, msg_conten :param session: The database session. 
""" return orm_messages.add_message(msg_type=msg_type, status=status, source=source, + request_id=request_id, workload_id=workload_id, transform_id=transform_id, num_contents=num_contents, bulk_size=bulk_size, msg_content=msg_content, session=session) diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index 8d57d97e..cafebe76 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -24,11 +24,14 @@ @transactional_session -def add_processing(transform_id, status, submitter=None, granularity=None, granularity_type=GranularityType.File, +def add_processing(request_id, workload_id, transform_id, status, submitter=None, + granularity=None, granularity_type=GranularityType.File, expired_at=None, processing_metadata=None, session=None): """ Add a processing. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_id: Transform id. :param status: processing status. :param submitter: submitter name. @@ -42,7 +45,8 @@ def add_processing(transform_id, status, submitter=None, granularity=None, granu :returns: processing id. """ - return orm_processings.add_processing(transform_id=transform_id, status=status, submitter=submitter, + return orm_processings.add_processing(request_id=request_id, workload_id=workload_id, transform_id=transform_id, + status=status, submitter=submitter, granularity=granularity, granularity_type=granularity_type, expired_at=expired_at, processing_metadata=processing_metadata, session=session) diff --git a/main/lib/idds/core/requests.py b/main/lib/idds/core/requests.py index f68bc5f7..8bbf7bef 100644 --- a/main/lib/idds/core/requests.py +++ b/main/lib/idds/core/requests.py @@ -14,12 +14,10 @@ """ -from idds.common import exceptions -from idds.common.constants import RequestStatus, RequestLocking +from idds.common.constants import RequestStatus, RequestLocking, WorkStatus from idds.orm.base.session import read_session, transactional_session from idds.orm import requests as orm_requests from idds.orm import transforms as orm_transforms -from idds.orm import collections as orm_collections from idds.orm import workprogress as orm_workprogresses # from idds.atlas.worflow.utils import convert_request_metadata_to_workflow @@ -108,7 +106,7 @@ def get_request_ids_by_workload_id(workload_id, session=None): @read_session -def get_requests(request_id=None, workload_id=None, to_json=False, session=None): +def get_requests(request_id=None, workload_id=None, with_detail=False, to_json=False, session=None): """ Get a request or raise a NoObject exception. @@ -121,7 +119,7 @@ def get_requests(request_id=None, workload_id=None, to_json=False, session=None) :returns: Request. """ return orm_requests.get_requests(request_id=request_id, workload_id=workload_id, - to_json=to_json, session=session) + with_detail=with_detail, to_json=to_json, session=session) @transactional_session @@ -160,53 +158,31 @@ def update_request(request_id, parameters, session=None): @transactional_session -def update_request_with_transforms(request_id, parameters, transforms_to_add, transforms_to_extend, session=None): +def update_request_with_transforms(request_id, parameters, new_transforms=None, update_transforms=None, session=None): """ update an request. :param request_id: the request id. :param parameters: A dictionary of parameters. 
- :param transforms_to_add: list of transforms - :param transforms_to_extend: list of transforms - """ - for transform in transforms_to_add: - if 'collections' not in transform or len(transform['collections']) == 0: - msg = "Transform must have collections, such as input collection, output collection and log collection" - raise exceptions.WrongParameterException(msg) - - collections = transform['collections'] - del transform['collections'] - transform_id = orm_transforms.add_transform(**transform, session=session) - - input_coll_ids = [] - log_coll_ids = [] - for collection in collections['input_collections']: - collection['transform_id'] = transform_id - input_coll_id = orm_collections.add_collection(**collection, session=session) - input_coll_ids.append(input_coll_id) - for collection in collections['log_collections']: - collection['transform_id'] = transform_id - log_coll_id = orm_collections.add_collection(**collection, session=session) - log_coll_ids.append(log_coll_id) - for collection in collections['output_collections']: - collection['transform_id'] = transform_id - workload_id = transform['transform_metadata']['workload_id'] if 'workload_id' in transform['transform_metadata'] else None - collection['coll_metadata'] = {'transform_id': transform_id, - 'workload_id': workload_id, - 'input_collections': input_coll_ids, - 'log_collections': log_coll_ids} - orm_collections.add_collection(**collection, session=session) - - for transform in transforms_to_extend: - transform_id = transform['transform_id'] - del transform['transform_id'] - # orm_transforms.add_req2transform(request_id, transform_id, session=session) - orm_transforms.update_transform(transform_id, parameters=transform, session=session) + :param new_transforms: list of transforms + :param update_transforms: list of transforms + """ + if new_transforms: + for tf in new_transforms: + tf_id = orm_transforms.add_transform(**tf, session=session) + orginal_work = tf['transform_metadata']['orginal_work'] + del tf['transform_metadata']['orginal_work'] + # work = tf['transform_metadata']['work'] + orginal_work.set_work_id(tf_id, transforming=True) + orginal_work.set_status(WorkStatus.New) + if update_transforms: + for tr_id in update_transforms: + orm_transforms.update_transform(transform_id=tr_id, parameters=update_transforms[tr_id], session=session) return orm_requests.update_request(request_id, parameters, session=session) @transactional_session -def update_request_with_workprogresses(request_id, parameters, new_workprogresses, session=None): +def update_request_with_workprogresses(request_id, parameters, new_workprogresses=None, update_workprogresses=None, session=None): """ update an request. 
@@ -216,6 +192,9 @@ def update_request_with_workprogresses(request_id, parameters, new_workprogresse """ if new_workprogresses: orm_workprogresses.add_workprogresses(new_workprogresses, session=session) + if update_workprogresses: + for workprogress_id in update_workprogresses: + orm_workprogresses.update_workprogress(workprogress_id, update_workprogresses[workprogress_id], session=session) return orm_requests.update_request(request_id, parameters, session=session) diff --git a/main/lib/idds/core/transforms.py b/main/lib/idds/core/transforms.py index cf6fb2eb..2a947de8 100644 --- a/main/lib/idds/core/transforms.py +++ b/main/lib/idds/core/transforms.py @@ -21,16 +21,19 @@ from idds.orm import (transforms as orm_transforms, collections as orm_collections, contents as orm_contents, - # messages as orm_messages, + messages as orm_messages, processings as orm_processings) @transactional_session -def add_transform(transform_type, transform_tag=None, priority=0, status=TransformStatus.New, locking=TransformLocking.Idle, +def add_transform(request_id, workload_id, transform_type, transform_tag=None, priority=0, + status=TransformStatus.New, locking=TransformLocking.Idle, retries=0, expired_at=None, transform_metadata=None, workprogress_id=None, session=None): """ Add a transform. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_type: Transform type. :param transform_tag: Transform tag. :param priority: priority. @@ -45,7 +48,8 @@ def add_transform(transform_type, transform_tag=None, priority=0, status=Transfo :returns: transform id. """ - transform_id = orm_transforms.add_transform(transform_type=transform_type, transform_tag=transform_tag, + transform_id = orm_transforms.add_transform(request_id=request_id, workload_id=workload_id, + transform_type=transform_type, transform_tag=transform_tag, priority=priority, status=status, locking=locking, retries=retries, expired_at=expired_at, transform_metadata=transform_metadata, workprogress_id=workprogress_id, session=session) @@ -89,7 +93,7 @@ def get_transforms_with_input_collection(transform_type, transform_tag, coll_sco @read_session -def get_transform_ids(workprogress_id, session=None): +def get_transform_ids(workprogress_id, request_id=None, workload_id=None, transform_id=None, session=None): """ Get transform ids or raise a NoObject exception. @@ -100,11 +104,12 @@ def get_transform_ids(workprogress_id, session=None): :returns: list of transform ids. """ - return orm_transforms.get_transform_ids(workprogress_id=workprogress_id, session=session) + return orm_transforms.get_transform_ids(workprogress_id=workprogress_id, request_id=request_id, + workload_id=workload_id, transform_id=transform_id, session=session) @read_session -def get_transforms(workprogress_id, to_json=False, session=None): +def get_transforms(workprogress_id=None, to_json=False, request_id=None, workload_id=None, transform_id=None, session=None): """ Get transforms or raise a NoObject exception. @@ -116,7 +121,9 @@ def get_transforms(workprogress_id, to_json=False, session=None): :returns: list of transform. 
""" - return orm_transforms.get_transforms(workprogress_id=workprogress_id, to_json=to_json, session=session) + return orm_transforms.get_transforms(workprogress_id=workprogress_id, request_id=request_id, + workload_id=workload_id, transform_id=transform_id, + to_json=to_json, session=session) @read_session @@ -161,8 +168,8 @@ def update_transform(transform_id, parameters, session=None): @transactional_session def add_transform_outputs(transform, input_collections=None, output_collections=None, log_collections=None, update_input_collections=None, update_output_collections=None, update_log_collections=None, - new_contents=None, update_contents=None, - new_processing=None, messages=None, message_bulk_size=1000, session=None): + new_contents=None, update_contents=None, new_processing=None, update_processing=None, + messages=None, message_bulk_size=1000, session=None): """ For input contents, add corresponding output contents. @@ -207,15 +214,15 @@ def add_transform_outputs(transform, input_collections=None, output_collections= if new_contents: orm_contents.add_contents(new_contents, session=session) if update_contents: - orm_contents.add_contents(update_contents, session=session) + orm_contents.update_contents(update_contents, session=session) processing_id = None if new_processing: + # print(new_processing) processing_id = orm_processings.add_processing(**new_processing, session=session) - - """ - if output_contents: - orm_contents.add_contents(output_contents, session=session) + if update_processing: + for proc_id in update_processing: + orm_processings.add_processing(processing_id=proc_id, parameters=update_processing[proc_id], session=session) if messages: if not type(messages) in [list, tuple]: @@ -224,12 +231,15 @@ def add_transform_outputs(transform, input_collections=None, output_collections= orm_messages.add_message(msg_type=message['msg_type'], status=message['status'], source=message['source'], + request_id=message['request_id'], + workload_id=message['workload_id'], transform_id=message['transform_id'], num_contents=message['num_contents'], msg_content=message['msg_content'], bulk_size=message_bulk_size, session=session) + """ if to_cancel_processing: to_cancel_params = {'status': ProcessingStatus.Cancel} for to_cancel_id in to_cancel_processing: @@ -251,6 +261,7 @@ def add_transform_outputs(transform, input_collections=None, output_collections= work.set_processing_id(new_processing, processing_id) parameters = {'status': transform['status'], 'locking': transform['locking'], + 'workload_id': transform['workload_id'], 'transform_metadata': transform['transform_metadata']} orm_transforms.update_transform(transform_id=transform['transform_id'], parameters=parameters, diff --git a/main/lib/idds/core/workprogress.py b/main/lib/idds/core/workprogress.py index c3257b6f..5c195025 100644 --- a/main/lib/idds/core/workprogress.py +++ b/main/lib/idds/core/workprogress.py @@ -20,12 +20,14 @@ from idds.workflow.work import WorkStatus -def create_workprogress(request_id, scope, name, priority=0, status=WorkprogressStatus.New, locking=WorkprogressLocking.Idle, +def create_workprogress(request_id, workload_id, scope, name, priority=0, status=WorkprogressStatus.New, + locking=WorkprogressLocking.Idle, expired_at=None, errors=None, workprogress_metadata=None, processing_metadata=None): """ Create a workprogress. :param request_id: The request id. + :param workload_id: The workload id. :param scope: The scope. :param name: The name. :param status: The status as integer. 
@@ -38,20 +40,23 @@ def create_workprogress(request_id, scope, name, priority=0, status=Workprogress :returns: workprogress. """ - return orm_workprogress.create_workprogress(request_id=request_id, scope=scope, name=name, priority=priority, status=status, + return orm_workprogress.create_workprogress(request_id=request_id, workload_id=workload_id, scope=scope, name=name, + priority=priority, status=status, locking=locking, expired_at=expired_at, workprogress_metadata=workprogress_metadata, processing_metadata=processing_metadata) @transactional_session -def add_workprogress(request_id, scope, name, priority=0, status=WorkprogressStatus.New, locking=WorkprogressLocking.Idle, +def add_workprogress(request_id, workload_id, scope, name, priority=0, status=WorkprogressStatus.New, + locking=WorkprogressLocking.Idle, expired_at=None, errors=None, workprogress_metadata=None, processing_metadata=None, session=None): """ Add a workprogress. :param request_id: The request id. + :param workload_id: The workload id. :param scope: The scope. :param name: The name. :param status: The status as integer. @@ -68,7 +73,8 @@ def add_workprogress(request_id, scope, name, priority=0, status=WorkprogressSta :returns: workprogress id. """ - return orm_workprogress.add_workprogress(request_id=request_id, scope=scope, name=name, priority=priority, status=status, + return orm_workprogress.add_workprogress(request_id=request_id, workload_id=workload_id, + scope=scope, name=name, priority=priority, status=status, locking=locking, expired_at=expired_at, workprogress_metadata=workprogress_metadata, processing_metadata=processing_metadata, @@ -92,7 +98,7 @@ def add_workprogresses(workprogresses, bulk_size=1000, session=None): @read_session -def get_workprogresses(request_id, to_json=False, session=None): +def get_workprogresses(request_id=None, to_json=False, session=None): """ Get workprogresses with request_id. @@ -145,7 +151,7 @@ def get_workprogresses_by_status(status, period=None, locking=False, bulk_size=N @transactional_session -def update_workprogress(workprogress_id, parameters, new_transforms=None, session=None): +def update_workprogress(workprogress_id, parameters, new_transforms=None, update_transforms=None, session=None): """ update a workprogress. 
@@ -160,10 +166,15 @@ def update_workprogress(workprogress_id, parameters, new_transforms=None, sessio if new_transforms: for tf in new_transforms: + orginal_work = tf['transform_metadata']['orginal_work'] + del tf['transform_metadata']['orginal_work'] tf_id = orm_transforms.add_transform(**tf, session=session) - work = tf['transform_metadata']['work'] - work.set_work_id(tf_id, transforming=True) - work.set_status(WorkStatus.New) + # work = tf['transform_metadata']['work'] + orginal_work.set_work_id(tf_id, transforming=True) + orginal_work.set_status(WorkStatus.New) + if update_transforms: + for tr_id in update_transforms: + orm_transforms.update_transform(transform_id=tr_id, parameters=update_transforms[tr_id], session=session) return orm_workprogress.update_workprogress(workprogress_id=workprogress_id, parameters=parameters, session=session) diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index 0f0354ef..1ec5dbc4 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -155,6 +155,7 @@ class Workprogress(BASE, ModelBase): __tablename__ = 'workprogresses' workprogress_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('WORKPROGRESS_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + workload_id = Column(Integer()) scope = Column(String(SCOPE_LENGTH)) name = Column(String(NAME_LENGTH)) # requester = Column(String(20)) @@ -186,6 +187,8 @@ class Transform(BASE, ModelBase): """Represents a transform""" __tablename__ = 'transforms' transform_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('TRANSFORM_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) + request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + workload_id = Column(Integer()) transform_type = Column(EnumWithValue(TransformType)) transform_tag = Column(String(20)) priority = Column(Integer()) @@ -224,6 +227,8 @@ class Processing(BASE, ModelBase): __tablename__ = 'processings' processing_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('PROCESSING_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) + request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + workload_id = Column(Integer()) status = Column(EnumWithValue(ProcessingStatus)) substatus = Column(EnumWithValue(ProcessingStatus)) locking = Column(EnumWithValue(ProcessingLocking)) @@ -251,6 +256,9 @@ class Collection(BASE, ModelBase): """Represents a collection""" __tablename__ = 'collections' coll_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('COLLECTION_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) + request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + workload_id = Column(Integer()) + transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) coll_type = Column(EnumWithValue(CollectionType)) transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) relation_type = Column(EnumWithValue(CollectionRelationType)) @@ -290,6 +298,9 @@ class Content(BASE, ModelBase): content_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('CONTENT_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) coll_id = Column(BigInteger().with_variant(Integer, "sqlite")) + request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + workload_id = Column(Integer()) + 
transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0) scope = Column(String(SCOPE_LENGTH)) name = Column(String(NAME_LENGTH)) @@ -324,6 +335,24 @@ class Content(BASE, ModelBase): Index('CONTENTS_STATUS_UPDATED_IDX', 'status', 'locking', 'updated_at', 'created_at')) +class Health(BASE, ModelBase): + """Represents the status of the running agents""" + __tablename__ = 'health' + health_id = Column(BigInteger().with_variant(Integer, "sqlite"), + Sequence('HEALTH_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), + primary_key=True) + agent = Column(String(30)) + hostname = Column(String(127)) + pid = Column(Integer, autoincrement=False) + thread_id = Column(BigInteger, autoincrement=False) + thread_name = Column(String(255)) + payload = Column(String(255)) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) + updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + _table_args = (PrimaryKeyConstraint('health_id', name='HEALTH_PK'), + UniqueConstraint('agent', 'hostname', 'pid', 'thread_id', name='HEALTH_UK')) + + class Message(BASE, ModelBase): """Represents the event messages""" __tablename__ = 'messages' @@ -335,6 +364,8 @@ class Message(BASE, ModelBase): substatus = Column(Integer()) locking = Column(EnumWithValue(MessageLocking)) source = Column(EnumWithValue(MessageSource)) + request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + workload_id = Column(Integer()) transform_id = Column(Integer()) num_contents = Column(Integer()) created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) @@ -349,7 +380,7 @@ def register_models(engine): Creates database tables for all models with the given engine """ - models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content) + models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content, Health, Message) for model in models: model.metadata.create_all(engine) # pylint: disable=maybe-no-member @@ -360,7 +391,7 @@ def unregister_models(engine): Drops database tables for all models with the given engine """ - models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content) + models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content, Health, Message) for model in models: model.metadata.drop_all(engine) # pylint: disable=maybe-no-member diff --git a/main/lib/idds/orm/collections.py b/main/lib/idds/orm/collections.py index ac1a7249..45c06624 100644 --- a/main/lib/idds/orm/collections.py +++ b/main/lib/idds/orm/collections.py @@ -25,9 +25,10 @@ from idds.orm.base import models -def create_collection(scope, name, coll_type=CollectionType.Dataset, transform_id=None, +def create_collection(request_id, workload_id, scope, name, coll_type=CollectionType.Dataset, transform_id=None, relation_type=CollectionRelationType.Input, bytes=0, status=CollectionStatus.New, - locking=CollectionLocking.Idle, total_files=0, retries=0, expired_at=None, + locking=CollectionLocking.Idle, total_files=0, new_files=0, processing_files=0, + processed_files=0, retries=0, expired_at=None, coll_metadata=None): """ Create a collection. @@ -35,6 +36,8 @@ def create_collection(scope, name, coll_type=CollectionType.Dataset, transform_i :param scope: The scope of the request data. :param name: The name of the request data. 
:param coll_type: The type of dataset as dataset or container. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_id: The transform id related to this collection. :param relation_type: The relation between this collection and its transform, such as Input, Output, Log and so on. @@ -48,17 +51,20 @@ def create_collection(scope, name, coll_type=CollectionType.Dataset, transform_i :returns: collection. """ - new_coll = models.Collection(scope=scope, name=name, coll_type=coll_type, transform_id=transform_id, + new_coll = models.Collection(request_id=request_id, workload_id=workload_id, scope=scope, name=name, + coll_type=coll_type, transform_id=transform_id, relation_type=relation_type, bytes=bytes, status=status, locking=locking, - total_files=total_files, retries=retries, expired_at=expired_at, - coll_metadata=coll_metadata) + total_files=total_files, new_files=new_files, processing_files=processing_files, + processed_files=processed_files, retries=retries, + expired_at=expired_at, coll_metadata=coll_metadata) return new_coll @transactional_session -def add_collection(scope, name, coll_type=CollectionType.Dataset, transform_id=None, +def add_collection(request_id, workload_id, scope, name, coll_type=CollectionType.Dataset, transform_id=None, relation_type=CollectionRelationType.Input, bytes=0, status=CollectionStatus.New, - locking=CollectionLocking.Idle, total_files=0, retries=0, expired_at=None, + locking=CollectionLocking.Idle, total_files=0, new_files=0, processing_files=0, + processed_files=0, retries=0, expired_at=None, coll_metadata=None, session=None): """ Add a collection. @@ -66,6 +72,8 @@ def add_collection(scope, name, coll_type=CollectionType.Dataset, transform_id=N :param scope: The scope of the request data. :param name: The name of the request data. :param coll_type: The type of dataset as dataset or container. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_id: The transform id related to this collection. :param relation_type: The relation between this collection and its transform, such as Input, Output, Log and so on. @@ -83,10 +91,12 @@ def add_collection(scope, name, coll_type=CollectionType.Dataset, transform_id=N :returns: collection id. """ try: - new_coll = create_collection(scope=scope, name=name, coll_type=coll_type, transform_id=transform_id, + new_coll = create_collection(request_id=request_id, workload_id=workload_id, scope=scope, name=name, + coll_type=coll_type, transform_id=transform_id, relation_type=relation_type, bytes=bytes, status=status, locking=locking, - total_files=total_files, retries=retries, expired_at=expired_at, - coll_metadata=coll_metadata) + total_files=total_files, new_files=new_files, retries=retries, + processing_files=processing_files, processed_files=processed_files, + expired_at=expired_at, coll_metadata=coll_metadata) new_coll.save(session=session) coll_id = new_coll.coll_id return coll_id @@ -252,12 +262,15 @@ def get_collections_by_status(status, relation_type=CollectionRelationType.Input @read_session -def get_collections(scope=None, name=None, transform_id=None, relation_type=None, to_json=False, session=None): +def get_collections(scope=None, name=None, request_id=None, workload_id=None, transform_id=None, + relation_type=None, to_json=False, session=None): """ Get collections by request id or raise a NoObject exception. :param scope: collection scope. :param name: collection name, can be wildcard. + :param request_id: The request id. 
+ :param workload_id: The workload id. :param transform_id: list of transform id related to this collection. :param relation_type: The relation type between this collection and the transform: Input, Ouput and Log. :param to_json: return json format. @@ -276,9 +289,13 @@ def get_collections(scope=None, name=None, transform_id=None, relation_type=None query = query.filter(models.Collection.scope == scope) if name: query = query.filter(models.Collection.name.like(name.replace('*', '%'))) + if request_id: + query = query.filter(models.Collection.request_id == request_id) + if workload_id: + query = query.filter(models.Collection.workload_id == workload_id) if transform_id: query = query.filter(models.Collection.transform_id.in_(transform_id)) - if relation_type: + if relation_type is not None: query = query.filter(models.Collection.relation_type == relation_type) query = query.order_by(asc(models.Collection.updated_at)) diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index 027250ec..230b0169 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -26,13 +26,16 @@ from idds.orm.base import models -def create_content(transform_id, coll_id, map_id, scope, name, min_id, max_id, content_type=ContentType.File, +def create_content(request_id, workload_id, transform_id, coll_id, map_id, scope, name, + min_id, max_id, content_type=ContentType.File, status=ContentStatus.New, bytes=0, md5=None, adler32=None, processing_id=None, storage_id=None, retries=0, locking=ContentLocking.Idle, path=None, expired_at=None, content_metadata=None): """ Create a content. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_id: transform id. :param coll_id: collection id. :param map_id: The id to map inputs to outputs. @@ -55,7 +58,8 @@ def create_content(transform_id, coll_id, map_id, scope, name, min_id, max_id, c :returns: content. """ - new_content = models.Content(transform_id=transform_id, coll_id=coll_id, map_id=map_id, + new_content = models.Content(request_id=request_id, workload_id=workload_id, + transform_id=transform_id, coll_id=coll_id, map_id=map_id, scope=scope, name=name, min_id=min_id, max_id=max_id, content_type=content_type, status=status, bytes=bytes, md5=md5, adler32=adler32, processing_id=processing_id, storage_id=storage_id, @@ -65,12 +69,15 @@ def create_content(transform_id, coll_id, map_id, scope, name, min_id, max_id, c @transactional_session -def add_content(transform_id, coll_id, map_id, scope, name, min_id=0, max_id=0, content_type=ContentType.File, status=ContentStatus.New, +def add_content(request_id, workload_id, transform_id, coll_id, map_id, scope, name, min_id=0, max_id=0, + content_type=ContentType.File, status=ContentStatus.New, bytes=0, md5=None, adler32=None, processing_id=None, storage_id=None, retries=0, locking=ContentLocking.Idle, path=None, expired_at=None, content_metadata=None, session=None): """ Add a content. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_id: transform id. :param coll_id: collection id. :param map_id: The id to map inputs to outputs. 
@@ -97,7 +104,8 @@ def add_content(transform_id, coll_id, map_id, scope, name, min_id=0, max_id=0, """ try: - new_content = create_content(transform_id=transform_id, coll_id=coll_id, map_id=map_id, + new_content = create_content(request_id=request_id, workload_id=workload_id, transform_id=transform_id, + coll_id=coll_id, map_id=map_id, scope=scope, name=name, min_id=min_id, max_id=max_id, content_type=content_type, status=status, bytes=bytes, md5=md5, adler32=adler32, processing_id=processing_id, storage_id=storage_id, @@ -126,7 +134,8 @@ def add_contents(contents, bulk_size=1000, session=None): :returns: content id. """ - default_params = {'transform_id': None, 'coll_id': None, 'map_id': None, + default_params = {'request_id': None, 'workload_id': None, + 'transform_id': None, 'coll_id': None, 'map_id': None, 'scope': None, 'name': None, 'min_id': 0, 'max_id': 0, 'content_type': ContentType.File, 'status': ContentStatus.New, 'locking': ContentLocking.Idle, @@ -266,7 +275,7 @@ def get_contents(scope=None, name=None, coll_id=None, status=None, to_json=False :param scope: The scope of the content data. :param name: The name of the content data. - :param coll_id: Collection id. + :param coll_id: list of Collection ids. :param to_json: return json format. :param session: The database session in use. @@ -282,15 +291,20 @@ def get_contents(scope=None, name=None, coll_id=None, status=None, to_json=False status = [status] if len(status) == 1: status = [status[0], status[0]] + if coll_id is not None: + if not isinstance(coll_id, (tuple, list)): + coll_id = [coll_id] + if len(coll_id) == 1: + coll_id = [coll_id[0], coll_id[0]] query = session.query(models.Content) if coll_id: - query = query.filter(models.Content.coll_id == coll_id) + query = query.filter(models.Content.coll_id.in_(coll_id)) if scope: query = query.filter(models.Content.scope == scope) if name: query = query.filter(models.Content.name.like(name.replace('*', '%'))) - if status: + if status is not None: query = query.filter(models.Content.status.in_(status)) tmp = query.all() diff --git a/main/lib/idds/orm/health.py b/main/lib/idds/orm/health.py new file mode 100644 index 00000000..9b4a23f6 --- /dev/null +++ b/main/lib/idds/orm/health.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 + + +""" +operations related to Health. +""" + +import datetime + +from sqlalchemy.exc import DatabaseError, IntegrityError + +from idds.common import exceptions +from idds.orm.base import models +from idds.orm.base.session import read_session, transactional_session + + +@transactional_session +def add_health_item(agent, hostname, pid, thread_id, thread_name, payload, session=None): + """ + Add a health item. + + :param agent: The agent name. + :param hostname: The hostname. + :param pid: The pid. + :param thread_id: The thread id. + :param thread_name: The thread name. + :param payload: The payload. + :param session: The database session. 
+ """ + + try: + counts = session.query(models.Health)\ + .filter(models.Health.agent == agent)\ + .filter(models.Health.hostname == hostname)\ + .filter(models.Health.pid == pid)\ + .filter(models.Health.thread_id == thread_id)\ + .update({'updated_at': datetime.datetime.utcnow()}) + if not counts: + new_h = models.Health(agent=agent, hostname=hostname, pid=pid, + thread_id=thread_id, thread_name=thread_name, + payload=payload) + new_h.save(session=session) + except DatabaseError as e: + raise exceptions.DatabaseException('Could not persist message: %s' % str(e)) + + +@read_session +def retrieve_health_items(session=None): + """ + Retrieve health items. + + :param session: The database session. + + :returns healths: List of dictionaries + """ + items = [] + try: + query = session.query(models.Health) + + tmp = query.all() + if tmp: + for t in tmp: + items.append(t.to_dict()) + return items + except IntegrityError as e: + raise exceptions.DatabaseException(e.args) + + +@transactional_session +def clean_health(older_than=3600, session=None): + """ + Clearn items which is older than the time. + + :param older_than in seconds + """ + + session.query(models.Health)\ + .filter(models.Health.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than))\ + .delete() diff --git a/main/lib/idds/orm/messages.py b/main/lib/idds/orm/messages.py index 59097f2f..c8fa36a5 100644 --- a/main/lib/idds/orm/messages.py +++ b/main/lib/idds/orm/messages.py @@ -25,13 +25,16 @@ @transactional_session -def add_message(msg_type, status, source, transform_id, num_contents, msg_content, bulk_size=None, session=None): +def add_message(msg_type, status, source, request_id, workload_id, transform_id, + num_contents, msg_content, bulk_size=None, session=None): """ Add a message to be submitted asynchronously to a message broker. :param msg_type: The type of the msg as a number, e.g., finished_stagein. :param status: The status about the message :param source: The source where the message is from. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_id: The transform id. :param num_contents: Number of items in msg_content. :param msg_content: The message msg_content as JSON. @@ -56,7 +59,8 @@ def add_message(msg_type, status, source, transform_id, num_contents, msg_conten msg_content_list.append(msg_content) for msg_content, num_contents in zip(msg_content_list, num_contents_list): - new_message = models.Message(msg_type=msg_type, status=status, transform_id=transform_id, + new_message = models.Message(msg_type=msg_type, status=status, request_id=request_id, + workload_id=workload_id, transform_id=transform_id, source=source, num_contents=num_contents, locking=0, msg_content=msg_content) new_message.save(session=session) diff --git a/main/lib/idds/orm/processings.py b/main/lib/idds/orm/processings.py index 80b9d63e..5b686503 100644 --- a/main/lib/idds/orm/processings.py +++ b/main/lib/idds/orm/processings.py @@ -25,12 +25,14 @@ from idds.orm.base import models -def create_processing(transform_id, status=ProcessingStatus.New, locking=ProcessingLocking.Idle, submitter=None, +def create_processing(request_id, workload_id, transform_id, status=ProcessingStatus.New, locking=ProcessingLocking.Idle, submitter=None, granularity=None, granularity_type=GranularityType.File, expired_at=None, processing_metadata=None, substatus=ProcessingStatus.New, output_metadata=None): """ Create a processing. + :param request_id: The request id. + :param workload_id: The workload id. 
:param transform_id: Transform id. :param status: processing status. :param locking: processing locking. @@ -42,7 +44,8 @@ def create_processing(transform_id, status=ProcessingStatus.New, locking=Process :returns: processing. """ - new_processing = models.Processing(transform_id=transform_id, status=status, substatus=substatus, locking=locking, + new_processing = models.Processing(request_id=request_id, workload_id=workload_id, transform_id=transform_id, + status=status, substatus=substatus, locking=locking, submitter=submitter, granularity=granularity, granularity_type=granularity_type, expired_at=expired_at, processing_metadata=processing_metadata, output_metadata=output_metadata) @@ -50,12 +53,15 @@ def create_processing(transform_id, status=ProcessingStatus.New, locking=Process @transactional_session -def add_processing(transform_id, status=ProcessingStatus.New, locking=ProcessingLocking.Idle, submitter=None, +def add_processing(request_id, workload_id, transform_id, status=ProcessingStatus.New, + locking=ProcessingLocking.Idle, submitter=None, granularity=None, granularity_type=GranularityType.File, expired_at=None, processing_metadata=None, output_metadata=None, session=None): """ Add a processing. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_id: Transform id. :param status: processing status. :param locking: processing locking. @@ -71,7 +77,8 @@ def add_processing(transform_id, status=ProcessingStatus.New, locking=Processing :returns: processing id. """ try: - new_processing = create_processing(transform_id=transform_id, status=status, locking=locking, submitter=submitter, + new_processing = create_processing(request_id=request_id, workload_id=workload_id, transform_id=transform_id, + status=status, locking=locking, submitter=submitter, granularity=granularity, granularity_type=granularity_type, expired_at=expired_at, processing_metadata=processing_metadata, output_metadata=output_metadata) new_processing.save(session=session) diff --git a/main/lib/idds/orm/requests.py b/main/lib/idds/orm/requests.py index 84932f7f..636ae4a7 100644 --- a/main/lib/idds/orm/requests.py +++ b/main/lib/idds/orm/requests.py @@ -17,6 +17,7 @@ import random import sqlalchemy +from sqlalchemy import and_ from sqlalchemy.exc import DatabaseError, IntegrityError from sqlalchemy.sql.expression import asc, desc @@ -196,7 +197,7 @@ def get_request(request_id, to_json=False, session=None): @read_session -def get_requests(request_id=None, workload_id=None, to_json=False, session=None): +def get_requests(request_id=None, workload_id=None, with_detail=False, to_json=False, session=None): """ Get a request or raise a NoObject exception. @@ -209,14 +210,36 @@ def get_requests(request_id=None, workload_id=None, to_json=False, session=None) :returns: Request. 
""" - try: - query = session.query(models.Request)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle') - if request_id: - query = query.filter(models.Request.request_id == request_id) + if not with_detail: + query = session.query(models.Request)\ + .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle') + if request_id: + query = query.filter(models.Request.request_id == request_id) + if workload_id: + query = query.filter(models.Request.workload_id == workload_id) else: - query = query.filter(models.Request.workload_id == workload_id) + subquery1 = session.query(models.Collection.coll_id, models.Collection.transform_id, + models.Collection.status, models.Collection.total_files, + models.Collection.processed_files).filter(models.Collection.relation_type == 0) + subquery2 = session.query(models.Collection.coll_id, models.Collection.transform_id, + models.Collection.status, models.Collection.total_files, + models.Collection.processed_files).filter(models.Collection.relation_type == 1) + + query = session.query(models.Request, + models.Transfrom.transform_id, models.Transform.status, + subquery1.c.status, subquery1.c.total_files, subquery1.c.processed_files, + subquery2.c.status, subquery2.c.total_files, subquery2.c.processed_files) + if request_id: + query = query.filter(models.Request.request_id == request_id) + if workload_id: + query = query.filter(models.Request.workload_id == workload_id) + + query = query.outerjoin(models.Workprogress, and_(models.Request.request_id == models.Workprogress.request_id)) + query = query.outerjoin(models.Workprogress2transform, and_(models.Workprogress.workprogress_id == models.Workprogress2transform.workprogress_id)) + query = query.outerjoin(subquery1, and_(subquery1.c.transform_id == models.Workprogress2transform.transform_id)) + query = query.outerjoin(subquery2, and_(subquery2.c.transform_id == models.Workprogress2transform.transform_id)) + query = query.order_by(asc(models.Request.request_id)) tmp = query.all() rets = [] diff --git a/main/lib/idds/orm/transforms.py b/main/lib/idds/orm/transforms.py index 8c8a5af6..368d05f7 100644 --- a/main/lib/idds/orm/transforms.py +++ b/main/lib/idds/orm/transforms.py @@ -26,11 +26,14 @@ from idds.orm.base import models -def create_transform(transform_type, transform_tag=None, priority=0, status=TransformStatus.New, locking=TransformLocking.Idle, +def create_transform(request_id, workload_id, transform_type, transform_tag=None, + priority=0, status=TransformStatus.New, locking=TransformLocking.Idle, retries=0, expired_at=None, transform_metadata=None): """ Create a transform. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_type: Transform type. :param transform_tag: Transform tag. :param priority: priority. @@ -42,18 +45,22 @@ def create_transform(transform_type, transform_tag=None, priority=0, status=Tran :returns: transform. 
""" - new_transform = models.Transform(transform_type=transform_type, transform_tag=transform_tag, priority=priority, + new_transform = models.Transform(request_id=request_id, workload_id=workload_id, transform_type=transform_type, + transform_tag=transform_tag, priority=priority, status=status, locking=locking, retries=retries, expired_at=expired_at, transform_metadata=transform_metadata) return new_transform @transactional_session -def add_transform(transform_type, transform_tag=None, priority=0, status=TransformStatus.New, locking=TransformLocking.Idle, +def add_transform(request_id, workload_id, transform_type, transform_tag=None, priority=0, + status=TransformStatus.New, locking=TransformLocking.Idle, retries=0, expired_at=None, transform_metadata=None, workprogress_id=None, session=None): """ Add a transform. + :param request_id: The request id. + :param workload_id: The workload id. :param transform_type: Transform type. :param transform_tag: Transform tag. :param priority: priority. @@ -69,7 +76,8 @@ def add_transform(transform_type, transform_tag=None, priority=0, status=Transfo :returns: transform id. """ try: - new_transform = create_transform(transform_type=transform_type, transform_tag=transform_tag, priority=priority, + new_transform = create_transform(request_id=request_id, workload_id=workload_id, transform_type=transform_type, + transform_tag=transform_tag, priority=priority, status=status, locking=locking, retries=retries, expired_at=expired_at, transform_metadata=transform_metadata) new_transform.save(session=session) @@ -214,17 +222,15 @@ def get_transform_ids(workprogress_id=None, request_id=None, workload_id=None, t :returns: list of transform ids. """ try: - if workload_id: - query = session.query(models.Req2transform.transform_id)\ - .join(models.Request, and_(models.Req2transform.request_id == models.Request.request_id, - models.Request.workload_id == workload_id)) - else: - query = session.query(models.Req2transform.transform_id) - + query = session.query(models.Transform.transform_id) if request_id: - query = query.filter(models.Req2transform.request_id == request_id) + query = query.filter(models.Transform.request_id == request_id) + if workload_id: + query = query.filter(models.Transform.workload_id == workload_id) if transform_id: - query = query.filter(models.Req2transform.transform_id == transform_id) + query = query.filter(models.Transform.transform_id == transform_id) + if workprogress_id: + query = query.join(models.Workprogress2transform, and_(models.Workprogress2transform.workprogress_id == workprogress_id)) tmp = query.all() ret_ids = [] @@ -240,7 +246,8 @@ def get_transform_ids(workprogress_id=None, request_id=None, workload_id=None, t @read_session -def get_transforms(request_id=None, workload_id=None, transform_id=None, to_json=False, session=None): +def get_transforms(request_id=None, workload_id=None, transform_id=None, workprogress_id=None, + to_json=False, session=None): """ Get transforms or raise a NoObject exception. @@ -254,21 +261,15 @@ def get_transforms(request_id=None, workload_id=None, transform_id=None, to_json :returns: list of transforms. 
""" try: - if workload_id: - subquery = session.query(models.Req2transform.transform_id)\ - .join(models.Request, and_(models.Req2transform.request_id == models.Request.request_id, - models.Request.workload_id == workload_id)) - else: - subquery = session.query(models.Req2transform.transform_id) - + query = session.query(models.Transform) if request_id: - subquery = subquery.filter(models.Req2transform.request_id == request_id) + query = query.filter(models.Transform.request_id == request_id) + if workload_id: + query = query.filter(models.Transform.workload_id == workload_id) if transform_id: - subquery = subquery.filter(models.Req2transform.transform_id == transform_id) - subquery = subquery.subquery() - - query = session.query(models.Transform)\ - .join(subquery, and_(subquery.c.transform_id == models.Transform.transform_id)) + query = query.filter(models.Transform.transform_id == transform_id) + if workprogress_id: + query = query.join(models.Workprogress2transform, and_(models.Workprogress2transform.workprogress_id == workprogress_id)) tmp = query.all() rets = [] diff --git a/main/lib/idds/orm/workprogress.py b/main/lib/idds/orm/workprogress.py index 22396338..43b0e3ef 100644 --- a/main/lib/idds/orm/workprogress.py +++ b/main/lib/idds/orm/workprogress.py @@ -25,12 +25,14 @@ from idds.orm.base import models -def create_workprogress(request_id, scope, name, priority=0, status=WorkprogressStatus.New, locking=WorkprogressLocking.Idle, - expired_at=None, errors=None, workprogress_metadata=None, processing_metadata=None): +def create_workprogress(request_id, workload_id, scope, name, priority=0, status=WorkprogressStatus.New, + locking=WorkprogressLocking.Idle, expired_at=None, errors=None, + workprogress_metadata=None, processing_metadata=None): """ Create a workprogress. :param request_id: The request id. + :param workload_id: The workload id. :param scope: The scope. :param name: The name. :param status: The status as integer. @@ -43,7 +45,8 @@ def create_workprogress(request_id, scope, name, priority=0, status=Workprogress :returns: workprogress. """ - new_wp = models.WorkProgress(request_id=request_id, scope=scope, name=name, priority=priority, status=status, + new_wp = models.WorkProgress(request_id=request_id, workload_id=workload_id, scope=scope, name=name, + priority=priority, status=status, locking=locking, expired_at=expired_at, workprogress_metadata=workprogress_metadata, processing_metadata=processing_metadata) @@ -51,13 +54,15 @@ def create_workprogress(request_id, scope, name, priority=0, status=Workprogress @transactional_session -def add_workprogress(request_id, scope, name, priority=0, status=WorkprogressStatus.New, locking=WorkprogressLocking.Idle, +def add_workprogress(request_id, workload_id, scope, name, priority=0, status=WorkprogressStatus.New, + locking=WorkprogressLocking.Idle, expired_at=None, errors=None, workprogress_metadata=None, processing_metadata=None, session=None): """ Add a workprogress. :param request_id: The request id. + :param workload_id: The workload id. :param scope: The scope. :param name: The name. :param status: The status as integer. 
@@ -75,7 +80,8 @@ def add_workprogress(request_id, scope, name, priority=0, status=WorkprogressSta """ try: - new_wp = create_workprogress(request_id=request_id, scope=scope, name=name, priority=priority, status=status, + new_wp = create_workprogress(request_id=request_id, workload_id=workload_id, scope=scope, name=name, + priority=priority, status=status, locking=locking, expired_at=expired_at, workprogress_metadata=workprogress_metadata, processing_metadata=processing_metadata) @@ -115,7 +121,7 @@ def add_workprogresses(workprogresses, bulk_size=1000, session=None): @read_session -def get_workprogresses(request_id, to_json=False, session=None): +def get_workprogresses(request_id=None, to_json=False, session=None): """ Get workprogresses with request_id. @@ -130,8 +136,9 @@ def get_workprogresses(request_id, to_json=False, session=None): try: query = session.query(models.Workprogress)\ - .with_hint(models.Workprogress, "INDEX(WORKPROGRESSES WORKPROGRESS_PK)", 'oracle')\ - .filter(models.Workprogress.request_id == request_id) + .with_hint(models.Workprogress, "INDEX(WORKPROGRESSES WORKPROGRESS_PK)", 'oracle') + if request_id is not None: + query = query.filter(models.Workprogress.request_id == request_id) tmp = query.all() rets = [] if tmp: diff --git a/main/lib/idds/rest/v1/catalog.py b/main/lib/idds/rest/v1/catalog.py index 17e8ad97..d199920a 100644 --- a/main/lib/idds/rest/v1/catalog.py +++ b/main/lib/idds/rest/v1/catalog.py @@ -6,25 +6,23 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019-2020 -import copy -import json import traceback from flask import Blueprint from idds.common import exceptions from idds.common.constants import HTTP_STATUS_CODE -from idds.core.catalog import get_collections, get_contents, register_output_contents, get_match_contents +from idds.core.catalog import get_collections, get_contents from idds.rest.v1.controller import IDDSController class Collections(IDDSController): """ Catalog """ - def get(self, scope, name, request_id, workload_id): + def get(self, scope, name, request_id, workload_id, relation_type): """ Get collections by scope, name, request_id and workload_id. HTTP Success: 200 OK @@ -47,8 +45,13 @@ def get(self, scope, name, request_id, workload_id): workload_id = None else: workload_id = int(workload_id) + if relation_type in ['null', 'None']: + relation_type = None + else: + relation_type = int(relation_type) - rets = get_collections(scope=scope, name=name, request_id=request_id, workload_id=workload_id, to_json=True) + rets = get_collections(scope=scope, name=name, request_id=request_id, workload_id=workload_id, + relation_type=relation_type, to_json=False) except exceptions.NoObject as error: return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) except exceptions.IDDSException as error: @@ -64,7 +67,7 @@ def get(self, scope, name, request_id, workload_id): class Contents(IDDSController): """ Catalog """ - def get(self, coll_scope, coll_name, request_id, workload_id, relation_type): + def get(self, coll_scope, coll_name, request_id, workload_id, relation_type, status): """ Get contents by coll_scope, coll_name, request_id, workload_id and relation_type. 
HTTP Success: 200 OK @@ -91,70 +94,13 @@ def get(self, coll_scope, coll_name, request_id, workload_id, relation_type): relation_type = None else: relation_type = int(relation_type) + if status in ['null', 'None']: + status = None + else: + status = int(status) rets = get_contents(coll_scope=coll_scope, coll_name=coll_name, request_id=request_id, - workload_id=workload_id, relation_type=relation_type, to_json=True) - except exceptions.NoObject as error: - return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) - except exceptions.IDDSException as error: - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error) - except Exception as error: - print(error) - print(traceback.format_exc()) - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - - return self.generate_http_response(HTTP_STATUS_CODE.OK, data=rets) - - -class Catalog(IDDSController): - """ Catalog """ - - def get(self, coll_scope, coll_name, scope, name, min_id, max_id, request_id, workload_id, only_return_best_match): - """ Get output contents by request id, workload id, coll_scope, coll_name, scope, name, min_id, max_id. - HTTP Success: - 200 OK - HTTP Error: - 404 Not Found - 500 InternalError - :returns: dictionary of an request. - """ - - try: - if coll_scope in ['null', 'None']: - coll_scope = None - if coll_name in ['null', 'None']: - coll_name = None - if scope in ['null', 'None']: - scope = None - if name in ['null', 'None']: - name = None - if min_id in ['null', 'None']: - min_id = None - else: - min_id = int(min_id) - if max_id in ['null', 'None']: - max_id = None - else: - max_id = int(max_id) - if request_id in ['null', 'None']: - request_id = None - else: - request_id = int(request_id) - if workload_id in ['null', 'None']: - workload_id = None - else: - workload_id = int(workload_id) - if only_return_best_match in ['null', 'None']: - only_return_best_match = None - else: - if only_return_best_match.lower() == 'true': - only_return_best_match = True - else: - only_return_best_match = False - - rets = get_match_contents(coll_scope=coll_scope, coll_name=coll_name, scope=scope, name=name, - min_id=min_id, max_id=max_id, request_id=request_id, to_json=True, - workload_id=workload_id, only_return_best_match=only_return_best_match) + workload_id=workload_id, relation_type=relation_type, status=status, to_json=False) except exceptions.NoObject as error: return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) except exceptions.IDDSException as error: @@ -166,54 +112,6 @@ def get(self, coll_scope, coll_name, scope, name, min_id, max_id, request_id, wo return self.generate_http_response(HTTP_STATUS_CODE.OK, data=rets) - def post(self, coll_scope, coll_name, request_id, workload_id): - """ register output contents. 
- HTTP Success: - 200 OK - HTTP Error: - 400 Bad request - 500 Internal Error - """ - kwargs = {'scope': None, 'name': None, 'min_id': None, 'max_id': None, - 'path': None, 'status': None} - try: - if coll_scope in ['null', 'None']: - coll_scope = None - if coll_name in ['null', 'None']: - coll_name = None - if request_id in ['null', 'None']: - request_id = None - else: - request_id = int(request_id) - if workload_id in ['null', 'None']: - workload_id = None - else: - workload_id = int(workload_id) - - contents = [] - parameters = self.get_request().data and json.loads(self.get_request().data) - if parameters: - for parameter in parameters: - content = copy.deepcopy(kwargs) - for key in kwargs: - if key in parameter: - content[key] = parameter[key] - contents.append(content) - register_output_contents(coll_scope=coll_scope, coll_name=coll_name, contents=contents, - request_id=request_id, workload_id=workload_id) - except ValueError: - return self.generate_http_response(HTTP_STATUS_CODE.BadRequest, exc_cls=exceptions.BadRequest.__name__, exc_msg='Cannot decode json parameter dictionary') - except exceptions.DuplicatedObject as error: - return self.generate_http_response(HTTP_STATUS_CODE.Conflict, exc_cls=error.__class__.__name__, exc_msg=error) - except exceptions.IDDSException as error: - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error) - except Exception as error: - print(error) - print(traceback.format_exc()) - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - - return self.generate_http_response(HTTP_STATUS_CODE.OK, data=None) - """---------------------- Web service url maps @@ -223,20 +121,14 @@ def post(self, coll_scope, coll_name, request_id, workload_id): def get_blueprint(): bp = Blueprint('catalog', __name__) - catalog_view = Catalog.as_view('catalog') + # catalog_view = Catalog.as_view('catalog') collections_view = Collections.as_view('collections') - bp.add_url_rule('/catalog/collections/<scope>/<name>/<request_id>/<workload_id>', + bp.add_url_rule('/catalog/collections/<scope>/<name>/<request_id>/<workload_id>/<relation_type>', view_func=collections_view, methods=['get', ]) # get collections contents_view = Contents.as_view('contents') - bp.add_url_rule('/catalog/contents/<coll_scope>/<coll_name>/<request_id>/<workload_id>/<relation_type>', + bp.add_url_rule('/catalog/contents/<coll_scope>/<coll_name>/<request_id>/<workload_id>/<relation_type>/<status>', view_func=contents_view, methods=['get', ]) # get contents - catalog_view = Catalog.as_view('catalog') - bp.add_url_rule('/catalog/<coll_scope>/<coll_name>/<scope>/<name>/<min_id>/<max_id>/<request_id>/<workload_id>/<only_return_best_match>', - view_func=catalog_view, methods=['get', ]) # get match contents - bp.add_url_rule('/catalog/<coll_scope>/<coll_name>/<request_id>/<workload_id>', - view_func=catalog_view, methods=['post', ]) - return bp diff --git a/main/lib/idds/rest/v1/hyperparameteropt.py b/main/lib/idds/rest/v1/hyperparameteropt.py index 2c20ef48..cadb93d3 100644 --- a/main/lib/idds/rest/v1/hyperparameteropt.py +++ b/main/lib/idds/rest/v1/hyperparameteropt.py @@ -15,8 +15,8 @@ from idds.common import exceptions from idds.common.constants import HTTP_STATUS_CODE -from idds.common.constants import ContentType -from idds.core import catalog, requests as core_requests +from idds.common.constants import CollectionRelationType, ContentStatus +from idds.core import catalog from idds.rest.v1.controller import IDDSController @@ -42,29 +42,28 @@ def put(self, workload_id, request_id, id, loss): if workload_id is None and request_id is None: error = "One of workload_id and request_id should not be None or empty" return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__,
exc_msg=error) - if not request_id: - request_ids = core_requests.get_request_ids_by_workload_id(workload_id) - if not request_ids: - error = "Cannot find requests with this workloa_id: %s" % workload_id - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - else: - if len(request_ids) > 1: - error = "More than one request with the same workload_id. request_id should be provided." - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - else: - request_id = request_ids[0] except Exception as error: print(error) print(format_exc()) return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) try: + contents = catalog.get_contents(request_id=request_id, workload_id=workload_id, + relation_type=CollectionRelationType.Output) + + if id: + new_contents = [] + for content in contents: + if str(content['name']) == str(id): + new_contents.append(content) + contents = new_contents + content = contents[0] + loss = float(loss) - content = catalog.get_output_content_by_request_id_content_name(request_id, content_scope='hpo', content_name=str(id), content_type=ContentType.PseudoContent, min_id=0, max_id=0) content_id = content['content_id'] point = content['path'] param, origin_loss = json.loads(point) - params = {'path': json.dumps((param, loss))} + params = {'path': json.dumps((param, loss)), 'substatus': ContentStatus.Available} catalog.update_content(content_id, params) except exceptions.NoObject as error: return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) @@ -90,34 +89,13 @@ def get(self, workload_id, request_id, id=None, status=None, limit=None): 500 InternalError :returns: list of hyper parameters. """ + try: if workload_id == 'null': workload_id = None if request_id == 'null': request_id = None - if id == 'null': - id = None - if workload_id is None and request_id is None: - error = "One of workload_id and request_id should not be None or empty" - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - if not request_id: - request_ids = core_requests.get_request_ids_by_workload_id(workload_id) - if not request_ids: - error = "Cannot find requests with this workloa_id: %s" % workload_id - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - else: - if len(request_ids) > 1: - error = "More than one request with the same workload_id. request_id should be provided." 
- return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - else: - request_id = request_ids[0] - except Exception as error: - print(error) - print(format_exc()) - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - - try: if status == 'null': status = None if limit == 'null': @@ -125,7 +103,19 @@ def get(self, workload_id, request_id, id=None, status=None, limit=None): if id == 'null': id = None - contents = catalog.get_output_contents_by_request_id_status(request_id, id, status, limit) + contents = catalog.get_contents(request_id=request_id, workload_id=workload_id, + status=status, relation_type=CollectionRelationType.Output) + + if id: + new_contents = [] + for content in contents: + if str(content['name']) == str(id): + new_contents.append(content) + contents = new_contents + + if contents and limit and len(contents) > limit: + contents = contents[:limit] + hyperparameters = [] for content in contents: point = content['path'] diff --git a/main/lib/idds/rest/v1/logs.py b/main/lib/idds/rest/v1/logs.py index 9c22fdd9..f4cfd9be 100644 --- a/main/lib/idds/rest/v1/logs.py +++ b/main/lib/idds/rest/v1/logs.py @@ -15,10 +15,8 @@ from idds.common import exceptions from idds.common.constants import HTTP_STATUS_CODE -from idds.common.utils import tar_zip_files -from idds.core import (requests as core_requests, - transforms as core_transforms, - processings as core_processings) +from idds.common.utils import tar_zip_files, get_rest_cacher_dir +from idds.core import (transforms as core_transforms) from idds.rest.v1.controller import IDDSController @@ -48,38 +46,33 @@ def get(self, workload_id, request_id): if workload_id is None and request_id is None: error = "One of workload_id and request_id should not be None or empty" return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - if not request_id: - request_ids = core_requests.get_request_ids_by_workload_id(workload_id) - if not request_ids: - error = "Cannot find requests with this workloa_id: %s" % workload_id - return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - else: - if len(request_ids) > 1: - error = "More than one request with the same workload_id. request_id should be provided." 
- return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - else: - request_id = request_ids[0] except Exception as error: print(error) print(format_exc()) return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) try: - transform_ids = core_transforms.get_transform_ids(request_id) - files = [] - for transform_id in transform_ids: - processings = core_processings.get_processings_by_transform_id(transform_id) - for processing in processings: - processing_metadata = processing['processing_metadata'] - if processing_metadata and 'job_logs_tar' in processing_metadata and processing_metadata['job_logs_tar']: - files.append(processing_metadata['job_logs_tar']) - if not files: + transforms = core_transforms.get_transforms(request_id=request_id, workload_id=workload_id) + workdirs = [] + for transform in transforms: + work = transform['transform_metadata']['work'] + workdir = work.get_workdir() + if workdir: + workdirs.append(workdir) + if not workdirs: error = "No log files founded." return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) - cache_dir = os.path.dirname(files[0]) - output_filename = "%s.logs.tar.gz" % request_id - tar_zip_files(cache_dir, output_filename, files) + cache_dir = get_rest_cacher_dir() + if request_id and workload_id: + output_filename = "request_%s.workload_%s.logs.tar.gz" % (request_id, workload_id) + elif request_id: + output_filename = "request_%s.logs.tar.gz" % (request_id) + elif workload_id: + output_filename = "workload_%s.logs.tar.gz" % (workload_id) + else: + output_filename = "%s.logs.tar.gz" % (os.path.basename(cache_dir)) + tar_zip_files(cache_dir, output_filename, workdirs) return send_from_directory(cache_dir, output_filename, as_attachment=True, mimetype='application/x-tgz') except exceptions.NoObject as error: return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) diff --git a/main/lib/idds/rest/v1/requests.py b/main/lib/idds/rest/v1/requests.py index a325a62f..72e1a2e4 100644 --- a/main/lib/idds/rest/v1/requests.py +++ b/main/lib/idds/rest/v1/requests.py @@ -20,6 +20,8 @@ from idds.core.requests import add_request, get_requests, update_request from idds.rest.v1.controller import IDDSController +from idds.rest.v1.utils import convert_old_req_2_workflow_req + class Requests(IDDSController): """ Get request """ @@ -78,6 +80,7 @@ def post(self): return self.generate_http_response(HTTP_STATUS_CODE.BadRequest, exc_cls=exceptions.BadRequest.__name__, exc_msg='Cannot decode json parameter dictionary') try: + parameters = convert_old_req_2_workflow_req(parameters) request_id = add_request(**parameters) except exceptions.DuplicatedObject as error: return self.generate_http_response(HTTP_STATUS_CODE.Conflict, exc_cls=error.__class__.__name__, exc_msg=error) @@ -119,7 +122,7 @@ def put(self, request_id): return self.generate_http_response(HTTP_STATUS_CODE.OK, data={'status': 0, 'message': 'update successfully'}) - def get(self, request_id, workload_id): + def get(self, request_id, workload_id, with_detail): """ Get details about a specific Request with given id. 
HTTP Success: 200 OK @@ -134,9 +137,13 @@ def get(self, request_id, workload_id): request_id = None if workload_id == 'null': workload_id = None + if with_detail and with_detail.lower() in ['true']: + with_detail = True + else: + with_detail = False # reqs = get_requests(request_id=request_id, workload_id=workload_id, to_json=True) - reqs = get_requests(request_id=request_id, workload_id=workload_id) + reqs = get_requests(request_id=request_id, workload_id=workload_id, with_detail=with_detail) except exceptions.NoObject as error: return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) except exceptions.IDDSException as error: @@ -166,5 +173,5 @@ def get_blueprint(): request_view = Request.as_view('request') bp.add_url_rule('/request', view_func=request_view, methods=['post', ]) bp.add_url_rule('/request/<request_id>', view_func=request_view, methods=['put', ]) - bp.add_url_rule('/request/<request_id>/<workload_id>', view_func=request_view, methods=['get', ]) + bp.add_url_rule('/request/<request_id>/<workload_id>/<with_detail>', view_func=request_view, methods=['get', ]) return bp diff --git a/main/lib/idds/rest/v1/utils.py b/main/lib/idds/rest/v1/utils.py new file mode 100644 index 00000000..d277d15e --- /dev/null +++ b/main/lib/idds/rest/v1/utils.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 + +from idds.common.constants import RequestType, RequestStatus + +from idds.workflow.workflow import Workflow + +# from idds.atlas.workflow.atlasstageinwork import ATLASStageInWork +# from idds.atlas.workflow.atlashpowork import ATLASHPOWork + + +def convert_stagein_request_metadata_to_workflow(scope, name, workload_id, request_metadata): + """ + Convert old format stagein request metadata of json to new format request metadata based on workflow. + + :param scope: The collection scope. + :param name: The collection name. + :param workload_id: The workload id. + :param request_metadata: The request metadata. + """ + # 'request_metadata': {'workload_id': '20776840', 'max_waiting_time': 3600, 'src_rse': 'NDGF-T1_DATATAPE', 'dest_rse': 'NDGF-T1_DATADISK', 'rule_id': '236e4bf87e11490291e3259b14724e30'} # noqa: E501 + + from idds.atlas.workflow.atlasstageinwork import ATLASStageinWork + + work = ATLASStageinWork(executable=None, arguments=None, parameters=None, setup=None, + exec_type='local', sandbox=None, + primary_input_collection={'scope': scope, 'name': name}, + other_input_collections=None, + output_collections={'scope': scope, 'name': name + '.idds.stagein'}, + log_collections=None, + logger=None, + max_waiting_time=request_metadata.get('max_waiting_time', 3600 * 7 * 24), + src_rse=request_metadata.get('src_rse', None), + dest_rse=request_metadata.get('dest_rse', None), + rule_id=request_metadata.get('rule_id', None)) + wf = Workflow() + wf.set_workload_id(workload_id) + wf.add_work(work) + # work.set_workflow(wf) + return wf + + +def convert_hpo_request_metadata_to_workflow(scope, name, workload_id, request_metadata): + """ + Convert old format hpo request metadata of json to new format request metadata based on workflow. + + :param scope: The collection scope. + :param name: The collection name. + :param workload_id: The workload id. + :param request_metadata: The request metadata.
+ """ + # 'request_metadata': {'workload_id': '20525134', 'sandbox': None, 'method': 'bayesian', 'opt_space': {'A': (1, 4), 'B': (1, 10)}, 'initial_points': [({'A': 1, 'B': 2}, 0.3), ({'A': 1, 'B': 3}, None)], 'max_points': 20, 'num_points_per_generation': 10} # noqa: E501 + # 'request_metadata': {'workload_id': '20525135', 'sandbox': None, 'method': 'nevergrad', 'opt_space': {"A": {"type": "Choice", "params": {"choices": [1, 4]}}, "B": {"type": "Scalar", "bounds": [0, 5]}}, 'initial_points': [({'A': 1, 'B': 2}, 0.3), ({'A': 1, 'B': 3}, None)], 'max_points': 20, 'num_points_per_generation': 10} # noqa: E501 + # 'request_metadata': {'workload_id': '20525134', 'sandbox': 'wguanicedew/idds_hpo_nevergrad', 'workdir': '/data', 'executable': 'docker', 'arguments': 'python /opt/hyperparameteropt_nevergrad.py --max_points=%MAX_POINTS --num_points=%NUM_POINTS --input=/data/%IN --output=/data/%OUT', 'output_json': 'output.json', 'opt_space': {"A": {"type": "Choice", "params": {"choices": [1, 4]}}, "B": {"type": "Scalar", "bounds": [0, 5]}}, 'initial_points': [({'A': 1, 'B': 2}, 0.3), ({'A': 1, 'B': 3}, None)], 'max_points': 20, 'num_points_per_generation': 10} # noqa: E501 + + from idds.atlas.workflow.atlashpowork import ATLASHPOWork + + work = ATLASHPOWork(executable=request_metadata.get('executable', None), + arguments=request_metadata.get('arguments', None), + parameters=request_metadata.get('parameters', None), + setup=None, exec_type='local', + sandbox=request_metadata.get('sandbox', None), + method=request_metadata.get('method', None), + container_workdir=request_metadata.get('workdir', None), + output_json=request_metadata.get('output_json', None), + opt_space=request_metadata.get('opt_space', None), + workload_id=workload_id, + initial_points=request_metadata.get('initial_points', None), + max_points=request_metadata.get('max_points', None), + num_points_per_iteration=request_metadata.get('num_points_per_iteration', 10)) + wf = Workflow() + wf.set_workload_id(workload_id) + wf.add_work(work) + return wf + + +def convert_old_req_2_workflow_req(data): + if not data: + return data + + if data['request_type'] == RequestType.Workflow: + return data + + workload_id = None + if 'workload_id' in data and data['workload_id']: + workload_id = data['workload_id'] + elif 'workload_id' in data['request_metadata'] and data['request_metadata']['workload_id']: + workload_id = data['request_metadata']['workload_id'] + + if data['request_type'] in [RequestType.StageIn, RequestType.StageIn.value]: + wf = convert_stagein_request_metadata_to_workflow(data['scope'], data['name'], workload_id, + data['request_metadata']) + data['request_type'] = RequestType.Workflow + data['transform_tag'] = 'workflow' + data['status'] = RequestStatus.New + data['workload_id'] = wf.get_workload_id() + data['request_metadata'] = {'workload_id': wf.get_workload_id(), + 'workflow': wf} + return data + if data['request_type'] in [RequestType.HyperParameterOpt, RequestType.HyperParameterOpt.value]: + wf = convert_hpo_request_metadata_to_workflow(data['scope'] if 'scope' in data else None, + data['name'] if 'name' in data else None, + workload_id, + data['request_metadata']) + primary_init_work = wf.get_primary_initial_collection() + if primary_init_work: + data['scope'] = primary_init_work['scope'] + data['name'] = primary_init_work['name'] + + data['request_type'] = RequestType.Workflow + data['transform_tag'] = 'workflow' + data['status'] = RequestStatus.New + data['workload_id'] = wf.get_workload_id() + data['request_metadata'] = 
{'workload_id': wf.get_workload_id(), + 'workflow': wf} + return data + return data diff --git a/main/lib/idds/tests/catalog_test.py b/main/lib/idds/tests/catalog_test.py new file mode 100644 index 00000000..78107758 --- /dev/null +++ b/main/lib/idds/tests/catalog_test.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 + + +""" +Test hyper parameter optimization test client. +""" + +from idds.client.client import Client +from idds.common.constants import CollectionRelationType, ContentStatus +from idds.common.utils import get_rest_host + + +# host = "https://aipanda181.cern.ch:443/idds" +host = get_rest_host() + +client = Client(host=host) +# props['request_metadata']['result_parser'] = 'default' + +scope = 'data16_13TeV' +name = 'data16_13TeV.00298862.physics_Main.daq.RAW.idds.stagein' +request_id = 12 +workload_id = 1601235010 +relation_type = CollectionRelationType.Output # Input, Log +status = ContentStatus.Available # New, Processing, Available, ... + +colls = client.get_collections(scope=scope, name=name, request_id=request_id, workload_id=workload_id, relation_type=relation_type) +print(colls) +# example outputs +# [{'relation_type': , 'next_poll_at': 'Sun, 27 Sep 2020 19:30:16 UTC', 'storage_id': None, 'scope': 'data16_13TeV', 'accessed_at': 'Sun, 27 Sep 2020 21:31:40 UTC', 'new_files': 0, 'name': 'data16_13TeV.00298862.physics_Main.daq.RAW.idds.stagein', 'expired_at': 'Tue, 27 Oct 2020 19:30:10 UTC', 'processed_files': 0, 'bytes': 0, 'coll_metadata': {'internal_id': 'db1bb0dc-00f7-11eb-a5d2-fa163eb98fd2'}, 'request_id': 12, 'processing_files': 0, 'status': , 'coll_id': 16, 'processing_id': None, 'substatus': None, 'workload_id': 1601235010, 'retries': 0, 'locking': , 'coll_type': , 'created_at': 'Sun, 27 Sep 2020 19:30:16 UTC', 'total_files': 0, 'transform_id': 9, 'updated_at': 'Sun, 27 Sep 2020 21:31:40 UTC'}] # noqa: E501 + +contents = client.get_contents(coll_scope=scope, coll_name=name, request_id=request_id, workload_id=workload_id, relation_type=relation_type, status=status) +for content in contents: + # print(content) + pass + +# example outputs +# {'substatus': , 'transform_id': 9, 'path': None, 'locking': , 'map_id': 8895, 'created_at': 'Sun, 27 Sep 2020 19:30:40 UTC', 'bytes': 534484132, 'scope': 'data16_13TeV', 'updated_at': 'Sun, 27 Sep 2020 19:44:55 UTC', 'md5': None, 'name': 'data16_13TeV.00298862.physics_Main.daq.RAW._lb0538._SFO-5._0003.data', 'accessed_at': 'Sun, 27 Sep 2020 19:44:56 UTC', 'adler32': 'e8543989', 'content_id': 2361650, 'min_id': 0, 'expired_at': 'Tue, 27 Oct 2020 19:30:26 UTC', 'coll_id': 16, 'processing_id': None, 'max_id': 579, 'content_metadata': {'events': 579}, 'storage_id': None, 'request_id': 12, 'content_type': , 'workload_id': 1601235010, 'retries': 0, 'status': } # noqa: E501 diff --git a/main/lib/idds/tests/migrating_requests_v1_to_v2.py b/main/lib/idds/tests/migrating_requests_v1_to_v2.py new file mode 100644 index 00000000..934a6068 --- /dev/null +++ b/main/lib/idds/tests/migrating_requests_v1_to_v2.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2020 + +from idds.common.utils import get_rest_host +from idds.core.requests import get_requests +from idds.workflow.workflow import Workflow + +from idds.atlas.workflow.atlasstageinwork import ATLASStageinWork + +from idds.client.clientmanager import ClientManager + + +def convert_req2reqv2(req): + # v1: {'created_at': datetime.datetime(2020, 11, 3, 10, 9, 32), 'substatus': None, 'priority': 0, 'transform_tag': '2', 'requester': 'panda', 'request_metadata': {'workload_id': 23083304, 'rule_id': 'bef3da17f17c49ac97863bb9e96af672'}, 'name': 'valid1.361027.Pythia8EvtGen_A14NNPDF23LO_jetjet_JZ7W.simul.HITS.e5112_s3227_tid12560193_00', 'request_id': 3775, 'accessed_at': datetime.datetime(2020, 11, 3, 10, 9, 32), 'updated_at': datetime.datetime(2020, 11, 3, 10, 9, 32), 'locking': , 'status': , 'workload_id': 23083304, 'request_type': , 'errors': None, 'processing_metadata': None, 'scope': 'valid1', 'expired_at': datetime.datetime(2020, 12, 3, 10, 9, 32), 'next_poll_at': datetime.datetime(2020, 11, 3, 10, 9, 32)} # noqa E501 + + request_metadata = req['request_metadata'] + work = ATLASStageinWork(executable=None, arguments=None, parameters=None, setup=None, + exec_type='local', sandbox=None, + primary_input_collection={'scope': req['scope'], 'name': req['name']}, + other_input_collections=None, + output_collections={'scope': req['scope'], 'name': req['name'] + '.idds.stagein'}, + log_collections=None, + logger=None, + max_waiting_time=request_metadata.get('max_waiting_time', 3600 * 7 * 24), + src_rse=request_metadata.get('src_rse', None), + dest_rse=request_metadata.get('dest_rse', None), + rule_id=request_metadata.get('rule_id', None)) + + workload_id = req['workload_id'] + if not workload_id and 'workload_id' in request_metadata: + workload_id = request_metadata['workload_id'] + + wf = Workflow() + wf.set_workload_id(workload_id) + wf.add_work(work) + + host = get_rest_host() + wm = ClientManager(host=host) + request_id = wm.submit(wf) + # print(request_id) + return request_id + + +reqs = get_requests() +print(len(reqs)) +for req in reqs: + # if req['request_id'] in [3743, 3755, 3769, 3775]: + if req['request_id'] in [3787, 3789, 3791]: + print(req) + # new_req_id = convert_req2reqv2(req) + # print("convert old request %s to new request %s" % (req['request_id'], new_req_id)) + # print(req['request_metadata']['workflow'].to_dict()) + pass diff --git a/main/lib/idds/tests/test_activelearning.py b/main/lib/idds/tests/test_activelearning.py new file mode 100644 index 00000000..1589e44d --- /dev/null +++ b/main/lib/idds/tests/test_activelearning.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2021 + + +""" +Test client. 
+""" +import re +import time +# import traceback + +# from idds.client.client import Client +from idds.client.clientmanager import ClientManager +# from idds.common.constants import RequestType, RequestStatus +from idds.common.utils import get_rest_host, run_command +# from idds.common.utils import json_dumps +# from idds.tests.common import get_example_real_tape_stagein_request +# from idds.tests.common import get_example_prodsys2_tape_stagein_request + +# from idds.workflow.work import Work, Parameter, WorkStatus +from idds.workflow.workflow import Condition, Workflow +# from idds.workflow.workflow import Workflow +# from idds.atlas.workflow.atlasstageinwork import ATLASStageinWork +from idds.atlas.workflow.atlaspandawork import ATLASPandaWork +from idds.atlas.workflow.atlasactuatorwork import ATLASActuatorWork + + +def get_task_id(output, error): + m = re.search('jediTaskID=(\d+)', output + error) # noqa W605 + task_id = int(m.group(1)) + return task_id + + +def submit_processing_task(): + cmd = "cd /afs/cern.ch/user/w/wguan/workdisk/iDDS/test/activelearning/hepexcursion/grid; prun --exec 'python simplescript.py 0.5 0.5 200 output.json' --outDS user.wguan.altest123456 --outputs output.json --nJobs=10" + status, output, error = run_command(cmd) + """ + print("status:") + print(status) + print("output:") + print(output) + print("error:") + print(error) + + status: + 0 + output: + + error: + INFO : gathering files under /afs/cern.ch/work/w/wguan/iDDS/test/activelearning/hepexcursion/grid + INFO : upload source files + INFO : submit user.wguan.altest1234/ + INFO : succeeded. new jediTaskID=23752996 + """ + if status == 0: + task_id = get_task_id(output, error) + return task_id + else: + raise Exception(output + error) + + +def test_panda_work(panda_task_id): + cmd_to_arguments = {'arguments': 'python simplescript.py 0.5 0.5 200', + 'parameters': 'python simplescript.py {m1} {m2} {nevents}', + 'outDS': 'user.wguan.altest123456'} + work = ATLASPandaWork(panda_task_id=panda_task_id, cmd_to_arguments=cmd_to_arguments) + work.initialize_work() + print(work.__class__.__name__) + print('sandbox: %s' % work.sandbox) + print('output_collections: %s' % str(work.get_output_collections())) + + print("new work") + test_work = work.generate_work_from_template() + test_work.initialize_work() + test_work.set_parameters({'m1': 0.5, 'm2': 0.5, 'nevents': 100}) + test_work.parse_arguments() + # print(json_dumps(test_work, sort_keys=True, indent=4)) + # print('output_collections: %s' % str(test_work.get_output_collections())) + # print(json_dumps(test_work, sort_keys=True, indent=4)) + + # from pandatools import Client + # Client.getJediTaskDetails(taskDict,fullFlag,withTaskInfo,verbose=False) + # ret = Client.getJediTaskDetails({'jediTaskID': panda_task_id},False,True) + # print(ret) + + +def get_workflow(panda_task_id): + cmd_to_arguments = {'arguments': 'python simplescript.py 0.5 0.5 200', + 'parameters': 'python simplescript.py {m1} {m2} {nevents}', + 'outDS': 'user.wguan.altest123456'} + work = ATLASPandaWork(panda_task_id=panda_task_id, cmd_to_arguments=cmd_to_arguments) + + # it's needed to parse the panda task parameter information, for example output dataset name, for the next task. + # if the information is not needed, you don't need to run it manually. iDDS will call it interally to parse the information. 
+ work.initialize_work() + + work_output_coll = work.get_output_collections()[0] + + input_coll = {'scope': work_output_coll['scope'], + 'name': work_output_coll['name'], + 'coll_metadata': {'force_close': True}} + output_coll = {'scope': work_output_coll['scope'], + 'name': work_output_coll['name'] + "." + str(int(time.time()))} + + # acutator = ATLASActuatorWork(executable='python', arguments='merge.py {output_json} {events} {dataset}/{filename}', + acutator = ATLASActuatorWork(executable='python', arguments='merge.py {output_json} {events} {dataset}', + parameters={'output_json': 'merge.json', + 'events': 200, + 'dataset': '{scope}:{name}'.format(**input_coll), + 'filename': 'output*.json'}, + sandbox=work.sandbox, primary_input_collection=input_coll, + output_collections=output_coll, output_json='merge.json') + wf = Workflow() + # because the two tasks are in a loop. It's good to set which one to start. + wf.add_work(work) + wf.add_work(acutator) + cond = Condition(work.is_finished, current_work=work, true_work=acutator, false_work=None) + wf.add_condition(cond) + cond1 = Condition(acutator.generate_new_task, current_work=acutator, true_work=work, false_work=None) + wf.add_condition(cond1) + + # because the two works are in a loop, they are not independent. This call is needed to tell which one to start. + # otherwise idds will use the first one to start. + wf.add_initial_works(work) + + # work.set_workflow(wf) + return wf + + +if __name__ == '__main__': + host = get_rest_host() + # panda_task_id = submit_processing_task() + # panda_task_id = 23752996 + # panda_task_id = 23810059 + panda_task_id = 23818866 + print(panda_task_id) + test_panda_work(panda_task_id) + workflow = get_workflow(panda_task_id) + wm = ClientManager(host=host) + request_id = wm.submit(workflow) + print(request_id) diff --git a/main/lib/idds/tests/test_datacarousel.py b/main/lib/idds/tests/test_datacarousel.py index 6a1378a9..fae340e7 100644 --- a/main/lib/idds/tests/test_datacarousel.py +++ b/main/lib/idds/tests/test_datacarousel.py @@ -19,7 +19,7 @@ from rucio.common.exception import CannotAuthenticate # from idds.client.client import Client -from idds.client.workflowmanager import WorkflowManager +from idds.client.clientmanager import ClientManager from idds.common.constants import RequestType, RequestStatus from idds.common.utils import get_rest_host # from idds.tests.common import get_example_real_tape_stagein_request @@ -102,7 +102,7 @@ def get_workflow(): other_input_collections=None, output_collections={'scope': scope, 'name': name + '.idds.stagein'}, log_collections=None, - workflow=None, logger=None, + logger=None, max_waiting_time=3600 * 7 * 24, src_rse=src_rse, dest_rse=dest_rse, rule_id=rule_id) wf = Workflow() wf.add_work(work) @@ -132,6 +132,6 @@ def pre_check(req): # props = pre_check(props) # print(props) - wm = WorkflowManager(host=host) + wm = ClientManager(host=host) request_id = wm.submit(workflow) print(request_id) diff --git a/main/lib/idds/tests/test_domalsst.py b/main/lib/idds/tests/test_domalsst.py new file mode 100644 index 00000000..b3f74719 --- /dev/null +++ b/main/lib/idds/tests/test_domalsst.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2020 + + +""" +Test client. 
+""" + +# import traceback + +# from rucio.client.client import Client as Rucio_Client +# from rucio.common.exception import CannotAuthenticate + +# from idds.client.client import Client +from idds.client.clientmanager import ClientManager +# from idds.common.constants import RequestType, RequestStatus +from idds.common.utils import get_rest_host +# from idds.tests.common import get_example_real_tape_stagein_request +# from idds.tests.common import get_example_prodsys2_tape_stagein_request + +# from idds.workflow.work import Work, Parameter, WorkStatus +# from idds.workflow.workflow import Condition, Workflow +from idds.workflow.workflow import Condition, Workflow +# from idds.atlas.workflow.atlasstageinwork import ATLASStageinWork +from idds.doma.workflow.domalsstwork import DomaLSSTWork +import string, random + +def randStr(chars=string.ascii_lowercase + string.digits, N=10): + return ''.join(random.choice(chars) for _ in range(N)) + +class LSSTTask(object): + name = None + step = None + dependencies = [] + +def setup_workflow(): + + taskN1 = LSSTTask() + taskN1.step = "step1" + taskN1.name = taskN1.step + "_" + randStr() + taskN1.dependencies = [ + {"name": "00000"+str(k), + "dependencies":[], + "submitted": False} for k in range(6) + ] + + taskN2 = LSSTTask() + taskN2.step = "step2" + taskN2.name = taskN2.step + "_" + randStr() + taskN2.dependencies = [ + { + "name": "000010", + "dependencies":[{"task":taskN1.name, "inputname": "000001", "available": False},{"task":taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + }, + { + "name": "000011", + "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, {"task": taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + }, + { + "name": "000012", + "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, {"task": taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + } + ] + + taskN3 = LSSTTask() + taskN3.step = "step3" + taskN3.name = taskN3.step + "_" + randStr() + taskN3.dependencies = [ + { + "name": "000020", + "dependencies":[], + "submitted": False + }, + { + "name": "000021", + "dependencies": [{"task": taskN2.name, "inputname": "000010", "available": False}, {"task": taskN2.name, "inputname": "000011", "available": False}], + "submitted": False + }, + { + "name": "000022", + "dependencies": [{"task": taskN2.name, "inputname": "000011", "available": False}, {"task": taskN2.name, "inputname": "000012", "available": False}], + "submitted": False + }, + { + "name": "000023", + "dependencies":[], + "submitted": False + }, + { + "name": "000024", + "dependencies": [{"task": taskN3.name, "inputname": "000021", "available": False}, {"task": taskN3.name, "inputname": "000023", "available": False}], + "submitted": False + }, + ] + + work1 = DomaLSSTWork(executable='echo', + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], dependency_map=taskN1.dependencies, task_name=taskN1.name) + work2 = DomaLSSTWork(executable='echo', + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#2'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#2'}], + log_collections=[], dependency_map=taskN2.dependencies, task_name=taskN2.name) + work3 = DomaLSSTWork(executable='echo', + primary_input_collection={'scope': 
'pseudo_dataset', 'name': 'pseudo_input_collection#3'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#3'}], + log_collections=[], dependency_map=taskN3.dependencies, task_name=taskN3.name) + + workflow = Workflow() + workflow.add_work(work1) + #workflow.add_work(work2) + #workflow.add_work(work3) + return workflow + + +if __name__ == '__main__': + host = get_rest_host() + workflow = setup_workflow() + + wm = ClientManager(host=host) + request_id = wm.submit(workflow) + print(request_id) diff --git a/main/lib/idds/tests/test_hyperparameteropt.py b/main/lib/idds/tests/test_hyperparameteropt.py new file mode 100644 index 00000000..f0bd0db2 --- /dev/null +++ b/main/lib/idds/tests/test_hyperparameteropt.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2020 + + +""" +Test client. +""" + + +from idds.client.clientmanager import ClientManager +from idds.common.utils import get_rest_host + + +def get_workflow(): + from idds.workflow.workflow import Workflow + from idds.atlas.workflow.atlashpowork import ATLASHPOWork + + # request_metadata for predefined method 'nevergrad' + request_metadata = {'workload_id': '20525135', 'sandbox': None, 'method': 'nevergrad', 'opt_space': {"A": {"type": "Choice", "params": {"choices": [1, 4]}}, "B": {"type": "Scalar", "bounds": [0, 5]}}, 'initial_points': [({'A': 1, 'B': 2}, 0.3), ({'A': 1, 'B': 3}, None)], 'max_points': 20, 'num_points_per_generation': 10} # noqa E501 + + # request_metadata for docker method + request_metadata = {'workload_id': '20525134', 'sandbox': 'wguanicedew/idds_hpo_nevergrad', 'workdir': '/data', 'executable': 'docker', 'arguments': 'python /opt/hyperparameteropt_nevergrad.py --max_points=%MAX_POINTS --num_points=%NUM_POINTS --input=/data/%IN --output=/data/%OUT', 'output_json': 'output.json', 'opt_space': {"A": {"type": "Choice", "params": {"choices": [1, 4]}}, "B": {"type": "Scalar", "bounds": [0, 5]}}, 'initial_points': [({'A': 1, 'B': 2}, 0.3), ({'A': 1, 'B': 3}, None)], 'max_points': 20, 'num_points_per_generation': 10} # noqa E501 + + work = ATLASHPOWork(executable=request_metadata.get('executable', None), + arguments=request_metadata.get('arguments', None), + parameters=request_metadata.get('parameters', None), + setup=None, exec_type='local', + sandbox=request_metadata.get('sandbox', None), + method=request_metadata.get('method', None), + container_workdir=request_metadata.get('workdir', None), + output_json=request_metadata.get('output_json', None), + opt_space=request_metadata.get('opt_space', None), + initial_points=request_metadata.get('initial_points', None), + max_points=request_metadata.get('max_points', None), + num_points_per_iteration=request_metadata.get('num_points_per_iteration', 10)) + wf = Workflow() + wf.set_workload_id(request_metadata.get('workload_id', None)) + wf.add_work(work) + return wf + + +if __name__ == '__main__': + host = get_rest_host() + # props = get_req_properties() + # props = get_example_real_tape_stagein_request() + # props = get_example_prodsys2_tape_stagein_request() + # props = get_example_active_learning_request() + workflow = get_workflow() + + # props = pre_check(props) + # print(props) + + wm = ClientManager(host=host) + request_id = wm.submit(workflow) + print(request_id) diff --git 
a/main/lib/idds/version.py b/main/lib/idds/version.py index 3b9ff3e9..d9044964 100644 --- a/main/lib/idds/version.py +++ b/main/lib/idds/version.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2021 -release_version = "0.0.5" +release_version = "0.1.0" diff --git a/main/tools/env/environment.yml b/main/tools/env/environment.yml index 6ca27308..5ed4a79b 100644 --- a/main/tools/env/environment.yml +++ b/main/tools/env/environment.yml @@ -22,3 +22,6 @@ dependencies: - recommonmark # use Markdown with Sphinx - sphinx-rtd-theme # sphinx readthedoc theme - nevergrad # nevergrad hyper parameter optimization + - idds-common + - idds-workflow + - idds-client diff --git a/main/tools/git/configure_git.sh b/main/tools/git/configure_git.sh new file mode 100755 index 00000000..2c8affac --- /dev/null +++ b/main/tools/git/configure_git.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# + + +cp tools/git/pre-commit .git/hooks/pre-commit +chmod +x .git/hooks/pre-commit +cp tools/git/prepare-commit-msg .git/hooks/prepare-commit-msg +chmod +x .git/hooks/prepare-commit-msg diff --git a/main/tools/git/create-dev-branch b/main/tools/git/create-dev-branch new file mode 100755 index 00000000..362cf3a3 --- /dev/null +++ b/main/tools/git/create-dev-branch @@ -0,0 +1,29 @@ +#!/bin/bash + +if [[ $# -ne 1 ]] ; then + echo 'usage: tools/create-dev-branch ' + echo + echo 'examples: tools/create-dev-branch fancydev' + echo ' tools/create-dev-branch "my fancy dev"' + exit 1 +fi + +echo "Switching to dev" +git checkout dev + +echo "Updating dev" +git pull --all --prune --progress + +echo "Rebasing dev" +#git rebase upstream/dev dev +git reset --hard upstream/dev + +if [ $? != 0 ]; then + echo "Can't rebase to dev. Unstaged changes?" + exit 1 +fi + +echo "Creating dev branch" +git checkout -b dev-${1//[^a-zA-Z0-9]/_} dev + +echo "Done" diff --git a/main/tools/git/create-patch-branch b/main/tools/git/create-patch-branch new file mode 100755 index 00000000..d96c10ed --- /dev/null +++ b/main/tools/git/create-patch-branch @@ -0,0 +1,29 @@ +#!/bin/bash + +if [[ $# -ne 1 ]] ; then + echo 'usage: tools/create-patch-branch ' + echo + echo 'examples: tools/create-patch-branch fancypatch' + echo ' tools/create-patch-branch "my fancy patch"' + exit 1 +fi + +echo "Switching to patch" +git checkout master + +echo "Updating master" +git pull --all --prune --progress + +echo "Rebasing master" +#git rebase upstream/master master +git reset --hard upstream/master + +if [ $? != 0 ]; then + echo "Can't rebase to master. Unstaged changes?" + exit 1 +fi + +echo "Creating patch branch" +git checkout -b patch-${1//[^a-zA-Z0-9]/_} master + +echo "Done" diff --git a/main/tools/git/pre-commit b/main/tools/git/pre-commit new file mode 100755 index 00000000..3da08cfa --- /dev/null +++ b/main/tools/git/pre-commit @@ -0,0 +1,58 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# + + +from __future__ import with_statement +import os +import re +import shutil +import subprocess +import sys +import tempfile + + +def system(*args, **kwargs): + kwargs.setdefault('stdout', subprocess.PIPE) + proc = subprocess.Popen(args, **kwargs) + out, err = proc.communicate() + return out + + +def main(): + currentbranch = system('git', 'rev-parse', '--abbrev-ref=strict', 'HEAD').rstrip('\n') + if currentbranch=='master' or currentbranch=='dev': + print "You are trying to commit on your master/dev, that's probably a mistake. Exiting..." + sys.exit(1) + modified = re.compile('^[AM]+\s+(?P.*\.py)', re.MULTILINE) + files = system('git', 'status', '--porcelain') + files = modified.findall(files) + tempdir = tempfile.mkdtemp() + for name in files: + filename = os.path.join(tempdir, name) + filepath = os.path.dirname(filename) + if not os.path.exists(filepath): + os.makedirs(filepath) + with file(filename, 'w') as f: + system('git', 'show', ':' + name, stdout=f) + + f = open(tempdir + '/.pep8', 'w+') + f.write('''[pep8] +max-line-length=256 + +[flake8] +max-line-length=256 +''') + f.close() + output = system('flake8', '.', cwd=tempdir) + shutil.rmtree(tempdir) + if output: + print output, + sys.exit(1) + +if __name__ == '__main__': + main() diff --git a/main/tools/git/prepare-commit-msg b/main/tools/git/prepare-commit-msg new file mode 100755 index 00000000..a39846c4 --- /dev/null +++ b/main/tools/git/prepare-commit-msg @@ -0,0 +1,30 @@ +#!/bin/sh +# +# An example hook script to prepare the commit log message. +# Called by "git commit" with the name of the file that has the +# commit message, followed by the description of the commit +# message's source. The hook's purpose is to edit the commit +# message file. If the hook fails with a non-zero status, +# the commit is aborted. +# +# To enable this hook, rename this file to "prepare-commit-msg". + +# This hook includes three examples. The first comments out the +# "Conflicts:" part of a merge commit. +# +# The second includes the output of "git diff --name-status -r" +# into the message, just before the "git status" output. It is +# commented because it doesn't cope with --amend or with squashed +# commits. +# +# The third example adds a Signed-off-by line to the message, that can +# still be edited. This is rarely a good idea. + + +#NUMBER=$(git branch | grep '*'|grep 'patch\|feature'|cut -d- -f2) +#if [ -z $NUMBER ] +#then +# echo "Not a patch or feature branch" +#else +# sed -i "1s/^/\[PILOT-$NUMBER\] /" $1 +#fi diff --git a/main/tools/git/submit-merge b/main/tools/git/submit-merge new file mode 100755 index 00000000..812b97e6 --- /dev/null +++ b/main/tools/git/submit-merge @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Wen Guan, , 2019 + +import commands +import sys +import json +import requests + +# requests.packages.urllib3.disable_warnings() + +project_url = "https://api.github.com/repos/wguanicedew/ess/pulls" + +def submit_merge_request(token, data): + r = requests.post(url='%s' % project_url, + headers={"Content-Type": "application/json", + "Authorization": "token %s" % token}, + data=json.dumps(data)) + return r + + +root_git_dir = commands.getstatusoutput('git rev-parse --show-toplevel')[1] + +# Load private_token +print 'Loading private token ...', +try: + with open(root_git_dir + '/.githubkey', 'r') as f: + private_token = f.readline().strip() + print 'OK' +except: + print 'ERROR' + print 'No github keyfile found at %s' % root_git_dir + '/.githubkey' + sys.exit(-1) + +# Check if current branch is not master or next +print 'Checking if current branch is a patch/feature branch ...', +current_branch = commands.getstatusoutput('git rev-parse --abbrev-ref HEAD')[1] +commit_msg = current_branch +for line in commands.getstatusoutput('git show')[1].splitlines(): + if line.strip().startswith('[PILOT-'): + commit_msg = line.strip() +if current_branch == 'master' or current_branch == 'dev': + print 'ERROR' + print 'You are currently on branch \'%s\'. Please change to a dev/patch branch.' % current_branch + sys.exit(-1) +if not current_branch.startswith('patch') and not current_branch.startswith('dev'): + print 'ERROR' + print 'You are currently on branch \'%s\'. This is not a dev/patch branch.' % current_branch + sys.exit(-1) +print 'OK' + +# Push the branch to origin +print 'Pushing the dev/patch branch to origin ...', +op = commands.getstatusoutput('git push origin %s' % current_branch) +if op[0] == 0: + print 'OK' +else: + print 'ERROR' + print op[1] + sys.exit(-1) + +# Check if there is already a merge request for this: +print 'Checking if there already exists a merge request for this dev/patch ...', +resp = requests.get(url='%s' % project_url, + params={'state': 'open'}) +mr_list = json.loads(resp.text) +for mr in mr_list: + if current_branch in mr['head']['ref'] and mr['state'] == 'open': + print 'ERROR' + print 'There is already an open Merge Request for this branch.' + sys.exit(-1) +print 'OK' + +# Create the Merge-requests +if current_branch.startswith('patch'): + print 'Submitting merge request against patch ...', + result = submit_merge_request(token=private_token, + data={'head': current_branch, + 'base': 'master', + 'title': commit_msg, + 'body': commit_msg}) + if result.status_code == 200 or result.status_code == 201: + print 'OK' + else: + print 'ERROR' + print result.content + +# Submit against dev +print 'Submitting merge request against dev ...', +result = submit_merge_request(token=private_token, + data={'head': current_branch, + 'base': 'dev', + 'title': commit_msg, + 'body': commit_msg}) +if result.status_code == 200 or result.status_code == 201: + print 'OK' +else: + print 'ERROR' + print result.content diff --git a/main/tools/git/submit-push b/main/tools/git/submit-push new file mode 100755 index 00000000..9a9a7a86 --- /dev/null +++ b/main/tools/git/submit-push @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Wen Guan, , 2016-2017 + +import commands +import sys +import json +import requests + +# requests.packages.urllib3.disable_warnings() + +project_url = "https://api.github.com/repos/wguanicedew/ess/pulls" + +def submit_merge_request(token, data): + r = requests.post(url='%s' % project_url, + headers={"Content-Type": "application/json", + "Authorization": "token %s" % token}, + data=json.dumps(data)) + return r + + +root_git_dir = commands.getstatusoutput('git rev-parse --show-toplevel')[1] + +# Load private_token +print 'Loading private token ...', +try: + with open(root_git_dir + '/.githubkey', 'r') as f: + private_token = f.readline().strip() + print 'OK' +except: + print 'ERROR' + print 'No github keyfile found at %s' % root_git_dir + '/.githubkey' + sys.exit(-1) + +# Check if current branch is not master or next +print 'Checking if current branch is a patch/dev branch ...', +current_branch = commands.getstatusoutput('git rev-parse --abbrev-ref HEAD')[1] + +# Push the branch to origin +print 'Pushing the dev/patch branch to origin ...', +op = commands.getstatusoutput('git push origin %s' % current_branch) +if op[0] == 0: + print 'OK' +else: + print 'ERROR' + print op[1] + sys.exit(-1) + diff --git a/main/tools/orm/create_database.py b/main/tools/orm/create_database.py new file mode 100644 index 00000000..e6e75aba --- /dev/null +++ b/main/tools/orm/create_database.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 + + +""" +create the database. +""" + +from ess.orm.utils import build_database + +if __name__ == '__main__': + + build_database() diff --git a/main/tools/orm/destory_database.py b/main/tools/orm/destory_database.py new file mode 100644 index 00000000..d90dfa37 --- /dev/null +++ b/main/tools/orm/destory_database.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 + + +""" +destory the database. +""" + + +from ess.orm.utils import destory_everything + +if __name__ == '__main__': + + destory_everything() diff --git a/main/tools/orm/dump_db_schema.py b/main/tools/orm/dump_db_schema.py new file mode 100644 index 00000000..982e27f5 --- /dev/null +++ b/main/tools/orm/dump_db_schema.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 + + +""" +dump the database schema. 
+""" + +from ess.orm.utils import dump_schema + +if __name__ == '__main__': + + dump_schema() diff --git a/main/tools/orm/mysql_create_user.sql b/main/tools/orm/mysql_create_user.sql new file mode 100644 index 00000000..26584c76 --- /dev/null +++ b/main/tools/orm/mysql_create_user.sql @@ -0,0 +1,9 @@ +#create database ess DEFAULT CHARACTER SET utf8 DEFAULT COLLATE utf8_general_ci; +create database ess DEFAULT CHARACTER SET latin1 DEFAULT COLLATE latin1_general_ci; + + +CREATE USER 'ess'@'%' IDENTIFIED BY 'ess_passwd'; +GRANT ALL PRIVILEGES ON ess.* TO 'ess'@'%'; +CREATE USER 'ess'@'localhost' IDENTIFIED BY 'ess_passwd'; +GRANT ALL PRIVILEGES ON ess.* TO 'ess'@'localhost'; +flush PRIVILEGES; diff --git a/main/tools/test/test.sh b/main/tools/test/test.sh new file mode 100755 index 00000000..27b0d505 --- /dev/null +++ b/main/tools/test/test.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +CurrentDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +RootDir="$( dirname $( dirname "$CurrentDir" ))" +echo $RootDir + +export ESS_HOME=$RootDir +export PYTHONPATH=$RootDir/lib:$PYTHONPATH + +python -m unittest2 discover -v main/lib/idds/tests/ "test_*.py" diff --git a/setup.py b/setup.py index 335f176c..95e8b4b2 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def setup(argv): current_dir = os.path.dirname(os.path.realpath(__file__)) - packages = ['common', 'main', 'client', 'atlas', 'workflow'] + packages = ['common', 'main', 'client', 'atlas', 'workflow', 'doma'] for package in packages: path = os.path.join(current_dir, '%s/setup.py' % package) diff --git a/workflow/lib/idds/workflow/version.py b/workflow/lib/idds/workflow/version.py index 3b9ff3e9..d9044964 100644 --- a/workflow/lib/idds/workflow/version.py +++ b/workflow/lib/idds/workflow/version.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2021 -release_version = "0.0.5" +release_version = "0.1.0" diff --git a/workflow/lib/idds/workflow/work.py b/workflow/lib/idds/workflow/work.py index 089199f0..e1d731b5 100644 --- a/workflow/lib/idds/workflow/work.py +++ b/workflow/lib/idds/workflow/work.py @@ -10,10 +10,12 @@ import copy import logging -import re +import os +import stat import uuid -from idds.common.constants import WorkStatus +from idds.common import exceptions +from idds.common.constants import WorkStatus, ProcessingStatus from idds.common.utils import setup_logging from .base import Base @@ -45,8 +47,9 @@ class Work(Base): def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_type=None, work_tag=None, exec_type='local', sandbox=None, work_id=None, primary_input_collection=None, other_input_collections=None, - output_collections=None, log_collections=None, - workflow=None, logger=None): + output_collections=None, log_collections=None, release_inputs_after_submitting=False, + agent_attributes=None, + logger=None): """ Init a work/task/transformation. @@ -61,9 +64,13 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, :param primary_input_collection: The primary input collection. :param other_input_collections: List of the input collections. :param output_collections: List of the output collections. - :param workflow: The workflow the current work belongs to. + # :param workflow: The workflow the current work belongs to. 
""" self.internal_id = str(uuid.uuid1()) + self.template_work_id = self.internal_id + self.class_name = self.__class__.__name__.lower() + self.initialized = False + self.sequence_id = 0 self.logger = logger if self.logger is None: @@ -79,8 +86,9 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.exec_type = exec_type self.sandbox = sandbox self.work_id = work_id - self.workflow = workflow + # self.workflow = workflow self.transforming = False + self.workdir = None self.collections = {} self.primary_input_collection = None @@ -90,55 +98,121 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, # self.primary_input_collection = primary_input_collection self.set_primary_input_collection(primary_input_collection) # self.other_input_collections = other_input_collections + if other_input_collections and type(other_input_collections) not in [list, tuple]: + other_input_collections = [other_input_collections] self.add_other_input_collections(other_input_collections) + if output_collections and type(output_collections) not in [list, tuple]: + output_collections = [output_collections] self.add_output_collections(output_collections) + if log_collections and type(log_collections) not in [list, tuple]: + log_collections = [log_collections] self.add_log_collections(log_collections) + self.release_inputs_after_submitting = release_inputs_after_submitting self._has_new_inputs = True self.status = WorkStatus.New + self.errors = [] self.next_works = [] self.processings = {} self.active_processings = [] self.terminated_msg = "" + self.output_data = None + self.parameters_for_next_task = None self.status_statistics = {} + self.agent_attributes = agent_attributes + + self.proxy = None + self.original_proxy = None + def get_class_name(self): return self.__class__.__name__ def get_internal_id(self): return self.internal_id + def set_sequence_id(self, seq_id): + self.sequence_id = seq_id + + def get_sequence_id(self): + return self.sequence_id + def setup_logger(self): """ Setup logger """ self.logger = logging.getLogger(self.get_class_name()) + def add_errors(self, error): + self.errors.append(error) + + def get_errors(self): + return self.errors + def set_work_id(self, work_id, transforming=True): + """ + *** Function called by Marshaller agent. + *** It's the transform_id set by core_workprogresses + """ self.work_id = work_id self.transforming = transforming def get_work_id(self): + """ + *** Function called by Marshaller agent. + """ return self.work_id # def set_workflow(self, workflow): # self.workflow = workflow + def set_agent_attributes(self, attrs, req_attributes=None): + if attrs and self.class_name in attrs: + self.agent_attributes = attrs[self.class_name] + self.logger.info("agent_attributes: %s" % self.agent_attributes) + + def set_workdir(self, workdir): + self.workdir = workdir + + def get_workdir(self): + return self.workdir + def set_status(self, status): + """ + *** Function called by Marshaller agent. + """ assert(isinstance(status, WorkStatus)) self.status = status # if self.workflow: # self.workflow.work_status_update_trigger(self, status) + def get_status(self): + return self.status + def set_terminated_msg(self, msg): + """ + *** Function called by Marshaller agent. 
+ """ self.terminated_msg = msg def get_terminated_msg(self): return self.terminated_msg + def set_output_data(self, data): + self.output_data = data + + def get_output_data(self): + return self.output_data + + def set_parameters_for_next_task(self, params): + self.parameters_for_next_task = params + + def get_parameters_for_next_task(self): + return self.parameters_for_next_task + def __eq__(self, obj): if self.work_id == obj.work_id: return True @@ -147,9 +221,6 @@ def __eq__(self, obj): def __hash__(self): return self.work_id - def copy(self): - return copy.deepcopy(self) - """ def to_dict(self): return {key: value for key, value @@ -160,44 +231,132 @@ def __str__(self): return str(self.to_dict()) def get_work_type(self): + """ + *** Function called by Marshaller agent. + """ return self.work_type def get_work_tag(self): + """ + *** Function called by Marshaller agent. + """ return self.work_tag def set_parameters(self, parameters): self.parameters = parameters - def syn_work_status(self): - pass + def get_parameters(self): + return self.parameters + + def set_arguments(self, arguments): + self.arguments = arguments + + def get_arguments(self): + return self.arguments def is_terminated(self): + """ + *** Function called by Transformer agent. + """ if self.status in [WorkStatus.Finished, WorkStatus.SubFinished, WorkStatus.Failed, WorkStatus.Cancelled]: return True return False def is_finished(self): + """ + *** Function called by Transformer agent. + """ if self.status in [WorkStatus.Finished]: return True return False def is_subfinished(self): + """ + *** Function called by Transformer agent. + """ if self.status in [WorkStatus.SubFinished]: return True return False def is_failed(self): - if self.status in [WorkStatus.Failed, WorkStatus.Cancelled]: + """ + *** Function called by Transformer agent. + """ + if self.status in [WorkStatus.Failed]: + return True + return False + + def is_cancelled(self): + """ + *** Function called by Transformer agent. 
+ """ + if self.status in [WorkStatus.Cancelled]: return True return False def add_next_work(self, work): self.next_works.append(work) + def parse_arguments(self): + try: + arguments = self.get_arguments() + parameters = self.get_parameters() + arguments = arguments.format(**parameters) + return arguments + except Exception as ex: + self.add_errors(str(ex)) + + def set_initialized(self): + self.initialized = True + + def unset_initialized(self): + self.initialized = False + + def is_initialized(self): + return self.initialized + def initialize_work(self): - if self.parameters: - for key in self.parameters.get_param_names(): - self.arguments = re.sub(key, str(self.parameters.get_param_value(key)), self.arguments) + if self.parameters and self.arguments: + # for key in self.parameters.get_param_names(): + # self.arguments = re.sub(key, str(self.parameters.get_param_value(key)), self.arguments) + # self.arguments = self.arguments.format(**self.parameters) + pass + if not self.is_initialized(): + self.set_initialized() + + def copy(self): + new_work = copy.deepcopy(self) + return new_work + + def __deepcopy__(self, memo): + logger = self.logger + self.logger = None + + cls = self.__class__ + result = cls.__new__(cls) + + memo[id(self)] = result + + # Deep copy all other attributes + for k, v in self.__dict__.items(): + setattr(result, k, copy.deepcopy(v, memo)) + + self.logger = logger + result.logger = logger + return result + + def generate_work_from_template(self): + logger = self.logger + self.logger = None + new_work = copy.deepcopy(self) + self.logger = logger + new_work.logger = logger + # new_work.template_work_id = self.get_internal_id() + new_work.internal_id = str(uuid.uuid1()) + return new_work + + def get_template_id(self): + return self.template_work_id def add_collection_to_collections(self, coll): assert(isinstance(coll, dict)) @@ -209,10 +368,14 @@ def add_collection_to_collections(self, coll): self.collections[coll['coll_metadata']['internal_id']] = coll def set_primary_input_collection(self, coll): - self.add_collection_to_collections(coll) - self.primary_input_collection = coll['coll_metadata']['internal_id'] + if coll: + self.add_collection_to_collections(coll) + self.primary_input_collection = coll['coll_metadata']['internal_id'] def get_primary_input_collection(self): + """ + *** Function called by Marshaller agent. + """ return self.collections[self.primary_input_collection] def add_other_input_collections(self, colls): @@ -227,6 +390,9 @@ def get_other_input_collections(self): return [self.collections[k] for k in self.other_input_collections] def get_input_collections(self): + """ + *** Function called by Transformer agent. + """ keys = [self.primary_input_collection] + self.other_input_collections return [self.collections[k] for k in keys] @@ -238,15 +404,6 @@ def get_input_contents(self): def add_output_collections(self, colls): """ - if scope is None: - self.output_collection_scope = self.input_collection_scope - else: - self.output_collection_scope = scope - - if name is None: - self.output_collection_name = self.input_collection_name + "." + self.work_type + "." + str(self.work_id) - else: - self.output_collection_name = name """ if not colls: return @@ -276,10 +433,14 @@ def set_has_new_inputs(self, yes=True): self._has_new_inputs = yes def has_new_inputs(self): + """ + *** Function called by Transformer agent. + """ return self._has_new_inputs def get_new_input_output_maps(self, mapped_input_output_maps={}): """ + *** Function called by Transformer agent. 
New inputs which are not yet mapped to outputs. :param mapped_input_output_maps: Inputs that are already mapped. @@ -307,8 +468,18 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): return new_input_maps def set_collection_id(self, collection, coll_id): + # print(collection) + # print(coll_id) self.collections[collection['coll_metadata']['internal_id']]['coll_id'] = coll_id + def should_release_inputs(self, processing=None): + if self.release_inputs_after_submitting: + if (processing and 'status' in processing + and processing['status'] in [ProcessingStatus.Submitted, ProcessingStatus.Submitted.value]): # noqa: W503 + return True + return False + return True + def add_processing_to_processings(self, processing): assert(isinstance(processing, dict)) # assert('processing_metadata' in processing) @@ -323,14 +494,107 @@ def add_processing_to_processings(self, processing): # self.processing = processing def set_processing_id(self, processing, processing_id): + """ + *** Function called by Transformer agent. + """ self.processings[processing['processing_metadata']['internal_id']]['processing_id'] = processing_id + def set_processing_status(self, processing, status): + """ + *** Function called by Transformer agent. + """ + self.processings[processing['processing_metadata']['internal_id']]['status'] = status + # if status not in [ProcessingStatus.New, ProcessingStatus.Submitting, + # ProcessingStatus.Submitted, ProcessingStatus.Running]: + # if processing['processing_metadata']['internal_id'] in self.active_processings: + # del self.active_processings[processing['processing_metadata']['internal_id']] + + def set_processing_output_metadata(self, processing, output_metadata): + """ + *** Function called by Transformer agent. + """ + processing = self.processings[processing['processing_metadata']['internal_id']] + processing['output_metadata'] = output_metadata + self.set_output_data(output_metadata) + + def is_processing_terminated(self, processing): + if 'status' in processing and processing['status'] not in [ProcessingStatus.New, + ProcessingStatus.Submitting, + ProcessingStatus.Submitted, + ProcessingStatus.Running]: + return True + return False + + def reap_processing(self, processing): + if self.is_processing_terminated(processing): + self.active_processings.remove(processing['processing_metadata']['internal_id']) + else: + self.logger.error("Cannot reap an unterminated processing: %s" % processing) + + def is_processings_terminated(self): + """ + *** Function called by Transformer agent. + """ + for p_id in self.active_processings: + p = self.processings[p_id] + if self.is_processing_terminated(p): + pass + else: + return False + return True + + def is_processings_finished(self): + """ + *** Function called by Transformer agent. + """ + for p_id in self.active_processings: + p = self.processings[p_id] + if not self.is_processing_terminated(p) or p['status'] not in [ProcessingStatus.Finished]: + return False + return True + + def is_processings_subfinished(self): + """ + *** Function called by Transformer agent. + """ + has_finished = False + has_failed = False + for p_id in self.active_processings: + p = self.processings[p_id] + if not self.is_processing_terminated(p): + return False + else: + if p['status'] in [ProcessingStatus.Finished]: + has_finished = True + if p['status'] in [ProcessingStatus.Failed]: + has_failed = True + if has_finished and has_failed: + return True + return False + + def is_processings_failed(self): + """ + *** Function called by Transformer agent. 
+ """ + for p_id in self.active_processings: + p = self.processings[p_id] + if not self.is_processing_terminated(p) or p['status'] not in [ProcessingStatus.Failed]: + return False + return True + def create_processing(self, input_output_maps): + """ + *** Function called by Transformer agent. + """ proc = {'processing_metadata': {'internal_id': str(uuid.uuid1())}} self.add_processing_to_processings(proc) self.active_processings.append(proc['processing_metadata']['internal_id']) + return proc def get_processing(self, input_output_maps): + """ + *** Function called by Transformer agent. + """ if self.active_processings: return self.processings[self.active_processings[0]] else: @@ -338,8 +602,64 @@ def get_processing(self, input_output_maps): # self.process = process # return process - def submit_processing(self): - pass + def submit_processing(self, processing): + """ + *** Function called by Carrier agent. + """ + raise exceptions.NotImplementedException - def poll_processing(self): - pass + def abort_processing(self, processing): + """ + *** Function called by Carrier agent. + """ + raise exceptions.NotImplementedException + + def poll_processing_updates(self, processing, input_output_maps): + """ + *** Function called by Carrier agent. + """ + raise exceptions.NotImplementedException + + def syn_work_status(self, input_output_maps): + """ + *** Function called by Transformer agent. + """ + raise exceptions.NotImplementedException + + def sync_work_data(self, work): + self.status = work.status + self.workdir = work.workdir + self._has_new_inputs = work._has_new_inputs + self.errors = work.errors + self.next_works = work.next_works + + self.terminated_msg = work.terminated_msg + self.output_data = work.output_data + self.parameters_for_next_task = work.parameters_for_next_task + + self.status_statistics = work.status_statistics + + self.processings = work.processings + self.active_processings = work.active_processings + + def add_proxy(self, proxy): + self.proxy = proxy + + def get_proxy(self): + return self.proxy + + def set_user_proxy(self): + if 'X509_USER_PROXY' in os.environ: + self.original_proxy = os.environ['X509_USER_PROXY'] + if self.get_proxy(): + user_proxy = '/tmp/idds_user_proxy' + with open(user_proxy, 'w') as fp: + fp.write(self.get_proxy()) + os.chmod(user_proxy, stat.S_IRUSR | stat.S_IWUSR) + os.environ['X509_USER_PROXY'] = user_proxy + + def unset_user_proxy(self): + if self.original_proxy: + os.environ['X509_USER_PROXY'] = self.original_proxy + else: + del os.environ['X509_USER_PROXY'] diff --git a/workflow/lib/idds/workflow/workflow.py b/workflow/lib/idds/workflow/workflow.py index 38ce2783..e3f5cbd6 100644 --- a/workflow/lib/idds/workflow/workflow.py +++ b/workflow/lib/idds/workflow/workflow.py @@ -8,64 +8,91 @@ # Authors: # - Wen Guan, , 2020 +import copy import datetime +import logging +import inspect import random import time +import uuid -from idds.common.utils import json_dumps +from idds.common.utils import json_dumps, setup_logging, get_proxy from .base import Base -class Condition(object): - def __init__(self, cond, prework, true_work, false_work=None): +setup_logging(__name__) + + +class Condition(Base): + def __init__(self, cond=None, current_work=None, true_work=None, false_work=None, logger=None): """ Condition. if cond() is true, return true_work, else return false_work. :param cond: executable return true or false. - :param prework: Work instance. + :param work: The current Work instance. :param true_work: Work instance. :param false_work: Work instance. 
""" + if cond and callable(cond): + assert(inspect.ismethod(cond)) + assert(cond.__self__ == current_work) self.cond = cond - self.prework = prework - self.true_work = true_work - self.false_work = false_work + self.current_work = None + if current_work: + self.current_work = current_work.get_template_id() + self.true_work = None + if true_work: + self.true_work = true_work.get_template_id() + self.false_work = None + if false_work: + self.false_work = false_work.get_template_id() def all_works(self): works = [] - works.append(self.prework) - works.append(self.true_work) + works.append(self.current_work) + if self.true_work: + works.append(self.true_work) if self.false_work: works.append(self.false_work) return works def all_next_works(self): works = [] - works.append(self.true_work) + if self.true_work: + works.append(self.true_work) if self.false_work: works.append(self.false_work) return works - def get_next_work(self): + def get_cond_status(self): if callable(self.cond): if self.cond(): - return self.true_work + return True else: - return self.false_work + return False else: if self.cond: - return self.true_work + return True else: - return self.false_work + return False + + def get_next_work(self): + if self.get_cond_status(): + return self.true_work + else: + return self.false_work class Workflow(Base): - def __init__(self, name=None, workload_id=None): + def __init__(self, name=None, workload_id=None, logger=None): """ Init a workflow. """ + self.internal_id = str(uuid.uuid1()) + self.template_work_id = self.internal_id + if name: self.name = name + "." + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000)) else: @@ -73,25 +100,124 @@ def __init__(self, name=None, workload_id=None): self.workload_id = workload_id if self.workload_id is None: - self.workload_id = time.time() + self.workload_id = int(time.time()) + self.logger = logger + if self.logger is None: + self.setup_logger() + + self.works_template = {} self.works = {} + self.work_sequence = {} # order list + self.terminated_works = [] self.initial_works = [] # if the primary initial_work is not set, it's the first initial work. 
self.primary_initial_work = None - self.new_works = [] - self.current_works = [] + self.independent_works = [] + + self.first_initial = False + self.new_to_run_works = [] + self.current_running_works = [] self.work_conds = {} + self.num_subfinished_works = 0 self.num_finished_works = 0 + self.num_failed_works = 0 + self.num_cancelled_works = 0 self.num_total_works = 0 self.last_work = None + # user defined Condition class + self.user_defined_conditions = {} + + self.proxy = None + def get_name(self): return self.name + def get_class_name(self): + return self.__class__.__name__ + + def setup_logger(self): + """ + Setup logger + """ + self.logger = logging.getLogger(self.get_class_name()) + + def log_info(self, info): + if self.logger is None: + self.setup_logger() + self.logger.info(info) + + def log_debug(self, info): + if self.logger is None: + self.setup_logger() + self.logger.debug(info) + + def get_internal_id(self): + return self.internal_id + + def copy(self): + new_wf = copy.deepcopy(self) + return new_wf + + def __deepcopy__(self, memo): + logger = self.logger + self.logger = None + + cls = self.__class__ + result = cls.__new__(cls) + + memo[id(self)] = result + + # Deep copy all other attributes + for k, v in self.__dict__.items(): + setattr(result, k, copy.deepcopy(v, memo)) + + self.logger = logger + result.logger = logger + return result + + def get_works_template(self): + return self.works_template + + def add_work_template(self, work): + self.works_template[work.get_template_id()] = work + + def get_new_work_from_template(self, work_id, new_parameters=None): + # template_id = work.get_template_id() + template_id = work_id + work = self.works_template[template_id] + new_work = work.generate_work_from_template() + if new_parameters: + new_work.set_parameters(new_parameters) + new_work.set_sequence_id(self.num_total_works) + new_work.initialize_work() + self.works[new_work.get_internal_id()] = new_work + # self.work_sequence.append(new_work.get_internal_id()) + self.work_sequence[str(self.num_total_works)] = new_work.get_internal_id() + self.num_total_works += 1 + self.new_to_run_works.append(new_work.get_internal_id()) + self.last_work = new_work.get_internal_id() + return new_work + + def register_user_defined_condition(self, condition): + cond_src = inspect.getsource(condition) + self.user_defined_conditions[condition.__name__] = cond_src + + def load_user_defined_condition(self): + # try: + # Condition() + # except NameError: + # global Condition + # import Condition + + for cond_src_name in self.user_defined_conditions: + # global cond_src_name + exec(self.user_defined_conditions[cond_src_name]) + def set_workload_id(self, workload_id): self.workload_id = workload_id @@ -99,132 +225,144 @@ def get_workload_id(self): return self.workload_id def add_work(self, work, initial=False, primary=False): - self.num_total_works += 1 - self.works[work.get_internal_id()] = work + self.first_initial = False + self.add_work_template(work) if initial: if primary: - self.primary_initial_work = work.get_internal_id() - self.initial_works.append(work.get_internal_id()) + self.primary_initial_work = work.get_template_id() + self.add_initial_works(work.get_template_id()) - if self.primary_initial_work is None: - self.primary_initial_work = work.get_internal_id() - self.new_works.append(work.get_internal_id()) + self.independent_works.append(work.get_template_id()) def add_condition(self, cond): + self.first_initial = False cond_works = cond.all_works() for cond_work in cond_works: - 
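`add_work` now only registers a template; runtime instances are produced by `get_new_work_from_template`, which deep-copies the template, assigns a fresh `internal_id` and the next `sequence_id`, and queues the copy in `new_to_run_works`. In normal operation the workflow calls this itself from `first_initialize` and `enable_next_work` during `sync_works`; the fragment below just makes the bookkeeping visible, again with the bare `Work` class as a stand-in.

```python
# Make the template/instance bookkeeping visible; the bare Work class is a stand-in.
from idds.workflow.work import Work
from idds.workflow.workflow import Workflow

template = Work(executable='echo', arguments='hello')

wf = Workflow(name='template_demo')
wf.add_work(template)                 # registers the template only

new_work = wf.get_new_work_from_template(template.get_template_id())

print(new_work.get_internal_id() != template.get_internal_id())   # True: fresh uuid1
print(new_work.get_sequence_id())                                  # 0: first instantiated work
print(len(wf.get_works_template()), len(wf.works))                 # 1 template, 1 runtime work
```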
assert(cond_work in self.works) - # for next_work in cond.all_next_works(): - # if next_work in self.current_works: - # del self.current_works[next_work] - - if cond.prework not in self.work_conds: - self.work_conds[cond.prework.get_internal_id()] = [] - self.work_conds[cond.prework.get_internal_id()].append(cond) - - def __eq__(self, wf): - if self.name == wf.name: - return True - else: - return False + assert(cond_work in self.get_works_template()) + + if cond.current_work not in self.work_conds: + self.work_conds[cond.current_work] = [] + self.work_conds[cond.current_work].append(cond) + + # if a work is a true_work or false_work of a condition, + # should remove it from independent_works + cond_next_works = cond.all_next_works() + for next_work in cond_next_works: + if next_work in self.independent_works: + self.independent_works.remove(next_work) + + def add_initial_works(self, work): + assert(work.get_template_id() in self.get_works_template()) + self.initial_works.append(work.get_template_id()) + if self.primary_initial_work is None: + self.primary_initial_work = work.get_template_id() + + def enable_next_work(self, work, cond): + self.log_debug("Checking Work %s condition: %s" % (work.get_internal_id(), + json_dumps(cond, sort_keys=True, indent=4))) + if cond and self.is_class_method(cond.cond): + # cond_work_id = self.works[cond.cond['idds_method_class_id']] + cond.cond = getattr(work, cond.cond['idds_method']) + self.log_info("Work %s condition: %s" % (work.get_internal_id(), cond.cond)) + next_work = cond.get_next_work() + self.log_info("Work %s condition status %s" % (work.get_internal_id(), cond.get_cond_status())) + self.log_info("Work %s next work %s" % (work.get_internal_id(), next_work)) + if next_work is not None: + new_parameters = work.get_parameters_for_next_task() + next_work = self.get_new_work_from_template(next_work, new_parameters) + work.add_next_work(next_work.get_internal_id()) + return next_work def __str__(self): return str(json_dumps(self)) def get_new_works(self): """ + *** Function called by Marshaller agent. + new works to be ready to start """ - return [self.works[k] for k in self.new_works] + self.sync_works() + return [self.works[k] for k in self.new_to_run_works] def get_current_works(self): """ + *** Function called by Marshaller agent. + Current running works """ self.sync_works() - return [self.works[k] for k in self.current_works] + return [self.works[k] for k in self.current_running_works] def get_primary_initial_collection(self): + """ + *** Function called by Clerk agent. 
+ """ + if self.primary_initial_work: - return self.works[self.primary_initial_work].get_primary_input_collection() + return self.get_works_template()[self.primary_initial_work].get_primary_input_collection() + elif self.initial_works: + return self.get_works_template()[self.initial_works[0]].get_primary_input_collection() + elif self.independent_works: + return self.get_works_template()[self.independent_works[0]].get_primary_input_collection() + else: + keys = self.get_works_template().keys() + return self.get_works_template()[keys[0]].get_primary_input_collection() return None - def update_work_status(self, work, status): - assert(work in self.current_works) - work.set_status(status) - if work.is_terminated(): - if work.is_finished(): - self.num_finished_works += 1 - else: - # self.num_finished_works - pass - if work not in self.work_conds: - # has no next work - work_cp = work.copy() - self.last_work = work_cp - self.terminated_works.append(work_cp) - self.current_works.remove(work) + def first_initialize(self): + # set new_to_run works + if not self.first_initial: + self.first_initial = True + if self.initial_works: + tostart_works = self.initial_works + elif self.independent_works: + tostart_works = self.independent_works else: - backup_work = work.copy() - for cond in self.work_conds[work]: - next_work = cond.get_next_work() - if next_work is not None: - next_work.initialize_work() - self.current_works.append(next_work) - backup_work.add_next_work(next_work) - self.last_work = backup_work - self.terminated_works.append(backup_work) - self.current_works.remove(work) - - def work_status_update_trigger(self, work, status): - assert(work in self.new_works + self.current_works) - if work in self.new_works and work.is_running(): - self.new_works.remove(work) - self.current_works.append(work) - - if work.is_terminated(): - if work not in self.work_conds: - # has no next work - self.terminated_works.append(work.copy()) - self.current_works.remove(work) - else: - backup_work = work.copy() - for cond in self.work_conds[work]: - next_work = cond.get_next_work() - if next_work is not None: - next_work.initialize_work() - # self.current_works.append(next_work) - self.new_works.append(next_work) - backup_work.add_next_work(next_work) - self.terminated_works.append(backup_work) - self.current_works.remove(work) + tostart_works = list(self.get_works_template().keys()) + tostart_works = [tostart_works[0]] + + for work_id in tostart_works: + self.get_new_work_from_template(work_id) def sync_works(self): - for work in [self.works[k] for k in self.new_works]: + self.first_initialize() + + for work in [self.works[k] for k in self.new_to_run_works]: if work.transforming: - self.new_works.remove(work.get_internal_id()) - self.current_works.append(work.get_internal_id()) + self.new_to_run_works.remove(work.get_internal_id()) + self.current_running_works.append(work.get_internal_id()) - for work in [self.works[k] for k in self.current_works]: + for work in [self.works[k] for k in self.current_running_works]: if work.is_terminated(): - if work not in self.work_conds: + self.log_info("Work %s is terminated" % work.get_internal_id()) + self.log_debug("Work conditions: %s" % json_dumps(self.work_conds, sort_keys=True, indent=4)) + if work.get_template_id() not in self.work_conds: # has no next work + self.log_info("Work %s has no condition dependencies" % work.get_internal_id()) self.terminated_works.append(work.get_internal_id()) - self.current_works.remove(work.get_internal_id()) + 
self.current_running_works.remove(work.get_internal_id()) else: - for cond in self.work_conds[work.get_internal_id()]: - next_work = cond.get_next_work() - if next_work is not None: - next_work.initialize_work() - # self.current_works.append(next_work) - # self.new_works.append(next_work.get_internal_id()) - self.add_work(next_work) - work.add_next_work(next_work.get_internal_id()) + self.log_debug("Work %s has condition dependencies %s" % (work.get_internal_id(), + json_dumps(self.work_conds[work.get_template_id()], sort_keys=True, indent=4))) + for cond in self.work_conds[work.get_template_id()]: + self.enable_next_work(work, cond) self.terminated_works.append(work.get_internal_id()) - self.current_works.remove(work.get_internal_id()) + self.current_running_works.remove(work.get_internal_id()) + + if work.is_finished(): + self.num_finished_works += 1 + elif work.is_subfinished(): + self.num_subfinished_works += 1 + elif work.is_failed(): + self.num_failed_works += 1 + elif work.is_cancelled(): + self.num_cancelled_works += 1 def get_exact_workflows(self): """ + *** Function called by Clerk agent. + TODO: The primary dataset for the initial work is a dataset with '*'. workflow.primary_initial_collection = 'some datasets with *' collections = get_collection(workflow.primary_initial_collection) @@ -239,21 +377,50 @@ def get_exact_workflows(self): return [self] def is_terminated(self): + """ + *** Function called by Marshaller agent. + """ self.sync_works() - if len(self.new_works) == 0 and len(self.current_works) == 0: + if len(self.new_to_run_works) == 0 and len(self.current_running_works) == 0: return True return False def is_finished(self): + """ + *** Function called by Marshaller agent. + """ return self.is_terminated() and self.num_finished_works == self.num_total_works def is_subfinished(self): - return self.is_terminated() and (self.num_finished_works < self.num_total_works) and (self.num_finished_works > 0) + """ + *** Function called by Marshaller agent. + """ + return self.is_terminated() and (self.num_finished_works > 0 and self.num_finished_works < self.num_total_works) def is_failed(self): - return self.is_terminated() and (self.num_finished_works == 0) + """ + *** Function called by Marshaller agent. + """ + return self.is_terminated() and (self.num_failed_works > 0) and (self.num_cancelled_works == 0) + + def is_cancelled(self): + """ + *** Function called by Marshaller agent. + """ + return self.is_terminated() and (self.num_cancelled_works > 0) def get_terminated_msg(self): + """ + *** Function called by Marshaller agent. + """ if self.last_work: - return self.last_work.get_terminated_msg() + return self.works[self.last_work].get_terminated_msg() return None + + def add_proxy(self): + self.proxy = get_proxy() + if not self.proxy: + raise Exception("Cannot get local proxy") + + def get_proxy(self): + return self.proxy diff --git a/workflow/tools/env/environment.yml b/workflow/tools/env/environment.yml index 776c514d..1857df0d 100644 --- a/workflow/tools/env/environment.yml +++ b/workflow/tools/env/environment.yml @@ -8,4 +8,4 @@ dependencies: - flake8 # Wrapper around PyFlakes&pep8 - pytest # python testing tool - nose # nose test tools - + - idds-common