From f21d83201cf0da80a38c365bce2b17102a872ccd Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 22 Nov 2022 10:45:36 +0100 Subject: [PATCH 01/91] sync statistics immediately after task submission --- main/lib/idds/agents/carrier/submitter.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main/lib/idds/agents/carrier/submitter.py b/main/lib/idds/agents/carrier/submitter.py index 22634dbc..15d9a317 100644 --- a/main/lib/idds/agents/carrier/submitter.py +++ b/main/lib/idds/agents/carrier/submitter.py @@ -16,6 +16,7 @@ from idds.core import processings as core_processings from idds.agents.common.eventbus.event import (EventType, NewProcessingEvent, + SyncProcessingEvent, UpdateTransformEvent) from .utils import handle_new_processing @@ -155,9 +156,9 @@ def process_new_processing(self, event): event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id'], content=submit_event_content) self.event_bus.send(event) - # self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) - # event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) - # self.event_bus.send(event) + self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) + event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + self.event_bus.send(event) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) From a890cf8284f36c6a98ddc38554beafc6976fc1bb Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 28 Nov 2022 14:37:21 +0100 Subject: [PATCH 02/91] update work tag --- atlas/lib/idds/atlas/workflow/atlaslocalpandawork.py | 2 +- atlas/lib/idds/atlas/workflow/atlaspandawork.py | 2 +- atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py | 2 +- atlas/lib/idds/atlas/workflowv2/atlaspandawork.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/atlas/lib/idds/atlas/workflow/atlaslocalpandawork.py b/atlas/lib/idds/atlas/workflow/atlaslocalpandawork.py index 5f9fbf56..d3e4e4cb 100644 --- a/atlas/lib/idds/atlas/workflow/atlaslocalpandawork.py +++ b/atlas/lib/idds/atlas/workflow/atlaslocalpandawork.py @@ -27,7 +27,7 @@ class ATLASLocalPandaWork(ATLASPandaWork): def __init__(self, task_parameters=None, - work_tag='atlas', exec_type='panda', work_id=None, + work_tag='atlaslocalpanda', exec_type='panda', work_id=None, primary_input_collection=None, other_input_collections=None, input_collections=None, primary_output_collection=None, other_output_collections=None, diff --git a/atlas/lib/idds/atlas/workflow/atlaspandawork.py b/atlas/lib/idds/atlas/workflow/atlaspandawork.py index d1a5e697..27597aa0 100644 --- a/atlas/lib/idds/atlas/workflow/atlaspandawork.py +++ b/atlas/lib/idds/atlas/workflow/atlaspandawork.py @@ -32,7 +32,7 @@ class ATLASPandaWork(Work): def __init__(self, task_parameters=None, - work_tag='atlas', exec_type='panda', work_id=None, + work_tag='atlaspanda', exec_type='panda', work_id=None, primary_input_collection=None, other_input_collections=None, input_collections=None, primary_output_collection=None, other_output_collections=None, diff --git a/atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py b/atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py index 12d215ae..e7cc5f8e 100644 --- a/atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py +++ b/atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py @@ -27,7 +27,7 @@ class ATLASLocalPandaWork(ATLASPandaWork): def __init__(self, task_parameters=None, - 
work_tag='atlas', exec_type='panda', work_id=None, + work_tag='atlaslocalpanda', exec_type='panda', work_id=None, primary_input_collection=None, other_input_collections=None, input_collections=None, primary_output_collection=None, other_output_collections=None, diff --git a/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py b/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py index 9ffd452b..bf0dd7b0 100644 --- a/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py +++ b/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py @@ -35,7 +35,7 @@ class ATLASPandaWork(Work): def __init__(self, task_parameters=None, - work_tag='atlas', exec_type='panda', work_id=None, + work_tag='atlaspanda', exec_type='panda', work_id=None, primary_input_collection=None, other_input_collections=None, input_collections=None, primary_output_collection=None, other_output_collections=None, From 3314aac74ed7e38d1fb4cc1eb4cf7212255b6cab Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 28 Nov 2022 14:39:10 +0100 Subject: [PATCH 03/91] enable retries for different work type --- main/lib/idds/agents/carrier/utils.py | 6 ++++++ main/lib/idds/agents/clerk/clerk.py | 30 +++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index c4b5f4a1..0132be94 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -73,6 +73,12 @@ def is_process_terminated(processing_status): return False +def is_process_finished(processing_status): + if processing_status in [ProcessingStatus.Finished]: + return True + return False + + def is_all_contents_available(contents): for content in contents: if type(content) is dict: diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index 632da2c8..0616372f 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -276,12 +276,38 @@ def load_poll_period(self, req, parameters): parameters['max_update_retries'] = req['max_update_retries'] if req['max_update_retries'] is not None else self.max_update_retries return parameters + def get_work_tag_attribute(self, work_tag, attribute): + work_tag_attribute = work_tag + "_" + attribute + work_tag_attribute_value = None + if not hasattr(self, work_tag_attribute): + work_tag_attribute_value = int(self.work_tag_attribute) + return work_tag_attribute_value + def generate_transform(self, req, work): wf = req['request_metadata']['workflow'] work.set_request_id(req['request_id']) work.username = req['username'] + transform_tag = work.get_work_tag() + if req['max_new_retries']: + max_new_retries = req['max_new_retries'] + else: + work_tag_max_new_retries = self.get_work_tag_attribute(transform_tag, "max_new_retries") + if work_tag_max_new_retries: + max_new_retries = work_tag_max_new_retries + else: + max_new_retries = self.max_new_retries + + if req['max_update_retries']: + max_update_retries = req['max_update_retries'] + else: + work_tag_max_update_retries = self.get_work_tag_attribute(transform_tag, "max_update_retries") + if work_tag_max_update_retries: + max_update_retries = work_tag_max_update_retries + else: + max_update_retries = self.max_update_retries + new_transform = {'request_id': req['request_id'], 'workload_id': req['workload_id'], 'transform_type': work.get_work_type(), @@ -292,8 +318,8 @@ def generate_transform(self, req, work): 'name': work.get_work_name(), 'new_poll_period': self.new_poll_period, 'update_poll_period': 
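The retry resolution added to generate_transform above goes request value first, then a per-work-tag agent attribute, then the clerk-wide default; the per-tag value is looked up by composing "<work_tag>_<attribute>", which is what the distinct work tags from patch 02 feed into, and the carrier poller below reuses the same helper for new_poll_period/update_poll_period. As written, get_work_tag_attribute tests "not hasattr" and reads self.work_tag_attribute literally; a minimal runnable sketch of the presumed intent, assuming the per-tag values arrive as strings from the agent configuration (class and values below are illustrative only):

```python
# Sketch of the per-work-tag lookup, reading the attribute via getattr on the composed name.
class ClerkSketch:
    max_new_retries = 3                        # agent-wide default
    atlaspanda_max_new_retries = "10"          # hypothetical per-tag override from the config

    def get_work_tag_attribute(self, work_tag, attribute):
        work_tag_attribute = work_tag + "_" + attribute      # e.g. 'atlaspanda_max_new_retries'
        if hasattr(self, work_tag_attribute):
            return int(getattr(self, work_tag_attribute))
        return None

    def resolve_max_new_retries(self, req, work_tag):
        # resolution order used in generate_transform: request value, per-tag override, default
        if req.get('max_new_retries'):
            return req['max_new_retries']
        return self.get_work_tag_attribute(work_tag, "max_new_retries") or self.max_new_retries


print(ClerkSketch().resolve_max_new_retries({'max_new_retries': None}, 'atlaspanda'))       # -> 10
print(ClerkSketch().resolve_max_new_retries({'max_new_retries': None}, 'atlaslocalpanda'))  # -> 3
```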
self.update_poll_period, - 'max_new_retries': req['max_new_retries'] if req['max_new_retries'] is not None else self.max_new_retries, - 'max_update_retries': req['max_update_retries'] if req['max_update_retries'] is not None else self.max_update_retries, + 'max_new_retries': max_new_retries, + 'max_update_retries': max_update_retries, # 'expired_at': req['expired_at'], 'expired_at': None, 'transform_metadata': {'internal_id': work.get_internal_id(), From ff1b54c2328dbca7f8dfc9974c3ccb93e7d32f2e Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 28 Nov 2022 14:44:11 +0100 Subject: [PATCH 04/91] enable to define different poll period for different tasks --- main/lib/idds/agents/carrier/poller.py | 32 ++++++++++++++++++-- main/lib/idds/tests/test_migrate_requests.py | 10 ++++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index e83fc377..ca15f88d 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -24,7 +24,7 @@ SyncProcessingEvent, TerminatedProcessingEvent) -from .utils import handle_update_processing, is_process_terminated +from .utils import handle_update_processing, is_process_terminated, is_process_finished setup_logging(__name__) @@ -143,10 +143,28 @@ def get_processing(self, processing_id, status=None, locking=False): self.logger.error(traceback.format_exc()) return None + def get_work_tag_attribute(self, work_tag, attribute): + work_tag_attribute = work_tag + "_" + attribute + work_tag_attribute_value = None + if not hasattr(self, work_tag_attribute): + work_tag_attribute_value = int(self.work_tag_attribute) + return work_tag_attribute_value + def load_poll_period(self, processing, parameters): - if self.new_poll_period and processing['new_poll_period'] != self.new_poll_period: + proc = processing['processing_metadata']['processing'] + work = proc.work + work_tag = work.get_work_tag() + + work_tag_new_poll_period = self.get_work_tag_attribute(work_tag, "new_poll_period") + if work_tag_new_poll_period: + parameters['new_poll_period'] = work_tag_new_poll_period + elif self.new_poll_period and processing['new_poll_period'] != self.new_poll_period: parameters['new_poll_period'] = self.new_poll_period - if self.update_poll_period and processing['update_poll_period'] != self.update_poll_period: + + work_tag_update_poll_period = self.get_work_tag_attribute(work_tag, "update_poll_period") + if work_tag_update_poll_period: + parameters['update_poll_period'] = work_tag_update_poll_period + elif self.update_poll_period and processing['update_poll_period'] != self.update_poll_period: parameters['update_poll_period'] = self.update_poll_period return parameters @@ -233,6 +251,14 @@ def handle_update_processing(self, processing): new_process_status = process_status if is_process_terminated(process_status): new_process_status = ProcessingStatus.Terminating + if is_process_finished(process_status): + new_process_status = ProcessingStatus.Terminating + else: + retries = processing['update_retries'] + 1 + if processing['max_update_retries'] and retries < processing['max_update_retries']: + work.reactivate_processing(processing, log_prefix=log_prefix) + process_status = ProcessingStatus.Running + new_process_status = ProcessingStatus.Running update_processing = {'processing_id': processing['processing_id'], 'parameters': {'status': new_process_status, diff --git a/main/lib/idds/tests/test_migrate_requests.py b/main/lib/idds/tests/test_migrate_requests.py index 
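Besides the per-tag poll periods, this patch's poller hunk also changes how terminated processings are closed out: a processing that terminates without finishing is reactivated and kept Running until max_update_retries is exhausted, instead of going straight to Terminating. A condensed view of that branch, using the same names as the diff (is_process_finished is the helper added in the previous patch; ProcessingStatus and a work object exposing reactivate_processing are assumed from iDDS):

```python
from idds.common.constants import ProcessingStatus
from idds.agents.carrier.utils import is_process_terminated, is_process_finished


def decide_process_status(process_status, processing, work, log_prefix=''):
    new_process_status = process_status
    if is_process_terminated(process_status):
        new_process_status = ProcessingStatus.Terminating
        if not is_process_finished(process_status):
            # terminated but not finished (failed/subfinished): retry by reactivating
            retries = processing['update_retries'] + 1
            if processing['max_update_retries'] and retries < processing['max_update_retries']:
                work.reactivate_processing(processing, log_prefix=log_prefix)
                process_status = ProcessingStatus.Running
                new_process_status = ProcessingStatus.Running
    return process_status, new_process_status
```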
48467cf3..998c058d 100644 --- a/main/lib/idds/tests/test_migrate_requests.py +++ b/main/lib/idds/tests/test_migrate_requests.py @@ -35,7 +35,9 @@ def migrate(): # doma google doma_google_host = 'https://34.133.138.229:443/idds' # noqa F841 - cm1 = ClientManager(host=atlas_host) + slac_k8s_dev_host = 'https://rubin-panda-idds-dev.slac.stanford.edu:8443/idds' # noqa F841 + + # cm1 = ClientManager(host=atlas_host) cm1 = ClientManager(host=doma_host) # reqs = cm1.get_requests(request_id=290) # old_request_id = 298163 @@ -47,6 +49,7 @@ def migrate(): old_request_id = 372930 old_request_id = 2603 old_request_id = 2802 + old_request_id = 2816 # for old_request_id in [152]: # for old_request_id in [60]: # noqa E115 @@ -54,9 +57,10 @@ def migrate(): for old_request_id in [old_request_id]: # noqa E115 # doma 183 reqs = cm1.get_requests(request_id=old_request_id, with_metadata=True) - cm2 = ClientManager(host=dev_host) - cm2 = ClientManager(host=doma_host) + # cm2 = ClientManager(host=dev_host) + # cm2 = ClientManager(host=doma_host) # cm2 = ClientManager(host=atlas_host) + cm2 = ClientManager(host=slac_k8s_dev_host) # print(reqs) print("num requests: %s" % len(reqs)) From 137c5fda28bb9cf1110e7601a872f79db73b4ca4 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 30 Nov 2022 19:44:20 +0100 Subject: [PATCH 05/91] add content_dep_ids and corresponding funcs --- main/lib/idds/core/processings.py | 10 +++++++++- main/lib/idds/orm/base/models.py | 4 +++- main/lib/idds/orm/contents.py | 25 +++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index f193be80..d6d52a08 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -286,7 +286,8 @@ def update_processing_with_collection_contents(updated_processing, new_processin @transactional_session def update_processing_contents(update_processing, update_contents, update_messages=None, new_contents=None, - update_collections=None, messages=None, message_bulk_size=2000, session=None): + update_dep_contents=None, update_collections=None, messages=None, + message_bulk_size=2000, session=None): """ Update processing with contents. 
@@ -299,6 +300,13 @@ def update_processing_contents(update_processing, update_contents, update_messag orm_contents.update_contents(update_contents, session=session) if new_contents: orm_contents.add_contents(new_contents, session=session) + if update_dep_contents: + request_id, update_dep_contents_status_name, update_dep_contents_status = update_dep_contents + for status_name in update_dep_contents_status_name: + status = update_dep_contents_status_name[status_name] + status_content_ids = update_dep_contents_status[status_name] + if status_content_ids: + orm_contents.update_dep_contents(request_id, status_content_ids, status, session=session) if update_processing: orm_processings.update_processing(processing_id=update_processing['processing_id'], parameters=update_processing['parameters'], diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index 472d00d2..ee97bac4 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2022 """ @@ -500,6 +500,7 @@ class Content(BASE, ModelBase): request_id = Column(BigInteger().with_variant(Integer, "sqlite")) workload_id = Column(Integer()) map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0) + content_dep_id = Column(BigInteger()) scope = Column(String(SCOPE_LENGTH)) name = Column(String(LONG_NAME_LENGTH)) min_id = Column(Integer(), default=0) @@ -533,6 +534,7 @@ class Content(BASE, ModelBase): CheckConstraint('coll_id IS NOT NULL', name='CONTENTS_COLL_ID_NN'), Index('CONTENTS_STATUS_UPDATED_IDX', 'status', 'locking', 'updated_at', 'created_at'), Index('CONTENTS_ID_NAME_IDX', 'coll_id', 'scope', 'name', 'status'), + Index('CONTENTS_DEP_IDX', 'request_id', 'transform_id', 'content_dep_id'), Index('CONTENTS_REQ_TF_COLL_IDX', 'request_id', 'transform_id', 'coll_id', 'status')) diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index cc9a46ec..e0398e05 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -549,6 +549,31 @@ def update_contents(parameters, session=None): raise exceptions.NoObject('Content cannot be found: %s' % (error)) +@transactional_session +def update_dep_contents(request_id, content_dep_ids, status, bulk_size=1000, session=None): + """ + update dependency contents. + + :param content_dep_ids: list of content dependency id. + :param status: Content status. + :param session: The database session in use. + + :raises NoObject: If no content is founded. + :raises DatabaseException: If there is a database error. 
+ + """ + try: + params = {'substatus': status} + chunks = [content_dep_ids[i:i + bulk_size] for i in range(0, len(content_dep_ids), bulk_size)] + for chunk in chunks: + session.query(models.Content).with_hint(models.Content, "INDEX(CONTENTS CONTENTS_DEP_IDX)")\ + .filter(models.Content.request_id == request_id)\ + .filter(models.Content.content_id.in_(chunk))\ + .update(params, synchronize_session=False) + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('Content cannot be found: %s' % (error)) + + @transactional_session def delete_content(content_id=None, session=None): """ From ae7d9003750a9411780593fa9331f86b98dad410 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 30 Nov 2022 19:57:27 +0100 Subject: [PATCH 06/91] update timeout --- main/lib/idds/agents/common/plugins/messaging.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index 78c14ffd..648187dd 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2022 import logging @@ -46,7 +46,7 @@ def on_error(self, frame): self.logger.error('[broker] [%s]: %s', self.__broker, frame.body) def on_message(self, frame): - # self.logger.info('[broker] [%s]: %s', self.__broker, body) + self.logger.debug('[broker] [%s]: %s', self.__broker, frame.body) self.__output_queue.put(frame.body) pass @@ -109,9 +109,7 @@ def connect_to_messaging_brokers(self, sender=True): self.logger.info("Resolved broker addresses: %s" % broker_addresses) - timeout = None - if sender: - timeout = self.broker_timeout + timeout = self.broker_timeout conns = [] for broker, port in broker_addresses: From c3dcf3b2827866d8f5559cc0f23a02f5d6a13ccb Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 30 Nov 2022 20:01:19 +0100 Subject: [PATCH 07/91] optimize trigger to release jobs --- main/lib/idds/agents/carrier/poller.py | 1 + main/lib/idds/agents/carrier/trigger.py | 10 +- main/lib/idds/agents/carrier/utils.py | 331 ++++++++++-------------- 3 files changed, 147 insertions(+), 195 deletions(-) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index ca15f88d..1bf197fd 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -193,6 +193,7 @@ def update_processing(self, processing, processing_model): core_processings.update_processing_contents(update_processing=processing.get('update_processing', None), update_collections=processing.get('update_collections', None), update_contents=processing.get('update_contents', None), + update_dep_contents=processing.get('update_dep_contents', None), messages=processing.get('messages', None), update_messages=processing.get('update_messages', None), new_contents=processing.get('new_contents', None)) diff --git a/main/lib/idds/agents/carrier/trigger.py b/main/lib/idds/agents/carrier/trigger.py index 549b48af..4adfdbf5 100644 --- a/main/lib/idds/agents/carrier/trigger.py +++ b/main/lib/idds/agents/carrier/trigger.py @@ -73,10 +73,11 @@ def get_trigger_processings(self): def handle_trigger_processing(self, processing): try: log_prefix = self.get_log_prefix(processing) - process_status, update_contents, ret_msgs, parameters = handle_trigger_processing(processing, - self.agent_attributes, - logger=self.logger, - 
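The chunked substatus update above (orm update_dep_contents, hinted at the new CONTENTS_DEP_IDX index) is fed from the carrier trigger: the call being rewritten here returns an update_dep_contents tuple of (request_id, a map from ContentStatus name to the status object, a map from that same name to the content ids to update), which core update_processing_contents unpacks and applies one status at a time. An illustrative payload, with invented request and content ids:

```python
# Build and apply an update_dep_contents payload directly; the agent normally does this
# through the poller/trigger, so the direct call here is only for illustration.
from idds.common.constants import ContentStatus
from idds.core import processings as core_processings

request_id = 12345                                            # hypothetical request
update_dep_contents_status_name = {'Available': ContentStatus.Available,
                                   'FinalFailed': ContentStatus.FinalFailed}
update_dep_contents_status = {'Available': [1001, 1002],     # content ids of terminated outputs
                              'FinalFailed': [1003]}

core_processings.update_processing_contents(
    update_processing=None,
    update_contents=[],
    update_dep_contents=(request_id, update_dep_contents_status_name, update_dep_contents_status))
```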
log_prefix=log_prefix) + ret_trigger_processing = handle_trigger_processing(processing, + self.agent_attributes, + logger=self.logger, + log_prefix=log_prefix) + process_status, update_contents, ret_msgs, parameters, update_dep_contents_status_name, update_dep_contents_status = ret_trigger_processing new_process_status = process_status if is_process_terminated(process_status): @@ -95,6 +96,7 @@ def handle_trigger_processing(self, processing): ret = {'update_processing': update_processing, 'update_contents': update_contents, 'messages': ret_msgs, + 'update_dep_contents': (processing['request_id'], update_dep_contents_status_name, update_dep_contents_status), 'processing_status': new_process_status} except exceptions.ProcessFormatNotSupported as ex: self.logger.error(ex) diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 0132be94..d2b3efab 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -184,9 +184,23 @@ def get_input_output_maps(transform_id, work): return mapped_input_output_maps +def resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents): + contents = core_catalog.get_contents_by_coll_id_status(coll_id=new_input_dep_coll_ids) + content_name_id_map = {} + for content in contents: + if content['coll_id'] not in content_name_id_map: + content_name_id_map[content['coll_id']] = {} + content_name_id_map[content['coll_id']][content['name']] = content['content_id'] + for content in new_input_dependency_contents: + content_dep_id = content_name_id_map[content['coll_id']][content['name']] + content['content_dep_id'] = content_dep_id + return new_input_dependency_contents + + def get_new_contents(request_id, transform_id, workload_id, new_input_output_maps): new_input_contents, new_output_contents, new_log_contents = [], [], [] new_input_dependency_contents = [] + new_input_dep_coll_ids = [] for map_id in new_input_output_maps: inputs = new_input_output_maps[map_id]['inputs'] if 'inputs' in new_input_output_maps[map_id] else [] inputs_dependency = new_input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in new_input_output_maps[map_id] else [] @@ -199,12 +213,16 @@ def get_new_contents(request_id, transform_id, workload_id, new_input_output_map for input_content in inputs_dependency: content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.InputDependency) new_input_dependency_contents.append(content) + if content['coll_id'] not in new_input_dep_coll_ids: + new_input_dep_coll_ids.append(content['coll_id']) for output_content in outputs: content = get_new_content(request_id, transform_id, workload_id, map_id, output_content, content_relation_type=ContentRelationType.Output) new_output_contents.append(content) for log_content in logs: content = get_new_content(request_id, transform_id, workload_id, map_id, log_content, content_relation_type=ContentRelationType.Log) new_log_contents.append(content) + + new_input_dependency_contents = resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents) return new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents @@ -417,7 +435,8 @@ def handle_new_processing(processing, agent_attributes, logger=None, log_prefix= return True, processing, update_collections, new_contents, ret_msgs, errors -def get_updated_contents_by_request(request_id, transform_id, workload_id, work, 
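resolve_input_dependency_id above is what fills the new content_dep_id column: for every InputDependency row about to be inserted, it looks up the content_id of the already-registered content with the same (coll_id, name), so that dependencies can later be matched by id rather than by name. A toy, self-contained illustration of that mapping (all ids invented; in the real code the map is built from get_contents_by_coll_id_status):

```python
# coll_id -> {name: content_id}
content_name_id_map = {42: {'file_a': 1001, 'file_b': 1002}}

new_input_dependency_contents = [
    {'coll_id': 42, 'name': 'file_a'},     # dependency on the output named 'file_a'
    {'coll_id': 42, 'name': 'file_b'},
]

for content in new_input_dependency_contents:
    content['content_dep_id'] = content_name_id_map[content['coll_id']][content['name']]

print(new_input_dependency_contents)
# [{'coll_id': 42, 'name': 'file_a', 'content_dep_id': 1001},
#  {'coll_id': 42, 'name': 'file_b', 'content_dep_id': 1002}]
```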
terminated=False, logger=None, log_prefix=''): +def get_updated_contents_by_request(request_id, transform_id, workload_id, work, terminated=False, input_output_maps=None, + logger=None, log_prefix=''): logger = get_logger(logger) status_to_check = [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, @@ -442,6 +461,41 @@ def get_updated_contents_by_request(request_id, transform_id, workload_id, work, return updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps +def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=None, log_prefix=''): + updated_contents, updated_contents_full_input, updated_contents_full_output = [], [], [] + updated_contents_full_input_deps = [] + + status_to_check = [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, + ContentStatus.Missing, ContentStatus.Failed, ContentStatus.Lost, + ContentStatus.Deleted] + + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] + inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] + outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] + + for content in inputs: + if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: + u_content = {'content_id': content['content_id'], + 'status': content['substatus']} + updated_contents.append(u_content) + updated_contents_full_input.append(content) + for content in outputs: + if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: + u_content = {'content_id': content['content_id'], + 'status': content['substatus']} + updated_contents.append(u_content) + updated_contents_full_output.append(content) + for content in inputs_dependency: + if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: + u_content = {'content_id': content['content_id'], + 'status': content['substatus']} + updated_contents.append(u_content) + updated_contents_full_input_deps.append(content) + return updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps + + def get_transform_dependency_map(transform_id, logger=None, log_prefix=''): cache = get_redis_cache() transform_dependcy_map_key = "transform_dependcy_map_%s" % transform_id @@ -595,214 +649,101 @@ def get_content_status_with_status_map(content_ids, content_status_map): return content_id_status -def trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, logger=None, log_prefix=''): +def trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, input_output_maps, logger=None, log_prefix=''): logger = get_logger(logger) - content_depency_map, transform_dependency_maps, content_status_map = get_input_dependency_map_by_request(request_id, transform_id, workload_id, work, - logger=logger, log_prefix=log_prefix) - # logger.debug(log_prefix + "content_depency_map[:2]: %s" % str({k: content_depency_map[k] for k in list(content_depency_map.keys())[:2]})) - # logger.debug(log_prefix + "transform_dependency_map[:2]: %s" % str({key: transform_dependency_map[key] for k in list(transform_dependency_map.keys())[:2]})) - logger.debug(log_prefix + 
"transform_dependency_maps.keys[:2]: %s" % str(list(transform_dependency_maps.keys())[:2])) - - update_contents, update_contents_dict = [], {} + update_contents = [] update_input_contents_full = {} - update_content_ids = [] + update_input_contents_full[transform_id] = [] - # release jobs without inputs_dependency - # after reload from the cache, the key will be changed to string. - str_transform_id = str(transform_id) + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] + inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] + # outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] - if str_transform_id not in transform_dependency_maps: - logger.warn(log_prefix + "transform_id %s not in transform_dependency_maps" % transform_id) - else: - transform_dependency_map = transform_dependency_maps[str_transform_id] - for map_id in transform_dependency_map: - inputs_dependency = transform_dependency_map[map_id]['input_deps'] - if len(inputs_dependency) == 0: - inputs = transform_dependency_map[map_id]['inputs'] - for content_id in inputs: - update_content_ids.append(content_id) - u_content = {'content_id': content_id, + if not inputs_dependency: + for content in inputs: + if content['substatus'] != ContentStatus.Available: + u_content = {'content_id': content['content_id'], # 'status': ContentStatus.Available, 'substatus': ContentStatus.Available} - # update_contents.append(u_content) - update_contents_dict[content_id] = u_content - - if update_content_ids: - contents = core_catalog.get_contents_by_content_ids(update_content_ids, request_id=request_id) - for content in contents: - content_id = content['content_id'] - if update_contents_dict[content_id]['substatus'] != content['substatus']: - update_contents.append(update_contents_dict[content_id]) - if content['transform_id'] not in update_input_contents_full: - update_input_contents_full[content['transform_id']] = [] - u_content_full = {'content_id': content_id, - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'status': ContentStatus.Available, - 'substatus': ContentStatus.Available, - 'scope': content['scope'], - 'name': content['name'], - 'path': content['path']} - update_input_contents_full[content['transform_id']].append(u_content_full) + update_contents.append(u_content) + content['status'] = ContentStatus.Available + content['substatus'] = ContentStatus.Available + update_input_contents_full[transform_id].append(content) return update_contents, update_input_contents_full def trigger_release_inputs(request_id, transform_id, workload_id, work, updated_contents_full_output, updated_contents_full_input, - updated_contents_full_input_deps, logger=None, log_prefix=''): + updated_contents_full_input_deps, input_output_maps, logger=None, log_prefix=''): logger = get_logger(logger) status_to_check = [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, ContentStatus.Missing] # status_to_check_fake = [ContentStatus.FakeAvailable, ContentStatus.Missing] - content_depency_map, transform_dependency_maps, content_status_map = get_input_dependency_map_by_request(request_id, transform_id, workload_id, work, - logger=logger, log_prefix=log_prefix) - # logger.debug(log_prefix 
+ "content_depency_map[:2]: %s" % str({k: content_depency_map[k] for k in list(content_depency_map.keys())[:2]})) - # logger.debug(log_prefix + "transform_dependency_map[:2]: %s" % str({key: transform_dependency_maps[key] for k in list(transform_dependency_maps.keys())[:2]})) - logger.debug(log_prefix + "transform_dependency_maps.keys[:2]: %s" % str(list(transform_dependency_maps.keys())[:2])) - - triggered_map_ids, triggered_map_ids_input = [], [] update_contents = [] - update_trigger_contents_dict, update_contents_dict = {}, {} + update_contents_status = {} + update_contents_status_name = {} update_input_contents_full = {} - # 1. use the outputs to check input_dependency - for content in updated_contents_full_output: - # update the status - u_content = {'content_id': content['content_id'], 'status': content['substatus']} - update_contents.append(u_content) + update_input_contents_full[transform_id] = [] - if content['substatus'] in status_to_check: - str_content_id = str(content['content_id']) - content_status_map[str_content_id] = content['substatus'].value - - if str_content_id in content_depency_map: - # t_contents are the contents to be triggered - t_contents = content_depency_map[str_content_id] - for t_content in t_contents: - t_content_id, t_transform_id, t_map_id = t_content - - content_status_map[str(t_content_id)] = content['substatus'].value - # update the input_dependency - u_content = {'content_id': t_content_id, - # 'status': content['status'], - 'substatus': content['substatus']} - # update_contents.append(u_content) - # update_contents_ids.append(t_content_id) - update_trigger_contents_dict[t_content_id] = u_content + for status in status_to_check: + update_contents_status[status.name] = [] + update_contents_status_name[status.name] = status - t_tf_map_id = (str(t_transform_id), str(t_map_id)) - if t_tf_map_id not in triggered_map_ids: - triggered_map_ids.append(t_tf_map_id) - - # 2. use the inputs to check the outputs - # If the input is missing or fakeavailable, set the outputs missing. - for content in updated_contents_full_input: + for content in updated_contents_full_output: + # update the status # u_content = {'content_id': content['content_id'], 'status': content['substatus']} # update_contents.append(u_content) - # if content['substatus'] in status_to_check_fake: - if content['substatus'] in status_to_check: - content_status_map[str(content['content_id'])] = content['substatus'].value - t_tf_map_id = (str(content['transform_id']), str(content['map_id'])) - if t_tf_map_id not in triggered_map_ids_input: - triggered_map_ids_input.append(t_tf_map_id) - for t_tf_map_id in triggered_map_ids_input: - transform_id, map_id = t_tf_map_id - inputs = transform_dependency_maps[transform_id][map_id]['inputs'] - inputs_status = get_content_status_with_status_map(inputs, content_status_map) - - if is_all_contents_available(inputs_status): - for content_id, status in inputs_status: - u_content = {'content_id': content_id, - 'status': status, - 'substatus': status} - update_contents.append(u_content) - elif is_all_contents_terminated_but_not_available(inputs_status): - # for this case, will not generate jobs. so we need to set the output status. 
- for content_id, status in inputs_status: - u_content = {'content_id': content_id, - 'status': status, - 'substatus': status} - update_contents.append(u_content) - # update the outputs - outputs = transform_dependency_maps[transform_id][map_id]['outputs'] - for content_id in outputs: - u_content = {'content_id': content_id, - # 'status': content_update_status, - 'substatus': ContentStatus.Missing} - update_contents.append(u_content) - # 3. use the input_deps to update the trigger contents - for content in updated_contents_full_input_deps: if content['substatus'] in status_to_check: - content_status_map[str(content['content_id'])] = content['substatus'].value - # u_content = {'content_id': content['content_id'], - # # 'status': content['status'], - # 'substatus': content['substatus']} - # update_trigger_contents_dict[content['content_id']] = u_content - t_tf_map_id = (str(content['transform_id']), str(content['map_id'])) - if t_tf_map_id not in triggered_map_ids: - triggered_map_ids.append(t_tf_map_id) - - # update the content status - set_content_status_map(request_id, content_status_map, logger=logger, log_prefix=log_prefix) - - # 4. use the updated input_dependency to release inputs - input_update_content_ids = [] - for t_tf_map_id in triggered_map_ids: - transform_id, map_id = t_tf_map_id - inputs_dependency = transform_dependency_maps[transform_id][map_id]['input_deps'] - inputs_dependency_status = get_content_status_with_status_map(inputs_dependency, content_status_map) + update_contents_status[content['substatus'].name].append(content['content_id']) - content_update_status = None - if is_all_contents_available(inputs_dependency_status): - content_update_status = ContentStatus.Available - elif is_all_contents_terminated(inputs_dependency_status): - content_update_status = ContentStatus.Missing - if content_update_status: - for content_id, status in inputs_dependency_status: - # update input dependencies status from substatus - u_content = {'content_id': content_id, - 'status': status, - 'substatus': status} + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] + inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] + outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] + + input_content_update_status = None + if is_all_contents_available(inputs_dependency): + input_content_update_status = ContentStatus.Available + elif is_all_contents_terminated(inputs_dependency): + input_content_update_status = ContentStatus.Missing + if input_content_update_status: + for content in inputs_dependency: + # u_content = {'content_id': content['content_id'], 'status': content['substatus']) + # update_contents.append(u_content) + pass + for content in inputs: + u_content = {'content_id': content['content_id'], + 'substatus': input_content_update_status} update_contents.append(u_content) - if content_id in update_trigger_contents_dict: - del update_trigger_contents_dict[content_id] - - inputs = transform_dependency_maps[transform_id][map_id]['inputs'] - for content_id in inputs: - u_content = {'content_id': content_id, - # 'status': content_update_status, - 'substatus': content_update_status} + content['status'] = input_content_update_status + content['substatus'] = input_content_update_status + 
update_input_contents_full[transform_id].append(content) + + output_content_update_status = None + if is_all_contents_available(inputs): + # wait for the job to finish + # for content in inputs: + # u_content = {'content_id': content['content_id'], 'status': content['substatus']) + # update_contents.append(u_content) + pass + elif is_all_contents_terminated_but_not_available(inputs): + # for content in inputs: + # u_content = {'content_id': content['content_id'], 'status': content['substatus']) + # update_contents.append(u_content) + pass + output_content_update_status = ContentStatus.Missing + if output_content_update_status: + for content in outputs: + u_content = {'content_id': content['content_id'], + 'substatus': output_content_update_status} update_contents.append(u_content) - update_contents_dict[content_id] = u_content - input_update_content_ids.append(content_id) - - # the input is not triggered. only update the substatus, not update the status - for content_id in update_trigger_contents_dict: - update_contents.append(update_trigger_contents_dict[content_id]) - - if input_update_content_ids: - contents = core_catalog.get_contents_by_content_ids(input_update_content_ids, request_id=request_id) - for content in contents: - content_id = content['content_id'] - # if update_contents_dict[content_id]['status'] != content['status']: - # update_contents.append(update_contents_dict[content_id]) - if True: - if content['transform_id'] not in update_input_contents_full: - update_input_contents_full[content['transform_id']] = [] - u_content_full = {'content_id': content_id, - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'status': update_contents_dict[content_id]['substatus'], - 'substatus': update_contents_dict[content_id]['substatus'], - 'scope': content['scope'], - 'name': content['name'], - 'path': content['path']} - update_input_contents_full[content['transform_id']].append(u_content_full) - return update_contents, update_input_contents_full + return update_contents, update_input_contents_full, update_contents_status_name, update_contents_status def poll_missing_outputs(input_output_maps): @@ -901,9 +842,13 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre if not work.use_dependency_to_release_jobs(): return processing['substatus'], [], [], {} else: + input_output_maps = get_input_output_maps(transform_id, work) + logger.debug(log_prefix + "input_output_maps.keys[:2]: %s" % str(list(input_output_maps.keys())[:2])) + + """ content_updates_trigger_no_deps, updated_input_contents_no_deps = [], [] content_updates_trigger_no_deps, updated_input_contents_no_deps = trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, - logger, log_prefix) + input_output_maps, logger, log_prefix) logger.debug(log_prefix + "trigger_release_inputs_no_deps: content_updates_trigger_no_deps[:3] %s" % (content_updates_trigger_no_deps[:3])) # logger.debug(log_prefix + "trigger_release_inputs_no_deps: updated_input_contents_no_deps[:3] %s" % (updated_input_contents_no_deps[:3])) @@ -920,8 +865,9 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre ret_msgs = ret_msgs + msgs is_terminated = is_process_terminated(processing['substatus']) - updated_contents_ret = get_updated_contents_by_request(request_id, transform_id, workload_id, work, terminated=is_terminated, - logger=logger, log_prefix=log_prefix) + """ + + updated_contents_ret = 
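With the cached content/transform dependency maps dropped, the rewritten trigger_release_inputs above decides per map_id directly from input_output_maps: inputs are released (substatus Available) once every input dependency is available, marked Missing once all dependencies terminated without all becoming available, and the outputs of a job that can never run are closed as Missing. A condensed restatement of that decision; the helpers are the ones used in idds.agents.carrier.utils and are assumed importable from there:

```python
from idds.common.constants import ContentStatus
from idds.agents.carrier.utils import (is_all_contents_available,
                                       is_all_contents_terminated,
                                       is_all_contents_terminated_but_not_available)


def release_decision(inputs, inputs_dependency):
    """Return (substatus for the map's inputs, substatus for its outputs); None means no change."""
    release_inputs_as, close_outputs_as = None, None
    if is_all_contents_available(inputs_dependency):
        release_inputs_as = ContentStatus.Available      # all dependencies produced: release the job
    elif is_all_contents_terminated(inputs_dependency):
        release_inputs_as = ContentStatus.Missing         # dependencies done but not all available
    if is_all_contents_terminated_but_not_available(inputs):
        close_outputs_as = ContentStatus.Missing          # the job will never run: close its outputs
    return release_inputs_as, close_outputs_as
```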
get_updated_contents_by_input_output_maps(input_output_maps=input_output_maps, logger=logger, log_prefix=log_prefix) updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps = updated_contents_ret logger.debug(log_prefix + "handle_trigger_processing: updated_contents[:3] %s" % (updated_contents[:3])) @@ -941,12 +887,15 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre # content_updates = content_updates + updated_contents # updated_contents_full_output_input_deps = updated_contents_full_output + updated_contents_full_input_deps - if updated_contents_full_output or updated_contents_full_input_deps or updated_contents_full_input: - content_updates_trigger, updated_input_contents = trigger_release_inputs(request_id, transform_id, workload_id, work, - updated_contents_full_output, - updated_contents_full_input, - updated_contents_full_input_deps, - logger, log_prefix) + # if updated_contents_full_output or updated_contents_full_input_deps or updated_contents_full_input: + if True: + ret_trigger_release_inputs = trigger_release_inputs(request_id, transform_id, workload_id, work, + updated_contents_full_output, + updated_contents_full_input, + updated_contents_full_input_deps, + input_output_maps, + logger, log_prefix) + content_updates_trigger, updated_input_contents, update_contents_status_name, update_contents_status = ret_trigger_release_inputs logger.debug(log_prefix + "trigger_release_inputs: content_updates_trigger[:3] %s" % (content_updates_trigger[:3])) # logger.debug(log_prefix + "trigger_release_inputs: updated_input_contents[:3] %s" % (updated_input_contents[:3])) @@ -961,7 +910,7 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre msgs = generate_messages(trigger_req_id, trigger_tf_id, trigger_workload_id, work, msg_type='file', files=updated_input_contents[trigger_tf_id], relation_type='input') ret_msgs = ret_msgs + msgs - return processing['substatus'], content_updates, ret_msgs, {} + return processing['substatus'], content_updates, ret_msgs, {}, update_contents_status_name, update_contents_status def get_content_status_from_panda_msg_status(status): From 181d600b8ccdad7a12238bd590c37bd6372add8f Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 1 Dec 2022 16:52:23 +0100 Subject: [PATCH 08/91] propagate job info to contents_ext --- doma/lib/idds/doma/workflow/domapandawork.py | 4 +- .../lib/idds/doma/workflowv2/domapandawork.py | 178 +++++++++++++++--- 2 files changed, 149 insertions(+), 33 deletions(-) diff --git a/doma/lib/idds/doma/workflow/domapandawork.py b/doma/lib/idds/doma/workflow/domapandawork.py index 0a5bf035..b9bb6d68 100644 --- a/doma/lib/idds/doma/workflow/domapandawork.py +++ b/doma/lib/idds/doma/workflow/domapandawork.py @@ -1033,7 +1033,7 @@ def abort_processing(self, processing, log_prefix=''): def resume_processing(self, processing, log_prefix=''): self.reactivate_processing(processing, log_prefix=log_prefix) - def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): + def poll_processing_updates(self, processing, input_output_maps, contents_ext=None, log_prefix=''): """ *** Function called by Carrier agent. 
""" @@ -1052,7 +1052,7 @@ def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): if update_contents: proc.has_new_updates() - return processing_status, update_contents, {}, update_contents_full, {} + return processing_status, update_contents, {}, update_contents_full, {}, [], [] def get_status_statistics(self, registered_input_output_maps): status_statistics = {} diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index c0c96628..2f8f4582 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -749,23 +749,31 @@ def get_update_contents_from_map_id(self, map_id, input_output_maps, job_info): def get_panda_job_status(self, jobids): self.logger.debug("get_panda_job_status, jobids[:10]: %s" % str(jobids[:10])) - from pandaclient import Client - ret = Client.getJobStatus(jobids, verbose=0) - if ret[0] == 0: - left_jobids = [] - ret_jobs = [] - jobs_list = ret[1] - for jobid, jobinfo in zip(jobids, jobs_list): - if jobinfo is None: - left_jobids.append(jobid) - else: - ret_jobs.append(jobinfo) - if left_jobids: - ret1 = Client.getFullJobStatus(ids=left_jobids, verbose=False) - if ret1[0] == 0: - left_jobs_list = ret1[1] - ret_jobs = ret_jobs + left_jobs_list - return ret_jobs + try: + from pandaclient import Client + ret = Client.getJobStatus(jobids, verbose=0) + if ret[0] == 0: + left_jobids = [] + ret_jobs = [] + jobs_list = ret[1] + for jobid, jobinfo in zip(jobids, jobs_list): + if jobinfo is None: + left_jobids.append(jobid) + else: + ret_jobs.append(jobinfo) + if left_jobids: + try: + ret1 = Client.getFullJobStatus(ids=left_jobids, verbose=False) + if ret1[0] == 0: + left_jobs_list = ret1[1] + ret_jobs = ret_jobs + left_jobs_list + except Exception as ex: + self.logger.error(str(ex)) + self.logger.error(traceback.format_exc()) + return ret_jobs + except Exception as ex: + self.logger.error(str(ex)) + self.logger.error(traceback.format_exc()) return [] def map_panda_ids(self, unregistered_job_ids, input_output_maps): @@ -850,7 +858,7 @@ def poll_panda_jobs(self, job_ids): # input_file = job_file.lfn.split(':')[1] else: input_file = job_file.lfn - inputname_jobid_map[input_file] = {'panda_id': job_info.PandaID, 'status': job_status} + inputname_jobid_map[input_file] = {'panda_id': job_info.PandaID, 'status': job_status, 'job_info': job_info} else: self.logger.warn("poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), jobs_list)) return inputname_jobid_map @@ -882,11 +890,13 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m update_contents = [] update_contents_full = [] + contents_ext_full = {} num_updated_contents, num_unupdated_contents = 0, 0 for inputname in inputnames: panda_id_status = inputname_jobid_map[inputname] panda_id = panda_id_status['panda_id'] panda_status = panda_id_status['status'] + job_info = panda_id_status['job_info'] map_id_contents = inputname_mapid_map[inputname] contents = map_id_contents['outputs'] for content in contents: @@ -933,11 +943,108 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m # num_unupdated_contents += 1 pass + if panda_status in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + contents_ext_full[content['content_id']] = {'content': content, 'job_info': job_info} + self.logger.debug("get_update_contents, num_updated_contents: %s, 
num_unupdated_contents: %s" % (num_updated_contents, num_unupdated_contents)) self.logger.debug("get_update_contents, update_contents[:3]: %s" % (str(update_contents[:3]))) - return update_contents, update_contents_full + return update_contents, update_contents_full, contents_ext_full + + def get_contents_ext_detail(self, contents_ext_full, contents_ext_ids, job_info_items={}): + contents_ext_full_ids = set(contents_ext_full.keys()) + new_ids = contents_ext_full_ids - contents_ext_ids + to_update_ids = contents_ext_full_ids - new_ids + new_contents_ext, update_contents_ext = [], [] + + for new_id in new_ids: + content = contents_ext_full[new_id]['content'] + job_info = contents_ext_full[new_id]['job_info'] + new_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': content['status']} + for job_info_item in job_info_items: + new_content_ext[job_info_item] = getattr(job_info, job_info_item) + + new_contents_ext.append(new_content_ext) + for to_update_id in to_update_ids: + content = contents_ext_full[new_id]['content'] + job_info = contents_ext_full[new_id]['job_info'] + update_content_ext = {'content_id': content['content_id'], 'status': content['status']} + for job_info_item in job_info_items: + update_content_ext[job_info_item] = getattr(job_info, job_info_item) + update_contents_ext.append(update_content_ext) + return new_contents_ext, update_contents_ext + + def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, job_info_items={}): + contents_ext_ids = [content['content_id'] for content in contents_ext] + contents_ext_ids = set(contents_ext_ids) + contents_ext_panda_ids = [content['PandaID'] for content in contents_ext] + contents_ext_panda_ids = set(contents_ext_panda_ids) + + new_contents_ext, update_contents_ext = [], [] + terminated_contents, terminated_contents_full = [], {} + terminated_contents_full_no_panda, terminated_contents_full_no_panda_full = [], {} - def poll_panda_task(self, processing=None, input_output_maps=None, log_prefix=''): + for map_id in input_output_maps: + # inputs = input_output_maps[map_id]['inputs'] + outputs = input_output_maps[map_id]['outputs'] + + for content in outputs: + if content['substatus'] in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + # terminated_contents.append(content['content_id']) + # terminated_contents_full[content['content_id']] = content + if content['content_metadata'] and 'panda_id' in content['content_metadata']: + terminated_contents.append(content['content_metadata']['panda_id']) + terminated_contents_full[content['content_metadata']['panda_id']] = content + else: + terminated_contents_full_no_panda.append(content['content_id']) + terminated_contents_full_no_panda_full[content['content_id']] = content + + to_check_panda_ids = [] + terminated_contents = set(terminated_contents) + contents_ext_full_panda_ids = [contents_ext_full[content_id]['job_info'].PandaID for content_id in contents_ext_full] + contents_ext_full_panda_ids = set(contents_ext_full_panda_ids) + to_check_panda_ids = terminated_contents - contents_ext_panda_ids - contents_ext_full_panda_ids + + terminated_contents_full_no_panda = set(terminated_contents_full_no_panda) + final_term_contents = terminated_contents_full_no_panda - contents_ext_ids + for 
content_id in final_term_contents: + new_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': content['status']} + new_contents_ext.append(new_content_ext) + + left_panda_ids = [] + if to_check_panda_ids: + checked_panda_ids = [] + ret_job_infos = self.get_panda_job_status(to_check_panda_ids) + for job_info in ret_job_infos: + checked_panda_ids.append(job_info.PandaID) + content = terminated_contents_full[job_info.PandaID] + contents_ext_full[content['content_id']] = {'content': content, 'job_info': job_info} + + to_check_panda_ids = set(to_check_panda_ids) + checked_panda_ids = set(checked_panda_ids) + left_panda_ids = to_check_panda_ids - checked_panda_ids + left_panda_ids = list(left_panda_ids) + + new_contents_ext1, update_contents_ext1 = self.get_contents_ext_detail(contents_ext_full, contents_ext_ids, job_info_items) + new_contents_ext = new_contents_ext + new_contents_ext1 + update_contents_ext = update_contents_ext + update_contents_ext1 + + return new_contents_ext, update_contents_ext, left_panda_ids + + def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext=None, job_info_items={}, log_prefix=''): task_id = None try: from pandaclient import Client @@ -974,20 +1081,22 @@ def poll_panda_task(self, processing=None, input_output_maps=None, log_prefix='' inputname_jobid_map = self.poll_panda_jobs(unterminated_jobs) intersection_keys = set(inputname_mapid_map.keys()) & set(inputname_jobid_map.keys()) - updated_contents, update_contents_full = self.get_update_contents(list(intersection_keys), - inputname_mapid_map, - inputname_jobid_map) + updated_contents, update_contents_full, contents_ext_full = self.get_update_contents(list(intersection_keys), + inputname_mapid_map, + inputname_jobid_map) - return processing_status, updated_contents, update_contents_full + new_contents_ext, update_contents_ext = self.get_contents_ext(input_output_maps, contents_ext, + contents_ext_full, job_info_items) + return processing_status, updated_contents, update_contents_full, new_contents_ext, update_contents_ext else: - return ProcessingStatus.Running, [], [] + return ProcessingStatus.Running, [], [], [], [] except Exception as ex: msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) self.logger.error(log_prefix + msg) self.logger.error(log_prefix + str(ex)) self.logger.error(traceback.format_exc()) # raise exceptions.IDDSException(msg) - return ProcessingStatus.Running, [], [] + return ProcessingStatus.Running, [], [], [], [] def kill_processing(self, processing, log_prefix=''): try: @@ -1043,7 +1152,10 @@ def abort_processing(self, processing, log_prefix=''): def resume_processing(self, processing, log_prefix=''): self.reactivate_processing(processing, log_prefix=log_prefix) - def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): + def require_ext_contents(self): + return True + + def poll_processing_updates(self, processing, input_output_maps, contents_ext=None, job_info_items={}, log_prefix=''): """ *** Function called by Carrier agent. 
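One detail worth flagging in the contents_ext bookkeeping above: in get_contents_ext_detail the update branch indexes contents_ext_full with new_id, the loop variable of the preceding block, and in get_contents_ext the loop over final_term_contents builds its row from a content left over from the earlier scan instead of looking it up by content_id. A sketch of what both loops are presumably meant to do, with the structures passed in explicitly and field names as in the diff:

```python
def build_update_contents_ext(contents_ext_full, to_update_ids, job_info_items):
    # rows that already exist in contents_ext: refresh the status plus the selected job fields
    update_contents_ext = []
    for to_update_id in to_update_ids:
        content = contents_ext_full[to_update_id]['content']
        job_info = contents_ext_full[to_update_id]['job_info']
        update_content_ext = {'content_id': content['content_id'], 'status': content['status']}
        for job_info_item in job_info_items:
            update_content_ext[job_info_item] = getattr(job_info, job_info_item)
        update_contents_ext.append(update_content_ext)
    return update_contents_ext


def build_no_panda_contents_ext(terminated_no_panda_full, final_term_contents):
    # terminated outputs that never got a panda_id: record them with content fields only
    new_contents_ext = []
    for content_id in final_term_contents:
        content = terminated_no_panda_full[content_id]
        new_contents_ext.append({'content_id': content['content_id'],
                                 'request_id': content['request_id'],
                                 'transform_id': content['transform_id'],
                                 'workload_id': content['workload_id'],
                                 'coll_id': content['coll_id'],
                                 'map_id': content['map_id'],
                                 'status': content['status']})
    return new_contents_ext
```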
""" @@ -1054,15 +1166,19 @@ def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): if processing: proc = processing['processing_metadata']['processing'] - processing_status, update_contents, update_contents_full = self.poll_panda_task(processing=processing, - input_output_maps=input_output_maps, - log_prefix=log_prefix) + ret_poll_panda_task = self.poll_panda_task(processing=processing, + input_output_maps=input_output_maps, + contents_ext=contents_ext, + job_info_items=job_info_items, + log_prefix=log_prefix) + + processing_status, update_contents, update_contents_full, new_contents_ext, update_contents_ext = ret_poll_panda_task # self.logger.debug(log_prefix + "poll_processing_updates, processing_status: %s" % str(processing_status)) # self.logger.debug(log_prefix + "poll_processing_updates, update_contents[:10]: %s" % str(update_contents[:10])) if update_contents: proc.has_new_updates() - return processing_status, update_contents, {}, update_contents_full, {} + return processing_status, update_contents, {}, update_contents_full, {}, new_contents_ext, update_contents_ext def get_status_statistics(self, registered_input_output_maps): status_statistics = {} From db90cae0073d5b09f03da02a8c1519e48240f25b Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 1 Dec 2022 16:52:32 +0100 Subject: [PATCH 09/91] propagate job info to contents_ext --- doma/lib/idds/doma/workflowv2/domapandawork.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index 2f8f4582..ff9d6b28 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -1085,8 +1085,11 @@ def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext= inputname_mapid_map, inputname_jobid_map) - new_contents_ext, update_contents_ext = self.get_contents_ext(input_output_maps, contents_ext, - contents_ext_full, job_info_items) + new_contents_ext, update_contents_ext, left_jobs = self.get_contents_ext(input_output_maps, contents_ext, + contents_ext_full, job_info_items) + if left_jobs: + processing_status = ProcessingStatus.Running + return processing_status, updated_contents, update_contents_full, new_contents_ext, update_contents_ext else: return ProcessingStatus.Running, [], [], [], [] From 076263bcf91b929ad4ca7becf9c3525742108f46 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 1 Dec 2022 16:57:28 +0100 Subject: [PATCH 10/91] add contents_ext and funcs to fill it --- main/lib/idds/agents/carrier/poller.py | 12 +- main/lib/idds/agents/carrier/utils.py | 20 ++- main/lib/idds/core/catalog.py | 73 ++++++++++ main/lib/idds/orm/base/models.py | 94 ++++++++++++- main/lib/idds/orm/contents.py | 184 ++++++++++++++++++++++++- workflow/lib/idds/workflow/work.py | 5 +- workflow/lib/idds/workflowv2/work.py | 5 +- 7 files changed, 379 insertions(+), 14 deletions(-) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index 1bf197fd..02a591b4 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -239,10 +239,12 @@ def update_processing(self, processing, processing_model): def handle_update_processing(self, processing): try: log_prefix = self.get_log_prefix(processing) - process_status, new_contents, ret_msgs, update_contents, parameters = handle_update_processing(processing, - self.agent_attributes, - logger=self.logger, - log_prefix=log_prefix) + 
ret_handle_update_processing = handle_update_processing(processing, + self.agent_attributes, + logger=self.logger, + log_prefix=log_prefix) + + process_status, new_contents, ret_msgs, update_contents, parameters, new_contents_ext, update_contents_ext = ret_handle_update_processing proc = processing['processing_metadata']['processing'] work = proc.work @@ -287,6 +289,8 @@ def handle_update_processing(self, processing): 'update_contents': update_contents, 'new_contents': new_contents, 'messages': ret_msgs, + 'new_contents_ext': new_contents_ext, + 'update_contents_ext': update_contents_ext, 'processing_status': new_process_status} except exceptions.ProcessFormatNotSupported as ex: diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index d2b3efab..2fa5f8d3 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -184,6 +184,11 @@ def get_input_output_maps(transform_id, work): return mapped_input_output_maps +def get_ext_contents(transform_id, work): + contents_ids = core_catalog.get_contents_ext_ids(transform_id=transform_id) + return contents_ids + + def resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents): contents = core_catalog.get_contents_by_coll_id_status(coll_id=new_input_dep_coll_ids) content_name_id_map = {} @@ -793,8 +798,17 @@ def handle_update_processing(processing, agent_attributes, logger=None, log_pref logger.debug(log_prefix + "get_new_input_output_maps: len: %s" % len(new_input_output_maps)) logger.debug(log_prefix + "get_new_input_output_maps.keys[:3]: %s" % str(list(new_input_output_maps.keys())[:3])) - ret_poll_processing = work.poll_processing_updates(processing, input_output_maps, log_prefix=log_prefix) - process_status, content_updates, new_input_output_maps1, updated_contents_full, parameters = ret_poll_processing + if work.require_ext_contents(): + contents_ext = get_ext_contents(transform_id, work) + job_info_items = core_catalog.get_contents_ext_items() + ret_poll_processing = work.poll_processing_updates(processing, input_output_maps, contents_ext=contents_ext, + job_info_items=job_info_items, log_prefix=log_prefix) + process_status, content_updates, new_input_output_maps1, updated_contents_full, parameters, new_contents_ext, update_contents_ext = ret_poll_processing + else: + ret_poll_processing = work.poll_processing_updates(processing, input_output_maps, log_prefix=log_prefix) + new_contents_ext, update_contents_ext = [], [] + process_status, content_updates, new_input_output_maps1, updated_contents_full, parameters = ret_poll_processing + new_input_output_maps.update(new_input_output_maps1) logger.debug(log_prefix + "poll_processing_updates process_status: %s" % process_status) logger.debug(log_prefix + "poll_processing_updates content_updates[:3]: %s" % content_updates[:3]) @@ -822,7 +836,7 @@ def handle_update_processing(processing, agent_attributes, logger=None, log_pref files=updated_contents_full, relation_type='output') ret_msgs = ret_msgs + msgs - return process_status, new_contents, ret_msgs, content_updates + content_updates_missing, parameters + return process_status, new_contents, ret_msgs, content_updates + content_updates_missing, parameters, new_contents_ext, update_contents_ext def handle_trigger_processing(processing, agent_attributes, logger=None, log_prefix=''): diff --git a/main/lib/idds/core/catalog.py b/main/lib/idds/core/catalog.py index eb2b9ea8..2d8924a3 100644 --- a/main/lib/idds/core/catalog.py +++ 
b/main/lib/idds/core/catalog.py @@ -609,3 +609,76 @@ def get_output_contents_by_request_id_status(request_id, name, content_status, l if contents and limit and len(contents) > limit: contents = contents[:limit] return contents + + +def get_contents_ext_items(): + return orm_contents.get_contents_ext_items() + + +@transactional_session +def add_contents_ext(contents, bulk_size=10000, session=None): + """ + Add contents ext. + + :param contents: dict of contents. + :param session: session. + + :raises DuplicatedObject: If a collection with the same name exists. + :raises DatabaseException: If there is a database error. + + :returns: content ids. + """ + return orm_contents.add_contents_ext(contents, bulk_size=bulk_size, session=session) + + +@transactional_session +def update_contents_ext(parameters, session=None): + """ + update contents ext. + + :param parameters: list of dictionary of parameters. + :param session: The database session in use. + + :raises NoObject: If no content is founded. + :raises DatabaseException: If there is a database error. + + """ + return orm_contents.update_contents_ext(parameters, session=session) + + +@read_session +def get_contents_ext(request_id=None, transform_id=None, workload_id=None, coll_id=None, status=None, session=None): + """ + Get content or raise a NoObject exception. + + :param request_id: request id. + :param transform_id: transform id. + :param workload_id: workload id. + + :param session: The database session in use. + + :raises NoObject: If no content is founded. + + :returns: list of contents. + """ + return orm_contents.get_contents_ext(request_id=request_id, transform_id=transform_id, workload_id=workload_id, + coll_id=coll_id, status=status, session=session) + + +@read_session +def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, coll_id=None, status=None, session=None): + """ + Get content or raise a NoObject exception. + + :param request_id: request id. + :param transform_id: transform id. + :param workload_id: workload id. + + :param session: The database session in use. + + :raises NoObject: If no content is founded. + + :returns: list of content ids. 
+ """ + return orm_contents.get_contents_ext_ids(request_id=request_id, transform_id=transform_id, workload_id=workload_id, + coll_id=coll_id, status=status, session=session) diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index ee97bac4..e5849317 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -16,7 +16,7 @@ import datetime from enum import Enum -from sqlalchemy import BigInteger, Boolean, Column, DateTime, Integer, String, event, DDL, Interval +from sqlalchemy import BigInteger, Boolean, Column, DateTime, Integer, String, Float, event, DDL, Interval from sqlalchemy.ext.compiler import compiles # from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import object_mapper @@ -538,6 +538,92 @@ class Content(BASE, ModelBase): Index('CONTENTS_REQ_TF_COLL_IDX', 'request_id', 'transform_id', 'coll_id', 'status')) +class Content_ext(BASE, ModelBase): + """Represents a content extension""" + __tablename__ = 'contents_ext' + content_id = Column(BigInteger().with_variant(Integer, "sqlite"), primary_key=True) + transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) + coll_id = Column(BigInteger().with_variant(Integer, "sqlite")) + request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + workload_id = Column(Integer()) + map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0) + status = Column(EnumWithValue(ContentStatus)) + PandaID = Column(BigInteger()) + jobDefinitionID = Column(BigInteger()) + schedulerID = Column(String(128)) + pilotID = Column(String(200)) + creationTime = Column(DateTime) + modificationTime = Column(DateTime) + startTime = Column(DateTime) + endTime = Column(DateTime) + prodSourceLabel = Column(String(20)) + prodUserID = Column(String(250)) + assignedPriority = Column(Integer()) + currentPriority = Column(Integer()) + attemptNr = Column(Integer()) + maxAttempt = Column(Integer()) + maxCpuCount = Column(Integer()) + maxCpuUnit = Column(String(32)) + maxDiskCount = Column(Integer()) + maxDiskUnit = Column(String(10)) + minRamCount = Column(Integer()) + maxRamUnit = Column(String(10)) + cpuConsumptionTime = Column(Integer()) + cpuConsumptionUnit = Column(String(128)) + jobStatus = Column(String(10)) + jobName = Column(String(255)) + transExitCode = Column(Integer()) + pilotErrorCode = Column(Integer()) + pilotErrorDiag = Column(String(500)) + exeErrorCode = Column(Integer()) + exeErrorDiag = Column(String(500)) + supErrorCode = Column(Integer()) + supErrorDiag = Column(String(250)) + ddmErrorCode = Column(Integer()) + ddmErrorDiag = Column(String(500)) + brokerageErrorCode = Column(Integer()) + brokerageErrorDiag = Column(String(250)) + jobDispatcherErrorCode = Column(Integer()) + jobDispatcherErrorDiag = Column(String(250)) + taskBufferErrorCode = Column(Integer()) + taskBufferErrorDiag = Column(String(300)) + computingSite = Column(String(128)) + computingElement = Column(String(128)) + grid = Column(String(50)) + cloud = Column(String(50)) + cpuConversion = Column(Float()) + taskID = Column(BigInteger()) + vo = Column(String(16)) + pilotTiming = Column(String(100)) + workingGroup = Column(String(20)) + processingType = Column(String(64)) + prodUserName = Column(String(60)) + coreCount = Column(Integer()) + nInputFiles = Column(Integer()) + reqID = Column(BigInteger()) + jediTaskID = Column(BigInteger()) + actualCoreCount = Column(Integer()) + maxRSS = Column(Integer()) + maxVMEM = Column(Integer()) + maxSWAP = Column(Integer()) + maxPSS = 
Column(Integer()) + avgRSS = Column(Integer()) + avgVMEM = Column(Integer()) + avgSWAP = Column(Integer()) + avgPSS = Column(Integer()) + maxWalltime = Column(Integer()) + diskIO = Column(Integer()) + failedAttempt = Column(Integer()) + hs06 = Column(Integer()) + hs06sec = Column(Integer()) + memory_leak = Column(String(10)) + memory_leak_x2 = Column(String(10)) + job_label = Column(String(20)) + + _table_args = (PrimaryKeyConstraint('content_id', name='CONTENTS_EXT_PK'), + Index('CONTENTS_EXT_RTF_IDX', 'request_id', 'transform_id', 'workload_id', 'coll_id', 'content_id', 'PandaID', 'status')) + + class Health(BASE, ModelBase): """Represents the status of the running agents""" __tablename__ = 'health' @@ -598,7 +684,7 @@ class Command(BASE, ModelBase): status = Column(EnumWithValue(CommandStatus)) substatus = Column(Integer()) locking = Column(EnumWithValue(CommandLocking)) - username = Column(String(20)) + username = Column(String(50)) retries = Column(Integer(), default=0) source = Column(EnumWithValue(CommandLocation)) destination = Column(EnumWithValue(CommandLocation)) @@ -619,7 +705,7 @@ def register_models(engine): """ # models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content, Health, Message) - models = (Request, Transform, Processing, Collection, Content, Health, Message, Command) + models = (Request, Transform, Processing, Collection, Content, Content_ext, Health, Message, Command) for model in models: # if not engine.has_table(model.__tablename__, model.metadata.schema): @@ -632,7 +718,7 @@ def unregister_models(engine): """ # models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content, Health, Message) - models = (Request, Transform, Processing, Collection, Content, Health, Message, Command) + models = (Request, Transform, Processing, Collection, Content, Content_ext, Health, Message, Command) for model in models: model.metadata.drop_all(engine) # pylint: disable=maybe-no-member diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index e0398e05..435670d5 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2022 """ @@ -589,3 +589,185 @@ def delete_content(content_id=None, session=None): session.query(models.Content).filter_by(content_id=content_id).delete() except sqlalchemy.orm.exc.NoResultFound as error: raise exceptions.NoObject('Content %s cannot be found: %s' % (content_id, error)) + + +def get_contents_ext_items(): + default_params = {'PandaID': None, 'jobDefinitionID': None, 'schedulerID': None, + 'pilotID': None, 'creationTime': None, 'modificationTime': None, + 'startTime': None, 'endTime': None, 'prodSourceLabel': None, + 'prodUserID': None, 'assignedPriority': None, 'currentPriority': None, + 'attemptNr': None, 'maxAttempt': None, 'maxCpuCount': None, + 'maxCpuUnit': None, 'maxDiskCount': None, 'maxDiskUnit': None, + 'minRamCount': None, 'maxRamUnit': None, 'cpuConsumptionTime': None, + 'cpuConsumptionUnit': None, 'jobStatus': None, 'jobName': None, + 'transExitCode': None, 'pilotErrorCode': None, 'pilotErrorDiag': None, + 'exeErrorCode': None, 'exeErrorDiag': None, 'supErrorCode': None, + 'supErrorDiag': None, 'ddmErrorCode': None, 'ddmErrorDiag': None, + 'brokerageErrorCode': None, 'brokerageErrorDiag': None, + 'jobDispatcherErrorCode': None, 'jobDispatcherErrorDiag': None, + 'taskBufferErrorCode': None, 
'taskBufferErrorDiag': None, + 'computingSite': None, 'computingElement': None, + 'grid': None, 'cloud': None, 'cpuConversion': None, 'taskID': None, + 'vo': None, 'pilotTiming': None, 'workingGroup': None, + 'processingType': None, 'prodUserName': None, 'coreCount': None, + 'nInputFiles': None, 'reqID': None, 'jediTaskID': None, + 'actualCoreCount': None, 'maxRSS': None, 'maxVMEM': None, + 'maxSWAP': None, 'maxPSS': None, 'avgRSS': None, 'avgVMEM': None, + 'avgSWAP': None, 'avgPSS': None, 'maxWalltime': None, 'diskIO': None, + 'failedAttempt': None, 'hs06': None, 'hs06sec': None, + 'memory_leak': None, 'memory_leak_x2': None, 'job_label': None} + return default_params + + +@transactional_session +def add_contents_ext(contents, bulk_size=10000, session=None): + """ + Add contents ext. + + :param contents: dict of contents. + :param session: session. + + :raises DuplicatedObject: If a collection with the same name exists. + :raises DatabaseException: If there is a database error. + + :returns: content ids. + """ + default_params = get_contents_ext_items() + default_params['status'] = ContentStatus.New + + for content in contents: + for key in default_params: + if key not in content: + content[key] = default_params[key] + + sub_params = [contents[i:i + bulk_size] for i in range(0, len(contents), bulk_size)] + + try: + for sub_param in sub_params: + session.bulk_insert_mappings(models.Content, sub_param) + content_ids = [None for _ in range(len(contents))] + return content_ids + except IntegrityError as error: + raise exceptions.DuplicatedObject('Duplicated objects: %s' % (error)) + except DatabaseError as error: + raise exceptions.DatabaseException(error) + + +@transactional_session +def update_contents_ext(parameters, session=None): + """ + update contents ext. + + :param parameters: list of dictionary of parameters. + :param session: The database session in use. + + :raises NoObject: If no content is founded. + :raises DatabaseException: If there is a database error. + + """ + try: + session.bulk_update_mappings(models.Content, parameters) + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('Content cannot be found: %s' % (error)) + + +@read_session +def get_contents_ext(request_id=None, transform_id=None, workload_id=None, coll_id=None, status=None, session=None): + """ + Get content or raise a NoObject exception. + + :param request_id: request id. + :param transform_id: transform id. + :param workload_id: workload id. + + :param session: The database session in use. + + :raises NoObject: If no content is founded. + + :returns: list of contents. 
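
As a small illustration of the default-filling step performed by add_contents_ext() above, a sketch with made-up values; only a few of the job-info keys from get_contents_ext_items() are shown.

    def fill_ext_defaults(record, default_params):
        # Pad a sparse contents_ext record with the default (None) job-info
        # fields so it can be bulk-inserted with a uniform set of columns.
        for key, value in default_params.items():
            record.setdefault(key, value)
        return record

    defaults = {'PandaID': None, 'jobStatus': None, 'computingSite': None}  # subset only
    record = {'content_id': 1234, 'transform_id': 56, 'status': 'Available',
              'PandaID': 987654321, 'jobStatus': 'finished'}
    fill_ext_defaults(record, defaults)
    # 'computingSite' is now present as None; keys already set are left untouched.
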
+ """ + + try: + if status is not None: + if not isinstance(status, (tuple, list)): + status = [status] + + query = session.query(models.Content_ext) + query = query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)") + if request_id: + query = query.filter(models.Content_ext.request_id == request_id) + if transform_id: + query = query.filter(models.Content_ext.transform_id == transform_id) + if workload_id: + query = query.filter(models.Content_ext.workload_id == workload_id) + if coll_id: + query = query.filter(models.Content_ext.coll_id == coll_id) + if status is not None: + query = query.filter(models.Content_ext.status.in_(status)) + query = query.order_by(asc(models.Content.request_id), asc(models.Content.transform_id), asc(models.Content.map_id)) + + tmp = query.all() + rets = [] + if tmp: + for t in tmp: + rets.append(t.to_dict()) + return rets + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('No record can be found with (transform_id=%s): %s' % + (transform_id, error)) + except Exception as error: + raise error + + +@read_session +def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, coll_id=None, status=None, session=None): + """ + Get content or raise a NoObject exception. + + :param request_id: request id. + :param transform_id: transform id. + :param workload_id: workload id. + + :param session: The database session in use. + + :raises NoObject: If no content is founded. + + :returns: list of content ids. + """ + + try: + if status is not None: + if not isinstance(status, (tuple, list)): + status = [status] + + query = session.query(models.Content_ext.request_id, + models.Content_ext.transform_id, + models.Content_ext.workload_id, + models.Content_ext.coll_id, + models.Content_ext.content_id, + models.Content_ext.PandaID, + models.Content_ext.status) + query = query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)") + if request_id: + query = query.filter(models.Content_ext.request_id == request_id) + if transform_id: + query = query.filter(models.Content_ext.transform_id == transform_id) + if workload_id: + query = query.filter(models.Content_ext.workload_id == workload_id) + if coll_id: + query = query.filter(models.Content_ext.coll_id == coll_id) + if status is not None: + query = query.filter(models.Content_ext.status.in_(status)) + query = query.order_by(asc(models.Content.request_id), asc(models.Content.transform_id), asc(models.Content.map_id)) + + tmp = query.all() + rets = [] + if tmp: + for t in tmp: + rets.append(t.to_dict()) + return rets + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('No record can be found with (transform_id=%s): %s' % + (transform_id, error)) + except Exception as error: + raise error diff --git a/workflow/lib/idds/workflow/work.py b/workflow/lib/idds/workflow/work.py index 779cce15..cc69fe05 100644 --- a/workflow/lib/idds/workflow/work.py +++ b/workflow/lib/idds/workflow/work.py @@ -1817,6 +1817,9 @@ def use_dependency_to_release_jobs(self): """ return False + def require_ext_contents(self): + return False + def set_work_name_to_coll_map(self, work_name_to_coll_map): self.work_name_to_coll_map = work_name_to_coll_map @@ -2115,7 +2118,7 @@ def finish_processing(self, processing, forcing=False): if forcing: proc.toforcefinish = True - def poll_processing_updates(self, processing, input_output_maps): + def poll_processing_updates(self, processing, input_output_maps, contents_ext=None, log_prefix=''): """ *** 
Function called by Carrier agent. """ diff --git a/workflow/lib/idds/workflowv2/work.py b/workflow/lib/idds/workflowv2/work.py index 7ffe4e51..dc4f1cc4 100644 --- a/workflow/lib/idds/workflowv2/work.py +++ b/workflow/lib/idds/workflowv2/work.py @@ -1799,6 +1799,9 @@ def use_dependency_to_release_jobs(self): """ return False + def require_ext_contents(self): + return False + def set_work_name_to_coll_map(self, work_name_to_coll_map): self.work_name_to_coll_map = work_name_to_coll_map @@ -2097,7 +2100,7 @@ def finish_processing(self, processing, forcing=False): if forcing: proc.toforcefinish = True - def poll_processing_updates(self, processing, input_output_maps): + def poll_processing_updates(self, processing, input_output_maps, contents_ext=None, log_prefix=''): """ *** Function called by Carrier agent. """ From 5447c64c34aab1c29d38b1ef2bb6e41c1b8e1da4 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 1 Dec 2022 17:33:55 +0100 Subject: [PATCH 11/91] add contents_ext oracle table --- main/etc/sql/oracle_update.sql | 97 ++++++++++++++++++++ main/lib/idds/tests/test_migrate_requests.py | 5 + 2 files changed, 102 insertions(+) diff --git a/main/etc/sql/oracle_update.sql b/main/etc/sql/oracle_update.sql index 91c1afe9..8980e0b1 100644 --- a/main/etc/sql/oracle_update.sql +++ b/main/etc/sql/oracle_update.sql @@ -112,3 +112,100 @@ CREATE INDEX COMMANDS_TYPE_ST_PR_IDX ON COMMANDS (cmd_type, status, destination, alter table transforms add name VARCHAR2(255); alter table collections add failed_files NUMBER(10); alter table collections add missing_files NUMBER(10); + + +-- 2022.11.30 +alter table contents add content_dep_id NUMBER(12); +CREATE INDEX CONTENTS_DEP_IDX ON CONTENTS (request_id, transform_id, content_dep_id) LOCAL; + +-- 2022.12.01 +alter table requests modify username VARCHAR2(50) default null; + +-- oracle 19 +CREATE TABLE CONTENTS_ext +( + content_id NUMBER(12) constraint CONTENT_EXT_PK_NN NOT NULL, + transform_id NUMBER(12) constraint CONTENT_EXT_TF_ID_NN NOT NULL, + coll_id NUMBER(14), + request_id NUMBER(12), + workload_id NUMBER(10), + map_id NUMBER(12) DEFAULT 0, + status NUMBER(2) constraint CONTENT_EXT_STATUS_NN NOT NULL, + PandaID NUMBER(14), + jobDefinitionID NUMBER(12), + schedulerID VARCHAR2(128), + pilotID VARCHAR2(200), + creationTime DATE, + modificationTime DATE, + startTime DATE, + endTime DATE, + prodSourceLabel VARCHAR2(20), + prodUserID VARCHAR2(250), + assignedPriority NUMBER(5), + currentPriority NUMBER(5), + attemptNr NUMBER(5), + maxAttempt NUMBER(5), + maxCpuCount NUMBER(5), + maxCpuUnit VARCHAR2(32), + maxDiskCount NUMBER(12), + maxDiskUnit VARCHAR2(10), + minRamCount NUMBER(12), + maxRamUnit VARCHAR2(10), + cpuConsumptionTime NUMBER(12), + cpuConsumptionUnit VARCHAR2(128), + jobStatus VARCHAR2(10), + jobName VARCHAR2(255), + transExitCode NUMBER(5), + pilotErrorCode NUMBER(5), + pilotErrorDiag VARCHAR2(500), + exeErrorCode NUMBER(5), + exeErrorDiag VARCHAR2(500), + supErrorCode NUMBER(5), + supErrorDiag VARCHAR2(250), + ddmErrorCode NUMBER(5), + ddmErrorDiag VARCHAR2(500), + brokerageErrorCode NUMBER(5), + brokerageErrorDiag VARCHAR2(250), + jobDispatcherErrorCode NUMBER(5), + jobDispatcherErrorDiag VARCHAR2(250), + taskBufferErrorCode NUMBER(5), + taskBufferErrorDiag VARCHAR2(300), + computingSite VARCHAR2(128), + computingElement VARCHAR2(128), + grid VARCHAR2(50), + cloud VARCHAR2(50), + cpuConversion float(20), + taskID NUMBER(12), + vo VARCHAR2(16), + pilotTiming VARCHAR2(100), + workingGroup VARCHAR2(20), + processingType VARCHAR2(64), + prodUserName 
VARCHAR2(60), + coreCount NUMBER(5), + nInputFiles NUMBER(10), + reqID NUMBER(12), + jediTaskID NUMBER(12), + actualCoreCount NUMBER(5), + maxRSS NUMBER(12), + maxVMEM NUMBER(12), + maxSWAP NUMBER(12), + maxPSS NUMBER(12), + avgRSS NUMBER(12), + avgVMEM NUMBER(12), + avgSWAP NUMBER(12), + avgPSS NUMBER(12), + maxWalltime NUMBER(12), + diskIO NUMBER(12), + failedAttempt NUMBER(5), + hs06 NUMBER(12), + hs06sec NUMBER(12), + memory_leak VARCHAR2(10), + memory_leak_x2 VARCHAR2(10), + job_label VARCHAR2(20) + CONSTRAINT CONTENT_EXT_PK PRIMARY KEY (content_id), + CONSTRAINT CONTENT_EXT_TRANSFORM_ID_FK FOREIGN KEY(transform_id) REFERENCES TRANSFORMS(transform_id), +) +PCTFREE 0 +PARTITION BY REFERENCE(CONTENT_EXT_TRANSFORM_ID_FK); + +CREATE INDEX CONTENTS_EXT_RTF_IDX ON CONTENTS (request_id, transform_id, workload_id, coll_id, content_id, PandaID, status) LOCAL; diff --git a/main/lib/idds/tests/test_migrate_requests.py b/main/lib/idds/tests/test_migrate_requests.py index 998c058d..6b6bcac7 100644 --- a/main/lib/idds/tests/test_migrate_requests.py +++ b/main/lib/idds/tests/test_migrate_requests.py @@ -37,8 +37,11 @@ def migrate(): slac_k8s_dev_host = 'https://rubin-panda-idds-dev.slac.stanford.edu:8443/idds' # noqa F841 + cern_k8s_dev_host = 'https://panda-idds-dev.cern.ch/idds' # noqa F841 + # cm1 = ClientManager(host=atlas_host) cm1 = ClientManager(host=doma_host) + # cm1 = ClientManager(host=slac_k8s_dev_host) # reqs = cm1.get_requests(request_id=290) # old_request_id = 298163 # old_request_id = 350723 @@ -51,6 +54,7 @@ def migrate(): old_request_id = 2802 old_request_id = 2816 + # old_request_id = 1 # for old_request_id in [152]: # for old_request_id in [60]: # noqa E115 # for old_request_id in [200]: # noqa E115 @@ -61,6 +65,7 @@ def migrate(): # cm2 = ClientManager(host=doma_host) # cm2 = ClientManager(host=atlas_host) cm2 = ClientManager(host=slac_k8s_dev_host) + # cm2 = ClientManager(host=cern_k8s_dev_host) # print(reqs) print("num requests: %s" % len(reqs)) From 9d22d791afac25d0ff22fdaaa70ce2a0716cbeec Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 1 Dec 2022 19:09:49 +0100 Subject: [PATCH 12/91] publish content_ext messages --- common/lib/idds/common/constants.py | 3 ++ main/lib/idds/agents/carrier/utils.py | 60 ++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/common/lib/idds/common/constants.py b/common/lib/idds/common/constants.py index a3c1b846..2b0fc185 100644 --- a/common/lib/idds/common/constants.py +++ b/common/lib/idds/common/constants.py @@ -349,6 +349,7 @@ class MessageType(IDDSEnum): ProcessingWork = 11 HealthHeartbeat = 12 IDDSCommunication = 13 + ContentExt = 14 UnknownFile = 97 UnknownCollection = 98 UnknownWork = 99 @@ -372,6 +373,7 @@ class MessageTypeStr(IDDSEnum): UnknownFile = 'file_unknown' UnknownCollection = 'collection_unknown' UnknownWork = 'work_unknown' + ContentExt = 'content_ext' TransformType2MessageTypeMap = { @@ -431,6 +433,7 @@ class MessageDestination(IDDSEnum): Carrier = 3 Conductor = 4 Outside = 5 + ContentExt = 6 class CommandType(IDDSEnum): diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 2fa5f8d3..bf9d35b1 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -19,6 +19,7 @@ WorkStatus, TransformType, TransformType2MessageTypeMap, + MessageType, MessageTypeStr, MessageStatus, MessageSource, MessageDestination) from idds.common.utils import setup_logging @@ -314,6 +315,33 @@ def generate_file_messages(request_id, 
transform_id, workload_id, work, files, r return i_msg_type, msg_content, num_msg_content +def generate_content_ext_messages(request_id, transform_id, workload_id, work, files, relation_type, input_output_maps): + i_msg_type = MessageType.ContentExt + i_msg_type_str = MessageTypeStr.ContentExt + + content_map = {} + for map_id in input_output_maps: + outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + for content in outputs: + content_map[content['content_id']] = content + + files_message = [] + for file in files: + file['status'] = file['status'].name + content_id = file['content_id'] + content = content_map[content_id] + file['scope'] = content['scope'] + file['name'] = content['name'] + files_message.append(file) + msg_content = {'msg_type': i_msg_type_str.value, + 'request_id': request_id, + 'workload_id': workload_id, + 'relation_type': relation_type, + 'files': files_message} + num_msg_content = len(files_message) + return i_msg_type, msg_content, num_msg_content + + def generate_collection_messages(request_id, transform_id, workload_id, work, collection, relation_type): coll_name = collection.name if coll_name.endswith(".idds.stagein"): @@ -346,12 +374,12 @@ def generate_work_messages(request_id, transform_id, workload_id, work, relation return i_msg_type, msg_content, num_msg_content -def generate_messages(request_id, transform_id, workload_id, work, msg_type='file', files=[], relation_type='input'): +def generate_messages(request_id, transform_id, workload_id, work, msg_type='file', files=[], relation_type='input', input_output_maps=None): if msg_type == 'file': i_msg_type, msg_content, num_msg_content = generate_file_messages(request_id, transform_id, workload_id, work, files=files, relation_type=relation_type) msg = {'msg_type': i_msg_type, 'status': MessageStatus.New, - 'source': MessageSource.Transformer, + 'source': MessageSource.Carrier, 'destination': MessageDestination.Outside, 'request_id': request_id, 'workload_id': workload_id, @@ -359,6 +387,20 @@ def generate_messages(request_id, transform_id, workload_id, work, msg_type='fil 'num_contents': num_msg_content, 'msg_content': msg_content} return [msg] + elif msg_type == 'content_ext': + i_msg_type, msg_content, num_msg_content = generate_content_ext_messages(request_id, transform_id, workload_id, work, files=files, + relation_type=relation_type, + input_output_maps=input_output_maps) + msg = {'msg_type': i_msg_type, + 'status': MessageStatus.New, + 'source': MessageSource.Carrier, + 'destination': MessageDestination.ContentExt, + 'request_id': request_id, + 'workload_id': workload_id, + 'transform_id': transform_id, + 'num_contents': num_msg_content, + 'msg_content': msg_content} + return [msg] elif msg_type == 'work': # link collections input_collections = work.get_input_collections() @@ -382,7 +424,7 @@ def generate_messages(request_id, transform_id, workload_id, work, msg_type='fil for i_msg_type, msg_content, num_msg_content in msg_type_contents: msg = {'msg_type': i_msg_type, 'status': MessageStatus.New, - 'source': MessageSource.Transformer, + 'source': MessageSource.Carrier, 'destination': MessageDestination.Outside, 'request_id': request_id, 'workload_id': workload_id, @@ -1231,10 +1273,10 @@ def sync_processing(processing, agent_attributes, terminate=False, logger=None, work = proc.work work.set_agent_attributes(agent_attributes, processing) - # input_output_maps = get_input_output_maps(transform_id, work) + input_output_maps = get_input_output_maps(transform_id, 
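
For reference, a rough sketch of the message produced for msg_type='content_ext' above; the identifiers and the single file entry are placeholders, and the enum fields are written out as names.

    content_ext_msg = {
        'msg_type': 'ContentExt',            # MessageType.ContentExt
        'status': 'New',                     # MessageStatus.New
        'source': 'Carrier',                 # MessageSource.Carrier
        'destination': 'ContentExt',         # MessageDestination.ContentExt
        'request_id': 1234,
        'workload_id': 5678,
        'transform_id': 90,
        'num_contents': 1,
        'msg_content': {'msg_type': 'content_ext',
                        'request_id': 1234,
                        'workload_id': 5678,
                        'relation_type': 'output',
                        'files': [{'content_id': 1, 'status': 'Available',
                                   'scope': 'pseudo_dataset', 'name': 'file_0',
                                   'PandaID': 987654321,
                                   'jobStatus': 'finished'}]},
    }
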
work) update_collections, all_updates_flushed = sync_collection_status(request_id, transform_id, workload_id, work, - input_output_maps=None, close_collection=True, - terminate=terminate) + input_output_maps=input_output_maps, + close_collection=True, terminate=terminate) messages = [] sync_work_status(request_id, transform_id, workload_id, work) @@ -1250,6 +1292,12 @@ def sync_processing(processing, agent_attributes, terminate=False, logger=None, processing['status'] = ProcessingStatus.Failed else: processing['status'] = ProcessingStatus.SubFinished + + if work.require_ext_contents(): + contents_ext = core_catalog.get_contents_ext(request_id=request_id, transform_id=transform_id) + msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='content_ext', files=contents_ext, + relation_type='output', input_output_maps=input_output_maps) + messages += msgs return processing, update_collections, messages From 2a657e9519f6c4c443f6c56eb2ab5d71b98dba98 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Fri, 2 Dec 2022 15:34:11 +0100 Subject: [PATCH 13/91] messaging to support multiple channels --- .../idds/agents/common/plugins/messaging.py | 149 ++++++++++-------- main/lib/idds/agents/conductor/conductor.py | 2 +- 2 files changed, 81 insertions(+), 70 deletions(-) diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index 648187dd..0caf0afb 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -62,20 +62,10 @@ def __init__(self, name="MessagingSender", **kwargs): self.output_queue = None self.response_queue = None - if not hasattr(self, 'brokers'): - raise Exception('brokers is required but not defined.') - else: - self.brokers = [b.strip() for b in self.brokers.split(',')] - if not hasattr(self, 'port'): - raise Exception('port is required but not defined.') - if not hasattr(self, 'vhost'): - self.vhost = None - if not hasattr(self, 'destination'): - raise Exception('destination is required but not defined.') - if not hasattr(self, 'broker_timeout'): - self.broker_timeout = 60 - else: - self.broker_timeout = int(self.broker_timeout) + if not hasattr(self, 'channels'): + raise Exception('"channels" is required but not defined.') + + self.broker_timeout = 3600 self.conns = [] @@ -92,67 +82,85 @@ def set_response_queue(self, response_queue): self.response_queue = response_queue def connect_to_messaging_brokers(self, sender=True): - broker_addresses = [] - for b in self.brokers: - try: - if ":" in b: + channel_conns = {} + for name in self.channels: + channel = self.channels[name] + brokers = channel['brokers'] + # destination = channel['destination'] + # username = channel['username'] + # password = channel['password'] + broker_timeout = channel['broker_timeout'] + + broker_addresses = [] + for b in brokers: + try: b, port = b.split(":") - else: - port = self.port - addrinfos = socket.getaddrinfo(b, 0, socket.AF_INET, 0, socket.IPPROTO_TCP) - for addrinfo in addrinfos: - b_addr = addrinfo[4][0] - broker_addresses.append((b_addr, port)) - except socket.gaierror as error: - self.logger.error('Cannot resolve hostname %s: %s' % (b, str(error))) + addrinfos = socket.getaddrinfo(b, 0, socket.AF_INET, 0, socket.IPPROTO_TCP) + for addrinfo in addrinfos: + b_addr = addrinfo[4][0] + broker_addresses.append((b_addr, port)) + except socket.gaierror as error: + self.logger.error('Cannot resolve hostname %s: %s' % (b, str(error))) - self.logger.info("Resolved broker 
addresses: %s" % broker_addresses) + self.logger.info("Resolved broker addresses for channel %s: %s" % (name, broker_addresses)) - timeout = self.broker_timeout + timeout = broker_timeout - conns = [] - for broker, port in broker_addresses: - conn = stomp.Connection12(host_and_ports=[(broker, port)], - vhost=self.vhost, - keepalive=True, - heartbeats=(60000, 60000), # one minute - timeout=timeout) - conns.append(conn) - return conns + conns = [] + for broker, port in broker_addresses: + conn = stomp.Connection12(host_and_ports=[(broker, port)], + vhost=self.vhost, + keepalive=True, + heartbeats=(60000, 60000), # one minute + timeout=timeout) + conns.append(conn) + channel_conns[name] = conns + return channel_conns def disconnect(self, conns): - for conn in conns: - try: - conn.disconnect() - except Exception: - pass + for name in conns: + for conn in conns[name]: + try: + conn.disconnect() + except Exception: + pass - def get_connection(self): + def get_connection(self, destination): try: - conn = random.sample(self.conns, 1)[0] + if destination not in self.conns: + destination = 'default' + conn = random.sample(self.conns[destination], 1)[0] + queue_dest = self.channels[destination]['destination'] if not conn.is_connected(): # conn.start() conn.connect(self.username, self.password, wait=True) - return conn + return conn, queue_dest except Exception as error: self.logger.error("Failed to connect to message broker(will re-resolve brokers): %s" % str(error)) self.disconnect(self.conns) - self.conns = self.connect_to_messaging_brokers(sender=True) - conn = random.sample(self.conns, 1)[0] - if not conn.is_connected(): - conn.connect(self.username, self.password, wait=True) - return conn + try: + self.conns = self.connect_to_messaging_brokers(sender=True) + if destination not in self.conns: + destination = 'default' + conn = random.sample(self.conns[destination], 1)[0] + queue_dest = self.channels[destination]['destination'] + if not conn.is_connected(): + conn.connect(self.username, self.password, wait=True) + return conn, queue_dest + except Exception as error: + self.logger.error("Failed to connect to message broker(will re-resolve brokers): %s" % str(error)) def send_message(self, msg): - conn = self.get_connection() + destination = msg['destination'] if 'destination' in msg else 'default' + conn, queue_dest = self.get_connection(destination) self.logger.info("Sending message to message broker: %s" % msg['msg_id']) self.logger.debug("Sending message to message broker: %s" % json.dumps(msg['msg_content'])) conn.send(body=json.dumps(msg['msg_content']), - destination=self.destination, + destination=queue_dest, id='atlas-idds-messaging', ack='auto', headers={'persistent': 'true', @@ -201,11 +209,12 @@ def get_listener(self, broker): def subscribe(self): self.receiver_conns = self.connect_to_messaging_brokers() - for conn in self.receiver_conns: - self.logger.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) - conn.set_listener('message-receiver', self.get_listener(conn.transport._Transport__host_and_ports[0])) - conn.connect(self.username, self.password, wait=True) - conn.subscribe(destination=self.destination, id='atlas-idds-messaging', ack='auto') + for name in self.receiver_conns: + for conn in self.receiver_conns[name]: + self.logger.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) + conn.set_listener('message-receiver', self.get_listener(conn.transport._Transport__host_and_ports[0])) + conn.connect(self.channels[name]['username'], 
self.channels[name]['password'], wait=True) + conn.subscribe(destination=self.channels[name]['destination'], id='atlas-idds-messaging', ack='auto') def execute_subscribe(self): try: @@ -216,12 +225,13 @@ def execute_subscribe(self): while not self.graceful_stop.is_set(): has_failed_connection = False try: - for conn in self.receiver_conns: - if not conn.is_connected(): - conn.set_listener('message-receiver', self.get_listener(conn.transport._Transport__host_and_ports[0])) - # conn.start() - conn.connect(self.username, self.password, wait=True) - conn.subscribe(destination=self.destination, id='atlas-idds-messaging', ack='auto') + for name in self.receiver_conns: + for conn in self.receiver_conns[name]: + if not conn.is_connected(): + conn.set_listener('message-receiver', self.get_listener(conn.transport._Transport__host_and_ports[0])) + # conn.start() + conn.connect(self.channels[name]['username'], self.channels[name]['password'], wait=True) + conn.subscribe(destination=self.channels[name]['destination'], id='atlas-idds-messaging', ack='auto') time.sleep(0.1) except Exception as error: self.logger.error("Messaging receiver throws an exception: %s, %s" % (error, traceback.format_exc())) @@ -278,12 +288,13 @@ def execute_send_subscribe(self): # subscribe has_failed_connection = False try: - for conn in self.receiver_conns: - if not conn.is_connected(): - conn.set_listener('message-receiver', self.get_listener(conn.transport._Transport__host_and_ports[0])) - # conn.start() - conn.connect(self.username, self.password, wait=True) - conn.subscribe(destination=self.destination, id='atlas-idds-messaging', ack='auto') + for name in self.receiver_conns: + for conn in self.receiver_conns[name]: + if not conn.is_connected(): + conn.set_listener('message-receiver', self.get_listener(conn.transport._Transport__host_and_ports[0])) + # conn.start() + conn.connect(self.channels[name]['username'], self.channels[name]['password'], wait=True) + conn.subscribe(destination=self.channels[name]['destination'], id='atlas-idds-messaging', ack='auto') except Exception as error: self.logger.error("Messaging receiver throws an exception: %s, %s" % (error, traceback.format_exc())) has_failed_connection = True diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index bc72f185..94fe967d 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2022 import time import traceback From d675805c77971a8d08228267dea1a74054977924 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 5 Dec 2022 17:51:13 +0100 Subject: [PATCH 14/91] fix clean works --- workflow/lib/idds/workflowv2/workflow.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py index 3472dbf5..5d494e19 100644 --- a/workflow/lib/idds/workflowv2/workflow.py +++ b/workflow/lib/idds/workflowv2/workflow.py @@ -1828,8 +1828,8 @@ def clean_works(self): self.terminated_works = [] self.current_running_works = [] - self.works = {} - self.work_sequence = {} # order list + # self.works = {} + # self.work_sequence = {} # order list self.first_initial = False self.new_to_run_works = [] @@ -2260,6 +2260,7 @@ def resume_works(self): def clean_works(self): # if self.runs: # self.runs[str(self.num_run)].clean_works() + self.template.clean_works() self.parent_num_run 
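
The messaging plugin above now reads its connection settings per channel from self.channels; the patch does not show how that dictionary is populated from the agent configuration, so the layout below is only an assumption about the in-memory shape the code expects. Broker entries must be in host:port form (the code always splits on ":"), and 'default' is used as the fallback channel when a message's destination has no channel of its own.

    channels = {
        'default': {
            'brokers': ['broker1.example.org:61013', 'broker2.example.org:61013'],
            'destination': '/queue/atlas.idds',
            'username': 'idds_user',        # placeholder credentials
            'password': 'idds_pass',
            'broker_timeout': 360,
        },
        'content_ext': {
            'brokers': ['broker1.example.org:61013'],
            'destination': '/queue/atlas.idds.contents_ext',
            'username': 'idds_user',
            'password': 'idds_pass',
            'broker_timeout': 360,
        },
    }
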
= None self._num_run = 0 self.runs = {} From 6bf6d0695c4f49b396d12a34d5eb18625520bd92 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 5 Dec 2022 17:51:52 +0100 Subject: [PATCH 15/91] fix logs --- main/lib/idds/agents/clerk/clerk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index 0616372f..8b0e045c 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -555,7 +555,7 @@ def handle_update_request_real(self, req, event): event_content = {'request_id': req['request_id'], 'cmd_type': CommandType.ExpireRequest, 'cmd_content': {}} - self.logger.debug(log_pre + "ExpireRequestEvent(request_id: %s" % req['request_id']) + self.logger.debug(log_pre + "ExpireRequestEvent(request_id: %s)" % req['request_id']) event = ExpireRequestEvent(publisher_id=self.id, request_id=req['request_id'], content=event_content) self.event_bus.send(event) From bf79dd02d4c1a8abad395d3c11282512088838b8 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 6 Dec 2022 11:06:10 +0100 Subject: [PATCH 16/91] add combine extension outputs functions --- main/lib/idds/agents/carrier/utils.py | 14 ++++---------- main/lib/idds/core/catalog.py | 4 ++++ main/lib/idds/orm/contents.py | 23 +++++++++++++++++++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index bf9d35b1..5f85d729 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -319,20 +319,14 @@ def generate_content_ext_messages(request_id, transform_id, workload_id, work, f i_msg_type = MessageType.ContentExt i_msg_type_str = MessageTypeStr.ContentExt - content_map = {} + output_contents = [] for map_id in input_output_maps: outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] for content in outputs: - content_map[content['content_id']] = content + # content_map[content['content_id']] = content + output_contents += outputs - files_message = [] - for file in files: - file['status'] = file['status'].name - content_id = file['content_id'] - content = content_map[content_id] - file['scope'] = content['scope'] - file['name'] = content['name'] - files_message.append(file) + files_message = core_catalog.combine_contents_ext(output_contents, files, with_status_name=True) msg_content = {'msg_type': i_msg_type_str.value, 'request_id': request_id, 'workload_id': workload_id, diff --git a/main/lib/idds/core/catalog.py b/main/lib/idds/core/catalog.py index 2d8924a3..c7421fc9 100644 --- a/main/lib/idds/core/catalog.py +++ b/main/lib/idds/core/catalog.py @@ -682,3 +682,7 @@ def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, c """ return orm_contents.get_contents_ext_ids(request_id=request_id, transform_id=transform_id, workload_id=workload_id, coll_id=coll_id, status=status, session=session) + + +def combine_contents_ext(contents, contents_ext, with_status_name=False): + return orm_contents.combine_contents_ext(contents, contents_ext, with_status_name=with_status_name) diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index 435670d5..e4707395 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -771,3 +771,26 @@ def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, c (transform_id, error)) except Exception as error: raise error + + +def combine_contents_ext(contents, 
contents_ext, with_status_name=False): + contents_ext_map = {} + for content in contents_ext: + contents_ext_map[content['content_id']] = content + + rets = [] + for content in contents: + content_id = content['content_id'] + if content_id in contents_ext_map: + ret = contents_ext_map[content_id] + else: + ret = {'content_id': content_id} + if with_status_name: + ret['status'] = content['status'].name + else: + ret['status'] = content['status'] + ret['scope'] = content['scope'] + ret['name'] = content['name'] + + rets.append(ret) + return rets From 9466f4fa86fb52305f44e20f50205c9af33f1e80 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 6 Dec 2022 11:09:54 +0100 Subject: [PATCH 17/91] add rest and client for get contents_ext --- client/lib/idds/client/catalogclient.py | 23 ++++++++++ main/lib/idds/rest/v1/catalog.py | 57 ++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/client/lib/idds/client/catalogclient.py b/client/lib/idds/client/catalogclient.py index 0a3f2350..305725a8 100644 --- a/client/lib/idds/client/catalogclient.py +++ b/client/lib/idds/client/catalogclient.py @@ -172,3 +172,26 @@ def register_contents(self, coll_scope, coll_name, request_id, workload_id, cont r = self.get_request_response(url, type='POST', data=contents) return r + + def get_contents_output_ext(self, request_id=None, workload_id=None, transform_id=None): + """ + Get output extension contents from the Head service. + + :param request_id: the request id. + :param workload_id: the workload id. + :param transform_id: the transform id. + + :raise exceptions if it's not got successfully. + """ + path = os.path.join(self.CATALOG_BASEURL, 'contents_output_ext') + if request_id is None: + request_id = 'null' + if workload_id is None: + workload_id = 'null' + if transform_id is None: + transform_id = 'null' + + url = self.build_url(self.host, path=os.path.join(path, str(request_id), str(workload_id), str(transform_id))) + + contents = self.get_request_response(url, type='GET') + return contents diff --git a/main/lib/idds/rest/v1/catalog.py b/main/lib/idds/rest/v1/catalog.py index d199920a..3474a21a 100644 --- a/main/lib/idds/rest/v1/catalog.py +++ b/main/lib/idds/rest/v1/catalog.py @@ -14,8 +14,8 @@ from flask import Blueprint from idds.common import exceptions -from idds.common.constants import HTTP_STATUS_CODE -from idds.core.catalog import get_collections, get_contents +from idds.common.constants import HTTP_STATUS_CODE, CollectionRelationType +from idds.core.catalog import get_collections, get_contents, get_contents_ext, combine_contents_ext from idds.rest.v1.controller import IDDSController @@ -113,6 +113,55 @@ def get(self, coll_scope, coll_name, request_id, workload_id, relation_type, sta return self.generate_http_response(HTTP_STATUS_CODE.OK, data=rets) +class ContentsOutputExt(IDDSController): + """ Catalog """ + + def get(self, request_id, workload_id, transform_id): + """ Get contents by request_id, workload_id and transform_id. + HTTP Success: + 200 OK + HTTP Error: + 404 Not Found + 500 InternalError + :returns: contents. 
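
A usage sketch for the new client call; the host URL and identifiers are placeholders, and the construction of the catalog client follows the pattern of the other idds REST clients.

    from idds.client.catalogclient import CatalogClient

    client = CatalogClient(host='https://idds-server.example.org:8443/idds')
    # Output contents merged with their contents_ext job information
    contents = client.get_contents_output_ext(request_id=1234, transform_id=567)
    for content in contents:
        print(content['name'], content['status'], content.get('jobStatus'))
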
+ """ + + try: + if request_id in ['null', 'None']: + request_id = None + else: + request_id = int(request_id) + if workload_id in ['null', 'None']: + workload_id = None + else: + workload_id = int(workload_id) + if transform_id in ['null', 'None']: + transform_id = None + else: + transform_id = int(transform_id) + + if transform_id is None: + self.generate_http_response(HTTP_STATUS_CODE.BadRequest, + exc_cls=exceptions.BadRequest.__name__, + exc_msg="Transform_id must not be None") + + contents = get_contents(request_id=request_id, workload_id=workload_id, transform_id=transform_id, + relation_type=CollectionRelationType.Output) + contents_ext = get_contents_ext(request_id=request_id, workload_id=workload_id, transform_id=transform_id) + + rets = combine_contents_ext(contents, contents_ext, with_status_name=True) + except exceptions.NoObject as error: + return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) + except exceptions.IDDSException as error: + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error) + except Exception as error: + print(error) + print(traceback.format_exc()) + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) + + return self.generate_http_response(HTTP_STATUS_CODE.OK, data=rets) + + """---------------------- Web service url maps ----------------------""" @@ -131,4 +180,8 @@ def get_blueprint(): bp.add_url_rule('/catalog/contents//////', view_func=contents_view, methods=['get', ]) # get contents + contents_ext_view = ContentsOutputExt.as_view('contents_output_ext') + bp.add_url_rule('/catalog/contents_output_ext///', + view_func=contents_ext_view, methods=['get', ]) + return bp From f4925793a25f3dc5a75c96a5519f6e3586df0958 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 6 Dec 2022 11:40:55 +0100 Subject: [PATCH 18/91] add ext statistics and use it to finish processing --- .../lib/idds/doma/workflowv2/domapandawork.py | 4 +- main/lib/idds/agents/carrier/utils.py | 45 +++++++++++++++++-- main/lib/idds/orm/base/models.py | 4 ++ workflow/lib/idds/workflow/work.py | 5 +++ workflow/lib/idds/workflowv2/work.py | 5 +++ 5 files changed, 58 insertions(+), 5 deletions(-) diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index ff9d6b28..b54aac99 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -1087,8 +1087,8 @@ def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext= new_contents_ext, update_contents_ext, left_jobs = self.get_contents_ext(input_output_maps, contents_ext, contents_ext_full, job_info_items) - if left_jobs: - processing_status = ProcessingStatus.Running + # if left_jobs: + # processing_status = ProcessingStatus.Running return processing_status, updated_contents, update_contents_full, new_contents_ext, update_contents_ext else: diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 5f85d729..84bd6b95 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -1166,7 +1166,9 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou for content in inputs + outputs + logs: if content['coll_id'] not in coll_status: coll_status[content['coll_id']] = {'total_files': 0, 'processed_files': 0, 'processing_files': 0, 'bytes': 0, - 
'new_files': 0, 'failed_files': 0, 'missing_files': 0} + 'new_files': 0, 'failed_files': 0, 'missing_files': 0, + 'ext_files': 0, 'processed_ext_files': 0, 'failed_ext_files': 0, + 'missing_ext_files': 0} coll_status[content['coll_id']]['total_files'] += 1 if content['status'] in [ContentStatus.Available, ContentStatus.Mapped, @@ -1186,6 +1188,22 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou if content['status'] != content['substatus']: all_updates_flushed = False + all_ext_updated = True + if work.require_ext_contents(): + all_ext_updated = False + contents_ext = core_catalog.get_contents_ext(request_id=request_id, transform_id=transform_id) + for content in contents_ext: + coll_status[content['coll_id']]['ext_files'] += 1 + + if content['status'] in [ContentStatus.Available, ContentStatus.Mapped, + ContentStatus.Available.value, ContentStatus.Mapped.value, + ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: + coll_status[content['coll_id']]['processed_extfiles'] += 1 + elif content['status'] in [ContentStatus.Failed, ContentStatus.FinalFailed]: + coll_status[content['coll_id']]['failed_ext_files'] += 1 + elif content['status'] in [ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + coll_status[content['coll_id']]['missing_ext_files'] += 1 + input_collections = work.get_input_collections(poll_externel=True) output_collections = work.get_output_collections() log_collections = work.get_log_collections() @@ -1203,6 +1221,10 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou coll.new_files = coll_status[coll.coll_id]['new_files'] coll.failed_files = coll_status[coll.coll_id]['failed_files'] coll.missing_files = coll_status[coll.coll_id]['missing_files'] + coll.ext_files = coll_status[coll.coll_id]['ext_files'] + coll.processed_ext_files = coll_status[coll.coll_id]['processed_ext_files'] + coll.failed_ext_files = coll_status[coll.coll_id]['failed_ext_files'] + coll.missing_ext_files = coll_status[coll.coll_id]['missing_ext_files'] else: coll.total_files = 0 coll.processed_files = 0 @@ -1210,6 +1232,10 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou coll.new_files = 0 coll.failed_files = 0 coll.missing_files = 0 + coll.ext_files = 0 + coll.processed_ext_files = 0 + coll.failed_ext_files = 0 + coll.missing_ext_files = 0 u_coll = {'coll_id': coll.coll_id, 'total_files': coll.total_files, @@ -1218,9 +1244,22 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou 'new_files': coll.new_files, 'failed_files': coll.failed_files, 'missing_files': coll.missing_files, - 'bytes': coll.bytes} + 'bytes': coll.bytes, + 'ext_files': coll.ext_files, + 'processed_ext_files': coll.processed_ext_files, + 'failed_ext_files': coll.failed_ext_files, + 'missing_ext_files': coll.missing_ext_files} if terminate: - if force_close_collection or close_collection and all_updates_flushed or coll.status == CollectionStatus.Closed: + if work.require_ext_contents(): + if coll.processed_files == coll.ext_processed_files and coll.failed_files == coll.failed_ext_files: + all_ext_updated = True + if (force_close_collection or (close_collection and all_updates_flushed and all_ext_updated) + or coll.status == CollectionStatus.Closed): # noqa W503 + u_coll['status'] = CollectionStatus.Closed + u_coll['substatus'] = CollectionStatus.Closed + coll.status = CollectionStatus.Closed + coll.substatus = CollectionStatus.Closed + elif force_close_collection or close_collection 
and all_updates_flushed or coll.status == CollectionStatus.Closed: u_coll['status'] = CollectionStatus.Closed u_coll['substatus'] = CollectionStatus.Closed coll.status = CollectionStatus.Closed diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index e5849317..49df2c26 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -472,6 +472,10 @@ class Collection(BASE, ModelBase): processing_files = Column(Integer()) failed_files = Column(Integer()) missing_files = Column(Integer()) + ext_files = Column(Integer()) + processed_ext_files = Column(Integer()) + failed_ext_files = Column(Integer()) + missing_ext_files = Column(Integer()) processing_id = Column(Integer()) retries = Column(Integer(), default=0) created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) diff --git a/workflow/lib/idds/workflow/work.py b/workflow/lib/idds/workflow/work.py index cc69fe05..8432e469 100644 --- a/workflow/lib/idds/workflow/work.py +++ b/workflow/lib/idds/workflow/work.py @@ -72,6 +72,11 @@ def __init__(self, scope=None, name=None, coll_type=CollectionType.Dataset, coll self.failed_files = 0 self.missing_files = 0 + self.ext_files = 0 + self.processed_ext_files = 0 + self.failed_ext_files = 0 + self.missing_ext_files = 0 + @property def internal_id(self): return self.get_metadata_item('internal_id') diff --git a/workflow/lib/idds/workflowv2/work.py b/workflow/lib/idds/workflowv2/work.py index dc4f1cc4..c1a33e93 100644 --- a/workflow/lib/idds/workflowv2/work.py +++ b/workflow/lib/idds/workflowv2/work.py @@ -72,6 +72,11 @@ def __init__(self, scope=None, name=None, coll_type=CollectionType.Dataset, coll self.failed_files = 0 self.missing_files = 0 + self.ext_files = 0 + self.processed_ext_files = 0 + self.failed_ext_files = 0 + self.missing_ext_files = 0 + @property def internal_id(self): return self.get_metadata_item('internal_id') From 86c9a117a283a931eb2c86d130e8a46092784a60 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 6 Dec 2022 12:19:14 +0100 Subject: [PATCH 19/91] enable json outputs --- client/lib/idds/client/base.py | 11 +++++++++++ client/lib/idds/client/clientmanager.py | 11 ++++++++++- main/lib/idds/rest/v1/controller.py | 21 ++++++++++++++++----- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/client/lib/idds/client/base.py b/client/lib/idds/client/base.py index 1d8bd189..168a0eb9 100644 --- a/client/lib/idds/client/base.py +++ b/client/lib/idds/client/base.py @@ -73,6 +73,11 @@ def __init__(self, host=None, auth=None, timeout=None, client_proxy=None): self.original_user_cert = None self.original_user_token = None + self.json_outputs = False + + def enable_json_outputs(self): + self.json_outputs = True + def get_user_proxy(sellf): """ Get the user proxy. @@ -135,12 +140,18 @@ def build_url(self, url, path=None, params=None, doseq=False): full_url = url if path is not None: full_url = '/'.join([full_url, path]) + + if params is None: + params = {} + if self.json_outputs: + params['json_outputs'] = 'true' if params: full_url += "?" 
if isinstance(params, str): full_url += quote(params) else: full_url += urlencode(params, doseq=doseq) + return full_url def get_request_response(self, url, type='GET', data=None, headers=None, auth_setup_step=False): diff --git a/client/lib/idds/client/clientmanager.py b/client/lib/idds/client/clientmanager.py index 3847a188..1d700894 100644 --- a/client/lib/idds/client/clientmanager.py +++ b/client/lib/idds/client/clientmanager.py @@ -67,6 +67,8 @@ def __init__(self, host=None, timeout=600, setup_client=False): self.oidc_token = None self.vo = None + self.enable_json_outputs = False + self.configuration = ConfigParser.ConfigParser() self.client = None @@ -90,6 +92,9 @@ def setup_client(self, auth_setup=False): 'auth_setup': auth_setup}, timeout=self.timeout) + if self.enable_json_outputs: + self.client.enable_json_outputs() + def get_local_config_root(self): local_cfg_root = get_local_config_root(self.local_config_root) return local_cfg_root @@ -105,7 +110,8 @@ def get_config_value(self, configuration, section, name, current, default): 'auth_type': 'IDDS_AUTH_TYPE', 'oidc_token': 'IDDS_OIDC_TOKEN', 'vo': 'IDDS_VO', - 'auth_no_verify': 'IDDS_AUTH_NO_VERIFY'} + 'auth_no_verify': 'IDDS_AUTH_NO_VERIFY', + 'enable_json_outputs': 'IDDS_ENABLE_JSON_OUTPUTS'} if not section: section = self.get_section(name) @@ -175,6 +181,8 @@ def get_local_configuration(self): self.vo = self.get_config_value(config, None, 'vo', current=self.vo, default=None) + self.enable_json_outputs = self.get_config_value(config, None, 'enable_json_outputs', + current=self.enable_json_outputs, default=None) self.configuration = config def set_local_configuration(self, name, value): @@ -200,6 +208,7 @@ def save_local_configuration(self): self.set_local_configuration(name='x509_proxy', value=self.x509_proxy) self.set_local_configuration(name='oidc_token', value=self.oidc_token) self.set_local_configuration(name='vo', value=self.vo) + self.set_local_configuration(name='enable_json_outputs', value=self.enable_json_outputs) with open(local_cfg, 'w') as configfile: self.configuration.write(configfile) diff --git a/main/lib/idds/rest/v1/controller.py b/main/lib/idds/rest/v1/controller.py index 6a6931de..c1401080 100644 --- a/main/lib/idds/rest/v1/controller.py +++ b/main/lib/idds/rest/v1/controller.py @@ -35,7 +35,7 @@ def delete(self): """ Not supported. 
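# --- illustrative sketch (not part of the patches above): json_outputs flow
# A rough, standalone view of the behaviour the client and REST changes in
# this patch combine to give: when the client is configured with
# enable_json_outputs (config option or IDDS_ENABLE_JSON_OUTPUTS), every
# request URL carries json_outputs=true and the server answers with a
# wrapped JSON envelope.  The helper names below are illustrative only,
# not the real iDDS API.
from urllib.parse import urlencode


def build_url(host, path, params=None, json_outputs=False):
    params = dict(params or {})
    if json_outputs:
        params['json_outputs'] = 'true'   # what Client.build_url adds
    url = '/'.join([host.rstrip('/'), path.lstrip('/')])
    return url + ('?' + urlencode(params) if params else '')


def wrap_response(status_code, data=None, exc_cls=None, exc_msg=None):
    # shape of the envelope produced when json outputs are requested
    error = {'ExceptionClass': exc_cls, 'ExceptionMessage': exc_msg} if exc_cls else None
    return {'ret_code': status_code, 'data': data, 'error': error}


print(build_url('https://host:443/idds', 'request', {'request_id': 389}, json_outputs=True))
print(wrap_response(200, data={'request_id': 389}))
# ---------------------------------------------------------------------------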
""" return Response(status=HTTP_STATUS_CODE.BadRequest, content_type='application/json')() - def get_request(sel): + def get_request(self): return request def get_username(self): @@ -55,8 +55,19 @@ def generate_message(self, exc_cls=None, exc_msg=None): return json_dumps(message) def generate_http_response(self, status_code, data=None, exc_cls=None, exc_msg=None): - resp = Response(response=json_dumps(data, sort_keys=True, indent=4) if data is not None else data, status=status_code, content_type='application/json') - if exc_cls: - resp.headers['ExceptionClass'] = exc_cls - resp.headers['ExceptionMessage'] = self.generate_message(exc_cls, exc_msg) + enable_json_outputs = self.get_request().args.get('json_outputs', None) + if enable_json_outputs and enable_json_outputs.upper == 'TRUE': + error = None + if exc_cls: + error = {'ExceptionClass': exc_cls, + 'ExceptionMessage': self.generate_message(exc_cls, exc_msg)} + response = {'ret_code': status_code, + 'data': data, + 'error': error} + resp = Response(response=json_dumps(response, sort_keys=True, indent=4), status=HTTP_STATUS_CODE.OK, content_type='application/json') + else: + resp = Response(response=json_dumps(data, sort_keys=True, indent=4) if data is not None else data, status=status_code, content_type='application/json') + if exc_cls: + resp.headers['ExceptionClass'] = exc_cls + resp.headers['ExceptionMessage'] = self.generate_message(exc_cls, exc_msg) return resp From 8028e6ac4c4ad0c384e0baaacaf1e7f23561300a Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 8 Dec 2022 15:25:58 +0100 Subject: [PATCH 20/91] workflow to support build work --- workflow/lib/idds/workflow/workflow.py | 6 + workflow/lib/idds/workflowv2/work.py | 55 +++++++ workflow/lib/idds/workflowv2/workflow.py | 180 ++++++++++++++++++++++- 3 files changed, 238 insertions(+), 3 deletions(-) diff --git a/workflow/lib/idds/workflow/workflow.py b/workflow/lib/idds/workflow/workflow.py index 836cbd17..dcbc8fb0 100644 --- a/workflow/lib/idds/workflow/workflow.py +++ b/workflow/lib/idds/workflow/workflow.py @@ -1216,6 +1216,9 @@ def add_work(self, work, initial=False, primary=False): self.independent_works.append(work.get_internal_id()) + def has_to_build_work(self): + return False + def add_condition(self, cond): self.first_initial = False cond_works = cond.all_works() @@ -2096,6 +2099,9 @@ def get_workload_id(self): def add_work(self, work, initial=False, primary=False): self.template.add_work(work, initial, primary) + def has_to_build_work(self): + return False + def add_condition(self, cond): self.template.add_condition(cond) diff --git a/workflow/lib/idds/workflowv2/work.py b/workflow/lib/idds/workflowv2/work.py index c1a33e93..2f8437b6 100644 --- a/workflow/lib/idds/workflowv2/work.py +++ b/workflow/lib/idds/workflowv2/work.py @@ -588,6 +588,8 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.sliced_global_parameters = None + self.is_build_work = False + """ self._running_data_names = [] for name in ['internal_id', 'template_work_id', 'initialized', 'sequence_id', 'parameters', 'work_id', 'transforming', 'workdir', @@ -782,6 +784,33 @@ def substatus(self): def substatus(self, value): self.add_metadata_item('substatus', value.value if value else value) + @property + def is_build_work(self): + st = self.get_metadata_item('is_build_work', False) + return st + + @is_build_work.setter + def is_build_work(self, value): + self.add_metadata_item('is_build_work', value) + + def set_build_work(self): + self.is_build_work = True + + @property + 
def signature(self): + st = self.get_metadata_item('signature', None) + return st + + @signature.setter + def signature(self, value): + self.add_metadata_item('signature', value) + + def sign(self): + self.signature = str(uuid.uuid4()) + + def get_signature(self): + return self.signature + @property def polling_retries(self): return self.get_metadata_item('polling_retries', 0) @@ -1253,6 +1282,7 @@ def clean_work(self): self.terminated_msg = "" self.output_data = {} self.parameters_for_next_task = None + self.last_updated_at = datetime.datetime.utcnow() def set_agent_attributes(self, attrs, req_attributes=None): if attrs and self.class_name in attrs: @@ -1364,6 +1394,31 @@ def get_backup_to_release_inputs(self): self.backup_to_release_inputs['0'] = [] return to_release_inputs + def is_to_expire(self, expired_at=None, pending_time=None, request_id=None): + if expired_at: + if type(expired_at) in [str]: + expired_at = str_to_date(expired_at) + if expired_at < datetime.datetime.utcnow(): + self.logger.info("Request(%s) expired_at(%s) is smaller than utc now(%s), expiring" % (request_id, + expired_at, + datetime.datetime.utcnow())) + return True + + if pending_time: + act_pending_time = float(pending_time) + act_pending_seconds = int(86400 * act_pending_time) + if self.last_updated_at + datetime.timedelta(seconds=act_pending_seconds) < datetime.datetime.utcnow(): + log_str = "Request(%s) last updated at(%s) + pending seconds(%s)" % (request_id, + self.last_updated_at, + act_pending_seconds) + log_str += " is smaller than utc now(%s), expiring" % (datetime.datetime.utcnow()) + self.logger.info(log_str) + return True + return False + + def is_starting(self): + return self.transforming + def is_started(self): return self.started or self.submitted diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py index 5d494e19..5b64bce8 100644 --- a/workflow/lib/idds/workflowv2/workflow.py +++ b/workflow/lib/idds/workflowv2/workflow.py @@ -622,6 +622,7 @@ def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None if self.logger is None: self.setup_logger() + self.build_work = None self._works = {} self.works = {} self.work_sequence = {} # order list @@ -1257,6 +1258,15 @@ def add_work(self, work, initial=False, primary=False): self.independent_works.append(work.get_internal_id()) + def add_build_work(self, work, initial=False, primary=False): + self.build_work = work + self.build_work.set_build_work() + + def has_to_build_work(self): + if self.build_work is not None: + return True + return False + def add_condition(self, cond): self.first_initial = False cond_works = cond.all_works() @@ -1825,6 +1835,8 @@ def clean_works(self): self.num_total_works = 0 self.last_updated_at = datetime.datetime.utcnow() + if self.build_work: + self.build_work.clean_works() self.terminated_works = [] self.current_running_works = [] @@ -1979,6 +1991,7 @@ def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None self.template = WorkflowBase(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) + self.build_work = None # parent_num_run is string. 
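# --- illustrative sketch (not part of the patches above): work expiry check
# A small standalone mirror of the expiry rule in is_to_expire() above:
# pending_time is given in days and converted to seconds (86400 * days); a
# work expires either when expired_at is already in the past or when
# last_updated_at plus the pending seconds lies in the past.  Stand-in
# function, not the real Work class.
import datetime


def is_to_expire(last_updated_at, expired_at=None, pending_time=None, now=None):
    now = now or datetime.datetime.utcnow()
    if expired_at and expired_at < now:
        return True
    if pending_time:
        pending_seconds = int(86400 * float(pending_time))
        if last_updated_at + datetime.timedelta(seconds=pending_seconds) < now:
            return True
    return False


last = datetime.datetime.utcnow() - datetime.timedelta(days=3)
print(is_to_expire(last, pending_time=2))   # True: idle longer than 2 days
print(is_to_expire(last, pending_time=7))   # False
# ---------------------------------------------------------------------------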
self.parent_num_run = None self._num_run = 0 @@ -2021,7 +2034,8 @@ def get_template_id(self): @property def metadata(self): - run_metadata = {'parent_num_run': self.parent_num_run, + run_metadata = {'build': self.get_build_metadata(), + 'parent_num_run': self.parent_num_run, 'num_run': self._num_run, 'runs': {}} for run_id in self.runs: @@ -2034,6 +2048,8 @@ def metadata(self): def metadata(self, value): self.template.load_metadata() run_metadata = value + build_metadata = run_metadata['build'] + self.set_build_metadata(build_metadata) self.parent_num_run = run_metadata['parent_num_run'] self._num_run = run_metadata['num_run'] runs = run_metadata['runs'] @@ -2185,6 +2201,45 @@ def get_workload_id(self): def add_work(self, work, initial=False, primary=False): self.template.add_work(work, initial, primary) + def add_build_work(self, work, initial=False, primary=False): + self.template.add_build_work(work, initial, primary) + + def get_build_metadata(self): + if self.build_work is not None: + return self.build_work.metadata + + build_work = self.template.get_build_work() + if build_work is not None: + return build_work.metadata + return None + + def set_build_metadata(self, metadata): + if self.build_work is not None: + self.build_work.metadata = metadata + + build_work = self.template.get_build_work() + if build_work is not None: + self.build_work = self.template.copy() + self.build_work.metadata = metadata + + def get_buil_work(self): + if self.build_work is not None: + return self.build_work + build_work = self.template.get_build_work() + if build_work is not None: + self.build_work = self.template.copy() + self.build_work.sign() + return self.build_work + + def has_to_build_work(self): + build_work = self.get_buil_work() + if build_work is None: + return False + + if not (build_work.is_started() or build_work.is_starting()): + return True + return False + def add_condition(self, cond): self.template.add_condition(cond) @@ -2221,6 +2276,16 @@ def sync_global_parameters(self, global_parameters, sliced_global_parameters=Non def get_new_works(self, synchronize=True): self.logger.info("%s get_new_works" % self.get_internal_id()) + + build_work = self.get_build_work() + if build_work: + if not (build_work.is_started() or build_work.is_starting()): + return build_work + elif not build_work.is_terminated(): + return [] + elif build_work.is_terminated() and not build_work.is_finished(): + return [] + self.log_debug("synchronizing works") if synchronize: self.sync_works(to_cancel=self.to_cancel) @@ -2232,6 +2297,16 @@ def get_new_works(self, synchronize=True): return works def get_current_works(self): + build_work = self.get_build_work() + if build_work: + if (build_work.is_started() or build_work.is_starting()): + if (not build_work.is_terminated()): + return [build_work] + elif not build_work.is_finished(): + return [] + else: + return [] + self.sync_works(to_cancel=self.to_cancel) if self.runs: return self.runs[str(self.num_run)].get_current_works() @@ -2239,11 +2314,20 @@ def get_current_works(self): def get_all_works(self, synchronize=True): self.logger.info("%s get_all_works" % self.get_internal_id()) + works = [] + + build_work = self.get_build_work() + if build_work: + if build_work.is_finished(): + works = [build_work] + else: + return [build_work] + if synchronize: self.sync_works(to_cancel=self.to_cancel) - works = [] if self.runs: - works = self.runs[str(self.num_run)].get_all_works(synchronize=False) + run_works = self.runs[str(self.num_run)].get_all_works(synchronize=False) + works = 
works + run_works self.logger.info("%s get_all_works done" % self.get_internal_id()) return works @@ -2264,13 +2348,33 @@ def clean_works(self): self.parent_num_run = None self._num_run = 0 self.runs = {} + self.build_work = None def is_to_expire(self, expired_at=None, pending_time=None, request_id=None): + build_work = self.get_build_work() + if build_work: + if not build_work.is_terminated(): + return build_work.is_to_expire(expired_at=expired_at, pending_time=pending_time, request_id=request_id) + elif build_work.is_terminated() and not build_work.is_finished(): + return False + else: + pass + if self.runs: return self.runs[str(self.num_run)].is_to_expire(expired_at=expired_at, pending_time=pending_time, request_id=request_id) return False def is_terminated(self, synchronize=True): + build_work = self.get_build_work() + if build_work: + if build_work.is_terminated(): + if not build_work.is_finished(): + return True + else: + pass + else: + return False + if self.runs: if self.runs[str(self.num_run)].is_terminated(synchronize=synchronize): if not self.runs[str(self.num_run)].has_loop_condition() or not self.runs[str(self.num_run)].get_loop_condition_status(): @@ -2279,31 +2383,94 @@ def is_terminated(self, synchronize=True): def is_finished(self, synchronize=True): if self.is_terminated(synchronize=synchronize): + build_work = self.get_build_work() + if build_work: + if build_work.is_terminated(): + if not build_work.is_finished(): + return False + else: + pass + else: + return False return self.runs[str(self.num_run)].is_finished(synchronize=False) return False def is_subfinished(self, synchronize=True): if self.is_terminated(synchronize=synchronize): + build_work = self.get_build_work() + if build_work: + if build_work.is_terminated(): + if not build_work.is_finished(): + return False + else: + pass + else: + return False return self.runs[str(self.num_run)].is_subfinished(synchronize=False) return False def is_failed(self, synchronize=True): if self.is_terminated(synchronize=synchronize): + build_work = self.get_build_work() + if build_work: + if build_work.is_terminated(): + if not build_work.is_finished(): + return True + else: + pass + else: + return False return self.runs[str(self.num_run)].is_failed(synchronize=False) return False def is_expired(self, synchronize=True): if self.is_terminated(synchronize=synchronize): + build_work = self.get_build_work() + if build_work: + if build_work.is_terminated(): + if not build_work.is_finished(): + return False + else: + pass + else: + if build_work.is_expired(): + return True + else: + return False return self.runs[str(self.num_run)].is_expired(synchronize=False) return False def is_cancelled(self, synchronize=True): if self.is_terminated(synchronize=synchronize): + build_work = self.get_build_work() + if build_work: + if build_work.is_terminated(): + if not build_work.is_finished(): + return False + else: + pass + else: + if build_work.is_cancelled(): + return True + else: + return False return self.runs[str(self.num_run)].is_cancelled(synchronize=False) return False def is_suspended(self, synchronize=True): if self.is_terminated(synchronize=synchronize): + build_work = self.get_build_work() + if build_work: + if build_work.is_terminated(): + if not build_work.is_finished(): + return False + else: + pass + else: + if build_work.is_suspended(): + return True + else: + return False return self.runs[str(self.num_run)].is_suspended(synchronize=False) return False @@ -2343,6 +2510,13 @@ def sync_works(self, to_cancel=False): if to_cancel: 
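# --- illustrative sketch (not part of the patches above): build-work gating
# A condensed picture, under simplified assumptions, of how the build work
# gates the rest of the workflow in the methods above: until it has started
# it is the only "new" work; while it runs nothing else is scheduled; if it
# terminates without finishing, nothing further is scheduled and the
# workflow counts as terminated; only after it finishes do the normal runs
# proceed.  Plain dicts stand in for the real work objects.
def next_works(build_work, run_works):
    if build_work is None:
        return list(run_works)
    if not (build_work['started'] or build_work['starting']):
        return [build_work]          # submit the build step first
    if not build_work['terminated']:
        return []                    # build step still running
    if not build_work['finished']:
        return []                    # build step failed: nothing to run
    return list(run_works)


build = {'started': True, 'starting': False, 'terminated': True, 'finished': True}
print(next_works(build, ['work_1', 'work_2']))   # ['work_1', 'work_2']
# ---------------------------------------------------------------------------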
self.to_cancel = to_cancel + build_work = self.get_build_work() + if build_work: + if not build_work.is_terminated(): + return + elif build_work.is_terminated() and not build_work.is_finished(): + return + self.refresh_works() # position is end. if self.num_run < 1: From 41a855dcb127cd23f7c0d7293c5578169e036ef0 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 8 Dec 2022 15:32:03 +0100 Subject: [PATCH 21/91] core functions to support build workflow --- main/lib/idds/core/requests.py | 21 ++++++- main/lib/idds/orm/base/models.py | 58 +++++++++++------ main/lib/idds/orm/requests.py | 104 +++++++++++++++++++++---------- 3 files changed, 129 insertions(+), 54 deletions(-) diff --git a/main/lib/idds/core/requests.py b/main/lib/idds/core/requests.py index 01087741..40ebfe46 100644 --- a/main/lib/idds/core/requests.py +++ b/main/lib/idds/core/requests.py @@ -110,6 +110,23 @@ def add_request(scope=None, name=None, requester=None, request_type=None, return orm_requests.add_request(**kwargs) +@read_session +def get_request(request_id, to_json=False, session=None): + """ + Get a request or raise a NoObject exception. + + :param request_id: The id of the request. + :param to_json: return json format. + + :param session: The database session in use. + + :raises NoObject: If no request is founded. + + :returns: Request. + """ + return orm_requests.get_request(request_id=request_id, to_json=to_json, session=session) + + @read_session def get_request_ids_by_workload_id(workload_id, session=None): """ @@ -183,14 +200,14 @@ def cancel_requests(request_id=None, workload_id=None, session=None): @transactional_session -def update_request(request_id, parameters, session=None): +def update_request(request_id, parameters, update_request_metadata=False, session=None): """ update an request. :param request_id: the request id. :param parameters: A dictionary of parameters. 
""" - return orm_requests.update_request(request_id, parameters, session=session) + return orm_requests.update_request(request_id, parameters, update_request_metadata=update_request_metadata, session=session) def generate_collection(transform, collection, relation_type=CollectionRelationType.Input): diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index 49df2c26..77492596 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -163,14 +163,23 @@ class Request(BASE, ModelBase): @property def request_metadata(self): - if self._request_metadata and 'workflow' in self._request_metadata: - workflow = self._request_metadata['workflow'] - workflow_data = None - if self._processing_metadata and 'workflow_data' in self._processing_metadata: - workflow_data = self._processing_metadata['workflow_data'] - if workflow is not None and workflow_data is not None: - workflow.metadata = workflow_data - self._request_metadata['workflow'] = workflow + if self._request_metadata: + if 'workflow' in self._request_metadata: + workflow = self._request_metadata['workflow'] + workflow_data = None + if self._processing_metadata and 'workflow_data' in self._processing_metadata: + workflow_data = self._processing_metadata['workflow_data'] + if workflow is not None and workflow_data is not None: + workflow.metadata = workflow_data + self._request_metadata['workflow'] = workflow + if 'build_workflow' in self._request_metadata: + build_workflow = self._request_metadata['build_workflow'] + build_workflow_data = None + if self._processing_metadata and 'build_workflow_data' in self._processing_metadata: + build_workflow_data = self._processing_metadata['build_workflow_data'] + if build_workflow is not None and build_workflow_data is not None: + build_workflow.metadata = build_workflow_data + self._request_metadata['build_workflow'] = build_workflow return self._request_metadata @request_metadata.setter @@ -179,9 +188,13 @@ def request_metadata(self, request_metadata): self._request_metadata = request_metadata if self._processing_metadata is None: self._processing_metadata = {} - if request_metadata and 'workflow' in request_metadata: - workflow = request_metadata['workflow'] - self._processing_metadata['workflow_data'] = workflow.metadata + if request_metadata: + if 'workflow' in request_metadata: + workflow = request_metadata['workflow'] + self._processing_metadata['workflow_data'] = workflow.metadata + if 'build_workflow' in request_metadata: + build_workflow = request_metadata['build_workflow'] + self._processing_metadata['build_workflow_data'] = build_workflow.metadata @property def processing_metadata(self): @@ -193,7 +206,7 @@ def processing_metadata(self, processing_metadata): self._processing_metadata = {} if processing_metadata: for k in processing_metadata: - if k != 'workflow_data': + if k != 'workflow_data' and k != 'build_workflow_data': self._processing_metadata[k] = processing_metadata[k] def _items_extend(self): @@ -201,13 +214,22 @@ def _items_extend(self): ('processing_metadata', self.processing_metadata)] def update(self, values, flush=True, session=None): - if values and 'request_metadata' in values and 'workflow' in values['request_metadata']: - workflow = values['request_metadata']['workflow'] + if values and 'request_metadata' in values: + if 'workflow' in values['request_metadata']: + workflow = values['request_metadata']['workflow'] + + if workflow is not None: + if 'processing_metadata' not in values: + values['processing_metadata'] = 
{} + values['processing_metadata']['workflow_data'] = workflow.metadata + if 'build_workflow' in values['request_metadata']: + build_workflow = values['request_metadata']['build_workflow'] + + if build_workflow is not None: + if 'processing_metadata' not in values: + values['processing_metadata'] = {} + values['processing_metadata']['build_workflow_data'] = build_workflow.metadata - if workflow is not None: - if 'processing_metadata' not in values: - values['processing_metadata'] = {} - values['processing_metadata']['workflow_data'] = workflow.metadata if values and 'request_metadata' in values: del values['request_metadata'] if values and 'processing_metadata' in values: diff --git a/main/lib/idds/orm/requests.py b/main/lib/idds/orm/requests.py index c87ef135..1d2fee6d 100644 --- a/main/lib/idds/orm/requests.py +++ b/main/lib/idds/orm/requests.py @@ -418,14 +418,23 @@ def get_requests(request_id=None, workload_id=None, with_detail=False, with_meta # t2 = dict(t) t2 = dict(zip(t.keys(), t)) - if 'request_metadata' in t2 and t2['request_metadata'] and 'workflow' in t2['request_metadata']: - workflow = t2['request_metadata']['workflow'] - workflow_data = None - if 'processing_metadata' in t2 and t2['processing_metadata'] and 'workflow_data' in t2['processing_metadata']: - workflow_data = t2['processing_metadata']['workflow_data'] - if workflow is not None and workflow_data is not None: - workflow.metadata = workflow_data - t2['request_metadata']['workflow'] = workflow + if 'request_metadata' in t2 and t2['request_metadata']: + if 'workflow' in t2['request_metadata']: + workflow = t2['request_metadata']['workflow'] + workflow_data = None + if 'processing_metadata' in t2 and t2['processing_metadata'] and 'workflow_data' in t2['processing_metadata']: + workflow_data = t2['processing_metadata']['workflow_data'] + if workflow is not None and workflow_data is not None: + workflow.metadata = workflow_data + t2['request_metadata']['workflow'] = workflow + if 'build_workflow' in t2['request_metadata']: + build_workflow = t2['request_metadata']['build_workflow'] + build_workflow_data = None + if 'processing_metadata' in t2 and t2['processing_metadata'] and 'build_workflow_data' in t2['processing_metadata']: + build_workflow_data = t2['processing_metadata']['build_workflow_data'] + if build_workflow is not None and build_workflow_data is not None: + build_workflow.metadata = build_workflow_data + t2['request_metadata']['build_workflow'] = build_workflow rets.append(t2) return rets @@ -485,14 +494,23 @@ def get_requests(request_id=None, workload_id=None, with_detail=False, with_meta # t2 = dict(t) t2 = dict(zip(t.keys(), t)) - if 'request_metadata' in t2 and t2['request_metadata'] and 'workflow' in t2['request_metadata']: - workflow = t2['request_metadata']['workflow'] - workflow_data = None - if 'processing_metadata' in t2 and t2['processing_metadata'] and 'workflow_data' in t2['processing_metadata']: - workflow_data = t2['processing_metadata']['workflow_data'] - if workflow is not None and workflow_data is not None: - workflow.metadata = workflow_data - t2['request_metadata']['workflow'] = workflow + if 'request_metadata' in t2 and t2['request_metadata']: + if 'workflow' in t2['request_metadata']: + workflow = t2['request_metadata']['workflow'] + workflow_data = None + if 'processing_metadata' in t2 and t2['processing_metadata'] and 'workflow_data' in t2['processing_metadata']: + workflow_data = t2['processing_metadata']['workflow_data'] + if workflow is not None and workflow_data is not None: + 
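# --- illustrative sketch (not part of the patches above): metadata re-attachment
# The pattern repeated in the ORM code above, shown with plain dicts: the
# workflow (and now also the build_workflow) object lives in
# request_metadata, while its mutable runtime state is mirrored into
# processing_metadata under 'workflow_data' / 'build_workflow_data'; on read
# the stored data is re-attached to the object.  Simplified stand-ins, not
# the real SQLAlchemy models.
class FakeWorkflow:
    def __init__(self):
        self.metadata = {}


def attach_runtime_data(request_metadata, processing_metadata):
    for key, data_key in (('workflow', 'workflow_data'),
                          ('build_workflow', 'build_workflow_data')):
        wf = (request_metadata or {}).get(key)
        data = (processing_metadata or {}).get(data_key)
        if wf is not None and data is not None:
            wf.metadata = data
    return request_metadata


req_meta = {'build_workflow': FakeWorkflow()}
proc_meta = {'build_workflow_data': {'num_run': 1}}
attach_runtime_data(req_meta, proc_meta)
print(req_meta['build_workflow'].metadata)   # {'num_run': 1}
# ---------------------------------------------------------------------------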
workflow.metadata = workflow_data + t2['request_metadata']['workflow'] = workflow + if 'build_workflow' in t2['request_metadata']: + build_workflow = t2['request_metadata']['build_workflow'] + build_workflow_data = None + if 'processing_metadata' in t2 and t2['processing_metadata'] and 'build_workflow_data' in t2['processing_metadata']: + build_workflow_data = t2['processing_metadata']['build_workflow_data'] + if build_workflow is not None and build_workflow_data is not None: + build_workflow.metadata = build_workflow_data + t2['request_metadata']['build_workflow'] = build_workflow rets.append(t2) return rets @@ -631,14 +649,23 @@ def get_requests(request_id=None, workload_id=None, with_detail=False, with_meta # t2 = dict(t) t2 = dict(zip(t.keys(), t)) - if 'request_metadata' in t2 and t2['request_metadata'] and 'workflow' in t2['request_metadata']: - workflow = t2['request_metadata']['workflow'] - workflow_data = None - if 'processing_metadata' in t2 and t2['processing_metadata'] and 'workflow_data' in t2['processing_metadata']: - workflow_data = t2['processing_metadata']['workflow_data'] - if workflow is not None and workflow_data is not None: - workflow.metadata = workflow_data - t2['request_metadata']['workflow'] = workflow + if 'request_metadata' in t2 and t2['request_metadata']: + if 'workflow' in t2['request_metadata']: + workflow = t2['request_metadata']['workflow'] + workflow_data = None + if 'processing_metadata' in t2 and t2['processing_metadata'] and 'workflow_data' in t2['processing_metadata']: + workflow_data = t2['processing_metadata']['workflow_data'] + if workflow is not None and workflow_data is not None: + workflow.metadata = workflow_data + t2['request_metadata']['workflow'] = workflow + if 'build_workflow' in t2['request_metadata']: + build_workflow = t2['request_metadata']['build_workflow'] + build_workflow_data = None + if 'processing_metadata' in t2 and t2['processing_metadata'] and 'build_workflow_data' in t2['processing_metadata']: + build_workflow_data = t2['processing_metadata']['build_workflow_data'] + if build_workflow is not None and build_workflow_data is not None: + build_workflow.metadata = build_workflow_data + t2['request_metadata']['build_workflow'] = build_workflow rets.append(t2) return rets @@ -806,7 +833,7 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, req @transactional_session -def update_request(request_id, parameters, session=None): +def update_request(request_id, parameters, update_request_metadata=False, session=None): """ update an request. 
@@ -826,16 +853,25 @@ def update_request(request_id, parameters, session=None): if 'update_poll_period' in parameters and type(parameters['update_poll_period']) not in [datetime.timedelta]: parameters['update_poll_period'] = datetime.timedelta(seconds=parameters['update_poll_period']) - if 'request_metadata' in parameters and 'workflow' in parameters['request_metadata']: - workflow = parameters['request_metadata']['workflow'] - - if workflow is not None: - workflow.refresh_works() - if 'processing_metadata' not in parameters or not parameters['processing_metadata']: - parameters['processing_metadata'] = {} - parameters['processing_metadata']['workflow_data'] = workflow.metadata - if 'request_metadata' in parameters: + if 'workflow' in parameters['request_metadata']: + workflow = parameters['request_metadata']['workflow'] + + if workflow is not None: + workflow.refresh_works() + if 'processing_metadata' not in parameters or not parameters['processing_metadata']: + parameters['processing_metadata'] = {} + parameters['processing_metadata']['workflow_data'] = workflow.metadata + if 'build_workflow' in parameters['request_metadata']: + build_workflow = parameters['request_metadata']['build_workflow'] + + if build_workflow is not None: + build_workflow.refresh_works() + if 'processing_metadata' not in parameters or not parameters['processing_metadata']: + parameters['processing_metadata'] = {} + parameters['processing_metadata']['build_workflow_data'] = build_workflow.metadata + + if 'request_metadata' in parameters and not update_request_metadata: del parameters['request_metadata'] if 'processing_metadata' in parameters: parameters['_processing_metadata'] = parameters['processing_metadata'] From 00f273c23be6527ecafb8feba3fe57370ae60298 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 8 Dec 2022 15:39:01 +0100 Subject: [PATCH 22/91] clerk to support build setp --- common/lib/idds/common/constants.py | 4 ++ main/lib/idds/agents/clerk/clerk.py | 71 +++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/common/lib/idds/common/constants.py b/common/lib/idds/common/constants.py index 2b0fc185..d827dddd 100644 --- a/common/lib/idds/common/constants.py +++ b/common/lib/idds/common/constants.py @@ -142,6 +142,8 @@ class RequestStatus(IDDSEnum): ToFinish = 18 ToForceFinish = 19 Terminating = 20 + Building = 21 + Built = 22 class RequestLocking(IDDSEnum): @@ -222,6 +224,8 @@ class TransformStatus(IDDSEnum): ToFinish = 18 ToForceFinish = 19 Terminating = 20 + Building = 21 + Built = 22 class TransformLocking(IDDSEnum): diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index 8b0e045c..af3e8de9 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -130,7 +130,7 @@ def get_new_requests(self): self.show_queue_size() - req_status = [RequestStatus.New, RequestStatus.Extend] + req_status = [RequestStatus.New, RequestStatus.Extend, RequestStatus.Built] reqs_new = core_requests.get_requests_by_status_type(status=req_status, locking=True, not_lock=True, new_poll=True, only_return_id=True, @@ -395,6 +395,68 @@ def handle_new_request(self, req): self.logger.warn(log_pre + "Handle new request error result: %s" % str(ret_req)) return ret_req + def has_to_build_work(self, req): + try: + if req['status'] in [RequestStatus.New] and 'build_workflow' in req['request_metadata']: + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + "has build work") + return True + # workflow = 
req['request_metadata']['build_workflow'] + # if workflow.has_to_build_work(): + # log_pre = self.get_log_prefix(req) + # self.logger.info(log_pre + "has to_build work") + # return True + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return False + + def handle_build_request(self, req): + try: + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + "handle build request") + + workflow = req['request_metadata']['build_workflow'] + build_work = workflow.get_build_work() + build_work.add_proxy(workflow.get_proxy()) + transform = self.generate_transform(req, build_work) + self.logger.debug(log_pre + "Processing request(%s): new build transforms: %s" % (req['request_id'], + str(transform))) + + ret_req = {'request_id': req['request_id'], + 'parameters': {'status': RequestStatus.Building, + 'locking': RequestLocking.Idle, + # 'processing_metadata': processing_metadata, + 'request_metadata': req['request_metadata']}, + 'new_transforms': [transform]} + ret_req['parameters'] = self.load_poll_period(req, ret_req['parameters']) + self.logger.info(log_pre + "Handle build request result: %s" % str(ret_req)) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + retries = req['new_retries'] + 1 + if not req['max_new_retries'] or retries < req['max_new_retries']: + req_status = req['status'] + else: + req_status = RequestStatus.Failed + + # increase poll period + new_poll_period = int(req['new_poll_period'].total_seconds() * self.poll_period_increase_rate) + if new_poll_period > self.max_new_poll_period: + new_poll_period = self.max_new_poll_period + + error = {'submit_err': {'msg': truncate_string('%s' % (ex), length=200)}} + + ret_req = {'request_id': req['request_id'], + 'parameters': {'status': req_status, + 'locking': RequestLocking.Idle, + 'new_retries': retries, + 'new_poll_period': new_poll_period, + 'errors': req['errors'] if req['errors'] else {}}} + ret_req['parameters']['errors'].update(error) + self.logger.warn(log_pre + "Handle build request error result: %s" % str(ret_req)) + return ret_req + def update_request(self, req): new_tf_ids, update_tf_ids = [], [] try: @@ -463,13 +525,16 @@ def process_new_request(self, event): self.number_workers += 1 try: if event: - req_status = [RequestStatus.New, RequestStatus.Extend] + req_status = [RequestStatus.New, RequestStatus.Extend, RequestStatus.Built] req = self.get_request(request_id=event._request_id, status=req_status, locking=True) if not req: self.logger.error("Cannot find request for event: %s" % str(event)) elif req: log_pre = self.get_log_prefix(req) - ret = self.handle_new_request(req) + if self.has_to_build_work(req): + ret = self.handle_build_request(req) + else: + ret = self.handle_new_request(req) new_tf_ids, update_tf_ids = self.update_request(ret) for tf_id in new_tf_ids: self.logger.info(log_pre + "NewTransformEvent(transform_id: %s)" % str(tf_id)) From fcd69151f0f226ffa861c1ade43f3fadbe1d852d Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 8 Dec 2022 15:42:46 +0100 Subject: [PATCH 23/91] rest api and client to support build request --- client/lib/idds/client/clientmanager.py | 64 +++++++++++++++++++++++++ client/lib/idds/client/requestclient.py | 17 +++++++ main/lib/idds/rest/v1/requests.py | 53 +++++++++++++++++++- 3 files changed, 133 insertions(+), 1 deletion(-) diff --git a/client/lib/idds/client/clientmanager.py b/client/lib/idds/client/clientmanager.py index 1d700894..c8957052 100644 --- 
a/client/lib/idds/client/clientmanager.py +++ b/client/lib/idds/client/clientmanager.py @@ -418,6 +418,70 @@ def submit(self, workflow, username=None, userdn=None, use_dataset_name=True): request_id = self.client.add_request(**props) return request_id + @exception_handler + def submit_build(self, workflow, username=None, userdn=None, use_dataset_name=True): + """ + Submit the workflow as a request to iDDS server. + + :param workflow: The workflow to be submitted. + """ + self.setup_client() + + props = { + 'scope': 'workflow', + 'name': workflow.name, + 'requester': 'panda', + 'request_type': RequestType.Workflow, + 'username': username if username else workflow.username, + 'userdn': userdn if userdn else workflow.userdn, + 'transform_tag': 'workflow', + 'status': RequestStatus.New, + 'priority': 0, + 'lifetime': workflow.lifetime, + 'workload_id': workflow.get_workload_id(), + 'request_metadata': {'version': release_version, 'workload_id': workflow.get_workload_id(), 'build_workflow': workflow} + } + + if self.client.original_user_name: + props['username'] = self.client.original_user_name + if self.client.original_user_dn: + props['userdn'] = self.client.original_user_dn + + if self.auth_type == 'x509_proxy': + workflow.add_proxy() + + if use_dataset_name: + primary_init_work = workflow.get_primary_initial_collection() + if primary_init_work: + if type(primary_init_work) in [Collection, CollectionV1]: + props['scope'] = primary_init_work.scope + props['name'] = primary_init_work.name + else: + props['scope'] = primary_init_work['scope'] + props['name'] = primary_init_work['name'] + + # print(props) + request_id = self.client.add_request(**props) + return request_id + + @exception_handler + def update_build(self, request_id, signature, workflow): + """ + Submit the workflow as a request to iDDS server. + + :param workflow: The workflow to be submitted. + """ + self.setup_client() + + parameters = { + 'request_id': request_id, + 'signature': signature, + 'workflow': workflow + } + + ret = self.client.update_build_request(request_id=request_id, parameters=parameters) + return ret + @exception_handler def abort(self, request_id=None, workload_id=None): """ diff --git a/client/lib/idds/client/requestclient.py b/client/lib/idds/client/requestclient.py index e6dfec21..33f55045 100644 --- a/client/lib/idds/client/requestclient.py +++ b/client/lib/idds/client/requestclient.py @@ -80,6 +80,23 @@ def update_request(self, request_id, parameters): r = self.get_request_response(url, type='PUT', data=data) return r + def update_build_request(self, request_id, parameters): + """ + Update Build Request to the Head service. + + :param request_id: the request. + :param kwargs: other attributes of the request. + + :raise exceptions if it's not updated successfully. + """ + path = self.REQUEST_BASEURL + path += "/build" + url = self.build_url(self.host, path=os.path.join(path, str(request_id))) + + data = parameters + r = self.get_request_response(url, type='POST', data=data) + return r + def get_requests(self, request_id=None, workload_id=None, with_detail=False, with_metadata=False, with_transform=False, with_processing=False): """ Get request from the Head service. 
diff --git a/main/lib/idds/rest/v1/requests.py b/main/lib/idds/rest/v1/requests.py index 84b8fe24..73add121 100644 --- a/main/lib/idds/rest/v1/requests.py +++ b/main/lib/idds/rest/v1/requests.py @@ -21,7 +21,8 @@ MessageSource, MessageDestination, CommandType) from idds.common.utils import json_loads -from idds.core.requests import add_request, get_requests +from idds.core.requests import (add_request, get_requests, + get_request, update_request) from idds.core.messages import add_message from idds.core.commands import add_command from idds.rest.v1.controller import IDDSController @@ -216,6 +217,53 @@ def post_test(self): pprint.pprint(self.get_request().url_rule) +class RequestBuild(IDDSController): + """ Create, Update, get and delete Request. """ + + def post(self): + """ update build request result. + HTTP Success: + 200 OK + HTTP Error: + 400 Bad request + 500 Internal Error + """ + try: + parameters = self.get_request().data and json_loads(self.get_request().data) + if 'request_id' not in parameters or 'signature' not in parameters or 'workflow' not in parameters: + raise exceptions.IDDSException("request_id/signature/workflow are required") + + request_id = parameters['request_id'] + signature = parameters['signature'] + workflow = parameters['workflow'] + + req = get_request(request_id=request_id) + if not req: + raise exceptions.IDDSException("Request %s is not found" % request_id) + if req['status'] not in [RequestStatus.Building]: + raise exceptions.IDDSException("Request (request_id: %s, status: %s) is not in Building status" % (request_id, req['status'])) + + build_workflow = req['request_metadata']['build_workflow'] + build_work = build_workflow.get_build_work() + if build_work.get_signature() != signature: + raise exceptions.IDDSException("Request (request_id: %s) has a different signature(%s != %s)" % (request_id, + signature, + build_work.get_signature())) + req['request_metadata']['workflow'] = workflow + + parameters = {'status': RequestStatus.Built, + 'request_metadata': req['request_metadata']} + update_request(request_id=req['request_id'], parameters=parameters, update_request_metadata=True) + except exceptions.IDDSException as error: + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error) + except Exception as error: + print(error) + print(format_exc()) + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) + + return self.generate_http_response(HTTP_STATUS_CODE.OK, data={'request_id': request_id}) + + class RequestAbort(IDDSController): """ Abort Request. 
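# --- illustrative sketch (not part of the patches above): build-request lifecycle
# How the pieces added in this patch appear intended to fit together, judging
# only from the code above: submit_build() stores the workflow as
# request_metadata['build_workflow']; the clerk moves the request to Building
# and makes a transform for the build work; the build job later calls
# update_build() with the build work's signature and the generated workflow;
# the REST /request/build endpoint verifies the signature, saves the full
# workflow and marks the request Built; Built requests are then picked up by
# the clerk like new ones.  The host and the workflow objects below are
# placeholders; submit_build/update_build are the ClientManager methods
# introduced above.
def example_build_round_trip(build_workflow, generated_workflow, signature):
    from idds.client.clientmanager import ClientManager

    cm = ClientManager(host='https://idds-server:443/idds')   # placeholder host
    request_id = cm.submit_build(build_workflow)

    # ... later, from inside the build job, once the real workflow exists:
    cm.update_build(request_id, signature=signature, workflow=generated_workflow)
    return request_id
# ---------------------------------------------------------------------------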
""" @@ -355,6 +403,9 @@ def get_blueprint(): bp.add_url_rule('/request/////', view_func=request_view, methods=['get', ]) bp.add_url_rule('/request//////', view_func=request_view, methods=['get', ]) + request_build = RequestBuild.as_view('request_build') + bp.add_url_rule('/request/build/', view_func=request_build, methods=['post', ]) + request_abort = RequestAbort.as_view('request_abort') bp.add_url_rule('/request/abort//', view_func=request_abort, methods=['put', ]) bp.add_url_rule('/request/abort///task_id', view_func=request_abort, methods=['put', ]) From 9a1a84ff1f6634fb9ce7a65747cf991a17824130 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 8 Dec 2022 16:02:00 +0100 Subject: [PATCH 24/91] add oracle contents_ext table --- main/etc/sql/oracle_update.sql | 20 +++++++++++++++----- main/lib/idds/tests/trigger_release.py | 4 ++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/main/etc/sql/oracle_update.sql b/main/etc/sql/oracle_update.sql index 8980e0b1..686a6459 100644 --- a/main/etc/sql/oracle_update.sql +++ b/main/etc/sql/oracle_update.sql @@ -121,6 +121,14 @@ CREATE INDEX CONTENTS_DEP_IDX ON CONTENTS (request_id, transform_id, content_dep -- 2022.12.01 alter table requests modify username VARCHAR2(50) default null; +-- 2022.12.06 +alter table collections add ext_files NUMBER(10); +alter table collections add processed_ext_files NUMBER(10); +alter table collections add failed_ext_files NUMBER(10); +alter table collections add missing_ext_files NUMBER(10); + + +-- 2022.12.08 -- oracle 19 CREATE TABLE CONTENTS_ext ( @@ -201,11 +209,13 @@ CREATE TABLE CONTENTS_ext hs06sec NUMBER(12), memory_leak VARCHAR2(10), memory_leak_x2 VARCHAR2(10), - job_label VARCHAR2(20) + job_label VARCHAR2(20), CONSTRAINT CONTENT_EXT_PK PRIMARY KEY (content_id), - CONSTRAINT CONTENT_EXT_TRANSFORM_ID_FK FOREIGN KEY(transform_id) REFERENCES TRANSFORMS(transform_id), + CONSTRAINT CONTENT_EXT_TRANSFORM_ID_FK FOREIGN KEY(transform_id) REFERENCES TRANSFORMS(transform_id) ) -PCTFREE 0 -PARTITION BY REFERENCE(CONTENT_EXT_TRANSFORM_ID_FK); +PCTFREE 3 +PARTITION BY RANGE(TRANSFORM_ID) +INTERVAL ( 100000 ) +( PARTITION initial_part VALUES LESS THAN (1) ); -CREATE INDEX CONTENTS_EXT_RTF_IDX ON CONTENTS (request_id, transform_id, workload_id, coll_id, content_id, PandaID, status) LOCAL; +CREATE INDEX CONTENTS_EXT_RTF_IDX ON CONTENTS_ext (request_id, transform_id, workload_id, coll_id, content_id, PandaID, status) LOCAL; diff --git a/main/lib/idds/tests/trigger_release.py b/main/lib/idds/tests/trigger_release.py index 5f5390ac..cf04a307 100644 --- a/main/lib/idds/tests/trigger_release.py +++ b/main/lib/idds/tests/trigger_release.py @@ -12,7 +12,7 @@ request_ids = [368, 369, 370, 371, 372, 373, 374, 375, 376] -request_ids = [1689] +request_ids = [2895] for request_id in request_ids: contents = get_contents(request_id=request_id, status=ContentStatus.Available) ret_contents = {} @@ -31,4 +31,4 @@ print(update_content) # break - # update_contents(updated_contents) + update_contents(updated_contents) From 67ecc3b3188cc9477337198ac027ff1bcb022d07 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 8 Dec 2022 16:05:07 +0100 Subject: [PATCH 25/91] add test client --- main/lib/idds/tests/test_client.py | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 main/lib/idds/tests/test_client.py diff --git a/main/lib/idds/tests/test_client.py b/main/lib/idds/tests/test_client.py new file mode 100644 index 00000000..4021b755 --- /dev/null +++ b/main/lib/idds/tests/test_client.py @@ -0,0 
+1,48 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2021 + + +""" +Test client. +""" + + +from idds.client.clientmanager import ClientManager +from idds.common.utils import json_dumps # noqa F401 +from idds.rest.v1.utils import convert_old_req_2_workflow_req # noqa F401 +from idds.common.utils import setup_logging + + +setup_logging("idds.log") + + +def test(): + # 72533, 72569, 72733, 72769, 72783, 72939, 73351, 73983, 74545, + # 74567, 74569, 74573 + # dev + dev_host = 'https://aipanda160.cern.ch:443/idds' # noqa F841 + # doma + doma_host = 'https://aipanda015.cern.ch:443/idds' # noqa F841 + # atlas + atlas_host = 'https://aipanda181.cern.ch:443/idds' # noqa F841 + # doma google + doma_google_host = 'https://34.133.138.229:443/idds' # noqa F841 + + cm1 = ClientManager(host=atlas_host) + cm1 = ClientManager(host=doma_host) + cm1 = ClientManager(host=dev_host) + request_id = 389 + + ret = cm1.get_requests(request_id, with_detail=True) + print(json_dumps(ret, sort_keys=True, indent=4)) + + +if __name__ == '__main__': + test() From 32ee51e7efcaafd0b5c3c7b642e3a68f3028ff64 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 8 Dec 2022 16:56:20 +0100 Subject: [PATCH 26/91] comment out log --- common/lib/idds/common/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/lib/idds/common/config.py b/common/lib/idds/common/config.py index fad6e1ed..5cd489e4 100644 --- a/common/lib/idds/common/config.py +++ b/common/lib/idds/common/config.py @@ -131,7 +131,8 @@ def get_local_config_root(local_config_root=None): # print("IDDS_LOCAL_CONFIG_ROOT is set. Will use it.") local_config_root = os.environ['IDDS_LOCAL_CONFIG_ROOT'] else: - print("local_config_root is set to %s. Ignore IDDS_LOCAL_CONFIG_ROOT" % local_config_root) + # print("local_config_root is set to %s. 
Ignore IDDS_LOCAL_CONFIG_ROOT" % local_config_root) + pass if local_config_root is None: # local_config_root = "~/.idds" From 3f8726058192c410d983148b400c81798fc2fcc2 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 8 Dec 2022 20:16:35 +0100 Subject: [PATCH 27/91] bug fixes --- main/lib/idds/agents/carrier/poller.py | 4 ++-- main/lib/idds/agents/clerk/clerk.py | 4 ++-- main/lib/idds/agents/common/plugins/messaging.py | 2 +- main/lib/idds/orm/contents.py | 8 ++++---- monitor/data/conf.js | 12 ++++++------ workflow/lib/idds/workflowv2/workflow.py | 5 ++++- 6 files changed, 19 insertions(+), 16 deletions(-) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index 02a591b4..5f6b90dc 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -146,8 +146,8 @@ def get_processing(self, processing_id, status=None, locking=False): def get_work_tag_attribute(self, work_tag, attribute): work_tag_attribute = work_tag + "_" + attribute work_tag_attribute_value = None - if not hasattr(self, work_tag_attribute): - work_tag_attribute_value = int(self.work_tag_attribute) + if hasattr(self, work_tag_attribute): + work_tag_attribute_value = int(getattr(self, work_tag_attribute)) return work_tag_attribute_value def load_poll_period(self, processing, parameters): diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index af3e8de9..1eb2079e 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -279,8 +279,8 @@ def load_poll_period(self, req, parameters): def get_work_tag_attribute(self, work_tag, attribute): work_tag_attribute = work_tag + "_" + attribute work_tag_attribute_value = None - if not hasattr(self, work_tag_attribute): - work_tag_attribute_value = int(self.work_tag_attribute) + if hasattr(self, work_tag_attribute): + work_tag_attribute_value = int(getattr(self, work_tag_attribute)) return work_tag_attribute_value def generate_transform(self, req, work): diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index 0caf0afb..c442e808 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -64,6 +64,7 @@ def __init__(self, name="MessagingSender", **kwargs): if not hasattr(self, 'channels'): raise Exception('"channels" is required but not defined.') + self.channels = json.loads(self.channels) self.broker_timeout = 3600 @@ -110,7 +111,6 @@ def connect_to_messaging_brokers(self, sender=True): conns = [] for broker, port in broker_addresses: conn = stomp.Connection12(host_and_ports=[(broker, port)], - vhost=self.vhost, keepalive=True, heartbeats=(60000, 60000), # one minute timeout=timeout) diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index e4707395..8dcb8ec5 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -644,7 +644,7 @@ def add_contents_ext(contents, bulk_size=10000, session=None): try: for sub_param in sub_params: - session.bulk_insert_mappings(models.Content, sub_param) + session.bulk_insert_mappings(models.Content_ext, sub_param) content_ids = [None for _ in range(len(contents))] return content_ids except IntegrityError as error: @@ -666,7 +666,7 @@ def update_contents_ext(parameters, session=None): """ try: - session.bulk_update_mappings(models.Content, parameters) + session.bulk_update_mappings(models.Content_ext, parameters) except 
sqlalchemy.orm.exc.NoResultFound as error: raise exceptions.NoObject('Content cannot be found: %s' % (error)) @@ -704,7 +704,7 @@ def get_contents_ext(request_id=None, transform_id=None, workload_id=None, coll_ query = query.filter(models.Content_ext.coll_id == coll_id) if status is not None: query = query.filter(models.Content_ext.status.in_(status)) - query = query.order_by(asc(models.Content.request_id), asc(models.Content.transform_id), asc(models.Content.map_id)) + query = query.order_by(asc(models.Content_ext.request_id), asc(models.Content_ext.transform_id), asc(models.Content_ext.map_id)) tmp = query.all() rets = [] @@ -758,7 +758,7 @@ def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, c query = query.filter(models.Content_ext.coll_id == coll_id) if status is not None: query = query.filter(models.Content_ext.status.in_(status)) - query = query.order_by(asc(models.Content.request_id), asc(models.Content.transform_id), asc(models.Content.map_id)) + query = query.order_by(asc(models.Content_ext.request_id), asc(models.Content_ext.transform_id), asc(models.Content_ext.map_id)) tmp = query.all() rets = [] diff --git a/monitor/data/conf.js b/monitor/data/conf.js index b8a48092..dd0f9da8 100644 --- a/monitor/data/conf.js +++ b/monitor/data/conf.js @@ -1,9 +1,9 @@ var appConfig = { - 'iddsAPI_request': "https://lxplus8s12.cern.ch:443/idds/monitor_request/null/null", - 'iddsAPI_transform': "https://lxplus8s12.cern.ch:443/idds/monitor_transform/null/null", - 'iddsAPI_processing': "https://lxplus8s12.cern.ch:443/idds/monitor_processing/null/null", - 'iddsAPI_request_detail': "https://lxplus8s12.cern.ch:443/idds/monitor/null/null/true/false/false", - 'iddsAPI_transform_detail': "https://lxplus8s12.cern.ch:443/idds/monitor/null/null/false/true/false", - 'iddsAPI_processing_detail': "https://lxplus8s12.cern.ch:443/idds/monitor/null/null/false/false/true" + 'iddsAPI_request': "https://lxplus8s08.cern.ch:443/idds/monitor_request/null/null", + 'iddsAPI_transform': "https://lxplus8s08.cern.ch:443/idds/monitor_transform/null/null", + 'iddsAPI_processing': "https://lxplus8s08.cern.ch:443/idds/monitor_processing/null/null", + 'iddsAPI_request_detail': "https://lxplus8s08.cern.ch:443/idds/monitor/null/null/true/false/false", + 'iddsAPI_transform_detail': "https://lxplus8s08.cern.ch:443/idds/monitor/null/null/false/true/false", + 'iddsAPI_processing_detail': "https://lxplus8s08.cern.ch:443/idds/monitor/null/null/false/false/true" } diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py index 5b64bce8..e14500d5 100644 --- a/workflow/lib/idds/workflowv2/workflow.py +++ b/workflow/lib/idds/workflowv2/workflow.py @@ -1262,6 +1262,9 @@ def add_build_work(self, work, initial=False, primary=False): self.build_work = work self.build_work.set_build_work() + def get_build_work(self): + return self.build_work + def has_to_build_work(self): if self.build_work is not None: return True @@ -2222,7 +2225,7 @@ def set_build_metadata(self, metadata): self.build_work = self.template.copy() self.build_work.metadata = metadata - def get_buil_work(self): + def get_build_work(self): if self.build_work is not None: return self.build_work build_work = self.template.get_build_work() From 0419be495f57c50f6111dd1c312c194b5d351d73 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 12 Dec 2022 15:01:08 +0100 Subject: [PATCH 28/91] fix that abort_request tries to abort not-created transforms --- main/lib/idds/agents/clerk/clerk.py | 2 +- 1 file changed, 1 
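# --- illustrative sketch (not part of the patches above): work-tag attribute lookup
# What the corrected get_work_tag_attribute() above now does, reduced to a
# standalone function: build "<work_tag>_<attribute>" and, only if the agent
# actually carries such an attribute, return it as an int.  The agent stub
# and the attribute names below are made up for illustration.
def get_work_tag_attribute(agent, work_tag, attribute):
    work_tag_attribute = work_tag + "_" + attribute
    if hasattr(agent, work_tag_attribute):
        return int(getattr(agent, work_tag_attribute))
    return None


class AgentStub:
    atlaspanda_max_new_retries = "5"    # hypothetical per-tag setting


print(get_work_tag_attribute(AgentStub(), 'atlaspanda', 'max_new_retries'))     # 5
print(get_work_tag_attribute(AgentStub(), 'atlaspanda', 'max_update_retries'))  # None
# ---------------------------------------------------------------------------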
insertion(+), 1 deletion(-) diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index 1eb2079e..d1163d53 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -788,7 +788,7 @@ def process_abort_request(self, event): works = wf.get_all_works() if works: for work in works: - if not work.is_terminated(): + if (work.is_started() or work.is_starting()) and not work.is_terminated(): if not to_abort_transform_id or to_abort_transform_id == work.get_work_id(): self.logger.info(log_pre + "AbortTransformEvent(transform_id: %s)" % str(work.get_work_id())) event = AbortTransformEvent(publisher_id=self.id, From 7b1d5bf3fe5478a23910a3635cbb1da3957d2204 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 20 Dec 2022 14:11:02 +0100 Subject: [PATCH 29/91] add syslog-ng in Docker --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 42e9d630..5bdf58ce 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,7 +28,7 @@ RUN yum upgrade -y && \ # RUN yum install -y httpd.x86_64 conda gridsite mod_ssl.x86_64 httpd-devel.x86_64 gcc.x86_64 supervisor.noarch fetch-crl.noarch lcg-CA postgresql postgresql-contrib postgresql-static postgresql-libs postgresql-devel && \ # yum clean all && \ # rm -rf /var/cache/yum -RUN yum install -y httpd.x86_64 which conda gridsite mod_ssl.x86_64 httpd-devel.x86_64 gcc.x86_64 supervisor.noarch fetch-crl.noarch lcg-CA redis && \ +RUN yum install -y httpd.x86_64 which conda gridsite mod_ssl.x86_64 httpd-devel.x86_64 gcc.x86_64 supervisor.noarch fetch-crl.noarch lcg-CA redis syslog-ng && \ yum clean all && \ rm -rf /var/cache/yum From 3be492dc62639d7a3be4db7d8e233953fcb6c596 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 20 Dec 2022 14:11:46 +0100 Subject: [PATCH 30/91] add contents_ext in domapanda --- .../lib/idds/doma/workflowv2/domapandawork.py | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index b54aac99..ea4b4228 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -748,6 +748,7 @@ def get_update_contents_from_map_id(self, map_id, input_output_maps, job_info): return update_contents def get_panda_job_status(self, jobids): + jobids = list(jobids) self.logger.debug("get_panda_job_status, jobids[:10]: %s" % str(jobids[:10])) try: from pandaclient import Client @@ -951,7 +952,7 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m self.logger.debug("get_update_contents, update_contents[:3]: %s" % (str(update_contents[:3]))) return update_contents, update_contents_full, contents_ext_full - def get_contents_ext_detail(self, contents_ext_full, contents_ext_ids, job_info_items={}): + def get_contents_ext_detail(self, contents_ext_full, contents_ext_ids, job_info_maps={}): contents_ext_full_ids = set(contents_ext_full.keys()) new_ids = contents_ext_full_ids - contents_ext_ids to_update_ids = contents_ext_full_ids - new_ids @@ -967,20 +968,20 @@ def get_contents_ext_detail(self, contents_ext_full, contents_ext_ids, job_info_ 'coll_id': content['coll_id'], 'map_id': content['map_id'], 'status': content['status']} - for job_info_item in job_info_items: - new_content_ext[job_info_item] = getattr(job_info, job_info_item) + for job_info_item in job_info_maps: + new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) 
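# --- illustrative sketch (not part of the patches above): job_info_maps
# The change above replaces job_info_items (a list of attribute names) with
# job_info_maps, read here as a mapping of contents_ext column name to
# attribute name on the PanDA job_info object.  A reduced standalone
# version, with a made-up job-info object:
from types import SimpleNamespace


def extract_ext_fields(job_info, job_info_maps):
    return {column: getattr(job_info, attr) for column, attr in job_info_maps.items()}


job_info = SimpleNamespace(jobStatus='finished', computingSite='SITE_A')     # illustrative
job_info_maps = {'status': 'jobStatus', 'computing_site': 'computingSite'}   # illustrative
print(extract_ext_fields(job_info, job_info_maps))
# ---------------------------------------------------------------------------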
new_contents_ext.append(new_content_ext) for to_update_id in to_update_ids: content = contents_ext_full[new_id]['content'] job_info = contents_ext_full[new_id]['job_info'] update_content_ext = {'content_id': content['content_id'], 'status': content['status']} - for job_info_item in job_info_items: - update_content_ext[job_info_item] = getattr(job_info, job_info_item) + for job_info_item in job_info_maps: + update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) update_contents_ext.append(update_content_ext) return new_contents_ext, update_contents_ext - def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, job_info_items={}): + def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, job_info_maps={}): contents_ext_ids = [content['content_id'] for content in contents_ext] contents_ext_ids = set(contents_ext_ids) contents_ext_panda_ids = [content['PandaID'] for content in contents_ext] @@ -1038,13 +1039,13 @@ def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, j left_panda_ids = to_check_panda_ids - checked_panda_ids left_panda_ids = list(left_panda_ids) - new_contents_ext1, update_contents_ext1 = self.get_contents_ext_detail(contents_ext_full, contents_ext_ids, job_info_items) + new_contents_ext1, update_contents_ext1 = self.get_contents_ext_detail(contents_ext_full, contents_ext_ids, job_info_maps) new_contents_ext = new_contents_ext + new_contents_ext1 update_contents_ext = update_contents_ext + update_contents_ext1 return new_contents_ext, update_contents_ext, left_panda_ids - def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext=None, job_info_items={}, log_prefix=''): + def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext=None, job_info_maps={}, log_prefix=''): task_id = None try: from pandaclient import Client @@ -1062,7 +1063,7 @@ def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext= self.logger.debug(log_prefix + "poll_panda_task, task_info[0]: %s" % str(task_info[0])) if task_info[0] != 0: self.logger.warn(log_prefix + "poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) - return ProcessingStatus.Running, [], [] + return ProcessingStatus.Running, [], [], [], [] task_info = task_info[1] @@ -1086,7 +1087,7 @@ def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext= inputname_jobid_map) new_contents_ext, update_contents_ext, left_jobs = self.get_contents_ext(input_output_maps, contents_ext, - contents_ext_full, job_info_items) + contents_ext_full, job_info_maps) # if left_jobs: # processing_status = ProcessingStatus.Running @@ -1158,7 +1159,7 @@ def resume_processing(self, processing, log_prefix=''): def require_ext_contents(self): return True - def poll_processing_updates(self, processing, input_output_maps, contents_ext=None, job_info_items={}, log_prefix=''): + def poll_processing_updates(self, processing, input_output_maps, contents_ext=None, job_info_maps={}, log_prefix=''): """ *** Function called by Carrier agent. 
""" @@ -1172,7 +1173,7 @@ def poll_processing_updates(self, processing, input_output_maps, contents_ext=No ret_poll_panda_task = self.poll_panda_task(processing=processing, input_output_maps=input_output_maps, contents_ext=contents_ext, - job_info_items=job_info_items, + job_info_maps=job_info_maps, log_prefix=log_prefix) processing_status, update_contents, update_contents_full, new_contents_ext, update_contents_ext = ret_poll_panda_task From 362664b300d3290302c6620088f20c65d758c4e7 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 20 Dec 2022 14:12:27 +0100 Subject: [PATCH 31/91] update httpd timeout --- main/etc/idds/rest/httpd-idds-443-py36-cc7.conf.template | 5 +++-- main/etc/idds/rest/httpd-idds-443-py39-cc7.conf.template | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/main/etc/idds/rest/httpd-idds-443-py36-cc7.conf.template b/main/etc/idds/rest/httpd-idds-443-py36-cc7.conf.template index 25d5b687..b5472c24 100644 --- a/main/etc/idds/rest/httpd-idds-443-py36-cc7.conf.template +++ b/main/etc/idds/rest/httpd-idds-443-py36-cc7.conf.template @@ -6,8 +6,9 @@ # Authors: # - Wen Guan, , 2019 -# TimeOut 600 -# KeepAliveTimeout 600 +TimeOut 600 +KeepAliveTimeout 600 +SSLSessionCacheTimeout 600 # Built-in modules LoadModule ssl_module /usr/lib64/httpd/modules/mod_ssl.so diff --git a/main/etc/idds/rest/httpd-idds-443-py39-cc7.conf.template b/main/etc/idds/rest/httpd-idds-443-py39-cc7.conf.template index b983599e..cbdabfd6 100644 --- a/main/etc/idds/rest/httpd-idds-443-py39-cc7.conf.template +++ b/main/etc/idds/rest/httpd-idds-443-py39-cc7.conf.template @@ -8,6 +8,7 @@ TimeOut 600 KeepAliveTimeout 600 +SSLSessionCacheTimeout 600 # Built-in modules LoadModule ssl_module /usr/lib64/httpd/modules/mod_ssl.so From 34137786f0d8bb76d6f5bd4fd73d108d9f2141ac Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 20 Dec 2022 14:13:09 +0100 Subject: [PATCH 32/91] fix build error --- workflow/lib/idds/workflowv2/workflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py index e14500d5..57ab08db 100644 --- a/workflow/lib/idds/workflowv2/workflow.py +++ b/workflow/lib/idds/workflowv2/workflow.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 - 2021 +# - Wen Guan, , 2020 - 2022 import copy import datetime @@ -2051,7 +2051,7 @@ def metadata(self): def metadata(self, value): self.template.load_metadata() run_metadata = value - build_metadata = run_metadata['build'] + build_metadata = run_metadata['build'] if 'build' in run_metadata else {} self.set_build_metadata(build_metadata) self.parent_num_run = run_metadata['parent_num_run'] self._num_run = run_metadata['num_run'] From e18951bf80387f9a1fbc5b0156b10fef7698d358 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 20 Dec 2022 14:15:37 +0100 Subject: [PATCH 33/91] fix contents_ext --- main/etc/sql/oracle_update.sql | 128 +++++++++++++++---------------- main/lib/idds/core/catalog.py | 4 +- main/lib/idds/orm/base/models.py | 128 +++++++++++++++---------------- main/lib/idds/orm/contents.py | 32 +++++++- 4 files changed, 160 insertions(+), 132 deletions(-) diff --git a/main/etc/sql/oracle_update.sql b/main/etc/sql/oracle_update.sql index 686a6459..c46d1f70 100644 --- a/main/etc/sql/oracle_update.sql +++ b/main/etc/sql/oracle_update.sql @@ -139,72 +139,72 @@ CREATE TABLE CONTENTS_ext workload_id NUMBER(10), map_id NUMBER(12) DEFAULT 0, status NUMBER(2) constraint CONTENT_EXT_STATUS_NN NOT NULL, 
- PandaID NUMBER(14), - jobDefinitionID NUMBER(12), - schedulerID VARCHAR2(128), - pilotID VARCHAR2(200), - creationTime DATE, - modificationTime DATE, - startTime DATE, - endTime DATE, - prodSourceLabel VARCHAR2(20), - prodUserID VARCHAR2(250), - assignedPriority NUMBER(5), - currentPriority NUMBER(5), - attemptNr NUMBER(5), - maxAttempt NUMBER(5), - maxCpuCount NUMBER(5), - maxCpuUnit VARCHAR2(32), - maxDiskCount NUMBER(12), - maxDiskUnit VARCHAR2(10), - minRamCount NUMBER(12), - maxRamUnit VARCHAR2(10), - cpuConsumptionTime NUMBER(12), - cpuConsumptionUnit VARCHAR2(128), - jobStatus VARCHAR2(10), - jobName VARCHAR2(255), - transExitCode NUMBER(5), - pilotErrorCode NUMBER(5), - pilotErrorDiag VARCHAR2(500), - exeErrorCode NUMBER(5), - exeErrorDiag VARCHAR2(500), - supErrorCode NUMBER(5), - supErrorDiag VARCHAR2(250), - ddmErrorCode NUMBER(5), - ddmErrorDiag VARCHAR2(500), - brokerageErrorCode NUMBER(5), - brokerageErrorDiag VARCHAR2(250), - jobDispatcherErrorCode NUMBER(5), - jobDispatcherErrorDiag VARCHAR2(250), - taskBufferErrorCode NUMBER(5), - taskBufferErrorDiag VARCHAR2(300), - computingSite VARCHAR2(128), - computingElement VARCHAR2(128), + panda_id NUMBER(14), + job_definition_id NUMBER(12), + scheduler_id VARCHAR2(128), + pilot_id VARCHAR2(200), + creation_time DATE, + modification_time DATE, + start_time DATE, + end_time DATE, + prod_source_label VARCHAR2(20), + prod_user_id VARCHAR2(250), + assigned_priority NUMBER(5), + current_priority NUMBER(5), + attempt_nr NUMBER(5), + max_attempt NUMBER(5), + max_cpu_count NUMBER(5), + max_cpu_unit VARCHAR2(32), + max_disk_count NUMBER(12), + max_disk_unit VARCHAR2(10), + min_ram_count NUMBER(12), + min_ram_unit VARCHAR2(10), + cpu_consumption_time NUMBER(12), + cpu_consumption_unit VARCHAR2(128), + job_status VARCHAR2(10), + job_name VARCHAR2(255), + trans_exit_code NUMBER(5), + pilot_error_code NUMBER(5), + pilot_error_diag VARCHAR2(500), + exe_error_code NUMBER(5), + exe_error_diag VARCHAR2(500), + sup_error_code NUMBER(5), + sup_error_diag VARCHAR2(250), + ddm_error_code NUMBER(5), + ddm_error_diag VARCHAR2(500), + brokerage_error_code NUMBER(5), + brokerage_error_diag VARCHAR2(250), + job_dispatcher_error_code NUMBER(5), + job_dispatcher_error_diag VARCHAR2(250), + task_buffer_error_code NUMBER(5), + task_buffer_error_diag VARCHAR2(300), + computing_site VARCHAR2(128), + computing_element VARCHAR2(128), grid VARCHAR2(50), cloud VARCHAR2(50), - cpuConversion float(20), - taskID NUMBER(12), + cpu_conversion float(20), + task_id NUMBER(12), vo VARCHAR2(16), - pilotTiming VARCHAR2(100), - workingGroup VARCHAR2(20), - processingType VARCHAR2(64), - prodUserName VARCHAR2(60), - coreCount NUMBER(5), - nInputFiles NUMBER(10), - reqID NUMBER(12), - jediTaskID NUMBER(12), - actualCoreCount NUMBER(5), - maxRSS NUMBER(12), - maxVMEM NUMBER(12), - maxSWAP NUMBER(12), - maxPSS NUMBER(12), - avgRSS NUMBER(12), - avgVMEM NUMBER(12), - avgSWAP NUMBER(12), - avgPSS NUMBER(12), - maxWalltime NUMBER(12), - diskIO NUMBER(12), - failedAttempt NUMBER(5), + pilot_timing VARCHAR2(100), + working_group VARCHAR2(20), + processing_type VARCHAR2(64), + prod_user_name VARCHAR2(60), + core_count NUMBER(5), + n_input_files NUMBER(10), + req_id NUMBER(12), + jedi_task_id NUMBER(12), + actual_core_count NUMBER(5), + max_rss NUMBER(12), + max_vmem NUMBER(12), + max_swap NUMBER(12), + max_pss NUMBER(12), + avg_rss NUMBER(12), + avg_vmem NUMBER(12), + avg_swap NUMBER(12), + avg_pss NUMBER(12), + max_walltime NUMBER(12), + disk_io NUMBER(12), + failed_attempt NUMBER(5), 
hs06 NUMBER(12), hs06sec NUMBER(12), memory_leak VARCHAR2(10), @@ -218,4 +218,4 @@ PARTITION BY RANGE(TRANSFORM_ID) INTERVAL ( 100000 ) ( PARTITION initial_part VALUES LESS THAN (1) ); -CREATE INDEX CONTENTS_EXT_RTF_IDX ON CONTENTS_ext (request_id, transform_id, workload_id, coll_id, content_id, PandaID, status) LOCAL; +CREATE INDEX CONTENTS_EXT_RTF_IDX ON CONTENTS_ext (request_id, transform_id, workload_id, coll_id, content_id, panda_id, status) LOCAL; diff --git a/main/lib/idds/core/catalog.py b/main/lib/idds/core/catalog.py index c7421fc9..b708597c 100644 --- a/main/lib/idds/core/catalog.py +++ b/main/lib/idds/core/catalog.py @@ -611,8 +611,8 @@ def get_output_contents_by_request_id_status(request_id, name, content_status, l return contents -def get_contents_ext_items(): - return orm_contents.get_contents_ext_items() +def get_contents_ext_maps(): + return orm_contents.get_contents_ext_maps() @transactional_session diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index 77492596..c2ed3594 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -574,72 +574,72 @@ class Content_ext(BASE, ModelBase): workload_id = Column(Integer()) map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0) status = Column(EnumWithValue(ContentStatus)) - PandaID = Column(BigInteger()) - jobDefinitionID = Column(BigInteger()) - schedulerID = Column(String(128)) - pilotID = Column(String(200)) - creationTime = Column(DateTime) - modificationTime = Column(DateTime) - startTime = Column(DateTime) - endTime = Column(DateTime) - prodSourceLabel = Column(String(20)) - prodUserID = Column(String(250)) - assignedPriority = Column(Integer()) - currentPriority = Column(Integer()) - attemptNr = Column(Integer()) - maxAttempt = Column(Integer()) - maxCpuCount = Column(Integer()) - maxCpuUnit = Column(String(32)) - maxDiskCount = Column(Integer()) - maxDiskUnit = Column(String(10)) - minRamCount = Column(Integer()) - maxRamUnit = Column(String(10)) - cpuConsumptionTime = Column(Integer()) - cpuConsumptionUnit = Column(String(128)) - jobStatus = Column(String(10)) - jobName = Column(String(255)) - transExitCode = Column(Integer()) - pilotErrorCode = Column(Integer()) - pilotErrorDiag = Column(String(500)) - exeErrorCode = Column(Integer()) - exeErrorDiag = Column(String(500)) - supErrorCode = Column(Integer()) - supErrorDiag = Column(String(250)) - ddmErrorCode = Column(Integer()) - ddmErrorDiag = Column(String(500)) - brokerageErrorCode = Column(Integer()) - brokerageErrorDiag = Column(String(250)) - jobDispatcherErrorCode = Column(Integer()) - jobDispatcherErrorDiag = Column(String(250)) - taskBufferErrorCode = Column(Integer()) - taskBufferErrorDiag = Column(String(300)) - computingSite = Column(String(128)) - computingElement = Column(String(128)) + panda_id = Column(BigInteger()) + job_definition_id = Column(BigInteger()) + scheduler_id = Column(String(128)) + pilot_id = Column(String(200)) + creation_time = Column(DateTime) + modification_time = Column(DateTime) + start_time = Column(DateTime) + end_time = Column(DateTime) + prod_source_label = Column(String(20)) + prod_user_id = Column(String(250)) + assigned_priority = Column(Integer()) + current_priority = Column(Integer()) + attempt_nr = Column(Integer()) + max_attempt = Column(Integer()) + max_cpu_count = Column(Integer()) + max_cpu_unit = Column(String(32)) + max_disk_count = Column(Integer()) + max_disk_unit = Column(String(10)) + min_ram_count = Column(Integer()) + min_ram_unit = 
Column(String(10)) + cpu_consumption_time = Column(Integer()) + cpu_consumption_unit = Column(String(128)) + job_status = Column(String(10)) + job_name = Column(String(255)) + trans_exit_code = Column(Integer()) + pilot_error_code = Column(Integer()) + pilot_error_diag = Column(String(500)) + exe_error_code = Column(Integer()) + exe_error_diag = Column(String(500)) + sup_error_code = Column(Integer()) + sup_error_diag = Column(String(250)) + ddm_error_code = Column(Integer()) + ddm_error_diag = Column(String(500)) + brokerage_error_code = Column(Integer()) + brokerage_error_diag = Column(String(250)) + job_dispatcher_error_code = Column(Integer()) + job_dispatcher_error_diag = Column(String(250)) + task_buffer_error_code = Column(Integer()) + task_buffer_error_diag = Column(String(300)) + computing_site = Column(String(128)) + computing_element = Column(String(128)) grid = Column(String(50)) cloud = Column(String(50)) - cpuConversion = Column(Float()) - taskID = Column(BigInteger()) + cpu_conversion = Column(Float()) + task_id = Column(BigInteger()) vo = Column(String(16)) - pilotTiming = Column(String(100)) - workingGroup = Column(String(20)) - processingType = Column(String(64)) - prodUserName = Column(String(60)) - coreCount = Column(Integer()) - nInputFiles = Column(Integer()) - reqID = Column(BigInteger()) - jediTaskID = Column(BigInteger()) - actualCoreCount = Column(Integer()) - maxRSS = Column(Integer()) - maxVMEM = Column(Integer()) - maxSWAP = Column(Integer()) - maxPSS = Column(Integer()) - avgRSS = Column(Integer()) - avgVMEM = Column(Integer()) - avgSWAP = Column(Integer()) - avgPSS = Column(Integer()) - maxWalltime = Column(Integer()) - diskIO = Column(Integer()) - failedAttempt = Column(Integer()) + pilot_timing = Column(String(100)) + working_group = Column(String(20)) + processing_type = Column(String(64)) + prod_user_name = Column(String(60)) + core_count = Column(Integer()) + n_input_files = Column(Integer()) + req_id = Column(BigInteger()) + jedi_task_id = Column(BigInteger()) + actual_core_count = Column(Integer()) + max_rss = Column(Integer()) + max_vmem = Column(Integer()) + max_swap = Column(Integer()) + max_pss = Column(Integer()) + avg_rss = Column(Integer()) + avg_vmem = Column(Integer()) + avg_swap = Column(Integer()) + avg_pss = Column(Integer()) + max_walltime = Column(Integer()) + disk_io = Column(Integer()) + failed_attempt = Column(Integer()) hs06 = Column(Integer()) hs06sec = Column(Integer()) memory_leak = Column(String(10)) @@ -647,7 +647,7 @@ class Content_ext(BASE, ModelBase): job_label = Column(String(20)) _table_args = (PrimaryKeyConstraint('content_id', name='CONTENTS_EXT_PK'), - Index('CONTENTS_EXT_RTF_IDX', 'request_id', 'transform_id', 'workload_id', 'coll_id', 'content_id', 'PandaID', 'status')) + Index('CONTENTS_EXT_RTF_IDX', 'request_id', 'transform_id', 'workload_id', 'coll_id', 'content_id', 'panda_id', 'status')) class Health(BASE, ModelBase): diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index 8dcb8ec5..d84cfc9b 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -592,7 +592,7 @@ def delete_content(content_id=None, session=None): def get_contents_ext_items(): - default_params = {'PandaID': None, 'jobDefinitionID': None, 'schedulerID': None, + default_params = {'pandaID': None, 'jobDefinitionID': None, 'schedulerID': None, 'pilotID': None, 'creationTime': None, 'modificationTime': None, 'startTime': None, 'endTime': None, 'prodSourceLabel': None, 'prodUserID': None, 
'assignedPriority': None, 'currentPriority': None, @@ -619,6 +619,34 @@ def get_contents_ext_items(): return default_params +def get_contents_ext_maps(): + default_params = {'panda_id': 'PandaID', 'job_definition_id': 'jobDefinitionID', 'scheduler_id': 'schedulerID', + 'pilot_id': 'pilotID', 'creation_time': 'creationTime', 'modification_time': 'modificationTime', + 'start_time': 'startTime', 'end_time': 'endTime', 'prod_source_label': 'prodSourceLabel', + 'prod_user_id': 'prodUserID', 'assigned_priority': 'assignedPriority', 'current_priority': 'currentPriority', + 'attempt_nr': 'attemptNr', 'max_attempt': 'maxAttempt', 'max_cpu_count': 'maxCpuCount', + 'max_cpu_unit': 'maxCpuUnit', 'max_disk_count': 'maxDiskCount', 'max_disk_unit': 'maxDiskUnit', + 'min_ram_count': 'minRamCount', 'min_ram_unit': 'minRamUnit', 'cpu_consumption_time': 'cpuConsumptionTime', + 'cpu_consumption_unit': 'cpuConsumptionUnit', 'job_status': 'jobStatus', 'job_name': 'jobName', + 'trans_exit_code': 'transExitCode', 'pilot_error_code': 'pilotErrorCode', 'pilot_error_diag': 'pilotErrorDiag', + 'exe_error_code': 'exeErrorCode', 'exe_error_diag': 'exeErrorDiag', 'sup_error_code': 'supErrorCode', + 'sup_error_diag': 'supErrorDiag', 'ddm_error_code': 'ddmErrorCode', 'ddm_error_diag': 'ddmErrorDiag', + 'brokerage_error_cdode': 'brokerageErrorCode', 'brokerage_error_diag': 'brokerageErrorDiag', + 'job_dispatcher_error_code': 'jobDispatcherErrorCode', 'job_dispatcher_error_diag': 'jobDispatcherErrorDiag', + 'task_buffer_error_code': 'taskBufferErrorCode', 'task_buffer_error_diag': 'taskBufferErrorDiag', + 'computing_site': 'computingSite', 'computing_element': 'computingElement', + 'grid': 'grid', 'cloud': 'cloud', 'cpu_conversion': 'cpuConversion', 'task_id': 'taskID', + 'vo': 'VO', 'pilot_timing': 'pilotTiming', 'working_group': 'workingGroup', + 'processing_type': 'processingType', 'prod_user_name': 'prodUserName', 'core_count': 'coreCount', + 'n_input_files': 'nInputFiles', 'req_id': 'reqID', 'jedi_task_id': 'jediTaskID', + 'actual_core_count': 'actualCoreCount', 'max_rss': 'maxRSS', 'max_vmem': 'maxVMEM', + 'max_swap': 'maxSWAP', 'max_pss': 'maxPSS', 'avg_rss': 'avgRSS', 'avg_vmem': 'avgVMEM', + 'avg_swap': 'avgSWAP', 'avg_pss': 'avgPSS', 'max_walltime': 'maxWalltime', 'disk_io': 'diskIO', + 'failed_attempt': 'failedAttempt', 'hs06': 'hs06', 'hs06sec': 'hs06sec', + 'memory_leak': 'memory_leak', 'memory_leak_x2': 'memory_leak_x2', 'job_label': 'job_label'} + return default_params + + @transactional_session def add_contents_ext(contents, bulk_size=10000, session=None): """ @@ -745,7 +773,7 @@ def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, c models.Content_ext.workload_id, models.Content_ext.coll_id, models.Content_ext.content_id, - models.Content_ext.PandaID, + models.Content_ext.panda_id, models.Content_ext.status) query = query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)") if request_id: From e00b29de50b11cebc160e38dfc47cf61ef4232e5 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 20 Dec 2022 14:16:24 +0100 Subject: [PATCH 34/91] fill input dependency id in adding conents --- main/lib/idds/core/processings.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index d6d52a08..f0df8967 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -284,9 +284,26 @@ def update_processing_with_collection_contents(updated_processing, new_processin 
session=session) +def resolve_input_dependency_id(new_input_dependency_contents, session=None): + coll_ids = [] + for content in new_input_dependency_contents: + coll_ids.append(content['content_id']) + contents = orm_contents.get_contents(coll_id=coll_ids, session=session) + content_name_id_map = {} + for content in contents: + if content['coll_id'] not in content_name_id_map: + content_name_id_map[content['coll_id']] = {} + content_name_id_map[content['coll_id']][content['name']] = content['content_id'] + for content in new_input_dependency_contents: + content_dep_id = content_name_id_map[content['coll_id']][content['name']] + content['content_dep_id'] = content_dep_id + return new_input_dependency_contents + + @transactional_session def update_processing_contents(update_processing, update_contents, update_messages=None, new_contents=None, update_dep_contents=None, update_collections=None, messages=None, + new_input_dependency_contents=None, message_bulk_size=2000, session=None): """ Update processing with contents. @@ -300,6 +317,9 @@ def update_processing_contents(update_processing, update_contents, update_messag orm_contents.update_contents(update_contents, session=session) if new_contents: orm_contents.add_contents(new_contents, session=session) + if new_input_dependency_contents: + new_input_dependency_contents = resolve_input_dependency_id(new_input_dependency_contents, session=session) + orm_contents.add_contents(new_input_dependency_contents, session=session) if update_dep_contents: request_id, update_dep_contents_status_name, update_dep_contents_status = update_dep_contents for status_name in update_dep_contents_status_name: From 951c47aabf2b6b5f9418a1670b15bf5b45095394 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 20 Dec 2022 14:20:08 +0100 Subject: [PATCH 35/91] fix carrier to handle input_dependency --- main/lib/idds/agents/carrier/poller.py | 6 +- main/lib/idds/agents/carrier/submitter.py | 10 ++-- main/lib/idds/agents/carrier/utils.py | 68 ++++++++++++++--------- 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index 5f6b90dc..071d9c4c 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -196,7 +196,8 @@ def update_processing(self, processing, processing_model): update_dep_contents=processing.get('update_dep_contents', None), messages=processing.get('messages', None), update_messages=processing.get('update_messages', None), - new_contents=processing.get('new_contents', None)) + new_contents=processing.get('new_contents', None), + new_input_dependency_contents=processing.get('new_input_dependency_contents', None)) except exceptions.DatabaseException as ex: if 'ORA-00060' in str(ex): self.logger.warn(log_prefix + "(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") @@ -244,7 +245,7 @@ def handle_update_processing(self, processing): logger=self.logger, log_prefix=log_prefix) - process_status, new_contents, ret_msgs, update_contents, parameters, new_contents_ext, update_contents_ext = ret_handle_update_processing + process_status, new_contents, new_input_dependency_contents, ret_msgs, update_contents, parameters, new_contents_ext, update_contents_ext = ret_handle_update_processing proc = processing['processing_metadata']['processing'] work = proc.work @@ -288,6 +289,7 @@ def handle_update_processing(self, processing): ret = {'update_processing': update_processing, 'update_contents': update_contents, 
'new_contents': new_contents, + 'new_input_dependency_contents': new_input_dependency_contents, 'messages': ret_msgs, 'new_contents_ext': new_contents_ext, 'update_contents_ext': update_contents_ext, diff --git a/main/lib/idds/agents/carrier/submitter.py b/main/lib/idds/agents/carrier/submitter.py index 15d9a317..3d79561c 100644 --- a/main/lib/idds/agents/carrier/submitter.py +++ b/main/lib/idds/agents/carrier/submitter.py @@ -76,10 +76,11 @@ def handle_new_processing(self, processing): # transform_id = processing['transform_id'] # transform = core_transforms.get_transform(transform_id=transform_id) # work = transform['transform_metadata']['work'] - status, processing, update_colls, new_contents, msgs, errors = handle_new_processing(processing, - self.agent_attributes, - logger=self.logger, - log_prefix=log_prefix) + ret_new_processing = handle_new_processing(processing, + self.agent_attributes, + logger=self.logger, + log_prefix=log_prefix) + status, processing, update_colls, new_contents, new_input_dependency_contents, msgs, errors = ret_new_processing if not status: raise exceptions.ProcessSubmitFailed(str(errors)) @@ -105,6 +106,7 @@ def handle_new_processing(self, processing): 'update_collections': update_colls, 'update_contents': [], 'new_contents': new_contents, + 'new_input_dependency_contents': new_input_dependency_contents, 'messages': msgs, } except Exception as ex: diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 84bd6b95..d5911132 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -190,20 +190,30 @@ def get_ext_contents(transform_id, work): return contents_ids -def resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents): - contents = core_catalog.get_contents_by_coll_id_status(coll_id=new_input_dep_coll_ids) - content_name_id_map = {} - for content in contents: - if content['coll_id'] not in content_name_id_map: - content_name_id_map[content['coll_id']] = {} - content_name_id_map[content['coll_id']][content['name']] = content['content_id'] - for content in new_input_dependency_contents: - content_dep_id = content_name_id_map[content['coll_id']][content['name']] - content['content_dep_id'] = content_dep_id +def resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents, logger=None, log_prefix=''): + logger = get_logger(logger) + + logger.info(log_prefix + "resolve_input_dependency_id: new_input_dep_coll_ids: %s" % (str(new_input_dep_coll_ids))) + logger.info(log_prefix + "resolve_input_dependency_id: len(new_input_dependency_contents): %s" % len(new_input_dependency_contents)) + + if new_input_dep_coll_ids: + contents = core_catalog.get_contents_by_coll_id_status(coll_id=new_input_dep_coll_ids) + content_name_id_map = {} + for content in contents: + if content['coll_id'] not in content_name_id_map: + content_name_id_map[content['coll_id']] = {} + content_name_id_map[content['coll_id']][content['name']] = content['content_id'] + for content in new_input_dependency_contents: + if content['coll_id'] in content_name_id_map and content['name'] in content_name_id_map[content['coll_id']]: + content_dep_id = content_name_id_map[content['coll_id']][content['name']] + content['content_dep_id'] = content_dep_id return new_input_dependency_contents -def get_new_contents(request_id, transform_id, workload_id, new_input_output_maps): +def get_new_contents(request_id, transform_id, workload_id, new_input_output_maps, 
logger=None, log_prefix=''): + logger = get_logger(logger) + + logger.debug(log_prefix + "get_new_contents") new_input_contents, new_output_contents, new_log_contents = [], [], [] new_input_dependency_contents = [] new_input_dep_coll_ids = [] @@ -228,7 +238,7 @@ def get_new_contents(request_id, transform_id, workload_id, new_input_output_map content = get_new_content(request_id, transform_id, workload_id, map_id, log_content, content_relation_type=ContentRelationType.Log) new_log_contents.append(content) - new_input_dependency_contents = resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents) + # new_input_dependency_contents = resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents, logger=logger, log_prefix=log_prefix) return new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents @@ -442,10 +452,11 @@ def handle_new_processing(processing, agent_attributes, logger=None, log_prefix= if not status: logger.error(log_prefix + "Failed to submit processing (status: %s, workload_id: %s, errors: %s)" % (status, workload_id, errors)) - return False, processing, [], [], [], errors + return False, processing, [], [], [], [], errors ret_msgs = [] new_contents = [] + new_input_dependency_contents = [] update_collections = [] if proc.workload_id: processing['workload_id'] = proc.workload_id @@ -462,9 +473,10 @@ def handle_new_processing(processing, agent_attributes, logger=None, log_prefix= request_id = processing['request_id'] transform_id = processing['transform_id'] workload_id = processing['workload_id'] - ret_new_contents = get_new_contents(request_id, transform_id, workload_id, new_input_output_maps) + ret_new_contents = get_new_contents(request_id, transform_id, workload_id, new_input_output_maps, logger=logger, log_prefix=log_prefix) new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents = ret_new_contents - new_contents = new_input_contents + new_output_contents + new_log_contents + new_input_dependency_contents + # new_contents = new_input_contents + new_output_contents + new_log_contents + new_input_dependency_contents + new_contents = new_input_contents + new_output_contents + new_log_contents # not generate new messages # if new_input_contents: @@ -473,7 +485,7 @@ def handle_new_processing(processing, agent_attributes, logger=None, log_prefix= # if new_output_contents: # msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', files=new_input_contents, relation_type='output') # ret_msgs = ret_msgs + msgs - return True, processing, update_collections, new_contents, ret_msgs, errors + return True, processing, update_collections, new_contents, new_input_dependency_contents, ret_msgs, errors def get_updated_contents_by_request(request_id, transform_id, workload_id, work, terminated=False, input_output_maps=None, @@ -836,9 +848,9 @@ def handle_update_processing(processing, agent_attributes, logger=None, log_pref if work.require_ext_contents(): contents_ext = get_ext_contents(transform_id, work) - job_info_items = core_catalog.get_contents_ext_items() + job_info_maps = core_catalog.get_contents_ext_maps() ret_poll_processing = work.poll_processing_updates(processing, input_output_maps, contents_ext=contents_ext, - job_info_items=job_info_items, log_prefix=log_prefix) + job_info_maps=job_info_maps, log_prefix=log_prefix) process_status, content_updates, new_input_output_maps1, updated_contents_full, parameters, 
new_contents_ext, update_contents_ext = ret_poll_processing else: ret_poll_processing = work.poll_processing_updates(processing, input_output_maps, log_prefix=log_prefix) @@ -853,7 +865,8 @@ def handle_update_processing(processing, agent_attributes, logger=None, log_pref ret_new_contents = get_new_contents(request_id, transform_id, workload_id, new_input_output_maps) new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents = ret_new_contents - new_contents = new_input_contents + new_output_contents + new_log_contents + new_input_dependency_contents + # new_contents = new_input_contents + new_output_contents + new_log_contents + new_input_dependency_contents + new_contents = new_input_contents + new_output_contents + new_log_contents content_updates_missing, updated_contents_full_missing = poll_missing_outputs(input_output_maps) @@ -872,7 +885,7 @@ def handle_update_processing(processing, agent_attributes, logger=None, log_pref files=updated_contents_full, relation_type='output') ret_msgs = ret_msgs + msgs - return process_status, new_contents, ret_msgs, content_updates + content_updates_missing, parameters, new_contents_ext, update_contents_ext + return process_status, new_contents, new_input_dependency_contents, ret_msgs, content_updates + content_updates_missing, parameters, new_contents_ext, update_contents_ext def handle_trigger_processing(processing, agent_attributes, logger=None, log_prefix=''): @@ -934,7 +947,7 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre ret_msgs = ret_msgs + msgs # not flusht updated_contents here - # content_updates = content_updates + updated_contents + content_updates = content_updates + updated_contents # updated_contents_full_output_input_deps = updated_contents_full_output + updated_contents_full_input_deps # if updated_contents_full_output or updated_contents_full_input_deps or updated_contents_full_input: @@ -954,12 +967,13 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre for trigger_tf_id in updated_input_contents: logger.debug(log_prefix + "trigger_release_inputs: updated_input_contents[%s][:3] %s" % (trigger_tf_id, updated_input_contents[trigger_tf_id][:3])) - trigger_req_id = updated_input_contents[trigger_tf_id][0]['request_id'] - trigger_workload_id = updated_input_contents[trigger_tf_id][0]['workload_id'] + if updated_input_contents[trigger_tf_id]: + trigger_req_id = updated_input_contents[trigger_tf_id][0]['request_id'] + trigger_workload_id = updated_input_contents[trigger_tf_id][0]['workload_id'] - msgs = generate_messages(trigger_req_id, trigger_tf_id, trigger_workload_id, work, msg_type='file', - files=updated_input_contents[trigger_tf_id], relation_type='input') - ret_msgs = ret_msgs + msgs + msgs = generate_messages(trigger_req_id, trigger_tf_id, trigger_workload_id, work, msg_type='file', + files=updated_input_contents[trigger_tf_id], relation_type='input') + ret_msgs = ret_msgs + msgs return processing['substatus'], content_updates, ret_msgs, {}, update_contents_status_name, update_contents_status @@ -1251,7 +1265,7 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou 'missing_ext_files': coll.missing_ext_files} if terminate: if work.require_ext_contents(): - if coll.processed_files == coll.ext_processed_files and coll.failed_files == coll.failed_ext_files: + if coll.processed_files == coll.processed_ext_files and coll.failed_files == coll.failed_ext_files: all_ext_updated = True if (force_close_collection 
or (close_collection and all_updates_flushed and all_ext_updated) or coll.status == CollectionStatus.Closed): # noqa W503 From c3ed406247072ff2687dbe0136412c85bc140c9f Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 20 Dec 2022 14:35:38 +0100 Subject: [PATCH 36/91] fix processing terminating status --- main/lib/idds/agents/carrier/finisher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main/lib/idds/agents/carrier/finisher.py b/main/lib/idds/agents/carrier/finisher.py index 6d3bfbd3..d9f89676 100644 --- a/main/lib/idds/agents/carrier/finisher.py +++ b/main/lib/idds/agents/carrier/finisher.py @@ -107,6 +107,9 @@ def handle_terminated_processing(self, processing, log_prefix=""): try: processing, update_collections, messages = sync_processing(processing, self.agent_attributes, terminate=True, logger=self.logger, log_prefix=log_prefix) + if processing['status'] == ProcessingStatus.Terminating and is_process_terminated(processing['substatus']): + processing['status'] = processing['substatus'] + update_processing = {'processing_id': processing['processing_id'], 'parameters': {'status': processing['status'], 'locking': ProcessingLocking.Idle}} @@ -142,9 +145,6 @@ def process_terminated_processing(self, event): ret = self.handle_terminated_processing(pr, log_prefix=log_pre) self.logger.info(log_pre + "process_terminated_processing result: %s" % str(ret)) - if pr['status'] == ProcessingStatus.Terminating and is_process_terminated(pr['substatus']): - pr['status'] = pr['substatus'] - self.update_processing(ret, pr) self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id']) From f0d3c80df68b78609700d16ca63fd4af33a92388 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 27 Dec 2022 14:37:10 +0100 Subject: [PATCH 37/91] fix coll_id --- main/lib/idds/core/processings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index f0df8967..7c8a8cf9 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -287,7 +287,7 @@ def update_processing_with_collection_contents(updated_processing, new_processin def resolve_input_dependency_id(new_input_dependency_contents, session=None): coll_ids = [] for content in new_input_dependency_contents: - coll_ids.append(content['content_id']) + coll_ids.append(content['coll_id']) contents = orm_contents.get_contents(coll_id=coll_ids, session=session) content_name_id_map = {} for content in contents: From 5d21acc6f21c0f5104c5880c1aa97bf084b0c9c4 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 27 Dec 2022 16:52:47 +0100 Subject: [PATCH 38/91] use separate log file for messaging listener --- common/lib/idds/common/utils.py | 22 +++++++++++++++++++ .../idds/agents/common/plugins/messaging.py | 5 +++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py index 6cbded24..c21e05a9 100644 --- a/common/lib/idds/common/utils.py +++ b/common/lib/idds/common/utils.py @@ -62,6 +62,28 @@ def setup_logging(name, stream=None, loglevel=None): format='%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') +def get_logger((name, filename=None, loglevel=None): + """ + Setup logging + """ + if loglevel is None: + if config_has_section('common') and config_has_option('common', 'loglevel'): + loglevel = getattr(logging, config_get('common', 
'loglevel').upper()) + else: + loglevel = logging.INFO + + if filename is None: + filename = name + ".log" + formatter = '%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s' + + handler = longging.FileHandler(filename) + handler.setFormatter(formatter) + logger = logging.getLogger(name) + logger.setLevel(level) + logger.addHandler(handler) + return logger + + def get_rest_url_prefix(): if config_has_section('rest') and config_has_option('rest', 'url_prefix'): url_prefix = config_get('rest', 'url_prefix') diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index c442e808..d8c3ce89 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -19,7 +19,7 @@ import stomp from idds.common.plugin.plugin_base import PluginBase -from idds.common.utils import setup_logging +from idds.common.utils import setup_logging, get_logger setup_logging(__name__) @@ -37,7 +37,8 @@ def __init__(self, broker, output_queue): self.name = "MessagingListener" self.__broker = broker self.__output_queue = output_queue - self.logger = logging.getLogger(self.__class__.__name__) + # self.logger = logging.getLogger(self.__class__.__name__) + self.logger = get_logger(self.__class__.__name__, filename="MessageLister.log") def on_error(self, frame): ''' From 7e2a2d04d1ef6457136e6ececac4322709a03db9 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 27 Dec 2022 17:48:00 +0100 Subject: [PATCH 39/91] add get logger to support different file name --- common/lib/idds/common/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py index c21e05a9..bbe0f122 100644 --- a/common/lib/idds/common/utils.py +++ b/common/lib/idds/common/utils.py @@ -62,7 +62,7 @@ def setup_logging(name, stream=None, loglevel=None): format='%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') -def get_logger((name, filename=None, loglevel=None): +def get_logger(name, filename=None, loglevel=None): """ Setup logging """ @@ -76,10 +76,10 @@ def get_logger((name, filename=None, loglevel=None): filename = name + ".log" formatter = '%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s' - handler = longging.FileHandler(filename) + handler = logging.FileHandler(filename) handler.setFormatter(formatter) logger = logging.getLogger(name) - logger.setLevel(level) + logger.setLevel(loglevel) logger.addHandler(handler) return logger From 778095587d5a98e51883c70828018a4973d87183 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 27 Dec 2022 17:48:36 +0100 Subject: [PATCH 40/91] fix client/rest/core functions to get request id from name --- client/lib/idds/client/requestclient.py | 15 ++++++++++++ main/lib/idds/core/requests.py | 15 ++++++++++++ main/lib/idds/orm/requests.py | 26 ++++++++++++++++++++ main/lib/idds/rest/v1/requests.py | 32 ++++++++++++++++++++++++- 4 files changed, 87 insertions(+), 1 deletion(-) diff --git a/client/lib/idds/client/requestclient.py b/client/lib/idds/client/requestclient.py index 33f55045..318acb8c 100644 --- a/client/lib/idds/client/requestclient.py +++ b/client/lib/idds/client/requestclient.py @@ -123,6 +123,21 @@ def get_requests(self, request_id=None, workload_id=None, with_detail=False, wit return requests + def get_request_id_by_name(self, name): + """ + Get request id by name. + + :param name: the request name. + + :returns {name:id} dict. 
+ """ + path = self.REQUEST_BASEURL + path += "/name" + + url = self.build_url(self.host, path=os.path.join(path, name)) + r = self.get_request_response(url, type='GET', data=None) + return r + def abort_request(self, request_id, workload_id=None): """ Abort Request. diff --git a/main/lib/idds/core/requests.py b/main/lib/idds/core/requests.py index 40ebfe46..8bd872a9 100644 --- a/main/lib/idds/core/requests.py +++ b/main/lib/idds/core/requests.py @@ -142,6 +142,21 @@ def get_request_ids_by_workload_id(workload_id, session=None): return orm_requests.get_request_ids_by_workload_id(workload_id, session=session) +@read_session +def get_request_ids_by_name(name, session=None): + """ + Get request ids or raise a NoObject exception. + + :param name: name of the request. + :param session: The database session in use. + + :raises NoObject: If no request is founded. + + :returns: Request {name:id} dict. + """ + return orm_requests.get_request_ids_by_name(name, session=session) + + @transactional_session def get_request_by_id_status(request_id, status=None, locking=False, session=None): req = orm_requests.get_request_by_id_status(request_id=request_id, status=status, locking=locking, session=session) diff --git a/main/lib/idds/orm/requests.py b/main/lib/idds/orm/requests.py index 1d2fee6d..c167a688 100644 --- a/main/lib/idds/orm/requests.py +++ b/main/lib/idds/orm/requests.py @@ -172,6 +172,32 @@ def get_request_ids_by_workload_id(workload_id, session=None): raise exceptions.NoObject('request with workload_id:%s cannot be found: %s' % (workload_id, error)) +@read_session +def get_request_ids_by_name(name, session=None): + """ + Get request ids or raise a NoObject exception. + + :param name: name of the request. + :param session: The database session in use. + + :raises NoObject: If no request is founded. + + :returns: Request {name:id} dict. + """ + try: + query = session.query(models.Request.request_id, models.Request.name)\ + .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)")\ + .filter(models.Request.name.like(name.replace('*', '%'))) + tmp = query.all() + ret_ids = {} + if tmp: + for req in tmp: + ret_ids[req[1]] = req[0] + return ret_ids + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('request with name:%s cannot be found: %s' % (name, error)) + + @read_session def get_request_ids(request_id=None, workload_id=None, session=None): """ diff --git a/main/lib/idds/rest/v1/requests.py b/main/lib/idds/rest/v1/requests.py index 73add121..6790887e 100644 --- a/main/lib/idds/rest/v1/requests.py +++ b/main/lib/idds/rest/v1/requests.py @@ -22,7 +22,8 @@ CommandType) from idds.common.utils import json_loads from idds.core.requests import (add_request, get_requests, - get_request, update_request) + get_request, update_request, + get_request_ids_by_name) from idds.core.messages import add_message from idds.core.commands import add_command from idds.rest.v1.controller import IDDSController @@ -217,6 +218,32 @@ def post_test(self): pprint.pprint(self.get_request().url_rule) +class RequestName(IDDSController): + """ Get id from name. """ + + def get(self, name): + """ Get id from name. + HTTP Success: + 200 OK + HTTP Error: + 404 Not Found + 500 InternalError + :returns: {name:id} dict. 
+ """ + try: + rets = get_request_ids_by_name(name) + except exceptions.NoObject as error: + return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) + except exceptions.IDDSException as error: + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error) + except Exception as error: + print(error) + print(format_exc()) + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) + + return self.generate_http_response(HTTP_STATUS_CODE.OK, data=rets) + + class RequestBuild(IDDSController): """ Create, Update, get and delete Request. """ @@ -403,6 +430,9 @@ def get_blueprint(): bp.add_url_rule('/request/////', view_func=request_view, methods=['get', ]) bp.add_url_rule('/request//////', view_func=request_view, methods=['get', ]) + request_name2id = RequestName.as_view('request_name') + bp.add_url_rule('/request/name/', view_func=request_name2id, methods=['get', ]) + request_build = RequestBuild.as_view('request_build') bp.add_url_rule('/request/build/', view_func=request_build, methods=['post', ]) From 95f738b17a3b9c8d89ce492c48bbe54a7a46292e Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 29 Dec 2022 22:01:38 +0100 Subject: [PATCH 41/91] add contents_update table and trigger on deletion --- main/lib/idds/orm/base/models.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index c2ed3594..d39bb54a 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -564,6 +564,13 @@ class Content(BASE, ModelBase): Index('CONTENTS_REQ_TF_COLL_IDX', 'request_id', 'transform_id', 'coll_id', 'status')) +class Content_update(BASE, ModelBase): + """Represents a content update""" + __tablename__ = 'contents_update' + content_id = Column(BigInteger().with_variant(Integer, "sqlite"), primary_key=True) + substatus = Column(EnumWithValue(ContentStatus)) + + class Content_ext(BASE, ModelBase): """Represents a content extension""" __tablename__ = 'contents_ext' @@ -731,7 +738,7 @@ def register_models(engine): """ # models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content, Health, Message) - models = (Request, Transform, Processing, Collection, Content, Content_ext, Health, Message, Command) + models = (Request, Transform, Processing, Collection, Content, Content_update, Content_ext, Health, Message, Command) for model in models: # if not engine.has_table(model.__tablename__, model.metadata.schema): @@ -744,7 +751,18 @@ def unregister_models(engine): """ # models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content, Health, Message) - models = (Request, Transform, Processing, Collection, Content, Content_ext, Health, Message, Command) + models = (Request, Transform, Processing, Collection, Content, Content_update, Content_ext, Health, Message, Command) for model in models: model.metadata.drop_all(engine) # pylint: disable=maybe-no-member + + +@event.listens_for(Content_update.__table__, "after_create") +def _update_content_dep_status(target, connection, **kw): + DDL(""" + CREATE TRIGGER update_content_dep_status BEFORE DELETE ON contents_update + for each row + BEGIN + UPDATE contents set substatus = :old.substatus where contents.content_dep_id = :old.content_id; + END; + """) From 86637fadb5ce0cb382df0adee007bee0e8728701 
Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 29 Dec 2022 22:02:26 +0100 Subject: [PATCH 42/91] add functions on contents_update table --- main/lib/idds/agents/carrier/poller.py | 1 + main/lib/idds/core/processings.py | 6 +++- main/lib/idds/orm/contents.py | 43 ++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index 071d9c4c..9307b585 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -197,6 +197,7 @@ def update_processing(self, processing, processing_model): messages=processing.get('messages', None), update_messages=processing.get('update_messages', None), new_contents=processing.get('new_contents', None), + new_update_contents=processing.get('new_update_contents', None), new_input_dependency_contents=processing.get('new_input_dependency_contents', None)) except exceptions.DatabaseException as ex: if 'ORA-00060' in str(ex): diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index 7c8a8cf9..e406861e 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -303,7 +303,7 @@ def resolve_input_dependency_id(new_input_dependency_contents, session=None): @transactional_session def update_processing_contents(update_processing, update_contents, update_messages=None, new_contents=None, update_dep_contents=None, update_collections=None, messages=None, - new_input_dependency_contents=None, + new_update_contents=None, new_input_dependency_contents=None, message_bulk_size=2000, session=None): """ Update processing with contents. @@ -315,6 +315,10 @@ def update_processing_contents(update_processing, update_contents, update_messag orm_collections.update_collections(update_collections, session=session) if update_contents: orm_contents.update_contents(update_contents, session=session) + if new_update_contents: + # first add and then delete, to trigger the trigger 'update_content_dep_status'. + orm_contents.add_contents_update(new_update_contents, session=session) + orm_contents.delete_contents_update(session=session) if new_contents: orm_contents.add_contents(new_contents, session=session) if new_input_dependency_contents: diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index d84cfc9b..0c4d6dfa 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -647,6 +647,49 @@ def get_contents_ext_maps(): return default_params +@transactional_session +def add_contents_update(contents, bulk_size=10000, session=None): + """ + Add contents update. + + :param contents: dict of contents. + :param session: session. + + :raises DuplicatedObject: If a collection with the same name exists. + :raises DatabaseException: If there is a database error. + + :returns: content ids. + """ + sub_params = [contents[i:i + bulk_size] for i in range(0, len(contents), bulk_size)] + + try: + for sub_param in sub_params: + session.bulk_insert_mappings(models.Content_update, sub_param) + content_ids = [None for _ in range(len(contents))] + return content_ids + except IntegrityError as error: + raise exceptions.DuplicatedObject('Duplicated objects: %s' % (error)) + except DatabaseError as error: + raise exceptions.DatabaseException(error) + + +@transactional_session +def delete_contents_update(session=None): + """ + delete a content. + + :param content_id: The id of the content. + :param session: The database session in use. 
+ + :raises NoObject: If no content is founded. + :raises DatabaseException: If there is a database error. + """ + try: + session.query(models.Content_update).delete() + except Exception as error: + raise exceptions.NoObject('Content_update deletion error: %s' % (error)) + + @transactional_session def add_contents_ext(contents, bulk_size=10000, session=None): """ From 10a2c4bb298e298be8fc8ea6e341295b32315777 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 29 Dec 2022 22:03:04 +0100 Subject: [PATCH 43/91] optimize carrier.trigger to use orm trigger --- main/lib/idds/agents/carrier/trigger.py | 3 +- main/lib/idds/agents/carrier/utils.py | 98 ++++++++++++------------- 2 files changed, 49 insertions(+), 52 deletions(-) diff --git a/main/lib/idds/agents/carrier/trigger.py b/main/lib/idds/agents/carrier/trigger.py index 4adfdbf5..d6f217ca 100644 --- a/main/lib/idds/agents/carrier/trigger.py +++ b/main/lib/idds/agents/carrier/trigger.py @@ -77,7 +77,7 @@ def handle_trigger_processing(self, processing): self.agent_attributes, logger=self.logger, log_prefix=log_prefix) - process_status, update_contents, ret_msgs, parameters, update_dep_contents_status_name, update_dep_contents_status = ret_trigger_processing + process_status, update_contents, ret_msgs, parameters, update_dep_contents_status_name, update_dep_contents_status, new_update_contents = ret_trigger_processing new_process_status = process_status if is_process_terminated(process_status): @@ -96,6 +96,7 @@ def handle_trigger_processing(self, processing): ret = {'update_processing': update_processing, 'update_contents': update_contents, 'messages': ret_msgs, + 'new_update_contents': new_update_contents, 'update_dep_contents': (processing['request_id'], update_dep_contents_status_name, update_dep_contents_status), 'processing_status': new_process_status} except exceptions.ProcessFormatNotSupported as ex: diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index d5911132..a3f68c06 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -517,6 +517,7 @@ def get_updated_contents_by_request(request_id, transform_id, workload_id, work, def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=None, log_prefix=''): updated_contents, updated_contents_full_input, updated_contents_full_output = [], [], [] updated_contents_full_input_deps = [] + new_update_contents = [] status_to_check = [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, ContentStatus.Missing, ContentStatus.Failed, ContentStatus.Lost, @@ -533,12 +534,18 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=Non u_content = {'content_id': content['content_id'], 'status': content['substatus']} updated_contents.append(u_content) + u_content_substatus = {'content_id': content['content_id'], + 'substatus': content['substatus']} + new_update_contents.append(u_content_substatus) updated_contents_full_input.append(content) for content in outputs: if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: u_content = {'content_id': content['content_id'], 'status': content['substatus']} updated_contents.append(u_content) + u_content_substatus = {'content_id': content['content_id'], + 'substatus': content['substatus']} + new_update_contents.append(u_content_substatus) updated_contents_full_output.append(content) for content in inputs_dependency: if (content['status'] != content['substatus']) and 
content['substatus'] in status_to_check: @@ -546,7 +553,44 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=Non 'status': content['substatus']} updated_contents.append(u_content) updated_contents_full_input_deps.append(content) - return updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps + + input_content_update_status = None + if is_all_contents_available(inputs_dependency): + input_content_update_status = ContentStatus.Available + elif is_all_contents_terminated(inputs_dependency): + input_content_update_status = ContentStatus.Missing + if input_content_update_status: + for content in inputs: + u_content = {'content_id': content['content_id'], + 'status': input_content_update_status, + 'substatus': input_content_update_status} + updated_contents.append(u_content) + content['status'] = input_content_update_status + content['substatus'] = input_content_update_status + updated_contents_full_input.append(content) + u_content_substatus = {'content_id': content['content_id'], + 'substatus': content['substatus']} + new_update_contents.append(u_content_substatus) + + output_content_update_status = None + if is_all_contents_available(inputs): + # wait for the job to finish + pass + elif is_all_contents_terminated_but_not_available(inputs): + output_content_update_status = ContentStatus.Missing + if output_content_update_status: + for content in outputs: + u_content = {'content_id': content['content_id'], + 'status': output_content_update_status, + 'substatus': output_content_update_status} + updated_contents.append(u_content) + content['status'] = output_content_update_status + content['substatus'] = output_content_update_status + updated_contents_full_output.append(content) + u_content_substatus = {'content_id': content['content_id'], + 'substatus': content['substatus']} + new_update_contents.append(u_content_substatus) + return updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps, new_update_contents def get_transform_dependency_map(transform_id, logger=None, log_prefix=''): @@ -908,31 +952,9 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre input_output_maps = get_input_output_maps(transform_id, work) logger.debug(log_prefix + "input_output_maps.keys[:2]: %s" % str(list(input_output_maps.keys())[:2])) - """ - content_updates_trigger_no_deps, updated_input_contents_no_deps = [], [] - content_updates_trigger_no_deps, updated_input_contents_no_deps = trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, - input_output_maps, logger, log_prefix) - logger.debug(log_prefix + "trigger_release_inputs_no_deps: content_updates_trigger_no_deps[:3] %s" % (content_updates_trigger_no_deps[:3])) - # logger.debug(log_prefix + "trigger_release_inputs_no_deps: updated_input_contents_no_deps[:3] %s" % (updated_input_contents_no_deps[:3])) - - content_updates = content_updates + content_updates_trigger_no_deps - if updated_input_contents_no_deps: - for trigger_tf_id in updated_input_contents_no_deps: - logger.debug(log_prefix + "trigger_release_inputs_no_deps: updated_input_contents_no_deps[%s][:3] %s" % (trigger_tf_id, - updated_input_contents_no_deps[trigger_tf_id][:3])) - trigger_req_id = updated_input_contents_no_deps[trigger_tf_id][0]['request_id'] - trigger_workload_id = updated_input_contents_no_deps[trigger_tf_id][0]['workload_id'] - - msgs = generate_messages(trigger_req_id, trigger_tf_id, 
trigger_workload_id, work, msg_type='file', - files=updated_input_contents_no_deps[trigger_tf_id], relation_type='input') - ret_msgs = ret_msgs + msgs - - is_terminated = is_process_terminated(processing['substatus']) - """ - updated_contents_ret = get_updated_contents_by_input_output_maps(input_output_maps=input_output_maps, logger=logger, log_prefix=log_prefix) - updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps = updated_contents_ret + updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps, new_update_contents = updated_contents_ret logger.debug(log_prefix + "handle_trigger_processing: updated_contents[:3] %s" % (updated_contents[:3])) if updated_contents_full_input: @@ -946,35 +968,9 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre files=updated_contents_full_output, relation_type='output') ret_msgs = ret_msgs + msgs - # not flusht updated_contents here content_updates = content_updates + updated_contents - # updated_contents_full_output_input_deps = updated_contents_full_output + updated_contents_full_input_deps - # if updated_contents_full_output or updated_contents_full_input_deps or updated_contents_full_input: - if True: - ret_trigger_release_inputs = trigger_release_inputs(request_id, transform_id, workload_id, work, - updated_contents_full_output, - updated_contents_full_input, - updated_contents_full_input_deps, - input_output_maps, - logger, log_prefix) - content_updates_trigger, updated_input_contents, update_contents_status_name, update_contents_status = ret_trigger_release_inputs - logger.debug(log_prefix + "trigger_release_inputs: content_updates_trigger[:3] %s" % (content_updates_trigger[:3])) - # logger.debug(log_prefix + "trigger_release_inputs: updated_input_contents[:3] %s" % (updated_input_contents[:3])) - - content_updates = content_updates + content_updates_trigger - if updated_input_contents: - for trigger_tf_id in updated_input_contents: - logger.debug(log_prefix + "trigger_release_inputs: updated_input_contents[%s][:3] %s" % (trigger_tf_id, - updated_input_contents[trigger_tf_id][:3])) - if updated_input_contents[trigger_tf_id]: - trigger_req_id = updated_input_contents[trigger_tf_id][0]['request_id'] - trigger_workload_id = updated_input_contents[trigger_tf_id][0]['workload_id'] - - msgs = generate_messages(trigger_req_id, trigger_tf_id, trigger_workload_id, work, msg_type='file', - files=updated_input_contents[trigger_tf_id], relation_type='input') - ret_msgs = ret_msgs + msgs - return processing['substatus'], content_updates, ret_msgs, {}, update_contents_status_name, update_contents_status + return processing['substatus'], content_updates, ret_msgs, {}, {}, {}, new_update_contents def get_content_status_from_panda_msg_status(status): From 1c53606c1193533371a332af866f01b13e97890f Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Fri, 30 Dec 2022 11:32:40 +0100 Subject: [PATCH 44/91] trigger contents dep --- main/lib/idds/agents/carrier/trigger.py | 8 ++++++- main/lib/idds/agents/carrier/utils.py | 7 ++++-- main/lib/idds/core/catalog.py | 29 +++++++++++++++++++++++++ main/lib/idds/core/processings.py | 2 +- main/lib/idds/orm/contents.py | 1 - 5 files changed, 42 insertions(+), 5 deletions(-) diff --git a/main/lib/idds/agents/carrier/trigger.py b/main/lib/idds/agents/carrier/trigger.py index d6f217ca..bf0d116c 100644 --- a/main/lib/idds/agents/carrier/trigger.py +++ b/main/lib/idds/agents/carrier/trigger.py @@ -70,11 
+70,12 @@ def get_trigger_processings(self): self.logger.error(traceback.format_exc()) return [] - def handle_trigger_processing(self, processing): + def handle_trigger_processing(self, processing, trigger_new_updates=False): try: log_prefix = self.get_log_prefix(processing) ret_trigger_processing = handle_trigger_processing(processing, self.agent_attributes, + trigger_new_updates=trigger_new_updates, logger=self.logger, log_prefix=log_prefix) process_status, update_contents, ret_msgs, parameters, update_dep_contents_status_name, update_dep_contents_status, new_update_contents = ret_trigger_processing @@ -164,6 +165,11 @@ def process_trigger_processing(self, event): self.update_processing(ret, pr) + new_update_contents = ret.get('new_update_contents', None) + if new_update_contents: + ret = self.handle_trigger_processing(pr, trigger_new_updates=True) + self.update_processing(ret, pr) + if (('processing_status' in ret and ret['processing_status'] == ProcessingStatus.Terminating) or (event._content and 'Terminated' in event._content and event._content['Terminated'])): # noqa W503 self.logger.info(log_pre + "TerminatedProcessingEvent(processing_id: %s)" % pr['processing_id']) diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index a3f68c06..7380b7c0 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -932,7 +932,7 @@ def handle_update_processing(processing, agent_attributes, logger=None, log_pref return process_status, new_contents, new_input_dependency_contents, ret_msgs, content_updates + content_updates_missing, parameters, new_contents_ext, update_contents_ext -def handle_trigger_processing(processing, agent_attributes, logger=None, log_prefix=''): +def handle_trigger_processing(processing, agent_attributes, trigger_new_updates=False, logger=None, log_prefix=''): logger = get_logger(logger) ret_msgs = [] @@ -947,8 +947,11 @@ def handle_trigger_processing(processing, agent_attributes, logger=None, log_pre work.set_agent_attributes(agent_attributes, processing) if not work.use_dependency_to_release_jobs(): - return processing['substatus'], [], [], {} + return processing['substatus'], [], [], {}, {}, {}, [] else: + if trigger_new_updates: + # delete information in the contents_update table, to invoke the trigger. + core_catalog.delete_contents_update() input_output_maps = get_input_output_maps(transform_id, work) logger.debug(log_prefix + "input_output_maps.keys[:2]: %s" % str(list(input_output_maps.keys())[:2])) diff --git a/main/lib/idds/core/catalog.py b/main/lib/idds/core/catalog.py index b708597c..e25e53bf 100644 --- a/main/lib/idds/core/catalog.py +++ b/main/lib/idds/core/catalog.py @@ -611,6 +611,35 @@ def get_output_contents_by_request_id_status(request_id, name, content_status, l return contents +@transactional_session +def add_contents_update(contents, bulk_size=10000, session=None): + """ + Add contents update. + + :param contents: dict of contents. + :param session: session. + + :raises DuplicatedObject: If a collection with the same name exists. + :raises DatabaseException: If there is a database error. + + :returns: content ids. + """ + return orm_contents.add_contents_update(contents, bulk_size=bulk_size, session=session) + + +@transactional_session +def delete_contents_update(session=None): + """ + delete a content. + + :param session: The database session in use. + + :raises NoObject: If no content is founded. + :raises DatabaseException: If there is a database error. 
+ """ + return orm_contents.delete_contents_update(session=session) + + def get_contents_ext_maps(): return orm_contents.get_contents_ext_maps() diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index e406861e..3e6e3677 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -318,7 +318,7 @@ def update_processing_contents(update_processing, update_contents, update_messag if new_update_contents: # first add and then delete, to trigger the trigger 'update_content_dep_status'. orm_contents.add_contents_update(new_update_contents, session=session) - orm_contents.delete_contents_update(session=session) + # orm_contents.delete_contents_update(session=session) if new_contents: orm_contents.add_contents(new_contents, session=session) if new_input_dependency_contents: diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index 0c4d6dfa..dc7c95b2 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -678,7 +678,6 @@ def delete_contents_update(session=None): """ delete a content. - :param content_id: The id of the content. :param session: The database session in use. :raises NoObject: If no content is founded. From 280bef51402890846cb500501bccdd30ed300aa5 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Fri, 30 Dec 2022 13:44:59 +0100 Subject: [PATCH 45/91] fix get logger --- common/lib/idds/common/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py index bbe0f122..9c84d5f2 100644 --- a/common/lib/idds/common/utils.py +++ b/common/lib/idds/common/utils.py @@ -74,6 +74,13 @@ def get_logger(name, filename=None, loglevel=None): if filename is None: filename = name + ".log" + if not filename.startswith("/"): + if config_has_section('common') and config_has_option('common', 'logdir'): + logdir = config_get('common', 'logdir') + if not logdir: + logdir = '/var/log/idds' + filename = os.path.join(logdir, filename) + formatter = '%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s' handler = logging.FileHandler(filename) From 26b78492b61b9822bde9cc6b9aa7578914d276ea Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Fri, 30 Dec 2022 13:46:35 +0100 Subject: [PATCH 46/91] fix get logger --- common/lib/idds/common/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py index 9c84d5f2..697b9e6b 100644 --- a/common/lib/idds/common/utils.py +++ b/common/lib/idds/common/utils.py @@ -75,6 +75,7 @@ def get_logger(name, filename=None, loglevel=None): if filename is None: filename = name + ".log" if not filename.startswith("/"): + logdir = None if config_has_section('common') and config_has_option('common', 'logdir'): logdir = config_get('common', 'logdir') if not logdir: From 2860809492d0934cd562e8c2cebe76dce6c16015 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 2 Jan 2023 14:15:22 +0100 Subject: [PATCH 47/91] fix logging --- common/lib/idds/common/utils.py | 3 +- main/lib/idds/agents/carrier/receiver.py | 5 +- .../idds/agents/common/plugins/messaging.py | 5 +- main/lib/idds/agents/conductor/conductor.py | 3 +- main/lib/idds/tests/panda_test.py | 7 +- main/lib/idds/tests/test_logger.py | 83 +++++++++++++++++++ 6 files changed, 98 insertions(+), 8 deletions(-) create mode 100644 main/lib/idds/tests/test_logger.py diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py index 697b9e6b..cf8fa0e4 100644 --- 
a/common/lib/idds/common/utils.py +++ b/common/lib/idds/common/utils.py @@ -82,13 +82,14 @@ def get_logger(name, filename=None, loglevel=None): logdir = '/var/log/idds' filename = os.path.join(logdir, filename) - formatter = '%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s' + formatter = logging.Formatter('%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') handler = logging.FileHandler(filename) handler.setFormatter(formatter) logger = logging.getLogger(name) logger.setLevel(loglevel) logger.addHandler(handler) + logger.propagate = False return logger diff --git a/main/lib/idds/agents/carrier/receiver.py b/main/lib/idds/agents/carrier/receiver.py index 5e95ad83..5bde4afa 100644 --- a/main/lib/idds/agents/carrier/receiver.py +++ b/main/lib/idds/agents/carrier/receiver.py @@ -19,7 +19,7 @@ from idds.common.constants import Sections from idds.common.exceptions import AgentPluginError, IDDSException -from idds.common.utils import setup_logging +from idds.common.utils import setup_logging, get_logger from idds.common.utils import json_dumps from idds.core import messages as core_messages, catalog as core_catalog from idds.agents.common.baseagent import BaseAgent @@ -43,6 +43,7 @@ def __init__(self, num_threads=1, bulk_message_delay=5, bulk_message_size=2000, self.bulk_message_delay = int(bulk_message_delay) self.bulk_message_size = int(bulk_message_size) self.message_queue = Queue() + self.logger_receiver = get_logger(self.__class__.__name__) def __del__(self): self.stop_receiver() @@ -68,7 +69,7 @@ def get_output_messages(self): while not self.message_queue.empty(): msg = self.message_queue.get(False) if msg: - # self.logger.debug("Received message: %s" % str(msg)) + self.logger_receiver.debug("Received message: %s" % str(msg)) msgs.append(msg) except Exception as error: self.logger.error("Failed to get output messages: %s, %s" % (error, traceback.format_exc())) diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index d8c3ce89..ce9471f7 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -38,7 +38,7 @@ def __init__(self, broker, output_queue): self.__broker = broker self.__output_queue = output_queue # self.logger = logging.getLogger(self.__class__.__name__) - self.logger = get_logger(self.__class__.__name__, filename="MessageLister.log") + self.logger = get_logger(self.__class__.__name__) def on_error(self, frame): ''' @@ -71,6 +71,9 @@ def __init__(self, name="MessagingSender", **kwargs): self.conns = [] + def setup_logger(self): + self.logger = get_logger(self.__class__.__name__) + def stop(self): self.graceful_stop.set() diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index 94fe967d..ab876dcf 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -19,7 +19,7 @@ from idds.common.constants import (Sections, MessageStatus, MessageDestination) from idds.common.exceptions import AgentPluginError, IDDSException -from idds.common.utils import setup_logging +from idds.common.utils import setup_logging, get_logger from idds.core import messages as core_messages from idds.agents.common.baseagent import BaseAgent @@ -55,6 +55,7 @@ def __init__(self, num_threads=1, retrieve_bulk_size=1000, threshold_to_release_ if replay_times is None: replay_times = 3 self.replay_times = int(replay_times) + # self.logger = 
get_logger(self.__class__.__name__) def __del__(self): self.stop_notifier() diff --git a/main/lib/idds/tests/panda_test.py b/main/lib/idds/tests/panda_test.py index 5fd5a04d..0bf0db2b 100644 --- a/main/lib/idds/tests/panda_test.py +++ b/main/lib/idds/tests/panda_test.py @@ -60,7 +60,7 @@ ret_jobs = ret_jobs + ret[1] print(len(ret_jobs)) -sys.exit(0) +# sys.exit(0) """ jediTaskID = 998 @@ -92,10 +92,11 @@ # task_ids = [2549, 2560] # task_ids = [i for i in range(3692, 3723)] # task_ids = [3834, 3835, 3836] -task_ids = [] +# task_ids = [i for i in range(141294, 142200)] + [i for i in range(141003, 141077)] + [i for i in range(141145, 141255)] +task_ids = [140954, 140955, 142228] for task_id in task_ids: print("Killing %s" % task_id) - # Client.killTask(task_id) + Client.killTask(task_id) """ jobids = [] diff --git a/main/lib/idds/tests/test_logger.py b/main/lib/idds/tests/test_logger.py new file mode 100644 index 00000000..2110925f --- /dev/null +++ b/main/lib/idds/tests/test_logger.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + + +import logging +import os +import sys + +from idds.common.config import (config_has_section, config_has_option, + config_get) + + +def setup_logging(name, stream=None, loglevel=None): + """ + Setup logging + """ + if loglevel is None: + if config_has_section('common') and config_has_option('common', 'loglevel'): + loglevel = getattr(logging, config_get('common', 'loglevel').upper()) + else: + loglevel = logging.INFO + + if stream is None: + if config_has_section('common') and config_has_option('common', 'logdir'): + logging.basicConfig(filename=os.path.join(config_get('common', 'logdir'), name), + level=loglevel, + format='%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') + else: + logging.basicConfig(stream=sys.stdout, level=loglevel, + format='%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') + else: + logging.basicConfig(stream=stream, level=loglevel, + format='%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') + + +def get_logger(name, filename=None, loglevel=None): + """ + Setup logging + """ + if loglevel is None: + if config_has_section('common') and config_has_option('common', 'loglevel'): + loglevel = getattr(logging, config_get('common', 'loglevel').upper()) + else: + loglevel = logging.INFO + + if filename is None: + filename = name + ".log" + if not filename.startswith("/"): + logdir = None + if config_has_section('common') and config_has_option('common', 'logdir'): + logdir = config_get('common', 'logdir') + if not logdir: + logdir = '/var/log/idds' + filename = os.path.join(logdir, filename) + + formatter = logging.Formatter('%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') + + handler = logging.FileHandler(filename) + handler.setFormatter(formatter) + logger = logging.getLogger(name) + logger.setLevel(loglevel) + logger.addHandler(handler) + logger.propagate = False + return logger + + +def test_get_logger(): + logger = get_logger('test', filename='/tmp/wguan/test.log') + logger.info("test") + logger.debug("test1") + print(logger.handlers) + + +if __name__ == '__main__': + setup_logging('test1') + test_get_logger() From eb208b0566ece79ac28ad8ca135ecce1eb54c167 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 2 Jan 2023 
16:49:22 +0100 Subject: [PATCH 48/91] add event counter --- main/lib/idds/agents/common/eventbus/event.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main/lib/idds/agents/common/eventbus/event.py b/main/lib/idds/agents/common/eventbus/event.py index 88e36ceb..76dc33b9 100644 --- a/main/lib/idds/agents/common/eventbus/event.py +++ b/main/lib/idds/agents/common/eventbus/event.py @@ -56,12 +56,14 @@ def __init__(self, publisher_id, event_type=EventType.Event, content=None): self._publisher_id = publisher_id self._event_type = event_type self._timestamp = time.time() + self._counter = 1 self._content = content def to_json(self): ret = {'id': self._id, 'publisher_id': self._publisher_id, 'event_type': (self._event_type.name, self._event_type.value), 'timestamp': self._timestamp, + 'counter': self._counter, 'content': self._content} return ret From b9917c418f00280e6ef0f807adc5963f46597818 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 2 Jan 2023 17:04:26 +0100 Subject: [PATCH 49/91] add event counter --- main/lib/idds/agents/common/eventbus/event.py | 80 +++++++++---------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/main/lib/idds/agents/common/eventbus/event.py b/main/lib/idds/agents/common/eventbus/event.py index 76dc33b9..f133ff0e 100644 --- a/main/lib/idds/agents/common/eventbus/event.py +++ b/main/lib/idds/agents/common/eventbus/event.py @@ -51,12 +51,12 @@ class EventType(Enum): class Event(object): - def __init__(self, publisher_id, event_type=EventType.Event, content=None): + def __init__(self, publisher_id, event_type=EventType.Event, content=None, counter=1): self._id = str(uuid.uuid4()) self._publisher_id = publisher_id self._event_type = event_type self._timestamp = time.time() - self._counter = 1 + self._counter = counter self._content = content def to_json(self): @@ -72,8 +72,8 @@ def __str__(self): class StateClaimEvent(Event): - def __init__(self, publisher_id, event_bus_state, content=None): - super(StateClaimEvent, self).__init__(publisher_id, event_type=EventType.StateClaim, content=content) + def __init__(self, publisher_id, event_bus_state, content=None, counter=1): + super(StateClaimEvent, self).__init__(publisher_id, event_type=EventType.StateClaim, content=content, counter=counter) self._event_bus_state = event_bus_state def to_json(self): @@ -83,8 +83,8 @@ def to_json(self): class DemandEvent(Event): - def __init__(self, publisher_id, demand_type, content=None): - super(DemandEvent, self).__init__(publisher_id, event_type=EventType.Demand, content=content) + def __init__(self, publisher_id, demand_type, content=None, counter=1): + super(DemandEvent, self).__init__(publisher_id, event_type=EventType.Demand, content=content, counter=counter) self._demand_type = demand_type def to_json(self): @@ -94,8 +94,8 @@ def to_json(self): class NewRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None): - super(NewRequestEvent, self).__init__(publisher_id, event_type=EventType.NewRequest, content=content) + def __init__(self, publisher_id, request_id, content=None, counter=1): + super(NewRequestEvent, self).__init__(publisher_id, event_type=EventType.NewRequest, content=content, counter=counter) self._request_id = request_id def to_json(self): @@ -105,8 +105,8 @@ def to_json(self): class UpdateRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None): - super(UpdateRequestEvent, self).__init__(publisher_id, event_type=EventType.UpdateRequest, content=content) + def __init__(self, publisher_id, 
request_id, content=None, counter=1): + super(UpdateRequestEvent, self).__init__(publisher_id, event_type=EventType.UpdateRequest, content=content, counter=counter) self._request_id = request_id def to_json(self): @@ -116,8 +116,8 @@ def to_json(self): class AbortRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None): - super(AbortRequestEvent, self).__init__(publisher_id, event_type=EventType.AbortRequest, content=content) + def __init__(self, publisher_id, request_id, content=None, counter=1): + super(AbortRequestEvent, self).__init__(publisher_id, event_type=EventType.AbortRequest, content=content, counter=counter) self._request_id = request_id def to_json(self): @@ -127,8 +127,8 @@ def to_json(self): class ResumeRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None): - super(ResumeRequestEvent, self).__init__(publisher_id, event_type=EventType.ResumeRequest, content=content) + def __init__(self, publisher_id, request_id, content=None, counter=1): + super(ResumeRequestEvent, self).__init__(publisher_id, event_type=EventType.ResumeRequest, content=content, counter=counter) self._request_id = request_id def to_json(self): @@ -138,8 +138,8 @@ def to_json(self): class ExpireRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None): - super(ExpireRequestEvent, self).__init__(publisher_id, event_type=EventType.ExpireRequest, content=content) + def __init__(self, publisher_id, request_id, content=None, counter=1): + super(ExpireRequestEvent, self).__init__(publisher_id, event_type=EventType.ExpireRequest, content=content, counter=counter) self._request_id = request_id def to_json(self): @@ -149,8 +149,8 @@ def to_json(self): class UpdateCommandEvent(Event): - def __init__(self, publisher_id, command_id, content=None): - super(UpdateCommandEvent, self).__init__(publisher_id, event_type=EventType.UpdateCommand, content=content) + def __init__(self, publisher_id, command_id, content=None, counter=1): + super(UpdateCommandEvent, self).__init__(publisher_id, event_type=EventType.UpdateCommand, content=content, counter=counter) self._command_id = command_id def to_json(self): @@ -160,8 +160,8 @@ def to_json(self): class NewTransformEvent(Event): - def __init__(self, publisher_id, transform_id, content=None): - super(NewTransformEvent, self).__init__(publisher_id, event_type=EventType.NewTransform, content=content) + def __init__(self, publisher_id, transform_id, content=None, counter=1): + super(NewTransformEvent, self).__init__(publisher_id, event_type=EventType.NewTransform, content=content, counter=counter) self._transform_id = transform_id def to_json(self): @@ -171,8 +171,8 @@ def to_json(self): class UpdateTransformEvent(Event): - def __init__(self, publisher_id, transform_id, content=None): - super(UpdateTransformEvent, self).__init__(publisher_id, event_type=EventType.UpdateTransform, content=content) + def __init__(self, publisher_id, transform_id, content=None, counter=1): + super(UpdateTransformEvent, self).__init__(publisher_id, event_type=EventType.UpdateTransform, content=content, counter=counter) self._transform_id = transform_id def to_json(self): @@ -182,8 +182,8 @@ def to_json(self): class AbortTransformEvent(Event): - def __init__(self, publisher_id, transform_id, content=None): - super(AbortTransformEvent, self).__init__(publisher_id, event_type=EventType.AbortTransform, content=content) + def __init__(self, publisher_id, transform_id, content=None, counter=1): + super(AbortTransformEvent, 
self).__init__(publisher_id, event_type=EventType.AbortTransform, content=content, counter=counter) self._transform_id = transform_id def to_json(self): @@ -193,8 +193,8 @@ def to_json(self): class ResumeTransformEvent(Event): - def __init__(self, publisher_id, transform_id, content=None): - super(ResumeTransformEvent, self).__init__(publisher_id, event_type=EventType.ResumeTransform, content=content) + def __init__(self, publisher_id, transform_id, content=None, counter=1): + super(ResumeTransformEvent, self).__init__(publisher_id, event_type=EventType.ResumeTransform, content=content, counter=counter) self._transform_id = transform_id def to_json(self): @@ -204,8 +204,8 @@ def to_json(self): class NewProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None): - super(NewProcessingEvent, self).__init__(publisher_id, event_type=EventType.NewProcessing, content=content) + def __init__(self, publisher_id, processing_id, content=None, counter=1): + super(NewProcessingEvent, self).__init__(publisher_id, event_type=EventType.NewProcessing, content=content, counter=counter) self._processing_id = processing_id def to_json(self): @@ -215,8 +215,8 @@ def to_json(self): class UpdateProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None): - super(UpdateProcessingEvent, self).__init__(publisher_id, event_type=EventType.UpdateProcessing, content=content) + def __init__(self, publisher_id, processing_id, content=None, counter=1): + super(UpdateProcessingEvent, self).__init__(publisher_id, event_type=EventType.UpdateProcessing, content=content, counter=counter) self._processing_id = processing_id def to_json(self): @@ -226,8 +226,8 @@ def to_json(self): class AbortProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None): - super(AbortProcessingEvent, self).__init__(publisher_id, event_type=EventType.AbortProcessing, content=content) + def __init__(self, publisher_id, processing_id, content=None, counter=1): + super(AbortProcessingEvent, self).__init__(publisher_id, event_type=EventType.AbortProcessing, content=content, counter=counter) self._processing_id = processing_id def to_json(self): @@ -237,8 +237,8 @@ def to_json(self): class ResumeProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None): - super(ResumeProcessingEvent, self).__init__(publisher_id, event_type=EventType.ResumeProcessing, content=content) + def __init__(self, publisher_id, processing_id, content=None, counter=1): + super(ResumeProcessingEvent, self).__init__(publisher_id, event_type=EventType.ResumeProcessing, content=content, counter=counter) self._processing_id = processing_id def to_json(self): @@ -248,8 +248,8 @@ def to_json(self): class SyncProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None): - super(SyncProcessingEvent, self).__init__(publisher_id, event_type=EventType.SyncProcessing, content=content) + def __init__(self, publisher_id, processing_id, content=None, counter=1): + super(SyncProcessingEvent, self).__init__(publisher_id, event_type=EventType.SyncProcessing, content=content, counter=counter) self._processing_id = processing_id def to_json(self): @@ -259,8 +259,8 @@ def to_json(self): class TerminatedProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None): - super(TerminatedProcessingEvent, self).__init__(publisher_id, event_type=EventType.TerminatedProcessing, content=content) + def __init__(self, publisher_id, processing_id, 
content=None, counter=1): + super(TerminatedProcessingEvent, self).__init__(publisher_id, event_type=EventType.TerminatedProcessing, content=content, counter=counter) self._processing_id = processing_id def to_json(self): @@ -270,8 +270,8 @@ def to_json(self): class TriggerProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None): - super(TriggerProcessingEvent, self).__init__(publisher_id, event_type=EventType.TriggerProcessing, content=content) + def __init__(self, publisher_id, processing_id, content=None, counter=1): + super(TriggerProcessingEvent, self).__init__(publisher_id, event_type=EventType.TriggerProcessing, content=content, counter=counter) self._processing_id = processing_id def to_json(self): From da877ed43f1af0baee78c2ce5d95f1dae6c37f0c Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 2 Jan 2023 17:28:19 +0100 Subject: [PATCH 50/91] fix to avoid the same inputs are triggered again and again --- main/lib/idds/agents/carrier/utils.py | 48 +++++++++++++++------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 7380b7c0..7c858b6d 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -561,16 +561,17 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=Non input_content_update_status = ContentStatus.Missing if input_content_update_status: for content in inputs: - u_content = {'content_id': content['content_id'], - 'status': input_content_update_status, - 'substatus': input_content_update_status} - updated_contents.append(u_content) - content['status'] = input_content_update_status - content['substatus'] = input_content_update_status - updated_contents_full_input.append(content) - u_content_substatus = {'content_id': content['content_id'], - 'substatus': content['substatus']} - new_update_contents.append(u_content_substatus) + if content['substatus'] != input_content_update_status: + u_content = {'content_id': content['content_id'], + 'status': input_content_update_status, + 'substatus': input_content_update_status} + updated_contents.append(u_content) + content['status'] = input_content_update_status + content['substatus'] = input_content_update_status + updated_contents_full_input.append(content) + u_content_substatus = {'content_id': content['content_id'], + 'substatus': content['substatus']} + new_update_contents.append(u_content_substatus) output_content_update_status = None if is_all_contents_available(inputs): @@ -580,16 +581,17 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=Non output_content_update_status = ContentStatus.Missing if output_content_update_status: for content in outputs: - u_content = {'content_id': content['content_id'], - 'status': output_content_update_status, - 'substatus': output_content_update_status} - updated_contents.append(u_content) - content['status'] = output_content_update_status - content['substatus'] = output_content_update_status - updated_contents_full_output.append(content) - u_content_substatus = {'content_id': content['content_id'], - 'substatus': content['substatus']} - new_update_contents.append(u_content_substatus) + if content['substatus'] != output_content_update_status: + u_content = {'content_id': content['content_id'], + 'status': output_content_update_status, + 'substatus': output_content_update_status} + updated_contents.append(u_content) + content['status'] = output_content_update_status + 
content['substatus'] = output_content_update_status + updated_contents_full_output.append(content) + u_content_substatus = {'content_id': content['content_id'], + 'substatus': content['substatus']} + new_update_contents.append(u_content_substatus) return updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps, new_update_contents @@ -1211,7 +1213,7 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou if content['status'] in [ContentStatus.Available, ContentStatus.Mapped, ContentStatus.Available.value, ContentStatus.Mapped.value, ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: - coll_status[content['coll_id']]['processed_extfiles'] += 1 + coll_status[content['coll_id']]['processed_ext_files'] += 1 elif content['status'] in [ContentStatus.Failed, ContentStatus.FinalFailed]: coll_status[content['coll_id']]['failed_ext_files'] += 1 elif content['status'] in [ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: @@ -1344,6 +1346,10 @@ def sync_processing(processing, agent_attributes, terminate=False, logger=None, msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='content_ext', files=contents_ext, relation_type='output', input_output_maps=input_output_maps) messages += msgs + + if processing['status'] == ProcessingStatus.Terminating and is_process_terminated(processing['substatus']): + processing['status'] = processing['substatus'] + return processing, update_collections, messages From 96f59f31055e6560e4aa1b1612588e6eec60b837 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 2 Jan 2023 17:31:38 +0100 Subject: [PATCH 51/91] fix core_processing to add ext contents --- main/lib/idds/agents/conductor/conductor.py | 3 ++- main/lib/idds/core/processings.py | 5 +++++ main/lib/idds/orm/contents.py | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index ab876dcf..41bae669 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -19,7 +19,8 @@ from idds.common.constants import (Sections, MessageStatus, MessageDestination) from idds.common.exceptions import AgentPluginError, IDDSException -from idds.common.utils import setup_logging, get_logger +# from idds.common.utils import setup_logging, get_logger +from idds.common.utils import setup_logging from idds.core import messages as core_messages from idds.agents.common.baseagent import BaseAgent diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index 3e6e3677..e39cbc8c 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -304,6 +304,7 @@ def resolve_input_dependency_id(new_input_dependency_contents, session=None): def update_processing_contents(update_processing, update_contents, update_messages=None, new_contents=None, update_dep_contents=None, update_collections=None, messages=None, new_update_contents=None, new_input_dependency_contents=None, + new_contents_ext=None, update_contents_ext=None, message_bulk_size=2000, session=None): """ Update processing with contents. 
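For context, a minimal sketch (not part of the patch itself) of how the widened update_processing_contents() signature is assumed to be driven from an agent once a poll cycle has produced ext-content records; the helper name flush_poll_result and the layout of the ret dict are illustrative assumptions, mirroring the poller changes later in this series:

# Illustrative sketch only, assuming 'ret' follows the poller's result-dict layout.
from idds.core import processings as core_processings

def flush_poll_result(update_processing, ret):
    # Persist the regular content updates and the new ext-content records in one
    # transactional call, so contents and contents_ext stay consistent.
    core_processings.update_processing_contents(
        update_processing=update_processing,
        update_contents=ret.get('update_contents', None),
        new_contents=ret.get('new_contents', None),
        new_contents_ext=ret.get('new_contents_ext', None),
        update_contents_ext=ret.get('update_contents_ext', None),
        update_messages=ret.get('messages', None))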
@@ -321,6 +322,10 @@ def update_processing_contents(update_processing, update_contents, update_messag # orm_contents.delete_contents_update(session=session) if new_contents: orm_contents.add_contents(new_contents, session=session) + if new_contents_ext: + orm_contents.add_contents_ext(new_contents_ext, session=session) + if update_contents_ext: + orm_contents.update_contents_ext(update_contents_ext, session=session) if new_input_dependency_contents: new_input_dependency_contents = resolve_input_dependency_id(new_input_dependency_contents, session=session) orm_contents.add_contents(new_input_dependency_contents, session=session) diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index dc7c95b2..9a5a20ad 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -834,7 +834,8 @@ def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, c rets = [] if tmp: for t in tmp: - rets.append(t.to_dict()) + t2 = dict(zip(t.keys(), t)) + rets.append(t2) return rets except sqlalchemy.orm.exc.NoResultFound as error: raise exceptions.NoObject('No record can be found with (transform_id=%s): %s' % From d1af02375958b98b94fca09a526dcd8aac3f416a Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 2 Jan 2023 17:32:30 +0100 Subject: [PATCH 52/91] record counter in carrier --- main/lib/idds/agents/carrier/finisher.py | 41 ++++++++++++------------ main/lib/idds/agents/carrier/poller.py | 14 ++++++-- main/lib/idds/agents/carrier/trigger.py | 7 ++-- 3 files changed, 37 insertions(+), 25 deletions(-) diff --git a/main/lib/idds/agents/carrier/finisher.py b/main/lib/idds/agents/carrier/finisher.py index d9f89676..813bd525 100644 --- a/main/lib/idds/agents/carrier/finisher.py +++ b/main/lib/idds/agents/carrier/finisher.py @@ -18,7 +18,7 @@ from .utils import (handle_abort_processing, handle_resume_processing, - is_process_terminated, + # is_process_terminated, sync_processing) from .poller import Poller @@ -107,9 +107,6 @@ def handle_terminated_processing(self, processing, log_prefix=""): try: processing, update_collections, messages = sync_processing(processing, self.agent_attributes, terminate=True, logger=self.logger, log_prefix=log_prefix) - if processing['status'] == ProcessingStatus.Terminating and is_process_terminated(processing['substatus']): - processing['status'] = processing['substatus'] - update_processing = {'processing_id': processing['processing_id'], 'parameters': {'status': processing['status'], 'locking': ProcessingLocking.Idle}} @@ -135,26 +132,30 @@ def process_terminated_processing(self, event): self.number_workers += 1 try: if event: - pr = self.get_processing(processing_id=event._processing_id, locking=True) - if not pr: - self.logger.error("Cannot find processing for event: %s" % str(event)) + if event._counter > 3: + self.logger.warn("Event counter is bigger than 3, skip event: %s" % str(event)) else: - log_pre = self.get_log_prefix(pr) + original_event = event + pr = self.get_processing(processing_id=event._processing_id, locking=True) + if not pr: + self.logger.error("Cannot find processing for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(pr) - self.logger.info(log_pre + "process_terminated_processing") - ret = self.handle_terminated_processing(pr, log_prefix=log_pre) - self.logger.info(log_pre + "process_terminated_processing result: %s" % str(ret)) + self.logger.info(log_pre + "process_terminated_processing") + ret = self.handle_terminated_processing(pr, log_prefix=log_pre) + self.logger.info(log_pre + 
"process_terminated_processing result: %s" % str(ret)) - self.update_processing(ret, pr) - self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) - event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id']) - self.event_bus.send(event) - - if pr['status'] not in [ProcessingStatus.Finished, ProcessingStatus.Failed, ProcessingStatus.SubFinished]: - # some files are missing, poll it. - self.logger.info(log_pre + "UpdateProcessingEvent(processing_id: %s)" % pr['processing_id']) - event = UpdateProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + self.update_processing(ret, pr) + self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) + event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id']) self.event_bus.send(event) + + if pr['status'] not in [ProcessingStatus.Finished, ProcessingStatus.Failed, ProcessingStatus.SubFinished]: + # some files are missing, poll it. + self.logger.info(log_pre + "UpdateProcessingEvent(processing_id: %s)" % pr['processing_id']) + event = UpdateProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], counter=original_event._counter + 1) + self.event_bus.send(event) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index 9307b585..ac244ec4 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -198,6 +198,8 @@ def update_processing(self, processing, processing_model): update_messages=processing.get('update_messages', None), new_contents=processing.get('new_contents', None), new_update_contents=processing.get('new_update_contents', None), + new_contents_ext=processing.get('new_contents_ext', None), + update_contents_ext=processing.get('update_contents_ext', None), new_input_dependency_contents=processing.get('new_input_dependency_contents', None)) except exceptions.DatabaseException as ex: if 'ORA-00060' in str(ex): @@ -348,6 +350,7 @@ def process_update_processing(self, event): self.number_workers += 1 try: if event: + original_event = event self.logger.info("process_update_processing, event: %s" % str(event)) pr = self.get_processing(processing_id=event._processing_id, status=None, locking=True) @@ -368,18 +371,23 @@ def process_update_processing(self, event): event_content['has_updates'] = True if is_process_terminated(pr['substatus']): event_content['Terminated'] = True - event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], content=event_content) + event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], content=event_content, + counter=original_event._counter) self.event_bus.send(event) elif 'processing_status' in ret and ret['processing_status'] == ProcessingStatus.Terminating: self.logger.info(log_pre + "TerminatedProcessingEvent(processing_id: %s)" % pr['processing_id']) - event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], + counter=original_event._counter) self.event_bus.send(event) else: if (('update_contents' in ret and ret['update_contents']) or ('new_contents' in ret and ret['new_contents']) # noqa W503 + or ('new_contents_ext' in ret and ret['new_contents_ext']) # noqa W503 + or ('update_contents_ext' in ret and 
ret['update_contents_ext']) # noqa W503 or ('messages' in ret and ret['messages'])): # noqa E129 self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) - event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], + counter=original_event._counter) self.event_bus.send(event) except Exception as ex: self.logger.error(ex) diff --git a/main/lib/idds/agents/carrier/trigger.py b/main/lib/idds/agents/carrier/trigger.py index bf0d116c..68426855 100644 --- a/main/lib/idds/agents/carrier/trigger.py +++ b/main/lib/idds/agents/carrier/trigger.py @@ -152,6 +152,7 @@ def process_trigger_processing(self, event): self.number_workers += 1 try: if event: + original_event = event # pr_status = [ProcessingStatus.New] self.logger.info("process_trigger_processing, event: %s" % str(event)) pr = self.get_processing(processing_id=event._processing_id, status=None, locking=True) @@ -173,7 +174,8 @@ def process_trigger_processing(self, event): if (('processing_status' in ret and ret['processing_status'] == ProcessingStatus.Terminating) or (event._content and 'Terminated' in event._content and event._content['Terminated'])): # noqa W503 self.logger.info(log_pre + "TerminatedProcessingEvent(processing_id: %s)" % pr['processing_id']) - event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], content=event._content) + event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], content=event._content, + counter=original_event._counter) self.event_bus.send(event) else: if ((event._content and 'has_updates' in event._content and event._content['has_updates']) @@ -181,7 +183,8 @@ def process_trigger_processing(self, event): or ('new_contents' in ret and ret['new_contents']) # noqa W503 or ('messages' in ret and ret['messages'])): # noqa E129 self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) - event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], + counter=original_event._counter) self.event_bus.send(event) except Exception as ex: self.logger.error(ex) From 8efb18763edaa4087c8e0910fb1623418a4783ea Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 3 Jan 2023 14:35:06 +0100 Subject: [PATCH 53/91] optimize logger in messaging --- common/lib/idds/common/plugin/plugin_base.py | 24 +++++++++--- common/lib/idds/common/plugin/plugin_utils.py | 9 +++-- main/lib/idds/agents/carrier/receiver.py | 6 +-- main/lib/idds/agents/common/baseagent.py | 8 ++-- .../idds/agents/common/plugins/messaging.py | 37 +++++++++++++------ main/lib/idds/agents/conductor/conductor.py | 6 +-- 6 files changed, 58 insertions(+), 32 deletions(-) diff --git a/common/lib/idds/common/plugin/plugin_base.py b/common/lib/idds/common/plugin/plugin_base.py index b71619c2..85006782 100644 --- a/common/lib/idds/common/plugin/plugin_base.py +++ b/common/lib/idds/common/plugin/plugin_base.py @@ -19,12 +19,12 @@ class PluginBase(object): - def __init__(self, **kwargs): + def __init__(self, logger=None, **kwargs): for key in kwargs: setattr(self, key, kwargs[key]) - self.logger = None - self.setup_logger() + self.logger = logger + self.setup_logger(self.logger) self.plugins = self.load_plugins(kwargs) def get_class_name(self): @@ -39,9 +39,20 @@ def setup_logger(self, logger=None): else: self.logger = 
logging.getLogger(self.get_class_name()) + @property + def logger(self): + return self._logger + + @logger.setter + def logger(self, logger): + self._logger = logger + def set_logger(self, logger): self.logger = logger + def get_logger(self): + return self.logger + def __call__(self, **kwargs): return exceptions.NotImplementedException(self.get_class_name()) @@ -61,7 +72,7 @@ def load_plugin_attributes(self, name, plugin, kwargs): attrs[attr_name] = value return attrs - def load_plugin(self, name, plugin, kwargs): + def load_plugin(self, name, plugin, logger, kwargs): """ Load plugin """ @@ -71,10 +82,11 @@ def load_plugin(self, name, plugin, kwargs): plugin_class = plugin[k + 1:] module = __import__(plugin_modules, fromlist=[None]) cls = getattr(module, plugin_class) + attrs['logger'] = logger impl = cls(**attrs) return impl - def load_plugins(self, kwargs): + def load_plugins(self, kwargs, logger=None): if not kwargs: return {} @@ -83,5 +95,5 @@ def load_plugins(self, kwargs): if key.startswith('plugin.'): if key.count('.') == 1: plugin_name = key.replace('plugin.', '').strip() - plugins[plugin_name] = self.load_plugin(plugin_name, value, kwargs) + plugins[plugin_name] = self.load_plugin(plugin_name, value, logger=logger, kwargs=kwargs) return plugins diff --git a/common/lib/idds/common/plugin/plugin_utils.py b/common/lib/idds/common/plugin/plugin_utils.py index 5889ea88..0b77da3b 100644 --- a/common/lib/idds/common/plugin/plugin_utils.py +++ b/common/lib/idds/common/plugin/plugin_utils.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2022 """ @@ -57,7 +57,7 @@ def load_plugin_attributes(config_section, name, plugin): return attrs -def load_plugin(config_section, name, plugin): +def load_plugin(config_section, name, plugin, logger=None): """ Load plugin attributes """ @@ -67,11 +67,12 @@ def load_plugin(config_section, name, plugin): plugin_class = plugin[k + 1:] module = __import__(plugin_modules, fromlist=[None]) cls = getattr(module, plugin_class) + attrs['logger'] = logger impl = cls(**attrs) return impl -def load_plugins(config_section): +def load_plugins(config_section, logger=None): """ Load plugins """ @@ -82,5 +83,5 @@ def load_plugins(config_section): if option.startswith('plugin.'): if option.count('.') == 1: plugin_name = option.replace('plugin.', '').strip() - plugins[plugin_name] = load_plugin(config_section, plugin_name, value) + plugins[plugin_name] = load_plugin(config_section, plugin_name, value, logger=logger) return plugins diff --git a/main/lib/idds/agents/carrier/receiver.py b/main/lib/idds/agents/carrier/receiver.py index 5bde4afa..39d400e8 100644 --- a/main/lib/idds/agents/carrier/receiver.py +++ b/main/lib/idds/agents/carrier/receiver.py @@ -43,7 +43,7 @@ def __init__(self, num_threads=1, bulk_message_delay=5, bulk_message_size=2000, self.bulk_message_delay = int(bulk_message_delay) self.bulk_message_size = int(bulk_message_size) self.message_queue = Queue() - self.logger_receiver = get_logger(self.__class__.__name__) + self.logger = get_logger(self.__class__.__name__) def __del__(self): self.stop_receiver() @@ -55,7 +55,7 @@ def start_receiver(self): self.logger.info("Starting receiver: %s" % self.receiver) self.receiver.set_output_queue(self.message_queue) - self.set_logger(self.logger) + self.setup_logger(self.logger) self.receiver.start() def stop_receiver(self): @@ -69,7 +69,7 @@ def get_output_messages(self): while not self.message_queue.empty(): msg = self.message_queue.get(False) 
if msg: - self.logger_receiver.debug("Received message: %s" % str(msg)) + self.logger.debug("Received message: %s" % str(msg)) msgs.append(msg) except Exception as error: self.logger.error("Failed to get output messages: %s, %s" % (error, traceback.format_exc())) diff --git a/main/lib/idds/agents/common/baseagent.py b/main/lib/idds/agents/common/baseagent.py index 5a5d859c..14e1f65f 100644 --- a/main/lib/idds/agents/common/baseagent.py +++ b/main/lib/idds/agents/common/baseagent.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2022 import os import socket @@ -34,11 +34,11 @@ class BaseAgent(TimerScheduler, PluginBase): The base IDDS agent class """ - def __init__(self, num_threads=1, name=None, **kwargs): + def __init__(self, num_threads=1, name=None, logger=None, **kwargs): super(BaseAgent, self).__init__(num_threads, name=name) self.name = self.__class__.__name__ self.id = str(uuid.uuid4())[:8] - self.logger = None + self.logger = logger self.setup_logger(self.logger) self.config_section = Sections.Common @@ -91,7 +91,7 @@ def load_plugin_sequence(self): self.plugin_sequence = load_plugin_sequence(self.config_section) def load_plugins(self): - self.plugins = load_plugins(self.config_section) + self.plugins = load_plugins(self.config_section, logger=self.logger) """ for plugin_name in self.plugin_sequence: if plugin_name not in self.plugins: diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index ce9471f7..a39a1b4c 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -30,7 +30,7 @@ class MessagingListener(stomp.ConnectionListener): ''' Messaging Listener ''' - def __init__(self, broker, output_queue): + def __init__(self, broker, output_queue, logger=None): ''' __init__ ''' @@ -38,7 +38,10 @@ def __init__(self, broker, output_queue): self.__broker = broker self.__output_queue = output_queue # self.logger = logging.getLogger(self.__class__.__name__) - self.logger = get_logger(self.__class__.__name__) + if logger: + self.logger = logger + else: + self.logger = get_logger(self.__class__.__name__) def on_error(self, frame): ''' @@ -53,11 +56,12 @@ def on_message(self, frame): class MessagingSender(PluginBase, threading.Thread): - def __init__(self, name="MessagingSender", **kwargs): + def __init__(self, name="MessagingSender", logger=None, **kwargs): threading.Thread.__init__(self, name=name) - super(MessagingSender, self).__init__(name=name, **kwargs) + super(MessagingSender, self).__init__(name=name, logger=logger, **kwargs) - self.setup_logger() + if logger: + self.logger = logger self.graceful_stop = threading.Event() self.request_queue = None self.output_queue = None @@ -71,8 +75,17 @@ def __init__(self, name="MessagingSender", **kwargs): self.conns = [] - def setup_logger(self): - self.logger = get_logger(self.__class__.__name__) + def setup_logger(self, logger): + if logger: + self.logger = logger + else: + self.logger = get_logger(self.__class__.__name__) + + def set_logger(self, logger): + self.logger = logger + + def get_logger(self): + return self.logger def stop(self): self.graceful_stop.set() @@ -200,14 +213,14 @@ def __call__(self): class MessagingReceiver(MessagingSender): - def __init__(self, name="MessagingReceiver", **kwargs): - super(MessagingReceiver, self).__init__(name=name, **kwargs) + def __init__(self, name="MessagingReceiver", logger=None, **kwargs): + 
super(MessagingReceiver, self).__init__(name=name, logger=logger, **kwargs) self.listener = None self.receiver_conns = [] def get_listener(self, broker): if self.listener is None: - self.listener = MessagingListener(broker, self.output_queue) + self.listener = MessagingListener(broker, self.output_queue, logger=self.logger) return self.listener def subscribe(self): @@ -264,8 +277,8 @@ def __call__(self): class MessagingMessager(MessagingReceiver): - def __init__(self, name="MessagingMessager", **kwargs): - super(MessagingMessager, self).__init__(name=name, **kwargs) + def __init__(self, name="MessagingMessager", logger=None, **kwargs): + super(MessagingMessager, self).__init__(name=name, logger=logger, **kwargs) def execute_send_subscribe(self): try: diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index 41bae669..152f0a68 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -19,8 +19,7 @@ from idds.common.constants import (Sections, MessageStatus, MessageDestination) from idds.common.exceptions import AgentPluginError, IDDSException -# from idds.common.utils import setup_logging, get_logger -from idds.common.utils import setup_logging +from idds.common.utils import setup_logging, get_logger from idds.core import messages as core_messages from idds.agents.common.baseagent import BaseAgent @@ -56,7 +55,7 @@ def __init__(self, num_threads=1, retrieve_bulk_size=1000, threshold_to_release_ if replay_times is None: replay_times = 3 self.replay_times = int(replay_times) - # self.logger = get_logger(self.__class__.__name__) + self.logger = get_logger(self.__class__.__name__) def __del__(self): self.stop_notifier() @@ -104,6 +103,7 @@ def start_notifier(self): self.logger.info("Starting notifier: %s" % self.notifier) self.notifier.set_request_queue(self.message_queue) self.notifier.set_response_queue(self.output_message_queue) + self.notifier.set_logger(self.logger) self.notifier.start() def stop_notifier(self): From 98ac85494670905f4d1c607126c4176988c6eb3a Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 4 Jan 2023 13:48:37 +0100 Subject: [PATCH 54/91] fix to send messages to different destinations --- main/lib/idds/agents/common/plugins/messaging.py | 10 +++++----- main/lib/idds/agents/conductor/conductor.py | 7 +++++-- main/lib/idds/orm/messages.py | 8 +++++++- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index a39a1b4c..b72f80c5 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -152,7 +152,7 @@ def get_connection(self, destination): if not conn.is_connected(): # conn.start() conn.connect(self.username, self.password, wait=True) - return conn, queue_dest + return conn, queue_dest, destination except Exception as error: self.logger.error("Failed to connect to message broker(will re-resolve brokers): %s" % str(error)) @@ -166,16 +166,16 @@ def get_connection(self, destination): queue_dest = self.channels[destination]['destination'] if not conn.is_connected(): conn.connect(self.username, self.password, wait=True) - return conn, queue_dest + return conn, queue_dest, destination except Exception as error: self.logger.error("Failed to connect to message broker(will re-resolve brokers): %s" % str(error)) def send_message(self, msg): destination = msg['destination'] if 'destination' in msg else 'default' - 
conn, queue_dest = self.get_connection(destination) + conn, queue_dest, destination = self.get_connection(destination) - self.logger.info("Sending message to message broker: %s" % msg['msg_id']) - self.logger.debug("Sending message to message broker: %s" % json.dumps(msg['msg_content'])) + self.logger.info("Sending message to message broker(%s): %s" % (destination, msg['msg_id'])) + self.logger.debug("Sending message to message broker(%s): %s" % (destination, json.dumps(msg['msg_content']))) conn.send(body=json.dumps(msg['msg_content']), destination=queue_dest, id='atlas-idds-messaging', diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index 152f0a68..7ed50a16 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -64,9 +64,10 @@ def get_messages(self): """ Get messages """ + destination = [MessageDestination.Outside, MessageDestination.ContentExt] messages = core_messages.retrieve_messages(status=MessageStatus.New, bulk_size=self.retrieve_bulk_size, - destination=MessageDestination.Outside) + destination=destination) # self.logger.debug("Main thread get %s new messages" % len(messages)) if messages: @@ -79,7 +80,7 @@ def get_messages(self): messages_d = core_messages.retrieve_messages(status=MessageStatus.Delivered, retries=retry, delay=delay, bulk_size=self.retrieve_bulk_size, - destination=MessageDestination.Outside) + destination=destination) if messages_d: self.logger.info("Main thread get %s retries messages" % len(messages_d)) retry_messages += messages_d @@ -142,6 +143,8 @@ def run(self): num_contents = 0 messages = self.get_messages() for message in messages: + message['destination'] = message['destination'].name + num_contents += message['num_contents'] self.message_queue.put(message) while not self.message_queue.empty(): diff --git a/main/lib/idds/orm/messages.py b/main/lib/idds/orm/messages.py index e316f3f3..50f090d9 100644 --- a/main/lib/idds/orm/messages.py +++ b/main/lib/idds/orm/messages.py @@ -132,6 +132,12 @@ def retrieve_messages(bulk_size=1000, msg_type=None, status=None, source=None, """ messages = [] try: + if destination is not None: + if not isinstance(destination, (list, tuple)): + destination = [destination] + if len(destination) == 1: + destination = [destination[0], destination[0]] + query = session.query(models.Message) if request_id is not None: query = query.with_hint(models.Message, "INDEX(MESSAGES MESSAGES_TYPE_ST_IDX)", 'oracle') @@ -149,7 +155,7 @@ def retrieve_messages(bulk_size=1000, msg_type=None, status=None, source=None, if source is not None: query = query.filter_by(source=source) if destination is not None: - query = query.filter_by(destination=destination) + query = query.filter(models.Message.destination.in_(destination)) if request_id is not None: query = query.filter_by(request_id=request_id) if workload_id is not None: From e3ceb369354dfd5aeec5f69103e4e1e9247bd90e Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 4 Jan 2023 13:49:30 +0100 Subject: [PATCH 55/91] fix to fill content ext --- doma/lib/idds/doma/workflowv2/domapandawork.py | 12 +++++++++++- main/lib/idds/agents/carrier/utils.py | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index ea4b4228..5b8ef9f7 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -950,6 +950,7 @@ def 
get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m self.logger.debug("get_update_contents, num_updated_contents: %s, num_unupdated_contents: %s" % (num_updated_contents, num_unupdated_contents)) self.logger.debug("get_update_contents, update_contents[:3]: %s" % (str(update_contents[:3]))) + self.logger.debug("get_update_contents, contents_ext_full[:3]: %s" % (str({k: contents_ext_full[k] for k in list(contents_ext_full.keys())[:3]}))) return update_contents, update_contents_full, contents_ext_full def get_contents_ext_detail(self, contents_ext_full, contents_ext_ids, job_info_maps={}): @@ -970,6 +971,8 @@ def get_contents_ext_detail(self, contents_ext_full, contents_ext_ids, job_info_ 'status': content['status']} for job_info_item in job_info_maps: new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if new_content_ext[job_info_item] == 'NULL': + new_content_ext[job_info_item] = None new_contents_ext.append(new_content_ext) for to_update_id in to_update_ids: @@ -978,13 +981,17 @@ def get_contents_ext_detail(self, contents_ext_full, contents_ext_ids, job_info_ update_content_ext = {'content_id': content['content_id'], 'status': content['status']} for job_info_item in job_info_maps: update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if update_content_ext[job_info_item] == 'NULL': + update_content_ext[job_info_item] = None update_contents_ext.append(update_content_ext) return new_contents_ext, update_contents_ext def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, job_info_maps={}): + self.logger.debug("get_contents_ext, len(contents_ext): %s" % (str(len(contents_ext)))) + self.logger.debug("get_contents_ext, contents_ext[:3]: %s" % (str(contents_ext[:3]))) contents_ext_ids = [content['content_id'] for content in contents_ext] contents_ext_ids = set(contents_ext_ids) - contents_ext_panda_ids = [content['PandaID'] for content in contents_ext] + contents_ext_panda_ids = [content['panda_id'] for content in contents_ext] contents_ext_panda_ids = set(contents_ext_panda_ids) new_contents_ext, update_contents_ext = [], [] @@ -1043,6 +1050,9 @@ def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, j new_contents_ext = new_contents_ext + new_contents_ext1 update_contents_ext = update_contents_ext + update_contents_ext1 + self.logger.debug("get_contents_ext, new_contents_ext[:1]: %s" % (str(new_contents_ext[:1]))) + self.logger.debug("get_contents_ext, update_contents_ext[:1]: %s" % (str(update_contents_ext[:1]))) + self.logger.debug("get_contents_ext, left_panda_ids[:3]: %s" % (str(left_panda_ids[:3]))) return new_contents_ext, update_contents_ext, left_panda_ids def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext=None, job_info_maps={}, log_prefix=''): diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 7c858b6d..179bbd45 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -1265,7 +1265,7 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou 'failed_ext_files': coll.failed_ext_files, 'missing_ext_files': coll.missing_ext_files} if terminate: - if work.require_ext_contents(): + if coll in output_collections and work.require_ext_contents(): if coll.processed_files == coll.processed_ext_files and coll.failed_files == coll.failed_ext_files: all_ext_updated = True if (force_close_collection or (close_collection and 
all_updates_flushed and all_ext_updated) From 6e664c5f6b409d6058e2df131ad13f4f0614a882 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 4 Jan 2023 15:18:39 +0100 Subject: [PATCH 56/91] fix rest status code --- main/lib/idds/rest/v1/controller.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main/lib/idds/rest/v1/controller.py b/main/lib/idds/rest/v1/controller.py index c1401080..1255f772 100644 --- a/main/lib/idds/rest/v1/controller.py +++ b/main/lib/idds/rest/v1/controller.py @@ -56,11 +56,13 @@ def generate_message(self, exc_cls=None, exc_msg=None): def generate_http_response(self, status_code, data=None, exc_cls=None, exc_msg=None): enable_json_outputs = self.get_request().args.get('json_outputs', None) - if enable_json_outputs and enable_json_outputs.upper == 'TRUE': + if enable_json_outputs and enable_json_outputs.upper() == 'TRUE': error = None if exc_cls: error = {'ExceptionClass': exc_cls, 'ExceptionMessage': self.generate_message(exc_cls, exc_msg)} + if status_code == HTTP_STATUS_CODE.OK: + status_code = 0 response = {'ret_code': status_code, 'data': data, 'error': error} From a927f2ac706b08376940301a21e9925b733b4dce Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 4 Jan 2023 18:00:36 +0100 Subject: [PATCH 57/91] fix content relation type in get_contents --- main/lib/idds/core/catalog.py | 14 ++- main/lib/idds/orm/contents.py | 156 ++-------------------------------- 2 files changed, 17 insertions(+), 153 deletions(-) diff --git a/main/lib/idds/core/catalog.py b/main/lib/idds/core/catalog.py index e25e53bf..06bbd8ae 100644 --- a/main/lib/idds/core/catalog.py +++ b/main/lib/idds/core/catalog.py @@ -16,7 +16,7 @@ from idds.common import exceptions from idds.common.constants import (CollectionType, CollectionStatus, CollectionLocking, - CollectionRelationType, ContentStatus) + CollectionRelationType, ContentStatus, ContentRelationType) from idds.orm.base.session import read_session, transactional_session from idds.orm import (transforms as orm_transforms, collections as orm_collections, @@ -314,7 +314,17 @@ def get_contents(coll_scope=None, coll_name=None, request_id=None, workload_id=N coll_ids = [coll['coll_id'] for coll in collections] if coll_ids: - rets = orm_contents.get_contents(coll_id=coll_ids, status=status, to_json=to_json, session=session) + if relation_type is None: + content_relation_type = None + else: + if relation_type == CollectionRelationType.Output: + content_relation_type = ContentRelationType.Output + elif relation_type == CollectionRelationType.Input: + content_relation_type = ContentRelationType.Input + elif relation_type == CollectionRelationType.Log: + content_relation_type = ContentRelationType.Log + rets = orm_contents.get_contents(coll_id=coll_ids, status=status, to_json=to_json, + relation_type=content_relation_type, session=session) else: rets = [] return rets diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index 9a5a20ad..5dbbef1a 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -274,7 +274,7 @@ def get_match_contents(coll_id, scope, name, content_type=None, min_id=None, max @read_session -def get_contents(scope=None, name=None, coll_id=None, status=None, to_json=False, session=None): +def get_contents(scope=None, name=None, coll_id=None, status=None, relation_type=None, to_json=False, session=None): """ Get content or raise a NoObject exception. 
@@ -313,155 +313,9 @@ def get_contents(scope=None, name=None, coll_id=None, status=None, to_json=False query = query.filter(models.Content.name.like(name.replace('*', '%'))) if status is not None: query = query.filter(models.Content.status.in_(status)) + if relation_type: + query = query.filter(models.Content.content_relation_type == relation_type) - tmp = query.all() - rets = [] - if tmp: - for t in tmp: - if to_json: - rets.append(t.to_dict_json()) - else: - rets.append(t.to_dict()) - return rets - except sqlalchemy.orm.exc.NoResultFound as error: - raise exceptions.NoObject('No record can be found with (scope=%s, name=%s, coll_id=%s): %s' % - (scope, name, coll_id, error)) - except Exception as error: - raise error - - -@read_session -def get_contents_by_request_transform(request_id=None, transform_id=None, workload_id=None, status=None, status_updated=False, session=None): - """ - Get content or raise a NoObject exception. - - :param request_id: request id. - :param transform_id: transform id. - :param workload_id: workload id. - - :param session: The database session in use. - - :raises NoObject: If no content is founded. - - :returns: list of contents. - """ - - try: - if status is not None: - if not isinstance(status, (tuple, list)): - status = [status] - - query = session.query(models.Content) - query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') - if request_id: - query = query.filter(models.Content.request_id == request_id) - if transform_id: - query = query.filter(models.Content.transform_id == transform_id) - if workload_id: - query = query.filter(models.Content.workload_id == workload_id) - if status is not None: - query = query.filter(models.Content.substatus.in_(status)) - if status_updated: - query = query.filter(models.Content.status != models.Content.substatus) - query = query.order_by(asc(models.Content.request_id), asc(models.Content.transform_id), asc(models.Content.map_id)) - - tmp = query.all() - rets = [] - if tmp: - for t in tmp: - rets.append(t.to_dict()) - return rets - except sqlalchemy.orm.exc.NoResultFound as error: - raise exceptions.NoObject('No record can be found with (transform_id=%s): %s' % - (transform_id, error)) - except Exception as error: - raise error - - -@read_session -def get_contents_by_content_ids(content_ids, request_id=None, bulk_size=1000, session=None): - """ - Get content or raise a NoObject exception. - - :param request_id: request id. - :param content_ids: list of content id. - :param workload_id: workload id. - - :param session: The database session in use. - - :raises NoObject: If no content is founded. - - :returns: list of contents. - """ - try: - if content_ids: - if not isinstance(content_ids, (list, tuple)): - content_ids = [content_ids] - - chunks = [content_ids[i:i + bulk_size] for i in range(0, len(content_ids), bulk_size)] - ret = [] - for chunk in chunks: - ret_chunk = get_contents_by_content_ids_real(chunk, request_id=request_id) - ret = ret + ret_chunk - return ret - except Exception as error: - raise error - - -@read_session -def get_contents_by_content_ids_real(content_ids, request_id=None, session=None): - """ - Get content or raise a NoObject exception. - - :param request_id: request id. - :param content_ids: list of content id. - :param workload_id: workload id. - - :param session: The database session in use. - - :raises NoObject: If no content is founded. - - :returns: list of contents. 
- """ - try: - query = session.query(models.Content) - query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') - if request_id: - query = query.filter(models.Content.request_id == request_id) - query = query.filter(models.Content.content_id.in_(content_ids)) - ret = query.all() - rets = [t.to_dict() for t in ret] - return rets - except Exception as error: - raise error - - -@read_session -def get_input_contents(request_id, coll_id, name=None, to_json=False, session=None): - """ - Get content or raise a NoObject exception. - - :param request_id: request id. - :param coll_id: collection id. - :param name: content name. - :param to_json: return json format. - - :param session: The database session in use. - - :raises NoObject: If no content is founded. - - :returns: list of contents. - """ - - try: - query = session.query(models.Content) - query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') - query = query.filter(models.Content.request_id == request_id) - query = query.filter(models.Content.coll_id == coll_id) - - if name: - query = query.filter(models.Content.name == name) - # query = query.filter(models.Content.content_relation_type == ContentRelationType.Input) query = query.order_by(asc(models.Content.map_id)) tmp = query.all() @@ -474,8 +328,8 @@ def get_input_contents(request_id, coll_id, name=None, to_json=False, session=No rets.append(t.to_dict()) return rets except sqlalchemy.orm.exc.NoResultFound as error: - raise exceptions.NoObject('No record can be found with (transform_id=%s, coll_id=%s, name=%s): %s' % - (request_id, coll_id, name, error)) + raise exceptions.NoObject('No record can be found with (coll_id=%s, name=%s): %s' % + (coll_id, name, error)) except Exception as error: raise error From bdf31a8ccd17f33b195d704c6c03fd94ecaa8fdf Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 4 Jan 2023 18:01:45 +0100 Subject: [PATCH 58/91] group contents_ext --- client/lib/idds/client/catalogclient.py | 5 +-- client/lib/idds/client/clientmanager.py | 35 ++++++++++++++++++ client/lib/idds/client/requestclient.py | 2 +- main/lib/idds/rest/v1/catalog.py | 47 ++++++++++++++++++------- main/lib/idds/tests/test_client.py | 27 +++++++++++++- 5 files changed, 100 insertions(+), 16 deletions(-) diff --git a/client/lib/idds/client/catalogclient.py b/client/lib/idds/client/catalogclient.py index 305725a8..32741935 100644 --- a/client/lib/idds/client/catalogclient.py +++ b/client/lib/idds/client/catalogclient.py @@ -173,7 +173,7 @@ def register_contents(self, coll_scope, coll_name, request_id, workload_id, cont r = self.get_request_response(url, type='POST', data=contents) return r - def get_contents_output_ext(self, request_id=None, workload_id=None, transform_id=None): + def get_contents_output_ext(self, request_id=None, workload_id=None, transform_id=None, group_by_jedi_task_id=False): """ Get output extension contents from the Head service. 
@@ -191,7 +191,8 @@ def get_contents_output_ext(self, request_id=None, workload_id=None, transform_i if transform_id is None: transform_id = 'null' - url = self.build_url(self.host, path=os.path.join(path, str(request_id), str(workload_id), str(transform_id))) + url = self.build_url(self.host, path=os.path.join(path, str(request_id), str(workload_id), + str(transform_id), str(group_by_jedi_task_id))) contents = self.get_request_response(url, type='GET') return contents diff --git a/client/lib/idds/client/clientmanager.py b/client/lib/idds/client/clientmanager.py index c8957052..cfc86437 100644 --- a/client/lib/idds/client/clientmanager.py +++ b/client/lib/idds/client/clientmanager.py @@ -95,6 +95,11 @@ def setup_client(self, auth_setup=False): if self.enable_json_outputs: self.client.enable_json_outputs() + def setup_json_outputs(self): + self.enable_json_outputs = True + if self.client: + self.client.enable_json_outputs() + def get_local_config_root(self): local_cfg_root = get_local_config_root(self.local_config_root) return local_cfg_root @@ -552,6 +557,20 @@ def get_requests(self, request_id=None, workload_id=None, with_detail=False, wit reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id, with_detail=with_detail, with_metadata=with_metadata) return reqs + @exception_handler + def get_request_id_by_name(self, name): + """ + Get request id by name. + + :param name: the request name. + + :returns {name:id} dict. + """ + self.setup_client() + + ret = self.client.get_request_id_by_name(name=name) + return ret + @exception_handler def get_status(self, request_id=None, workload_id=None, with_detail=False, with_metadata=False): """ @@ -669,3 +688,19 @@ def get_messages(self, request_id=None, workload_id=None): msgs = self.client.get_messages(request_id=request_id, workload_id=workload_id) logging.info("Retrieved %s messages for request_id: %s, workload_id: %s" % (len(msgs), request_id, workload_id)) return (0, msgs) + + @exception_handler + def get_contents_output_ext(self, request_id=None, workload_id=None, transform_id=None, group_by_jedi_task_id=False): + """ + Get output extension contents from the Head service. + + :param request_id: the request id. + :param workload_id: the workload id. + :param transform_id: the transform id. + + :raise exceptions if it's not got successfully. 
+ """ + self.setup_client() + + return self.client.get_contents_output_ext(workload_id=workload_id, request_id=request_id, transform_id=transform_id, + group_by_jedi_task_id=group_by_jedi_task_id) diff --git a/client/lib/idds/client/requestclient.py b/client/lib/idds/client/requestclient.py index 318acb8c..99cef10e 100644 --- a/client/lib/idds/client/requestclient.py +++ b/client/lib/idds/client/requestclient.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2022 """ diff --git a/main/lib/idds/rest/v1/catalog.py b/main/lib/idds/rest/v1/catalog.py index 3474a21a..26a006bd 100644 --- a/main/lib/idds/rest/v1/catalog.py +++ b/main/lib/idds/rest/v1/catalog.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019-2020 +# - Wen Guan, , 2019 - 2022 import traceback @@ -116,7 +116,7 @@ def get(self, coll_scope, coll_name, request_id, workload_id, relation_type, sta class ContentsOutputExt(IDDSController): """ Catalog """ - def get(self, request_id, workload_id, transform_id): + def get(self, request_id, workload_id, transform_id, group_by_jedi_task_id=False): """ Get contents by request_id, workload_id and transform_id. HTTP Success: 200 OK @@ -139,17 +139,40 @@ def get(self, request_id, workload_id, transform_id): transform_id = None else: transform_id = int(transform_id) + if group_by_jedi_task_id: + if type(group_by_jedi_task_id) in [bool]: + pass + else: + if type(group_by_jedi_task_id) in [str] and group_by_jedi_task_id.lower() in ['true']: + group_by_jedi_task_id = True + else: + group_by_jedi_task_id = False + else: + group_by_jedi_task_id = False - if transform_id is None: - self.generate_http_response(HTTP_STATUS_CODE.BadRequest, - exc_cls=exceptions.BadRequest.__name__, - exc_msg="Transform_id must not be None") - - contents = get_contents(request_id=request_id, workload_id=workload_id, transform_id=transform_id, - relation_type=CollectionRelationType.Output) - contents_ext = get_contents_ext(request_id=request_id, workload_id=workload_id, transform_id=transform_id) + if request_id is None: + return self.generate_http_response(HTTP_STATUS_CODE.BadRequest, + exc_cls=exceptions.BadRequest.__name__, + exc_msg="request_id must not be None") - rets = combine_contents_ext(contents, contents_ext, with_status_name=True) + else: + contents = get_contents(request_id=request_id, workload_id=workload_id, transform_id=transform_id, + relation_type=CollectionRelationType.Output) + contents_ext = get_contents_ext(request_id=request_id, workload_id=workload_id, transform_id=transform_id) + + ret_contents = combine_contents_ext(contents, contents_ext, with_status_name=True) + rets = {} + for content in ret_contents: + if group_by_jedi_task_id: + jedi_task_id = content.get('jedi_task_id', 'None') + if jedi_task_id not in rets: + rets[jedi_task_id] = [] + rets[jedi_task_id].append(content) + else: + transform_id = content.get('transform_id') + if transform_id not in rets: + rets[transform_id] = [] + rets[transform_id].append(content) except exceptions.NoObject as error: return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) except exceptions.IDDSException as error: @@ -181,7 +204,7 @@ def get_blueprint(): view_func=contents_view, methods=['get', ]) # get contents contents_ext_view = ContentsOutputExt.as_view('contents_output_ext') - bp.add_url_rule('/catalog/contents_output_ext///', + bp.add_url_rule('/catalog/contents_output_ext////', 
view_func=contents_ext_view, methods=['get', ]) return bp diff --git a/main/lib/idds/tests/test_client.py b/main/lib/idds/tests/test_client.py index 4021b755..1a47a545 100644 --- a/main/lib/idds/tests/test_client.py +++ b/main/lib/idds/tests/test_client.py @@ -38,11 +38,36 @@ def test(): cm1 = ClientManager(host=atlas_host) cm1 = ClientManager(host=doma_host) cm1 = ClientManager(host=dev_host) - request_id = 389 + request_id = 414 ret = cm1.get_requests(request_id, with_detail=True) print(json_dumps(ret, sort_keys=True, indent=4)) + cm1.setup_json_outputs() + ret = cm1.get_requests(request_id, with_detail=True) + print(json_dumps(ret, sort_keys=True, indent=4)) + + ret = cm1.get_request_id_by_name(name='test_workflow.idds.1672836584.9900262.test') + print(json_dumps(ret, sort_keys=True, indent=4)) + + ret = cm1.get_request_id_by_name(name='test_workflow.idds.1672836584.9900262.test1') + print(json_dumps(ret, sort_keys=True, indent=4)) + + ret = cm1.get_request_id_by_name(name='test_workflow.idds*') + print(json_dumps(ret, sort_keys=True, indent=4)) + + ret = cm1.get_contents_output_ext(request_id=request_id) + print(json_dumps(ret, sort_keys=True, indent=4)) + + ret = cm1.get_contents_output_ext(request_id=request_id, group_by_jedi_task_id=True) + print(json_dumps(ret, sort_keys=True, indent=4)) + + ret = cm1.get_contents_output_ext(request_id=None) + print(json_dumps(ret, sort_keys=True, indent=4)) + + ret = cm1.get_contents_output_ext(request_id=99999) + print(json_dumps(ret, sort_keys=True, indent=4)) + if __name__ == '__main__': test() From a09b0e1059e1d8f5f61c4ad35ebadb37407344dd Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 9 Jan 2023 20:05:13 +0100 Subject: [PATCH 59/91] add get contents by request func --- main/lib/idds/orm/contents.py | 53 ++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index 5dbbef1a..4c32fe85 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -274,7 +274,8 @@ def get_match_contents(coll_id, scope, name, content_type=None, min_id=None, max @read_session -def get_contents(scope=None, name=None, coll_id=None, status=None, relation_type=None, to_json=False, session=None): +def get_contents(scope=None, name=None, transform_id=None, coll_id=None, status=None, + relation_type=None, to_json=False, session=None): """ Get content or raise a NoObject exception. @@ -305,6 +306,8 @@ def get_contents(scope=None, name=None, coll_id=None, status=None, relation_type query = session.query(models.Content) query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_ID_NAME_IDX)", 'oracle') + if transform_id: + query = query.filter(models.Content.transform_id == transform_id) if coll_id: query = query.filter(models.Content.coll_id.in_(coll_id)) if scope: @@ -334,6 +337,54 @@ def get_contents(scope=None, name=None, coll_id=None, status=None, relation_type raise error +@read_session +def get_contents_by_request_transform(request_id=None, transform_id=None, workload_id=None, status=None, status_updated=False, session=None): + """ + Get content or raise a NoObject exception. + + :param request_id: request id. + :param transform_id: transform id. + :param workload_id: workload id. + + :param session: The database session in use. + + :raises NoObject: If no content is founded. + + :returns: list of contents. 
+ """ + + try: + if status is not None: + if not isinstance(status, (tuple, list)): + status = [status] + + query = session.query(models.Content) + query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') + if request_id: + query = query.filter(models.Content.request_id == request_id) + if transform_id: + query = query.filter(models.Content.transform_id == transform_id) + if workload_id: + query = query.filter(models.Content.workload_id == workload_id) + if status is not None: + query = query.filter(models.Content.substatus.in_(status)) + if status_updated: + query = query.filter(models.Content.status != models.Content.substatus) + query = query.order_by(asc(models.Content.request_id), asc(models.Content.transform_id), asc(models.Content.map_id)) + + tmp = query.all() + rets = [] + if tmp: + for t in tmp: + rets.append(t.to_dict()) + return rets + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('No record can be found with (transform_id=%s): %s' % + (transform_id, error)) + except Exception as error: + raise error + + @read_session def get_content_status_statistics(coll_id=None, session=None): """ From 7f7ada487106e669b5c281097a0cd8e480abcd51 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 9 Jan 2023 20:06:11 +0100 Subject: [PATCH 60/91] improve abort processing --- main/lib/idds/agents/carrier/finisher.py | 3 ++- main/lib/idds/agents/carrier/utils.py | 27 ++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/main/lib/idds/agents/carrier/finisher.py b/main/lib/idds/agents/carrier/finisher.py index 813bd525..036f7197 100644 --- a/main/lib/idds/agents/carrier/finisher.py +++ b/main/lib/idds/agents/carrier/finisher.py @@ -166,7 +166,7 @@ def handle_abort_processing(self, processing, log_prefix=""): process abort processing """ try: - processing, update_collections, update_contents = handle_abort_processing(processing, self.agent_attributes, logger=self.logger, log_prefix=log_prefix) + processing, update_collections, update_contents, messages = handle_abort_processing(processing, self.agent_attributes, logger=self.logger, log_prefix=log_prefix) update_processing = {'processing_id': processing['processing_id'], 'parameters': {'status': processing['status'], @@ -174,6 +174,7 @@ def handle_abort_processing(self, processing, log_prefix=""): ret = {'update_processing': update_processing, 'update_collections': update_collections, 'update_contents': update_contents, + 'messages': messages } return ret except Exception as ex: diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 179bbd45..8f5baacd 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -1356,9 +1356,9 @@ def sync_processing(processing, agent_attributes, terminate=False, logger=None, def handle_abort_processing(processing, agent_attributes, logger=None, log_prefix=''): logger = get_logger(logger) - request_id = processing['request_id'] - transform_id = processing['transform_id'] - workload_id = processing['workload_id'] + # request_id = processing['request_id'] + # transform_id = processing['transform_id'] + # workload_id = processing['workload_id'] proc = processing['processing_metadata']['processing'] work = proc.work @@ -1366,22 +1366,23 @@ def handle_abort_processing(processing, agent_attributes, logger=None, log_prefi work.abort_processing(processing, log_prefix=log_prefix) - input_collections = work.get_input_collections() - output_collections 
= work.get_output_collections() - log_collections = work.get_log_collections() + # input_collections = work.get_input_collections() + # output_collections = work.get_output_collections() + # log_collections = work.get_log_collections() # input_output_maps = get_input_output_maps(transform_id, work) - update_collections, all_updates_flushed = sync_collection_status(request_id, transform_id, workload_id, work, - input_output_maps=None, close_collection=True, - force_close_collection=True) + # update_collections, all_updates_flushed = sync_collection_status(request_id, transform_id, workload_id, work, + # input_output_maps=None, close_collection=True, + # force_close_collection=True) - for coll in input_collections + output_collections + log_collections: - coll.status = CollectionStatus.Closed - coll.substatus = CollectionStatus.Closed + # for coll in input_collections + output_collections + log_collections: + # coll.status = CollectionStatus.Closed + # coll.substatus = CollectionStatus.Closed + processing, update_collections, messages = sync_processing(processing, agent_attributes, terminate=True, logger=logger, log_prefix=log_prefix) update_contents = [] # processing['status'] = ProcessingStatus.Cancelled - return processing, update_collections, update_contents + return processing, update_collections, update_contents, messages def reactive_contents(request_id, transform_id, workload_id, work, input_output_maps): From de47b220e2783a96d3e6cbfe3efe5d38f0f5b80b Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 9 Jan 2023 20:07:32 +0100 Subject: [PATCH 61/91] fix cases that no task id --- .../lib/idds/doma/workflowv2/domapandawork.py | 68 +++++++++++++++---- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index 5b8ef9f7..1031ecfc 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -43,7 +43,7 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, output_collections=None, log_collections=None, logger=None, dependency_map=None, task_name="", task_queue=None, queue=None, processing_type=None, - prodSourceLabel='test', task_type='test', + prodSourceLabel='test', task_type='lsst', maxwalltime=90000, maxattempt=5, core_count=1, encode_command_line=False, num_retries=5, @@ -78,6 +78,8 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.panda_verify_host = None self.dependency_map = dependency_map + if self.dependency_map is None: + self.dependency_map = {} self.dependency_map_deleted = [] # self.logger.setLevel(logging.DEBUG) @@ -322,9 +324,10 @@ def get_unmapped_jobs(self, mapped_input_output_maps={}): return unmapped_jobs def has_dependency(self): - for job in self.dependency_map: - if "dependencies" in job and job["dependencies"]: - return True + if self.dependency_map: + for job in self.dependency_map: + if "dependencies" in job and job["dependencies"]: + return True return False def get_parent_work_names(self): @@ -388,12 +391,18 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): 'logs': [], 'inputs': [input_content], 'outputs': [output_content]} + uni_input_name = {} for input_d in inputs_dependency: task_name = input_d['task'] input_name = input_d['inputname'] - input_d_coll = task_name_to_coll_map[task_name]['outputs'][0] - input_d_content = self.map_file_to_content(input_d_coll['coll_id'], input_d_coll['scope'], input_name) - 
new_input_output_maps[next_key]['inputs_dependency'].append(input_d_content) + task_name_input_name = task_name + input_name + if task_name_input_name not in uni_input_name: + uni_input_name[task_name_input_name] = None + input_d_coll = task_name_to_coll_map[task_name]['outputs'][0] + input_d_content = self.map_file_to_content(input_d_coll['coll_id'], input_d_coll['scope'], input_name) + new_input_output_maps[next_key]['inputs_dependency'].append(input_d_content) + else: + self.logger.debug("get_new_input_output_maps, duplicated input dependency for job %s: %s" % (job['name'], str(job["dependencies"]))) # all inputs are parsed. move it to dependency_map_deleted # self.dependency_map_deleted.append(job) @@ -438,6 +447,8 @@ def create_processing(self, input_output_maps=[]): self.task_name = self.task_name + "_" + str(self.get_request_id()) + "_" + str(self.get_work_id()) in_files = [] + if self.dependency_map is None: + self.dependency_map = {} for job in self.dependency_map: in_files.append(job['name']) @@ -449,15 +460,29 @@ def create_processing(self, input_output_maps=[]): task_param_map['site'] = self.queue task_param_map['workingGroup'] = self.working_group task_param_map['nFilesPerJob'] = 1 - task_param_map['nFiles'] = len(in_files) - task_param_map['noInput'] = True - task_param_map['pfnList'] = in_files + if in_files: + task_param_map['inputPreStaging'] = True + task_param_map['nFiles'] = len(in_files) + task_param_map['noInput'] = True + task_param_map['pfnList'] = in_files + else: + # task_param_map['inputPreStaging'] = True + in_files = ['pseudo_file'] + task_param_map['nFiles'] = len(in_files) + task_param_map['noInput'] = True + task_param_map['pfnList'] = in_files + task_param_map['taskName'] = self.task_name task_param_map['userName'] = self.username if self.username else 'iDDS' task_param_map['taskPriority'] = self.task_priority task_param_map['architecture'] = '' task_param_map['transUses'] = '' task_param_map['transHome'] = None + + executable = self.executable + if self.task_type == 'lsst_build': + executable = str(self.get_request_id()) + " " + str(self.signature) + " " + self.executable + if self.encode_command_line: # task_param_map['transPath'] = 'https://atlpan.web.cern.ch/atlpan/bash-c-enc' task_param_map['transPath'] = 'https://storage.googleapis.com/drp-us-central1-containers/bash-c-enc' @@ -478,7 +503,7 @@ def create_processing(self, input_output_maps=[]): # task_param_map['ramUnit'] = 'MB' task_param_map['ramUnit'] = 'MBPerCoreFixed' - task_param_map['inputPreStaging'] = True + # task_param_map['inputPreStaging'] = True task_param_map['prestagingRuleID'] = 123 task_param_map['nChunksToWait'] = 1 task_param_map['maxCpuCount'] = self.core_count @@ -488,7 +513,7 @@ def create_processing(self, input_output_maps=[]): task_param_map['log'] = self.task_log task_param_map['jobParameters'] = [ {'type': 'constant', - 'value': self.executable, # noqa: E501 + 'value': executable, # noqa: E501 }, ] @@ -1103,7 +1128,8 @@ def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext= return processing_status, updated_contents, update_contents_full, new_contents_ext, update_contents_ext else: - return ProcessingStatus.Running, [], [], [], [] + self.logger.error("poll_panda_task, task_id (%s) cannot be found" % task_id) + return ProcessingStatus.Failed, [], [], [], [] except Exception as ex: msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) self.logger.error(log_prefix + msg) @@ -1161,7 +1187,21 @@ def 
reactivate_processing(self, processing, log_prefix=''): self.logger.error(log_prefix + msg) def abort_processing(self, processing, log_prefix=''): - self.kill_processing_force(processing, log_prefix=log_prefix) + try: + has_task = False + if processing: + proc = processing['processing_metadata']['processing'] + task_id = proc.workload_id + if task_id: + has_task = True + self.kill_processing_force(processing, log_prefix=log_prefix) + + if not has_task: + self.status = WorkStatus.Failed + except Exception as ex: + msg = "Failed to abort the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + msg) def resume_processing(self, processing, log_prefix=''): self.reactivate_processing(processing, log_prefix=log_prefix) From 5d9618f8b002d2e96b820d69d1579a6ff102de4e Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 9 Jan 2023 20:08:32 +0100 Subject: [PATCH 62/91] add build rest and client --- client/lib/idds/client/clientmanager.py | 33 +++++++++++-------------- client/lib/idds/client/requestclient.py | 8 +++--- main/lib/idds/rest/v1/requests.py | 8 +++--- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/client/lib/idds/client/clientmanager.py b/client/lib/idds/client/clientmanager.py index cfc86437..c5dc8e4f 100644 --- a/client/lib/idds/client/clientmanager.py +++ b/client/lib/idds/client/clientmanager.py @@ -469,24 +469,6 @@ def submit_build(self, workflow, username=None, userdn=None, use_dataset_name=Tr request_id = self.client.add_request(**props) return request_id - @exception_handler - def update_build(self, request_id, signature, workflow): - """ - Submit the workflow as a request to iDDS server. - - :param workflow: The workflow to be submitted. - """ - self.setup_client() - - parameters = { - 'request_id': request_id, - 'signature': signature, - 'workflow': workflow - } - - ret = self.client.update_build_request(request_id=request_id, parameters=parameters) - return ret - @exception_handler def abort(self, request_id=None, workload_id=None): """ @@ -704,3 +686,18 @@ def get_contents_output_ext(self, request_id=None, workload_id=None, transform_i return self.client.get_contents_output_ext(workload_id=workload_id, request_id=request_id, transform_id=transform_id, group_by_jedi_task_id=group_by_jedi_task_id) + + @exception_handler + def update_build_request(self, request_id, signature, workflow): + """ + Update Build Request to the Head service. + + :param request_id: the request. + :param signature: the signature of the request. + :param workflow: the workflow of the request. + + :raise exceptions if it's not updated successfully. + """ + self.setup_client() + + return self.client.update_build_request(request_id=request_id, signature=signature, workflow=workflow) diff --git a/client/lib/idds/client/requestclient.py b/client/lib/idds/client/requestclient.py index 99cef10e..8a51185c 100644 --- a/client/lib/idds/client/requestclient.py +++ b/client/lib/idds/client/requestclient.py @@ -80,12 +80,13 @@ def update_request(self, request_id, parameters): r = self.get_request_response(url, type='PUT', data=data) return r - def update_build_request(self, request_id, parameters): + def update_build_request(self, request_id, signature, workflow): """ Update Build Request to the Head service. :param request_id: the request. - :param kwargs: other attributes of the request. + :param signature: the signature of the request. + :param workflow: the workflow of the request. 
:raise exceptions if it's not updated successfully. """ @@ -93,7 +94,8 @@ def update_build_request(self, request_id, parameters): path += "/build" url = self.build_url(self.host, path=os.path.join(path, str(request_id))) - data = parameters + data = {'signature': signature, + 'workflow': workflow} r = self.get_request_response(url, type='POST', data=data) return r diff --git a/main/lib/idds/rest/v1/requests.py b/main/lib/idds/rest/v1/requests.py index 6790887e..3de96fbd 100644 --- a/main/lib/idds/rest/v1/requests.py +++ b/main/lib/idds/rest/v1/requests.py @@ -247,7 +247,7 @@ def get(self, name): class RequestBuild(IDDSController): """ Create, Update, get and delete Request. """ - def post(self): + def post(self, request_id): """ update build request result. HTTP Success: 200 OK @@ -257,10 +257,10 @@ def post(self): """ try: parameters = self.get_request().data and json_loads(self.get_request().data) - if 'request_id' not in parameters or 'signature' not in parameters or 'workflow' not in parameters: - raise exceptions.IDDSException("request_id/signature/workflow are required") + if 'signature' not in parameters or 'workflow' not in parameters: + raise exceptions.IDDSException("signature and workflow are required") - request_id = parameters['request_id'] + # request_id = parameters['request_id'] signature = parameters['signature'] workflow = parameters['workflow'] From 4b1ff512eaec5d9583e1b1465cd0062450446217 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 9 Jan 2023 20:09:28 +0100 Subject: [PATCH 63/91] update clerk to handle build request --- main/lib/idds/agents/clerk/clerk.py | 144 +++++++++++++++++++++++++--- 1 file changed, 131 insertions(+), 13 deletions(-) diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index d1163d53..83303ab8 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -168,7 +168,8 @@ def get_running_requests(self): RequestStatus.ToSuspend, RequestStatus.Suspending, RequestStatus.ToExpire, RequestStatus.Expiring, RequestStatus.ToFinish, RequestStatus.ToForceFinish, - RequestStatus.ToResume, RequestStatus.Resuming] + RequestStatus.ToResume, RequestStatus.Resuming, + RequestStatus.Building] reqs = core_requests.get_requests_by_status_type(status=req_status, time_period=None, locking=True, bulk_size=self.retrieve_bulk_size, not_lock=True, update_poll=True, only_return_id=True) @@ -283,8 +284,11 @@ def get_work_tag_attribute(self, work_tag, attribute): work_tag_attribute_value = int(getattr(self, work_tag_attribute)) return work_tag_attribute_value - def generate_transform(self, req, work): - wf = req['request_metadata']['workflow'] + def generate_transform(self, req, work, build=False): + if build: + wf = req['request_metadata']['build_workflow'] + else: + wf = req['request_metadata']['workflow'] work.set_request_id(req['request_id']) work.username = req['username'] @@ -417,18 +421,22 @@ def handle_build_request(self, req): self.logger.info(log_pre + "handle build request") workflow = req['request_metadata']['build_workflow'] - build_work = workflow.get_build_work() - build_work.add_proxy(workflow.get_proxy()) - transform = self.generate_transform(req, build_work) + works = workflow.get_new_works() + transforms = [] + for work in works: + new_work = work + new_work.add_proxy(workflow.get_proxy()) + transform = self.generate_transform(req, new_work, build=True) + transforms.append(transform) self.logger.debug(log_pre + "Processing request(%s): new build transforms: %s" % (req['request_id'], - 
str(transform))) + str(transforms))) ret_req = {'request_id': req['request_id'], 'parameters': {'status': RequestStatus.Building, 'locking': RequestLocking.Idle, # 'processing_metadata': processing_metadata, 'request_metadata': req['request_metadata']}, - 'new_transforms': [transform]} + 'new_transforms': transforms} ret_req['parameters'] = self.load_poll_period(req, ret_req['parameters']) self.logger.info(log_pre + "Handle build request result: %s" % str(ret_req)) except Exception as ex: @@ -556,7 +564,10 @@ def handle_update_request_real(self, req, event): """ log_pre = self.get_log_prefix(req) self.logger.info(log_pre + " handle_update_request: request_id: %s" % req['request_id']) - wf = req['request_metadata']['workflow'] + if 'workflow' in req['request_metadata']: + wf = req['request_metadata']['workflow'] + else: + wf = req['request_metadata']['build_workflow'] to_abort = False to_abort_transform_id = None @@ -636,6 +647,92 @@ def handle_update_request_real(self, req, event): self.logger.info(log_pre + "Handle update request result: %s" % str(ret)) return ret + def handle_update_build_request_real(self, req, event): + """ + process build request + """ + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + " handle_update_build_request: request_id: %s" % req['request_id']) + wf = req['request_metadata']['build_workflow'] + + to_abort = False + to_abort_transform_id = None + if (event and event._content and 'cmd_type' in event._content and event._content['cmd_type'] + and event._content['cmd_type'] in [CommandType.AbortRequest, CommandType.ExpireRequest]): # noqa W503 + to_abort = True + self.logger.info(log_pre + "to_abort: %s" % to_abort) + if (event and event._content and 'cmd_content' in event._content and event._content['cmd_content'] + and 'transform_id' in event._content['cmd_content']): # noqa W503 + to_abort_transform_id = event._content['cmd_content']['transform_id'] + self.logger.info(log_pre + "to_abort_transform_id: %s" % to_abort_transform_id) + + if to_abort and not to_abort_transform_id: + wf.to_cancel = True + + # current works + works = wf.get_all_works() + # print(works) + for work in works: + # print(work.get_work_id()) + tf = core_transforms.get_transform(transform_id=work.get_work_id()) + if tf: + transform_work = tf['transform_metadata']['work'] + # work_status = WorkStatus(tf['status'].value) + # work.set_status(work_status) + work.sync_work_data(status=tf['status'], substatus=tf['substatus'], work=transform_work, workload_id=tf['workload_id']) + self.logger.info(log_pre + "transform status: %s, work status: %s" % (tf['status'], work.status)) + wf.refresh_works() + + new_transforms = [] + if req['status'] in [RequestStatus.Building] and not wf.to_cancel: + # new works + works = wf.get_new_works() + for work in works: + # new_work = work.copy() + new_work = work + new_work.add_proxy(wf.get_proxy()) + new_transform = self.generate_transform(req, new_work, build=True) + new_transforms.append(new_transform) + self.logger.debug(log_pre + " Processing build request(%s): new transforms: %s" % (req['request_id'], str(new_transforms))) + + req_status = RequestStatus.Building + if wf.is_terminated(): + if wf.is_finished(synchronize=False): + req_status = RequestStatus.Failed + else: + if to_abort and not to_abort_transform_id: + req_status = RequestStatus.Cancelled + elif wf.is_expired(synchronize=False): + req_status = RequestStatus.Expired + elif wf.is_subfinished(synchronize=False): + req_status = RequestStatus.SubFinished + elif 
wf.is_failed(synchronize=False): + req_status = RequestStatus.Failed + else: + req_status = RequestStatus.Failed + # req_msg = wf.get_terminated_msg() + else: + if wf.is_to_expire(req['expired_at'], self.pending_time, request_id=req['request_id']): + wf.expired = True + event_content = {'request_id': req['request_id'], + 'cmd_type': CommandType.ExpireRequest, + 'cmd_content': {}} + self.logger.debug(log_pre + "ExpireRequestEvent(request_id: %s)" % req['request_id']) + event = ExpireRequestEvent(publisher_id=self.id, request_id=req['request_id'], content=event_content) + self.event_bus.send(event) + + parameters = {'status': req_status, + 'locking': RequestLocking.Idle, + 'request_metadata': req['request_metadata'] + } + parameters = self.load_poll_period(req, parameters) + + ret = {'request_id': req['request_id'], + 'parameters': parameters, + 'new_transforms': new_transforms} # 'update_transforms': update_transforms} + self.logger.info(log_pre + "Handle update request result: %s" % str(ret)) + return ret + def handle_update_request(self, req, event): """ process running request @@ -643,7 +740,10 @@ def handle_update_request(self, req, event): try: # if self.release_helper: # self.release_inputs(req['request_id']) - ret_req = self.handle_update_request_real(req, event) + if req['status'] in [RequestStatus.Building]: + ret_req = self.handle_update_build_request_real(req, event=event) + else: + ret_req = self.handle_update_request_real(req, event) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) @@ -678,7 +778,8 @@ def process_update_request(self, event): RequestStatus.ToSuspend, RequestStatus.Suspending, RequestStatus.ToExpire, RequestStatus.Expiring, RequestStatus.ToFinish, RequestStatus.ToForceFinish, - RequestStatus.ToResume, RequestStatus.Resuming] + RequestStatus.ToResume, RequestStatus.Resuming, + RequestStatus.Building] req = self.get_request(request_id=event._request_id, status=req_status, locking=True) if not req: @@ -722,7 +823,13 @@ def handle_abort_request(self, req, event): if to_abort and to_abort_transform_id: req_status = req['status'] else: - wf = req['request_metadata']['workflow'] + if req['status'] in [RequestStatus.Building]: + wf = req['request_metadata']['build_workflow'] + else: + if 'workflow' in req['request_metadata']: + wf = req['request_metadata']['workflow'] + else: + wf = req['request_metadata']['build_workflow'] wf.to_cancel = True req_status = RequestStatus.Cancelling @@ -784,7 +891,13 @@ def process_abort_request(self, event): if event and event._content and event._content['cmd_content'] and 'transform_id' in event._content['cmd_content']: to_abort_transform_id = event._content['cmd_content']['transform_id'] - wf = req['request_metadata']['workflow'] + if req['status'] in [RequestStatus.Building]: + wf = req['request_metadata']['build_workflow'] + else: + if 'workflow' in req['request_metadata']: + wf = req['request_metadata']['workflow'] + else: + wf = req['request_metadata']['build_workflow'] works = wf.get_all_works() if works: for work in works: @@ -802,6 +915,11 @@ def process_abort_request(self, event): self.event_bus.send(event) self.handle_command(event, cmd_status=CommandStatus.Processed, errors=None) + except AssertionError as ex: + self.logger.error("process_abort_request, Failed to process event: %s" % str(event)) + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.handle_command(event, cmd_status=CommandStatus.Processed, errors=str(ex)) except Exception as ex: self.logger.error(ex) 
self.logger.error(traceback.format_exc()) From 707ec07213d2a558ef164c5499d3c64fdee42e72 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 9 Jan 2023 20:09:55 +0100 Subject: [PATCH 64/91] add sign func --- workflow/lib/idds/workflowv2/work.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/lib/idds/workflowv2/work.py b/workflow/lib/idds/workflowv2/work.py index 2f8437b6..3c3d758e 100644 --- a/workflow/lib/idds/workflowv2/work.py +++ b/workflow/lib/idds/workflowv2/work.py @@ -1513,6 +1513,7 @@ def initialize_work(self): # self.arguments = re.sub(key, str(self.parameters.get_param_value(key)), self.arguments) # self.arguments = self.arguments.format(**self.parameters) pass + self.sign() if not self.is_initialized(): self.set_initialized() From c5f30740d0af58c1c6c2f05c924da09822936e07 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 9 Jan 2023 20:11:28 +0100 Subject: [PATCH 65/91] add rubin tools --- main/tools/rubin/bash-c | 7 +++++++ main/tools/rubin/bash-c-enc | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 main/tools/rubin/bash-c create mode 100644 main/tools/rubin/bash-c-enc diff --git a/main/tools/rubin/bash-c b/main/tools/rubin/bash-c new file mode 100644 index 00000000..3b3e966c --- /dev/null +++ b/main/tools/rubin/bash-c @@ -0,0 +1,7 @@ +#!/bin/bash +IFS=';' read -ra ADDR <<< "$@" + +for i in "${ADDR[@]}"; do + echo $i + eval $i +done diff --git a/main/tools/rubin/bash-c-enc b/main/tools/rubin/bash-c-enc new file mode 100644 index 00000000..5d7f7119 --- /dev/null +++ b/main/tools/rubin/bash-c-enc @@ -0,0 +1,18 @@ +#!/bin/env python + +import sys +import subprocess +import base64 +import datetime + +exec_str = base64.b64decode(sys.argv[1]) + +print("INFO : start {}".format(datetime.datetime.utcnow())) +print("INFO : exec string: {}".format(exec_str)) + +p = subprocess.Popen(exec_str.decode(), stdout=sys.stdout, stderr=sys.stderr, + shell=True, universal_newlines=True) +retcode = p.wait() +print("INFO : end {} with retcode={}".format(datetime.datetime.utcnow(), retcode)) +exit(retcode) + From 212aad8cab2066a76d0aa4dd24af62460079fa7e Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 20:44:13 +0100 Subject: [PATCH 66/91] add support to get token from env, fix token validation --- client/lib/idds/client/base.py | 30 ++++++++----- client/lib/idds/client/clientmanager.py | 57 ++++++++++++++++--------- 2 files changed, 57 insertions(+), 30 deletions(-) diff --git a/client/lib/idds/client/base.py b/client/lib/idds/client/base.py index 168a0eb9..5c6df629 100644 --- a/client/lib/idds/client/base.py +++ b/client/lib/idds/client/base.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2023 """ @@ -51,6 +51,7 @@ def __init__(self, host=None, auth=None, timeout=None, client_proxy=None): self.retries = 2 self.auth_type = None + self.oidc_token_file = None self.oidc_token = None self.vo = None self.auth_setup = False @@ -59,6 +60,8 @@ def __init__(self, host=None, auth=None, timeout=None, client_proxy=None): self.auth_type = self.auth['auth_type'] if 'client_proxy' in self.auth: self.client_proxy = self.auth['client_proxy'] + if 'oidc_token_file' in self.auth: + self.oidc_token_file = self.auth['oidc_token_file'] if 'oidc_token' in self.auth: self.oidc_token = self.auth['oidc_token'] if 'vo' in self.auth: @@ -120,13 +123,26 @@ def check_auth(self): raise exceptions.RestException("Cannot find a valid x509 proxy.") elif self.auth_type in ['oidc']: if not self.auth_setup: - if 
not self.oidc_token or not os.path.exists(self.oidc_token): + if not self.oidc_token and (not self.oidc_token_file or not os.path.exists(self.oidc_token_file)): raise exceptions.RestException("Cannot find oidc token.") if not self.vo: raise exceptions.RestException("vo is not defined for oidc authentication.") else: logging.error("auth_type %s is not supported." % str(self.auth_type)) + def get_oidc_token(self): + if self.oidc_token: + return self.oidc_token + else: + oidc_utils = OIDCAuthenticationUtils() + status, token = oidc_utils.load_token(self.oidc_token_file) + if not status: + raise exceptions.IDDSException("Token %s cannot be loaded: %s" % (self.oidc_token_file, str(token))) + is_expired, errors = oidc_utils.is_token_expired(token) + if is_expired: + raise exceptions.IDDSException("Token is already expired: %s" % errors) + return token['id_token'] + def build_url(self, url, path=None, params=None, doseq=False): """ Build url path. @@ -208,14 +224,8 @@ def get_request_response(self, url, type='GET', data=None, headers=None, auth_se else: return else: - oidc_utils = OIDCAuthenticationUtils() - status, token = oidc_utils.load_token(self.oidc_token) - if not status: - raise exceptions.IDDSException("Token %s cannot be loaded: %s" % (self.oidc_token, str(token))) - is_expired, errors = oidc_utils.is_token_expired(token) - if is_expired: - raise exceptions.IDDSException("Token is already expired: %s" % errors) - headers['X-IDDS-Auth-Token'] = token['id_token'] + id_token = self.get_oidc_token() + headers['X-IDDS-Auth-Token'] = id_token if type == 'GET': result = self.session.get(url, timeout=self.timeout, headers=headers, verify=False) diff --git a/client/lib/idds/client/clientmanager.py b/client/lib/idds/client/clientmanager.py index c5dc8e4f..8e61d97e 100644 --- a/client/lib/idds/client/clientmanager.py +++ b/client/lib/idds/client/clientmanager.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 - 2022 +# - Wen Guan, , 2020 - 2023 """ @@ -64,6 +64,7 @@ def __init__(self, host=None, timeout=600, setup_client=False): self.config = None self.auth_type = None self.x509_proxy = None + self.oidc_token_file = None self.oidc_token = None self.vo = None @@ -87,6 +88,7 @@ def setup_client(self, auth_setup=False): self.client = Client(host=self.host, auth={'auth_type': self.auth_type, 'client_proxy': self.x509_proxy, + 'oidc_token_file': self.oidc_token_file, 'oidc_token': self.oidc_token, 'vo': self.vo, 'auth_setup': auth_setup}, @@ -113,6 +115,7 @@ def get_config_value(self, configuration, section, name, current, default): 'local_config_root': 'IDDS_LOCAL_CONFIG_ROOT', 'config': 'IDDS_CONFIG', 'auth_type': 'IDDS_AUTH_TYPE', + 'oidc_token_file': 'IDDS_OIDC_TOKEN_FILE', 'oidc_token': 'IDDS_OIDC_TOKEN', 'vo': 'IDDS_VO', 'auth_no_verify': 'IDDS_AUTH_NO_VERIFY', @@ -139,6 +142,7 @@ def get_section(self, name): 'auth_type': 'common', 'host': 'rest', 'x509_proxy': 'x509_proxy', + 'oidc_token_file': 'oidc', 'oidc_token': 'oidc', 'vo': 'oidc'} if name in name_sections: @@ -178,9 +182,13 @@ def get_local_configuration(self): self.x509_proxy = proxy if self.get_local_config_root(): + self.oidc_token_file = self.get_config_value(config, None, 'oidc_token_file', current=self.oidc_token_file, + default=os.path.join(self.get_local_config_root(), '.token')) self.oidc_token = self.get_config_value(config, None, 'oidc_token', current=self.oidc_token, - default=os.path.join(self.get_local_config_root(), '.token')) + default=None) else: + self.oidc_token_file = 
self.get_config_value(config, None, 'oidc_token_file', current=self.oidc_token_file, + default=None) self.oidc_token = self.get_config_value(config, None, 'oidc_token', current=self.oidc_token, default=None) @@ -211,6 +219,7 @@ def save_local_configuration(self): self.set_local_configuration(name='auth_type', value=self.auth_type) self.set_local_configuration(name='host', value=self.host) self.set_local_configuration(name='x509_proxy', value=self.x509_proxy) + self.set_local_configuration(name='oidc_token_file', value=self.oidc_token_file) self.set_local_configuration(name='oidc_token', value=self.oidc_token) self.set_local_configuration(name='vo', value=self.vo) self.set_local_configuration(name='enable_json_outputs', value=self.enable_json_outputs) @@ -219,8 +228,8 @@ def save_local_configuration(self): self.configuration.write(configfile) def setup_local_configuration(self, local_config_root=None, config=None, host=None, - auth_type=None, x509_proxy=None, - oidc_token=None, vo=None): + auth_type=None, x509_proxy=None, oidc_token=None, + oidc_token_file=None, vo=None): if 'IDDS_CONFIG' in os.environ and os.environ['IDDS_CONFIG']: if config is None: @@ -234,6 +243,7 @@ def setup_local_configuration(self, local_config_root=None, config=None, host=No self.host = host self.auth_type = auth_type self.x509_proxy = x509_proxy + self.oidc_token_file = oidc_token_file self.oidc_token = oidc_token self.vo = vo @@ -293,11 +303,11 @@ def setup_oidc_token(self): logging.error("Failed to get token.") else: oidc_util = OIDCAuthenticationUtils() - status, output = oidc_util.save_token(self.oidc_token, token) + status, output = oidc_util.save_token(self.oidc_token_file, token) if status: - logging.info("Token is saved to %s" % (self.oidc_token)) + logging.info("Token is saved to %s" % (self.oidc_token_file)) else: - logging.info("Failed to save token to %s: (status: %s, output: %s)" % (self.oidc_token, status, output)) + logging.info("Failed to save token to %s: (status: %s, output: %s)" % (self.oidc_token_file, status, output)) def refresh_oidc_token(self): """" @@ -306,21 +316,21 @@ def refresh_oidc_token(self): self.setup_client(auth_setup=True) oidc_util = OIDCAuthenticationUtils() - status, token = oidc_util.load_token(self.oidc_token) + status, token = oidc_util.load_token(self.oidc_token_file) if not status: logging.error("Token %s cannot be loaded: %s" % (status, token)) return is_expired, output = oidc_util.is_token_expired(token) if is_expired: - logging.error("Token %s is already expired(%s). Cannot refresh." % self.oidc_token, output) + logging.error("Token %s is already expired(%s). Cannot refresh." 
% self.oidc_token_file, output) else: new_token = self.client.refresh_id_token(self.vo, token['refresh_token']) - status, data = oidc_util.save_token(self.oidc_token, new_token) + status, data = oidc_util.save_token(self.oidc_token_file, new_token) if status: - logging.info("New token saved to %s" % self.oidc_token) + logging.info("New token saved to %s" % self.oidc_token_file) else: - logging.info("Failed to save token to %s: %s" % (self.oidc_token, data)) + logging.info("Failed to save token to %s: %s" % (self.oidc_token_file, data)) @exception_handler def clean_oidc_token(self): @@ -330,11 +340,11 @@ def clean_oidc_token(self): self.setup_client(auth_setup=True) oidc_util = OIDCAuthenticationUtils() - status, output = oidc_util.clean_token(self.oidc_token) + status, output = oidc_util.clean_token(self.oidc_token_file) if status: - logging.info("Token %s is cleaned" % self.oidc_token) + logging.info("Token %s is cleaned" % self.oidc_token_file) else: - logging.error("Failed to clean token %s: status: %s, output: %s" % (self.oidc_token, status, output)) + logging.error("Failed to clean token %s: status: %s, output: %s" % (self.oidc_token_file, status, output)) @exception_handler def check_oidc_token_status(self): @@ -344,14 +354,21 @@ def check_oidc_token_status(self): self.setup_client(auth_setup=True) oidc_util = OIDCAuthenticationUtils() - status, token = oidc_util.load_token(self.oidc_token) - if not status: - logging.error("Token %s cannot be loaded: status: %s, error: %s" % (self.oidc_token, status, token)) - return + if self.oidc_token: + token = self.oidc_token + else: + status, token = oidc_util.load_token(self.oidc_token_file) + if not status: + logging.error("Token %s cannot be loaded: status: %s, error: %s" % (self.oidc_token_file, status, token)) + return + token = token['id_token'] status, token_info = oidc_util.get_token_info(token) if status: - logging.info("Token path: %s" % self.oidc_token) + if self.oidc_token: + logging.info("ID token: %s" % self.oidc_token) + else: + logging.info("Token path: %s" % self.oidc_token_file) for k in token_info: logging.info("Token %s: %s" % (k, token_info[k])) else: From 967089dd90c478d3514135a7c6df97f6be35211b Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 20:45:31 +0100 Subject: [PATCH 67/91] fix token validation --- common/lib/idds/common/authentication.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/common/lib/idds/common/authentication.py b/common/lib/idds/common/authentication.py index 86671862..f10ad72e 100644 --- a/common/lib/idds/common/authentication.py +++ b/common/lib/idds/common/authentication.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2021 - 2022 +# - Wen Guan, , 2021 - 2023 import datetime import base64 @@ -258,14 +258,20 @@ def verify_id_token(self, vo, token): # check audience decoded_token = jwt.decode(token, verify=False, options={"verify_signature": False}) audience = decoded_token['aud'] - if auth_config['client_id'] != audience: + if audience not in [auth_config['audience'], auth_config['client_id']]: # discovery_endpoint = auth_config['oidc_config_url'] - return False, "The audience of the token doesn't match vo configuration.", None + return False, "The audience %s of the token doesn't match vo configuration(client_id: %s)." 
% (audience, auth_config['client_id']), None public_key = self.get_public_key(token, endpoint_config['jwks_uri']) # decode token only with RS256 + if 'iss' in decoded_token and decoded_token['iss'] and decoded_token['iss'] != endpoint_config['issuer'] and endpoint_config['issuer'].startswith(decoded_token['iss']): + # iss is missing the last '/' in access tokens + issuer = decoded_token['iss'] + else: + issuer = endpoint_config['issuer'] + decoded = jwt.decode(token, public_key, verify=True, algorithms='RS256', - audience=audience, issuer=endpoint_config['issuer']) + audience=audience, issuer=issuer) decoded['vo'] = vo if 'name' in decoded: username = decoded['name'] @@ -319,7 +325,8 @@ def clean_token(self, path): def get_token_info(self, token): try: - enc = token['id_token'].split('.')[1] + # enc = token['id_token'].split('.')[1] + enc = token.split('.')[1] enc += '=' * (-len(enc) % 4) dec = json.loads(base64.urlsafe_b64decode(enc.encode())) exp_time = datetime.datetime.utcfromtimestamp(dec['exp']) From bc5a7fea091a08d13960f2d24bf05391739fc3fd Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 20:47:04 +0100 Subject: [PATCH 68/91] support loglevel from env --- common/lib/idds/common/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py index cf8fa0e4..6b7599ca 100644 --- a/common/lib/idds/common/utils.py +++ b/common/lib/idds/common/utils.py @@ -49,6 +49,12 @@ def setup_logging(name, stream=None, loglevel=None): else: loglevel = logging.INFO + if os.environ.get('IDDS_LOG_LEVEL', None): + idds_log_level = os.environ.get('IDDS_LOG_LEVEL', None) + idds_log_level = idds_log_level.upper() + if idds_log_level in ["DEBUG", "CRITICAL", "ERROR", "WARNING", "INFO"]: + loglevel = getattr(logging, idds_log_level) + if stream is None: if config_has_section('common') and config_has_option('common', 'logdir'): logging.basicConfig(filename=os.path.join(config_get('common', 'logdir'), name), From 7c4587630c20275f4636525ca813d500c78a124e Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 20:53:03 +0100 Subject: [PATCH 69/91] fix ext_contents updates --- .../lib/idds/doma/workflowv2/domapandawork.py | 165 +++++++++--------- 1 file changed, 80 insertions(+), 85 deletions(-) diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index 1031ecfc..f2eaf666 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -480,8 +480,8 @@ def create_processing(self, input_output_maps=[]): task_param_map['transHome'] = None executable = self.executable - if self.task_type == 'lsst_build': - executable = str(self.get_request_id()) + " " + str(self.signature) + " " + self.executable + executable = "export IDDS_BUILD_REQUEST_ID=" + str(self.get_request_id()) + ";" + executable += "export IDDS_BUIL_SIGNATURE=" + str(self.signature) + "; " + self.executable if self.encode_command_line: # task_param_map['transPath'] = 'https://atlpan.web.cern.ch/atlpan/bash-c-enc' @@ -978,50 +978,49 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m self.logger.debug("get_update_contents, contents_ext_full[:3]: %s" % (str({k: contents_ext_full[k] for k in list(contents_ext_full.keys())[:3]}))) return update_contents, update_contents_full, contents_ext_full - def get_contents_ext_detail(self, contents_ext_full, contents_ext_ids, job_info_maps={}): - contents_ext_full_ids = set(contents_ext_full.keys()) 
- new_ids = contents_ext_full_ids - contents_ext_ids - to_update_ids = contents_ext_full_ids - new_ids - new_contents_ext, update_contents_ext = [], [] - - for new_id in new_ids: - content = contents_ext_full[new_id]['content'] - job_info = contents_ext_full[new_id]['job_info'] - new_content_ext = {'content_id': content['content_id'], - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'coll_id': content['coll_id'], - 'map_id': content['map_id'], - 'status': content['status']} + def get_contents_ext_detail(self, new_contents_ext, update_contents_ext, job_info_maps={}): + new_contents_ext_d, update_contents_ext_d = [], [] + + for content_id in new_contents_ext: + content = new_contents_ext[content_id]['content'] + job_info = new_contents_ext[content_id]['job_info'] + new_content_ext_d = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': content['status']} for job_info_item in job_info_maps: - new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) - if new_content_ext[job_info_item] == 'NULL': - new_content_ext[job_info_item] = None - - new_contents_ext.append(new_content_ext) - for to_update_id in to_update_ids: - content = contents_ext_full[new_id]['content'] - job_info = contents_ext_full[new_id]['job_info'] - update_content_ext = {'content_id': content['content_id'], 'status': content['status']} + new_content_ext_d[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if new_content_ext_d[job_info_item] == 'NULL': + new_content_ext_d[job_info_item] = None + new_contents_ext_d.append(new_content_ext_d) + + for content_id in update_contents_ext: + content = update_contents_ext[content_id]['content'] + job_info = update_contents_ext[content_id]['job_info'] + update_content_ext_d = {'content_id': content['content_id'], + 'status': content['status']} for job_info_item in job_info_maps: - update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) - if update_content_ext[job_info_item] == 'NULL': - update_content_ext[job_info_item] = None - update_contents_ext.append(update_content_ext) - return new_contents_ext, update_contents_ext + update_content_ext_d[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if update_content_ext_d[job_info_item] == 'NULL': + update_content_ext_d[job_info_item] = None + + update_contents_ext_d.append(update_content_ext_d) + + return new_contents_ext_d, update_contents_ext_d def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, job_info_maps={}): self.logger.debug("get_contents_ext, len(contents_ext): %s" % (str(len(contents_ext)))) self.logger.debug("get_contents_ext, contents_ext[:3]: %s" % (str(contents_ext[:3]))) - contents_ext_ids = [content['content_id'] for content in contents_ext] - contents_ext_ids = set(contents_ext_ids) - contents_ext_panda_ids = [content['panda_id'] for content in contents_ext] - contents_ext_panda_ids = set(contents_ext_panda_ids) - new_contents_ext, update_contents_ext = [], [] - terminated_contents, terminated_contents_full = [], {} - terminated_contents_full_no_panda, terminated_contents_full_no_panda_full = [], {} + contents_ext_dict = {content['content_id']: content for content in contents_ext} + + left_contents = [] + to_check_panda_ids = {} + new_contents_ext, 
update_contents_ext = {}, {} + new_need_poll_contents_ext, update_need_poll_contents_ext = {}, {} for map_id in input_output_maps: # inputs = input_output_maps[map_id]['inputs'] @@ -1030,55 +1029,51 @@ def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, j for content in outputs: if content['substatus'] in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: - # terminated_contents.append(content['content_id']) - # terminated_contents_full[content['content_id']] = content - if content['content_metadata'] and 'panda_id' in content['content_metadata']: - terminated_contents.append(content['content_metadata']['panda_id']) - terminated_contents_full[content['content_metadata']['panda_id']] = content + if content['content_id'] not in contents_ext_dict: + if content['content_id'] in contents_ext_full: + new_contents_ext[content['content_id']] = contents_ext_full[content['content_id']] + else: + new_need_poll_contents_ext[content['content_id']] = content + if content['content_metadata'] and 'panda_id' in content['content_metadata']: + to_check_panda_ids[content['content_metadata']['panda_id']] = content['content_id'] else: - terminated_contents_full_no_panda.append(content['content_id']) - terminated_contents_full_no_panda_full[content['content_id']] = content - - to_check_panda_ids = [] - terminated_contents = set(terminated_contents) - contents_ext_full_panda_ids = [contents_ext_full[content_id]['job_info'].PandaID for content_id in contents_ext_full] - contents_ext_full_panda_ids = set(contents_ext_full_panda_ids) - to_check_panda_ids = terminated_contents - contents_ext_panda_ids - contents_ext_full_panda_ids - - terminated_contents_full_no_panda = set(terminated_contents_full_no_panda) - final_term_contents = terminated_contents_full_no_panda - contents_ext_ids - for content_id in final_term_contents: - new_content_ext = {'content_id': content['content_id'], - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'coll_id': content['coll_id'], - 'map_id': content['map_id'], - 'status': content['status']} - new_contents_ext.append(new_content_ext) - - left_panda_ids = [] + content_ext = contents_ext_dict[content['content_id']] + panda_id = None + if content['content_metadata'] and 'panda_id' in content['content_metadata']: + panda_id = content['content_metadata']['panda_id'] + if content['substatus'] != content_ext['status'] or panda_id != content_ext['panda_id']: + if content['content_id'] in contents_ext_full: + update_contents_ext[content['content_id']] = contents_ext_full[content['content_id']] + else: + update_need_poll_contents_ext[content['content_id']] = content + if panda_id: + to_check_panda_ids[panda_id] = content['content_id'] + else: + left_contents.append(content) + if to_check_panda_ids: - checked_panda_ids = [] - ret_job_infos = self.get_panda_job_status(to_check_panda_ids) + to_check_panda_ids_list = list(to_check_panda_ids.keys()) + ret_job_infos = self.get_panda_job_status(to_check_panda_ids_list) for job_info in ret_job_infos: - checked_panda_ids.append(job_info.PandaID) - content = terminated_contents_full[job_info.PandaID] - contents_ext_full[content['content_id']] = {'content': content, 'job_info': job_info} - - to_check_panda_ids = set(to_check_panda_ids) - checked_panda_ids = set(checked_panda_ids) - left_panda_ids = to_check_panda_ids - checked_panda_ids - left_panda_ids = 
list(left_panda_ids) + content_id = to_check_panda_ids[job_info.PandaID] + del to_check_panda_ids[job_info.PandaID] + if content_id in new_need_poll_contents_ext: + new_contents_ext[content_id] = {'content': new_need_poll_contents_ext[content_id], 'job_info': job_info} + del new_need_poll_contents_ext[content_id] + else: + update_contents_ext[content_id] = {'content': update_need_poll_contents_ext[content_id], 'job_info': job_info} + del update_need_poll_contents_ext[content_id] + for content_id in new_need_poll_contents_ext: + left_contents.append(new_need_poll_contents_ext[content_id]) + for content_id in update_need_poll_contents_ext: + left_contents.append(update_need_poll_contents_ext[content_id]) - new_contents_ext1, update_contents_ext1 = self.get_contents_ext_detail(contents_ext_full, contents_ext_ids, job_info_maps) - new_contents_ext = new_contents_ext + new_contents_ext1 - update_contents_ext = update_contents_ext + update_contents_ext1 + new_contents_ext_d, update_contents_ext_d = self.get_contents_ext_detail(new_contents_ext, update_contents_ext, job_info_maps) - self.logger.debug("get_contents_ext, new_contents_ext[:1]: %s" % (str(new_contents_ext[:1]))) - self.logger.debug("get_contents_ext, update_contents_ext[:1]: %s" % (str(update_contents_ext[:1]))) - self.logger.debug("get_contents_ext, left_panda_ids[:3]: %s" % (str(left_panda_ids[:3]))) - return new_contents_ext, update_contents_ext, left_panda_ids + self.logger.debug("get_contents_ext, new_contents_ext_d[:1]: %s" % (str(new_contents_ext_d[:1]))) + self.logger.debug("get_contents_ext, update_contents_ext_d[:1]: %s" % (str(update_contents_ext_d[:1]))) + self.logger.debug("get_contents_ext, left_contents[:3]: %s" % (str(left_contents[:3]))) + return new_contents_ext_d, update_contents_ext_d, left_contents def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext=None, job_info_maps={}, log_prefix=''): task_id = None @@ -1121,8 +1116,8 @@ def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext= inputname_mapid_map, inputname_jobid_map) - new_contents_ext, update_contents_ext, left_jobs = self.get_contents_ext(input_output_maps, contents_ext, - contents_ext_full, job_info_maps) + new_contents_ext, update_contents_ext, left_contents = self.get_contents_ext(input_output_maps, contents_ext, + contents_ext_full, job_info_maps) # if left_jobs: # processing_status = ProcessingStatus.Running From 7558b437ade6d677f65147750ccefc846a49a177 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 20:54:03 +0100 Subject: [PATCH 70/91] fix update request with pipeline workflow --- main/lib/idds/orm/requests.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/main/lib/idds/orm/requests.py b/main/lib/idds/orm/requests.py index c167a688..06411a13 100644 --- a/main/lib/idds/orm/requests.py +++ b/main/lib/idds/orm/requests.py @@ -897,8 +897,12 @@ def update_request(request_id, parameters, update_request_metadata=False, sessio parameters['processing_metadata'] = {} parameters['processing_metadata']['build_workflow_data'] = build_workflow.metadata - if 'request_metadata' in parameters and not update_request_metadata: - del parameters['request_metadata'] + if 'request_metadata' in parameters: + if not update_request_metadata: + del parameters['request_metadata'] + else: + parameters['_request_metadata'] = parameters['request_metadata'] + del parameters['request_metadata'] if 'processing_metadata' in parameters: parameters['_processing_metadata'] = 
parameters['processing_metadata'] del parameters['processing_metadata'] From c8d54bac11806406a92ee4a574592525d1f8c1ee Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 20:57:57 +0100 Subject: [PATCH 71/91] fix rest service to verify the build task during update pipeline workflow --- main/lib/idds/rest/v1/controller.py | 2 +- main/lib/idds/rest/v1/requests.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/main/lib/idds/rest/v1/controller.py b/main/lib/idds/rest/v1/controller.py index 1255f772..dc9f6a6d 100644 --- a/main/lib/idds/rest/v1/controller.py +++ b/main/lib/idds/rest/v1/controller.py @@ -63,7 +63,7 @@ def generate_http_response(self, status_code, data=None, exc_cls=None, exc_msg=N 'ExceptionMessage': self.generate_message(exc_cls, exc_msg)} if status_code == HTTP_STATUS_CODE.OK: status_code = 0 - response = {'ret_code': status_code, + response = {'status': status_code, 'data': data, 'error': error} resp = Response(response=json_dumps(response, sort_keys=True, indent=4), status=HTTP_STATUS_CODE.OK, content_type='application/json') diff --git a/main/lib/idds/rest/v1/requests.py b/main/lib/idds/rest/v1/requests.py index 3de96fbd..fdf2651e 100644 --- a/main/lib/idds/rest/v1/requests.py +++ b/main/lib/idds/rest/v1/requests.py @@ -271,7 +271,8 @@ def post(self, request_id): raise exceptions.IDDSException("Request (request_id: %s, status: %s) is not in Building status" % (request_id, req['status'])) build_workflow = req['request_metadata']['build_workflow'] - build_work = build_workflow.get_build_work() + works = build_workflow.get_all_works() + build_work = works[0] if build_work.get_signature() != signature: raise exceptions.IDDSException("Request (request_id: %s) has a different signature(%s != %s)" % (request_id, signature, From a5c3660128762d81fe7d6d1c0795dbe44e8d6dc8 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 20:59:23 +0100 Subject: [PATCH 72/91] fix workflow to store the signature in the metadata --- workflow/lib/idds/workflowv2/workflow.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py index 57ab08db..94956008 100644 --- a/workflow/lib/idds/workflowv2/workflow.py +++ b/workflow/lib/idds/workflowv2/workflow.py @@ -776,6 +776,7 @@ def works(self, value): 'status': work.status.value if work.status else work.status, 'substatus': work.substatus.value if work.substatus else work.substatus, 'next_works': work.next_works, + 'signature': work.signature, 'transforming': work.transforming} self.add_metadata_item('works', work_metadata) @@ -796,6 +797,7 @@ def refresh_works(self): 'status': work.status.value if work.status else work.status, 'substatus': work.substatus.value if work.substatus else work.substatus, 'next_works': work.next_works, + 'signature': work.signature, 'transforming': work.transforming} if work.last_updated_at and (not self.last_updated_at or work.last_updated_at > self.last_updated_at): self.last_updated_at = work.last_updated_at @@ -810,6 +812,8 @@ def load_works(self): self._works[k].workload_id = work_metadata[k]['workload_id'] if 'workload_id' in work_metadata[k] else None self._works[k].external_id = work_metadata[k]['external_id'] if 'external_id' in work_metadata[k] else None self._works[k].transforming = work_metadata[k]['transforming'] + if 'signature' in work_metadata[k]: + self._works[k].signature = work_metadata[k]['signature'] self._works[k].status = WorkStatus(work_metadata[k]['status']) if 
work_metadata[k]['status'] else work_metadata[k]['status'] self._works[k].substatus = WorkStatus(work_metadata[k]['substatus']) if work_metadata[k]['substatus'] else work_metadata[k]['substatus'] self._works[k].next_works = work_metadata[k]['next_works'] if 'next_works' in work_metadata[k] else [] From fc2b2a8baf744fcd5a00752187581df001cd9b3a Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 21:08:59 +0100 Subject: [PATCH 73/91] build task test use case --- main/lib/idds/tests/doma_build_test.py | 164 +++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 main/lib/idds/tests/doma_build_test.py diff --git a/main/lib/idds/tests/doma_build_test.py b/main/lib/idds/tests/doma_build_test.py new file mode 100644 index 00000000..badbb46e --- /dev/null +++ b/main/lib/idds/tests/doma_build_test.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 - 2023 + + +""" +Test client. +""" + +import os +import tarfile +import time +import uuid + +# from pandaclient import Client + +# import traceback + +# from rucio.client.client import Client as Rucio_Client +# from rucio.common.exception import CannotAuthenticate + +# from idds.client.client import Client +from idds.client.clientmanager import ClientManager +# from idds.common.constants import RequestType, RequestStatus +from idds.common.utils import get_rest_host +# from idds.tests.common import get_example_real_tape_stagein_request +# from idds.tests.common import get_example_prodsys2_tape_stagein_request + +# from idds.workflowv2.work import Work, Parameter, WorkStatus +# from idds.workflowv2.workflow import Condition, Workflow +from idds.workflowv2.workflow import Workflow +# from idds.atlas.workflowv2.atlasstageinwork import ATLASStageinWork +from idds.doma.workflowv2.domapandawork import DomaPanDAWork + +# task_cloud = 'LSST' +task_cloud = 'US' + +task_queue = 'DOMA_LSST_GOOGLE_TEST' +# task_queue = 'DOMA_LSST_GOOGLE_MERGE' +# task_queue = 'SLAC_TEST' +# task_queue = 'DOMA_LSST_SLAC_TEST' +task_queue = 'SLAC_Rubin' +# task_queue = 'CC-IN2P3_TEST' + + +os.environ['PANDA_URL'] = 'http://pandaserver-doma.cern.ch:25080/server/panda' +os.environ['PANDA_URL_SSL'] = 'https://pandaserver-doma.cern.ch:25443/server/panda' +os.environ["PANDACACHE_URL"] = os.environ["PANDA_URL_SSL"] + + +def get_local_pfns(): + working_dir = os.path.dirname(os.path.realpath(__file__)) + local_pfns = [] + files = ['test_domapanda.py', + 'test_domapanda_build.py'] + for f in files: + full_filename = os.path.join(working_dir, f) + if os.path.exists(full_filename): + print("Adding: %s" % full_filename) + local_pfns.append(full_filename) + else: + print("Not exist: %s" % full_filename) + # add iDDS client code, to be compiled on worker node. 
+ base_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(working_dir)))) + for idds_dir in ['client', 'common', 'workflow', 'doma']: + idds_full_dir = os.path.join(base_dir, idds_dir) + local_pfns.append(idds_full_dir) + # add main test script + cmd_build = os.path.join(base_dir, 'main/tools/rubin/cmdline_builder.py') + local_pfns.append(cmd_build) + main_script = os.path.join(base_dir, 'main/tools/rubin/test_build.sh') + local_pfns.append(main_script) + return local_pfns + + +def create_archive_file(output_filename, local_pfns): + if not output_filename.startswith("/"): + output_filename = os.path.join('/tmp/wguan', output_filename) + + with tarfile.open(output_filename, "w:gz", dereference=True) as tar: + for local_pfn in local_pfns: + base_name = os.path.basename(local_pfn) + tar.add(local_pfn, arcname=os.path.basename(base_name)) + return output_filename + + +def copy_files_to_pandacache(filename): + from pandaclient import Client + status, out = Client.putFile(filename, True) + print("copy_files_to_pandacache: status: %s, out: %s" % (status, out)) + if out.startswith("NewFileName:"): + # found the same input sandbox to reuse + filename = out.split(":")[-1] + elif out != "True": + print(out) + return None + + filename = os.path.basename(filename) + cache_path = os.path.join(os.environ["PANDACACHE_URL"], 'cache') + filename = os.path.join(cache_path, filename) + return filename + + +def setup_workflow(): + local_pfns = get_local_pfns() + output_filename = "jobO.%s.tar.gz" % str(uuid.uuid4()) + output_filename = create_archive_file(output_filename, local_pfns) + print("archive file: %s" % output_filename) + output_filename = copy_files_to_pandacache(output_filename) + print("pandacache file: %s" % output_filename) + + # executable = 'unset PYTHONPATH; source /cvmfs/sw.lsst.eu/linux-x86_64/lsst_distrib/w_2022_53/loadLSST.bash; pwd; ls -al; setup lsst_distrib;\n python3 ${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py' + + executable = 'unset PYTHONPATH; source /cvmfs/sw.lsst.eu/linux-x86_64/lsst_distrib/w_2022_53/loadLSST.bash; pwd; ls -al; setup lsst_distrib;' + + executable += 'if [[ ! -z "${PANDA_AUTH_DIR}" ]] && [[ ! 
-z "${PANDA_AUTH_ORIGIN}" ]]; then export PANDA_AUTH_ID_TOKEN=$(cat $PANDA_AUTH_DIR); export PANDA_AUTH_VO=$PANDA_AUTH_ORIGIN; export IDDS_OIDC_TOKEN=$(cat $PANDA_AUTH_DIR); export IDDS_VO=$PANDA_AUTH_ORIGIN; export PANDA_AUTH=oidc; else unset PANDA_AUTH; export IDDS_AUTH_TYPE=x509_proxy; fi;' # noqa E501 + + # executable = 'python3 ${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder_build.py' + # executable += 'wget https://storage.googleapis.com/drp-us-central1-containers/cmdline_builder.py;' + executable += 'wget https://wguan-wisc.web.cern.ch/wguan-wisc/cmdline_builder.py;' + executable += 'export IDDS_HOST=https://aipanda160.cern.ch:443/idds;' + executable += 'python3 cmdline_builder.py ' + + executable += output_filename + ' ' + './test_build.sh' + + work1 = DomaPanDAWork(executable=executable, + task_type='lsst_build', + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], dependency_map=None, + task_name="build_task", task_queue=task_queue, + encode_command_line=True, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + + pending_time = 12 + # pending_time = None + workflow = Workflow(pending_time=pending_time) + + workflow.add_work(work1) + workflow.name = 'test_workflow.idds.%s.test' % time.time() + return workflow + + +if __name__ == '__main__': + host = get_rest_host() + workflow = setup_workflow() + + wm = ClientManager(host=host) + # wm.set_original_user(user_name="wguandev") + request_id = wm.submit_build(workflow, use_dataset_name=False) + print(request_id) From ac45877b4a3cd95523afa0256138b803f84bfb7a Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 12 Jan 2023 21:14:46 +0100 Subject: [PATCH 74/91] add build test task --- main/tools/rubin/cmdline_builder.py | 67 +++++++++++++++++++++++++++++ main/tools/rubin/test_build.sh | 63 +++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 main/tools/rubin/cmdline_builder.py create mode 100755 main/tools/rubin/test_build.sh diff --git a/main/tools/rubin/cmdline_builder.py b/main/tools/rubin/cmdline_builder.py new file mode 100644 index 00000000..c9a34621 --- /dev/null +++ b/main/tools/rubin/cmdline_builder.py @@ -0,0 +1,67 @@ +#!/bin/env python + +import os +import sys +import subprocess +# import base64 +import datetime +import tarfile + + +def download_extract_archive(filename): + archive_basename = os.path.basename(filename) + target_dir = os.getcwd() + full_output_filename = os.path.join(target_dir, archive_basename) + + if filename.startswith("https:"): + panda_cache_url = os.path.dirname(os.path.dirname(filename)) + os.environ["PANDACACHE_URL"] = panda_cache_url + elif "PANDACACHE_URL" not in os.environ and "PANDA_URL_SSL" in os.environ: + os.environ["PANDACACHE_URL"] = os.environ["PANDA_URL_SSL"] + print("PANDACACHE_URL: %s" % os.environ.get("PANDACACHE_URL", None)) + + from pandaclient import Client + # status, output = Client.getFile(archive_basename, output_path=full_output_filename, verbose=False) + status, output = Client.getFile(archive_basename, output_path=full_output_filename) + print("Download archive file from pandacache status: %s, output: %s" % (status, output)) + if status != 0: + raise RuntimeError("Failed to download archive file from pandacache") + 
with tarfile.open(full_output_filename, 'r:gz') as f: + f.extractall(target_dir) + print("Extract %s to %s" % (full_output_filename, target_dir)) + os.remove(full_output_filename) + print("Remove %s" % full_output_filename) + + +# request_id and signature are added by iDDS for build task +request_id = os.environ.get("IDDS_BUILD_REQUEST_ID", None) +signature = os.environ.get("IDDS_BUIL_SIGNATURE", None) +job_archive = sys.argv[1] +exec_str = sys.argv[2:] +exec_str = " ".join(exec_str) + +if request_id is None: + print("IDDS_BUILD_REQUEST_ID is not defined.") + sys.exit(-1) +if signature is None: + print("IDDS_BUIL_SIGNATURE is not defined") + sys.exit(-1) + +print("INFO: start {}".format(datetime.datetime.utcnow())) +print("INFO: job archive: {}".format(job_archive)) +print("INFO: exec string: {}".format(exec_str)) + +current_dir = os.getcwd() + +download_extract_archive(job_archive) + +print("INFO: current dir: %s" % current_dir) + +# add current dir to PATH +os.environ['PATH'] = current_dir + ":" + os.environ['PATH'] + +p = subprocess.Popen(exec_str, stdout=sys.stdout, stderr=sys.stderr, + shell=True, universal_newlines=True) +retcode = p.wait() +print("INFO : end {} with retcode={}".format(datetime.datetime.utcnow(), retcode)) +exit(retcode) diff --git a/main/tools/rubin/test_build.sh b/main/tools/rubin/test_build.sh new file mode 100755 index 00000000..3304d776 --- /dev/null +++ b/main/tools/rubin/test_build.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +CurrentDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +echo "Current dir: `pwd`" +echo "Curent dir contents:" +ls + +echo "PATH: $PATH" + +# install iDDS client +cd client +# python setup.py install --force --prefix $CurrentDir +python setup.py install --old-and-unmanageable --force --prefix $CurrentDir +cd .. + +cd common +python setup.py install --old-and-unmanageable --force --prefix $CurrentDir +cd .. + +cd workflow +python setup.py install --old-and-unmanageable --force --prefix $CurrentDir +cd .. + +cd doma +python setup.py install --old-and-unmanageable --force --prefix $CurrentDir +cd .. + +echo "Current dir: `pwd`" +echo "Curent dir contents:" +ls + +python_install_path=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib(prefix='$CurrentDir'))") + +export PYTHONPATH=${python_install_path}:$PYTHONPATH + +echo IDDS_BUILD_REQUEST_ID=$IDDS_BUILD_REQUEST_ID +echo IDDS_BUIL_SIGNATURE=$IDDS_BUIL_SIGNATURE + +if [[ ! -z "${PANDA_AUTH_DIR}" ]] && [[ ! 
-z "${PANDA_AUTH_ORIGIN}" ]]; then + export PANDA_AUTH_ID_TOKEN=$(cat $PANDA_AUTH_DIR); + export PANDA_AUTH_VO=$PANDA_AUTH_ORIGIN; + export IDDS_OIDC_TOKEN=$(cat $PANDA_AUTH_DIR) + export IDDS_VO=$PANDA_AUTH_ORIGIN + export PANDA_AUTH=oidc + export IDDS_AUTH_TYPE=oidc +else + unset PANDA_AUTH; + export IDDS_AUTH_TYPE=x509_proxy +fi; + +# echo PYTHONPATH=$PYTHONPATH +# which python +# which python3 + +echo "envs: " +env + +export IDDS_LOG_LEVEL=DEBUG + + +echo "exec command: python test_domapanda_build.py $@" +python test_domapanda_build.py $@ From 3294ca5d784de27cf126031973297ba5dc752e8f Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Fri, 13 Jan 2023 14:46:05 +0100 Subject: [PATCH 75/91] script to check total files --- main/lib/idds/tests/core_tests_stat.py | 27 ++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 main/lib/idds/tests/core_tests_stat.py diff --git a/main/lib/idds/tests/core_tests_stat.py b/main/lib/idds/tests/core_tests_stat.py new file mode 100644 index 00000000..46bda88c --- /dev/null +++ b/main/lib/idds/tests/core_tests_stat.py @@ -0,0 +1,27 @@ + +from idds.common.utils import json_dumps, setup_logging # noqa F401 +from idds.common.constants import ContentStatus, ContentType, ContentRelationType, ContentLocking # noqa F401 +from idds.core.requests import get_requests # noqa F401 +from idds.core.messages import retrieve_messages # noqa F401 +from idds.core.transforms import get_transforms, get_transform # noqa F401 +from idds.core.workprogress import get_workprogresses # noqa F401 +from idds.core.processings import get_processings # noqa F401 +from idds.core import transforms as core_transforms # noqa F401 +from idds.core.transforms import release_inputs_by_collection, release_inputs_by_collection_old # noqa F401 +from idds.workflowv2.workflow import Workflow # noqa F401 +from idds.workflowv2.work import Work # noqa F401 + + +setup_logging(__name__) + +output_total = 0 +output_processed = 0 +reqs = get_requests(with_transform=True) +for req in reqs: + if "HSC" in req['name'] or "hsc" in req['name']: + print("name: %s, output_total: %s, output_processed: %s" % (req['name'], req['output_total_files'], req['output_processed_files'])) + if req['output_total_files'] and req['output_processed_files']: + output_total += req['output_total_files'] + output_processed += req['output_processed_files'] + +print("Total: %s, processed: %s" % (output_total, output_processed)) From 300369e5538a599c550bec5c60829573b8e9e5c6 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 13:04:23 +0100 Subject: [PATCH 76/91] support OIDC_AUTH based token environment --- client/lib/idds/client/clientmanager.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/client/lib/idds/client/clientmanager.py b/client/lib/idds/client/clientmanager.py index 8e61d97e..79740b2f 100644 --- a/client/lib/idds/client/clientmanager.py +++ b/client/lib/idds/client/clientmanager.py @@ -121,6 +121,9 @@ def get_config_value(self, configuration, section, name, current, default): 'auth_no_verify': 'IDDS_AUTH_NO_VERIFY', 'enable_json_outputs': 'IDDS_ENABLE_JSON_OUTPUTS'} + additional_name_envs = {'oidc_token': 'OIDC_AUTH_ID_TOKEN', + 'oidc_token_file': 'OIDC_AUTH_TOKEN_FILE', + 'vo': 'OIDC_AUTH_VO'} if not section: section = self.get_section(name) @@ -128,6 +131,10 @@ def get_config_value(self, configuration, section, name, current, default): env_value = os.environ.get(name_envs[name], None) if env_value and len(env_value.strip()) > 0: return env_value + if name in additional_name_envs: 
+ env_value = os.environ.get(additional_name_envs[name], None) + if env_value and len(env_value.strip()) > 0: + return env_value if configuration and type(configuration) in [str]: config = ConfigParser.ConfigParser() From d9afb9447798029a61d63603d4e942a19e2f74a4 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 14:06:59 +0100 Subject: [PATCH 77/91] add syslog-ng --- Dockerfile | 6 ++ main/tools/syslog-ng/commands | 3 + main/tools/syslog-ng/config_syslog_ng.py | 77 ++++++++++++++++++++++++ main/tools/syslog-ng/http.conf | 29 +++++++++ main/tools/syslog-ng/idds.conf | 40 ++++++++++++ main/tools/syslog-ng/syslog-ng.conf | 70 +++++++++++++++++++++ start-daemon.sh | 11 ++++ 7 files changed, 236 insertions(+) create mode 100644 main/tools/syslog-ng/commands create mode 100644 main/tools/syslog-ng/config_syslog_ng.py create mode 100644 main/tools/syslog-ng/http.conf create mode 100644 main/tools/syslog-ng/idds.conf create mode 100644 main/tools/syslog-ng/syslog-ng.conf diff --git a/Dockerfile b/Dockerfile index 5bdf58ce..597f2b53 100644 --- a/Dockerfile +++ b/Dockerfile @@ -136,6 +136,12 @@ RUN sed -i "s/WSGISocketPrefix\ \/var\/log\/idds\/wsgisocks\/wsgi/WSGISocketPref # for idds daemons RUN ln -fs /opt/idds/config/idds/supervisord_idds.ini /etc/supervisord.d/idds.ini +# for syslog-ng +RUN mv /etc/syslog-ng/syslog-ng.conf /etc/syslog-ng/syslog-ng.conf.back +ADD main/tools/syslog-ng/syslog-ng.conf /etc/syslog-ng/ +ADD main/tools/syslog-ng/idds.conf /etc/syslog-ng/conf.d/ +ADD main/tools/syslog-ng/http.conf /etc/syslog-ng/conf.d/ + RUN chmod -R 777 /opt/idds/config RUN chmod -R 777 /var/log/idds RUN chmod -R 777 /var/idds diff --git a/main/tools/syslog-ng/commands b/main/tools/syslog-ng/commands new file mode 100644 index 00000000..dfd1cade --- /dev/null +++ b/main/tools/syslog-ng/commands @@ -0,0 +1,3 @@ +python main/tools/syslog-ng/config_syslog_ng.py -s '/var/log/idds/*.log' -d /dev/stdout -f IDDS -p -c /etc/syslog-ng/conf.d/idds.conf +python main/tools/syslog-ng/config_syslog_ng.py -s '/var/log/idds/http*log' -d /dev/stdout -f IDDS -p -c /etc/syslog-ng/conf.d/http.conf + diff --git a/main/tools/syslog-ng/config_syslog_ng.py b/main/tools/syslog-ng/config_syslog_ng.py new file mode 100644 index 00000000..63fcce6b --- /dev/null +++ b/main/tools/syslog-ng/config_syslog_ng.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +import argparse +import logging + +import os +import glob +import string + + +def get_files(source): + sources = [] + for name in glob.glob(source): + sources.append(name) + return sources + + +def get_file_template(): + template = """source s_${filename} { + file("$source"); +}; +destination d_${filename} { + file( + "${destination}" + template("$${ISODATE} ${flag} ${filename} $${HOST} $${MESSAGE}\\n")); +}; +log { source(s_${filename}); destination(d_${filename}); }; + +""" + return string.Template(template) + + +def get_pipe_template(): + template = """source s_${filename} { + file("$source"); +}; +destination d_${filename} { + pipe( + "${destination}" + template("$${ISODATE} ${flag} ${filename} $${HOST} $${MESSAGE}\\n")); +}; +log { source(s_${filename}); destination(d_${filename}); }; + +""" + return string.Template(template) + + +def generate_source_dest_pair(source, destination, flag, pipe=False): + filename = os.path.basename(source).replace(".log", "").replace("_log", "") + if pipe: + template = get_pipe_template() + else: + template = get_file_template() + ret = template.substitute(filename=filename, source=source, destination=destination, flag=flag) + return 
ret + + +def generate_config(config_file, source, destination, flag, pipe=False): + with open(config_file, 'w') as fd: + sources = get_files(source) + for src in sources: + src_dest = generate_source_dest_pair(src, destination, flag, pipe) + fd.write(src_dest) + + +logging.getLogger().setLevel(logging.INFO) +parser = argparse.ArgumentParser(description="Configure syslog-ng") +parser.add_argument('-s', '--source', default=None, help='Source files') +parser.add_argument('-d', '--destination', default=None, help='Destination file name') +parser.add_argument('-f', '--flag', default=None, help='Flag name') +parser.add_argument('-c', '--config', default=None, help='Configuration file to be generated') +parser.add_argument('-p', "--pipe", action="store_true", default=False, help='Use pipe') +args = parser.parse_args() + + +if __name__ == '__main__': + generate_config(args.config, args.source, args.destination, args.flag, args.pipe) diff --git a/main/tools/syslog-ng/http.conf b/main/tools/syslog-ng/http.conf new file mode 100644 index 00000000..975d7f81 --- /dev/null +++ b/main/tools/syslog-ng/http.conf @@ -0,0 +1,29 @@ +source s_httpd_error { + file("/var/log/idds/httpd_error_log"); +}; +destination d_httpd_error { + pipe( + "/dev/stdout" + template("${ISODATE} IDDS httpd_error ${HOST} ${MESSAGE}\n")); +}; +log { source(s_httpd_error); destination(d_httpd_error); }; + +source s_httpd_access { + file("/var/log/idds/httpd_access_log"); +}; +destination d_httpd_access { + pipe( + "/dev/stdout" + template("${ISODATE} IDDS httpd_access ${HOST} ${MESSAGE}\n")); +}; +log { source(s_httpd_access); destination(d_httpd_access); }; + +source s_httpd_ssl { + file("/var/log/idds/httpd_ssl_log"); +}; +destination d_httpd_ssl { + pipe( + "/dev/stdout" + template("${ISODATE} IDDS httpd_ssl ${HOST} ${MESSAGE}\n")); +}; +log { source(s_httpd_ssl); destination(d_httpd_ssl); }; diff --git a/main/tools/syslog-ng/idds.conf b/main/tools/syslog-ng/idds.conf new file mode 100644 index 00000000..51b7cde9 --- /dev/null +++ b/main/tools/syslog-ng/idds.conf @@ -0,0 +1,40 @@ +source s_Receiver { + file("/var/log/idds/Receiver.log"); +}; +destination d_Receiver { + pipe( + "/dev/stdout" + template("${ISODATE} IDDS Receiver ${HOST} ${MESSAGE}\n")); +}; +log { source(s_Receiver); destination(d_Receiver); }; + +source s_idds-server-stdout { + file("/var/log/idds/idds-server-stdout.log"); +}; +destination d_idds-server-stdout { + pipe( + "/dev/stdout" + template("${ISODATE} IDDS stdout ${HOST} ${MESSAGE}\n")); +}; +log { source(s_idds-server-stdout); destination(d_idds-server-stdout); }; + +source s_Conductor { + file("/var/log/idds/Conductor.log"); +}; +destination d_Conductor { + pipe( + "/dev/stdout" + template("${ISODATE} IDDS Conductor ${HOST} ${MESSAGE}\n")); +}; +log { source(s_Conductor); destination(d_Conductor); }; + +source s_idds-server-stderr { + file("/var/log/idds/idds-server-stderr.log"); +}; +destination d_idds-server-stderr { + pipe( + "/dev/stdout" + template("${ISODATE} IDDS stderr ${HOST} ${MESSAGE}\n")); +}; +log { source(s_idds-server-stderr); destination(d_idds-server-stderr); }; + diff --git a/main/tools/syslog-ng/syslog-ng.conf b/main/tools/syslog-ng/syslog-ng.conf new file mode 100644 index 00000000..6a4726e6 --- /dev/null +++ b/main/tools/syslog-ng/syslog-ng.conf @@ -0,0 +1,70 @@ +@version:3.5 +@include "scl.conf" + +# syslog-ng configuration file. +# +# This should behave pretty much like the original syslog on RedHat. But +# it could be configured a lot smarter. 
+# +# See syslog-ng(8) and syslog-ng.conf(5) for more information. +# +# Note: it also sources additional configuration files (*.conf) +# located in /etc/syslog-ng/conf.d/ + +options { + flush_lines (0); + time_reopen (10); + log_fifo_size (1000); + chain_hostnames (off); + use_dns (no); + use_fqdn (no); + create_dirs (no); + keep_hostname (yes); +}; + +source s_sys { + system(); + internal(); + # udp(ip(0.0.0.0) port(514)); +}; + +destination d_cons { file("/dev/console"); }; +destination d_mesg { file("/var/log/messages"); }; +destination d_auth { file("/var/log/secure"); }; +destination d_mail { file("/var/log/maillog" flush_lines(10)); }; +destination d_spol { file("/var/log/spooler"); }; +destination d_boot { file("/var/log/boot.log"); }; +destination d_cron { file("/var/log/cron"); }; +destination d_kern { file("/var/log/kern"); }; +destination d_mlal { usertty("*"); }; + +filter f_kernel { facility(kern); }; +filter f_default { level(info..emerg) and + not (facility(mail) + or facility(authpriv) + or facility(cron)); }; +filter f_auth { facility(authpriv); }; +filter f_mail { facility(mail); }; +filter f_emergency { level(emerg); }; +filter f_news { facility(uucp) or + (facility(news) + and level(crit..emerg)); }; +filter f_boot { facility(local7); }; +filter f_cron { facility(cron); }; + +#log { source(s_sys); filter(f_kernel); destination(d_cons); }; +# log { source(s_sys); filter(f_kernel); destination(d_kern); }; +# log { source(s_sys); filter(f_default); destination(d_mesg); }; +# log { source(s_sys); filter(f_auth); destination(d_auth); }; +# log { source(s_sys); filter(f_mail); destination(d_mail); }; +# log { source(s_sys); filter(f_emergency); destination(d_mlal); }; +# log { source(s_sys); filter(f_news); destination(d_spol); }; +# log { source(s_sys); filter(f_boot); destination(d_boot); }; +# log { source(s_sys); filter(f_cron); destination(d_cron); }; + + +# Source additional configuration files (.conf extension only) +@include "/etc/syslog-ng/conf.d/*.conf" + + +# vim:ft=syslog-ng:ai:si:ts=4:sw=4:et: diff --git a/start-daemon.sh b/start-daemon.sh index b118ef88..fd66a72e 100755 --- a/start-daemon.sh +++ b/start-daemon.sh @@ -1,4 +1,12 @@ #!/bin/sh +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 - 2023 IDDS_SERVICE=$1 @@ -187,4 +195,7 @@ else exec "$@" fi +echo "start syslog-ng" +/usr/sbin/syslog-ng --no-caps + trap : TERM INT; sleep infinity & wait From a183d772f0321fdf588dab86af5cec15ef2b8745 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 16:35:12 +0100 Subject: [PATCH 78/91] rotate logs and start logs to start output --- Dockerfile | 2 ++ main/config_default/supervisord_httpd.ini | 25 +++++++++++++++++++ main/config_default/supervisord_idds.ini | 6 ++--- main/config_default/supervisord_iddsfake.ini | 24 ++++++++++++++++++ main/config_default/supervisord_logrotate.ini | 24 ++++++++++++++++++ start-daemon.sh | 4 ++- 6 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 main/config_default/supervisord_httpd.ini create mode 100644 main/config_default/supervisord_iddsfake.ini create mode 100644 main/config_default/supervisord_logrotate.ini diff --git a/Dockerfile b/Dockerfile index 597f2b53..dfc93fe5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -135,6 +135,8 @@ RUN sed -i "s/WSGISocketPrefix\ \/var\/log\/idds\/wsgisocks\/wsgi/WSGISocketPref # for idds daemons RUN ln -fs /opt/idds/config/idds/supervisord_idds.ini /etc/supervisord.d/idds.ini +RUN ln -fs /opt/idds/config/idds/supervisord_iddsfake.ini /etc/supervisord.d/iddsfake.ini +RUN ln -fs /opt/idds/config/idds/supervisord_httpd.ini /etc/supervisord.d/httpd.ini # for syslog-ng RUN mv /etc/syslog-ng/syslog-ng.conf /etc/syslog-ng/syslog-ng.conf.back diff --git a/main/config_default/supervisord_httpd.ini b/main/config_default/supervisord_httpd.ini new file mode 100644 index 00000000..06d7794e --- /dev/null +++ b/main/config_default/supervisord_httpd.ini @@ -0,0 +1,25 @@ +[program:httpd] +environment = + RUCIO_HOME=/opt/idds/, + RUCIO_ACCOUNT=pilot, + RUCIO_AUTH_TYPE=x509_proxy, + X509_USER_PROXY=/opt/idds/config/x509up +;command=/opt/idds/bin/run-idds +;command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" +# command=bash -c "trap 'kill -TERM $PID; wait $PID' TERM && source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds && PID=$! && wait $PID" +#command=apachectl -D "FOREGROUND" -k start +command=/usr/sbin/httpd -DFOREGROUND +# process_name=%(process_num)02d +# user=atlpan +childlogdir=/var/log/idds +stdout_logfile=/var/log/idds/httpd_access_log +stderr_logfile=/var/log/idds/httpd_error_log +stdout_logfile_maxbytes=2GB +stderr_logfile_maxbytes=2GB +stdout_logfile_backups=3 +stderr_logfile_backups=3 +redirect_stderr=false +autorestart=true +stopsignal=TERM +stopasgroup=true +exitcodes=1 diff --git a/main/config_default/supervisord_idds.ini b/main/config_default/supervisord_idds.ini index 66a3c50b..b3b47b71 100644 --- a/main/config_default/supervisord_idds.ini +++ b/main/config_default/supervisord_idds.ini @@ -7,15 +7,15 @@ environment = ;command=/opt/idds/bin/run-idds ;command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" command=bash -c "trap 'kill -TERM $PID; wait $PID' TERM && source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds && PID=$! 
&& wait $PID" -process_name=%(process_num)02d +# process_name=%(process_num)02d # user=atlpan childlogdir=/var/log/idds stdout_logfile=/var/log/idds/%(program_name)s-stdout.log stderr_logfile=/var/log/idds/%(program_name)s-stderr.log stdout_logfile_maxbytes=2GB stderr_logfile_maxbytes=2GB -stdout_logfile_backups=10 -stderr_logfile_backups=10 +stdout_logfile_backups=5 +stderr_logfile_backups=5 redirect_stderr=false autorestart=true stopsignal=TERM diff --git a/main/config_default/supervisord_iddsfake.ini b/main/config_default/supervisord_iddsfake.ini new file mode 100644 index 00000000..a9471fcc --- /dev/null +++ b/main/config_default/supervisord_iddsfake.ini @@ -0,0 +1,24 @@ +[program:idds-server] +environment = + RUCIO_HOME=/opt/idds/, + RUCIO_ACCOUNT=pilot, + RUCIO_AUTH_TYPE=x509_proxy, + X509_USER_PROXY=/opt/idds/config/x509up +;command=/opt/idds/bin/run-idds +;command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" +;command=bash -c "trap 'kill -TERM $PID; wait $PID' TERM && source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds && PID=$! && wait $PID" +command=bash -c "trap 'kill -TERM $PID; wait $PID' TERM && /opt/idds/bin/run-idds-fake && PID=$! && wait $PID" +# process_name=%(process_num)02d +# user=atlpan +childlogdir=/var/log/idds +stdout_logfile=/var/log/idds/Conductor.log +stderr_logfile=/var/log/idds/Receiver.log +stdout_logfile_maxbytes=2GB +stderr_logfile_maxbytes=2GB +stdout_logfile_backups=5 +stderr_logfile_backups=5 +redirect_stderr=false +autorestart=true +stopsignal=TERM +stopasgroup=true +exitcodes=1 diff --git a/main/config_default/supervisord_logrotate.ini b/main/config_default/supervisord_logrotate.ini new file mode 100644 index 00000000..7f2af811 --- /dev/null +++ b/main/config_default/supervisord_logrotate.ini @@ -0,0 +1,24 @@ +[program:idds-server] +environment = + RUCIO_HOME=/opt/idds/, + RUCIO_ACCOUNT=pilot, + RUCIO_AUTH_TYPE=x509_proxy, + X509_USER_PROXY=/opt/idds/config/x509up +;command=/opt/idds/bin/run-idds +;command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" +;command=bash -c "trap 'kill -TERM $PID; wait $PID' TERM && source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds && PID=$! && wait $PID" +command=/usr/sbin/logrotate -d /etc/logrotate.d/idds +# process_name=%(process_num)02d +# user=atlpan +childlogdir=/var/log/idds +stdout_logfile=/var/log/idds/%(program_name)s-stdout.log +stderr_logfile=/var/log/idds/%(program_name)s-stderr.log +stdout_logfile_maxbytes=2GB +stderr_logfile_maxbytes=2GB +stdout_logfile_backups=5 +stderr_logfile_backups=5 +redirect_stderr=false +autorestart=true +stopsignal=TERM +stopasgroup=true +exitcodes=1 diff --git a/start-daemon.sh b/start-daemon.sh index fd66a72e..99480f7b 100755 --- a/start-daemon.sh +++ b/start-daemon.sh @@ -111,6 +111,8 @@ if [ -f /opt/idds/config/idds/supervisord_idds.ini ]; then else echo "supervisord conf not found. will use the default one." 
cp /opt/idds/config_default/supervisord_idds.ini /opt/idds/config/idds/supervisord_idds.ini + cp /opt/idds/config_default/supervisord_iddsfake.ini /opt/idds/config/idds/supervisord_iddsfake.ini + cp /opt/idds/config_default/supervisord_httpd.ini /opt/idds/config/idds/supervisord_httpd.ini fi if [ -f /etc/grid-security/hostkey.pem ]; then @@ -187,7 +189,7 @@ elif [ "${IDDS_SERVICE}" == "daemon" ]; then /usr/bin/supervisord -c /etc/supervisord.conf elif [ "${IDDS_SERVICE}" == "all" ]; then echo "starting iDDS rest service" - /usr/sbin/httpd + # /usr/sbin/httpd echo "starting iDDS daemon service" /usr/bin/supervisord -c /etc/supervisord.conf From 00c005f3fe5ba1e95d86ebf1bb3707b519a192ac Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 16:35:58 +0100 Subject: [PATCH 79/91] log rotate --- main/bin/run-idds-fake | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 main/bin/run-idds-fake diff --git a/main/bin/run-idds-fake b/main/bin/run-idds-fake new file mode 100644 index 00000000..bf142a18 --- /dev/null +++ b/main/bin/run-idds-fake @@ -0,0 +1,5 @@ +#!/bin/bash + +while true; do + sleep 10000000 +done From aee2b14045c93e0d3d1cd91a697bc5f2e37311b7 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 17:01:36 +0100 Subject: [PATCH 80/91] fix idds-fake daemon --- main/config_default/supervisord_iddsfake.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/config_default/supervisord_iddsfake.ini b/main/config_default/supervisord_iddsfake.ini index a9471fcc..b5cdbcee 100644 --- a/main/config_default/supervisord_iddsfake.ini +++ b/main/config_default/supervisord_iddsfake.ini @@ -1,4 +1,4 @@ -[program:idds-server] +[program:idds-fake] environment = RUCIO_HOME=/opt/idds/, RUCIO_ACCOUNT=pilot, From a7d240e53aec7092f33c4a066a27d84fc984a0b5 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 17:27:39 +0100 Subject: [PATCH 81/91] add syslog-ng to supervisord --- Dockerfile | 1 + main/config_default/supervisord_syslog-ng.ini | 19 +++++++++ main/tools/syslog-ng/syslog-ng.conf | 40 ------------------- start-daemon.sh | 1 + 4 files changed, 21 insertions(+), 40 deletions(-) create mode 100644 main/config_default/supervisord_syslog-ng.ini diff --git a/Dockerfile b/Dockerfile index dfc93fe5..203c53d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -137,6 +137,7 @@ RUN sed -i "s/WSGISocketPrefix\ \/var\/log\/idds\/wsgisocks\/wsgi/WSGISocketPref RUN ln -fs /opt/idds/config/idds/supervisord_idds.ini /etc/supervisord.d/idds.ini RUN ln -fs /opt/idds/config/idds/supervisord_iddsfake.ini /etc/supervisord.d/iddsfake.ini RUN ln -fs /opt/idds/config/idds/supervisord_httpd.ini /etc/supervisord.d/httpd.ini +RUN ln -fs /opt/idds/config/idds/supervisord_syslog-ng.ini /etc/supervisord.d/syslog-ng.ini # for syslog-ng RUN mv /etc/syslog-ng/syslog-ng.conf /etc/syslog-ng/syslog-ng.conf.back diff --git a/main/config_default/supervisord_syslog-ng.ini b/main/config_default/supervisord_syslog-ng.ini new file mode 100644 index 00000000..aae885c8 --- /dev/null +++ b/main/config_default/supervisord_syslog-ng.ini @@ -0,0 +1,19 @@ +[program:syslog-ng] +;command=/opt/idds/bin/run-idds +;command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" +;command=bash -c "trap 'kill -TERM $PID; wait $PID' TERM && source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds && PID=$! 
&& wait $PID" +command=/usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng.persist +# process_name=%(process_num)02d +# user=atlpan +childlogdir=/var/log/idds +stdout_logfile=/var/log/idds/%(program_name)s-stdout.log +stderr_logfile=/var/log/idds/%(program_name)s-stderr.log +stdout_logfile_maxbytes=2GB +stderr_logfile_maxbytes=2GB +stdout_logfile_backups=5 +stderr_logfile_backups=5 +redirect_stderr=false +autorestart=true +stopsignal=TERM +stopasgroup=true +exitcodes=1 diff --git a/main/tools/syslog-ng/syslog-ng.conf b/main/tools/syslog-ng/syslog-ng.conf index 6a4726e6..608a9c2c 100644 --- a/main/tools/syslog-ng/syslog-ng.conf +++ b/main/tools/syslog-ng/syslog-ng.conf @@ -22,46 +22,6 @@ options { keep_hostname (yes); }; -source s_sys { - system(); - internal(); - # udp(ip(0.0.0.0) port(514)); -}; - -destination d_cons { file("/dev/console"); }; -destination d_mesg { file("/var/log/messages"); }; -destination d_auth { file("/var/log/secure"); }; -destination d_mail { file("/var/log/maillog" flush_lines(10)); }; -destination d_spol { file("/var/log/spooler"); }; -destination d_boot { file("/var/log/boot.log"); }; -destination d_cron { file("/var/log/cron"); }; -destination d_kern { file("/var/log/kern"); }; -destination d_mlal { usertty("*"); }; - -filter f_kernel { facility(kern); }; -filter f_default { level(info..emerg) and - not (facility(mail) - or facility(authpriv) - or facility(cron)); }; -filter f_auth { facility(authpriv); }; -filter f_mail { facility(mail); }; -filter f_emergency { level(emerg); }; -filter f_news { facility(uucp) or - (facility(news) - and level(crit..emerg)); }; -filter f_boot { facility(local7); }; -filter f_cron { facility(cron); }; - -#log { source(s_sys); filter(f_kernel); destination(d_cons); }; -# log { source(s_sys); filter(f_kernel); destination(d_kern); }; -# log { source(s_sys); filter(f_default); destination(d_mesg); }; -# log { source(s_sys); filter(f_auth); destination(d_auth); }; -# log { source(s_sys); filter(f_mail); destination(d_mail); }; -# log { source(s_sys); filter(f_emergency); destination(d_mlal); }; -# log { source(s_sys); filter(f_news); destination(d_spol); }; -# log { source(s_sys); filter(f_boot); destination(d_boot); }; -# log { source(s_sys); filter(f_cron); destination(d_cron); }; - # Source additional configuration files (.conf extension only) @include "/etc/syslog-ng/conf.d/*.conf" diff --git a/start-daemon.sh b/start-daemon.sh index 99480f7b..7deb03a7 100755 --- a/start-daemon.sh +++ b/start-daemon.sh @@ -113,6 +113,7 @@ else cp /opt/idds/config_default/supervisord_idds.ini /opt/idds/config/idds/supervisord_idds.ini cp /opt/idds/config_default/supervisord_iddsfake.ini /opt/idds/config/idds/supervisord_iddsfake.ini cp /opt/idds/config_default/supervisord_httpd.ini /opt/idds/config/idds/supervisord_httpd.ini + cp /opt/idds/config_default/supervisord_syslog-ng.ini /opt/idds/config/idds/supervisord_syslog-ng.ini fi if [ -f /etc/grid-security/hostkey.pem ]; then From fe7dd25650a1f920588c4a41edca832f814c8110 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 18:38:44 +0100 Subject: [PATCH 82/91] fix messaging plugin to support list of brokers --- main/lib/idds/agents/common/plugins/messaging.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index b72f80c5..d1ffa7c4 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -104,6 +104,10 
@@ def connect_to_messaging_brokers(self, sender=True): for name in self.channels: channel = self.channels[name] brokers = channel['brokers'] + if type(brokers) in [list, tuple]: + pass + else: + brokers = brokers.split(",") # destination = channel['destination'] # username = channel['username'] # password = channel['password'] From 8358bc7aed8fafdc133a238c13aa79013f16add4 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 18:39:13 +0100 Subject: [PATCH 83/91] fix idds default config to support multiple channels --- main/config_default/idds.cfg | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/main/config_default/idds.cfg b/main/config_default/idds.cfg index 576d6045..0595aada 100755 --- a/main/config_default/idds.cfg +++ b/main/config_default/idds.cfg @@ -76,6 +76,12 @@ plugin.receiver.username = user plugin.receiver.password = password plugin.receiver.broker_timeout = 10 +plugin.receiver.channels = {"default": {"brokers": ["atlas-mb.cern.ch:61013"], + "destination": "/topic/doma.panda_idds", + "username": "user", + "password": "password", + "broker_timeout": 360}} + # domapandawork.life_time = 86400 domapandawork.num_retries = 0 domapandawork.poll_panda_jobs_chunk_size = 10000 @@ -94,3 +100,15 @@ plugin.notifier.destination = /queue/atlas.idds plugin.notifier.username = user plugin.notifier.password = password plugin.notifier.broker_timeout = 10 + +plugin.notifier.channels = {"default": {"brokers": ["atlas-mb.cern.ch:61013"], + "destination": "/topic/doma.idds", + "username": "user", + "password": "password", + "broker_timeout": 360}, + "ContentExt": {"brokers": ["atlas-test-mb.cern.ch:61013"], + "destination": "/queue/atlas.idds", + "username": "user", + "password": "password", + "broker_timeout": 360} + } From 381141c10d4cc066e2dc19cae72a0fa34665b2f6 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 16 Jan 2023 21:09:04 +0100 Subject: [PATCH 84/91] fix merge configmap --- main/config_default/idds.cfg | 8 ++++---- main/tools/env/merge_configmap.py | 11 ++++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/main/config_default/idds.cfg b/main/config_default/idds.cfg index 0595aada..b1763397 100755 --- a/main/config_default/idds.cfg +++ b/main/config_default/idds.cfg @@ -107,8 +107,8 @@ plugin.notifier.channels = {"default": {"brokers": ["atlas-mb.cern.ch:61013"], "password": "password", "broker_timeout": 360}, "ContentExt": {"brokers": ["atlas-test-mb.cern.ch:61013"], - "destination": "/queue/atlas.idds", - "username": "user", - "password": "password", - "broker_timeout": 360} + "destination": "/queue/atlas.idds", + "username": "user", + "password": "password", + "broker_timeout": 360} } diff --git a/main/tools/env/merge_configmap.py b/main/tools/env/merge_configmap.py index 4f8764e2..04e15723 100644 --- a/main/tools/env/merge_configmap.py +++ b/main/tools/env/merge_configmap.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2022 +# - Wen Guan, , 2022 - 2023 import argparse @@ -49,6 +49,14 @@ def as_parse_env(dct): return dct +def convert_section_json(data_conf): + for section in data_conf: + for item in data_conf[section]: + if type(data_conf[section][item]) in [list, tuple, dict]: + data_conf[section][item] = json.dumps(data_conf[section][item]) + return data_conf + + def merge_configs(source_file_path, dest_file_path): """ Merge configuration file. 
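The broker-list fix above and the JSON-valued channel options added to idds.cfg work together: a channel's brokers may arrive either as a JSON list or as a single comma-separated string, and convert_section_json() keeps such structured values as JSON text when the configmap is merged. Below is a minimal, self-contained sketch of how such an option could be read back and normalized; the [messaging] section name, the sample broker hosts, and the load_channels() helper are illustrative assumptions, not the actual iDDS plugin code.

import configparser
import json

# Hypothetical sample; in iDDS the option would live in idds.cfg with site-specific values.
SAMPLE = """
[messaging]
plugin.notifier.channels = {"default": {"brokers": "mb1.cern.ch:61013,mb2.cern.ch:61013",
                                        "destination": "/topic/doma.idds"}}
"""


def load_channels(parser, section, option):
    """Parse a JSON-valued channels option and normalize its brokers field."""
    channels = json.loads(parser.get(section, option))
    for channel in channels.values():
        brokers = channel["brokers"]
        # Brokers may be a JSON list or a comma-separated string; always hand back a list,
        # mirroring the type check added to connect_to_messaging_brokers() above.
        if not isinstance(brokers, (list, tuple)):
            brokers = [b.strip() for b in brokers.split(",")]
        channel["brokers"] = list(brokers)
    return channels


if __name__ == "__main__":
    parser = configparser.ConfigParser()
    parser.read_string(SAMPLE)
    print(load_channels(parser, "messaging", "plugin.notifier.channels"))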
@@ -60,6 +68,7 @@ def merge_configs(source_file_path, dest_file_path): data = json.load(f, object_hook=as_parse_env) if dest_file_path in data: data_conf = data[dest_file_path] + data_conf = convert_section_json(data_conf) parser = configparser.ConfigParser() parser.read(dest_file_path) parser.read_dict(data_conf) From d7d84ce0ba899458050d9f666820a35e8422dbe1 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 17 Jan 2023 09:28:45 +0100 Subject: [PATCH 85/91] redirect syslog-ng outputs --- main/config_default/supervisord_syslog-ng.ini | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main/config_default/supervisord_syslog-ng.ini b/main/config_default/supervisord_syslog-ng.ini index aae885c8..1e2dfe42 100644 --- a/main/config_default/supervisord_syslog-ng.ini +++ b/main/config_default/supervisord_syslog-ng.ini @@ -2,7 +2,7 @@ ;command=/opt/idds/bin/run-idds ;command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" ;command=bash -c "trap 'kill -TERM $PID; wait $PID' TERM && source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds && PID=$! && wait $PID" -command=/usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng.persist +command=/usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng.persist -p /var/log/idds/syslog-ng.pid # process_name=%(process_num)02d # user=atlpan childlogdir=/var/log/idds @@ -12,7 +12,8 @@ stdout_logfile_maxbytes=2GB stderr_logfile_maxbytes=2GB stdout_logfile_backups=5 stderr_logfile_backups=5 -redirect_stderr=false +redirect_stderr=True +redirect_stdout=True autorestart=true stopsignal=TERM stopasgroup=true From 16ac54073b6b940e9b6d840cb7666fc634217c88 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 17 Jan 2023 09:29:14 +0100 Subject: [PATCH 86/91] redirect syslog-ng outputs --- main/config_default/supervisord_syslog-ng.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/config_default/supervisord_syslog-ng.ini b/main/config_default/supervisord_syslog-ng.ini index 1e2dfe42..a2db8571 100644 --- a/main/config_default/supervisord_syslog-ng.ini +++ b/main/config_default/supervisord_syslog-ng.ini @@ -10,8 +10,8 @@ stdout_logfile=/var/log/idds/%(program_name)s-stdout.log stderr_logfile=/var/log/idds/%(program_name)s-stderr.log stdout_logfile_maxbytes=2GB stderr_logfile_maxbytes=2GB -stdout_logfile_backups=5 -stderr_logfile_backups=5 +stdout_logfile_backups=1 +stderr_logfile_backups=1 redirect_stderr=True redirect_stdout=True autorestart=true From 0028ad42d7e7a38ac46f0a909fe74caea49301a3 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 17 Jan 2023 10:33:21 +0100 Subject: [PATCH 87/91] put syslog-ng logs to /dev/stdout --- main/config_default/supervisord_syslog-ng.ini | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/main/config_default/supervisord_syslog-ng.ini b/main/config_default/supervisord_syslog-ng.ini index a2db8571..97a6b765 100644 --- a/main/config_default/supervisord_syslog-ng.ini +++ b/main/config_default/supervisord_syslog-ng.ini @@ -6,14 +6,16 @@ command=/usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng. 
# process_name=%(process_num)02d # user=atlpan childlogdir=/var/log/idds -stdout_logfile=/var/log/idds/%(program_name)s-stdout.log -stderr_logfile=/var/log/idds/%(program_name)s-stderr.log +# stdout_logfile=/var/log/idds/%(program_name)s-stdout.log +# stderr_logfile=/var/log/idds/%(program_name)s-stderr.log +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr stdout_logfile_maxbytes=2GB stderr_logfile_maxbytes=2GB stdout_logfile_backups=1 stderr_logfile_backups=1 -redirect_stderr=True -redirect_stdout=True +redirect_stderr=true +redirect_stdout=true autorestart=true stopsignal=TERM stopasgroup=true From 27d805832955ff0c6914ff6f0b46bba8e19059ff Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 17 Jan 2023 11:01:59 +0100 Subject: [PATCH 88/91] update docker syslog-ng --- Dockerfile | 2 +- start-daemon.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 203c53d7..cf26aa52 100644 --- a/Dockerfile +++ b/Dockerfile @@ -137,7 +137,7 @@ RUN sed -i "s/WSGISocketPrefix\ \/var\/log\/idds\/wsgisocks\/wsgi/WSGISocketPref RUN ln -fs /opt/idds/config/idds/supervisord_idds.ini /etc/supervisord.d/idds.ini RUN ln -fs /opt/idds/config/idds/supervisord_iddsfake.ini /etc/supervisord.d/iddsfake.ini RUN ln -fs /opt/idds/config/idds/supervisord_httpd.ini /etc/supervisord.d/httpd.ini -RUN ln -fs /opt/idds/config/idds/supervisord_syslog-ng.ini /etc/supervisord.d/syslog-ng.ini +# RUN ln -fs /opt/idds/config/idds/supervisord_syslog-ng.ini /etc/supervisord.d/syslog-ng.ini # for syslog-ng RUN mv /etc/syslog-ng/syslog-ng.conf /etc/syslog-ng/syslog-ng.conf.back diff --git a/start-daemon.sh b/start-daemon.sh index 7deb03a7..dbe6e68e 100755 --- a/start-daemon.sh +++ b/start-daemon.sh @@ -199,6 +199,7 @@ else fi echo "start syslog-ng" -/usr/sbin/syslog-ng --no-caps +# /usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng.persist -p /var/log/idds/syslog-ng.pid +/usr/sbin/syslog-ng --no-caps --persist-file=/var/log/idds/syslog-ng.persist -p /var/log/idds/syslog-ng.pid trap : TERM INT; sleep infinity & wait From c7ea6957379a429810aa1ef961dc59dbce40b219 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 17 Jan 2023 11:19:43 +0100 Subject: [PATCH 89/91] update docker syslog-ng --- start-daemon.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/start-daemon.sh b/start-daemon.sh index dbe6e68e..54459c41 100755 --- a/start-daemon.sh +++ b/start-daemon.sh @@ -199,7 +199,6 @@ else fi echo "start syslog-ng" -# /usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng.persist -p /var/log/idds/syslog-ng.pid -/usr/sbin/syslog-ng --no-caps --persist-file=/var/log/idds/syslog-ng.persist -p /var/log/idds/syslog-ng.pid +/usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng.persist -p /var/log/idds/syslog-ng.pid trap : TERM INT; sleep infinity & wait From 1fa44d0fc5b273871e3ce1cf58e62d7058cc2808 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 17 Jan 2023 13:09:58 +0100 Subject: [PATCH 90/91] update docker for syslog-ng --- Dockerfile | 2 +- main/config_default/supervisord_syslog-ng.ini | 8 ++++---- start-daemon.sh | 6 ++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index cf26aa52..203c53d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -137,7 +137,7 @@ RUN sed -i "s/WSGISocketPrefix\ \/var\/log\/idds\/wsgisocks\/wsgi/WSGISocketPref RUN ln -fs /opt/idds/config/idds/supervisord_idds.ini /etc/supervisord.d/idds.ini RUN ln -fs /opt/idds/config/idds/supervisord_iddsfake.ini 
/etc/supervisord.d/iddsfake.ini RUN ln -fs /opt/idds/config/idds/supervisord_httpd.ini /etc/supervisord.d/httpd.ini -# RUN ln -fs /opt/idds/config/idds/supervisord_syslog-ng.ini /etc/supervisord.d/syslog-ng.ini +RUN ln -fs /opt/idds/config/idds/supervisord_syslog-ng.ini /etc/supervisord.d/syslog-ng.ini # for syslog-ng RUN mv /etc/syslog-ng/syslog-ng.conf /etc/syslog-ng/syslog-ng.conf.back diff --git a/main/config_default/supervisord_syslog-ng.ini b/main/config_default/supervisord_syslog-ng.ini index 97a6b765..d93ce873 100644 --- a/main/config_default/supervisord_syslog-ng.ini +++ b/main/config_default/supervisord_syslog-ng.ini @@ -6,10 +6,10 @@ command=/usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng. # process_name=%(process_num)02d # user=atlpan childlogdir=/var/log/idds -# stdout_logfile=/var/log/idds/%(program_name)s-stdout.log -# stderr_logfile=/var/log/idds/%(program_name)s-stderr.log -stdout_logfile=/dev/stdout -stderr_logfile=/dev/stderr +stdout_logfile=/var/log/idds/%(program_name)s-stdout.log +stderr_logfile=/var/log/idds/%(program_name)s-stderr.log +# stdout_logfile=/dev/stdout +# stderr_logfile=/dev/stderr stdout_logfile_maxbytes=2GB stderr_logfile_maxbytes=2GB stdout_logfile_backups=1 diff --git a/start-daemon.sh b/start-daemon.sh index 54459c41..809573a0 100755 --- a/start-daemon.sh +++ b/start-daemon.sh @@ -198,7 +198,9 @@ else exec "$@" fi -echo "start syslog-ng" -/usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng.persist -p /var/log/idds/syslog-ng.pid +# echo "start syslog-ng" +# /usr/sbin/syslog-ng -F --no-caps --persist-file=/var/log/idds/syslog-ng.persist -p /var/log/idds/syslog-ng.pid +tail -f -F /var/log/idds/syslog-ng-stdout.log & +tail -f -F /var/log/idds/syslog-ng-stderr.log & trap : TERM INT; sleep infinity & wait From b6b2c848f5cc5b5d52514620109eb19749d5be37 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 18 Jan 2023 13:25:51 +0100 Subject: [PATCH 91/91] fix with_hint --- main/lib/idds/orm/contents.py | 6 +++--- main/lib/idds/orm/requests.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index 4c32fe85..80f6eb8d 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -471,7 +471,7 @@ def update_dep_contents(request_id, content_dep_ids, status, bulk_size=1000, ses params = {'substatus': status} chunks = [content_dep_ids[i:i + bulk_size] for i in range(0, len(content_dep_ids), bulk_size)] for chunk in chunks: - session.query(models.Content).with_hint(models.Content, "INDEX(CONTENTS CONTENTS_DEP_IDX)")\ + session.query(models.Content).with_hint(models.Content, "INDEX(CONTENTS CONTENTS_DEP_IDX)", "oracle")\ .filter(models.Content.request_id == request_id)\ .filter(models.Content.content_id.in_(chunk))\ .update(params, synchronize_session=False) @@ -668,7 +668,7 @@ def get_contents_ext(request_id=None, transform_id=None, workload_id=None, coll_ status = [status] query = session.query(models.Content_ext) - query = query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)") + query = query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)", "oracle") if request_id: query = query.filter(models.Content_ext.request_id == request_id) if transform_id: @@ -722,7 +722,7 @@ def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, c models.Content_ext.content_id, models.Content_ext.panda_id, models.Content_ext.status) - query = 
query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)") + query = query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)", "oracle") if request_id: query = query.filter(models.Content_ext.request_id == request_id) if transform_id: diff --git a/main/lib/idds/orm/requests.py b/main/lib/idds/orm/requests.py index 06411a13..ae0d66f7 100644 --- a/main/lib/idds/orm/requests.py +++ b/main/lib/idds/orm/requests.py @@ -186,7 +186,7 @@ def get_request_ids_by_name(name, session=None): """ try: query = session.query(models.Request.request_id, models.Request.name)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)")\ + .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", "oracle")\ .filter(models.Request.name.like(name.replace('*', '%'))) tmp = query.all() ret_ids = {}
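The with_hint fix in this last patch adds a dialect name to each hint: SQLAlchemy's with_hint() takes an optional dialect_name argument that defaults to "*" (all backends), and passing "oracle" limits the Oracle-specific INDEX(...) text to statements compiled for Oracle, so other backends simply ignore it. The following is a minimal sketch of the pattern, assuming the legacy Query.with_hint() API the patch itself uses (SQLAlchemy 1.4) and a hypothetical Request stand-in rather than the iDDS models.

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Request(Base):
    """Hypothetical stand-in for the iDDS Request model."""
    __tablename__ = "requests"
    request_id = Column(Integer, primary_key=True)
    name = Column(String(255))


# Any non-Oracle backend (here an in-memory SQLite engine) ignores the Oracle-only hint.
engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    query = (session.query(Request.request_id, Request.name)
             .with_hint(Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", "oracle")
             .filter(Request.name.like("task%")))
    # The hint text is rendered into the SQL only when an Oracle dialect compiles the query.
    print(query.all())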