diff --git a/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py b/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py index bf0dd7b0..d4ec0c52 100644 --- a/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py +++ b/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 - 2022 +# - Wen Guan, , 2020 - 2023 try: @@ -47,6 +47,7 @@ def __init__(self, task_parameters=None, # maxwalltime=90000, maxattempt=5, core_count=1, # encode_command_line=False, num_retries=5, + use_rucio=False, # task_log=None, # task_cloud=None, # task_rss=0 @@ -84,6 +85,7 @@ def __init__(self, task_parameters=None, self.retry_number = 0 self.num_retries = num_retries + self.use_rucio = use_rucio self.load_panda_urls() def my_condition(self): @@ -155,6 +157,8 @@ def set_agent_attributes(self, attrs, req_attributes=None): super(ATLASPandaWork, self).set_agent_attributes(attrs) if self.agent_attributes and 'num_retries' in self.agent_attributes and self.agent_attributes['num_retries']: self.num_retries = int(self.agent_attributes['num_retries']) + if self.class_name in attrs and 'use_rucio' in attrs[self.class_name]: + self.use_rucio = attrs[self.class_name]['use_rucio'] def parse_task_parameters(self, task_parameters): if self.task_parameters: @@ -307,17 +311,18 @@ def poll_external_collection(self, coll): else: try: if not coll.coll_type == CollectionType.PseudoDataset: - client = self.get_rucio_client() - did_meta = client.get_metadata(scope=coll.scope, name=coll.name) - - coll.coll_metadata['bytes'] = did_meta['bytes'] - coll.coll_metadata['total_files'] = did_meta['length'] - coll.coll_metadata['availability'] = did_meta['availability'] - coll.coll_metadata['events'] = did_meta['events'] - coll.coll_metadata['is_open'] = did_meta['is_open'] - coll.coll_metadata['run_number'] = did_meta['run_number'] - coll.coll_metadata['did_type'] = did_meta['did_type'] - coll.coll_metadata['list_all_files'] = False + if self.use_rucio: + client = self.get_rucio_client() + did_meta = client.get_metadata(scope=coll.scope, name=coll.name) + + coll.coll_metadata['bytes'] = did_meta['bytes'] + coll.coll_metadata['total_files'] = did_meta['length'] + coll.coll_metadata['availability'] = did_meta['availability'] + coll.coll_metadata['events'] = did_meta['events'] + coll.coll_metadata['is_open'] = did_meta['is_open'] + coll.coll_metadata['run_number'] = did_meta['run_number'] + coll.coll_metadata['did_type'] = did_meta['did_type'] + coll.coll_metadata['list_all_files'] = False if 'is_open' in coll.coll_metadata and not coll.coll_metadata['is_open']: coll_status = CollectionStatus.Closed diff --git a/common/lib/idds/common/authentication.py b/common/lib/idds/common/authentication.py index c44541b9..d457b5e2 100644 --- a/common/lib/idds/common/authentication.py +++ b/common/lib/idds/common/authentication.py @@ -384,6 +384,8 @@ def get_user_name_from_dn1(dn): username = up.sub('', dn) up2 = re.compile('/CN=[0-9]+') username = up2.sub('', username) + up2 = re.compile('/CN=[0-9]+') + username = up2.sub('', username) up3 = re.compile(' [0-9]+') username = up3.sub('', username) up4 = re.compile('_[0-9]+') @@ -421,6 +423,8 @@ def get_user_name_from_dn2(dn): username = up.sub('', dn) up2 = re.compile(',CN=[0-9]+') username = up2.sub('', username) + up2 = re.compile('CN=[0-9]+,') + username = up2.sub(',', username) up3 = re.compile(' [0-9]+') username = up3.sub('', username) up4 = re.compile('_[0-9]+') @@ -428,8 +432,11 @@ def get_user_name_from_dn2(dn): username = 
username.replace(',CN=proxy', '') username = username.replace(',CN=limited proxy', '') username = username.replace('limited proxy', '') + username = re.sub(',CN=Robot:[^/]+,', ',', username) username = re.sub(',CN=Robot:[^/]+', '', username) + username = re.sub(',CN=Robot[^/]+,', ',', username) username = re.sub(',CN=Robot[^/]+', '', username) + username = re.sub(',CN=nickname:[^/]+,', ',', username) username = re.sub(',CN=nickname:[^/]+', '', username) pat = re.compile('.*,CN=([^\,]+),CN=([^\,]+)') # noqa W605 mat = pat.match(username) diff --git a/common/lib/idds/common/constants.py b/common/lib/idds/common/constants.py index d827dddd..1793cd02 100644 --- a/common/lib/idds/common/constants.py +++ b/common/lib/idds/common/constants.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2022 +# - Wen Guan, , 2019 - 2023 """ Constants. @@ -32,6 +32,8 @@ class Sections: Consumer = 'consumer' EventBus = 'eventbus' Cache = 'cache' + Archiver = 'archiver' + Coordinator = 'coordinator' class HTTP_STATUS_CODE: @@ -288,6 +290,7 @@ class ContentStatus(IDDSEnum): Mapped = 7 FakeAvailable = 8 Missing = 9 + Cancelled = 10 class ContentLocking(IDDSEnum): @@ -295,6 +298,12 @@ class ContentLocking(IDDSEnum): Locking = 1 +class ContentFetchStatus(IDDSEnum): + New = 0 + Fetching = 1 + Fetched = 2 + + class GranularityType(IDDSEnum): File = 0 Event = 1 @@ -338,6 +347,12 @@ class ProcessingLocking(IDDSEnum): Locking = 1 +class HealthStatus(IDDSEnum): + Default = 0 + InActive = 1 + Active = 2 + + class MessageType(IDDSEnum): StageInFile = 0 StageInCollection = 1 @@ -469,6 +484,12 @@ class CommandLocation(IDDSEnum): Other = 6 +class ReturnCode(IDDSEnum): + Ok = 0 + Failed = 255 + Locked = 1 + + def get_work_status_from_transform_processing_status(status): if status in [ProcessingStatus.New, TransformStatus.New]: return WorkStatus.New diff --git a/common/lib/idds/common/event.py b/common/lib/idds/common/event.py new file mode 100644 index 00000000..58cf167c --- /dev/null +++ b/common/lib/idds/common/event.py @@ -0,0 +1,470 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 - 2023 + +import time +import uuid + +from deepdiff import DeepDiff + +from idds.common.constants import IDDSEnum +from idds.common.dict_class import DictClass +from idds.common.utils import json_dumps, merge_dict + + +class EventBusState(IDDSEnum): + New = 0 + Master = 1 + Slave = 2 + Unknown = 3 + + +class EventType(IDDSEnum): + Event = 0 + StateClaim = 1 + Demand = 2 + Message = 3 + + NewRequest = 10 + UpdateRequest = 11 + AbortRequest = 12 + ResumeRequest = 13 + ExpireRequest = 14 + + NewTransform = 20 + UpdateTransform = 21 + AbortTransform = 22 + ResumeTransform = 23 + + NewProcessing = 30 + UpdateProcessing = 31 + AbortProcessing = 32 + ResumeProcessing = 33 + SyncProcessing = 34 + TerminatedProcessing = 35 + TriggerProcessing = 36 + MsgTriggerProcessing = 37 + + UpdateCommand = 40 + + Test = 90 + + +class EventPriority(IDDSEnum): + Low = 0 + Medium = 10 + High = 50 + + +class EventStatus(IDDSEnum): + New = 0 + Processing = 1 + Processed = 2 + + +class Event(DictClass): + def __init__(self, publisher_id=None, event_type=EventType.Event, content=None, counter=1): + self._id = str(uuid.uuid4()) + self._publisher_id = publisher_id + self._event_type = event_type + self._timestamp = time.time() + self._counter = counter + self._content = content + self.has_changes = False + self._requeue_counter = 0 + + def get_event_id(self): + return uuid.UUID(self._id).int % 1000000 + + @property + def event_type(self): + return self._event_type.name + + def able_to_merge(self, event): + if self._event_type == event._event_type and self.get_event_id() == event.get_event_id(): + return True + if self._event_type == event._event_type and self.get_event_id() == event.get_event_id() and self._counter == event._counter: + return True + # if (self._content is None and event._content is None): + # return True + # elif (self._content is not None and event._content is not None): + # ddiff = DeepDiff(self._content, event._content, ignore_order=True) + # if not ddiff: + # return True + return False + + def changed(self): + return self.has_changes + + def merge(self, event): + self.has_changes = False + if self.able_to_merge(event): + if event._counter: + if self._counter is None: + self._counter = event._counter + self.has_changes = True + elif self._counter and event._counter and self._counter < event._counter: + self._counter = event._counter + self.has_changes = True + + if event._content: + if self._content is None: + self._content = event._content + self.has_changes = True + else: + ddiff = DeepDiff(self._content, event._content, ignore_order=True) + if ddiff: + self._content = merge_dict(self._content, event._content) + self.has_changes = True + return True, None + else: + return False, event + + def requeue(self): + self._requeue_counter += 1 + + def get_requeue_counter(self): + return self._requeue_counter + + def to_json(self, strip=False): + ret = {'id': self._id, 'publisher_id': self._publisher_id, + 'event_type': (self._event_type.name, self._event_type.value), + 'timestamp': self._timestamp, + 'counter': self._counter, + 'content': self._content} + return ret + + def __str__(self): + return json_dumps(self.to_json()) + + def clean(self): + pass + + def fail(self): + pass + + def set_terminating(self): + if self._content is None: + self._content = {} + self._content['is_terminating'] = True + + def is_terminating(self): + if self._content and ('is_terminating' in 
self._content and self._content['is_terminating']): + return True + + def set_has_updates(self): + if self._content is None: + self._content = {} + self._content['has_updates'] = True + + def has_updates(self): + if self._content and ('has_updates' in self._content and self._content['has_updates'] + or 'num_to_update_contents' in self._content and self._content['num_to_update_contents']): # noqa W503, E125, E128 + return True + + +class StateClaimEvent(Event): + def __init__(self, publisher_id=None, event_bus_state=None, content=None, counter=1): + super(StateClaimEvent, self).__init__(publisher_id, event_type=EventType.StateClaim, content=content, counter=counter) + self._event_bus_state = event_bus_state + + def to_json(self, strip=False): + ret = super(StateClaimEvent, self).to_json() + ret['event_bus_state'] = self._event_bus_state + return ret + + +class TestEvent(Event): + def __init__(self, publisher_id=None, content=None, counter=1): + super(TestEvent, self).__init__(publisher_id, event_type=EventType.Test, content=content, counter=counter) + + def to_json(self, strip=False): + ret = super(TestEvent, self).to_json() + return ret + + +class DemandEvent(Event): + def __init__(self, publisher_id=None, demand_type=None, content=None, counter=1): + super(DemandEvent, self).__init__(publisher_id, event_type=EventType.Demand, content=content, counter=counter) + self._demand_type = demand_type + + def to_json(self, strip=False): + ret = super(DemandEvent, self).to_json() + ret['demand_type'] = self._demand_type + return ret + + +class NewRequestEvent(Event): + def __init__(self, publisher_id=None, request_id=None, content=None, counter=1): + super(NewRequestEvent, self).__init__(publisher_id, event_type=EventType.NewRequest, content=content, counter=counter) + self._request_id = request_id + + def get_event_id(self): + return self._request_id + + def to_json(self, strip=False): + ret = super(NewRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class UpdateRequestEvent(Event): + def __init__(self, publisher_id=None, request_id=None, content=None, counter=1): + super(UpdateRequestEvent, self).__init__(publisher_id, event_type=EventType.UpdateRequest, content=content, counter=counter) + self._request_id = request_id + + def get_event_id(self): + return self._request_id + + def to_json(self, strip=False): + ret = super(UpdateRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class AbortRequestEvent(Event): + def __init__(self, publisher_id=None, request_id=None, content=None, counter=1): + super(AbortRequestEvent, self).__init__(publisher_id, event_type=EventType.AbortRequest, content=content, counter=counter) + self._request_id = request_id + + def get_event_id(self): + return self._request_id + + def to_json(self, strip=False): + ret = super(AbortRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class ResumeRequestEvent(Event): + def __init__(self, publisher_id=None, request_id=None, content=None, counter=1): + super(ResumeRequestEvent, self).__init__(publisher_id, event_type=EventType.ResumeRequest, content=content, counter=counter) + self._request_id = request_id + + def get_event_id(self): + return self._request_id + + def to_json(self, strip=False): + ret = super(ResumeRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class ExpireRequestEvent(Event): + def __init__(self, publisher_id=None, request_id=None, content=None, counter=1): + 
super(ExpireRequestEvent, self).__init__(publisher_id, event_type=EventType.ExpireRequest, content=content, counter=counter) + self._request_id = request_id + + def get_event_id(self): + return self._request_id + + def to_json(self, strip=False): + ret = super(ExpireRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class UpdateCommandEvent(Event): + def __init__(self, publisher_id=None, command_id=None, content=None, counter=1): + super(UpdateCommandEvent, self).__init__(publisher_id, event_type=EventType.UpdateCommand, content=content, counter=counter) + self._command_id = command_id + + def get_event_id(self): + return self._command_id + + def to_json(self, strip=False): + ret = super(UpdateCommandEvent, self).to_json() + ret['command_id'] = self._command_id + return ret + + +class NewTransformEvent(Event): + def __init__(self, publisher_id=None, transform_id=None, content=None, counter=1): + super(NewTransformEvent, self).__init__(publisher_id, event_type=EventType.NewTransform, content=content, counter=counter) + self._transform_id = transform_id + + def get_event_id(self): + return self._transform_id + + def to_json(self, strip=False): + ret = super(NewTransformEvent, self).to_json() + ret['transform_id'] = self._transform_id + return ret + + +class UpdateTransformEvent(Event): + def __init__(self, publisher_id=None, transform_id=None, content=None, counter=1): + super(UpdateTransformEvent, self).__init__(publisher_id, event_type=EventType.UpdateTransform, content=content, counter=counter) + self._transform_id = transform_id + + def get_event_id(self): + return self._transform_id + + def to_json(self, strip=False): + ret = super(UpdateTransformEvent, self).to_json() + ret['transform_id'] = self._transform_id + return ret + + +class AbortTransformEvent(Event): + def __init__(self, publisher_id=None, transform_id=None, content=None, counter=1): + super(AbortTransformEvent, self).__init__(publisher_id, event_type=EventType.AbortTransform, content=content, counter=counter) + self._transform_id = transform_id + + def get_event_id(self): + return self._transform_id + + def to_json(self, strip=False): + ret = super(AbortTransformEvent, self).to_json() + ret['transform_id'] = self._transform_id + return ret + + +class ResumeTransformEvent(Event): + def __init__(self, publisher_id=None, transform_id=None, content=None, counter=1): + super(ResumeTransformEvent, self).__init__(publisher_id, event_type=EventType.ResumeTransform, content=content, counter=counter) + self._transform_id = transform_id + + def get_event_id(self): + return self._transform_id + + def to_json(self, strip=False): + ret = super(ResumeTransformEvent, self).to_json() + ret['transform_id'] = self._transform_id + return ret + + +class NewProcessingEvent(Event): + def __init__(self, publisher_id=None, processing_id=None, content=None, counter=1): + super(NewProcessingEvent, self).__init__(publisher_id, event_type=EventType.NewProcessing, content=content, counter=counter) + self._processing_id = processing_id + + def get_event_id(self): + return self._processing_id + + def to_json(self, strip=False): + ret = super(NewProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class UpdateProcessingEvent(Event): + def __init__(self, publisher_id=None, processing_id=None, content=None, counter=1): + super(UpdateProcessingEvent, self).__init__(publisher_id, event_type=EventType.UpdateProcessing, content=content, counter=counter) + self._processing_id = 
processing_id + + def get_event_id(self): + return self._processing_id + + def to_json(self, strip=False): + ret = super(UpdateProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class AbortProcessingEvent(Event): + def __init__(self, publisher_id=None, processing_id=None, content=None, counter=1): + super(AbortProcessingEvent, self).__init__(publisher_id, event_type=EventType.AbortProcessing, content=content, counter=counter) + self._processing_id = processing_id + + def get_event_id(self): + return self._processing_id + + def to_json(self, strip=False): + ret = super(AbortProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class ResumeProcessingEvent(Event): + def __init__(self, publisher_id=None, processing_id=None, content=None, counter=1): + super(ResumeProcessingEvent, self).__init__(publisher_id, event_type=EventType.ResumeProcessing, content=content, counter=counter) + self._processing_id = processing_id + + def get_event_id(self): + return self._processing_id + + def to_json(self, strip=False): + ret = super(ResumeProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class SyncProcessingEvent(Event): + def __init__(self, publisher_id=None, processing_id=None, content=None, counter=1): + super(SyncProcessingEvent, self).__init__(publisher_id, event_type=EventType.SyncProcessing, content=content, counter=counter) + self._processing_id = processing_id + + def get_event_id(self): + return self._processing_id + + def to_json(self, strip=False): + ret = super(SyncProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class TerminatedProcessingEvent(Event): + def __init__(self, publisher_id=None, processing_id=None, content=None, counter=1): + super(TerminatedProcessingEvent, self).__init__(publisher_id, event_type=EventType.TerminatedProcessing, content=content, counter=counter) + self._processing_id = processing_id + + def get_event_id(self): + return self._processing_id + + def to_json(self, strip=False): + ret = super(TerminatedProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class TriggerProcessingEvent(Event): + def __init__(self, publisher_id=None, processing_id=None, content=None, counter=1): + super(TriggerProcessingEvent, self).__init__(publisher_id, event_type=EventType.TriggerProcessing, content=content, counter=counter) + self._processing_id = processing_id + + def get_event_id(self): + return self._processing_id + + def to_json(self, strip=False): + ret = super(TriggerProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class MsgTriggerProcessingEvent(Event): + def __init__(self, publisher_id=None, processing_id=None, content=None, counter=1): + super(MsgTriggerProcessingEvent, self).__init__(publisher_id, event_type=EventType.MsgTriggerProcessing, content=content, counter=counter) + self._processing_id = processing_id + + def get_event_id(self): + return self._processing_id + + def to_json(self, strip=False): + ret = super(MsgTriggerProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class MessageEvent(Event): + def __init__(self, publisher_id=None, message=None, content=None, counter=1): + super(MessageEvent, self).__init__(publisher_id, event_type=EventType.Message, content=content, counter=counter) + self._msg = message + + def get_event_id(self): + return 
uuid.UUID(self._id).int % 1000000
+
+    def get_message(self):
+        return self._msg
+
+    def to_json(self, strip=False):
+        ret = super(MessageEvent, self).to_json()
+        if not strip:
+            ret['message'] = self._msg
+        return ret
diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py
index 6b7599ca..c9294623 100644
--- a/common/lib/idds/common/utils.py
+++ b/common/lib/idds/common/utils.py
@@ -6,9 +6,10 @@
 # http://www.apache.org/licenses/LICENSE-2.0OA
 #
 # Authors:
-# - Wen Guan, , 2019 - 2021
+# - Wen Guan, , 2019 - 2023
 
+import errno
 import datetime
 import logging
 import json
@@ -523,3 +524,60 @@ def extract_scope_atlas(did, scopes):
 def truncate_string(string, length=800):
     string = (string[:length] + '...') if string and len(string) > length else string
     return string
+
+
+def merge_dict(dict1, dict2):
+    keys = list(dict1.keys())
+    for key in list(dict2.keys()):
+        if key not in keys:
+            keys.append(key)
+    for key in keys:
+        if key in dict2:
+            if key not in dict1 or dict1[key] is None:
+                dict1[key] = dict2[key]
+            else:
+                if dict2[key] is None:
+                    continue
+                elif not isinstance(dict1[key], type(dict2[key])):
+                    raise Exception("type of %s is different from %s, cannot merge" % (type(dict1[key]), type(dict2[key])))
+                elif dict1[key] == dict2[key]:
+                    continue
+                elif type(dict1[key]) in (list, tuple, str):
+                    dict1[key] = dict1[key] + dict2[key]
+                elif type(dict1[key]) in (int, float, complex):
+                    dict1[key] = dict1[key] + dict2[key]
+                elif type(dict1[key]) in (bool, bool):
+                    dict1[key] = True
+                elif type(dict1[key]) in (dict, dict):
+                    dict1[key] = merge_dict(dict1[key], dict2[key])
+    return dict1
+
+
+def pid_exists(pid):
+    """
+    Check whether pid exists in the current process table.
+    UNIX only.
+    """
+    if pid < 0:
+        return False
+    if pid == 0:
+        # According to "man 2 kill" PID 0 refers to every process
+        # in the process group of the calling process.
+        # On certain systems 0 is a valid PID but we have no way
+        # to know that in a portable fashion.
+ raise ValueError('invalid PID 0') + try: + os.kill(pid, 0) + except OSError as err: + if err.errno == errno.ESRCH: + # ESRCH == No such process + return False + elif err.errno == errno.EPERM: + # EPERM clearly means there's a process to deny access to + return True + else: + # According to "man 2 kill" possible error values are + # (EINVAL, EPERM, ESRCH) + raise + else: + return True diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index 10d11c3e..03a8dd41 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -477,10 +477,13 @@ def create_processing(self, input_output_maps=[]): self.task_name = self.task_name + "_" + str(self.get_request_id()) + "_" + str(self.get_work_id()) in_files = [] + has_dependencies = False if self.dependency_map is None: self.dependency_map = {} for job in self.dependency_map: in_files.append(job['name']) + if not has_dependencies and "dependencies" in job and job['dependencies']: + has_dependencies = True task_param_map = {} task_param_map['vo'] = self.vo @@ -491,7 +494,8 @@ def create_processing(self, input_output_maps=[]): task_param_map['workingGroup'] = self.working_group task_param_map['nFilesPerJob'] = 1 if in_files: - task_param_map['inputPreStaging'] = True + if has_dependencies: + task_param_map['inputPreStaging'] = True task_param_map['nFiles'] = len(in_files) task_param_map['noInput'] = True task_param_map['pfnList'] = in_files @@ -932,7 +936,7 @@ def get_job_maps(self, input_output_maps): if content['substatus'] in [ContentStatus.Available]: if 'panda_id' in content['content_metadata']: finished_jobs.append(content['content_metadata']['panda_id']) - elif content['substatus'] in [ContentStatus.Failed, ContentStatus.FinalFailed, + elif content['substatus'] in [ContentStatus.FinalFailed, ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: if 'panda_id' in content['content_metadata']: @@ -1028,6 +1032,9 @@ def get_contents_ext_detail(self, new_contents_ext, update_contents_ext, job_inf new_content_ext_d[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) if new_content_ext_d[job_info_item] == 'NULL': new_content_ext_d[job_info_item] = None + if new_content_ext_d[job_info_item] is None: + del new_content_ext_d[job_info_item] + new_contents_ext_d.append(new_content_ext_d) for content_id in update_contents_ext: @@ -1039,6 +1046,8 @@ def get_contents_ext_detail(self, new_contents_ext, update_contents_ext, job_inf update_content_ext_d[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) if update_content_ext_d[job_info_item] == 'NULL': update_content_ext_d[job_info_item] = None + if update_content_ext_d[job_info_item] is None: + del update_content_ext_d[job_info_item] update_contents_ext_d.append(update_content_ext_d) @@ -1105,7 +1114,7 @@ def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, j self.logger.debug("get_contents_ext, new_contents_ext_d[:1]: %s" % (str(new_contents_ext_d[:1]))) self.logger.debug("get_contents_ext, update_contents_ext_d[:1]: %s" % (str(update_contents_ext_d[:1]))) - self.logger.debug("get_contents_ext, left_contents[:3]: %s" % (str(left_contents[:3]))) + self.logger.debug("get_contents_ext, left_contents[:1]: %s" % (str(left_contents[:3]))) return new_contents_ext_d, update_contents_ext_d, left_contents def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext=None, job_info_maps={}, log_prefix=''): diff --git 
a/main/config_default/idds.cfg b/main/config_default/idds.cfg index b1763397..c2854741 100755 --- a/main/config_default/idds.cfg +++ b/main/config_default/idds.cfg @@ -3,7 +3,7 @@ # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2023 [common] #logdir = /var/log/idds @@ -23,7 +23,17 @@ cacher_dir = /var/log/idds [main] # agents = clerk, transformer, carrier, conductor -agents = clerk, transformer, submitter, poller, receiver, trigger, finisher, conductor +# agents = clerk, transformer, submitter, poller, receiver, trigger, finisher, conductor +agents = clerk, transformer, submitter, poller, receiver, trigger, finisher, conductor, archiver, coordinator + +[eventbus] +# backend = database +backend = message +# debug = True + +[coordinator] +coordination_interval_delay = 300 + [clerk] num_threads = 4 @@ -112,3 +122,9 @@ plugin.notifier.channels = {"default": {"brokers": ["atlas-mb.cern.ch:61013"], "password": "password", "broker_timeout": 360} } + +[archiver] +# days +older_than = 60 +poll_period = 1 + diff --git a/main/etc/idds/idds.cfg.template b/main/etc/idds/idds.cfg.template index d140b4ef..52028959 100755 --- a/main/etc/idds/idds.cfg.template +++ b/main/etc/idds/idds.cfg.template @@ -50,6 +50,13 @@ cacher_dir = /data/idds # agents = clerk, marshaller, transformer, carrier, conductor, consumer agents = clerk, transformer, carrier, conductor +[eventbus] +# backend = database +backend = message + +[coordinator] +coordination_interval_delay = 300 + [clerk] num_threads = 3 poll_time_period = 120 diff --git a/main/etc/idds/supervisord.d/idds.ini b/main/etc/idds/supervisord.d/idds.ini index 280a8d2b..7fcaf2a2 100644 --- a/main/etc/idds/supervisord.d/idds.ini +++ b/main/etc/idds/supervisord.d/idds.ini @@ -17,6 +17,12 @@ stderr_logfile_maxbytes=2GB stdout_logfile_backups=10 stderr_logfile_backups=10 redirect_stderr=false +conductor_logfile=/var/log/idds/Conductor.log +receiver_logfile=/var/log/idds/Receiver.log +conductor_logfile_maxbytes=2GB +receiver_logfile_maxbytes=2GB +conductor_logfile_backups=3 +receiver_logfile_backups=3 autorestart=true stopsignal=TERM stopasgroup=true diff --git a/main/etc/sql/oracle_update.sql b/main/etc/sql/oracle_update.sql index c5ec6963..542b6c9b 100644 --- a/main/etc/sql/oracle_update.sql +++ b/main/etc/sql/oracle_update.sql @@ -311,3 +311,91 @@ BEGIN (select content_id, substatus from contents where request_id = request_id_in and transform_id = transform_id_in and content_relation_type = 1) t on c.content_dep_id = t.content_id where c.substatus != t.substatus) set c_substatus = t_substatus; END; + + + +--- 2023.03.06 +drop index PROCESSINGS_STATUS_POLL_IDX; +drop index CONTENTS_REL_IDX; +drop index CONTENTS_TF_IDX; +drop index CONTENTS_EXT_RTW_IDX; +drop index CONTENTS_EXT_RTM_IDX; +drop index COMMANDS_STATUS_IDX; +drop index MESSAGES_ST_IDX; +drop index MESSAGES_TYPE_STU_IDX; +drop index REQUESTS_STATUS_POLL_IDX; +drop index TRANSFORMS_REQ_IDX; +drop index TRANSFORMS_STATUS_POLL_IDX; +drop index COLLECTIONS_REQ_IDX; + +CREATE INDEX PROCESSINGS_STATUS_POLL_IDX ON PROCESSINGS (status, processing_id, locking, updated_at, new_poll_period, update_poll_period, created_at) COMPRESS 3 LOCAL; + +CREATE INDEX CONTENTS_REL_IDX ON CONTENTS (request_id, content_relation_type, transform_id, substatus) COMPRESS 3 LOCAL; +CREATE INDEX CONTENTS_TF_IDX ON CONTENTS (transform_id, request_id, coll_id, content_relation_type, map_id) COMPRESS 4 LOCAL; + +CREATE INDEX CONTENTS_EXT_RTW_IDX ON 
contents_ext (request_id, transform_id, workload_id) COMPRESS 3 ; +CREATE INDEX CONTENTS_EXT_RTM_IDX ON contents_ext (request_id, transform_id, map_id) COMPRESS 2; + +CREATE INDEX COMMANDS_STATUS_IDX on commands (status, locking, updated_at) COMPRESS 2; + +CREATE INDEX MESSAGES_ST_IDX on messages (status, destination, created_at) COMPRESS 2; +CREATE INDEX MESSAGES_TYPE_STU_IDX on messages (msg_type, status, destination, retries, updated_at, created_at) COMPRESS 3; + +CREATE INDEX REQUESTS_STATUS_POLL_IDX on REQUESTS (status, request_id, locking, priority, updated_at, new_poll_period, update_poll_period, next_poll_at, created_at) COMPRESS 3 LOCAL; + +CREATE INDEX TRANSFORMS_REQ_IDX on transforms (request_id, transform_id) COMPRESS 2; +CREATE INDEX TRANSFORMS_STATUS_POLL_IDX on transforms (status, transform_id, locking, updated_at, new_poll_period, update_poll_period, created_at) COMPRESS 3 LOCAL; + +CREATE INDEX COLLECTIONS_REQ_IDX on collections (request_id, transform_id, updated_at) COMPRESS 2; + + +-- 2023.03.10 + +CREATE SEQUENCE EVENT_ID_SEQ MINVALUE 1 INCREMENT BY 1 START WITH 1 NOCACHE ORDER NOCYCLE GLOBAL; +CREATE TABLE EVENTS +( + event_id NUMBER(12) DEFAULT ON NULL EVENT_ID_SEQ.NEXTVAL constraint EVENT_ID_NN NOT NULL, + event_type NUMBER(12), + event_actual_id NUMBER(12), + priority NUMBER(12), + status NUMBER(2), + created_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + processing_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + processed_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + content CLOB, + CONSTRAINT EVENTS_PK PRIMARY KEY (event_id) -- USING INDEX LOCAL, +); + +CREATE TABLE EVENTS_ARCHIVE +( + event_id NUMBER(12), + event_type NUMBER(12), + event_actual_id NUMBER(12), + priority NUMBER(12), + status NUMBER(2), + created_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + processing_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + processed_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + content CLOB, + CONSTRAINT EVENTS_AR_PK PRIMARY KEY (event_id) -- USING INDEX LOCAL, +); + +CREATE TABLE EVENTS_PRIORITY +( + event_type NUMBER(12), + event_actual_id NUMBER(12), + priority NUMBER(12), + last_processed_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + updated_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + CONSTRAINT EVENTS_PR_PK PRIMARY KEY (event_type, event_actual_id) -- USING INDEX LOCAL, +); + + +--- 2023.03.16 +alter table HEALTH add (status NUMBER(2)); +alter table contents_update add content_metadata CLOB; +alter table health modify payload VARCHAR2(2048); + + +--- 2023.03.29 +alter table contents_update add fetch_status NUMBER(2) DEFAULT 0; diff --git a/main/etc/sql/postgresql.sql b/main/etc/sql/postgresql.sql index b6f5e9c5..6af99c7e 100644 --- a/main/etc/sql/postgresql.sql +++ b/main/etc/sql/postgresql.sql @@ -1,6 +1,423 @@ -CREATE USER doma_idds_r WITH PASSWORD 'Tiaroa4dr_idds'; -GRANT CONNECT ON DATABASE doma_idds TO doma_idds_r; -GRANT USAGE ON SCHEMA doma_idds TO doma_idds_r; -GRANT SELECT ON ALL TABLES IN SCHEMA doma_idds TO doma_idds_r; -ALTER DEFAULT PRIVILEGES IN SCHEMA doma_idds GRANT SELECT ON TABLES TO doma_idds_r; +--- with schema doma_idds +CREATE SEQUENCE doma_idds."REQUEST_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.requests ( + request_id BIGINT NOT NULL, + scope VARCHAR(25), + name VARCHAR(255), + requester VARCHAR(20), + request_type INTEGER NOT NULL, + username VARCHAR(20), + userdn VARCHAR(200), + transform_tag VARCHAR(20), + workload_id INTEGER, + priority INTEGER, + status INTEGER NOT NULL, + substatus 
INTEGER, + oldstatus INTEGER, + locking INTEGER NOT NULL, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + next_poll_at TIMESTAMP WITHOUT TIME ZONE, + accessed_at TIMESTAMP WITHOUT TIME ZONE, + expired_at TIMESTAMP WITHOUT TIME ZONE, + new_retries INTEGER, + update_retries INTEGER, + max_new_retries INTEGER, + max_update_retries INTEGER, + new_poll_period INTERVAL, + update_poll_period INTERVAL, + errors VARCHAR(1024), + request_metadata JSONB, + processing_metadata JSONB, + PRIMARY KEY (request_id) +); + +CREATE SEQUENCE doma_idds."WORKPROGRESS_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.workprogresses ( + workprogress_id BIGINT NOT NULL, + request_id BIGINT, + workload_id INTEGER, + scope VARCHAR(25), + name VARCHAR(255), + priority INTEGER, + status INTEGER, + substatus INTEGER, + locking INTEGER, + created_at TIMESTAMP WITHOUT TIME ZONE, + updated_at TIMESTAMP WITHOUT TIME ZONE, + next_poll_at TIMESTAMP WITHOUT TIME ZONE, + accessed_at TIMESTAMP WITHOUT TIME ZONE, + expired_at TIMESTAMP WITHOUT TIME ZONE, + errors VARCHAR(1024), + workprogress_metadata JSONB, + processing_metadata JSONB, + PRIMARY KEY (workprogress_id) +); + +CREATE SEQUENCE doma_idds."TRANSFORM_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.transforms ( + transform_id BIGINT NOT NULL, + request_id BIGINT NOT NULL, + workload_id INTEGER, + transform_type INTEGER NOT NULL, + transform_tag VARCHAR(20), + priority INTEGER, + safe2get_output_from_input INTEGER, + status INTEGER NOT NULL, + substatus INTEGER, + oldstatus INTEGER, + locking INTEGER NOT NULL, + retries INTEGER, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + next_poll_at TIMESTAMP WITHOUT TIME ZONE, + started_at TIMESTAMP WITHOUT TIME ZONE, + finished_at TIMESTAMP WITHOUT TIME ZONE, + expired_at TIMESTAMP WITHOUT TIME ZONE, + new_retries INTEGER, + update_retries INTEGER, + max_new_retries INTEGER, + max_update_retries INTEGER, + new_poll_period INTERVAL, + update_poll_period INTERVAL, + name VARCHAR(255), + errors VARCHAR(1024), + transform_metadata JSONB, + running_metadata JSONB, + PRIMARY KEY (transform_id) +); + + +CREATE TABLE doma_idds.wp2transforms ( + workprogress_id BIGINT NOT NULL, + transform_id BIGINT NOT NULL, + PRIMARY KEY (workprogress_id, transform_id) +); + +CREATE SEQUENCE doma_idds."PROCESSING_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.processings ( + processing_id BIGINT NOT NULL, + transform_id BIGINT NOT NULL, + request_id BIGINT NOT NULL, + workload_id INTEGER, + status INTEGER NOT NULL, + substatus INTEGER, + oldstatus INTEGER, + locking INTEGER NOT NULL, + submitter VARCHAR(20), + submitted_id INTEGER, + granularity INTEGER, + granularity_type INTEGER, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + next_poll_at TIMESTAMP WITHOUT TIME ZONE, + poller_updated_at TIMESTAMP WITHOUT TIME ZONE, + submitted_at TIMESTAMP WITHOUT TIME ZONE, + finished_at TIMESTAMP WITHOUT TIME ZONE, + expired_at TIMESTAMP WITHOUT TIME ZONE, + new_retries INTEGER, + update_retries INTEGER, + max_new_retries INTEGER, + max_update_retries INTEGER, + new_poll_period INTERVAL, + update_poll_period INTERVAL, + errors VARCHAR(1024), + processing_metadata JSONB, + running_metadata JSONB, + output_metadata JSONB, + PRIMARY KEY (processing_id) +); + +CREATE SEQUENCE doma_idds."COLLECTION_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.collections ( + coll_id BIGINT NOT NULL, + request_id BIGINT NOT 
NULL, + workload_id INTEGER, + transform_id BIGINT NOT NULL, + coll_type INTEGER NOT NULL, + relation_type INTEGER NOT NULL, + scope VARCHAR(25), + name VARCHAR(255), + bytes INTEGER, + status INTEGER NOT NULL, + substatus INTEGER, + locking INTEGER NOT NULL, + total_files INTEGER, + storage_id INTEGER, + new_files INTEGER, + processed_files INTEGER, + processing_files INTEGER, + failed_files INTEGER, + missing_files INTEGER, + ext_files INTEGER, + processed_ext_files INTEGER, + failed_ext_files INTEGER, + missing_ext_files INTEGER, + processing_id INTEGER, + retries INTEGER, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + next_poll_at TIMESTAMP WITHOUT TIME ZONE, + accessed_at TIMESTAMP WITHOUT TIME ZONE, + expired_at TIMESTAMP WITHOUT TIME ZONE, + coll_metadata JSONB, + PRIMARY KEY (coll_id) +); + +CREATE SEQUENCE doma_idds."CONTENT_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.contents ( + content_id BIGINT NOT NULL, + transform_id BIGINT NOT NULL, + coll_id BIGINT NOT NULL, + request_id BIGINT NOT NULL, + workload_id INTEGER, + map_id BIGINT NOT NULL, + content_dep_id BIGINT, + scope VARCHAR(25), + name VARCHAR(4000), + min_id INTEGER, + max_id INTEGER, + content_type INTEGER NOT NULL, + content_relation_type INTEGER NOT NULL, + status INTEGER NOT NULL, + substatus INTEGER, + locking INTEGER NOT NULL, + bytes INTEGER, + md5 VARCHAR(32), + adler32 VARCHAR(8), + processing_id INTEGER, + storage_id INTEGER, + retries INTEGER, + path VARCHAR(4000), + created_at TIMESTAMP WITHOUT TIME ZONE, + updated_at TIMESTAMP WITHOUT TIME ZONE, + accessed_at TIMESTAMP WITHOUT TIME ZONE, + expired_at TIMESTAMP WITHOUT TIME ZONE, + content_metadata VARCHAR(100), + PRIMARY KEY (content_id) +); + + + SET search_path TO doma_idds; + CREATE OR REPLACE PROCEDURE update_contents_to_others(request_id_in int, transform_id_in int) + AS $$ + BEGIN + UPDATE doma_idds.contents set substatus = d.substatus from + (select content_id, content_dep_id, substatus from doma_idds.contents where request_id = request_id_in and transform_id = transform_id_in and content_relation_type = 1 and status != 0) d + where doma_idds.contents.request_id = request_id_in and doma_idds.contents.content_relation_type = 3 and doma_idds.contents.substatus != d.substatus and d.content_id = doma_idds.contents.content_dep_id; + END; + $$ LANGUAGE PLPGSQL + + + SET search_path TO doma_idds; + CREATE OR REPLACE PROCEDURE update_contents_from_others(request_id_in int, transform_id_in int) + AS $$ + BEGIN + + UPDATE doma_idds.contents set substatus = d.substatus from + (select content_id, content_dep_id, substatus from doma_idds.contents where request_id = request_id_in and content_relation_type = 1 and status != 0) d + where doma_idds.contents.request_id = request_id_in and doma_idds.contents.transform_id = transform_id_in and doma_idds.contents.content_relation_type = 3 and doma_idds.contents.substatus != d.substatus and d.content_id = doma_idds.contents.content_dep_id; + END; + $$ LANGUAGE PLPGSQL + + +CREATE TABLE doma_idds.contents_update ( + content_id BIGSERIAL NOT NULL, + substatus INTEGER, + request_id BIGINT, + transform_id BIGINT, + workload_id INTEGER, + coll_id BIGINT, + content_metadata VARCHAR(100), + PRIMARY KEY (content_id) +); + + +CREATE TABLE doma_idds.contents_ext ( + content_id BIGSERIAL NOT NULL, + transform_id BIGINT NOT NULL, + coll_id BIGINT NOT NULL, + request_id BIGINT NOT NULL, + workload_id INTEGER, + map_id BIGINT NOT NULL, + status INTEGER NOT NULL, + panda_id 
BIGINT, + job_definition_id BIGINT, + scheduler_id VARCHAR(128), + pilot_id VARCHAR(200), + creation_time TIMESTAMP WITHOUT TIME ZONE, + modification_time TIMESTAMP WITHOUT TIME ZONE, + start_time TIMESTAMP WITHOUT TIME ZONE, + end_time TIMESTAMP WITHOUT TIME ZONE, + prod_source_label VARCHAR(20), + prod_user_id VARCHAR(250), + assigned_priority INTEGER, + current_priority INTEGER, + attempt_nr INTEGER, + max_attempt INTEGER, + max_cpu_count INTEGER, + max_cpu_unit VARCHAR(32), + max_disk_count INTEGER, + max_disk_unit VARCHAR(10), + min_ram_count INTEGER, + min_ram_unit VARCHAR(10), + cpu_consumption_time INTEGER, + cpu_consumption_unit VARCHAR(128), + job_status VARCHAR(10), + job_name VARCHAR(255), + trans_exit_code INTEGER, + pilot_error_code INTEGER, + pilot_error_diag VARCHAR(500), + exe_error_code INTEGER, + exe_error_diag VARCHAR(500), + sup_error_code INTEGER, + sup_error_diag VARCHAR(250), + ddm_error_code INTEGER, + ddm_error_diag VARCHAR(500), + brokerage_error_code INTEGER, + brokerage_error_diag VARCHAR(250), + job_dispatcher_error_code INTEGER, + job_dispatcher_error_diag VARCHAR(250), + task_buffer_error_code INTEGER, + task_buffer_error_diag VARCHAR(300), + computing_site VARCHAR(128), + computing_element VARCHAR(128), + grid VARCHAR(50), + cloud VARCHAR(50), + cpu_conversion FLOAT, + task_id BIGINT, + vo VARCHAR(16), + pilot_timing VARCHAR(100), + working_group VARCHAR(20), + processing_type VARCHAR(64), + prod_user_name VARCHAR(60), + core_count INTEGER, + n_input_files INTEGER, + req_id BIGINT, + jedi_task_id BIGINT, + actual_core_count INTEGER, + max_rss INTEGER, + max_vmem INTEGER, + max_swap INTEGER, + max_pss INTEGER, + avg_rss INTEGER, + avg_vmem INTEGER, + avg_swap INTEGER, + avg_pss INTEGER, + max_walltime INTEGER, + disk_io INTEGER, + failed_attempt INTEGER, + hs06 INTEGER, + hs06sec INTEGER, + memory_leak VARCHAR(10), + memory_leak_x2 VARCHAR(10), + job_label VARCHAR(20), + PRIMARY KEY (content_id) +); + +CREATE SEQUENCE doma_idds."HEALTH_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.health ( + health_id BIGINT NOT NULL, + agent VARCHAR(30), + hostname VARCHAR(127), + pid INTEGER, + status INTEGER NOT NULL, + thread_id BIGINT, + thread_name VARCHAR(255), + created_at TIMESTAMP WITHOUT TIME ZONE, + updated_at TIMESTAMP WITHOUT TIME ZONE, + payload VARCHAR(2048), + PRIMARY KEY (health_id) +); + +CREATE SEQUENCE doma_idds."MESSAGE_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.messages ( + msg_id BIGINT NOT NULL, + msg_type INTEGER NOT NULL, + status INTEGER NOT NULL, + substatus INTEGER, + locking INTEGER NOT NULL, + source INTEGER NOT NULL, + destination INTEGER NOT NULL, + request_id BIGINT NOT NULL, + workload_id INTEGER, + transform_id INTEGER NOT NULL, + processing_id INTEGER NOT NULL, + num_contents INTEGER, + retries INTEGER, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + msg_content JSONB, + PRIMARY KEY (msg_id) +); + +CREATE SEQUENCE doma_idds."COMMAND_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.commands ( + cmd_id BIGINT NOT NULL, + request_id BIGINT NOT NULL, + workload_id INTEGER, + transform_id INTEGER, + processing_id INTEGER, + cmd_type INTEGER, + status INTEGER NOT NULL, + substatus INTEGER, + locking INTEGER NOT NULL, + username VARCHAR(50), + retries INTEGER, + source INTEGER, + destination INTEGER, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + cmd_content JSONB, + errors VARCHAR(1024), + PRIMARY KEY (cmd_id) +); + + +CREATE 
TABLE doma_idds.events_priority ( + event_type INTEGER NOT NULL, + event_actual_id INTEGER NOT NULL, + priority INTEGER NOT NULL, + last_processed_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + updated_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + PRIMARY KEY (event_type, event_actual_id) +); + +CREATE SEQUENCE doma_idds."EVENT_ID_SEQ" START WITH 1 + +CREATE TABLE doma_idds.events ( + event_id BIGINT NOT NULL, + event_type INTEGER NOT NULL, + event_actual_id INTEGER NOT NULL, + priority INTEGER, + status INTEGER NOT NULL, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + processing_at TIMESTAMP WITHOUT TIME ZONE, + processed_at TIMESTAMP WITHOUT TIME ZONE, + content JSONB, + PRIMARY KEY (event_id) +); + + +CREATE TABLE doma_idds.events_archive ( + event_id BIGSERIAL NOT NULL, + event_type INTEGER NOT NULL, + event_actual_id INTEGER NOT NULL, + priority INTEGER, + status INTEGER NOT NULL, + created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL, + processing_at TIMESTAMP WITHOUT TIME ZONE, + processed_at TIMESTAMP WITHOUT TIME ZONE, + content JSONB, + PRIMARY KEY (event_id) +); diff --git a/main/etc/sql/postgresql_init.sql b/main/etc/sql/postgresql_init.sql new file mode 100644 index 00000000..52312493 --- /dev/null +++ b/main/etc/sql/postgresql_init.sql @@ -0,0 +1,18 @@ +CREATE USER doma_idds_r WITH PASSWORD 'test_idds'; +GRANT CONNECT ON DATABASE doma_idds TO doma_idds_r; +GRANT USAGE ON SCHEMA doma_idds TO doma_idds_r; +GRANT SELECT ON ALL TABLES IN SCHEMA doma_idds TO doma_idds_r; +ALTER DEFAULT PRIVILEGES IN SCHEMA doma_idds GRANT SELECT ON TABLES TO doma_idds_r; + +CREATE USER doma_idds WITH PASSWORD 'test_idds_pass'; +create database doma_idds; +\connect doma_idds; +create schema if not exists doma_idds authorization doma_idds; +GRANT CONNECT ON DATABASE doma_idds TO doma_idds; +GRANT USAGE ON SCHEMA doma_idds TO doma_idds; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA doma_idds TO doma_idds; +GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA doma_idds TO doma_idds; +GRANT ALL PRIVILEGES ON DATABASE doma_idds TO doma_idds; +#ALTER DEFAULT PRIVILEGES IN SCHEMA doma_idds GRANT SELECT ON TABLES TO doma_idds; + +set search_path to doma_idds; diff --git a/main/lib/idds/agents/archive/__init__.py b/main/lib/idds/agents/archive/__init__.py new file mode 100644 index 00000000..865b774e --- /dev/null +++ b/main/lib/idds/agents/archive/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 diff --git a/main/lib/idds/agents/archive/archiver.py b/main/lib/idds/agents/archive/archiver.py new file mode 100644 index 00000000..a1e917aa --- /dev/null +++ b/main/lib/idds/agents/archive/archiver.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2023 + +import traceback + +from idds.common.constants import Sections, RequestStatus +from idds.common.utils import setup_logging +from idds.core import (requests as core_requests, + messages as core_messages) +from idds.agents.common.baseagent import BaseAgent + + +setup_logging(__name__) + + +class Archiver(BaseAgent): + """ + Archiver works to archive data + """ + + def __init__(self, num_threads=1, poll_period=7, older_than=30, **kwargs): + self.set_max_workers() + num_threads = self.max_number_workers + super(Archiver, self).__init__(num_threads=num_threads, name='Archive', **kwargs) + if not poll_period: + poll_period = 7 # days + self.poll_period = int(poll_period) * 3600 * 24 + if not older_than: + older_than = 30 + self.older_than = int(older_than) # days + self.config_section = Sections.Archiver + + def clean_messages(self): + try: + status = [RequestStatus.Finished, RequestStatus.SubFinished, + RequestStatus.Failed, RequestStatus.Cancelled, + RequestStatus.Suspended, RequestStatus.Expired] + request_id = core_requests.get_last_request_id(older_than=self.older_than, status=status) + if request_id: + self.logger.info("cleaning old mesages older than request id %s" % request_id) + core_messages.clean_old_messages(request_id=request_id) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + + def run(self): + """ + Main run function. + """ + try: + self.logger.info("Starting main thread") + self.init_thread_info() + + self.add_default_tasks() + + self.logger.info("poll period: %s seconds" % self.poll_period) + self.logger.info("older_than: %s days" % self.older_than) + + task = self.create_task(task_func=self.clean_messages, task_output_queue=None, + task_args=tuple(), task_kwargs={}, delay_time=self.poll_period, priority=1) + self.add_task(task) + + self.execute() + except KeyboardInterrupt: + self.stop() + + +if __name__ == '__main__': + agent = Archiver() + agent() diff --git a/main/lib/idds/agents/carrier/finisher.py b/main/lib/idds/agents/carrier/finisher.py index c89a89eb..a6abffed 100644 --- a/main/lib/idds/agents/carrier/finisher.py +++ b/main/lib/idds/agents/carrier/finisher.py @@ -6,11 +6,11 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2022 +# - Wen Guan, , 2019 - 2023 import traceback -from idds.common.constants import (Sections, ProcessingStatus, ProcessingLocking) +from idds.common.constants import (Sections, ReturnCode, ProcessingStatus, ProcessingLocking) from idds.common.utils import setup_logging, truncate_string from idds.agents.common.eventbus.event import (EventType, UpdateProcessingEvent, @@ -81,18 +81,24 @@ def handle_sync_processing(self, processing, log_prefix=""): def process_sync_processing(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: self.logger.info("process_sync_processing: event: %s" % event) pr = self.get_processing(processing_id=event._processing_id, locking=True) if not pr: self.logger.error("Cannot find processing for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(pr) self.logger.info(log_pre + "process_sync_processing") ret = self.handle_sync_processing(pr, log_prefix=log_pre) - self.logger.info(log_pre + "process_sync_processing result: %s" % str(ret)) + ret_copy = {} + for ret_key in ret: + if ret_key != 'messages': + ret_copy[ret_key] = 
ret[ret_key] + self.logger.info(log_pre + "process_sync_processing result: %s" % str(ret_copy)) self.update_processing(ret, pr) @@ -103,7 +109,9 @@ def process_sync_processing(self, event): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def handle_terminated_processing(self, processing, log_prefix=""): """ @@ -135,6 +143,7 @@ def handle_terminated_processing(self, processing, log_prefix=""): def process_terminated_processing(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: if event._counter > 3: @@ -144,12 +153,17 @@ def process_terminated_processing(self, event): pr = self.get_processing(processing_id=event._processing_id, locking=True) if not pr: self.logger.error("Cannot find processing for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(pr) self.logger.info(log_pre + "process_terminated_processing") ret = self.handle_terminated_processing(pr, log_prefix=log_pre) - self.logger.info(log_pre + "process_terminated_processing result: %s" % str(ret)) + ret_copy = {} + for ret_key in ret: + if ret_key != 'messages': + ret_copy[ret_key] = ret[ret_key] + self.logger.info(log_pre + "process_terminated_processing result: %s" % str(ret_copy)) self.update_processing(ret, pr) self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) @@ -160,11 +174,14 @@ def process_terminated_processing(self, event): # some files are missing, poll it. self.logger.info(log_pre + "UpdateProcessingEvent(processing_id: %s)" % pr['processing_id']) event = UpdateProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], counter=original_event._counter + 1) + event.set_terminating() self.event_bus.send(event) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def handle_abort_processing(self, processing, log_prefix=""): """ @@ -197,6 +214,7 @@ def handle_abort_processing(self, processing, log_prefix=""): def process_abort_processing(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: processing_status = [ProcessingStatus.Finished, ProcessingStatus.Failed, @@ -208,6 +226,7 @@ def process_abort_processing(self, event): if not pr: self.logger.error("Cannot find processing for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(pr) self.logger.info(log_pre + "process_abort_processing") @@ -221,7 +240,12 @@ def process_abort_processing(self, event): self.update_processing(ret, pr) elif pr: ret = self.handle_abort_processing(pr, log_prefix=log_pre) - self.logger.info(log_pre + "process_abort_processing result: %s" % str(ret)) + ret_copy = {} + for ret_key in ret: + if ret_key != 'messages': + ret_copy[ret_key] = ret[ret_key] + self.logger.info(log_pre + "process_abort_processing result: %s" % str(ret_copy)) + self.update_processing(ret, pr) self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id'], content=event._content) @@ -229,7 +253,9 @@ def process_abort_processing(self, event): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def 
handle_resume_processing(self, processing, log_prefix=""): """ @@ -261,6 +287,7 @@ def handle_resume_processing(self, processing, log_prefix=""): def process_resume_processing(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: processing_status = [ProcessingStatus.Finished] @@ -269,6 +296,7 @@ def process_resume_processing(self, event): if not pr: self.logger.error("Cannot find processing for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(pr) self.logger.info(log_pre + "process_resume_processing") @@ -294,7 +322,9 @@ def process_resume_processing(self, event): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def init_event_function_map(self): self.event_func_map = { @@ -322,6 +352,7 @@ def run(self): """ try: self.logger.info("Starting main thread") + self.init_thread_info() self.load_plugins() self.init() diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index d3645a2c..de55efe0 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -14,8 +14,8 @@ import traceback from idds.common import exceptions -from idds.common.constants import Sections, ProcessingStatus, ProcessingLocking -from idds.common.utils import setup_logging, truncate_string +from idds.common.constants import Sections, ReturnCode, ProcessingStatus, ProcessingLocking +from idds.common.utils import setup_logging, truncate_string, json_dumps from idds.core import processings as core_processings from idds.agents.common.baseagent import BaseAgent from idds.agents.common.eventbus.event import (EventType, @@ -233,6 +233,7 @@ def update_processing(self, processing, processing_model): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + self.logger.warn("Failed to update_processings: %s" % json_dumps(processing)) try: processing_id = processing['update_processing']['processing_id'] @@ -359,6 +360,7 @@ def handle_update_processing(self, processing): def process_update_processing(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: original_event = event @@ -367,6 +369,7 @@ def process_update_processing(self, event): pr = self.get_processing(processing_id=event._processing_id, status=None, locking=True) if not pr: self.logger.error("Cannot find processing for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(pr) @@ -382,6 +385,7 @@ def process_update_processing(self, event): event_content['has_updates'] = True if is_process_terminated(pr['substatus']): event_content['Terminated'] = True + event_content['is_terminating'] = True self.logger.info(log_pre + "TriggerProcessingEvent(processing_id: %s)" % pr['processing_id']) event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], content=event_content, counter=original_event._counter) @@ -390,6 +394,7 @@ def process_update_processing(self, event): self.logger.info(log_pre + "TerminatedProcessingEvent(processing_id: %s)" % pr['processing_id']) event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], counter=original_event._counter) + event.set_terminating() self.event_bus.send(event) else: if (('update_contents' in ret and ret['update_contents']) @@ -400,11 +405,14 @@ def process_update_processing(self, event): 
self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], counter=original_event._counter) + event.set_has_updates() self.event_bus.send(event) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def clean_locks(self): self.logger.info("clean locking") @@ -424,6 +432,7 @@ def run(self): """ try: self.logger.info("Starting main thread") + self.init_thread_info() self.load_plugins() self.init() diff --git a/main/lib/idds/agents/carrier/receiver.py b/main/lib/idds/agents/carrier/receiver.py index a8f87102..abcd8478 100644 --- a/main/lib/idds/agents/carrier/receiver.py +++ b/main/lib/idds/agents/carrier/receiver.py @@ -9,6 +9,7 @@ # - Wen Guan, , 2019 - 2023 import time +import threading import traceback try: # python 3 @@ -17,14 +18,16 @@ # Python 2 from Queue import Queue -from idds.common.constants import Sections +from idds.common.constants import Sections, ReturnCode from idds.common.exceptions import AgentPluginError, IDDSException from idds.common.utils import setup_logging, get_logger from idds.common.utils import json_dumps from idds.core import messages as core_messages, catalog as core_catalog +from idds.core import health as core_health from idds.agents.common.baseagent import BaseAgent # from idds.agents.common.eventbus.event import TerminatedProcessingEvent -from idds.agents.common.eventbus.event import MsgTriggerProcessingEvent +from idds.agents.common.eventbus.event import (EventType, MessageEvent, + TriggerProcessingEvent) from .utils import handle_messages_processing @@ -36,9 +39,9 @@ class Receiver(BaseAgent): Receiver works to receive workload management messages to update task/job status. 
""" - def __init__(self, num_threads=1, bulk_message_delay=30, bulk_message_size=2000, - random_delay=None, update_processing_interval=300, **kwargs): - super(Receiver, self).__init__(num_threads=num_threads, name='Receiver', **kwargs) + def __init__(self, receiver_num_threads=8, num_threads=1, bulk_message_delay=30, bulk_message_size=2000, + random_delay=None, update_processing_interval=300, mode='single', **kwargs): + super(Receiver, self).__init__(num_threads=receiver_num_threads, name='Receiver', **kwargs) self.config_section = Sections.Carrier self.bulk_message_delay = int(bulk_message_delay) self.bulk_message_size = int(bulk_message_size) @@ -50,6 +53,13 @@ def __init__(self, num_threads=1, bulk_message_delay=30, bulk_message_size=2000, else: self.update_processing_interval = 300 + self.mode = mode + self.selected_receiver = None + + self.log_prefix = '' + + self._lock = threading.RLock() + def __del__(self): self.stop_receiver() @@ -67,18 +77,146 @@ def stop_receiver(self): if hasattr(self, 'receiver') and self.receiver: self.logger.info("Stopping receiver: %s" % self.receiver) self.receiver.stop() + self.receiver = None + + def is_receiver_started(self): + if hasattr(self, 'receiver') and self.receiver: + return True + return False + + def get_num_queued_messages(self): + return self.message_queue.qsize() def get_output_messages(self): - msgs = [] + with self._lock: + msgs = [] + try: + msg_size = 0 + while not self.message_queue.empty(): + msg = self.message_queue.get(False) + if msg: + if msg_size < 10: + self.logger.debug("Received message(only log first 10 messages): %s" % str(msg)) + msgs.append(msg) + msg_size += 1 + if msg_size >= self.bulk_message_size: + break + except Exception as error: + self.logger.error("Failed to get output messages: %s, %s" % (error, traceback.format_exc())) + if msgs: + total_msgs = self.get_num_queued_messages() + self.logger.info("process_messages: Get %s messages, left %s messages" % (len(msgs), total_msgs)) + return msgs + + def is_selected(self): + if not self.selected_receiver: + return True + return self.is_self(self.selected_receiver) + + def monitor_receiver(self): + if self.mode == "single": + self.logger.info("Receiver single mode") + self.selected_receiver = core_health.select_agent(name='Receiver', newer_than=self.heartbeat_delay * 2) + self.logger.debug("Selected receiver: %s" % self.selected_receiver) + + def add_receiver_monitor_task(self): + task = self.create_task(task_func=self.monitor_receiver, task_output_queue=None, + task_args=tuple(), task_kwargs={}, delay_time=self.heartbeat_delay, + priority=1) + self.add_task(task) + + def handle_messages(self, output_messages, log_prefix): + ret_msg_handle = handle_messages_processing(output_messages, + logger=self.logger, + log_prefix=log_prefix, + update_processing_interval=self.update_processing_interval) + + update_processings, update_processings_by_job, terminated_processings, update_contents, msgs = ret_msg_handle + if msgs: + # self.logger.debug(log_prefix + "adding messages[:3]: %s" % json_dumps(msgs[:3])) + core_messages.add_messages(msgs, bulk_size=self.bulk_message_size) + + num_to_update_contents = 0 + if update_contents: + self.logger.info(log_prefix + "update_contents[:3]: %s" % json_dumps(update_contents[:3])) + # instead of update contents directly, add contents to contents_update table. 
+ # core_catalog.update_contents(update_contents) + core_catalog.add_contents_update(update_contents) + num_to_update_contents = len(update_contents) + + for pr_id in update_processings_by_job: + # self.logger.info(log_prefix + "TerminatedProcessingEvent(processing_id: %s)" % pr_id) + # event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr_id) + # self.logger.info(log_prefix + "MsgTriggerProcessingEvent(processing_id: %s)" % pr_id) + self.logger.info(log_prefix + "TriggerProcessingEvent(processing_id: %s)" % pr_id) + event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr_id) + self.event_bus.send(event) + + for pr_id in update_processings: + # self.logger.info(log_prefix + "TerminatedProcessingEvent(processing_id: %s)" % pr_id) + # event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr_id) + self.logger.info(log_prefix + "TriggerProcessingEvent(processing_id: %s)" % pr_id) + event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr_id, + content={'num_to_update_contents': num_to_update_contents}) + event.set_has_updates() + self.event_bus.send(event) + + for pr_id in terminated_processings: + self.logger.info(log_prefix + "TriggerProcessingEvent(processing_id: %s)" % pr_id) + event = TriggerProcessingEvent(publisher_id=self.id, + processing_id=pr_id, + content={'Terminated': True, 'source': 'Receiver'}) + event.set_terminating() + self.event_bus.send(event) + + def process_messages(self, log_prefix=None): + output_messages = self.get_output_messages() + has_messages = False + if output_messages: + self.logger.info("process_messages: Received %s messages" % (len(output_messages))) + self.handle_messages(output_messages, log_prefix=log_prefix) + self.logger.info("process_messages: Handled %s messages" % len(output_messages)) + has_messages = True + return has_messages + + def worker(self, log_prefix): + while not self.graceful_stop.is_set(): + try: + has_messages = self.process_messages(log_prefix) + if not has_messages: + time.sleep(1) + except IDDSException as error: + self.logger.error("Worker thread IDDSException: %s" % str(error)) + except Exception as error: + self.logger.critical("Worker thread exception: %s\n%s" % (str(error), traceback.format_exc())) + + def is_ok_to_run_more_workers(self): + if self.executors.has_free_workers(): + return True + return False + + def process_messages_event(self, event): try: - while not self.message_queue.empty(): - msg = self.message_queue.get(False) - if msg: - self.logger.debug("Received message: %s" % str(msg)) - msgs.append(msg) - except Exception as error: - self.logger.error("Failed to get output messages: %s, %s" % (error, traceback.format_exc())) - return msgs + pro_ret = ReturnCode.Ok.value + if event: + output_messages = event.get_message() + if output_messages: + self.logger.info("process_messages: Received %s messages" % (len(output_messages))) + self.handle_messages(output_messages, log_prefix=self.log_prefix) + self.logger.info("process_messages: Handled %s messages" % len(output_messages)) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value + return pro_ret + + def init_event_function_map(self): + self.event_func_map = { + EventType.Message: { + 'pre_check': self.is_ok_to_run_more_workers, + 'exec_func': self.process_messages_event + } + } def run(self): """ @@ -86,47 +224,48 @@ def run(self): """ try: self.logger.info("Starting main thread") - self.load_plugins() + self.init_thread_info() + + 
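With this change the Receiver joins the other agents in dispatching work through an event-function map, and its handlers report a ReturnCode so the base agent can decide whether to clean, fail, or requeue the event. A rough sketch of that contract is below; handle, requeue and dispatch are hypothetical stand-ins for the executor and event-bus machinery, while the ReturnCode values mirror the ones added to idds.common.constants in this patch.

from enum import Enum

class ReturnCode(Enum):
    # Mirrors the ReturnCode enum introduced by this patch.
    Ok = 0
    Failed = 255
    Locked = 1

def handle(event):
    # Hypothetical stand-in for the real message handling.
    pass

def requeue(event):
    # Hypothetical stand-in: put the event back onto the event bus.
    pass

def process_messages_event(event):
    # Handlers return an exit code instead of silently swallowing errors.
    try:
        handle(event)
        return ReturnCode.Ok.value
    except Exception:
        return ReturnCode.Failed.value

event_func_map = {
    'Message': {'exec_func': process_messages_event},
}

def dispatch(event_type, event):
    ret = event_func_map[event_type]['exec_func'](event)
    if ret == ReturnCode.Locked.value:
        # Locked means the target row was held by another worker; try again later.
        requeue(event)
    return ret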
self.add_default_tasks() - self.start_receiver() + if self.mode == "single": + self.logger.debug("single mode") + self.add_receiver_monitor_task() + + self.load_plugins() self.add_health_message_task() log_prefix = "" + self.log_prefix = log_prefix + + # [self.executors.submit(self.worker, log_prefix) for i in range(self.executors.get_max_workers())] + self.init_event_function_map() while not self.graceful_stop.is_set(): try: + self.execute_schedules() + time_start = time.time() - output_messages = self.get_output_messages() - update_processings, terminated_processings, update_contents, msgs = handle_messages_processing(output_messages, - logger=self.logger, - log_prefix=log_prefix, - update_processing_interval=self.update_processing_interval) - - if msgs: - # self.logger.debug(log_prefix + "adding messages[:3]: %s" % json_dumps(msgs[:3])) - core_messages.add_messages(msgs, bulk_size=self.bulk_message_size) - - if update_contents: - self.logger.info(log_prefix + "update_contents[:3]: %s" % json_dumps(update_contents[:3])) - core_catalog.update_contents(update_contents) - - for pr_id in update_processings: - # self.logger.info(log_prefix + "TerminatedProcessingEvent(processing_id: %s)" % pr_id) - # event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr_id) - self.logger.info(log_prefix + "MsgTriggerProcessingEvent(processing_id: %s)" % pr_id) - event = MsgTriggerProcessingEvent(publisher_id=self.id, processing_id=pr_id) - self.event_bus.send(event) - for pr_id in terminated_processings: - self.logger.info(log_prefix + "MsgTriggerProcessingEvent(processing_id: %s)" % pr_id) - event = MsgTriggerProcessingEvent(publisher_id=self.id, processing_id=pr_id, content={'Terminated': True, 'source': 'Receiver'}) + if self.is_selected(): + if not self.is_receiver_started(): + self.start_receiver() + + if not self.is_selected(): + if self.is_receiver_started(): + self.stop_receiver() + + msg = self.get_output_messages() + if msg: + event = MessageEvent(message=msg) self.event_bus.send(event) - time_delay = self.bulk_message_delay - (time.time() - time_start) - time_delay = self.bulk_message_delay - if time_delay > 0: - time.sleep(time_delay) + if not msg: + time_delay = self.bulk_message_delay - (time.time() - time_start) + time_delay = self.bulk_message_delay + if time_delay > 0: + time.sleep(time_delay) except IDDSException as error: self.logger.error("Main thread IDDSException: %s" % str(error)) except Exception as error: diff --git a/main/lib/idds/agents/carrier/submitter.py b/main/lib/idds/agents/carrier/submitter.py index f6d170a6..3419830e 100644 --- a/main/lib/idds/agents/carrier/submitter.py +++ b/main/lib/idds/agents/carrier/submitter.py @@ -159,10 +159,12 @@ def process_new_processing(self, event): self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) submit_event_content = {'event': 'submitted'} event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id'], content=submit_event_content) + event.set_has_updates() self.event_bus.send(event) self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + event.set_has_updates() self.event_bus.send(event) except Exception as ex: self.logger.error(ex) @@ -183,6 +185,7 @@ def run(self): """ try: self.logger.info("Starting main thread") + self.init_thread_info() self.load_plugins() self.init() diff --git a/main/lib/idds/agents/carrier/trigger.py 
b/main/lib/idds/agents/carrier/trigger.py index 564ba056..25e72c86 100644 --- a/main/lib/idds/agents/carrier/trigger.py +++ b/main/lib/idds/agents/carrier/trigger.py @@ -11,9 +11,8 @@ import traceback from idds.common import exceptions -from idds.common.constants import ProcessingStatus, ProcessingLocking +from idds.common.constants import ProcessingStatus, ProcessingLocking, ReturnCode from idds.common.utils import setup_logging, truncate_string -from idds.core import catalog as core_catalog from idds.core import processings as core_processings from idds.agents.common.eventbus.event import (EventType, UpdateTransformEvent, @@ -21,7 +20,8 @@ TerminatedProcessingEvent, SyncProcessingEvent) -from .utils import handle_trigger_processing, is_process_terminated +from .utils import (handle_trigger_processing, + is_process_terminated) from .poller import Poller setup_logging(__name__) @@ -166,6 +166,7 @@ def handle_trigger_processing(self, processing, trigger_new_updates=False): return ret def process_trigger_processing_real(self, event): + pro_ret = ReturnCode.Ok.value try: if event: original_event = event @@ -174,25 +175,27 @@ def process_trigger_processing_real(self, event): pr = self.get_processing(processing_id=event._processing_id, status=None, locking=True) if not pr: self.logger.error("Cannot find processing for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(pr) self.logger.info(log_pre + "process_trigger_processing") ret = self.handle_trigger_processing(pr) # self.logger.info(log_pre + "process_trigger_processing result: %s" % str(ret)) - new_update_contents = ret.get('new_update_contents', None) + # new_update_contents = ret.get('new_update_contents', None) ret['new_update_contents'] = None - ret_update_contents = ret.get('update_contents', None) + # ret_update_contents = ret.get('update_contents', None) self.update_processing(ret, pr) - if new_update_contents or ret_update_contents: + update_transforms = ret.get('update_transforms', None) + if update_transforms: # self.logger.info(log_pre + "update_contents_to_others_by_dep_id") # core_catalog.update_contents_to_others_by_dep_id(request_id=pr['request_id'], transform_id=pr['transform_id']) # self.logger.info(log_pre + "update_contents_to_others_by_dep_id done") # core_catalog.delete_contents_update(request_id=pr['request_id'], transform_id=pr['transform_id']) - update_transforms = core_catalog.get_updated_transforms_by_content_status(request_id=pr['request_id'], - transform_id=pr['transform_id']) + # update_transforms = get_updated_transforms_by_content_status(request_id=pr['request_id'], + # transform_id=pr['transform_id']) self.logger.info(log_pre + "update_transforms: %s" % str(update_transforms)) for update_transform in update_transforms: if 'transform_id' in update_transform: @@ -211,6 +214,7 @@ def process_trigger_processing_real(self, event): processing_id=pr['processing_id'], content=event._content, counter=original_event._counter) + event.set_terminating() self.event_bus.send(event) else: if ((event._content and 'has_updates' in event._content and event._content['has_updates']) @@ -219,21 +223,26 @@ def process_trigger_processing_real(self, event): or ('messages' in ret and ret['messages'])): # noqa E129 self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], + content=event._content, counter=original_event._counter) self.event_bus.send(event) except Exception 
as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value + return pro_ret def process_trigger_processing(self, event): self.number_workers += 1 - self.process_trigger_processing_real(event) + ret = self.process_trigger_processing_real(event) self.number_workers -= 1 + return ret def process_msg_trigger_processing(self, event): self.number_msg_workers += 1 - self.process_trigger_processing_real(event) + ret = self.process_trigger_processing_real(event) self.number_msg_workers -= 1 + return ret def init_event_function_map(self): self.event_func_map = { @@ -253,6 +262,7 @@ def run(self): """ try: self.logger.info("Starting main thread") + self.init_thread_info() self.load_plugins() self.init() diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index de3cb371..8f6e6836 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -970,6 +970,32 @@ def handle_update_processing(processing, agent_attributes, logger=None, log_pref return process_status, new_contents, new_input_dependency_contents, ret_msgs, content_updates + content_updates_missing, parameters, new_contents_ext, update_contents_ext +def get_transform_id_dependency_map(transform_id, logger=None, log_prefix=''): + cache = get_redis_cache() + transform_id_dependcy_map_key = "transform_id_dependcy_map_%s" % transform_id + transform_id_dependcy_map = cache.get(transform_id_dependcy_map_key, default=[]) + return transform_id_dependcy_map + + +def set_transform_id_dependency_map(transform_id, transform_id_dependcy_map, logger=None, log_prefix=''): + cache = get_redis_cache() + transform_id_dependcy_map_key = "transform_id_dependcy_map_%s" % transform_id + cache.set(transform_id_dependcy_map_key, transform_id_dependcy_map) + + +def get_updated_transforms_by_content_status(request_id=None, transform_id=None, logger=None, log_prefix=''): + logger = get_logger(logger) + logger.debug("get_updated_transforms_by_content_status starts") + + update_transforms = get_transform_id_dependency_map(transform_id=transform_id, logger=logger, log_prefix=log_prefix) + if not update_transforms: + update_transforms = core_catalog.get_updated_transforms_by_content_status(request_id=request_id, + transform_id=transform_id) + set_transform_id_dependency_map(transform_id, update_transforms, logger=logger, log_prefix=log_prefix) + logger.debug("get_updated_transforms_by_content_status ends") + return update_transforms + + def handle_trigger_processing(processing, agent_attributes, trigger_new_updates=False, logger=None, log_prefix=''): logger = get_logger(logger) @@ -994,8 +1020,27 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates= # logger.debug(log_prefix + "delete_contents_update: %s" % str(ret_update_transforms)) pass + logger.debug(log_prefix + "sync contents_update to contents") + core_catalog.set_fetching_contents_update(request_id=request_id, transform_id=transform_id, fetch=True) + contents_update_list = core_catalog.get_contents_update(request_id=request_id, transform_id=transform_id, fetch=True) + new_contents_update_list = [] + # contents_id_list = [] + for con in contents_update_list: + con_dict = {'content_id': con['content_id'], + 'substatus': con['substatus']} + if 'content_metadata' in con and con['content_metadata']: + con_dict['content_metadata'] = con['content_metadata'] + new_contents_update_list.append(con_dict) + # contents_id_list.append(con['content_id']) + 
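# Note on the sync step that the surrounding added lines implement (a sketch of
# the flow, using only the core_catalog calls introduced by this patch):
#
#   core_catalog.set_fetching_contents_update(request_id=..., transform_id=..., fetch=True)
#   rows = core_catalog.get_contents_update(request_id=..., transform_id=..., fetch=True)
#   core_catalog.update_contents(reduced_rows)   # content_id + substatus (+ content_metadata)
#   core_catalog.delete_contents_update(request_id=..., transform_id=..., fetch=True)
#
# Flagging rows as "fetching" before reading them back appears intended to
# protect updates staged by the Receiver while this sync runs: they remain
# unfetched, are not deleted here, and are picked up on the next
# handle_trigger_processing pass.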
core_catalog.update_contents(new_contents_update_list) + # core_catalog.delete_contents_update(contents=contents_id_list) + core_catalog.delete_contents_update(request_id=request_id, transform_id=transform_id, fetch=True) + logger.debug(log_prefix + "sync contents_update to contents done") + logger.debug(log_prefix + "update_contents_from_others_by_dep_id") - core_catalog.update_contents_from_others_by_dep_id(request_id=request_id, transform_id=transform_id) + # core_catalog.update_contents_from_others_by_dep_id(request_id=request_id, transform_id=transform_id) + to_triggered_contents = core_catalog.get_update_contents_from_others_by_dep_id(request_id=request_id, transform_id=transform_id) + core_catalog.update_contents(to_triggered_contents) logger.debug(log_prefix + "update_contents_from_others_by_dep_id done") input_output_maps = get_input_output_maps(transform_id, work) @@ -1019,6 +1064,11 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates= content_updates = content_updates + updated_contents + if content_updates or new_update_contents: + ret_update_transforms = get_updated_transforms_by_content_status(request_id=request_id, + transform_id=transform_id, + logger=logger, + log_prefix=log_prefix) # update_dep_contents_status_name = {} # update_dep_contents_status = {} # for content in new_update_contents: @@ -1149,12 +1199,17 @@ def get_workload_id_transform_id_map(workload_id, logger=None, log_prefix=''): return workload_id_transform_id_map[workload_id_str] +content_id_lock = threading.Lock() + + def get_input_name_content_id_map(request_id, workload_id, transform_id): cache = get_redis_cache() input_name_content_id_map_key = "transform_input_contentid_map_%s" % transform_id input_name_content_id_map = cache.get(input_name_content_id_map_key, default={}) if not input_name_content_id_map: + content_id_lock.acquire() + contents = core_catalog.get_contents_by_request_transform(request_id=request_id, transform_id=transform_id) input_name_content_id_map = {} for content in contents: @@ -1162,6 +1217,8 @@ def get_input_name_content_id_map(request_id, workload_id, transform_id): input_name_content_id_map[content['name']] = content['content_id'] cache.set(input_name_content_id_map_key, input_name_content_id_map) + + content_id_lock.release() return input_name_content_id_map @@ -1251,7 +1308,7 @@ def whether_to_update_processing(processing_id, interval=300): if update_processing_map[key] + 86400 < time.time(): del update_processing_map[key] - cache.set(update_processing_map_key, default=update_processing_map) + cache.set(update_processing_map_key, update_processing_map, expire_seconds=86400) update_processing_lock.release() return ret @@ -1262,6 +1319,7 @@ def handle_messages_processing(messages, logger=None, log_prefix='', update_proc log_prefix = "" update_processings = [] + update_processings_by_job = [] terminated_processings = [] update_contents = [] @@ -1270,12 +1328,12 @@ def handle_messages_processing(messages, logger=None, log_prefix='', update_proc if 'taskid' not in msg or not msg['taskid']: continue - logger.debug(log_prefix + "Received message: %s" % str(ori_msg)) - if msg['msg_type'] in ['task_status']: workload_id = msg['taskid'] status = msg['status'] - if status in ['pending']: # 'prepared' + if status in ['pending1']: # 'prepared' + logger.debug(log_prefix + "Received message: %s" % str(ori_msg)) + ret_req_tf_pr_id = get_workload_id_transform_id_map(workload_id, logger=logger, log_prefix=log_prefix) if not ret_req_tf_pr_id: # request is submitted 
by some other instances @@ -1292,6 +1350,8 @@ def handle_messages_processing(messages, logger=None, log_prefix='', update_proc else: logger.debug(log_prefix + "Processing %s is already processed, not add it to update processing" % (str(processing_id))) elif status in ['finished', 'done']: + logger.debug(log_prefix + "Received message: %s" % str(ori_msg)) + ret_req_tf_pr_id = get_workload_id_transform_id_map(workload_id, logger=logger, log_prefix=log_prefix) if not ret_req_tf_pr_id: # request is submitted by some other instances @@ -1311,6 +1371,8 @@ def handle_messages_processing(messages, logger=None, log_prefix='', update_proc status = msg['status'] inputs = msg['inputs'] if inputs and status in ['finished']: + logger.debug(log_prefix + "Received message: %s" % str(ori_msg)) + ret_req_tf_pr_id = get_workload_id_transform_id_map(workload_id, logger=logger, log_prefix=log_prefix) if not ret_req_tf_pr_id: # request is submitted by some other instances @@ -1324,25 +1386,32 @@ def handle_messages_processing(messages, logger=None, log_prefix='', update_proc if content_id: if to_update_jobid: u_content = {'content_id': content_id, + 'request_id': req_id, + 'transform_id': tf_id, + 'workload_id': workload_id, # 'status': get_content_status_from_panda_msg_status(status), 'substatus': get_content_status_from_panda_msg_status(status), 'content_metadata': {'panda_id': job_id}} else: u_content = {'content_id': content_id, + 'request_id': req_id, + 'transform_id': tf_id, + 'workload_id': workload_id, 'substatus': get_content_status_from_panda_msg_status(status)} # # 'status': get_content_status_from_panda_msg_status(status)} update_contents.append(u_content) # if processing_id not in update_processings: - if processing_id not in update_processings and whether_to_update_processing(processing_id, update_processing_interval): - update_processings.append(processing_id) - logger.debug(log_prefix + "Add to update processing: %s" % str(processing_id)) + # if processing_id not in update_processings and whether_to_update_processing(processing_id, update_processing_interval): + if processing_id not in update_processings_by_job: + update_processings_by_job.append(processing_id) + logger.debug(log_prefix + "Add to update processing by job: %s" % str(processing_id)) - return update_processings, terminated_processings, update_contents, [] + return update_processings, update_processings_by_job, terminated_processings, update_contents, [] def sync_collection_status(request_id, transform_id, workload_id, work, input_output_maps=None, - close_collection=False, force_close_collection=False, terminate=False): + close_collection=False, force_close_collection=False, abort=False, terminate=False): if input_output_maps is None: input_output_maps = get_input_output_maps(transform_id, work) @@ -1391,7 +1460,8 @@ def sync_collection_status(request_id, transform_id, workload_id, work, input_ou ContentStatus.Available.value, ContentStatus.Mapped.value, ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: coll_status[content['coll_id']]['processed_ext_files'] += 1 - elif content['status'] in [ContentStatus.Failed, ContentStatus.FinalFailed]: + # elif content['status'] in [ContentStatus.Failed, ContentStatus.FinalFailed]: + elif content['status'] in [ContentStatus.FinalFailed]: coll_status[content['coll_id']]['failed_ext_files'] += 1 elif content['status'] in [ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: coll_status[content['coll_id']]['missing_ext_files'] += 1 @@ -1456,16 +1526,26 @@ def 
sync_collection_status(request_id, transform_id, workload_id, work, input_ou messages += msgs if terminate: - if coll in output_collections and work.require_ext_contents(): - if coll.processed_files == coll.processed_ext_files and coll.failed_files == coll.failed_ext_files: + all_files_monitored = False + if coll.total_files == coll.processed_files + coll.failed_files + coll.missing_files: + all_files_monitored = True + + if abort: + u_coll['status'] = CollectionStatus.Closed + u_coll['substatus'] = CollectionStatus.Closed + coll.status = CollectionStatus.Closed + coll.substatus = CollectionStatus.Closed + elif coll in output_collections: + if (not work.require_ext_contents() or (work.require_ext_contents() + and coll.processed_files == coll.processed_ext_files and coll.failed_files == coll.failed_ext_files)): # noqa E129, W503 all_ext_updated = True - if (force_close_collection or (close_collection and all_updates_flushed and all_ext_updated) + if (force_close_collection or (close_collection and all_updates_flushed and all_ext_updated and all_files_monitored) or coll.status == CollectionStatus.Closed): # noqa W503 u_coll['status'] = CollectionStatus.Closed u_coll['substatus'] = CollectionStatus.Closed coll.status = CollectionStatus.Closed coll.substatus = CollectionStatus.Closed - elif force_close_collection or close_collection and all_updates_flushed or coll.status == CollectionStatus.Closed: + elif force_close_collection or (close_collection and all_updates_flushed and all_files_monitored) or coll.status == CollectionStatus.Closed: u_coll['status'] = CollectionStatus.Closed u_coll['substatus'] = CollectionStatus.Closed coll.status = CollectionStatus.Closed @@ -1501,7 +1581,7 @@ def sync_work_status(request_id, transform_id, workload_id, work): work.status = WorkStatus.SubFinished -def sync_processing(processing, agent_attributes, terminate=False, logger=None, log_prefix=""): +def sync_processing(processing, agent_attributes, terminate=False, abort=False, logger=None, log_prefix=""): logger = get_logger() request_id = processing['request_id'] @@ -1516,7 +1596,7 @@ def sync_processing(processing, agent_attributes, terminate=False, logger=None, input_output_maps = get_input_output_maps(transform_id, work) update_collections, all_updates_flushed, msgs = sync_collection_status(request_id, transform_id, workload_id, work, input_output_maps=input_output_maps, - close_collection=True, terminate=terminate) + close_collection=True, abort=abort, terminate=terminate) messages += msgs @@ -1526,8 +1606,8 @@ def sync_processing(processing, agent_attributes, terminate=False, logger=None, msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='work') messages += msgs if work.is_finished(): - # processing['status'] = ProcessingStatus.Finished - processing['status'] = processing['substatus'] + processing['status'] = ProcessingStatus.Finished + # processing['status'] = processing['substatus'] elif work.is_subfinished(): processing['status'] = ProcessingStatus.SubFinished elif work.is_failed(): @@ -1572,7 +1652,7 @@ def handle_abort_processing(processing, agent_attributes, logger=None, log_prefi # for coll in input_collections + output_collections + log_collections: # coll.status = CollectionStatus.Closed # coll.substatus = CollectionStatus.Closed - processing, update_collections, messages = sync_processing(processing, agent_attributes, terminate=True, logger=logger, log_prefix=log_prefix) + processing, update_collections, messages = sync_processing(processing, agent_attributes, 
terminate=True, abort=True, logger=logger, log_prefix=log_prefix) update_contents = [] # processing['status'] = ProcessingStatus.Cancelled diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index 9e085ba1..c6465e6a 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -14,7 +14,8 @@ import traceback from idds.common import exceptions -from idds.common.constants import (Sections, RequestStatus, RequestLocking, +from idds.common.constants import (Sections, ReturnCode, + RequestStatus, RequestLocking, TransformStatus, CommandType, CommandStatus, CommandLocking) from idds.common.utils import setup_logging, truncate_string @@ -775,18 +776,21 @@ def handle_update_request(self, req, event): def process_update_request(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: - req_status = [RequestStatus.Transforming, RequestStatus.ToCancel, RequestStatus.Cancelling, - RequestStatus.ToSuspend, RequestStatus.Suspending, - RequestStatus.ToExpire, RequestStatus.Expiring, - RequestStatus.ToFinish, RequestStatus.ToForceFinish, - RequestStatus.ToResume, RequestStatus.Resuming, - RequestStatus.Building] - - req = self.get_request(request_id=event._request_id, status=req_status, locking=True) + # req_status = [RequestStatus.Transforming, RequestStatus.ToCancel, RequestStatus.Cancelling, + # RequestStatus.ToSuspend, RequestStatus.Suspending, + # RequestStatus.ToExpire, RequestStatus.Expiring, + # RequestStatus.ToFinish, RequestStatus.ToForceFinish, + # RequestStatus.ToResume, RequestStatus.Resuming, + # RequestStatus.Building] + + # req = self.get_request(request_id=event._request_id, status=req_status, locking=True) + req = self.get_request(request_id=event._request_id, locking=True) if not req: self.logger.error("Cannot find request for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(req) ret = self.handle_update_request(req, event=event) @@ -802,7 +806,9 @@ def process_update_request(self, event): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def handle_abort_request(self, req, event): """ @@ -866,11 +872,13 @@ def handle_command(self, event, cmd_status, errors=None): def process_abort_request(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: req = self.get_request(request_id=event._request_id, locking=True) if not req: self.logger.error("Cannot find request for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(req) self.logger.info(log_pre + "process_abort_request event: %s" % str(event)) @@ -923,10 +931,13 @@ def process_abort_request(self, event): self.logger.error(ex) self.logger.error(traceback.format_exc()) self.handle_command(event, cmd_status=CommandStatus.Processed, errors=str(ex)) + pro_ret = ReturnCode.Failed.value except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def handle_resume_request(self, req): """ @@ -960,11 +971,13 @@ def handle_resume_request(self, req): def process_resume_request(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: req = self.get_request(request_id=event._request_id, locking=True) if not req: self.logger.error("Cannot find request for event: %s" % str(event)) + pro_ret 
= ReturnCode.Locked.value else: log_pre = self.get_log_prefix(req) self.logger.info(log_pre + "process_resume_request event: %s" % str(event)) @@ -1003,7 +1016,9 @@ def process_resume_request(self, event): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def clean_locks(self): self.logger.info("clean locking") @@ -1039,6 +1054,7 @@ def run(self): """ try: self.logger.info("Starting main thread") + self.init_thread_info() self.load_plugins() diff --git a/main/lib/idds/agents/common/baseagent.py b/main/lib/idds/agents/common/baseagent.py index 9f2f99dc..e4bdcbd2 100644 --- a/main/lib/idds/agents/common/baseagent.py +++ b/main/lib/idds/agents/common/baseagent.py @@ -6,20 +6,22 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2022 +# - Wen Guan, , 2019 - 2023 import os import socket +import time import traceback import threading import uuid from idds.common.constants import Sections from idds.common.constants import (MessageType, MessageTypeStr, - MessageStatus, MessageSource) + MessageStatus, MessageSource, + ReturnCode) from idds.common.plugin.plugin_base import PluginBase from idds.common.plugin.plugin_utils import load_plugins, load_plugin_sequence -from idds.common.utils import setup_logging +from idds.common.utils import setup_logging, pid_exists, json_dumps from idds.core import health as core_health, messages as core_messages from idds.agents.common.timerscheduler import TimerScheduler from idds.agents.common.eventbus.eventbus import EventBus @@ -41,6 +43,9 @@ def __init__(self, num_threads=1, name=None, logger=None, **kwargs): self.logger = logger self.setup_logger(self.logger) + self.thread_id = None + self.thread_name = None + self.config_section = Sections.Common for key in kwargs: @@ -54,6 +59,11 @@ def __init__(self, num_threads=1, name=None, logger=None, **kwargs): else: self.poll_operation_time_period = int(self.poll_operation_time_period) + if not hasattr(self, 'event_interval_delay'): + self.event_interval_delay = 1 + else: + self.event_interval_delay = int(self.event_interval_delay) + self.plugins = {} self.plugin_sequence = [] @@ -63,6 +73,7 @@ def __init__(self, num_threads=1, name=None, logger=None, **kwargs): self.event_bus = EventBus() self.event_func_map = {} + self.event_futures = {} self.cache = get_redis_cache() @@ -79,6 +90,17 @@ def get_event_bus(self): def get_name(self): return self.name + def init_thread_info(self): + hb_thread = threading.current_thread() + self.thread_id = hb_thread.ident + self.thread_name = hb_thread.name + + def get_thread_id(self): + return self.thread_id + + def get_thread_name(self): + return self.thread_name + def load_agent_attributes(self, kwargs): rets = {} for key in kwargs: @@ -115,14 +137,45 @@ def get_event_function_map(self): return self.event_func_map def execute_event_schedule(self): + event_ids = list(self.event_futures.keys()) + for event_id in event_ids: + event, future, start_time = self.event_futures[event_id] + if future.done(): + ret = future.result() + status = "finished" + end_time = time.time() + if ret is None or ret == 0: + self.event_bus.clean_event(event) + elif ret == ReturnCode.Locked.value: + status = "locked" + self.event_bus.fail_event(event) + else: + status = "failed" + self.event_bus.fail_event(event) + del self.event_futures[event_id] + self.event_bus.send_report(event, status, start_time, end_time, self.get_hostname(), ret) + if status == 'locked': + 
self.logger.warning("Corresponding resource is locked, put the event back again: %s" % json_dumps(event)) + event.requeue() + self.event_bus.send(event) + event_funcs = self.get_event_function_map() for event_type in event_funcs: exec_func = event_funcs[event_type]['exec_func'] - pre_check = event_funcs[event_type]['pre_check'] - if pre_check(): - event = self.event_bus.get(event_type) - if event: - self.executors.submit(exec_func, event) + # pre_check = event_funcs[event_type]['pre_check'] + to_exec_at = event_funcs[event_type].get("to_exec_at", None) + if to_exec_at is None or to_exec_at < time.time(): + # if pre_check(): + if self.executors.has_free_workers(): + event = self.event_bus.get(event_type) + if event: + future = self.executors.submit(exec_func, event) + self.event_futures[event._id] = (event, future, time.time()) + event_funcs[event_type]["to_exec_at"] = time.time() + self.event_interval_delay + + def execute_schedules(self): + self.execute_timer_schedule() + self.execute_event_schedule() def execute(self): while not self.graceful_stop.is_set(): @@ -140,27 +193,69 @@ def run(self): try: self.logger.info("Starting main thread") + self.init_thread_info() self.load_plugins() self.execute() except KeyboardInterrupt: self.stop() - self.event_bus.stop() def __call__(self): self.run() + def stop(self): + super(BaseAgent, self).stop() + self.event_bus.stop() + def terminate(self): self.stop() - def health_heartbeat(self): + def get_hostname(self): + hostname = socket.getfqdn() + return hostname + + def is_self(self, health_item): hostname = socket.getfqdn() pid = os.getpid() - hb_thread = threading.current_thread() - thread_id = hb_thread.ident - thread_name = hb_thread.name + thread_id = self.get_thread_id() + ret = False + if ('hostname' in health_item and 'pid' in health_item and 'agent' in health_item + and 'thread_id' in health_item and health_item['hostname'] == hostname # noqa W503 + and health_item['pid'] == pid and health_item['agent'] == self.get_name() # noqa W503 + and health_item['thread_id'] == thread_id): # noqa W503 + ret = True + if not ret: + self.logger.debug("is_self: hostname %s, pid %s, thread_id %s, agent %s, health %s" % (hostname, pid, thread_id, self.get_name(), health_item)) + return ret + + def get_health_payload(self): + return None + + def health_heartbeat(self, heartbeat_delay=None): + if heartbeat_delay: + self.heartbeat_delay = heartbeat_delay + hostname = socket.getfqdn() + pid = os.getpid() + thread_id = self.get_thread_id() + thread_name = self.get_thread_name() + payload = self.get_health_payload() + self.logger.debug("health heartbeat: agent %s, pid %s, thread %s, delay %s" % (self.get_name(), pid, thread_name, self.heartbeat_delay)) core_health.add_health_item(agent=self.get_name(), hostname=hostname, pid=pid, - thread_id=thread_id, thread_name=thread_name, payload=None) + thread_id=thread_id, thread_name=thread_name, payload=payload) + core_health.clean_health(older_than=self.heartbeat_delay * 2) + + health_items = core_health.retrieve_health_items() + pids, pid_not_exists = [], [] + for health_item in health_items: + if health_item['hostname'] == hostname: + pid = health_item['pid'] + if pid not in pids: + pids.append(pid) + for pid in pids: + if not pid_exists(pid): + pid_not_exists.append(pid) + if pid_not_exists: + core_health.clean_health(hostname=hostname, pids=pid_not_exists, older_than=None) def add_default_tasks(self): task = self.create_task(task_func=self.health_heartbeat, task_output_queue=None, diff --git 
a/main/lib/idds/agents/common/eventbus/baseeventbusbackend.py b/main/lib/idds/agents/common/eventbus/baseeventbusbackend.py new file mode 100644 index 00000000..e92ad6cf --- /dev/null +++ b/main/lib/idds/agents/common/eventbus/baseeventbusbackend.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + +import logging +import time +import threading +import traceback +import uuid + +from .event import StateClaimEvent, EventBusState + + +class BaseEventBusBackend(threading.Thread): + """ + Base Event Bus Backend + """ + + def __init__(self, logger=None, **kwargs): + super(BaseEventBusBackend, self).__init__() + self._id = str(uuid.uuid4())[:8] + self._state_claim_wait = 60 + self._state_claim = StateClaimEvent(self._id, EventBusState.New, time.time()) + + self.graceful_stop = threading.Event() + + self._events = {} + self._events_index = {} + + self._lock = threading.RLock() + + self.setup_logger(logger) + + self.coordinator = None + + def setup_logger(self, logger=None): + """ + Setup logger + """ + if logger: + self.logger = logger + else: + self.logger = logging.getLogger(self.get_class_name()) + + def get_class_name(self): + return self.__class__.__name__ + + def stop(self, signum=None, frame=None): + self.graceful_stop.set() + + def send(self, event): + if self.get_coordinator(): + return self.get_coordinator().send(event) + else: + with self._lock: + if event._event_type not in self._events: + self._events[event._event_type] = {} + self._events_index[event._event_type] = [] + self._events[event._event_type][event._id] = event + self._events_index[event._event_type].append(event._id) + + def get(self, event_type, wait=0): + if self.get_coordinator(): + return self.get_coordinator().get(event_type, wait) + else: + with self._lock: + if event_type in self._events_index and self._events_index[event_type]: + event_id = self._events_index[event_type].pop(0) + event = self._events[event_type][event_id] + del self._events[event_type][event_id] + return event + return None + + def send_report(self, event, status, start_time, end_time, source, result): + if self.get_coordinator(): + return self.get_coordinator().send_report(event, status, start_time, end_time, source, result) + + def clean_event(self, event): + pass + + def fail_event(self, event): + pass + + def set_manager(self, manager): + pass + + def get_manager(self): + return None + + def set_coordinator(self, coordinator): + self.coordinator = coordinator + + def get_coordinator(self): + return self.coordinator + + def is_ok(self): + return True + + def execute(self): + while not self.graceful_stop.is_set(): + try: + self.graceful_stop.wait(0.1) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + + def run(self): + self.execute() diff --git a/main/lib/idds/agents/common/eventbus/baseeventbusbackendopt.py b/main/lib/idds/agents/common/eventbus/baseeventbusbackendopt.py new file mode 100644 index 00000000..9630695f --- /dev/null +++ b/main/lib/idds/agents/common/eventbus/baseeventbusbackendopt.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + +import logging +import time +import threading +import uuid + +from .event import StateClaimEvent, EventBusState +from .baseeventbusbackend import BaseEventBusBackend + + +class BaseEventBusBackendOpt(BaseEventBusBackend): + """ + Base Event Bus Backend + """ + + def __init__(self, logger=None, **kwargs): + super(BaseEventBusBackendOpt, self).__init__() + self._id = str(uuid.uuid4())[:8] + self._state_claim_wait = 60 + self._state_claim = StateClaimEvent(self._id, EventBusState.New, time.time()) + + self.graceful_stop = threading.Event() + + self._events = {} + self._events_index = {} + self._events_act_id_index = {} + self._events_history = {} + self._events_history_clean_time = time.time() + self._events_insert_time = {} + self._lock = threading.RLock() + + self.max_delay = 180 + + self.setup_logger(logger) + + def setup_logger(self, logger=None): + """ + Setup logger + """ + if logger: + self.logger = logger + else: + self.logger = logging.getLogger(self.get_class_name()) + + def get_class_name(self): + return self.__class__.__name__ + + def send(self, event): + if self.get_coordinator(): + return self.get_coordinator().send(event) + else: + with self._lock: + if event._event_type not in self._events: + self._events[event._event_type] = {} + self._events_index[event._event_type] = [] + self._events_act_id_index[event._event_type] = {} + self._events_history[event._event_type] = {} + self._events_insert_time[event._event_type] = {} + + self.logger.debug("All events: %s" % self._events) + + merged = False + event_act_id = event.get_event_id() + if event_act_id not in self._events_act_id_index[event._event_type]: + self._events_act_id_index[event._event_type][event_act_id] = [event._id] + else: + old_event_ids = self._events_act_id_index[event._event_type][event_act_id].copy() + for old_event_id in old_event_ids: + if old_event_id not in self._events[event._event_type]: + self._events_act_id_index[event._event_type][event_act_id].remove(old_event_id) + else: + old_event = self._events[event._event_type][old_event_id] + if event.able_to_merge(old_event): + old_event.merge(event) + self._events[event._event_type][old_event_id] = old_event + self.logger.debug("New event %s is merged to old event %s" % (event, old_event)) + merged = True + if not merged: + self._events_act_id_index[event._event_type][event_act_id].append(event._id) + + if not merged: + if event_act_id not in self._events_history[event._event_type]: + self._events[event._event_type][event._id] = event + self._events_index[event._event_type].insert(0, event._id) + self._events_insert_time[event._event_type][event._id] = time.time() + self.logger.debug("Insert new event: %s" % event) + else: + hist_time = self._events_history[event._event_type][event_act_id] + insert_loc = len(self._events_index[event._event_type]) + q_event_ids = self._events_index[event._event_type].copy() + q_event_ids.reverse() + for q_event_id in q_event_ids: + q_event = self._events[event._event_type][q_event_id] + q_event_act_id = q_event.get_event_id() + if (q_event_act_id not in self._events_history[event._event_type] or self._events_insert_time[event._event_type][q_event_id] + self.max_delay < time.time()): + break + elif self._events_history[event._event_type][q_event_act_id] > hist_time: + insert_loc -= 1 + else: + break + self._events[event._event_type][event._id] = event + 
self._events_index[event._event_type].insert(insert_loc, event._id) + self._events_insert_time[event._event_type][event._id] = time.time() + self.logger.debug("Insert new event: %s" % event) + + if self._events_history_clean_time + 3600 * 4 < time.time(): + self._events_history_clean_time = time.time() + for event_type in self._events_index: + event_act_ids = [] + for event_id in self._events_index[event._event_type]: + event = self._events[event._event_type][event_id] + act_id = event.get_event_id() + event_act_ids.append(act_id) + + event_history_keys = list(self._events_history[event._event_type].keys()) + for key in event_history_keys: + if key not in event_act_ids: + del self._events_history[event._event_type][key] + + act_id_keys = list(self._events_act_id_index[event._event_type].keys()) + for act_id_key in act_id_keys: + act_id2ids = self._events_act_id_index[event._event_type][act_id_key].copy() + for q_id in act_id2ids: + if q_id not in self._events_index[event._event_type]: + self._events_act_id_index[event._event_type][act_id_key].remove(q_id) + if not self._events_act_id_index[event._event_type][act_id_key]: + del self._events_act_id_index[event._event_type][act_id_key] + + def get(self, event_type, wait=0): + if self.get_coordinator(): + return self.get_coordinator().get(event_type, wait) + else: + with self._lock: + if event_type in self._events_index and self._events_index[event_type]: + event_id = self._events_index[event_type].pop(0) + event = self._events[event_type][event_id] + event_act_id = event.get_event_id() + self._events_history[event_type][event_act_id] = time.time() + del self._events[event_type][event_id] + del self._events_insert_time[event._event_type][event._id] + return event + return None diff --git a/main/lib/idds/agents/common/eventbus/dbeventbusbackend.py b/main/lib/idds/agents/common/eventbus/dbeventbusbackend.py new file mode 100644 index 00000000..eab1dfa1 --- /dev/null +++ b/main/lib/idds/agents/common/eventbus/dbeventbusbackend.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
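The send()/get() pair shown above for BaseEventBusBackendOpt merges a newly published event into an already queued event for the same target (same get_event_id()) instead of enqueuing a duplicate, and uses a per-target history to bias where late arrivals are inserted. A much-reduced sketch of the merge-on-send part is below; Event here is a simplified stand-in, and the real backend additionally tracks per-type indexes, insert times and the processed-event history used for ordering.

import uuid
from collections import OrderedDict

class Event:
    """Simplified stand-in for the real event classes (able_to_merge/merge/get_event_id exist upstream)."""
    def __init__(self, event_type, target_id, content=None):
        self._id = str(uuid.uuid4())
        self._event_type = event_type
        self._target_id = target_id
        self._content = content or {}

    def get_event_id(self):
        return "%s_%s" % (self._event_type, self._target_id)

    def able_to_merge(self, other):
        return self.get_event_id() == other.get_event_id()

    def merge(self, other):
        self._content.update(other._content)

queued = OrderedDict()   # per-target event id -> queued Event

def send(event):
    key = event.get_event_id()
    if key in queued and queued[key].able_to_merge(event):
        queued[key].merge(event)      # merge into the queued event instead of duplicating it
    else:
        queued[key] = event

def get():
    if queued:
        _, event = queued.popitem(last=False)   # FIFO
        return event
    return None

Merging keeps repeated UpdateProcessing/TriggerProcessing events for the same processing from piling up in the queue while still delivering the latest content.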
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + +import logging +import time +import threading +import uuid + +from idds.common.event import StateClaimEvent, EventBusState +from idds.core import events as core_events + +from .baseeventbusbackend import BaseEventBusBackend + + +class DBEventBusBackend(BaseEventBusBackend): + """ + Database Event Bus Backend + """ + + def __init__(self, logger=None, to_archive=True, **kwargs): + super(DBEventBusBackend, self).__init__() + self._id = str(uuid.uuid4())[:8] + self._state_claim_wait = 60 + self._state_claim = StateClaimEvent(self._id, EventBusState.New, time.time()) + + self.graceful_stop = threading.Event() + + self._events = {} + self._events_index = {} + self._events_act_id_index = {} + self._events_history = {} + self._events_history_clean_time = time.time() + self._events_insert_time = {} + self._lock = threading.RLock() + + self.max_delay = 180 + + self.to_archive = to_archive + + self.setup_logger(logger) + + def setup_logger(self, logger=None): + """ + Setup logger + """ + if logger: + self.logger = logger + else: + self.logger = logging.getLogger(self.get_class_name()) + + def get_class_name(self): + return self.__class__.__name__ + + def stop(self, signum=None, frame=None): + self.graceful_stop.set() + + def send(self, event): + ret = core_events.add_event(event) + self.logger.info("add event: %s, ret: %s" % (event, ret)) + + def get(self, event_type, wait=0): + event = core_events.get_event_for_processing(event_type=event_type) + return event + + def clean_event(self, event): + core_events.clean_event(event, to_archive=self.to_archive) + + def fail_event(self, event): + core_events.fail_event(event, to_archive=self.to_archive) diff --git a/main/lib/idds/agents/common/eventbus/event.py b/main/lib/idds/agents/common/eventbus/event.py index 5f2af27c..7c7d9cf9 100644 --- a/main/lib/idds/agents/common/eventbus/event.py +++ b/main/lib/idds/agents/common/eventbus/event.py @@ -6,287 +6,6 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2022 - 2023 +# - Wen Guan, , 2023 -import time -import uuid - -from enum import Enum - -from idds.common.utils import json_dumps - - -class EventBusState(Enum): - New = 0 - Master = 1 - Slave = 2 - Unknown = 3 - - -class EventType(Enum): - Event = 0 - StateClaim = 1 - Demand = 2 - - NewRequest = 10 - UpdateRequest = 11 - AbortRequest = 12 - ResumeRequest = 13 - ExpireRequest = 14 - - NewTransform = 20 - UpdateTransform = 21 - AbortTransform = 22 - ResumeTransform = 23 - - NewProcessing = 30 - UpdateProcessing = 31 - AbortProcessing = 32 - ResumeProcessing = 33 - SyncProcessing = 34 - TerminatedProcessing = 35 - TriggerProcessing = 36 - MsgTriggerProcessing = 37 - - UpdateCommand = 40 - - -class Event(object): - def __init__(self, publisher_id, event_type=EventType.Event, content=None, counter=1): - self._id = str(uuid.uuid4()) - self._publisher_id = publisher_id - self._event_type = event_type - self._timestamp = time.time() - self._counter = counter - self._content = content - - def to_json(self): - ret = {'id': self._id, 'publisher_id': self._publisher_id, - 'event_type': (self._event_type.name, self._event_type.value), - 'timestamp': self._timestamp, - 'counter': self._counter, - 'content': self._content} - return ret - - def __str__(self): - return json_dumps(self.to_json()) - - -class StateClaimEvent(Event): - def __init__(self, publisher_id, event_bus_state, content=None, counter=1): - 
super(StateClaimEvent, self).__init__(publisher_id, event_type=EventType.StateClaim, content=content, counter=counter) - self._event_bus_state = event_bus_state - - def to_json(self): - ret = super(StateClaimEvent, self).to_json() - ret['event_bus_state'] = self._event_bus_state - return ret - - -class DemandEvent(Event): - def __init__(self, publisher_id, demand_type, content=None, counter=1): - super(DemandEvent, self).__init__(publisher_id, event_type=EventType.Demand, content=content, counter=counter) - self._demand_type = demand_type - - def to_json(self): - ret = super(DemandEvent, self).to_json() - ret['demand_type'] = self._demand_type - return ret - - -class NewRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None, counter=1): - super(NewRequestEvent, self).__init__(publisher_id, event_type=EventType.NewRequest, content=content, counter=counter) - self._request_id = request_id - - def to_json(self): - ret = super(NewRequestEvent, self).to_json() - ret['request_id'] = self._request_id - return ret - - -class UpdateRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None, counter=1): - super(UpdateRequestEvent, self).__init__(publisher_id, event_type=EventType.UpdateRequest, content=content, counter=counter) - self._request_id = request_id - - def to_json(self): - ret = super(UpdateRequestEvent, self).to_json() - ret['request_id'] = self._request_id - return ret - - -class AbortRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None, counter=1): - super(AbortRequestEvent, self).__init__(publisher_id, event_type=EventType.AbortRequest, content=content, counter=counter) - self._request_id = request_id - - def to_json(self): - ret = super(AbortRequestEvent, self).to_json() - ret['request_id'] = self._request_id - return ret - - -class ResumeRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None, counter=1): - super(ResumeRequestEvent, self).__init__(publisher_id, event_type=EventType.ResumeRequest, content=content, counter=counter) - self._request_id = request_id - - def to_json(self): - ret = super(ResumeRequestEvent, self).to_json() - ret['request_id'] = self._request_id - return ret - - -class ExpireRequestEvent(Event): - def __init__(self, publisher_id, request_id, content=None, counter=1): - super(ExpireRequestEvent, self).__init__(publisher_id, event_type=EventType.ExpireRequest, content=content, counter=counter) - self._request_id = request_id - - def to_json(self): - ret = super(ExpireRequestEvent, self).to_json() - ret['request_id'] = self._request_id - return ret - - -class UpdateCommandEvent(Event): - def __init__(self, publisher_id, command_id, content=None, counter=1): - super(UpdateCommandEvent, self).__init__(publisher_id, event_type=EventType.UpdateCommand, content=content, counter=counter) - self._command_id = command_id - - def to_json(self): - ret = super(UpdateCommandEvent, self).to_json() - ret['command_id'] = self._command_id - return ret - - -class NewTransformEvent(Event): - def __init__(self, publisher_id, transform_id, content=None, counter=1): - super(NewTransformEvent, self).__init__(publisher_id, event_type=EventType.NewTransform, content=content, counter=counter) - self._transform_id = transform_id - - def to_json(self): - ret = super(NewTransformEvent, self).to_json() - ret['transform_id'] = self._transform_id - return ret - - -class UpdateTransformEvent(Event): - def __init__(self, publisher_id, transform_id, content=None, counter=1): - 
super(UpdateTransformEvent, self).__init__(publisher_id, event_type=EventType.UpdateTransform, content=content, counter=counter) - self._transform_id = transform_id - - def to_json(self): - ret = super(UpdateTransformEvent, self).to_json() - ret['transform_id'] = self._transform_id - return ret - - -class AbortTransformEvent(Event): - def __init__(self, publisher_id, transform_id, content=None, counter=1): - super(AbortTransformEvent, self).__init__(publisher_id, event_type=EventType.AbortTransform, content=content, counter=counter) - self._transform_id = transform_id - - def to_json(self): - ret = super(AbortTransformEvent, self).to_json() - ret['transform_id'] = self._transform_id - return ret - - -class ResumeTransformEvent(Event): - def __init__(self, publisher_id, transform_id, content=None, counter=1): - super(ResumeTransformEvent, self).__init__(publisher_id, event_type=EventType.ResumeTransform, content=content, counter=counter) - self._transform_id = transform_id - - def to_json(self): - ret = super(ResumeTransformEvent, self).to_json() - ret['transform_id'] = self._transform_id - return ret - - -class NewProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None, counter=1): - super(NewProcessingEvent, self).__init__(publisher_id, event_type=EventType.NewProcessing, content=content, counter=counter) - self._processing_id = processing_id - - def to_json(self): - ret = super(NewProcessingEvent, self).to_json() - ret['processing_id'] = self._processing_id - return ret - - -class UpdateProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None, counter=1): - super(UpdateProcessingEvent, self).__init__(publisher_id, event_type=EventType.UpdateProcessing, content=content, counter=counter) - self._processing_id = processing_id - - def to_json(self): - ret = super(UpdateProcessingEvent, self).to_json() - ret['processing_id'] = self._processing_id - return ret - - -class AbortProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None, counter=1): - super(AbortProcessingEvent, self).__init__(publisher_id, event_type=EventType.AbortProcessing, content=content, counter=counter) - self._processing_id = processing_id - - def to_json(self): - ret = super(AbortProcessingEvent, self).to_json() - ret['processing_id'] = self._processing_id - return ret - - -class ResumeProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None, counter=1): - super(ResumeProcessingEvent, self).__init__(publisher_id, event_type=EventType.ResumeProcessing, content=content, counter=counter) - self._processing_id = processing_id - - def to_json(self): - ret = super(ResumeProcessingEvent, self).to_json() - ret['processing_id'] = self._processing_id - return ret - - -class SyncProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None, counter=1): - super(SyncProcessingEvent, self).__init__(publisher_id, event_type=EventType.SyncProcessing, content=content, counter=counter) - self._processing_id = processing_id - - def to_json(self): - ret = super(SyncProcessingEvent, self).to_json() - ret['processing_id'] = self._processing_id - return ret - - -class TerminatedProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None, counter=1): - super(TerminatedProcessingEvent, self).__init__(publisher_id, event_type=EventType.TerminatedProcessing, content=content, counter=counter) - self._processing_id = processing_id - - def to_json(self): - ret = 
super(TerminatedProcessingEvent, self).to_json() - ret['processing_id'] = self._processing_id - return ret - - -class TriggerProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None, counter=1): - super(TriggerProcessingEvent, self).__init__(publisher_id, event_type=EventType.TriggerProcessing, content=content, counter=counter) - self._processing_id = processing_id - - def to_json(self): - ret = super(TriggerProcessingEvent, self).to_json() - ret['processing_id'] = self._processing_id - return ret - - -class MsgTriggerProcessingEvent(Event): - def __init__(self, publisher_id, processing_id, content=None, counter=1): - super(MsgTriggerProcessingEvent, self).__init__(publisher_id, event_type=EventType.MsgTriggerProcessing, content=content, counter=counter) - self._processing_id = processing_id - - def to_json(self): - ret = super(MsgTriggerProcessingEvent, self).to_json() - ret['processing_id'] = self._processing_id - return ret +from idds.common.event import * # noqa F401 F403 diff --git a/main/lib/idds/agents/common/eventbus/eventbus.py b/main/lib/idds/agents/common/eventbus/eventbus.py index 45764ef1..59055d15 100644 --- a/main/lib/idds/agents/common/eventbus/eventbus.py +++ b/main/lib/idds/agents/common/eventbus/eventbus.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2022 +# - Wen Guan, , 2022 - 2023 import logging import uuid @@ -14,7 +14,10 @@ from idds.common.constants import Sections from idds.common.config import config_has_section, config_list_options -from .localeventbusbackend import LocalEventBusBackend +# from .localeventbusbackend import LocalEventBusBackend +from .baseeventbusbackendopt import BaseEventBusBackendOpt +from .dbeventbusbackend import DBEventBusBackend +from .msgeventbusbackend import MsgEventBusBackend class Singleton(object): @@ -41,12 +44,36 @@ def __init__(self, logger=None): self.setup_logger(logger) self.config_section = Sections.EventBus attrs = self.load_attributes() - if 'backend' in attrs and attrs['backend'] == 'message': - # ToBeDone - # self.backend = MsgEventBusBackend(**attrs) - pass - else: - self.backend = LocalEventBusBackend(logger=self.logger, **attrs) + self.attrs = attrs + self._backend = None + self._orig_backend = None + if 'backend' in attrs: + if attrs['backend'] == 'message': + self.backend = MsgEventBusBackend(logger=self.logger, **attrs) + elif attrs['backend'] == "database": + if 'to_archive' not in attrs: + attrs['to_archive'] = True + self.backend = DBEventBusBackend(**attrs) + if self.backend is None: + self.backend = BaseEventBusBackendOpt(logger=self.logger, **attrs) + self.logger.info("EventBus backend : %s" % self.backend) + self.backend.start() + + @property + def backend(self): + if self._backend and isinstance(self._backend, MsgEventBusBackend) and not self._backend.is_ok(): + self._orig_backend = self._backend + self._backend = BaseEventBusBackendOpt(logger=self.logger, **self.attrs) + self.logger.critical("MsgEventBusBackend failed, switch to use BaseEventBusBackendOpt") + elif self._orig_backend and isinstance(self._orig_backend, MsgEventBusBackend) and self._orig_backend.is_ok(): + self.logger.critical("MsgEventBusBackend is ok, switch back to use it") + self._backend = self._orig_backend + self._orig_backend = None + return self._backend + + @backend.setter + def backend(self, value): + self._backend = value def setup_logger(self, logger=None): """ @@ -87,5 +114,26 @@ def get(self, event_type): def send(self, event): return 
self.publish_event(event) + def send_report(self, event, status, start_time, end_time, source, result): + return self.backend.send_report(event, status, start_time, end_time, source, result) + + def clean_event(self, event): + self.backend.clean_event(event) + + def fail_event(self, event): + self.backend.fail_event(event) + + def set_manager(self, manager): + self.backend.set_manager(manager) + + def get_manager(self): + return self.backend.get_manager() + + def get_coordinator(self): + return self.backend.get_coordinator() + + def set_coordinator(self, coordinator): + self.backend.set_coordinator(coordinator) + def stop(self): self.backend.stop() diff --git a/main/lib/idds/agents/common/eventbus/localeventbusbackend.py b/main/lib/idds/agents/common/eventbus/localeventbusbackend.py deleted file mode 100644 index aa1a5c42..00000000 --- a/main/lib/idds/agents/common/eventbus/localeventbusbackend.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0OA -# -# Authors: -# - Wen Guan, , 2022 - -import logging -import time -import threading -import traceback -import uuid - -from .event import StateClaimEvent, EventBusState - - -class LocalEventBusBackend(threading.Thread): - """ - Local Event Bus Backend - """ - - def __init__(self, logger=None, **kwargs): - super(LocalEventBusBackend, self).__init__() - self._id = str(uuid.uuid4())[:8] - self._state_claim_wait = 60 - self._state_claim = StateClaimEvent(self._id, EventBusState.New, time.time()) - - self.graceful_stop = threading.Event() - - self._events = {} - self._events_index = {} - - self._lock = threading.RLock() - - self.setup_logger(logger) - - def setup_logger(self, logger=None): - """ - Setup logger - """ - if logger: - self.logger = logger - else: - self.logger = logging.getLogger(self.get_class_name()) - - def get_class_name(self): - return self.__class__.__name__ - - def stop(self, signum=None, frame=None): - self.graceful_stop.set() - - def send(self, event): - with self._lock: - if event._event_type not in self._events: - self._events[event._event_type] = {} - self._events_index[event._event_type] = [] - self._events[event._event_type][event._id] = event - self._events_index[event._event_type].append(event._id) - - def get(self, event_type, wait=0): - with self._lock: - if event_type in self._events_index and self._events_index[event_type]: - event_id = self._events_index[event_type].pop(0) - event = self._events[event_type][event_id] - del self._events[event_type][event_id] - return event - return None - - def execute(self): - while not self.graceful_stop.is_set(): - try: - self.graceful_stop.wait(0.1) - except Exception as error: - self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) - - def run(self): - self.execute() diff --git a/main/lib/idds/agents/common/eventbus/msgeventbusbackend.py b/main/lib/idds/agents/common/eventbus/msgeventbusbackend.py new file mode 100644 index 00000000..c4a6d1a8 --- /dev/null +++ b/main/lib/idds/agents/common/eventbus/msgeventbusbackend.py @@ -0,0 +1,489 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
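# ---------------------------------------------------------------------------
# Reviewer note (illustrative sketch, not part of the patch): the EventBus
# failover introduced above keeps the original MsgEventBusBackend around and
# switches to the fallback backend only while is_ok() reports too many
# failures/timeouts. The pattern in isolation, with hypothetical stand-ins
# for MsgEventBusBackend and BaseEventBusBackendOpt:

class _Backend(object):
    def __init__(self, name, ok=True):
        self.name = name
        self.ok = ok

    def is_ok(self):
        return self.ok

    def send(self, event):
        return "%s sent %s" % (self.name, event)


class _FailoverBus(object):
    def __init__(self):
        self._backend = _Backend("primary")      # e.g. MsgEventBusBackend
        self._orig_backend = None                # remembered while degraded

    @property
    def backend(self):
        if self._orig_backend is None and not self._backend.is_ok():
            # primary unhealthy: demote it but keep a reference to switch back
            self._orig_backend = self._backend
            self._backend = _Backend("fallback")  # e.g. BaseEventBusBackendOpt
        elif self._orig_backend is not None and self._orig_backend.is_ok():
            # primary healthy again: promote it and drop the fallback
            self._backend = self._orig_backend
            self._orig_backend = None
        return self._backend

    def send(self, event):
        return self.backend.send(event)
# ---------------------------------------------------------------------------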
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + +import logging +import random +import socket +import string +import time +import threading +import traceback +import uuid + +import zmq +from zmq.auth.thread import ThreadAuthenticator + +from idds.common.utils import json_dumps, json_loads + +from .event import StateClaimEvent, EventBusState, TestEvent +from .baseeventbusbackend import BaseEventBusBackend + + +class MsgEventBusBackendReceiver(threading.Thread): + def __init__(self, name="MsgEventBusBackendReceiver", logger=None, debug=False, + graceful_stop=None, coordinator=None, coordinator_socket=None, **kwargs): + threading.Thread.__init__(self, name=name) + self.logger = logger + self.graceful_stop = graceful_stop + self.coordinator = coordinator + self.coordinator_socket = coordinator_socket + + self._events = {} + self._events_index = {} + self._events_act_id_index = {} + self._events_history = {} + self._events_history_clean_time = time.time() + self._events_insert_time = {} + + self.max_delay = 180 + + self._stop = threading.Event() + self._lock = threading.RLock() + + self.debug = debug + + def set_coordinator(self, coordinator): + self.coordinator = coordinator + + def stop(self): + self._stop.set() + + def run(self): + while not self.graceful_stop.is_set(): + try: + if self._stop.is_set(): + return + + try: + req = self.coordinator_socket.recv_string() + if self.debug: + self.logger.debug("MsgEventBusBackendReceiver received: %s" % req) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.coordinator_socket.close() + + try: + req = json_loads(req) + reply = {'ret': None} + if self.coordinator: + if req['type'] == 'send_event': + event = req['event'] + ret = self.coordinator.send(event) + reply = {'type': 'send_event_ret', 'ret': ret} + elif req['type'] == 'get_event': + event_type = req['event_type'] + wait = req['wait'] + ret = self.coordinator.get(event_type, wait) + reply = {'type': 'get_event_ret', 'ret': ret} + else: + if req['type'] == 'send_event': + event = req['event'] + ret = self.send(event) + reply = {'type': 'send_event_ret', 'ret': ret} + elif req['type'] == 'get_event': + event_type = req['event_type'] + wait = req['wait'] + ret = self.get(event_type, wait) + reply = {'type': 'get_event_ret', 'ret': ret} + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + reply = {'type': 'error', 'ret': None} + + reply = json_dumps(reply) + try: + if self.debug: + self.logger.debug("MsgEventBusBackendReceiver reply: %s" % reply) + self.coordinator_socket.send_string(reply) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.coordinator_socket.close() + + self.graceful_stop.wait(0.1) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + + def send(self, event): + with self._lock: + if event._event_type not in self._events: + self._events[event._event_type] = {} + self._events_index[event._event_type] = [] + self._events_act_id_index[event._event_type] = {} + self._events_history[event._event_type] = {} + self._events_insert_time[event._event_type] = {} + + self.logger.debug("All events: %s" % self._events) + + merged = False + event_act_id = event.get_event_id() + if event_act_id not in 
self._events_act_id_index[event._event_type]: + self._events_act_id_index[event._event_type][event_act_id] = [event._id] + else: + old_event_ids = self._events_act_id_index[event._event_type][event_act_id].copy() + for old_event_id in old_event_ids: + if old_event_id not in self._events[event._event_type]: + self._events_act_id_index[event._event_type][event_act_id].remove(old_event_id) + else: + old_event = self._events[event._event_type][old_event_id] + if event.able_to_merge(old_event): + old_event.merge(event) + self._events[event._event_type][old_event_id] = old_event + self.logger.debug("New event %s is merged to old event %s" % (event, old_event)) + merged = True + if not merged: + self._events_act_id_index[event._event_type][event_act_id].append(event._id) + + if not merged: + if event_act_id not in self._events_history[event._event_type]: + self._events[event._event_type][event._id] = event + self._events_index[event._event_type].insert(0, event._id) + self._events_insert_time[event._event_type][event._id] = time.time() + self.logger.debug("Insert new event: %s" % event) + else: + hist_time = self._events_history[event._event_type][event_act_id] + insert_loc = len(self._events_index[event._event_type]) + q_event_ids = self._events_index[event._event_type].copy() + q_event_ids.reverse() + for q_event_id in q_event_ids: + q_event = self._events[event._event_type][q_event_id] + q_event_act_id = q_event.get_event_id() + if (q_event_act_id not in self._events_history[event._event_type] or self._events_insert_time[event._event_type][q_event_id] + self.max_delay < time.time()): + break + elif self._events_history[event._event_type][q_event_act_id] > hist_time: + insert_loc -= 1 + else: + break + self._events[event._event_type][event._id] = event + self._events_index[event._event_type].insert(insert_loc, event._id) + self._events_insert_time[event._event_type][event._id] = time.time() + self.logger.debug("Insert new event: %s" % event) + + if self._events_history_clean_time + 3600 * 4 < time.time(): + self._events_history_clean_time = time.time() + for event_type in self._events_index: + event_act_ids = [] + for event_id in self._events_index[event._event_type]: + event = self._events[event._event_type][event_id] + act_id = event.get_event_id() + event_act_ids.append(act_id) + + event_history_keys = list(self._events_history[event._event_type].keys()) + for key in event_history_keys: + if key not in event_act_ids: + del self._events_history[event._event_type][key] + + act_id_keys = list(self._events_act_id_index[event._event_type].keys()) + for act_id_key in act_id_keys: + act_id2ids = self._events_act_id_index[event._event_type][act_id_key].copy() + for q_id in act_id2ids: + if q_id not in self._events_index[event._event_type]: + self._events_act_id_index[event._event_type][act_id_key].remove(q_id) + if not self._events_act_id_index[event._event_type][act_id_key]: + del self._events_act_id_index[event._event_type][act_id_key] + + def get(self, event_type, wait=0): + with self._lock: + if event_type in self._events_index and self._events_index[event_type]: + event_id = self._events_index[event_type].pop(0) + event = self._events[event_type][event_id] + event_act_id = event.get_event_id() + self._events_history[event_type][event_act_id] = time.time() + del self._events[event_type][event_id] + del self._events_insert_time[event._event_type][event._id] + return event + return None + + +class MsgEventBusBackend(BaseEventBusBackend): + """ + Msg Event Bus Backend + """ + + def __init__(self, 
logger=None, coordinator_port=5556, socket_timeout=10, debug=False, + timeout_threshold=5, failure_threshold=5, **kwargs): + super(MsgEventBusBackend, self).__init__() + self._id = str(uuid.uuid4())[:8] + self._state_claim_wait = 60 + self._state_claim = StateClaimEvent(self._id, EventBusState.New, time.time()) + + self.graceful_stop = threading.Event() + + self._lock = threading.RLock() + + self.max_delay = 180 + + self._username = 'idds' + self._password = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(20)) + + self._is_ok = True + self.num_success = 0 + self.num_failures = 0 + self.num_timeout = 0 + self.cache_events = [] + + self.setup_logger(logger) + + self.socket_timeout = int(socket_timeout) + self.timeout_threshold = int(timeout_threshold) + self.failure_threshold = int(failure_threshold) + + self.coordinator_port = int(coordinator_port) + self.context = None + self.auth = None + self.coordinator_socket = None + self.coordinator_con_string = None + + self.processor = None + + self.manager = None + self.manager_socket = None + + self.debug = debug + + self.init_msg_channel() + + def setup_logger(self, logger=None): + """ + Setup logger + """ + if logger: + self.logger = logger + else: + self.logger = logging.getLogger(self.get_class_name()) + + def get_class_name(self): + return self.__class__.__name__ + + def stop(self, signum=None, frame=None): + self.logger.debug("graceful stop") + self.graceful_stop.set() + if self.auth: + self.logger.debug("auth stop") + self.auth.stop() + + def init_msg_channel(self): + with self._lock: + try: + if not self.context: + self.context = zmq.Context() + if self.auth: + self.auth.stop() + + self.auth = ThreadAuthenticator(self.context) + self.auth.start() + # self.auth.allow('127.0.0.1') + self.auth.allow() + # Instruct authenticator to handle PLAIN requests + self.auth.configure_plain(domain='*', passwords={self._username: self._password}) + + if not self.coordinator_socket or self.coordinator_socket.closed: + self.coordinator_socket = self.context.socket(zmq.REP) + self.coordinator_socket.plain_server = True + self.coordinator_socket.bind("tcp://*:%s" % self.coordinator_port) + + hostname = socket.getfqdn() + self.coordinator_con_string = "tcp://%s:%s" % (hostname, self.coordinator_port) + + if self.processor: + self.processor.stop() + + self.processor = MsgEventBusBackendReceiver(logger=self.logger, + graceful_stop=self.graceful_stop, + debug=self.debug, + coordinator_socket=self.coordinator_socket, + coordinator=self.coordinator) + self.processor.start() + except (zmq.error.ZMQError, zmq.Again) as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.num_failures += 1 + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.num_failures += 1 + + try: + if not self.manager_socket or self.manager_socket.closed: + manager = self.get_manager() + self.manager_socket = self.context.socket(zmq.REQ) + self.manager_socket.plain_username = manager['username'].encode('utf-8') + self.manager_socket.plain_password = manager['password'].encode('utf-8') + self.manager_socket.connect(manager['connect']) + except (zmq.error.ZMQError, zmq.Again) as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.num_failures += 1 + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), 
traceback.format_exc())) + self.num_failures += 1 + + def set_manager(self, manager): + if not manager: + return None + + if (not self.manager or self.manager['connect'] != manager['connect'] + or self.manager['username'] != manager['username'] # noqa W503, E129 + or self.manager['password'] != manager['password']): # noqa W503, E129 + with self._lock: + try: + self.manager = manager + self.manager_socket = self.context.socket(zmq.REQ) + self.manager_socket.plain_username = manager['username'].encode('utf-8') + self.manager_socket.plain_password = manager['password'].encode('utf-8') + self.manager_socket.connect(manager['connect']) + except (zmq.error.ZMQError, zmq.Again) as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.num_failures += 1 + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.num_failures += 1 + + def get_manager(self): + if self.manager: + return self.manager + manager = {'connect': self.coordinator_con_string, + 'username': self._username, + 'password': self._password} + return manager + + def set_coordinator(self, coordinator): + self.coordinator = coordinator + if self.processor: + self.processor.set_coordinator(coordinator) + + def get_coordinator(self): + return self.coordinator + + def send(self, event): + with self._lock: + try: + req = {'type': 'send_event', 'event': event} + req = json_dumps(req) + # self.logger.debug("send:send %s" % req) + if self.debug: + self.logger.debug("MsgEventBusBackend send event: %s" % req) + + self.manager_socket.send_string(req) + if self.manager_socket.poll(self.socket_timeout * 1000): + reply = self.manager_socket.recv_string() + # self.logger.debug("send:recv %s" % reply) + if self.debug: + self.logger.debug("MsgEventBusBackend send event reply: %s" % reply) + reply = json_loads(reply) + ret = reply['ret'] + + # refresh failures when there are successful requests + self.num_failures = 0 + self.num_timeout = 0 + self.num_success += 1 + else: + ret = None + self.cache_events.append(event) + self.num_timeout += 1 + self.logger.critical("timeout to receive a message") + + return ret + except (zmq.error.ZMQError, zmq.Again) as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.manager_socket.close() + self.cache_events.append(event) + self.num_failures += 1 + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.manager_socket.close() + self.cache_events.append(event) + self.num_failures += 1 + + def get(self, event_type, wait=0): + with self._lock: + try: + req = {'type': 'get_event', 'event_type': event_type, 'wait': wait} + req = json_dumps(req) + # self.logger.debug("get:send %s" % req) + + if self.debug: + self.logger.debug("MsgEventBusBackend get event: %s" % req) + self.manager_socket.send_string(req) + + if self.manager_socket.poll(10 * 1000): + reply = self.manager_socket.recv_string() + # self.logger.debug("send:recv %s" % reply) + if self.debug: + self.logger.debug("MsgEventBusBackend get event reply: %s" % reply) + reply = json_loads(reply) + ret = reply['ret'] + + # refresh failures when there are successful requests + self.num_failures = 0 + self.num_success += 1 + self.num_timeout = 0 + else: + ret = None + self.num_timeout += 1 + self.logger.critical("timeout to receive a message") + + return ret + except (zmq.error.ZMQError, zmq.Again) as error: + 
self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.manager_socket.close() + self.num_failures += 1 + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.manager_socket.close() + self.num_failures += 1 + + def test(self): + if self.num_failures > 0 or self.num_timeout > 0: + event = TestEvent() + self.send(event) + self.get(event._event_type) + if self.num_timeout > 5: + if not self.manager_socket.closed: + self.logger.critical("The number of timeout reached a threshold, close connection.") + self.manager_socket.close() + + def send_report(self, event, status, start_time, end_time, source, result): + if self.get_coordinator(): + return self.get_coordinator().send_report(event, status, start_time, end_time, source, result) + + def clean_event(self, event): + pass + + def fail_event(self, event): + pass + + def is_ok(self): + if self.num_failures > self.failure_threshold or self.num_timeout > self.timeout_threshold: + self._is_ok = False + else: + self._is_ok = True + return self._is_ok + + def replay_cache_events(self): + cache_events = self.cache_events + self.cache_events = [] + for event in cache_events: + self.send(event) + + def execute(self): + while not self.graceful_stop.is_set(): + try: + self.init_msg_channel() + self.test() + if self.is_ok(): + self.replay_cache_events() + self.graceful_stop.wait(1) + else: + if self.num_failures > 20: + self.graceful_stop.wait(300) + else: + self.graceful_stop.wait(60) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + self.stop() + + def run(self): + self.execute() diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index cda86ccb..a398779a 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -10,7 +10,6 @@ import logging -import json import random import socket import threading @@ -19,7 +18,7 @@ import stomp from idds.common.plugin.plugin_base import PluginBase -from idds.common.utils import setup_logging, get_logger +from idds.common.utils import setup_logging, get_logger, json_dumps, json_loads setup_logging(__name__) @@ -69,7 +68,7 @@ def __init__(self, name="MessagingSender", logger=None, **kwargs): if not hasattr(self, 'channels'): raise Exception('"channels" is required but not defined.') - self.channels = json.loads(self.channels) + self.channels = json_loads(self.channels) self.broker_timeout = 3600 @@ -194,8 +193,8 @@ def send_message(self, msg): if conn: self.logger.info("Sending message to message broker(%s): %s" % (destination, msg['msg_id'])) - self.logger.debug("Sending message to message broker(%s): %s" % (destination, json.dumps(msg['msg_content']))) - conn.send(body=json.dumps(msg['msg_content']), + self.logger.debug("Sending message to message broker(%s): %s" % (destination, json_dumps(msg['msg_content']))) + conn.send(body=json_dumps(msg['msg_content']), destination=queue_dest, id='atlas-idds-messaging', ack='auto', diff --git a/main/lib/idds/agents/common/timerscheduler.py b/main/lib/idds/agents/common/timerscheduler.py index c99a4f5d..dfd5505d 100644 --- a/main/lib/idds/agents/common/timerscheduler.py +++ b/main/lib/idds/agents/common/timerscheduler.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 +# - Wen Guan, , 2020 - 2023 import heapq @@ -17,6 +17,36 
@@ from .timertask import TimerTask +class IDDSThreadPoolExecutor(futures.ThreadPoolExecutor): + def __init__(self, max_workers=None, thread_name_prefix='', + initializer=None, initargs=()): + self.futures = [] + self._lock = threading.RLock() + super(IDDSThreadPoolExecutor, self).__init__(max_workers=max_workers, + thread_name_prefix=thread_name_prefix, + initializer=initializer, + initargs=initargs) + + def submit(self, fn, *args, **kwargs): + future = super(IDDSThreadPoolExecutor, self).submit(fn, *args, **kwargs) + with self._lock: + self.futures.append(future) + return future + + def get_max_workers(self): + return self._max_workers + + def get_num_workers(self): + with self._lock: + for future in self.futures.copy(): + if future.done(): + self.futures.remove(future) + return len(self.futures) + + def has_free_workers(self): + return self.get_num_workers() < self._max_workers + + class TimerScheduler(threading.Thread): """ The base class to schedule Task which will be executed after some time @@ -28,8 +58,8 @@ def __init__(self, num_threads, name=None, logger=None): if self.num_threads < 1: self.num_threads = 1 self.graceful_stop = threading.Event() - self.executors = futures.ThreadPoolExecutor(max_workers=self.num_threads, - thread_name_prefix=name) + self.executors = IDDSThreadPoolExecutor(max_workers=self.num_threads, + thread_name_prefix=name) self._task_queue = [] self._lock = threading.RLock() diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index c3167ebf..e51fa8ca 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -33,7 +33,7 @@ class Conductor(BaseAgent): """ def __init__(self, num_threads=1, retrieve_bulk_size=1000, threshold_to_release_messages=None, - random_delay=None, delay=60, replay_times=3, **kwargs): + random_delay=None, delay=60, interval_delay=10, replay_times=3, **kwargs): super(Conductor, self).__init__(num_threads=num_threads, name='Conductor', **kwargs) self.config_section = Sections.Conductor self.retrieve_bulk_size = int(retrieve_bulk_size) @@ -55,6 +55,9 @@ def __init__(self, num_threads=1, retrieve_bulk_size=1000, threshold_to_release_ if replay_times is None: replay_times = 3 self.replay_times = int(replay_times) + if not interval_delay: + interval_delay = 10 + self.interval_delay = int(interval_delay) self.logger = get_logger(self.__class__.__name__) def __del__(self): @@ -135,19 +138,24 @@ def run(self): """ try: self.logger.info("Starting main thread") + self.init_thread_info() self.load_plugins() + self.add_default_tasks() + self.start_notifier() # self.add_health_message_task() while not self.graceful_stop.is_set(): # execute timer task - self.execute_once() + self.execute_schedules() try: num_contents = 0 messages = self.get_messages() + if not messages: + time.sleep(self.interval_delay) for message in messages: message['destination'] = message['destination'].name diff --git a/main/lib/idds/agents/coordinator/__init__.py b/main/lib/idds/agents/coordinator/__init__.py new file mode 100644 index 00000000..865b774e --- /dev/null +++ b/main/lib/idds/agents/coordinator/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
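# ---------------------------------------------------------------------------
# Reviewer note (illustrative sketch, not part of the patch): how the
# future-tracking IDDSThreadPoolExecutor defined above can be used by an agent.
# Because submitted futures are remembered, has_free_workers() lets the caller
# stop pulling new work once every worker thread is busy. The import path
# assumes the patched code is installed as the idds package.

import time

from idds.agents.common.timerscheduler import IDDSThreadPoolExecutor


def handle(event):
    time.sleep(1)          # stand-in for real event processing
    return event


executors = IDDSThreadPoolExecutor(max_workers=2, thread_name_prefix='Demo')
pending = ['event-1', 'event-2', 'event-3']
while pending:
    if executors.has_free_workers():
        executors.submit(handle, pending.pop(0))
    else:
        time.sleep(0.1)    # pool saturated: wait instead of queueing unbounded work
executors.shutdown(wait=True)
# ---------------------------------------------------------------------------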
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 diff --git a/main/lib/idds/agents/coordinator/coordinator.py b/main/lib/idds/agents/coordinator/coordinator.py new file mode 100644 index 00000000..8708d5b9 --- /dev/null +++ b/main/lib/idds/agents/coordinator/coordinator.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + +import random +import time +import threading +import traceback + +from idds.common.constants import (Sections) +from idds.common.exceptions import IDDSException +from idds.common.event import EventPriority +from idds.common.utils import setup_logging, get_logger, json_dumps, json_loads +from idds.core import health as core_health +from idds.agents.common.baseagent import BaseAgent + + +setup_logging(__name__) + + +class Coordinator(BaseAgent): + """ + Coordinator works to schedule agents to process different events + """ + + def __init__(self, num_threads=1, coordination_interval_delay=300, + interval_delay=30, max_queued_events=10, + max_total_files_for_small_task=1000, + interval_delay_for_big_task=300, + max_boost_interval_delay=3, + show_queued_events_time_interval=300, **kwargs): + super(Coordinator, self).__init__(num_threads=num_threads, name='Coordinator', **kwargs) + self.config_section = Sections.Coordinator + + self.coordination_interval_delay = coordination_interval_delay + if self.coordination_interval_delay: + self.coordination_interval_delay = int(self.coordination_interval_delay) + else: + self.coordination_interval_delay = 300 + + self._lock = threading.RLock() + + self.events = {} + self.events_index = {} + self.events_ids = {} + self.report = {} + self.accounts = {} + + self.interval_delay = interval_delay + if self.interval_delay: + self.interval_delay = int(self.interval_delay) + else: + self.interval_delay = 30 + self.max_boost_interval_delay = int(max_boost_interval_delay) + self.max_queued_events = max_queued_events + if self.max_queued_events: + self.max_queued_events = int(self.max_queued_events) + else: + self.max_queued_events = 10 + self.max_total_files_for_small_task = max_total_files_for_small_task + if not self.max_total_files_for_small_task: + self.max_total_files_for_small_task = 1000 + else: + self.max_total_files_for_small_task = int(self.max_total_files_for_small_task) + self.interval_delay_for_big_task = interval_delay_for_big_task + if not self.interval_delay_for_big_task: + self.interval_delay_for_big_task = 300 + else: + self.interval_delay_for_big_task = int(self.interval_delay_for_big_task) + self.logger = get_logger(self.__class__.__name__) + + self.show_queued_events_time = None + self.show_queued_events_time_interval = int(show_queued_events_time_interval) + + def __del__(self): + self.stop_coordinator() + + def get_health_payload(self): + manager = self.event_bus.get_manager() + payload = {'manager': manager} + payload = json_dumps(payload) + return payload + + def select_coordinator(self): + self.health_heartbeat(self.coordination_interval_delay) + self.selected_coordinator = core_health.select_agent(name='Coordinator', newer_than=self.coordination_interval_delay * 2) + self.logger.debug("Selected coordinator: %s" % self.selected_coordinator) + payload = 
json_loads(self.selected_coordinator['payload']) + self.event_bus.set_manager(payload['manager']) + self.event_bus.set_coordinator(self) + + def get_schedule_time(self, event, interval_delay): + last_start_time = self.report.get(event.get_event_id(), {}).get("event_types", {}).get(event._event_type.name, {}).get('start_time', None) + last_end_time = self.report.get(event.get_event_id(), {}).get("event_types", {}).get(event._event_type.name, {}).get('end_time', None) + time_delay = None + requeue_counter = event.get_requeue_counter() + if requeue_counter: + if requeue_counter <= 3: + time_delay = interval_delay * requeue_counter + else: + time_delay = interval_delay * random.randint(3, requeue_counter) + else: + if last_start_time and last_end_time: + time_delay = last_end_time - last_start_time + if not time_delay or time_delay < interval_delay: + time_delay = interval_delay + if last_end_time: + scheduled_time = last_end_time + time_delay + if scheduled_time < time.time(): + scheduled_time = time.time() + else: + scheduled_time = time.time() + return scheduled_time + + def get_scheduled_prio_time(self, event): + if event.is_terminating(): + priority = EventPriority.High + return priority, time.time() + elif event.has_updates(): + priority = EventPriority.Medium + else: + priority = EventPriority.Low + + total_queued_events = self.accounts.get(event._event_type, {}).get('total_queued_events', 0) + # total_processed_events = self.accounts.get(event._event_type, {}).get('total_processed_events', 0) + total_files = self.report.get(event.get_event_id(), {}).get("total_files", None) + processed_files = self.report.get(event.get_event_id(), {}).get("processed_files", None) + # lack_events = self.accounts.get(event._event_type, {}).get('lack_events', False) + + big_task = False + to_finish = False + if total_files and total_files > self.max_total_files_for_small_task: + big_task = True + if total_files and processed_files and total_files - processed_files <= self.max_total_files_for_small_task: + to_finish = True + + if big_task and not to_finish: + interval_delay = self.interval_delay_for_big_task + elif total_queued_events and total_queued_events > self.max_queued_events: + boost_times = 2 * min(int(total_queued_events * 1.0 / self.max_queued_events), self.max_boost_interval_delay) + interval_delay = self.interval_delay * boost_times + else: + interval_delay = self.interval_delay + scheduled_time = self.get_schedule_time(event, interval_delay) + return priority, scheduled_time + + def get_event_position(self, event_index, event): + if not event_index: + return 0 + for i, event_id in enumerate(event_index): + if self.events[event_id].scheduled_time > event.scheduled_time: + return i + return len(event_index) + + def send(self, event): + event.scheduled_priority, event.scheduled_time = self.get_scheduled_prio_time(event) + with self._lock: + merge = False + for old_event_id in self.events_ids.get(event.get_event_id(), []): + old_event = self.events[old_event_id] + if old_event.able_to_merge(event): + old_scheduled_priority = old_event.scheduled_priority + old_scheduled_time = old_event.scheduled_time + + old_event.merge(event) + + self.events[old_event_id] = old_event + # old_event.scheduled_priority, old_event.scheduled_time = self.get_scheduled_prio_time(old_event) + new_scheduled_priority, new_scheduled_time = self.get_scheduled_prio_time(old_event) + + if old_scheduled_priority != new_scheduled_priority or old_scheduled_time != new_scheduled_time: + 
self.events_index[old_event._event_type][old_scheduled_priority].remove(old_event._id) + old_event.scheduled_priority = new_scheduled_priority + old_event.scheduled_time = new_scheduled_time + if old_event.scheduled_priority not in self.events_index[old_event._event_type]: + self.events_index[old_event._event_type][old_event.scheduled_priority] = [] + insert_pos = self.get_event_position(self.events_index[old_event._event_type][old_event.scheduled_priority], old_event) + self.events_index[old_event._event_type][old_event.scheduled_priority].insert(insert_pos, old_event._id) + merge = True + self.logger.debug("New event %s is merged to old event %s" % (event.to_json(strip=True), old_event.to_json(strip=True))) + break + if not merge: + if event._event_type not in self.events_index: + self.events_index[event._event_type] = {} + if event.scheduled_priority not in self.events_index[event._event_type]: + self.events_index[event._event_type][event.scheduled_priority] = [] + if event.get_event_id() not in self.events_ids: + self.events_ids[event.get_event_id()] = [] + + self.events[event._id] = event + self.logger.debug("New event %s" % (event.to_json(strip=True))) + + insert_pos = self.get_event_position(self.events_index[event._event_type][event.scheduled_priority], event) + self.events_index[event._event_type][event.scheduled_priority].insert(insert_pos, event._id) + self.events_ids[event.get_event_id()].append(event._id) + + if event._event_type not in self.accounts: + self.accounts[event._event_type] = {'total_queued_events': 1, 'total_processed_events': 0, 'lack_events': False} + else: + self.accounts[event._event_type]['total_queued_events'] += 1 + + def get(self, event_type, wait=0): + with self._lock: + if event_type in self.events_index: + for scheduled_priority in [EventPriority.High, EventPriority.Medium, EventPriority.Low]: + if (scheduled_priority in self.events_index[event_type] and self.events_index[event_type][scheduled_priority]): + event_id = self.events_index[event_type][scheduled_priority][0] + event = self.events[event_id] + if event.scheduled_time <= time.time(): + event_id = self.events_index[event_type][scheduled_priority].pop(0) + event = self.events[event_id] + del self.events[event_id] + self.events_ids[event.get_event_id()].remove(event_id) + + if event._event_type in self.accounts: + self.accounts[event._event_type]['total_queued_events'] -= 1 + self.accounts[event._event_type]['total_processed_events'] += 1 + + self.logger.debug("Get event %s" % (event.to_json(strip=True))) + return event + + if event_type in self.accounts: + if self.accounts[event_type]['total_queued_events'] == 0: + self.accounts[event_type]['lack_events'] = True + return None + + def send_report(self, event, status, start_time, end_time, source, result): + event_id = event.get_event_id() + event_ret_status = status + event_name = event._event_type.name + if not event_ret_status and result: + event_ret_status = result.get("status", None) + if event_id not in self.report: + self.report[event_id] = {"status": event_ret_status, + "total_files": None, + "processed_files": None, + "event_types": {}} + self.report[event_id]['status'] = event_ret_status + self.report[event_id]['event_types'][event_name] = {'start_time': start_time, + 'end_time': end_time, + 'source': source, + 'status': event_ret_status, + 'result': result} + + def clean_cache_info(self): + with self._lock: + event_ids = list(self.events_ids.keys()) + for event_id in event_ids: + if not self.events_ids[event_id]: + del 
self.events_ids[event_id] + + event_ids = list(self.report.keys()) + for event_id in event_ids: + event_types = list(self.report[event_id]['event_types'].keys()) + for event_type in event_types: + end_time = self.report[event_id]['event_types'][event_type].get('end_time', None) + if not end_time or end_time < time.time() - 86400 * 10: + del self.report[event_id]['event_types'][event_type] + if not self.report[event_id]['event_types']: + del self.report[event_id] + + def show_queued_events(self): + if self.show_queued_events_time is None or self.show_queued_events_time + self.show_queued_events_time_interval < time.time(): + self.show_queued_events_time = time.time() + for event_type in self.events_index: + self.logger.info("Number of events has processed: %s: %s" % (event_type.name, self.accounts.get(event_type, {}).get('total_processed_events', None))) + for prio in self.events_index[event_type]: + self.logger.info("Number of queued events: %s %s: %s" % (event_type.name, prio.name, len(self.events_index[event_type][prio]))) + + def coordinate(self): + self.select_coordinator() + self.clean_cache_info() + self.show_queued_events() + + def run(self): + """ + Main run function. + """ + try: + self.logger.info("Starting main thread") + self.init_thread_info() + # self.load_plugins() + + # coordinator will do the heartbeat by itself. + # self.add_default_tasks() + + # self.add_health_message_task() + + while not self.graceful_stop.is_set(): + try: + self.execute_schedules() + self.coordinate() + time.sleep(self.coordination_interval_delay) + except IDDSException as error: + self.logger.error("Main thread IDDSException: %s" % str(error)) + except Exception as error: + self.logger.critical("Main thread exception: %s\n%s" % (str(error), traceback.format_exc())) + # time.sleep(random.randint(5, self.random_delay)) + except KeyboardInterrupt: + self.stop() + + def stop(self): + super(Coordinator, self).stop() + + +if __name__ == '__main__': + agent = Coordinator() + agent() diff --git a/main/lib/idds/agents/main.py b/main/lib/idds/agents/main.py index f7de3440..aff5ffb3 100755 --- a/main/lib/idds/agents/main.py +++ b/main/lib/idds/agents/main.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2023 """ Main start entry point for iDDS service @@ -38,7 +38,9 @@ 'trigger': ['idds.agents.carrier.trigger.Trigger', Sections.Carrier], 'finisher': ['idds.agents.carrier.finisher.Finisher', Sections.Carrier], 'conductor': ['idds.agents.conductor.conductor.Conductor', Sections.Conductor], - 'consumer': ['idds.agents.conductor.consumer.Consumer', Sections.Consumer] + 'consumer': ['idds.agents.conductor.consumer.Consumer', Sections.Consumer], + 'archiver': ['idds.agents.archive.archiver.Archiver', Sections.Archiver], + 'coordinator': ['idds.agents.coordinator.coordinator.Coordinator', Sections.Coordinator] } RUNNING_AGENTS = [] diff --git a/main/lib/idds/agents/transformer/transformer.py b/main/lib/idds/agents/transformer/transformer.py index a140c9d8..8d1a4aff 100644 --- a/main/lib/idds/agents/transformer/transformer.py +++ b/main/lib/idds/agents/transformer/transformer.py @@ -15,7 +15,8 @@ import traceback from idds.common import exceptions -from idds.common.constants import (Sections, TransformStatus, TransformLocking, +from idds.common.constants import (Sections, ReturnCode, + TransformStatus, TransformLocking, CommandType, ProcessingStatus) from idds.common.utils import setup_logging, truncate_string from idds.core import (transforms 
as core_transforms, @@ -525,17 +526,20 @@ def handle_update_transform(self, transform, event): def process_update_transform(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: - tf_status = [TransformStatus.Transforming, - TransformStatus.ToCancel, TransformStatus.Cancelling, - TransformStatus.ToSuspend, TransformStatus.Suspending, - TransformStatus.ToExpire, TransformStatus.Expiring, - TransformStatus.ToResume, TransformStatus.Resuming, - TransformStatus.ToFinish, TransformStatus.ToForceFinish] - tf = self.get_transform(transform_id=event._transform_id, status=tf_status, locking=True) + # tf_status = [TransformStatus.Transforming, + # TransformStatus.ToCancel, TransformStatus.Cancelling, + # TransformStatus.ToSuspend, TransformStatus.Suspending, + # TransformStatus.ToExpire, TransformStatus.Expiring, + # TransformStatus.ToResume, TransformStatus.Resuming, + # TransformStatus.ToFinish, TransformStatus.ToForceFinish] + # tf = self.get_transform(transform_id=event._transform_id, status=tf_status, locking=True) + tf = self.get_transform(transform_id=event._transform_id, locking=True) if not tf: self.logger.error("Cannot find transform for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(tf) @@ -561,7 +565,9 @@ def process_update_transform(self, event): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def handle_abort_transform(self, transform): """ @@ -600,12 +606,14 @@ def handle_abort_transform(self, transform): def process_abort_transform(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: self.logger.info("process_abort_transform: event: %s" % event) tf = self.get_transform(transform_id=event._transform_id, locking=True) if not tf: self.logger.error("Cannot find transform for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(tf) self.logger.info(log_pre + "process_abort_transform") @@ -640,7 +648,9 @@ def process_abort_transform(self, event): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def handle_resume_transform(self, transform): """ @@ -672,12 +682,14 @@ def handle_resume_transform(self, transform): def process_resume_transform(self, event): self.number_workers += 1 + pro_ret = ReturnCode.Ok.value try: if event: self.logger.info("process_resume_transform: event: %s" % event) tf = self.get_transform(transform_id=event._transform_id, locking=True) if not tf: self.logger.error("Cannot find transform for event: %s" % str(event)) + pro_ret = ReturnCode.Locked.value else: log_pre = self.get_log_prefix(tf) @@ -715,7 +727,9 @@ def process_resume_transform(self, event): except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + pro_ret = ReturnCode.Failed.value self.number_workers -= 1 + return pro_ret def clean_locks(self): self.logger.info("clean locking") @@ -747,6 +761,7 @@ def run(self): """ try: self.logger.info("Starting main thread") + self.init_thread_info() self.load_plugins() diff --git a/main/lib/idds/core/catalog.py b/main/lib/idds/core/catalog.py index b468432b..85a14f7f 100644 --- a/main/lib/idds/core/catalog.py +++ b/main/lib/idds/core/catalog.py @@ -656,6 +656,17 @@ def update_contents_from_others_by_dep_id(request_id=None, transform_id=None, se return 
orm_contents.update_contents_from_others_by_dep_id(request_id=request_id, transform_id=transform_id, session=session)
 
 
+@read_session
+def get_update_contents_from_others_by_dep_id(request_id=None, transform_id=None, session=None):
+    """
+    Get update contents from others by content_dep_id
+
+    :param request_id: The Request id.
+    :param transform_id: The transform id.
+    """
+    return orm_contents.get_update_contents_from_others_by_dep_id(request_id=request_id, transform_id=transform_id, session=session)
+
+
 @transactional_session
 def add_contents_update(contents, bulk_size=10000, session=None):
     """
@@ -673,7 +684,27 @@ def add_contents_update(contents, bulk_size=10000, session=None):
 
 
 @transactional_session
-def delete_contents_update(request_id=None, transform_id=None, session=None):
+def set_fetching_contents_update(request_id=None, transform_id=None, fetch=False, session=None):
+    """
+    Set fetching contents update.
+
+    :param session: session.
+    """
+    return orm_contents.set_fetching_contents_update(request_id=request_id, transform_id=transform_id, fetch=fetch, session=session)
+
+
+@read_session
+def get_contents_update(request_id=None, transform_id=None, fetch=False, session=None):
+    """
+    Get contents update.
+
+    :param session: session.
+    """
+    return orm_contents.get_contents_update(request_id=request_id, transform_id=transform_id, fetch=fetch, session=session)
+
+
+@transactional_session
+def delete_contents_update(request_id=None, transform_id=None, contents=[], fetch=False, session=None):
     """
     delete a content.
 
@@ -682,7 +713,7 @@ def delete_contents_update(request_id=None, transform_id=None, session=None):
     :raises NoObject: If no content is founded.
     :raises DatabaseException: If there is a database error.
     """
-    return orm_contents.delete_contents_update(request_id=request_id, transform_id=transform_id, session=session)
+    return orm_contents.delete_contents_update(request_id=request_id, transform_id=transform_id, contents=contents, fetch=fetch, session=session)
 
 
 def get_contents_ext_maps():
diff --git a/main/lib/idds/core/events.py b/main/lib/idds/core/events.py
new file mode 100644
index 00000000..ab69ff53
--- /dev/null
+++ b/main/lib/idds/core/events.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0OA
+#
+# Authors:
+# - Wen Guan, , 2023
+
+
+"""
+operations related to Events.
+"""
+
+from idds.orm.base.session import read_session, transactional_session
+from idds.orm import events as orm_events
+
+
+@transactional_session
+def add_event(event, session=None):
+    """
+    Add an event to be submitted asynchronously to a command broker.
+
+    :param event: The Event object.
+    :param session: The database session.
+    """
+    return orm_events.add_event(event=event, session=session)
+
+
+@read_session
+def get_events(event_type, event_actual_id, status=None, session=None):
+    """
+    Get events
+
+    :param event_type: event type.
+    :param event_actual_id: event actual id.
+    :param status: event status.
+    """
+    return orm_events.get_events(event_type=event_type, event_actual_id=event_actual_id,
+                                 status=status, session=session)
+
+
+@read_session
+def get_event_priority(event_type, event_actual_id, session=None):
+    """
+    Get event priority
+
+    :param event_type: event type.
+    :param event_actual_id: event actual id.
+ """ + return orm_events.get_event_priority(event_type=event_type, event_actual_id=event_actual_id, session=session) + + +@transactional_session +def update_event(event_id, status, session=None): + """ + Update event + + :param event_id: event id. + :param status: Event status. + """ + return orm_events.update_event(evnet_id=event_id, status=status, session=session) + + +@transactional_session +def get_event_for_processing(event_type, session=None): + """ + Get event for processing + + :param event_type: event type. + """ + return orm_events.get_event_for_processing(event_type=event_type, session=session) + + +@transactional_session +def delete_event(event_id, session=None): + """ + Delete event with the given id. + + :param event_id: The event id. + """ + return orm_events.delete_event(event_id=event_id, session=session) + + +@transactional_session +def add_event_archive(event, session=None): + """ + Add an event to the archive. + + :param event: The Event object. + :param session: The database session. + """ + return orm_events.add_event_archive(event=event, session=session) + + +@transactional_session +def clean_event(event, to_archive=True, session=None): + return orm_events.clean_event(event=event, to_archive=to_archive, session=session) + + +@transactional_session +def fail_event(event, to_archive=True, session=None): + return orm_events.fail_event(event=event, to_archive=to_archive, session=session) diff --git a/main/lib/idds/core/health.py b/main/lib/idds/core/health.py index 9f630a71..074f0e62 100644 --- a/main/lib/idds/core/health.py +++ b/main/lib/idds/core/health.py @@ -6,18 +6,22 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 +# - Wen Guan, , 2020 - 2023 """ operations related to Health. """ +import datetime +from idds.common.constants import HealthStatus from idds.orm import health as orm_health +from idds.orm.base.session import read_session, transactional_session -def add_health_item(agent, hostname, pid, thread_id, thread_name, payload): +@transactional_session +def add_health_item(agent, hostname, pid, thread_id, thread_name, payload, session=None): """ Add a health item. @@ -31,22 +35,67 @@ def add_health_item(agent, hostname, pid, thread_id, thread_name, payload): return orm_health.add_health_item(agent=agent, hostname=hostname, pid=pid, thread_id=thread_id, thread_name=thread_name, - payload=payload) + payload=payload, + session=session) -def retrieve_health_items(): +@read_session +def retrieve_health_items(session=None): """ Retrieve health items. :returns healths: List of dictionaries """ - return orm_health.retrieve_health_items() + return orm_health.retrieve_health_items(session=session) -def clean_health(older_than=3600): +@transactional_session +def clean_health(older_than=3600, hostname=None, pids=[], session=None): """ Clearn items which is older than the time. :param older_than in seconds """ - orm_health.clean_health(older_than=older_than) + orm_health.clean_health(older_than=older_than, hostname=hostname, pids=pids, session=session) + + +@transactional_session +def select_agent(name, newer_than=3600, session=None): + """ + Select one active receiver. + + :param older_than in seconds to be cleaned. 
+ """ + orm_health.clean_health(older_than=newer_than, session=session) + health_items = orm_health.retrieve_health_items(session=session) + selected_agent = None + selected_agent_diff = None + utc_now = datetime.datetime.utcnow() + for health_item in health_items: + if health_item['agent'] != name: + continue + + updated_at = health_item['updated_at'] + time_diff = utc_now - updated_at + if time_diff.total_seconds() > newer_than: + continue + + if health_item['status'] == HealthStatus.Active: + selected_agent = health_item + break + + if selected_agent is None: + selected_agent = health_item + selected_agent_diff = time_diff + else: + if time_diff < selected_agent_diff: + selected_agent = health_item + selected_agent_diff = time_diff + + if selected_agent: + if selected_agent['status'] != HealthStatus.Active: + orm_health.update_health_item_status(selected_agent, status=HealthStatus.Active, session=session) + for health_item in health_items: + if health_item['status'] == HealthStatus.Active and health_item != selected_agent: + orm_health.update_health_item_status(selected_agent, status=HealthStatus.Default, session=session) + return selected_agent diff --git a/main/lib/idds/core/messages.py b/main/lib/idds/core/messages.py index 1ca8f002..3eb38370 100644 --- a/main/lib/idds/core/messages.py +++ b/main/lib/idds/core/messages.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2023 """ @@ -114,3 +114,13 @@ def update_messages(messages, session=None): :param messages: The messages to be updated as a list of dictionaries. """ return orm_messages.update_messages(messages=messages, session=session) + + +@transactional_session +def clean_old_messages(request_id, session=None): + """ + Delete messages whose request id is older than request_id. + + :param request_id: request id.. + """ + return orm_messages.clean_old_messages(request_id=request_id, session=session) diff --git a/main/lib/idds/core/requests.py b/main/lib/idds/core/requests.py index 8bd872a9..6821922c 100644 --- a/main/lib/idds/core/requests.py +++ b/main/lib/idds/core/requests.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2022 +# - Wen Guan, , 2019 - 2023 """ @@ -454,3 +454,16 @@ def clean_next_poll_at(status, session=None): :param status: status of the request """ orm_requests.clean_next_poll_at(status=status, session=session) + + +@read_session +def get_last_request_id(status, older_than=None, session=None): + """ + Get last request id which is older than a timestamp. + + :param status: status of the request. + :param older_than: days older than current timestamp. + + :returns request_id + """ + return orm_requests.get_last_request_id(status=status, older_than=older_than, session=session) diff --git a/main/lib/idds/orm/base/alembic/script.py.mako b/main/lib/idds/orm/base/alembic/script.py.mako index d6f2e054..16763eea 100644 --- a/main/lib/idds/orm/base/alembic/script.py.mako +++ b/main/lib/idds/orm/base/alembic/script.py.mako @@ -1,5 +1,5 @@ #!/usr/bin/env python -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # You may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -16,6 +16,7 @@ Create Date: ${create_date} """ from alembic import op +from alembic import context import sqlalchemy as sa ${imports if imports else ""} @@ -27,8 +28,12 @@ depends_on = ${repr(depends_on)} def upgrade() -> None: - ${upgrades if upgrades else "pass"} + if context.get_context().dialect.name in ['oracle', 'mysql', 'postgresql']: + schema = context.get_context().version_table_schema if context.get_context().version_table_schema else '' + ${upgrades if upgrades else "pass"} def downgrade() -> None: - ${downgrades if downgrades else "pass"} + if context.get_context().dialect.name in ['oracle', 'mysql', 'postgresql']: + schema = context.get_context().version_table_schema if context.get_context().version_table_schema else '' + ${downgrades if downgrades else "pass"} diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index 3993f37f..5ba4728c 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -28,11 +28,12 @@ ProcessingStatus, ProcessingLocking, CollectionStatus, CollectionLocking, CollectionType, CollectionRelationType, ContentType, ContentRelationType, - ContentStatus, ContentLocking, GranularityType, + ContentStatus, ContentFetchStatus, ContentLocking, GranularityType, MessageType, MessageStatus, MessageLocking, MessageSource, MessageDestination, CommandType, CommandStatus, CommandLocking, - CommandLocation) + CommandLocation, HealthStatus) +from idds.common.event import (EventType, EventStatus) from idds.common.utils import date_to_str from idds.orm.base.enum import EnumSymbol from idds.orm.base.types import JSON, JSONString, EnumWithValue @@ -136,18 +137,18 @@ class Request(BASE, ModelBase): scope = Column(String(SCOPE_LENGTH)) name = Column(String(NAME_LENGTH)) requester = Column(String(20)) - request_type = Column(EnumWithValue(RequestType)) + request_type = Column(EnumWithValue(RequestType), nullable=False) username = Column(String(20)) userdn = Column(String(200)) transform_tag = Column(String(20)) workload_id = Column(Integer()) priority = Column(Integer()) - status = Column(EnumWithValue(RequestStatus)) + status = Column(EnumWithValue(RequestStatus), nullable=False) substatus = Column(EnumWithValue(RequestStatus), default=0) oldstatus = Column(EnumWithValue(RequestStatus), default=0) - locking = Column(EnumWithValue(RequestLocking)) - created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) - updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + locking = Column(EnumWithValue(RequestLocking), nullable=False) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow, nullable=False) next_poll_at = Column("next_poll_at", DateTime, default=datetime.datetime.utcnow) accessed_at = Column("accessed_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) expired_at = Column("expired_at", DateTime) @@ -241,7 +242,8 @@ def update(self, values, flush=True, session=None): CheckConstraint('status IS NOT NULL', name='REQUESTS_STATUS_ID_NN'), # UniqueConstraint('name', 'scope', 'requester', 'request_type', 'transform_tag', 'workload_id', name='REQUESTS_NAME_SCOPE_UQ '), Index('REQUESTS_SCOPE_NAME_IDX', 'name', 'scope', 'workload_id'), - Index('REQUESTS_STATUS_PRIO_IDX', 'status', 'priority', 'request_id', 'locking', 
'updated_at', 'next_poll_at', 'created_at')) + Index('REQUESTS_STATUS_PRIO_IDX', 'status', 'priority', 'request_id', 'locking', 'updated_at', 'next_poll_at', 'created_at'), + Index('REQUESTS_STATUS_POLL_IDX', 'status', 'priority', 'locking', 'updated_at', 'new_poll_period', 'update_poll_period', 'created_at', 'request_id')) class Workprogress(BASE, ModelBase): @@ -281,19 +283,19 @@ class Transform(BASE, ModelBase): """Represents a transform""" __tablename__ = 'transforms' transform_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('TRANSFORM_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) - request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + request_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) workload_id = Column(Integer()) - transform_type = Column(EnumWithValue(TransformType)) + transform_type = Column(EnumWithValue(TransformType), nullable=False) transform_tag = Column(String(20)) priority = Column(Integer()) safe2get_output_from_input = Column(Integer()) - status = Column(EnumWithValue(TransformStatus)) + status = Column(EnumWithValue(TransformStatus), nullable=False) substatus = Column(EnumWithValue(TransformStatus), default=0) oldstatus = Column(EnumWithValue(TransformStatus), default=0) - locking = Column(EnumWithValue(TransformLocking)) + locking = Column(EnumWithValue(TransformLocking), nullable=False) retries = Column(Integer(), default=0) - created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) - updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow, nullable=False) next_poll_at = Column("next_poll_at", DateTime, default=datetime.datetime.utcnow) started_at = Column("started_at", DateTime) finished_at = Column("finished_at", DateTime) @@ -365,7 +367,9 @@ def update(self, values, flush=True, session=None): _table_args = (PrimaryKeyConstraint('transform_id', name='TRANSFORMS_PK'), CheckConstraint('status IS NOT NULL', name='TRANSFORMS_STATUS_ID_NN'), Index('TRANSFORMS_TYPE_TAG_IDX', 'transform_type', 'transform_tag', 'transform_id'), - Index('TRANSFORMS_STATUS_UPDATED_AT_IDX', 'status', 'locking', 'updated_at', 'next_poll_at', 'created_at')) + Index('TRANSFORMS_STATUS_UPDATED_AT_IDX', 'status', 'locking', 'updated_at', 'next_poll_at', 'created_at'), + Index('TRANSFORMS_REQ_IDX', 'request_id', 'transform_id'), + Index('TRANSFORMS_STATUS_POLL_IDX', 'status', 'locking', 'updated_at', 'new_poll_period', 'update_poll_period', 'created_at', 'transform_id')) class Workprogress2transform(BASE, ModelBase): @@ -383,19 +387,19 @@ class Processing(BASE, ModelBase): """Represents a processing""" __tablename__ = 'processings' processing_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('PROCESSING_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) - transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) - request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + transform_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) + request_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) workload_id = Column(Integer()) - status = Column(EnumWithValue(ProcessingStatus)) + status = Column(EnumWithValue(ProcessingStatus), nullable=False) substatus = 
Column(EnumWithValue(ProcessingStatus), default=0) oldstatus = Column(EnumWithValue(ProcessingStatus), default=0) - locking = Column(EnumWithValue(ProcessingLocking)) + locking = Column(EnumWithValue(ProcessingLocking), nullable=False) submitter = Column(String(20)) submitted_id = Column(Integer()) granularity = Column(Integer()) granularity_type = Column(EnumWithValue(GranularityType)) - created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) - updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow, nullable=False) next_poll_at = Column("next_poll_at", DateTime, default=datetime.datetime.utcnow) poller_updated_at = Column("poller_updated_at", DateTime, default=datetime.datetime.utcnow) submitted_at = Column("submitted_at", DateTime) @@ -469,24 +473,25 @@ def update(self, values, flush=True, session=None): ForeignKeyConstraint(['transform_id'], ['transforms.transform_id'], name='PROCESSINGS_TRANSFORM_ID_FK'), CheckConstraint('status IS NOT NULL', name='PROCESSINGS_STATUS_ID_NN'), CheckConstraint('transform_id IS NOT NULL', name='PROCESSINGS_TRANSFORM_ID_NN'), - Index('PROCESSINGS_STATUS_UPDATED_IDX', 'status', 'locking', 'updated_at', 'next_poll_at', 'created_at')) + Index('PROCESSINGS_STATUS_UPDATED_IDX', 'status', 'locking', 'updated_at', 'next_poll_at', 'created_at'), + Index('PROCESSINGS_STATUS_POLL_IDX', 'status', 'processing_id', 'locking', 'updated_at', 'new_poll_period', 'update_poll_period', 'created_at')) class Collection(BASE, ModelBase): """Represents a collection""" __tablename__ = 'collections' coll_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('COLLECTION_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) - request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + request_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) workload_id = Column(Integer()) - transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) - coll_type = Column(EnumWithValue(CollectionType)) - relation_type = Column(EnumWithValue(CollectionRelationType)) + transform_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) + coll_type = Column(EnumWithValue(CollectionType), nullable=False) + relation_type = Column(EnumWithValue(CollectionRelationType), nullable=False) scope = Column(String(SCOPE_LENGTH)) name = Column(String(NAME_LENGTH)) bytes = Column(Integer()) - status = Column(EnumWithValue(CollectionStatus)) + status = Column(EnumWithValue(CollectionStatus), nullable=False) substatus = Column(EnumWithValue(CollectionStatus), default=0) - locking = Column(EnumWithValue(CollectionLocking)) + locking = Column(EnumWithValue(CollectionLocking), nullable=False) total_files = Column(Integer()) storage_id = Column(Integer()) new_files = Column(Integer()) @@ -500,8 +505,8 @@ class Collection(BASE, ModelBase): missing_ext_files = Column(Integer()) processing_id = Column(Integer()) retries = Column(Integer(), default=0) - created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) - updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + updated_at = Column("updated_at", 
DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow, nullable=False) next_poll_at = Column("next_poll_at", DateTime, default=datetime.datetime.utcnow) accessed_at = Column("accessed_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) expired_at = Column("expired_at", DateTime) @@ -514,28 +519,29 @@ class Collection(BASE, ModelBase): CheckConstraint('transform_id IS NOT NULL', name='COLLECTIONS_TRANSFORM_ID_NN'), Index('COLLECTIONS_STATUS_RELAT_IDX', 'status', 'relation_type'), Index('COLLECTIONS_TRANSFORM_IDX', 'transform_id', 'coll_id'), - Index('COLLECTIONS_STATUS_UPDATED_IDX', 'status', 'locking', 'updated_at', 'next_poll_at', 'created_at')) + Index('COLLECTIONS_STATUS_UPDATED_IDX', 'status', 'locking', 'updated_at', 'next_poll_at', 'created_at'), + Index('COLLECTIONS_REQ_IDX', 'request_id', 'transform_id', 'updated_at'),) class Content(BASE, ModelBase): """Represents a content""" __tablename__ = 'contents' content_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('CONTENT_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) - transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) - coll_id = Column(BigInteger().with_variant(Integer, "sqlite")) - request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + transform_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) + coll_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) + request_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) workload_id = Column(Integer()) - map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0) + map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0, nullable=False) content_dep_id = Column(BigInteger()) scope = Column(String(SCOPE_LENGTH)) name = Column(String(LONG_NAME_LENGTH)) min_id = Column(Integer(), default=0) max_id = Column(Integer(), default=0) - content_type = Column(EnumWithValue(ContentType)) - content_relation_type = Column(EnumWithValue(ContentRelationType), default=0) - status = Column(EnumWithValue(ContentStatus)) + content_type = Column(EnumWithValue(ContentType), nullable=False) + content_relation_type = Column(EnumWithValue(ContentRelationType), default=0, nullable=False) + status = Column(EnumWithValue(ContentStatus), nullable=False) substatus = Column(EnumWithValue(ContentStatus)) - locking = Column(EnumWithValue(ContentLocking)) + locking = Column(EnumWithValue(ContentLocking), nullable=False) bytes = Column(Integer()) md5 = Column(String(32)) adler32 = Column(String(8)) @@ -561,6 +567,8 @@ class Content(BASE, ModelBase): Index('CONTENTS_STATUS_UPDATED_IDX', 'status', 'locking', 'updated_at', 'created_at'), Index('CONTENTS_ID_NAME_IDX', 'coll_id', 'scope', 'name', 'status'), Index('CONTENTS_DEP_IDX', 'request_id', 'transform_id', 'content_dep_id'), + Index('CONTENTS_REL_IDX', 'request_id', 'content_relation_type', 'transform_id', 'substatus'), + Index('CONTENTS_TF_IDX', 'transform_id', 'request_id', 'coll_id', 'map_id', 'content_relation_type'), Index('CONTENTS_REQ_TF_COLL_IDX', 'request_id', 'transform_id', 'workload_id', 'coll_id', 'content_relation_type', 'status', 'substatus')) @@ -572,19 +580,21 @@ class Content_update(BASE, ModelBase): request_id = Column(BigInteger().with_variant(Integer, "sqlite")) transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) workload_id = Column(Integer()) + fetch_status = Column(EnumWithValue(ContentFetchStatus), default=0, nullable=False) 
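Note: the fetch_status column added to the Content_update model just above turns contents_update into a small claim-and-drain queue; the helpers that use it (set_fetching_contents_update, get_contents_update and the fetch branch of delete_contents_update) are added further down in main/lib/idds/orm/contents.py. A hedged sketch of the intended cycle follows; the drain_contents_update wrapper itself is illustrative and not part of the patch.

from idds.orm import contents as orm_contents


def drain_contents_update(request_id, transform_id):
    # Claim the current backlog: mark matching rows as Fetching so that rows
    # inserted afterwards are left for the next cycle.
    orm_contents.set_fetching_contents_update(request_id=request_id,
                                              transform_id=transform_id,
                                              fetch=True)
    # Read back only the rows claimed above (fetch_status == Fetching).
    rows = orm_contents.get_contents_update(request_id=request_id,
                                            transform_id=transform_id,
                                            fetch=True)
    # ... apply the collected updates to the contents table here ...
    # Delete exactly the claimed rows; unclaimed (New) rows survive.
    orm_contents.delete_contents_update(request_id=request_id,
                                        transform_id=transform_id,
                                        fetch=True)
    return rows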
coll_id = Column(BigInteger().with_variant(Integer, "sqlite")) + content_metadata = Column(JSONString(100)) class Content_ext(BASE, ModelBase): """Represents a content extension""" __tablename__ = 'contents_ext' content_id = Column(BigInteger().with_variant(Integer, "sqlite"), primary_key=True) - transform_id = Column(BigInteger().with_variant(Integer, "sqlite")) - coll_id = Column(BigInteger().with_variant(Integer, "sqlite")) - request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + transform_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) + coll_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) + request_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) workload_id = Column(Integer()) - map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0) - status = Column(EnumWithValue(ContentStatus)) + map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0, nullable=False) + status = Column(EnumWithValue(ContentStatus), nullable=False) panda_id = Column(BigInteger()) job_definition_id = Column(BigInteger()) scheduler_id = Column(String(128)) @@ -658,7 +668,9 @@ class Content_ext(BASE, ModelBase): job_label = Column(String(20)) _table_args = (PrimaryKeyConstraint('content_id', name='CONTENTS_EXT_PK'), - Index('CONTENTS_EXT_RTF_IDX', 'request_id', 'transform_id', 'workload_id', 'coll_id', 'content_id', 'panda_id', 'status')) + Index('CONTENTS_EXT_RTF_IDX', 'request_id', 'transform_id', 'workload_id', 'coll_id', 'content_id', 'panda_id', 'status'), + Index('CONTENTS_EXT_RTW_IDX', 'request_id', 'transform_id', 'workload_id'), + Index('CONTENTS_EXT_RTM_IDX', 'request_id', 'transform_id', 'map_id')) class Health(BASE, ModelBase): @@ -668,13 +680,15 @@ class Health(BASE, ModelBase): Sequence('HEALTH_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) agent = Column(String(30)) - hostname = Column(String(127)) + hostname = Column(String(500)) pid = Column(Integer, autoincrement=False) + status = Column(EnumWithValue(HealthStatus), default=0, nullable=False) thread_id = Column(BigInteger, autoincrement=False) thread_name = Column(String(255)) - payload = Column(String(255)) + payload = Column(String(2048)) created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + payload = Column(String(2048)) _table_args = (PrimaryKeyConstraint('health_id', name='HEALTH_PK'), UniqueConstraint('agent', 'hostname', 'pid', 'thread_id', name='HEALTH_UK')) @@ -685,26 +699,28 @@ class Message(BASE, ModelBase): msg_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('MESSAGE_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) - msg_type = Column(EnumWithValue(MessageType)) - status = Column(EnumWithValue(MessageStatus)) + msg_type = Column(EnumWithValue(MessageType), nullable=False) + status = Column(EnumWithValue(MessageStatus), nullable=False) substatus = Column(Integer()) - locking = Column(EnumWithValue(MessageLocking)) - source = Column(EnumWithValue(MessageSource)) - destination = Column(EnumWithValue(MessageDestination)) - request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + locking = Column(EnumWithValue(MessageLocking), nullable=False) + source = Column(EnumWithValue(MessageSource), nullable=False) + destination = Column(EnumWithValue(MessageDestination), nullable=False) + request_id = Column(BigInteger().with_variant(Integer, 
"sqlite"), nullable=False) workload_id = Column(Integer()) - transform_id = Column(Integer()) - processing_id = Column(Integer()) + transform_id = Column(Integer(), nullable=False) + processing_id = Column(Integer(), nullable=False) num_contents = Column(Integer()) retries = Column(Integer(), default=0) - created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) - updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow, nullable=False) msg_content = Column(JSON()) _table_args = (PrimaryKeyConstraint('msg_id', name='MESSAGES_PK'), Index('MESSAGES_TYPE_ST_IDX', 'msg_type', 'status', 'destination', 'request_id'), Index('MESSAGES_TYPE_ST_TF_IDX', 'msg_type', 'status', 'destination', 'transform_id'), - Index('MESSAGES_TYPE_ST_PR_IDX', 'msg_type', 'status', 'destination', 'processing_id')) + Index('MESSAGES_TYPE_ST_PR_IDX', 'msg_type', 'status', 'destination', 'processing_id'), + Index('MESSAGES_ST_IDX', 'status', 'destination', 'created_at'), + Index('MESSAGES_TYPE_STU_IDX', 'msg_type', 'status', 'destination', 'retries', 'updated_at', 'created_at')) class Command(BASE, ModelBase): @@ -713,27 +729,158 @@ class Command(BASE, ModelBase): cmd_id = Column(BigInteger().with_variant(Integer, "sqlite"), Sequence('COMMAND_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), primary_key=True) - request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + request_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) workload_id = Column(Integer()) transform_id = Column(Integer()) processing_id = Column(Integer()) cmd_type = Column(EnumWithValue(CommandType)) - status = Column(EnumWithValue(CommandStatus)) + status = Column(EnumWithValue(CommandStatus), nullable=False) substatus = Column(Integer()) - locking = Column(EnumWithValue(CommandLocking)) + locking = Column(EnumWithValue(CommandLocking), nullable=False) username = Column(String(50)) retries = Column(Integer(), default=0) source = Column(EnumWithValue(CommandLocation)) destination = Column(EnumWithValue(CommandLocation)) - created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) - updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow, nullable=False) cmd_content = Column(JSON()) errors = Column(JSONString(1024)) _table_args = (PrimaryKeyConstraint('cmd_id', name='COMMANDS_PK'), Index('COMMANDS_TYPE_ST_IDX', 'cmd_type', 'status', 'destination', 'request_id'), Index('COMMANDS_TYPE_ST_TF_IDX', 'cmd_type', 'status', 'destination', 'transform_id'), - Index('COMMANDS_TYPE_ST_PR_IDX', 'cmd_type', 'status', 'destination', 'processing_id')) + Index('COMMANDS_TYPE_ST_PR_IDX', 'cmd_type', 'status', 'destination', 'processing_id'), + Index('COMMANDS_STATUS_IDX', 'status', 'locking', 'updated_at')) + + +class EventPriority(BASE, ModelBase): + """Represents the operations events""" + __tablename__ = 'events_priority' + event_type = Column(EnumWithValue(EventType), primary_key=True, nullable=False) + event_actual_id = Column(Integer(), primary_key=True, nullable=False) + priority = 
Column(Integer(), default=1000, nullable=False) + last_processed_at = Column("last_processed_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow, nullable=False) + + _table_args = (PrimaryKeyConstraint('event_type', 'event_actual_id', name='EVENTS_PR_PK')) + + +class Event(BASE, ModelBase): + """Represents the operations events""" + __tablename__ = 'events' + event_id = Column(BigInteger().with_variant(Integer, "sqlite"), + Sequence('EVENT_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), + primary_key=True) + event_type = Column(EnumWithValue(EventType), nullable=False) + event_actual_id = Column(Integer(), nullable=False) + priority = Column(Integer()) + status = Column(EnumWithValue(EventStatus), nullable=False) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + processing_at = Column("processing_at", DateTime, default=None) + processed_at = Column("processed_at", DateTime, default=None) + content = Column(JSON()) + + @property + def _id(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._id + return None + + @property + def _publisher_id(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._publisher_id + return None + + @property + def _event_type(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._event_type + return None + + @property + def _timestamp(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._timestamp + return None + + @property + def _counter(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._counter + return None + + @property + def _content(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._content + return None + + @property + def has_changes(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event'].has_changes + return None + + def get_event_id(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event'].get_event_id() + return None + + def able_to_merge(self, event): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event'].able_to_merge(event) + return False + + def changed(self): + return self.has_changes + + def merge(self, event): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event'].merge(event) + return False, event + + @property + def _request_id(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._request_id + return None + + @property + def _command_id(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._command_id + return None + + @property + def _transform_id(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._transform_id + return None + + @property + def _processing_id(self): + if self.content and 'event' in self.content and self.content['event']: + return self.content['event']._processing_id + return None + + _table_args = (PrimaryKeyConstraint('event_id', 
name='EVENTS_PK')) + + +class EventArchive(BASE, ModelBase): + """Represents the operations events""" + __tablename__ = 'events_archive' + event_id = Column(BigInteger(), primary_key=True) + event_type = Column(EnumWithValue(EventType), nullable=False) + event_actual_id = Column(Integer(), nullable=False) + priority = Column(Integer()) + status = Column(EnumWithValue(EventStatus), nullable=False) + created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow, nullable=False) + processing_at = Column("processing_at", DateTime, default=None) + processed_at = Column("processed_at", DateTime, default=None) + content = Column(JSON()) + + _table_args = (PrimaryKeyConstraint('event_id', name='EVENTS_AR_PK')) def create_trigger(): diff --git a/main/lib/idds/orm/commands.py b/main/lib/idds/orm/commands.py index 27e3fb5e..3a77b5c0 100644 --- a/main/lib/idds/orm/commands.py +++ b/main/lib/idds/orm/commands.py @@ -98,14 +98,6 @@ def retrieve_command(cmd_type=None, status=None, source=None, command = [] try: query = session.query(models.Command) - if request_id is not None: - query = query.with_hint(models.Command, "INDEX(COMMANDS COMMANDS_TYPE_ST_IDX)", 'oracle') - elif transform_id: - query = query.with_hint(models.Command, "INDEX(COMMANDS COMMANDS_TYPE_ST_TF_IDX)", 'oracle') - elif processing_id is not None: - query = query.with_hint(models.Command, "INDEX(COMMANDS COMMANDS_TYPE_ST_PR_IDX)", 'oracle') - else: - query = query.with_hint(models.Command, "INDEX(COMMANDS COMMANDS_TYPE_ST_IDX)", 'oracle') if cmd_type is not None: query = query.filter_by(cmd_type=cmd_type) @@ -151,7 +143,6 @@ def delete_command(command, session=None): try: if command_condition: session.query(models.Command).\ - with_hint(models.Command, "index(command COMMANDS_PK)", 'oracle').\ filter(or_(*command_condition)).\ delete(synchronize_session=False) except IntegrityError as e: diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index 16649bc7..81ddd3f7 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -23,7 +23,7 @@ from idds.common import exceptions from idds.common.constants import (ContentType, ContentStatus, ContentLocking, - ContentRelationType) + ContentFetchStatus, ContentRelationType) from idds.orm.base.session import read_session, transactional_session from idds.orm.base import models @@ -193,14 +193,12 @@ def get_content(content_id=None, coll_id=None, scope=None, name=None, content_ty else: if content_type in [ContentType.File, ContentType.File.value]: query = session.query(models.Content)\ - .with_hint(models.Content, "INDEX(CONTENTS CONTENTS_ID_NAME_IDX)", 'oracle')\ .filter(models.Content.coll_id == coll_id)\ .filter(models.Content.scope == scope)\ .filter(models.Content.name == name)\ .filter(models.Content.content_type == content_type) else: query = session.query(models.Content)\ - .with_hint(models.Content, "INDEX(CONTENTS CONTENTS_ID_NAME_IDX)", 'oracle')\ .filter(models.Content.coll_id == coll_id)\ .filter(models.Content.scope == scope)\ .filter(models.Content.name == name)\ @@ -246,7 +244,6 @@ def get_match_contents(coll_id, scope, name, content_type=None, min_id=None, max try: query = session.query(models.Content)\ - .with_hint(models.Content, "INDEX(CONTENTS CONTENTS_ID_NAME_IDX)", 'oracle')\ .filter(models.Content.coll_id == coll_id)\ .filter(models.Content.scope == scope)\ .filter(models.Content.name.like(name.replace('*', '%'))) @@ -305,7 +302,6 @@ def get_contents(scope=None, name=None, transform_id=None, coll_id=None, status= 
coll_id = [coll_id[0], coll_id[0]] query = session.query(models.Content) - query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_ID_NAME_IDX)", 'oracle') if transform_id: query = query.filter(models.Content.transform_id == transform_id) @@ -360,7 +356,6 @@ def get_contents_by_request_transform(request_id=None, transform_id=None, worklo status = [status] query = session.query(models.Content) - query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') if request_id: query = query.filter(models.Content.request_id == request_id) if transform_id: @@ -398,7 +393,6 @@ def get_content_status_statistics(coll_id=None, session=None): """ try: query = session.query(models.Content.status, func.count(models.Content.content_id)) - query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_ID_NAME_IDX)", 'oracle') if coll_id: query = query.filter(models.Content.coll_id == coll_id) query = query.group_by(models.Content.status) @@ -472,7 +466,7 @@ def update_dep_contents(request_id, content_dep_ids, status, bulk_size=10000, se params = {'substatus': status} chunks = [content_dep_ids[i:i + bulk_size] for i in range(0, len(content_dep_ids), bulk_size)] for chunk in chunks: - session.query(models.Content).with_hint(models.Content, "INDEX(CONTENTS CONTENTS_DEP_IDX)", "oracle")\ + session.query(models.Content)\ .filter(models.Content.request_id == request_id)\ .filter(models.Content.content_id.in_(chunk))\ .update(params, synchronize_session=False) @@ -528,7 +522,45 @@ def update_contents_from_others_by_dep_id(request_id=None, transform_id=None, se @read_session -def get_updated_transforms_by_content_status(request_id=None, transform_id=None, session=None): +def get_update_contents_from_others_by_dep_id(request_id=None, transform_id=None, session=None): + """ + Get contents to update from others by content_dep_id + + :param request_id: The Request id. + :param transfomr_id: The transform id. 
+ """ + try: + subquery = session.query(models.Content.content_id, + models.Content.substatus) + if request_id: + subquery = subquery.filter(models.Content.request_id == request_id) + subquery = subquery.filter(models.Content.content_relation_type == 1)\ + .filter(models.Content.substatus != ContentStatus.New) + subquery = subquery.subquery() + + query = session.query(models.Content.content_id, + subquery.c.substatus) + if request_id: + query = query.filter(models.Content.request_id == request_id) + if transform_id: + query = query.filter(models.Content.transform_id == transform_id) + query = query.filter(models.Content.content_relation_type == 3) + query = query.join(subquery, and_(models.Content.content_dep_id == subquery.c.content_id, + models.Content.substatus != subquery.c.substatus)) + + tmp = query.distinct() + rets = [] + if tmp: + for t in tmp: + t2 = dict(zip(t.keys(), t)) + rets.append(t2) + return rets + except Exception as ex: + raise ex + + +@read_session +def get_updated_transforms_by_content_status(request_id=None, transform_id=None, check_substatus=False, session=None): """ Get updated transform ids by content status @@ -539,7 +571,7 @@ def get_updated_transforms_by_content_status(request_id=None, transform_id=None, try: subquery = session.query(models.Content.content_id, models.Content.substatus) - subquery = subquery.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') + # subquery = subquery.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') if request_id: subquery = subquery.filter(models.Content.request_id == request_id) if transform_id: @@ -551,14 +583,16 @@ def get_updated_transforms_by_content_status(request_id=None, transform_id=None, models.Content.transform_id, models.Content.workload_id, models.Content.coll_id) - query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') + # query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') if request_id: query = query.filter(models.Content.request_id == request_id) query = query.filter(models.Content.content_relation_type == 3) - query = query.join(subquery, and_(models.Content.content_dep_id == subquery.c.content_id, - models.Content.substatus != subquery.c.substatus)) - + if check_substatus: + query = query.join(subquery, and_(models.Content.content_dep_id == subquery.c.content_id, + models.Content.substatus != subquery.c.substatus)) + else: + query = query.join(subquery, and_(models.Content.content_dep_id == subquery.c.content_id)) tmp = query.distinct() rets = [] @@ -654,7 +688,64 @@ def add_contents_update(contents, bulk_size=10000, session=None): @transactional_session -def delete_contents_update(request_id=None, transform_id=None, session=None): +def set_fetching_contents_update(request_id=None, transform_id=None, fetch=True, session=None): + """ + Set fetching contents update. + + :param session: session. 
+ """ + try: + if fetch: + query = session.query(models.Content_update) + if request_id: + query = query.filter(models.Content_update.request_id == request_id) + if transform_id: + query = query.filter(models.Content_update.transform_id == transform_id) + query.update({'fetch_status': ContentFetchStatus.Fetching}) + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('No record can be found with (transform_id=%s): %s' % + (transform_id, error)) + except Exception as error: + raise error + + +@read_session +def get_contents_update(request_id=None, transform_id=None, fetch=False, session=None): + """ + Get contents update. + + :param session: session. + """ + try: + if fetch: + query = session.query(models.Content_update) + if request_id: + query = query.filter(models.Content_update.request_id == request_id) + if transform_id: + query = query.filter(models.Content_update.transform_id == transform_id) + query = query.filter(models.Content_update.fetch_status == ContentFetchStatus.Fetching) + else: + query = session.query(models.Content_update) + if request_id: + query = query.filter(models.Content_update.request_id == request_id) + if transform_id: + query = query.filter(models.Content_update.transform_id == transform_id) + + tmp = query.all() + rets = [] + if tmp: + for t in tmp: + rets.append(t.to_dict()) + return rets + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('No record can be found with (transform_id=%s): %s' % + (transform_id, error)) + except Exception as error: + raise error + + +@transactional_session +def delete_contents_update(request_id=None, transform_id=None, contents=[], bulk_size=1000, fetch=False, session=None): """ delete a content. @@ -664,13 +755,36 @@ def delete_contents_update(request_id=None, transform_id=None, session=None): :raises DatabaseException: If there is a database error. 
""" try: - del_query = session.query(models.Content_update) - if request_id: - del_query = del_query.filter(models.Content_update.request_id == request_id) - if transform_id: - del_query = del_query.filter(models.Content_update.transform_id == transform_id) - del_query.with_for_update(nowait=True, skip_locked=True) - del_query.delete() + if fetch: + del_query = session.query(models.Content_update) + if request_id: + del_query = del_query.filter(models.Content_update.request_id == request_id) + if transform_id: + del_query = del_query.filter(models.Content_update.transform_id == transform_id) + del_query = del_query.filter(models.Content_update.fetch_status == ContentFetchStatus.Fetching) + del_query.delete() + else: + if contents: + contents_sub_params = [contents[i:i + bulk_size] for i in range(0, len(contents), bulk_size)] + + for contents_sub_param in contents_sub_params: + del_query = session.query(models.Content_update) + if request_id: + del_query = del_query.filter(models.Content_update.request_id == request_id) + if transform_id: + del_query = del_query.filter(models.Content_update.transform_id == transform_id) + if contents_sub_param: + del_query = del_query.filter(models.Content_update.content_id.in_(contents_sub_param)) + del_query.with_for_update(nowait=True, skip_locked=True) + del_query.delete() + else: + del_query = session.query(models.Content_update) + if request_id: + del_query = del_query.filter(models.Content_update.request_id == request_id) + if transform_id: + del_query = del_query.filter(models.Content_update.transform_id == transform_id) + del_query.with_for_update(nowait=True, skip_locked=True) + del_query.delete() except Exception as error: raise exceptions.NoObject('Content_update deletion error: %s' % (error)) @@ -749,7 +863,6 @@ def get_contents_ext(request_id=None, transform_id=None, workload_id=None, coll_ status = [status] query = session.query(models.Content_ext) - query = query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)", "oracle") if request_id: query = query.filter(models.Content_ext.request_id == request_id) if transform_id: @@ -803,7 +916,6 @@ def get_contents_ext_ids(request_id=None, transform_id=None, workload_id=None, c models.Content_ext.content_id, models.Content_ext.panda_id, models.Content_ext.status) - query = query.with_hint(models.Content_ext, "INDEX(CONTENTS_EXT CONTENTS_EXT_RTF_IDX)", "oracle") if request_id: query = query.filter(models.Content_ext.request_id == request_id) if transform_id: diff --git a/main/lib/idds/orm/events.py b/main/lib/idds/orm/events.py new file mode 100644 index 00000000..9bcd3ee2 --- /dev/null +++ b/main/lib/idds/orm/events.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + + +""" +operations related to Events. +""" + +import re +import datetime + +from sqlalchemy.exc import DatabaseError, IntegrityError, NoResultFound +from sqlalchemy.sql.expression import asc, desc + +from idds.common import exceptions +from idds.common.event import EventStatus +from idds.orm.base import models +from idds.orm.base.session import read_session, transactional_session + + +@transactional_session +def add_event(event, session=None): + """ + Add an event to be submitted asynchronously to a command broker. + + :param event: The Event object. 
+ :param session: The database session. + """ + + try: + old_events = get_events(event_type=event._event_type, event_actual_id=event.get_event_id(), + status=EventStatus.New, session=session) + merge = False + for old_event_db in old_events: + old_event = old_event_db['content']['event'] + if old_event.able_to_merge(event): + # discard current event + old_event.merge(event) + if old_event.changed(): + old_event_db['content']['event'] = old_event + update_event(old_event.event_id, status=EventStatus.New, session=session) + merge = True + return None + if not merge: + priority = get_event_priority(event_type=event._event_type, + event_actual_id=event.get_event_id(), + session=session) + event_db = models.Event(event_type=event._event_type, + event_actual_id=event.get_event_id(), + status=EventStatus.New, + priority=priority, + content={'event': event}) + event_db.save(session=session) + return event_db.event_id + except TypeError as e: + raise exceptions.DatabaseException('Invalid JSON for content: %s' % str(e)) + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist event, content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist event: %s' % str(e)) + return None + + +@read_session +def get_events(event_type, event_actual_id, status=None, session=None): + """ + Get events + + :param event_type: event type. + :param event_actual_id: event actual id. + :param status: event status. + """ + try: + if not isinstance(status, (list, tuple)): + status = [status] + if len(status) == 1: + status = [status[0], status[0]] + + query = session.query(models.Event) + if event_type: + query = query.filter_by(event_type=event_type) + if event_actual_id: + query = query.filter_by(event_actual_id=event_actual_id) + if status: + query = query.filter(models.Event.status.in_(status)) + + tmp = query.all() + events = [] + if tmp: + for t in tmp: + events.append(t.to_dict()) + return events + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist event, content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist event: %s' % str(e)) + return None + + +@transactional_session +def add_event_priority(event_type, event_actual_id, priority=10, session=None): + """ + add event priority + + :param event_type: event type. + :param event_actual_id: event actual id. + """ + try: + event_pr = models.EventPriority(event_type=event_type, + event_actual_id=event_actual_id, + priority=priority, + last_processed_at=datetime.datetime.utcnow(), + updated_at=datetime.datetime.utcnow()) + event_pr.save(session=session) + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist event, content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist event: %s' % str(e)) + + +@transactional_session +def update_event_priority(event_type, event_actual_id, priority=10, session=None): + """ + Update event priority + + :param event_type: event type. + :param event_actual_id: event actual id. 
+ """ + try: + parameters = {'priority': priority, + 'last_processed_at': datetime.datetime.utcnow(), + 'updated_at': datetime.datetime.utcnow()} + query = session.query(models.EventPriority) + if event_type: + query = query.filter_by(event_type=event_type) + if event_actual_id: + query = query.filter_by(event_actual_id=event_actual_id) + row_count = query.update(parameters, synchronize_session=False) + + if row_count < 1: + add_event_priority(event_type=event_type, event_actual_id=event_actual_id, + priority=priority, session=session) + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist event, content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist event: %s' % str(e)) + return None + + +@read_session +def get_event_priority(event_type, event_actual_id, session=None): + """ + Get event priority + + :param event_type: event type. + :param event_actual_id: event actual id. + """ + try: + query = session.query(models.EventPriority) + if event_type: + query = query.filter_by(event_type=event_type) + if event_actual_id: + query = query.filter_by(event_actual_id=event_actual_id) + + tmp = query.first() + if tmp: + t = tmp.to_dict() + time_diff = datetime.datetime.utcnow() - t['last_processed_at'] + priority = time_diff.total_seconds() + else: + priority = 3600 * 24 * 7 + return priority + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist event, content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist event: %s' % str(e)) + return 10 + + +@transactional_session +def update_event(event_id, status, session=None): + """ + Update event + + :param event_id: event id. + :param status: Event status. + """ + try: + parameters = {'status': status} + if status == EventStatus.Processing: + parameters['processing_at'] = datetime.datetime.utcnow() + if status == EventStatus.Processed: + parameters['processed_at'] = datetime.datetime.utcnow() + session.query(models.Event).filter_by(event_id=event_id)\ + .update(parameters, synchronize_session=False) + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist event, content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist event: %s' % str(e)) + + +@transactional_session +def get_event_for_processing(event_type, session=None): + """ + Get event for processing + + :param event_type: event type. 
+ """ + try: + query = session.query(models.Event) + if event_type: + query = query.filter_by(event_type=event_type) + status = [EventStatus.New, EventStatus.New] + query = query.filter(models.Event.status.in_(status)) + query = query.order_by(desc(models.Event.priority)) + query = query.order_by(asc(models.Event.event_id)) + + tmp = query.first() + if tmp: + # event = tmp.to_dict() + event = tmp + session.expunge(event) + update_event_priority(event.event_type, event.get_event_id(), session=session) + update_event(event.event_id, status=EventStatus.Processing, session=session) + return event + return None + except NoResultFound as _: # noqa F841 + return None + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist event, content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist event: %s' % str(e)) + return None + + +@transactional_session +def delete_event(event_id, session=None): + """ + Delete event with the given id. + + :param event_id: The event id. + """ + try: + session.query(models.Event).filter_by(event_id=event_id).delete() + except IntegrityError as e: + raise exceptions.DatabaseException(e.args) + + +@transactional_session +def add_event_archive(event, session=None): + """ + Add an event to the archive. + + :param event: The Event object. + :param session: The database session. + """ + + try: + event_db = models.EventArchive(event_id=event.event_id, + event_type=event.event_type, + event_actual_id=event.event_actual_id, + status=event.status, + priority=event.priority, + created_at=event.created_at, + processing_at=event.processing_at, + processed_at=event.processed_at, + content=event.content) + event_db.save(session=session) + return event_db.event_id + except TypeError as e: + raise exceptions.DatabaseException('Invalid JSON for content: %s' % str(e)) + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist event, content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist event: %s' % str(e)) + return None + + +@transactional_session +def clean_event(event, to_archive=True, session=None): + event.status = EventStatus.Processed + event.processed_at = datetime.datetime.utcnow() + delete_event(event.event_id, session=session) + if to_archive: + add_event_archive(event, session=session) + + +@transactional_session +def fail_event(event, to_archive=True, session=None): + event.status = EventStatus.Failed + event.processed_at = datetime.datetime.utcnow() + delete_event(event.event_id, session=session) + if to_archive: + add_event_archive(event, session=session) diff --git a/main/lib/idds/orm/health.py b/main/lib/idds/orm/health.py index 9b4a23f6..fd5a8238 100644 --- a/main/lib/idds/orm/health.py +++ b/main/lib/idds/orm/health.py @@ -14,6 +14,7 @@ """ import datetime +import re from sqlalchemy.exc import DatabaseError, IntegrityError @@ -42,12 +43,16 @@ def add_health_item(agent, hostname, pid, thread_id, thread_name, payload, sessi .filter(models.Health.hostname == hostname)\ .filter(models.Health.pid == pid)\ .filter(models.Health.thread_id == thread_id)\ - .update({'updated_at': datetime.datetime.utcnow()}) + .update({'updated_at': datetime.datetime.utcnow(), + 'payload': payload}) if not counts: new_h = models.Health(agent=agent, hostname=hostname, pid=pid, thread_id=thread_id, 
thread_name=thread_name, payload=payload) new_h.save(session=session) + except IntegrityError as e: + if re.match('.*ORA-00001.*', e.args[0]) or re.match('.*unique constraint.*', e.args[0]): + print("unique constraintviolated: %s" % str(e)) except DatabaseError as e: raise exceptions.DatabaseException('Could not persist message: %s' % str(e)) @@ -75,13 +80,28 @@ def retrieve_health_items(session=None): @transactional_session -def clean_health(older_than=3600, session=None): +def clean_health(older_than=3600, hostname=None, pids=[], session=None): """ Clearn items which is older than the time. :param older_than in seconds """ + query = session.query(models.Health) + if older_than: + query = query.filter(models.Health.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than)) + if hostname: + query = query.filter(models.Health.hostname == hostname) + if pids: + query = query.filter(models.Health.pid.in_(pids)) + query.delete() + + +@transactional_session +def update_health_item_status(item, status, session=None): session.query(models.Health)\ - .filter(models.Health.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than))\ - .delete() + .filter(models.Health.agent == item['agent'])\ + .filter(models.Health.hostname == item['hostname'])\ + .filter(models.Health.pid == item['pid'])\ + .filter(models.Health.thread_id == item['thread_id'])\ + .update({'status': status, 'updated_at': item['updated_at']}) diff --git a/main/lib/idds/orm/messages.py b/main/lib/idds/orm/messages.py index f4619175..f2860cb1 100644 --- a/main/lib/idds/orm/messages.py +++ b/main/lib/idds/orm/messages.py @@ -144,14 +144,6 @@ def retrieve_messages(bulk_size=1000, msg_type=None, status=None, source=None, msg_type = [msg_type[0], msg_type[0]] query = session.query(models.Message) - if request_id is not None: - query = query.with_hint(models.Message, "INDEX(MESSAGES MESSAGES_TYPE_ST_IDX)", 'oracle') - elif transform_id: - query = query.with_hint(models.Message, "INDEX(MESSAGES MESSAGES_TYPE_ST_TF_IDX)", 'oracle') - elif processing_id is not None: - query = query.with_hint(models.Message, "INDEX(MESSAGES MESSAGES_TYPE_ST_PR_IDX)", 'oracle') - else: - query = query.with_hint(models.Message, "INDEX(MESSAGES MESSAGES_TYPE_ST_IDX)", 'oracle') if msg_type is not None: query = query.filter(models.Message.msg_type.in_(msg_type)) @@ -201,13 +193,23 @@ def delete_messages(messages, session=None): try: if message_condition: session.query(models.Message).\ - with_hint(models.Message, "index(messages MESSAGES_PK)", 'oracle').\ filter(or_(*message_condition)).\ delete(synchronize_session=False) except IntegrityError as e: raise exceptions.DatabaseException(e.args) +@transactional_session +def clean_old_messages(request_id, session=None): + """ + Delete messages whose request id is older than request_id. + + :param request_id: request id.. 
+ """ + session.query(models.Message)\ + .filter(models.Message.request_id <= request_id)\ + .delete(synchronize_session=False) + # @transactional_session # def update_messages(messages, session=None): # """ diff --git a/main/lib/idds/orm/processings.py b/main/lib/idds/orm/processings.py index 772801d4..9175e461 100644 --- a/main/lib/idds/orm/processings.py +++ b/main/lib/idds/orm/processings.py @@ -157,8 +157,8 @@ def get_processing_by_id_status(processing_id, status=None, locking=False, sessi """ try: - query = session.query(models.Processing).with_hint(models.Processing, "INDEX(PROCESSINGS PROCESSINGS_PK)", 'oracle')\ - .filter(models.Processing.processing_id == processing_id) + query = session.query(models.Processing)\ + .filter(models.Processing.processing_id == processing_id) if status: if not isinstance(status, (list, tuple)): diff --git a/main/lib/idds/orm/requests.py b/main/lib/idds/orm/requests.py index ae0d66f7..3c6f7ee2 100644 --- a/main/lib/idds/orm/requests.py +++ b/main/lib/idds/orm/requests.py @@ -160,7 +160,6 @@ def get_request_ids_by_workload_id(workload_id, session=None): try: query = session.query(models.Request.request_id)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle')\ .filter(models.Request.workload_id == workload_id) tmp = query.all() ret_ids = [] @@ -186,7 +185,6 @@ def get_request_ids_by_name(name, session=None): """ try: query = session.query(models.Request.request_id, models.Request.name)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", "oracle")\ .filter(models.Request.name.like(name.replace('*', '%'))) tmp = query.all() ret_ids = {} @@ -232,8 +230,8 @@ def get_request(request_id, to_json=False, session=None): """ try: - query = session.query(models.Request).with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle')\ - .filter(models.Request.request_id == request_id) + query = session.query(models.Request)\ + .filter(models.Request.request_id == request_id) ret = query.first() if not ret: @@ -264,8 +262,8 @@ def get_request_by_id_status(request_id, status=None, locking=False, session=Non """ try: - query = session.query(models.Request).with_hint(models.Request, "INDEX(REQUESTS REQUESTS_PK)", 'oracle')\ - .filter(models.Request.request_id == request_id) + query = session.query(models.Request)\ + .filter(models.Request.request_id == request_id) if status: if not isinstance(status, (list, tuple)): @@ -305,8 +303,7 @@ def get_requests(request_id=None, workload_id=None, with_detail=False, with_meta try: if with_request or not (with_transform or with_processing or with_detail or with_metadata): if with_metadata: - query = session.query(models.Request)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle') + query = session.query(models.Request) if request_id: query = query.filter(models.Request.request_id == request_id) @@ -342,8 +339,7 @@ def get_requests(request_id=None, workload_id=None, with_detail=False, with_meta models.Request.next_poll_at, models.Request.accessed_at, models.Request.expired_at, - models.Request.errors)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle') + models.Request.errors) if request_id: query = query.filter(models.Request.request_id == request_id) @@ -767,7 +763,6 @@ def get_requests_by_requester(scope, name, requester, to_json=False, session=Non try: query = session.query(models.Request)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle')\ 
.filter(models.Request.requester == requester)\ .filter(models.Request.scope == scope)\ .filter(models.Request.name.like(name.replace('*', '%'))) @@ -810,11 +805,9 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, req status = [status[0], status[0]] if only_return_id: - query = session.query(models.Request.request_id)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle') + query = session.query(models.Request.request_id) else: - query = session.query(models.Request)\ - .with_hint(models.Request, "INDEX(REQUESTS REQUESTS_SCOPE_NAME_IDX)", 'oracle') + query = session.query(models.Request) if status: if by_substatus: @@ -962,3 +955,29 @@ def clean_next_poll_at(status, session=None): params = {'next_poll_at': datetime.datetime.utcnow()} session.query(models.Request).filter(models.Request.status.in_(status))\ .update(params, synchronize_session=False) + + +@read_session +def get_last_request_id(status, older_than=None, session=None): + """ + Get last request id which is older than a timestamp. + + :param status: status of the request. + :param older_than: days older than current timestamp. + + :returns request_id + """ + if not isinstance(status, (list, tuple)): + status = [status] + if len(status) == 1: + status = [status[0], status[0]] + + query = session.query(models.Request.request_id) + if status: + query = query.filter(models.Request.status.in_(status)) + query = query.filter(models.Request.updated_at <= datetime.datetime.utcnow() - datetime.timedelta(days=older_than)) + query = query.order_by(desc(models.Request.request_id)) + ret = query.first() + if ret: + return ret[0] + return ret diff --git a/main/lib/idds/orm/transforms.py b/main/lib/idds/orm/transforms.py index 2f61d4df..c011c21f 100644 --- a/main/lib/idds/orm/transforms.py +++ b/main/lib/idds/orm/transforms.py @@ -199,8 +199,8 @@ def get_transform_by_id_status(transform_id, status=None, locking=False, session """ try: - query = session.query(models.Transform).with_hint(models.Transform, "INDEX(TRANSFORMS TRANSFORMS_PK)", 'oracle')\ - .filter(models.Transform.transform_id == transform_id) + query = session.query(models.Transform)\ + .filter(models.Transform.transform_id == transform_id) if status: if not isinstance(status, (list, tuple)): diff --git a/main/lib/idds/orm/workprogress.py b/main/lib/idds/orm/workprogress.py index c6af09b0..3d41dcdd 100644 --- a/main/lib/idds/orm/workprogress.py +++ b/main/lib/idds/orm/workprogress.py @@ -135,8 +135,7 @@ def get_workprogresses(request_id=None, to_json=False, session=None): """ try: - query = session.query(models.Workprogress)\ - .with_hint(models.Workprogress, "INDEX(WORKPROGRESSES WORKPROGRESS_PK)", 'oracle') + query = session.query(models.Workprogress) if request_id is not None: query = query.filter(models.Workprogress.request_id == request_id) tmp = query.all() @@ -169,7 +168,6 @@ def get_workprogress(workprogress_id, to_json=False, session=None): try: query = session.query(models.Workprogress)\ - .with_hint(models.Workprogress, "INDEX(WORKPROGRESSES WORKPROGRESS_PK)", 'oracle')\ .filter(models.Workprogress.workprogress_id == workprogress_id) ret = query.first() @@ -208,7 +206,6 @@ def get_workprogresses_by_status(status, period=None, locking=False, bulk_size=N status = [status[0], status[0]] query = session.query(models.Workprogress)\ - .with_hint(models.Workprogress, "INDEX(WORKPROGRESSES WORKPROGRESS_STATUS_PRIO_IDX)", 'oracle')\ .filter(models.Workprogress.status.in_(status))\ .filter(models.Workprogress.next_poll_at < 
diff --git a/main/lib/idds/tests/core_tests.py b/main/lib/idds/tests/core_tests.py
index bb92d36a..b9d42530 100644
--- a/main/lib/idds/tests/core_tests.py
+++ b/main/lib/idds/tests/core_tests.py
@@ -171,7 +171,7 @@ def print_workflow_template(workflow, layers=0):
 # reqs = get_requests(request_id=381520, with_request=True, with_detail=False, with_metadata=True)
 # reqs = get_requests(request_id=28182323, with_request=True, with_detail=False, with_metadata=True)
 # reqs = get_requests(request_id=385554, with_request=True, with_detail=False, with_metadata=True)
-reqs = get_requests(request_id=474, with_request=True, with_detail=False, with_metadata=True)
+reqs = get_requests(request_id=458999, with_request=True, with_detail=False, with_metadata=True)
 for req in reqs:
     # print(req['request_id'])
     # print(req)
@@ -179,11 +179,18 @@ def print_workflow_template(workflow, layers=0):
     # print(json_dumps(req, sort_keys=True, indent=4))
     # show_works(req)
     pass
-    workflow = req['request_metadata']['build_workflow']
-    # workflow.get_new_works()
-    print(workflow.runs.keys())
-    # print(workflow.runs["1"])
-    print(json_dumps(workflow.runs["1"], sort_keys=True, indent=4))
+    if 'build_workflow' in req['request_metadata']:
+        workflow = req['request_metadata']['build_workflow']
+        # workflow.get_new_works()
+        print(workflow.runs.keys())
+        # print(workflow.runs["1"])
+        print(json_dumps(workflow.runs["1"], sort_keys=True, indent=4))
+    elif 'workflow' in req['request_metadata']:
+        workflow = req['request_metadata']['workflow']
+        # workflow.get_new_works()
+        print(workflow.runs.keys())
+        # print(workflow.runs["1"])
+        print(json_dumps(workflow.runs["1"], sort_keys=True, indent=4))
 
     # print(workflow.runs["1"].works.keys())
     # print(workflow.runs["1"].has_loop_condition())
@@ -208,10 +215,13 @@ def print_workflow_template(workflow, layers=0):
         print("work %s signature: %s" % (work.get_work_id(), work.signature))
 
     # print("workflow template")
-    # print_workflow_template(workflow)
+    print_workflow_template(workflow)
 
     # workflow.sync_works()
 
+    print("workflow template")
+    print(json_dumps(workflow.template, sort_keys=True, indent=4))
+
 sys.exit(0)
 
 reqs = get_requests(request_id=28182323, with_request=False, with_detail=True, with_metadata=False)
diff --git a/main/lib/idds/tests/panda_test.py b/main/lib/idds/tests/panda_test.py
index a82f9196..7830b869 100644
--- a/main/lib/idds/tests/panda_test.py
+++ b/main/lib/idds/tests/panda_test.py
@@ -100,7 +100,12 @@
 # task_ids = [i for i in range(140349, 140954)] + [142268, 142651]
 # task_ids = [1851] + [i for i in range(4336, 4374)] + [i for i in range(133965, 136025)]
 # task_ids = [832, 2347, 3045, 66860, 67036] + [i for i in range(121273, 140349)]
-task_ids = [i for i in range(144088, 144111)] + [144891, 144892]
+# task_ids = [i for i in range(144088, 144111)] + [144891, 144892]
+# task_ids = [i for i in range(150050, 150065)]
+# task_ids = [150607, 150619, 150649, 150637, 150110, 150111]
+# task_ids = [150864, 150897, 150910]
+# task_ids = [151114, 151115]
+task_ids = [i for i in range(151444, 151453)]
 # task_ids = []
 for task_id in task_ids:
     print("Killing %s" % task_id)
diff --git a/main/lib/idds/tests/test_get_dn.py b/main/lib/idds/tests/test_get_dn.py
index 8ac7f662..ba5c000b 100644
--- a/main/lib/idds/tests/test_get_dn.py
+++ b/main/lib/idds/tests/test_get_dn.py
@@ -20,14 +20,18 @@ def get_user_name_from_dn1(dn):
         username = up.sub('', dn)
         up2 = re.compile('/CN=[0-9]+')
         username = up2.sub('', username)
-        up3 = re.compile(' [0-9]+')
-        username = up3.sub('', username)
-        up4 = re.compile('_[0-9]+')
+        up4 = re.compile(' [0-9]+')
         username = up4.sub('', username)
+        up5 = re.compile('_[0-9]+')
+        username = up5.sub('', username)
         username = username.replace('/CN=proxy', '')
         username = username.replace('/CN=limited proxy', '')
         username = username.replace('limited proxy', '')
+        username = re.sub('/CN=Robot:[^/]+,', ',', username)
         username = re.sub('/CN=Robot:[^/]+', '', username)
+        username = re.sub('/CN=Robot[^/]+,', ',', username)
+        username = re.sub('/CN=Robot[^/]+', '', username)
+        username = re.sub('/CN=nickname:[^/]+,', ',', username)
         username = re.sub('/CN=nickname:[^/]+', '', username)
         pat = re.compile('.*/CN=([^\/]+)/CN=([^\/]+)')  # noqa W605
         mat = pat.match(username)
@@ -56,14 +60,20 @@ def get_user_name_from_dn2(dn):
         username = up.sub('', dn)
         up2 = re.compile(',CN=[0-9]+')
         username = up2.sub('', username)
-        up3 = re.compile(' [0-9]+')
-        username = up3.sub('', username)
-        up4 = re.compile('_[0-9]+')
+        up3 = re.compile('CN=[0-9]+,')
+        username = up3.sub(',', username)
+        up4 = re.compile(' [0-9]+')
         username = up4.sub('', username)
+        up5 = re.compile('_[0-9]+')
+        username = up5.sub('', username)
         username = username.replace(',CN=proxy', '')
         username = username.replace(',CN=limited proxy', '')
         username = username.replace('limited proxy', '')
+        username = re.sub(',CN=Robot:[^/]+,', ',', username)
         username = re.sub(',CN=Robot:[^/]+', '', username)
+        username = re.sub(',CN=Robot[^/]+,', ',', username)
+        username = re.sub(',CN=Robot[^/]+', '', username)
+        username = re.sub(',CN=nickname:[^/]+,', ',', username)
         username = re.sub(',CN=nickname:[^/]+', '', username)
         pat = re.compile('.*,CN=([^\,]+),CN=([^\,]+)')  # noqa W605
         mat = pat.match(username)
@@ -109,3 +119,9 @@ def get_user_name_from_dn(dn):
     print(username)
     username = authentication.get_user_name_from_dn(dn)
     print("auth: " + username)
+
+    dn = "CN=1316551436,CN=Robot: ATLAS Panda Server1,CN=663551,CN=pandasv1,OU=Users,OU=Organic Units,DC=cern,DC=ch"
+    username = get_user_name_from_dn(dn)
+    print(username)
+    username = authentication.get_user_name_from_dn(dn)
+    print("auth: " + username)
diff --git a/main/lib/idds/tests/test_merge_dict.py b/main/lib/idds/tests/test_merge_dict.py
new file mode 100644
index 00000000..c11b086b
--- /dev/null
+++ b/main/lib/idds/tests/test_merge_dict.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0OA
+#
+# Authors:
+# - Wen Guan, , 2023
+
+
+def merge_dict(dict1, dict2):
+    print("merge: %s, %s" % (dict1, dict2))
+    # keys = list(dict1.keys()) + list(dict2.keys())
+    keys = list(dict1.keys())
+    for key in list(dict2.keys()):
+        if key not in keys:
+            keys.append(key)
+    for key in keys:
+        if key in dict2:
+            if key not in dict1 or dict1[key] is None:
+                dict1[key] = dict2[key]
+            else:
+                if dict2[key] is None:
+                    continue
+                elif not isinstance(dict1[key], type(dict2[key])):
+                    raise Exception("type of %s is different from %s, cannot merge" % (type(dict1[key]), type(dict2[key])))
+                elif dict1[key] == dict2[key]:
+                    continue
+                elif type(dict1[key]) in (list, tuple, str):
+                    dict1[key] = dict1[key] + dict2[key]
+                elif type(dict1[key]) in (int, float, complex):
+                    dict1[key] = dict1[key] + dict2[key]
+                elif type(dict1[key]) in (bool, bool):
+                    dict1[key] = True
+                elif type(dict1[key]) in (dict, dict):
+                    dict1[key] = merge_dict(dict1[key], dict2[key])
+    return dict1
+
+
+if __name__ == '__main__':
+    a = {'a': 1, 'b': 2, 'c': [1, 2], 'd': {'a': 1, 'c': [4, 5]}, 'f': 1332}
+    b = {'a': 3, 'b': 4, 'c': [3, 4], 'd': {'b': 1, 'c': [6, 5]}, 'e': 'abd'}
+    print(id(a))
+    print(id(b))
+    c = merge_dict(a, b)
+    print(id(c))
+    print(c)
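A small self-contained check (not part of this patch) of the merge semantics exercised by the __main__ block above, assuming merge_dict from test_merge_dict.py is in scope and that the type check is meant to reject only mismatched types: same-type numbers are summed, sequences are concatenated, nested dicts are merged recursively, and dict1 is mutated in place and returned.

a = {'a': 1, 'c': [1, 2], 'd': {'x': 1}}
b = {'a': 3, 'c': [3], 'd': {'y': 2}, 'e': 'new'}
c = merge_dict(a, b)
assert c is a  # merged in place, the same object is returned
assert c == {'a': 4, 'c': [1, 2, 3], 'd': {'x': 1, 'y': 2}, 'e': 'new'}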
diff --git a/main/lib/idds/tests/test_migrate_requests.py b/main/lib/idds/tests/test_migrate_requests.py
index 3e6b7257..489eb6ed 100644
--- a/main/lib/idds/tests/test_migrate_requests.py
+++ b/main/lib/idds/tests/test_migrate_requests.py
@@ -54,18 +54,22 @@ def migrate():
     old_request_id = 2802
     old_request_id = 2816  # big tasks
     # old_request_id = 3178  # 125 tasks
+    old_request_id = 3578
+    old_request_id = 3612
+    old_request_id = 3628
+    old_request_ids = [3628]
 
     # old_request_id = 1
     # for old_request_id in [152]:
     # for old_request_id in [60]:    # noqa E115
     # for old_request_id in [200]:    # noqa E115
-    for old_request_id in [old_request_id]:    # noqa E115   # doma 183
+    for old_request_id in old_request_ids:    # noqa E115   # doma 183
         reqs = cm1.get_requests(request_id=old_request_id, with_metadata=True)
 
-        # cm2 = ClientManager(host=dev_host)
+        cm2 = ClientManager(host=dev_host)
         # cm2 = ClientManager(host=doma_host)
         # cm2 = ClientManager(host=atlas_host)
-        cm2 = ClientManager(host=slac_k8s_dev_host)
+        # cm2 = ClientManager(host=slac_k8s_dev_host)
         # cm2 = ClientManager(host=cern_k8s_dev_host)
 
         # print(reqs)
diff --git a/main/tools/env/dump_database.py b/main/tools/env/dump_database.py
new file mode 100644
index 00000000..f5a9304a
--- /dev/null
+++ b/main/tools/env/dump_database.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0OA
+#
+# Authors:
+# - Wen Guan, , 2021 - 2022
+
+import sys
+import os.path
+base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(base_path)
+os.chdir(base_path)
+
+from idds.orm.base.utils import dump_schema   # noqa E402
+
+
+if __name__ == '__main__':
+    dump_schema()
diff --git a/main/tools/env/environment.yml b/main/tools/env/environment.yml
index 9747a65d..649f928a 100644
--- a/main/tools/env/environment.yml
+++ b/main/tools/env/environment.yml
@@ -28,6 +28,8 @@ dependencies:
     - cryptography
     - redis
     - alembic
+    - deepdiff
+    - pyzmq
    - idds-common==0.11.5
    - idds-workflow==0.11.5
    - idds-client==0.11.5
diff --git a/main/tools/env/setup_panda.sh b/main/tools/env/setup_panda.sh
index b5fa762c..93fb64b4 100644
--- a/main/tools/env/setup_panda.sh
+++ b/main/tools/env/setup_panda.sh
@@ -37,6 +37,8 @@ elif [ "$instance" == "slac" ]; then
     export PANDA_AUTH_VO=Rubin
     export PANDACACHE_URL=$PANDA_URL_SSL
 
+    export PANDA_SYS=/afs/cern.ch/user/w/wguan/workdisk/iDDS/.conda/iDDS/
+
     # export PANDA_CONFIG_ROOT=/afs/cern.ch/user/w/wguan/workdisk/iDDS/main/etc/panda/
     export PANDA_CONFIG_ROOT=~/.panda/
 else
diff --git a/main/tools/logrotate/idds.conf b/main/tools/logrotate/idds.conf
new file mode 100644
index 00000000..9d71d77b
--- /dev/null
+++ b/main/tools/logrotate/idds.conf
@@ -0,0 +1,11 @@
+/var/log/idds/*.log {
+    daily
+    rotate 5
+    size 2G
+    compress
+    delaycompress
+    postrotate
+        /usr/bin/supervisorctl stop idds-server:00 > /dev/null 2>/dev/null || true
+        /usr/bin/supervisorctl start idds-server:00 > /dev/null 2>/dev/null || true
+    endscript
+}
diff --git a/monitor/data/conf.js b/monitor/data/conf.js
index 9b9020c8..014a6e97 100644
--- a/monitor/data/conf.js
+++ b/monitor/data/conf.js
@@ -1,9 +1,9 @@
 var appConfig = {
-    'iddsAPI_request': "https://lxplus8s16.cern.ch:443/idds/monitor_request/null/null",
-    'iddsAPI_transform': "https://lxplus8s16.cern.ch:443/idds/monitor_transform/null/null",
-    'iddsAPI_processing': "https://lxplus8s16.cern.ch:443/idds/monitor_processing/null/null",
-    'iddsAPI_request_detail': "https://lxplus8s16.cern.ch:443/idds/monitor/null/null/true/false/false",
-    'iddsAPI_transform_detail': "https://lxplus8s16.cern.ch:443/idds/monitor/null/null/false/true/false",
-    'iddsAPI_processing_detail': "https://lxplus8s16.cern.ch:443/idds/monitor/null/null/false/false/true"
+    'iddsAPI_request': "https://lxplus807.cern.ch:443/idds/monitor_request/null/null",
+    'iddsAPI_transform': "https://lxplus807.cern.ch:443/idds/monitor_transform/null/null",
+    'iddsAPI_processing': "https://lxplus807.cern.ch:443/idds/monitor_processing/null/null",
+    'iddsAPI_request_detail': "https://lxplus807.cern.ch:443/idds/monitor/null/null/true/false/false",
+    'iddsAPI_transform_detail': "https://lxplus807.cern.ch:443/idds/monitor/null/null/false/true/false",
+    'iddsAPI_processing_detail': "https://lxplus807.cern.ch:443/idds/monitor/null/null/false/false/true"
 }
diff --git a/requirements.yaml b/requirements.yaml
index cf87039a..e15db948 100644
--- a/requirements.yaml
+++ b/requirements.yaml
@@ -28,3 +28,5 @@ dependencies:
     - cryptography
     - redis
     - alembic
+    - deepdiff
+    - pyzmq
diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py
index 94956008..103c2d53 100644
--- a/workflow/lib/idds/workflowv2/workflow.py
+++ b/workflow/lib/idds/workflowv2/workflow.py
@@ -921,7 +921,8 @@ def sync_global_parameters(self, global_parameters, sliced_global_parameters=Non
         self.global_parameters = gp
 
     def sync_global_parameters_from_work(self, work):
-        self.log_debug("work %s (%s) is_terminated, global_parameters: %s" % (work.get_internal_id(), str(work), str(self.global_parameters)))
+        self.log_debug("work %s (%s) is_terminated, global_parameters: %s" % (work.get_internal_id(), str(work.metadata),
+                                                                              str(self.global_parameters)))
         if isinstance(work, Work):
             if self.global_parameters:
                 for key in self.global_parameters:
@@ -1399,8 +1400,10 @@ def add_next_work(self, work_id):
         self.next_works = next_works
 
     def enable_next_works(self, work, cond):
-        self.log_debug("Checking Work %s condition: %s" % (work.get_internal_id(),
-                                                           json_dumps(cond, sort_keys=True, indent=4)))
+        # self.log_debug("Checking Work %s condition: %s" % (work.get_internal_id(),
+        #                                                    json_dumps(cond, sort_keys=True, indent=4)))
+        self.log_debug("Checking Work %s condition: %s" % (work.get_internal_id(), cond.get_internal_id()))
+
         # load_conditions should cover it.
         # if cond and self.is_class_method(cond.cond):
         #     # cond_work_id = self.works[cond.cond['idds_method_class_id']]
@@ -1681,8 +1684,10 @@ def sync_works(self, to_cancel=False):
                                                                            json_dumps(self.work_conds[work.get_internal_id()], sort_keys=True, indent=4)))
                 for cond_id in self.work_conds[work.get_internal_id()]:
                     cond = self.conditions[cond_id]
-                    self.log_debug("Work %s has condition dependencie %s" % (work.get_internal_id(),
-                                                                             json_dumps(cond, sort_keys=True, indent=4)))
+                    # self.log_debug("Work %s has condition dependencie %s" % (work.get_internal_id(),
+                    #                                                          json_dumps(cond, sort_keys=True, indent=4)))
+                    self.log_debug("Work %s has condition dependencie %s" % (work.get_internal_id(), cond.get_internal_id()))
+
                     self.enable_next_works(work, cond)
 
         if work.is_terminated(synchronize=False):