diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 76319057..356774e0 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -49,4 +49,3 @@ jobs: python main/tools/pypi/update_version.py ${version_tag} python setup.py sdist bdist_wheel twine upload */dist/idds*-${version_tag}.tar.gz - twine upload */dist/idds*-${version_tag}*.whl diff --git a/atlas/lib/idds/atlas/version.py b/atlas/lib/idds/atlas/version.py index 7c2e8610..777f1776 100644 --- a/atlas/lib/idds/atlas/version.py +++ b/atlas/lib/idds/atlas/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2019 - 2021 -release_version = "0.11.5" +release_version = "2.0.9" diff --git a/atlas/tools/env/environment.yml b/atlas/tools/env/environment.yml index 42f15752..0fb1a1df 100644 --- a/atlas/tools/env/environment.yml +++ b/atlas/tools/env/environment.yml @@ -13,5 +13,5 @@ dependencies: - panda-client-light # panda client - rucio-clients - rucio-clients-atlas - - idds-common==0.11.5 - - idds-workflow==0.11.5 \ No newline at end of file + - idds-common==2.0.9 + - idds-workflow==2.0.9 \ No newline at end of file diff --git a/client/lib/idds/client/version.py b/client/lib/idds/client/version.py index 7c2e8610..777f1776 100644 --- a/client/lib/idds/client/version.py +++ b/client/lib/idds/client/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2019 - 2021 -release_version = "0.11.5" +release_version = "2.0.9" diff --git a/client/tools/env/environment.yml b/client/tools/env/environment.yml index bb4a0e95..c48c842c 100644 --- a/client/tools/env/environment.yml +++ b/client/tools/env/environment.yml @@ -7,5 +7,5 @@ dependencies: - urllib3 # url connections - tabulate - argcomplete - - idds-common==0.11.5 - - idds-workflow==0.11.5 + - idds-common==2.0.9 + - idds-workflow==2.0.9 \ No newline at end of file diff --git a/common/lib/idds/common/constants.py b/common/lib/idds/common/constants.py index ad52a979..2129163a 100644 --- a/common/lib/idds/common/constants.py +++ b/common/lib/idds/common/constants.py @@ -293,6 +293,8 @@ class ContentStatus(IDDSEnum): Missing = 9 Cancelled = 10 Activated = 11 + SubAvailable = 12 + FinalSubAvailable = 13 class ContentLocking(IDDSEnum): diff --git a/common/lib/idds/common/version.py b/common/lib/idds/common/version.py index 7c2e8610..777f1776 100644 --- a/common/lib/idds/common/version.py +++ b/common/lib/idds/common/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2019 - 2021 -release_version = "0.11.5" +release_version = "2.0.9" diff --git a/common/tools/env/environment.yml b/common/tools/env/environment.yml index fa99bbe7..c991fe97 100644 --- a/common/tools/env/environment.yml +++ b/common/tools/env/environment.yml @@ -7,4 +7,4 @@ dependencies: - pyjwt # Pyjwt - packaging - requests - - dogpile.cache + - dogpile.cache \ No newline at end of file diff --git a/doma/lib/idds/doma/version.py b/doma/lib/idds/doma/version.py index a9a272c5..a123f19a 100644 --- a/doma/lib/idds/doma/version.py +++ b/doma/lib/idds/doma/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2020 - 2021 -release_version = "0.11.5" +release_version = "2.0.9" diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index b0aa5d78..7df7bca5 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -24,6 +24,7 @@ from idds.common.constants import (TransformType, CollectionStatus, CollectionType, ContentStatus, ContentType, ProcessingStatus, WorkStatus) +from idds.common.utils import 
get_list_chunks, split_chunks_not_continous from idds.workflowv2.work import Work, Processing from idds.workflowv2.workflow import Condition @@ -56,6 +57,9 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, task_rss_retry_step=0, task_rss_max=None, vo='wlcg', + es=False, + es_label=None, + max_events_per_job=40, max_name_length=4000, working_group='lsst'): @@ -88,10 +92,6 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, except Exception as ex: self.logger.warn('IDDS_MAX_NAME_LENGTH is not defined correctly: %s' % str(ex)) - self.dependency_map = dependency_map - if self.dependency_map is None: - self.dependency_map = {} - self.dependency_map_deleted = [] # self.logger.setLevel(logging.DEBUG) self.task_name = task_name @@ -131,6 +131,16 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.dependency_tasks = None + self.es = es + self.es_label = es_label + self.max_events_per_job = max_events_per_job + self.es_files = {} + + self.dependency_map = dependency_map + if self.dependency_map is None: + self.dependency_map = {} + self.dependency_map_deleted = [] + def my_condition(self): if self.is_finished(): return True @@ -168,6 +178,87 @@ def dependency_map(self, value): self._dependency_map = value + if self.es: + self.construct_es_files() + + def construct_es_files(self): + # job's order id must not be skipped/duplicated + order_id_map = {} + local_order_id = False + for job in self._dependency_map: + if job.get("order_id", None) is None: + local_order_id = True + self.logger.warn("order_id is not set, use local order id") + break + order_id = job.get("order_id", None) + if order_id in order_id_map: + local_order_id = True + self.logger.warn("order_id has duplications, use local order id") + break + order_id_map[order_id] = job + + if local_order_id: + self.logger.warn("order_id is not set correctly. With EventService, it will not be able to map jobs to events correctly. 
Disable EventService.") + self.es = False + return + + order_id_map = {} + order_id_group_map = {} + if local_order_id: + order_id = 0 + for job in self._dependency_map: + groups = job.get("groups", "es_default") + if groups not in order_id_group_map: + order_id_group_map[groups] = {} + order_id_group_map[groups][order_id] = job + order_id_map[order_id] = job + order_id += 1 + else: + for job in self._dependency_map: + groups = job.get("groups", "es_default") + if groups not in order_id_group_map: + order_id_group_map[groups] = {} + order_id = int(job.get("order_id")) + order_id_group_map[groups][order_id] = job + order_id_map[order_id] = job + + final_chunks = [] + for groups in order_id_group_map: + order_id_list = sorted(list(order_id_group_map[groups].keys())) + order_id_list_chunks = split_chunks_not_continous(order_id_list) + for chunk in order_id_list_chunks: + sub_chunks = get_list_chunks(chunk, bulk_size=self.max_events_per_job) + final_chunks = final_chunks + sub_chunks + + for map_id, chunk in enumerate(final_chunks): + order_id_start = chunk[0] + order_id_end = chunk[-1] + num_events = order_id_end - order_id_start + 1 + eventservice_file_name = "%s:eventservice_%s^%s" % (self.es_label, order_id_start, num_events) + + has_dependencies = False + # sub_maps = {} + for order_id in chunk: + job = order_id_map[order_id] + # output_name = job['name'] + inputs_dependency = job["dependencies"] + if inputs_dependency: + has_dependencies = True + sub_map_id = order_id - order_id_start + # sub_maps[sub_map_id] = {'order_id': order_id, + # 'sub_map_id': sub_map_id, + # 'job': job} + + # set the job information + # order_id should be already there + # job["order_id"] = order_id + job["map_id"] = map_id + job["sub_map_id"] = sub_map_id + job["es_name"] = eventservice_file_name + + # self.es_files[eventservice_file_name] = {'has_dependencies': has_dependencies, 'sub_maps': sub_maps} + self.es_files[eventservice_file_name] = {'has_dependencies': has_dependencies, 'map_id': map_id, 'order_ids': chunk} + def load_panda_config(self): panda_config = ConfigParser.ConfigParser() if os.environ.get('IDDS_PANDA_CONFIG', None): @@ -317,11 +408,12 @@ def get_mapped_inputs(self, mapped_input_output_maps): inputs = mapped_input_output_maps[map_id]['inputs'] # if 'primary' is not set, the first one is the primary input. - primary_input = inputs[0] + # primary_input = inputs[0] for ip in inputs: - if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: - primary_input = ip - ret.append(primary_input) + # if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: + # primary_input = ip + # ret.append(primary_input) + ret.append(ip) return ret def get_mapped_outputs(self, mapped_input_output_maps): @@ -330,14 +422,15 @@ def get_mapped_outputs(self, mapped_input_output_maps): outputs = mapped_input_output_maps[map_id]['outputs'] # if 'primary' is not set, the first one is the primary input. 
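# Illustrative sketch (hypothetical, for clarity only). construct_es_files above
# uses two helpers imported from idds.common.utils whose implementations are not
# shown in this patch; from the way they are called they are assumed to behave
# roughly as follows (hypothetical re-implementations, not the real utils):

def split_chunks_not_continous(sorted_ids):
    # Split a sorted order_id list wherever the sequence has a gap,
    # so that every returned chunk is a contiguous run of ids.
    chunks, current = [], []
    for i in sorted_ids:
        if current and i != current[-1] + 1:
            chunks.append(current)
            current = []
        current.append(i)
    if current:
        chunks.append(current)
    return chunks

def get_list_chunks(ids, bulk_size=40):
    # Cut one contiguous run into bulks of at most bulk_size events
    # (bulk_size corresponds to the max_events_per_job=40 default above).
    return [ids[i:i + bulk_size] for i in range(0, len(ids), bulk_size)]

# With hypothetical order_ids [0, 1, 2, 5, 6] and bulk_size=2 the final chunks are
# [[0, 1], [2], [5, 6]]; each chunk is then published as one event-service "file"
# named "<es_label>:eventservice_<first_order_id>^<num_events>",
# e.g. "label1:eventservice_5^2" for the chunk [5, 6].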
- primary_output = outputs[0] + # primary_output = outputs[0] for ip in outputs: - if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: - primary_output = ip - ret.append(primary_output) + # if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: + # primary_output = ip + # ret.append(primary_output) + ret.append(ip) return ret - def map_file_to_content(self, coll_id, scope, name): + def map_file_to_content(self, coll_id, scope, name, order_id=None, sub_map_id=None, es_name=None): content = {'coll_id': coll_id, 'scope': scope, 'name': name, # or a different file name from the dataset name @@ -349,6 +442,13 @@ def map_file_to_content(self, coll_id, scope, name): # 'content_relation_type': content_relation_type, # here events is all events for eventservice, not used here. 'content_metadata': {'events': 1}} + if order_id is not None: + content['min_id'] = int(order_id) + content['max_id'] = int(order_id) + 1 + if sub_map_id is not None: + content['sub_map_id'] = sub_map_id + if es_name is not None: + content['path'] = es_name.split('^')[0] return content def is_all_dependency_tasks_available(self, inputs_dependency, task_name_to_coll_map): @@ -414,12 +514,6 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): return new_input_output_maps if unmapped_jobs: - mapped_keys = mapped_input_output_maps.keys() - if mapped_keys: - next_key = max(mapped_keys) + 1 - else: - next_key = 1 - input_coll = self.get_input_collections()[0] input_coll_id = input_coll.coll_id output_coll = self.get_output_collections()[0] @@ -427,37 +521,106 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): task_name_to_coll_map = self.get_work_name_to_coll_map() - for job in unmapped_jobs: - output_name = job['name'] - inputs_dependency = job["dependencies"] - - if self.is_all_dependency_tasks_available(inputs_dependency, task_name_to_coll_map): - input_content = self.map_file_to_content(input_coll_id, input_coll.scope, output_name) - output_content = self.map_file_to_content(output_coll_id, output_coll.scope, output_name) - new_input_output_maps[next_key] = {'inputs_dependency': [], - 'logs': [], - 'inputs': [input_content], - 'outputs': [output_content]} - uni_input_name = {} - for input_d in inputs_dependency: - task_name = input_d['task'] - input_name = input_d['inputname'] - task_name_input_name = task_name + input_name - if task_name_input_name not in uni_input_name: - uni_input_name[task_name_input_name] = None - input_d_coll = task_name_to_coll_map[task_name]['outputs'][0] - input_d_content = self.map_file_to_content(input_d_coll['coll_id'], input_d_coll['scope'], input_name) - new_input_output_maps[next_key]['inputs_dependency'].append(input_d_content) - else: - self.logger.debug("get_new_input_output_maps, duplicated input dependency for job %s: %s" % (job['name'], str(job["dependencies"]))) - - # all inputs are parsed. move it to dependency_map_deleted - # self.dependency_map_deleted.append(job) - next_key += 1 + if not self.es: + mapped_keys = mapped_input_output_maps.keys() + if mapped_keys: + next_key = max(mapped_keys) + 1 else: - # not all inputs for this job can be parsed. 
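# Illustrative sketch (hypothetical values, for clarity only). With the new
# event-service arguments of map_file_to_content above, a job with order_id=5,
# sub_map_id=0 and es_name "label1:eventservice_5^2" yields a content record
# whose extra fields look like:

example_content = {
    'coll_id': 123, 'scope': 'scope1', 'name': 'job_5_output',  # hypothetical base fields
    'content_metadata': {'events': 1},
    'min_id': 5,                      # int(order_id)
    'max_id': 6,                      # int(order_id) + 1, i.e. exactly one event
    'sub_map_id': 0,                  # position of the job inside its chunk
    'path': 'label1:eventservice_5',  # es_name with the "^<num_events>" suffix stripped
}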
- # self.dependency_map.append(job) - pass + next_key = 1 + + for job in unmapped_jobs: + output_name = job['name'] + inputs_dependency = job["dependencies"] + + if self.is_all_dependency_tasks_available(inputs_dependency, task_name_to_coll_map): + input_content = self.map_file_to_content(input_coll_id, input_coll.scope, output_name) + output_content = self.map_file_to_content(output_coll_id, output_coll.scope, output_name) + new_input_output_maps[next_key] = {'inputs_dependency': [], + 'logs': [], + 'inputs': [input_content], + 'outputs': [output_content]} + + uni_input_name = {} + for input_d in inputs_dependency: + task_name = input_d['task'] + input_name = input_d['inputname'] + task_name_input_name = task_name + input_name + if task_name_input_name not in uni_input_name: + uni_input_name[task_name_input_name] = None + input_d_coll = task_name_to_coll_map[task_name]['outputs'][0] + input_d_content = self.map_file_to_content(input_d_coll['coll_id'], input_d_coll['scope'], input_name) + new_input_output_maps[next_key]['inputs_dependency'].append(input_d_content) + else: + self.logger.debug("get_new_input_output_maps, duplicated input dependency for job %s: %s" % (job['name'], str(job["dependencies"]))) + + # all inputs are parsed. move it to dependency_map_deleted + # self.dependency_map_deleted.append(job) + next_key += 1 + else: + # not all inputs for this job can be parsed. + # self.dependency_map.append(job) + pass + else: + order_id_map = {} + for job in unmapped_jobs: + order_id = job["order_id"] + order_id_map[order_id] = job + for es_name in self.es_files: + order_ids = self.es_files[es_name]["order_ids"] + not_filled = False + for order_id in order_ids: + if order_id in order_id_map: + not_filled = True + + if not not_filled: + continue + + # order_id_start = order_ids[0] + # order_id_end = order_ids[-1] + # num_events = order_id_end - order_id_start + 1 + # eventservice_file_name = es_name + next_key = self.es_files[es_name]["map_id"] + + all_inputs_dependency = [] + for order_id in order_ids: + job = order_id_map[order_id] + # output_name = job['name'] + inputs_dependency = job["dependencies"] + all_inputs_dependency = all_inputs_dependency + inputs_dependency + + if self.is_all_dependency_tasks_available(all_inputs_dependency, task_name_to_coll_map): + new_input_output_maps[next_key] = {"sub_maps": []} + + for order_id in order_ids: + job = order_id_map[order_id] + output_name = job['name'] + inputs_dependency = job["dependencies"] + sub_map_id = job["sub_map_id"] + input_content = self.map_file_to_content(input_coll_id, input_coll.scope, output_name, + order_id=order_id, sub_map_id=sub_map_id, es_name=es_name) + output_content = self.map_file_to_content(output_coll_id, output_coll.scope, output_name, + order_id=order_id, sub_map_id=sub_map_id, es_name=es_name) + sub_map = {'order_id': order_id, + 'sub_map_id': sub_map_id, + 'inputs_dependency': [], + 'logs': [], + 'inputs': [input_content], + 'outputs': [output_content]} + + uni_input_name = {} + for input_d in inputs_dependency: + task_name = input_d['task'] + input_name = input_d['inputname'] + task_name_input_name = task_name + input_name + if task_name_input_name not in uni_input_name: + uni_input_name[task_name_input_name] = None + input_d_coll = task_name_to_coll_map[task_name]['outputs'][0] + input_d_content = self.map_file_to_content(input_d_coll['coll_id'], input_d_coll['scope'], input_name) + sub_map['inputs_dependency'].append(input_d_content) + else: + self.logger.debug("get_new_input_output_maps, duplicated 
input dependency for job %s: %s" % (job['name'], str(job["dependencies"]))) + + new_input_output_maps[next_key]["sub_maps"].append(sub_map) # self.logger.debug("get_new_input_output_maps, new_input_output_maps: %s" % str(new_input_output_maps)) self.logger.debug("get_new_input_output_maps, new_input_output_maps len: %s" % len(new_input_output_maps)) @@ -497,10 +660,15 @@ def create_processing(self, input_output_maps=[]): has_dependencies = False if self.dependency_map is None: self.dependency_map = {} - for job in self.dependency_map: - in_files.append(job['name']) - if not has_dependencies and "dependencies" in job and job['dependencies']: - has_dependencies = True + if not self.es: + for job in self.dependency_map: + in_files.append(job['name']) + if not has_dependencies and "dependencies" in job and job['dependencies']: + has_dependencies = True + else: + for es_file in self.es_files: + has_dependencies = self.es_files[es_file]['has_dependencies'] + in_files.append(es_file) task_param_map = {} task_param_map['vo'] = self.vo @@ -523,6 +691,11 @@ def create_processing(self, input_output_maps=[]): task_param_map['noInput'] = True task_param_map['pfnList'] = in_files + if self.es: + # enabling eventservice + task_param_map['fineGrainedProc'] = True + # task_param_map['eventService'] = 3 + task_param_map['taskName'] = self.task_name task_param_map['userName'] = self.username if self.username else 'iDDS' task_param_map['taskPriority'] = self.task_priority @@ -762,12 +935,14 @@ def reactive_contents(self, input_output_maps): if not all_outputs_available: for content in inputs + outputs: update_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': ContentStatus.New, 'substatus': ContentStatus.New} updated_contents.append(update_content) for content in inputs_dependency: if content['status'] not in [ContentStatus.Available]: update_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': ContentStatus.New, 'substatus': ContentStatus.New} updated_contents.append(update_content) @@ -778,59 +953,114 @@ def get_content_status_from_panda_status(self, job_info): return ContentStatus.Processing jobstatus = job_info.jobStatus - if jobstatus in ['finished', 'merging']: - return ContentStatus.Available - elif jobstatus in ['failed', 'closed', 'cancelled', 'lost', 'broken', 'missing']: - attempt_nr = int(job_info.attemptNr) if job_info.attemptNr else 0 - max_attempt = int(job_info.maxAttempt) if job_info.maxAttempt else 0 - self_maxAttempt = int(self.maxAttempt) if self.maxAttempt else 0 - if (attempt_nr >= max_attempt) and (attempt_nr >= self_maxAttempt): - return ContentStatus.FinalFailed + if not job_info.eventService or job_info.eventService in ['NULL', 'None']: + if jobstatus in ['finished', 'merging']: + return ContentStatus.Available + elif jobstatus in ['failed', 'closed', 'cancelled', 'lost', 'broken', 'missing']: + attempt_nr = int(job_info.attemptNr) if job_info.attemptNr else 0 + max_attempt = int(job_info.maxAttempt) if job_info.maxAttempt else 0 + self_maxAttempt = int(self.maxAttempt) if self.maxAttempt else 0 + if (attempt_nr >= max_attempt) and (attempt_nr >= self_maxAttempt): + return ContentStatus.FinalFailed + else: + return ContentStatus.Failed + elif jobstatus in ['activated']: + return ContentStatus.Activated else: - return ContentStatus.Failed - elif jobstatus in ['activated']: - return ContentStatus.Activated + return ContentStatus.Processing else: - return ContentStatus.Processing + # 
job_info.eventService is 6 + jobsubstatus = job_info.jobSubStatus + if jobstatus in ['finished', 'merging']: + if jobsubstatus in ['fg_done']: + return ContentStatus.Available + elif jobsubstatus in ['fg_partial']: + attempt_nr = int(job_info.attemptNr) if job_info.attemptNr else 0 + max_attempt = int(job_info.maxAttempt) if job_info.maxAttempt else 0 + self_maxAttempt = int(self.maxAttempt) if self.maxAttempt else 0 + if (attempt_nr >= max_attempt) and (attempt_nr >= self_maxAttempt): + return ContentStatus.FinalSubAvailable + else: + return ContentStatus.SubAvailable + else: + return ContentStatus.SubAvailable + elif jobstatus in ['failed', 'closed', 'cancelled', 'lost', 'broken', 'missing']: + attempt_nr = int(job_info.attemptNr) if job_info.attemptNr else 0 + max_attempt = int(job_info.maxAttempt) if job_info.maxAttempt else 0 + self_maxAttempt = int(self.maxAttempt) if self.maxAttempt else 0 + if (attempt_nr >= max_attempt) and (attempt_nr >= self_maxAttempt): + return ContentStatus.FinalFailed + else: + return ContentStatus.Failed + elif jobstatus in ['activated']: + return ContentStatus.Activated + else: + return ContentStatus.Processing + + def get_job_status_from_contents(self, contents, contents_ext_dict): + all_finished, all_terminated, has_finished, panda_id = True, True, False, None + for content in contents: + if content['substatus'] in [ContentStatus.Available]: + has_finished = True + else: + all_finished = False + if content['substatus'] in [ContentStatus.FinalFailed, + ContentStatus.Lost, + ContentStatus.Deleted, + ContentStatus.Missing]: + pass + else: + all_terminated = False + break + + if 'panda_id' in content['content_metadata']: + panda_id = content['content_metadata']['panda_id'] + else: + all_finished = False + all_terminated = False + break + + if content['content_id'] not in contents_ext_dict: + all_finished = False + all_terminated = False + break + + content_ext = contents_ext_dict[content['content_id']] + if content['substatus'] != content_ext['status'] or str(panda_id) != str(content_ext['panda_id']): + all_finished = False + all_terminated = False + break + + return all_finished, all_terminated, has_finished, panda_id def get_unterminated_jobs(self, all_jobs_ids, input_output_maps, contents_ext): - finished_jobs, failed_jobs = [], [] + finished_jobs, sub_finished_jobs, failed_jobs = [], [], [] contents_ext_dict = {content['content_id']: content for content in contents_ext} for map_id in input_output_maps: outputs = input_output_maps[map_id]['outputs'] - for content in outputs: - if content['substatus'] in [ContentStatus.Available]: - if 'panda_id' in content['content_metadata']: - panda_id = content['content_metadata']['panda_id'] - if content['content_id'] not in contents_ext_dict: - continue - - content_ext = contents_ext_dict[content['content_id']] - if content['substatus'] != content_ext['status'] or panda_id != content_ext['panda_id']: - continue - - if panda_id not in finished_jobs: - finished_jobs.append(panda_id) - elif content['substatus'] in [ContentStatus.FinalFailed, - ContentStatus.Lost, ContentStatus.Deleted, - ContentStatus.Missing]: - if 'panda_id' in content['content_metadata']: - panda_id = content['content_metadata']['panda_id'] - if content['content_id'] not in contents_ext_dict: - continue - - content_ext = contents_ext_dict[content['content_id']] - if content['substatus'] != content_ext['status'] or panda_id != content_ext['panda_id']: - continue - + all_finished, all_terminated, has_finished, panda_id = 
self.get_job_status_from_contents(outputs, contents_ext_dict) + if all_finished: + if panda_id not in finished_jobs: + finished_jobs.append(panda_id) + else: + if all_terminated: + if has_finished: + if panda_id not in sub_finished_jobs: + sub_finished_jobs.append(panda_id) + else: if panda_id not in failed_jobs: failed_jobs.append(panda_id) all_jobs_ids = set(all_jobs_ids) - terminated_jobs = set(finished_jobs + failed_jobs) - unterminated_jobs = all_jobs_ids - terminated_jobs + terminated_jobs = finished_jobs + failed_jobs + sub_finished_jobs + terminated_jobs_final = [] + for job_id in terminated_jobs: + job_ids = [int(i) for i in str(job_id).split(",")] + terminated_jobs_final.extend(job_ids) + terminated_jobs_final = set(terminated_jobs_final) + unterminated_jobs = all_jobs_ids - terminated_jobs_final return list(unterminated_jobs) def get_panda_job_status(self, jobids, log_prefix=''): @@ -865,6 +1095,61 @@ def get_panda_job_status(self, jobids, log_prefix=''): self.logger.error(traceback.format_exc()) return [] + def get_last_job_info(self, jobs): + # job = {'panda_id': job_info.PandaID, 'status': job_status, 'job_info': job_info} + panda_ids = [] + job_status, last_job_info = None, None + for job in jobs: + panda_id = job['panda_id'] + status = job['status'] + job_info = job['job_info'] + if status in [ContentStatus.Available, ContentStatus.FinalSubAvailable, ContentStatus.FinalFailed]: + if panda_id not in panda_ids: + panda_ids.append(panda_id) + if job_status is None: + job_status = status + last_job_info = job_info + else: + if status in [ContentStatus.Available]: + job_status = status + last_job_info = job_info + elif job_status in [ContentStatus.Available]: + pass + elif status in [ContentStatus.FinalSubAvailable]: + job_status = status + last_job_info = job_info + elif job_status in [ContentStatus.FinalSubAvailable]: + pass + elif status in [ContentStatus.FinalFailed]: + job_status = status + last_job_info = job_info + elif job_status in [ContentStatus.FinalFailed]: + pass + return sorted(panda_ids), job_status, last_job_info + + def get_panda_event_status(self, jobids, log_prefix=''): + self.logger.debug(log_prefix + "get_panda_event_status, jobids[:3]: %s" % str(jobids[:3])) + try: + from pandaclient import Client + ret = Client.get_event_status(jobids, verbose=True) + if ret[0] == 0: + job_events_status = ret[1] + return job_events_status + except Exception as ex: + self.logger.error(str(ex)) + self.logger.error(traceback.format_exc()) + return {} + + def poll_panda_events(self, event_ids, log_prefix=''): + self.logger.debug(log_prefix + "poll_panda_events, poll_panda_jobs_chunk_size: %s, event_ids[:3]: %s" % (self.poll_panda_jobs_chunk_size, str(event_ids[:3]))) + chunksize = self.poll_panda_jobs_chunk_size + chunks = [event_ids[i:i + chunksize] for i in range(0, len(event_ids), chunksize)] + jobs_event_status = {} + for chunk in chunks: + job_event_status = self.get_panda_event_status(chunk, log_prefix=log_prefix) + jobs_event_status.update(job_event_status) + return jobs_event_status + def poll_panda_jobs(self, job_ids, log_prefix=''): job_status_info = {} self.logger.debug(log_prefix + "poll_panda_jobs, poll_panda_jobs_chunk_size: %s, job_ids[:10]: %s" % (self.poll_panda_jobs_chunk_size, str(job_ids[:10]))) @@ -876,6 +1161,7 @@ def poll_panda_jobs(self, job_ids, log_prefix=''): if jobs_list: self.logger.debug(log_prefix + "poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), len(jobs_list))) for job_info in jobs_list: + job_set_id = job_info.jobsetID 
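# Illustrative sketch (hypothetical ids, for clarity only). For event-service
# inputs one file can be processed by several PanDA jobs, so further down the
# per-file panda_id is stored as a comma-joined string
# (",".join([str(i) for i in panda_ids])). get_unterminated_jobs above therefore
# splits those strings back into individual ids before comparing them with the
# full job-id list, e.g.:

all_jobs_ids = {123, 456, 789, 1000}
terminated_jobs = ['123', '456,789']   # finished + failed + sub_finished entries
terminated_jobs_final = []
for job_id in terminated_jobs:
    terminated_jobs_final.extend(int(i) for i in str(job_id).split(","))
unterminated = all_jobs_ids - set(terminated_jobs_final)   # -> {1000}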
job_status = self.get_content_status_from_panda_status(job_info) if job_info and job_info.Files and len(job_info.Files) > 0: for job_file in job_info.Files: @@ -888,95 +1174,344 @@ def poll_panda_jobs(self, job_ids, log_prefix=''): # input_file = job_file.lfn.split(':')[1] else: input_file = job_file.lfn - job_status_info[input_file] = {'panda_id': job_info.PandaID, 'status': job_status, 'job_info': job_info} + # job_status_info[input_file] = {'panda_id': job_info.PandaID, 'status': job_status, 'job_info': job_info} + if input_file not in job_status_info: + job_status_info[input_file] = {'job_set_id': job_set_id, 'jobs': []} + job_status_info[input_file]['jobs'].append({'panda_id': job_info.PandaID, 'status': job_status, 'job_info': job_info}) else: self.logger.warn(log_prefix + "poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), jobs_list)) + + if not self.es: + for filename in job_status_info: + job_set_id = job_status_info[filename]['job_set_id'] + jobs = job_status_info[filename]['jobs'] + panda_ids, status, job_info = self.get_last_job_info(jobs) + if status: + job_status_info[filename]['status'] = status + job_status_info[filename]['job_info'] = job_info + job_status_info[filename]['panda_id'] = panda_ids + else: + es_job_ids = [] + for filename in job_status_info: + job_set_id = job_status_info[filename]['job_set_id'] + jobs = job_status_info[filename]['jobs'] + panda_ids, status, job_info = self.get_last_job_info(jobs) + if status: + job_status_info[filename]['status'] = status + job_status_info[filename]['job_info'] = job_info + job_status_info[filename]['panda_id'] = panda_ids + if status in [ContentStatus.FinalSubAvailable]: + task_id = job_info.jediTaskID + for panda_id in panda_ids: + es_job_id = {'task_id': task_id, 'panda_id': panda_id} + es_job_ids.append(es_job_id) + job_events_status = self.poll_panda_events(es_job_ids) + for filename in job_status_info: + jobs = job_status_info[filename]['jobs'] + for job in jobs: + panda_id = job['panda_id'] + events = job_events_status.get(panda_id, {}) + job['events'] = events return job_status_info + def get_event_job(self, sub_map_id, panda_jobs): + ret_event, ret_job = None, None + for panda_job in panda_jobs: + events = panda_job.get('events', {}) + for event_id in events: + event_index = int(event_id.split('-')[3]) - 1 + if event_index == sub_map_id: + event_status = events[event_id] + ret_event = {'status': event_status} + # todo: get the event error code and error diag + ret_event['error_code'] = None + ret_event['error_diag'] = None + ret_job = panda_job + break + return ret_event, ret_job + def get_update_contents(self, unterminated_jobs_status, input_output_maps, contents_ext, job_info_maps, abort=False, log_prefix=''): inputname_to_map_id_outputs = {} for map_id in input_output_maps: inputs = input_output_maps[map_id]['inputs'] outputs = input_output_maps[map_id]['outputs'] - for content in inputs: - inputname_to_map_id_outputs[content['name']] = {'map_id': map_id, 'outputs': outputs} + if not self.es: + for content in inputs: + if content['name'] not in inputname_to_map_id_outputs: + inputname_to_map_id_outputs[content['name']] = [] + inputname_to_map_id_outputs[content['name']].append({'map_id': map_id, 'outputs': outputs}) + else: + # es_name = input_output_maps[map_id]['es_name'] + # sub_maps = input_output_maps[map_id]['sub_maps'] + if inputs: + es_name = inputs[0]['path'] + es_name = es_name.split("^")[0] + if es_name not in inputname_to_map_id_outputs: + inputname_to_map_id_outputs[es_name] = 
[] + inputname_to_map_id_outputs[es_name].append({'map_id': map_id, 'outputs': outputs, 'inputs': inputs}) contents_ext_dict = {content['content_id']: content for content in contents_ext} update_contents, update_contents_full = [], [] new_contents_ext, update_contents_ext = [], [] update_contents_dict, new_contents_ext_dict = {}, {} - for input_file in unterminated_jobs_status: - panda_job_status = unterminated_jobs_status[input_file] - panda_id = panda_job_status['panda_id'] - panda_status = panda_job_status['status'] - job_info = panda_job_status['job_info'] - - if input_file not in inputname_to_map_id_outputs: - continue - - output_contents = inputname_to_map_id_outputs[input_file]['outputs'] - for content in output_contents: - content['substatus'] = panda_status - update_contents_full.append(content) - update_content = {'content_id': content['content_id'], - # 'status': panda_status, - 'substatus': panda_status} - - if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: - if content['content_metadata']['panda_id'] < panda_id: - # new panda id is the bigger one. - if 'old_panda_id' not in content['content_metadata']: - content['content_metadata']['old_panda_id'] = [] - if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: - content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) - content['content_metadata']['panda_id'] = panda_id - update_content['content_metadata'] = content['content_metadata'] - elif content['content_metadata']['panda_id'] > panda_id: - if 'old_panda_id' not in content['content_metadata']: - content['content_metadata']['old_panda_id'] = [] - if panda_id not in content['content_metadata']['old_panda_id']: - content['content_metadata']['old_panda_id'].append(panda_id) - # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] - # content['substatus'] = panda_status - update_content['content_metadata'] = content['content_metadata'] - else: - pass - else: - content['content_metadata']['panda_id'] = panda_id - update_content['content_metadata'] = content['content_metadata'] - - update_contents.append(update_content) - update_contents_dict[update_content['content_id']] = update_content - - if panda_status in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, - ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: - if content['content_id'] not in contents_ext_dict: - new_content_ext = {'content_id': content['content_id'], - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'coll_id': content['coll_id'], - 'map_id': content['map_id'], - 'status': panda_status} - for job_info_item in job_info_maps: - new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) - if new_content_ext[job_info_item] == 'NULL': - new_content_ext[job_info_item] = None - if new_content_ext[job_info_item] is None: - del new_content_ext[job_info_item] - new_contents_ext.append(new_content_ext) - new_contents_ext_dict[new_content_ext['content_id']] = new_content_ext - else: - update_content_ext = {'content_id': content['content_id'], - 'status': panda_status} - for job_info_item in job_info_maps: - update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) - if update_content_ext[job_info_item] == 'NULL': - update_content_ext[job_info_item] = None - if update_content_ext[job_info_item] is None: - del 
update_content_ext[job_info_item] - update_contents_ext.append(update_content_ext) + + if not self.es: + for input_file in unterminated_jobs_status: + # job_set_id = unterminated_jobs_status[input_file]['job_set_id'] + panda_jobs = unterminated_jobs_status[input_file]['jobs'] + if 'status' not in unterminated_jobs_status[input_file]: + continue + panda_status = unterminated_jobs_status[input_file]['status'] + panda_ids = unterminated_jobs_status[input_file]['panda_id'] + panda_id = ",".join([str(i) for i in panda_ids]) + job_info = unterminated_jobs_status[input_file]['job_info'] + + if input_file not in inputname_to_map_id_outputs: + continue + + # output_contents = inputname_to_map_id_outputs[input_file]['outputs'] + map_id_outputs = inputname_to_map_id_outputs[input_file] + for map_id_output in map_id_outputs: + # map_id = map_id_output['map_id'] + output_contents = map_id_output['outputs'] + + for content in output_contents: + content['substatus'] = panda_status + update_contents_full.append(content) + update_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], + # 'status': panda_status, + 'substatus': panda_status} + + if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: + if str(content['content_metadata']['panda_id']) < str(panda_id): + # new panda id is the bigger one. + if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) + content['content_metadata']['panda_id'] = str(panda_id) + update_content['content_metadata'] = content['content_metadata'] + elif str(content['content_metadata']['panda_id']) > str(panda_id): + if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if panda_id not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(panda_id) + # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] + # content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] + else: + pass + else: + content['content_metadata']['panda_id'] = str(panda_id) + update_content['content_metadata'] = content['content_metadata'] + + update_contents.append(update_content) + update_contents_dict[update_content['content_id']] = update_content + + if panda_status in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + if content['content_id'] not in contents_ext_dict: + new_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': panda_status} + for job_info_item in job_info_maps: + new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if new_content_ext[job_info_item] == 'NULL': + new_content_ext[job_info_item] = None + if new_content_ext[job_info_item] is None: + del new_content_ext[job_info_item] + new_contents_ext.append(new_content_ext) + new_contents_ext_dict[new_content_ext['content_id']] = new_content_ext + else: + update_content_ext = {'content_id': content['content_id'], + 'request_id': 
content['request_id'], + 'status': panda_status} + for job_info_item in job_info_maps: + update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if update_content_ext[job_info_item] == 'NULL': + update_content_ext[job_info_item] = None + if update_content_ext[job_info_item] is None: + del update_content_ext[job_info_item] + update_contents_ext.append(update_content_ext) + else: + # ES jobs + for input_file in unterminated_jobs_status: + # job_set_id = unterminated_jobs_status[input_file]['job_set_id'] + panda_jobs = unterminated_jobs_status[input_file]['jobs'] + if 'status' not in unterminated_jobs_status[input_file]: + continue + + panda_status = unterminated_jobs_status[input_file]['status'] + panda_ids = unterminated_jobs_status[input_file]['panda_id'] + panda_id = ",".join([str(i) for i in panda_ids]) + job_info = unterminated_jobs_status[input_file]['job_info'] + + if input_file not in inputname_to_map_id_outputs: + continue + + if panda_status in [ContentStatus.Available, ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + # output_contents = inputname_to_map_id_outputs[input_file]['outputs'] + map_id_outputs = inputname_to_map_id_outputs[input_file] + for map_id_output in map_id_outputs: + # map_id = map_id_output['map_id'] + output_contents = map_id_output['outputs'] + + for content in output_contents: + content['substatus'] = panda_status + update_contents_full.append(content) + update_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], + # 'status': panda_status, + 'substatus': panda_status} + + if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: + if str(content['content_metadata']['panda_id']) < str(panda_id): + # new panda id is the bigger one. 
+ if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) + content['content_metadata']['panda_id'] = str(panda_id) + update_content['content_metadata'] = content['content_metadata'] + elif str(content['content_metadata']['panda_id']) > str(panda_id): + if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if panda_id not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(panda_id) + # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] + # content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] + else: + pass + else: + content['content_metadata']['panda_id'] = str(panda_id) + update_content['content_metadata'] = content['content_metadata'] + + update_contents.append(update_content) + update_contents_dict[update_content['content_id']] = update_content + + if content['content_id'] not in contents_ext_dict: + new_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': panda_status} + for job_info_item in job_info_maps: + new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if new_content_ext[job_info_item] == 'NULL': + new_content_ext[job_info_item] = None + if new_content_ext[job_info_item] is None: + del new_content_ext[job_info_item] + new_contents_ext.append(new_content_ext) + new_contents_ext_dict[new_content_ext['content_id']] = new_content_ext + else: + update_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'status': panda_status} + for job_info_item in job_info_maps: + update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if update_content_ext[job_info_item] == 'NULL': + update_content_ext[job_info_item] = None + if update_content_ext[job_info_item] is None: + del update_content_ext[job_info_item] + update_contents_ext.append(update_content_ext) + elif panda_status in [ContentStatus.FinalSubAvailable, ContentStatus.FinalFailed]: + # partly finished or all failed, needs to check the event status + # output_contents = inputname_to_map_id_outputs[input_file]['outputs'] + map_id_outputs = inputname_to_map_id_outputs[input_file] + for map_id_output in map_id_outputs: + # map_id = map_id_output['map_id'] + output_contents = map_id_output['outputs'] + + for content in output_contents: + sub_map_id = content['sub_map_id'] + # min_id = content['min_id'] # min_id should be the same as sub_map_id here + event, event_panda_job = self.get_event_job(sub_map_id, panda_jobs) + event_status = event['status'] + event_error_code = event['error_code'] + event_error_diag = event['error_diag'] + + panda_id = event_panda_job['panda_id'] + job_info = event_panda_job['job_info'] + + content['substatus'] = event_status + update_contents_full.append(content) + update_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], + # 'status': panda_status, + 'substatus': event_status} + + if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: + 
if str(content['content_metadata']['panda_id']) < str(panda_id): + # new panda id is the bigger one. + if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) + content['content_metadata']['panda_id'] = str(panda_id) + update_content['content_metadata'] = content['content_metadata'] + elif str(content['content_metadata']['panda_id']) > str(panda_id): + if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if panda_id not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(panda_id) + # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] + # content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] + else: + pass + else: + content['content_metadata']['panda_id'] = str(panda_id) + update_content['content_metadata'] = content['content_metadata'] + + update_contents.append(update_content) + update_contents_dict[update_content['content_id']] = update_content + + if content['content_id'] not in contents_ext_dict: + new_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': panda_status} + for job_info_item in job_info_maps: + new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if new_content_ext[job_info_item] == 'NULL': + new_content_ext[job_info_item] = None + if new_content_ext[job_info_item] is None: + del new_content_ext[job_info_item] + if event_error_code is not None: + new_content_ext['trans_exit_code'] = event_error_code + new_content_ext['exe_exit_code'] = event_error_code + if event_error_diag is not None: + new_content_ext['exe_exit_diag'] = event_error_diag + new_contents_ext.append(new_content_ext) + new_contents_ext_dict[new_content_ext['content_id']] = new_content_ext + else: + update_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'status': panda_status} + for job_info_item in job_info_maps: + update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if update_content_ext[job_info_item] == 'NULL': + update_content_ext[job_info_item] = None + if update_content_ext[job_info_item] is None: + del update_content_ext[job_info_item] + if event_error_code is not None: + update_content_ext['trans_exit_code'] = event_error_code + update_content_ext['exe_exit_code'] = event_error_code + if event_error_diag is not None: + update_content_ext['exe_exit_diag'] = event_error_diag + update_contents_ext.append(update_content_ext) if abort: for map_id in input_output_maps: @@ -986,6 +1521,7 @@ def get_update_contents(self, unterminated_jobs_status, input_output_maps, conte ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: if content['content_id'] not in update_contents_dict: update_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'substatus': ContentStatus.Missing} update_contents.append(update_content) if content['content_id'] not in contents_ext_dict and content['content_id'] not in new_contents_ext_dict: diff --git 
a/doma/lib/idds/doma/workflowv2/domatree.py b/doma/lib/idds/doma/workflowv2/domatree.py index cc28f4f3..7cd028b4 100644 --- a/doma/lib/idds/doma/workflowv2/domatree.py +++ b/doma/lib/idds/doma/workflowv2/domatree.py @@ -13,9 +13,9 @@ Construct tree from a generic workflow """ -from idds.workflowv2.tree import JobNode, LabelNode, Tree +import json -from .domaeventmap import DomaEventMap, DomaEventMapTask, DomaEventMapJob +from idds.workflowv2.tree import JobNode, LabelNode, Tree class DomaTree(Tree): @@ -108,283 +108,59 @@ def get_ordered_nodes_by_level(self, roots): return level_dict - def group_label_level_dict(self, label_level_dict): - grouped_label_level_dict = {} - current_grouped_level, current_required_resource, current_number_jobs = 0, None, 0 + def order_job_nodes(self, job_nodes): + job_nodes_order_id = {} + for job_node in job_nodes: + potential_order_id = job_node.get_potential_order_id() + if potential_order_id not in job_nodes_order_id: + job_nodes_order_id[potential_order_id] = job_node + potential_order_ids = sorted(list(job_nodes_order_id.keys())) + ordered_job_nodes = [] + for potential_order_id in potential_order_ids: + ordered_job_nodes.append(job_nodes_order_id[potential_order_id]) + order_id = 0 + for job_node in ordered_job_nodes: + job_node.order_id = order_id + gwjob = job_node.gwjob + # gwjob.order_id = order_id + gwjob.attrs["order_id"] = order_id + order_id += 1 + + def order_job_tree(self, label_level_dict): for level in label_level_dict: for node in label_level_dict[level]: - whether_to_group = self.whether_to_group(node) - node.whether_to_group = whether_to_group - - if not whether_to_group: - # if there is a previous group, close it - if current_number_jobs: - current_grouped_level += 1 - - grouped_label_level_dict[str(current_grouped_level)] = [node] - current_grouped_level += 1 - current_required_resource, current_number_jobs = None, 0 - else: - num_jobs = len(node.jobs) - # self.logger.debug(node) - print(node) - max_events_per_job = self.get_max_events_per_job(node) - required_resource = "%s_%s_%s_%s_%s" % (node.compute_cloud, node.compute_site, node.queue, node.request_memory, max_events_per_job) - - if str(current_grouped_level) not in grouped_label_level_dict: - grouped_label_level_dict[str(current_grouped_level)] = [] - - if not current_number_jobs: - # new job - current_required_resource = required_resource - current_number_jobs = num_jobs - grouped_label_level_dict[str(current_grouped_level)].append(node) - elif current_required_resource != required_resource or num_jobs + current_number_jobs > max_events_per_job: - # close the current group - current_grouped_level += 1 - # create new group and wait for others to join this group - grouped_label_level_dict[str(current_grouped_level)] = [node] - current_required_resource = required_resource - current_number_jobs = num_jobs - # elif num_jobs >= max_events_per_job / 2: - # # close the current group - # current_grouped_level += 1 - # # create new group as a separate group - # grouped_label_level_dict[str(current_grouped_level)] = [node] - # # move to the next group - # current_grouped_level += 1 - # current_required_resource, current_number_jobs = None, 0 - else: - # group the current node to the previous group - grouped_label_level_dict[str(current_grouped_level)].append(node) - current_number_jobs += num_jobs - return grouped_label_level_dict - - def whether_to_group(self, node): - gwjob = node.one_gwjob - if gwjob: - return gwjob.attrs.get('grouping', True) - return self._label_grouping.get(node.name, 
{}).get('grouping', True) - - def get_max_events_per_job(self, node): - gwjob = node.one_gwjob - if gwjob: - return gwjob.attrs.get('grouping_max_jobs', 100) - return self._label_grouping.get(node.name, {}).get('grouping_max_jobs', 100) - - def split_big_node(self, node, max_events_per_job=1000): - job_nodes = node.jobs - groups = {} - for job_node in job_nodes: - group_id = job_node.get_potential_group_id() - if group_id not in groups: - groups[group_id] = [] - groups[group_id].append(job_node) - - job_chunks = [] - for group_id in groups: - group_jobs = groups[group_id] - if len(group_jobs) > max_events_per_job: - cluster_chunks = [group_jobs[i:i + max_events_per_job] for i in range(0, len(group_jobs), max_events_per_job)] - job_chunks.extend(cluster_chunks) - else: - job_chunks.append(group_jobs) - - # merge job chunks - merged_job_chunks = [] - current_job_chunk = None - for job_chunk in job_chunks: - if len(job_chunk) > max_events_per_job / 2: - merged_job_chunks.append(job_chunk) - else: - if current_job_chunk is None: - current_job_chunk = job_chunk - else: - if len(current_job_chunk) + len(job_chunk) <= max_events_per_job: - current_job_chunk.extend(job_chunk) - else: - merged_job_chunks.append(current_job_chunk) - current_job_chunk = job_chunk - if current_job_chunk: - merged_job_chunks.append(current_job_chunk) - - return merged_job_chunks - - def construct_grouped_jobs(self, grouped_label_level_dict): - group_jobs, group_label, events, event_index = {}, None, {}, 0 - for level in grouped_label_level_dict: - nodes = grouped_label_level_dict[level] - # one level is one task - group_label = "_".join([node.name for node in nodes]) - if len(nodes) > 1: - # mulitple node to be merged into one job - events, event_index = {}, 0 - group_id = "%s_0" % level - event_file = "eventservice_" + group_label + "_" + group_id - for node in nodes: - for job in node.jobs: - event_index_str = str(event_index) - event_index += 1 - events[event_index_str] = job - - job.group_label = group_label - job.event_file = event_file - job.event_index = event_index_str - job.group_id = group_id - group_jobs[group_label] = [{'name': event_file, 'events': events}] - else: - # there is only one big node - node = nodes[0] - max_events_per_job = self.get_max_events_per_job(node) - if len(node.jobs) <= max_events_per_job: - events, event_index = {}, 0 - group_id = "%s_0" % level - event_file = "eventservice_" + group_label + "_" + group_id - for job in node.jobs: - event_index_str = str(event_index) - event_index += 1 - - events[event_index_str] = job - - job.group_label = group_label - job.event_file = event_file - job.event_index = event_index_str - job.group_id = group_id - group_jobs[group_label] = [{'name': event_file, 'events': events}] - else: - chunks = self.split_big_node(node, max_events_per_job) - group_jobs[group_label] = [] - for i, chunk in enumerate(chunks): - events, event_index = {}, 0 - group_id = "%s_%s" % (level, i) - event_file = "eventservice_" + group_label + "_" + group_id - for job in chunk: - event_index_str = str(event_index) - event_index += 1 - - events[event_index_str] = job - job.group_id = group_id + label_node = node + # label_name = label_node.name + job_nodes = label_node.jobs + self.order_job_nodes(job_nodes) - job.group_label = group_label - job.event_file = event_file - job.event_index = event_index_str - group_jobs[group_label].append({'name': event_file, 'events': events}) - - return group_jobs - - def from_generic_workflow_combine(self, generic_workflow): - job_tree_roots, 
job_nodes, label_jobs, label_parent_labels = self.get_job_tree(generic_workflow) - self.job_tree_roots = job_tree_roots - self.job_nodes = job_nodes - self.label_jobs = label_jobs - self.label_parent_labels = label_parent_labels - print("job tree") - print(job_tree_roots) - print(job_nodes) - print(label_jobs) - print(label_parent_labels) - - label_tree_roots, label_nodes = self.get_label_tree(generic_workflow, label_parent_labels, label_jobs) - self.label_tree_roots = label_tree_roots - self.label_nodes = label_nodes - print("label tree") - print(label_tree_roots) - print(label_nodes) - - label_level_dict = self.get_ordered_nodes_by_level(label_tree_roots) - print("label_level_dict") - print(label_level_dict) - - grouped_label_level_dict = self.group_label_level_dict(label_level_dict) - print("grouped_label_level_dict") - print(grouped_label_level_dict) - - # self.logger.debug(grouped_label_level_dict) - grouped_jobs = self.construct_grouped_jobs(grouped_label_level_dict) - return grouped_jobs - - def from_generic_workflow_width(self, generic_workflow): + def save_order_id_map(self, label_level_dict, order_id_map_file): + order_id_map = {} + for level in label_level_dict: + for node in label_level_dict[level]: + label_node = node + label_name = label_node.name + job_nodes = label_node.jobs + order_id_map[label_name] = {} + for job_node in job_nodes: + gwjob = job_node.gwjob + order_id = gwjob.attrs.get("order_id", 0) + order_id_map[label_name][str(order_id)] = gwjob.name + with open(order_id_map_file, 'w') as f: + json.dump(order_id_map, f) + + def order_jobs_from_generic_workflow(self, generic_workflow, order_id_map_file): job_tree_roots, job_nodes, label_jobs, label_parent_labels = self.get_job_tree(generic_workflow) self.job_tree_roots = job_tree_roots self.job_nodes = job_nodes self.label_jobs = label_jobs self.label_parent_labels = label_parent_labels - print("job tree") - print(job_tree_roots) - print(job_nodes) - print(label_jobs) - print(label_parent_labels) label_tree_roots, label_nodes = self.get_label_tree(generic_workflow, label_parent_labels, label_jobs) self.label_tree_roots = label_tree_roots self.label_nodes = label_nodes - print("label tree") - print(label_tree_roots) - print(label_nodes) label_level_dict = self.get_ordered_nodes_by_level(label_tree_roots) - print("label_level_dict") - print(label_level_dict) - - grouped_label_level_dict = self.group_label_level_dict(label_level_dict) - print("grouped_label_level_dict") - print(grouped_label_level_dict) - - # self.logger.debug(grouped_label_level_dict) - grouped_jobs = self.construct_grouped_jobs(grouped_label_level_dict) - return grouped_jobs - - def from_generic_workflow(self, generic_workflow): - if self.group_type == 'width': - return self.from_generic_workflow_width(generic_workflow) - return self.from_generic_workflow_combine(generic_workflow) - - def construct_map_between_jobs_and_events(self, job_nodes, grouped_jobs): - job_event_map = {} - for grouped_label in grouped_jobs: - for eventservice in grouped_jobs[grouped_label]: - name = eventservice['name'] - events = eventservice['events'] - for event_index in events: - job = events[event_index] - job_event_map[job.name] = {'group_label': grouped_label, 'event_job': name, 'event_index': event_index} - for job_name in job_nodes: - if job_name not in job_event_map: - raise Exception("Job is not converted into EventService maps" % job_name) - return job_event_map - - def construct_event_map(self, grouped_jobs, event_map_name=None): - job_event_map = 
self.construct_map_between_jobs_and_events(self.job_nodes, grouped_jobs) - - event_map = DomaEventMap(event_map_name) - for grouped_label in grouped_jobs: - event_task = DomaEventMapTask(grouped_label) - for eventservice in grouped_jobs[grouped_label]: - name = eventservice['name'] - events = eventservice['events'] - event_job = DomaEventMapJob(grouped_label, name, events) - event_job.construct_event_dependencies(job_event_map) - - event_task.add_job(event_job) - event_map.add_task(event_task) - event_map.save() - return event_map - def construct_idds_work(self, label, jobs, job_nodes): - for job in jobs: - name = job['name'] - events = job['events'] - construct_events = [] - for event_index in events: - job_node = events[event_index] - # gwjob = job_node.gwjob - deps = job_node.deps - construct_event = {'name': name, 'index': event_index, 'dependencies': {}} - for dep_name in deps: - dep_job_node = job_nodes[dep_name] - dep = {'group_label': dep_job_node.group_label, - 'event_file': dep_job_node.event_file, - 'event_index': dep_job_node.event_index} - construct_event['dependencies'].append(dep) - construct_events.append(construct_event) - job['construct_events'] = construct_events + self.order_job_tree(label_level_dict) + self.save_order_id_map(label_level_dict, order_id_map_file) diff --git a/doma/tools/env/environment.yml b/doma/tools/env/environment.yml index 0c6edadc..19e0b9f3 100644 --- a/doma/tools/env/environment.yml +++ b/doma/tools/env/environment.yml @@ -5,5 +5,5 @@ dependencies: - pip: - futures # multiple process/threads - panda-client-light # panda client - - idds-common==0.11.5 - - idds-workflow==0.11.5 + - idds-common==2.0.9 + - idds-workflow==2.0.9 \ No newline at end of file diff --git a/main/config_default/idds.cfg b/main/config_default/idds.cfg index 564f9df8..254ad4a7 100755 --- a/main/config_default/idds.cfg +++ b/main/config_default/idds.cfg @@ -36,8 +36,8 @@ coordination_interval_delay = 300 [clerk] -num_threads = 4 -max_number_workers = 4 +num_threads = 8 +max_number_workers = 8 poll_period = 300 new_poll_period = 10 update_poll_period = 300 @@ -50,7 +50,7 @@ retrieve_bulk_size = 32 pending_time = 30 [transformer] -num_threads = 3 +num_threads = 8 poll_period = 180 new_poll_period = 10 update_poll_period = 180 @@ -64,15 +64,15 @@ message_bulk_size = 1000 domapandawork.num_retries = 0 [carrier] -num_threads = 8 -max_number_workers = 8 -trigger_max_number_workers = 8 -finisher_max_number_workers = 8 -receiver_num_threads = 8 +num_threads = 16 +max_number_workers = 16 +trigger_max_number_workers = 20 +finisher_max_number_workers = 16 +receiver_num_threads = 16 -poll_period = 300 +poll_period = 60 new_poll_period = 10 -update_poll_period = 300 +update_poll_period = 60 poll_time_period = 60 poll_operation_time_period = 180 diff --git a/main/config_default/idds.cfg.orig b/main/config_default/idds.cfg.orig new file mode 100755 index 00000000..564f9df8 --- /dev/null +++ b/main/config_default/idds.cfg.orig @@ -0,0 +1,132 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# +# Authors: +# - Wen Guan, , 2019 - 2023 + +[common] +#logdir = /var/log/idds +# loglevel = DEBUG +loglevel = INFO + +[database] +default = sqlite:////tmp/idds.db +pool_size=20 +pool_recycle=3600 +echo=0 +pool_reset_on_return=rollback + +[rest] +host = https://localhost:443/idds +cacher_dir = /var/log/idds + +[main] +# agents = clerk, transformer, carrier, conductor +# agents = clerk, transformer, submitter, poller, receiver, trigger, finisher, conductor +agents = clerk, transformer, submitter, poller, receiver, trigger, finisher, conductor, archiver, coordinator + +[eventbus] +# backend = database +backend = message +# debug = True + +[coordinator] +coordination_interval_delay = 300 + + +[clerk] +num_threads = 4 +max_number_workers = 4 +poll_period = 300 +new_poll_period = 10 +update_poll_period = 300 +new_command_poll_period = 10 +update_command_poll_period = 300 + +poll_time_period = 60 +poll_operation_time_period = 60 +retrieve_bulk_size = 32 +pending_time = 30 + +[transformer] +num_threads = 3 +poll_period = 180 +new_poll_period = 10 +update_poll_period = 180 + +poll_time_period = 60 +retrieve_bulk_size = 64 +poll_operation_time_period = 180 +message_bulk_size = 1000 + +# domapandawork.life_time = 86400 +domapandawork.num_retries = 0 + +[carrier] +num_threads = 8 +max_number_workers = 8 +trigger_max_number_workers = 8 +finisher_max_number_workers = 8 +receiver_num_threads = 8 + +poll_period = 300 +new_poll_period = 10 +update_poll_period = 300 + +poll_time_period = 60 +poll_operation_time_period = 180 +retrieve_bulk_size = 16 +message_bulk_size = 1000 + +plugin.receiver = idds.agents.common.plugins.messaging.MessagingReceiver +plugin.receiver.brokers = atlas-mb.cern.ch +plugin.receiver.port = 61013 +# plugin.notifier.vhost = +plugin.receiver.destination = /topic/doma.panda_idds +plugin.receiver.username = user +plugin.receiver.password = password +plugin.receiver.broker_timeout = 10 + +plugin.receiver.channels = {"default": {"brokers": ["atlas-mb.cern.ch:61013"], + "destination": "/topic/doma.panda_idds", + "username": "user", + "password": "password", + "broker_timeout": 360}} + +# domapandawork.life_time = 86400 +domapandawork.num_retries = 0 +domapandawork.poll_panda_jobs_chunk_size = 2000 + +[conductor] +delay = 120 +replay_times = 2 + +threshold_to_release_messages = 1000 +random_delay = 60 + +plugin.notifier = idds.agents.conductor.plugins.messaging.MessagingSender +plugin.notifier.brokers = atlas-test-mb.cern.ch +plugin.notifier.port = 61013 +plugin.notifier.destination = /queue/atlas.idds +plugin.notifier.username = user +plugin.notifier.password = password +plugin.notifier.broker_timeout = 10 + +plugin.notifier.channels = {"default": {"brokers": ["atlas-mb.cern.ch:61013"], + "destination": "/topic/doma.idds", + "username": "user", + "password": "password", + "broker_timeout": 360}, + "ContentExt": {"brokers": ["atlas-test-mb.cern.ch:61013"], + "destination": "/queue/atlas.idds", + "username": "user", + "password": "password", + "broker_timeout": 360} + } + +[archiver] +# days +older_than = 60 +poll_period = 1 + diff --git a/main/etc/condor/submitter/00personal_condor.config b/main/etc/condor/submitter/00personal_condor.config index cbb5af33..25b69e77 100644 --- a/main/etc/condor/submitter/00personal_condor.config +++ b/main/etc/condor/submitter/00personal_condor.config @@ -1,6 +1,6 @@ ## What machine is your central manager? 
-CONDOR_HOST = aipanda180.cern.ch +CONDOR_HOST = aipanda101.cern.ch ## Pool's short description diff --git a/main/etc/idds/idds.cfg.template b/main/etc/idds/idds.cfg.template index 52028959..d75942fb 100755 --- a/main/etc/idds/idds.cfg.template +++ b/main/etc/idds/idds.cfg.template @@ -20,6 +20,12 @@ loglevel = DEBUG # aipanda187 monitor(can be reused) # aipanda160, 161, 162 (new vms) # doma aipanda015, aipanda016, and aipanda017 +# +# new +# iddsserver: aipanda102,aipanda103 +# idds atlas condor pool: aipanda101 +# dev: aipanda104 +# doma: aipanda105-107 # [database] #default = mysql://idds:idds@pcuwvirt5.cern.ch/idds diff --git a/main/etc/sql/postgresql_update.sql b/main/etc/sql/postgresql_update.sql index 561da38c..381e79f3 100644 --- a/main/etc/sql/postgresql_update.sql +++ b/main/etc/sql/postgresql_update.sql @@ -7,3 +7,7 @@ alter table contents_update add column fetch_status INTEGER DEFAULT 0; -- 2023.09.26 -- update slac idds database, without updating the idds models alter table contents alter column name type varchar(8000); + +-- 2023.11.09 +-- update slac idds database, without updating the idds models +alter table contents alter column name type varchar(40000); diff --git a/main/lib/idds/agents/carrier/finisher.py b/main/lib/idds/agents/carrier/finisher.py index 68d04d7d..ee25f0e0 100644 --- a/main/lib/idds/agents/carrier/finisher.py +++ b/main/lib/idds/agents/carrier/finisher.py @@ -325,7 +325,7 @@ def process_resume_processing(self, event): ret = self.handle_resume_processing(pr, log_prefix=log_pre) self.logger.info(log_pre + "process_resume_processing result: %s" % str(ret)) - self.update_processing(ret, pr) + self.update_processing(ret, pr, use_bulk_update_mappings=False) self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id'], content=event._content) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index 4bc07e48..30b7296d 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -120,6 +120,9 @@ def get_running_processings(self): self.show_queue_size() + if BaseAgent.min_request_id is None: + return [] + processing_status = [ProcessingStatus.Submitting, ProcessingStatus.Submitted, ProcessingStatus.Running, ProcessingStatus.FinishedOnExec, ProcessingStatus.ToCancel, ProcessingStatus.Cancelling, @@ -133,6 +136,7 @@ def get_running_processings(self): locking=True, update_poll=True, not_lock=True, only_return_id=True, + min_request_id=BaseAgent.min_request_id, bulk_size=self.retrieve_bulk_size) # self.logger.debug("Main thread get %s [submitting + submitted + running] processings to process" % (len(processings))) @@ -202,7 +206,7 @@ def get_log_prefix(self, processing): processing['transform_id'], processing['processing_id']) - def update_processing(self, processing, processing_model): + def update_processing(self, processing, processing_model, use_bulk_update_mappings=True): try: if processing: log_prefix = self.get_log_prefix(processing_model) @@ -227,6 +231,7 @@ def update_processing(self, processing, processing_model): retry_num += 1 try: core_processings.update_processing_contents(update_processing=processing.get('update_processing', None), + request_id=processing_model['request_id'], update_collections=processing.get('update_collections', None), update_contents=processing.get('update_contents', None), update_dep_contents=processing.get('update_dep_contents', None), @@ 
-236,7 +241,8 @@ def update_processing(self, processing, processing_model): new_update_contents=processing.get('new_update_contents', None), new_contents_ext=processing.get('new_contents_ext', None), update_contents_ext=processing.get('update_contents_ext', None), - new_input_dependency_contents=processing.get('new_input_dependency_contents', None)) + new_input_dependency_contents=processing.get('new_input_dependency_contents', None), + use_bulk_update_mappings=use_bulk_update_mappings) except exceptions.DatabaseException as ex: if 'ORA-00060' in str(ex): self.logger.warn(log_prefix + "(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") diff --git a/main/lib/idds/agents/carrier/submitter.py b/main/lib/idds/agents/carrier/submitter.py index 817268df..118fd3ae 100644 --- a/main/lib/idds/agents/carrier/submitter.py +++ b/main/lib/idds/agents/carrier/submitter.py @@ -14,6 +14,7 @@ from idds.common.constants import ProcessingStatus, ProcessingLocking from idds.common.utils import setup_logging, truncate_string from idds.core import processings as core_processings +from idds.agents.common.baseagent import BaseAgent from idds.agents.common.eventbus.event import (EventType, NewProcessingEvent, SyncProcessingEvent, @@ -50,10 +51,14 @@ def get_new_processings(self): self.show_queue_size() + if BaseAgent.min_request_id is None: + return [] + processing_status = [ProcessingStatus.New] processings = core_processings.get_processings_by_status(status=processing_status, locking=True, not_lock=True, new_poll=True, only_return_id=True, + min_request_id=BaseAgent.min_request_id, bulk_size=self.retrieve_bulk_size) # self.logger.debug("Main thread get %s [new] processings to process" % len(processings)) diff --git a/main/lib/idds/agents/carrier/trigger.py b/main/lib/idds/agents/carrier/trigger.py index 83b5705e..5667b132 100644 --- a/main/lib/idds/agents/carrier/trigger.py +++ b/main/lib/idds/agents/carrier/trigger.py @@ -14,6 +14,7 @@ from idds.common.constants import ProcessingStatus, ProcessingLocking, ReturnCode from idds.common.utils import setup_logging, truncate_string from idds.core import processings as core_processings +from idds.agents.common.baseagent import BaseAgent from idds.agents.common.eventbus.event import (EventType, UpdateTransformEvent, TriggerProcessingEvent, @@ -68,11 +69,15 @@ def get_trigger_processings(self): return [] # self.show_queue_size() + if BaseAgent.min_request_id is None: + return [] + processing_status = [ProcessingStatus.ToTrigger, ProcessingStatus.Triggering] processings = core_processings.get_processings_by_status(status=processing_status, locking=True, update_poll=True, not_lock=True, only_return_id=True, + min_request_id=BaseAgent.min_request_id, bulk_size=self.retrieve_bulk_size) if processings: self.logger.info("Main thread get [ToTrigger, Triggering] processings to process: %s" % (str(processings))) diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 507d45d9..c89db59b 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -41,7 +41,8 @@ def get_logger(logger=None): return logger -def get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.Input): +def get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.Input, + es_name=None, sub_map_id=None, order_id=None): content = {'transform_id': transform_id, 'coll_id': 
input_content['coll_id'], 'request_id': request_id, @@ -67,6 +68,14 @@ def get_new_content(request_id, transform_id, workload_id, map_id, input_content content['sub_map_id'] = input_content['sub_map_id'] if 'dep_sub_map_id' in input_content: content['dep_sub_map_id'] = input_content['dep_sub_map_id'] + + if order_id is not None: + content['min_id'] = order_id + content['max_id'] = order_id + if sub_map_id is not None: + content['sub_map_id'] = sub_map_id + if es_name is not None and content_relation_type == ContentRelationType.Output: + content['path'] = es_name return content @@ -214,33 +223,74 @@ def get_new_contents(request_id, transform_id, workload_id, new_input_output_map new_input_dep_coll_ids = [] chunks = [] for map_id in new_input_output_maps: - inputs = new_input_output_maps[map_id]['inputs'] if 'inputs' in new_input_output_maps[map_id] else [] - inputs_dependency = new_input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in new_input_output_maps[map_id] else [] - outputs = new_input_output_maps[map_id]['outputs'] if 'outputs' in new_input_output_maps[map_id] else [] - logs = new_input_output_maps[map_id]['logs'] if 'logs' in new_input_output_maps[map_id] else [] - - for input_content in inputs: - content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.Input) - new_input_contents.append(content) - for input_content in inputs_dependency: - content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.InputDependency) - new_input_dependency_contents.append(content) - if content['coll_id'] not in new_input_dep_coll_ids: - new_input_dep_coll_ids.append(content['coll_id']) - for output_content in outputs: - content = get_new_content(request_id, transform_id, workload_id, map_id, output_content, content_relation_type=ContentRelationType.Output) - new_output_contents.append(content) - for log_content in logs: - content = get_new_content(request_id, transform_id, workload_id, map_id, log_content, content_relation_type=ContentRelationType.Log) - new_log_contents.append(content) - - total_num_updates = len(new_input_contents) + len(new_output_contents) + len(new_log_contents) + len(new_input_dependency_contents) - if total_num_updates > max_updates_per_round: - chunk = new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents - chunks.append(chunk) - - new_input_contents, new_output_contents, new_log_contents = [], [], [] - new_input_dependency_contents = [] + if "sub_maps" not in new_input_output_maps[map_id] or not new_input_output_maps[map_id]["sub_maps"]: + inputs = new_input_output_maps[map_id]['inputs'] if 'inputs' in new_input_output_maps[map_id] else [] + inputs_dependency = new_input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in new_input_output_maps[map_id] else [] + outputs = new_input_output_maps[map_id]['outputs'] if 'outputs' in new_input_output_maps[map_id] else [] + logs = new_input_output_maps[map_id]['logs'] if 'logs' in new_input_output_maps[map_id] else [] + + for input_content in inputs: + content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.Input) + new_input_contents.append(content) + for input_content in inputs_dependency: + content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.InputDependency) + 
new_input_dependency_contents.append(content) + if content['coll_id'] not in new_input_dep_coll_ids: + new_input_dep_coll_ids.append(content['coll_id']) + for output_content in outputs: + content = get_new_content(request_id, transform_id, workload_id, map_id, output_content, content_relation_type=ContentRelationType.Output) + new_output_contents.append(content) + for log_content in logs: + content = get_new_content(request_id, transform_id, workload_id, map_id, log_content, content_relation_type=ContentRelationType.Log) + new_log_contents.append(content) + + total_num_updates = len(new_input_contents) + len(new_output_contents) + len(new_log_contents) + len(new_input_dependency_contents) + if total_num_updates > max_updates_per_round: + chunk = new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents + chunks.append(chunk) + + new_input_contents, new_output_contents, new_log_contents = [], [], [] + new_input_dependency_contents = [] + else: + sub_maps = new_input_output_maps[map_id]["sub_maps"] + for sub_map in sub_maps: + sub_map_id = sub_map['sub_map_id'] + order_id = sub_map['order_id'] + inputs = sub_map['inputs'] if 'inputs' in sub_map else [] + inputs_dependency = sub_map['inputs_dependency'] if 'inputs_dependency' in sub_map else [] + outputs = sub_map['outputs'] if 'outputs' in sub_map else [] + logs = sub_map['logs'] if 'logs' in sub_map else [] + + for input_content in inputs: + content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, + content_relation_type=ContentRelationType.Input, + sub_map_id=sub_map_id, order_id=order_id) + new_input_contents.append(content) + for input_content in inputs_dependency: + content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, + content_relation_type=ContentRelationType.InputDependency, + sub_map_id=sub_map_id, order_id=order_id) + new_input_dependency_contents.append(content) + if content['coll_id'] not in new_input_dep_coll_ids: + new_input_dep_coll_ids.append(content['coll_id']) + for output_content in outputs: + content = get_new_content(request_id, transform_id, workload_id, map_id, output_content, + content_relation_type=ContentRelationType.Output, + sub_map_id=sub_map_id, order_id=order_id) + new_output_contents.append(content) + for log_content in logs: + content = get_new_content(request_id, transform_id, workload_id, map_id, log_content, + content_relation_type=ContentRelationType.Log, + sub_map_id=sub_map_id, order_id=order_id) + new_log_contents.append(content) + + total_num_updates = len(new_input_contents) + len(new_output_contents) + len(new_log_contents) + len(new_input_dependency_contents) + if total_num_updates > max_updates_per_round: + chunk = new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents + chunks.append(chunk) + + new_input_contents, new_output_contents, new_log_contents = [], [], [] + new_input_dependency_contents = [] total_num_updates = len(new_input_contents) + len(new_output_contents) + len(new_log_contents) + len(new_input_dependency_contents) if total_num_updates > 0: @@ -253,6 +303,7 @@ def get_new_contents(request_id, transform_id, workload_id, new_input_output_map def get_update_content(content): updated_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': content['substatus'], 'substatus': content['substatus']} content['status'] = content['substatus'] @@ -321,13 +372,22 @@ def generate_file_messages(request_id, transform_id, 
workload_id, work, files, r work_type = TransformType.Processing i_msg_type, i_msg_type_str = get_message_type(work_type, input_type='file') + no_dup_files = {} files_message = [] for file in files: + filename = file['name'] + if work and work.es: + filename = file['path'] + if filename in no_dup_files: + continue + else: + no_dup_files[filename] = None + file_status = file['substatus'].name if file['substatus'] == ContentStatus.FakeAvailable: file_status = ContentStatus.Available.name file_message = {'scope': file['scope'], - 'name': file['name'], + 'name': filename, 'path': file['path'], 'map_id': file['map_id'], 'content_id': file['content_id'] if 'content_id' in file else None, @@ -533,6 +593,7 @@ def handle_new_processing(processing, agent_attributes, func_site_to_cloud=None, # ret_msgs = ret_msgs + msgs logger.debug(log_prefix + "handle_new_processing: add %s new contents" % (len(new_contents))) core_processings.update_processing_contents(update_processing=None, + request_id=request_id, new_contents=new_contents, new_input_dependency_contents=new_input_dependency_contents, messages=ret_msgs) @@ -554,6 +615,7 @@ def get_updated_contents_by_request(request_id, transform_id, workload_id, work, for content in contents: if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': content['substatus']} updated_contents.append(u_content) if content['content_relation_type'] == ContentRelationType.Output: @@ -612,6 +674,7 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, terminated for content in inputs: if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': content['substatus']} updated_contents.append(u_content) u_content_substatus = {'content_id': content['content_id'], @@ -626,6 +689,7 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, terminated for content in outputs: if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': content['substatus']} updated_contents.append(u_content) u_content_substatus = {'content_id': content['content_id'], @@ -640,6 +704,7 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, terminated for content in inputs_dependency: if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': content['substatus']} updated_contents.append(u_content) updated_contents_full_input_deps.append(content) @@ -658,6 +723,7 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, terminated for content in inputs_sub: if content['substatus'] != input_content_update_status: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': input_content_update_status, 'substatus': input_content_update_status} updated_contents.append(u_content) @@ -682,6 +748,7 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, terminated for content in outputs_sub: if content['substatus'] != output_content_update_status: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'status': 
output_content_update_status, 'substatus': output_content_update_status} updated_contents.append(u_content) @@ -888,6 +955,7 @@ def trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, for content in inputs_sub: if content['substatus'] != ContentStatus.Available: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], # 'status': ContentStatus.Available, 'substatus': ContentStatus.Available} update_contents.append(u_content) @@ -946,6 +1014,7 @@ def trigger_release_inputs(request_id, transform_id, workload_id, work, updated_ pass for content in inputs_sub: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'substatus': input_content_update_status} update_contents.append(u_content) content['status'] = input_content_update_status @@ -968,6 +1037,7 @@ def trigger_release_inputs(request_id, transform_id, workload_id, work, updated_ if output_content_update_status: for content in outputs_sub: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'substatus': output_content_update_status} update_contents.append(u_content) @@ -998,6 +1068,7 @@ def poll_missing_outputs(input_output_maps, max_updates_per_round=2000): content['substatus'] = content_update_status if content['status'] != content['substatus']: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'substatus': content['substatus']} content_updates_missing.append(u_content) @@ -1046,13 +1117,14 @@ def get_update_external_content_ids(input_output_maps, external_content_ids): content_ids = name_to_id_map.get(lfn, []) for content_id in content_ids: update_content = {'content_id': content_id, + 'request_id': content['request_id'], 'external_coll_id': dataset_id, 'external_content_id': file_id} update_contents.append(update_content) return update_contents -def handle_update_processing(processing, agent_attributes, max_updates_per_round=2000, logger=None, log_prefix=''): +def handle_update_processing(processing, agent_attributes, max_updates_per_round=2000, use_bulk_update_mappings=True, logger=None, log_prefix=''): logger = get_logger(logger) ret_msgs = [] @@ -1117,6 +1189,9 @@ def handle_update_processing(processing, agent_attributes, max_updates_per_round core_processings.update_processing_contents(update_processing=None, new_contents=new_contents, new_input_dependency_contents=new_input_dependency_contents, + request_id=request_id, + # transform_id=transform_id, + use_bulk_update_mappings=use_bulk_update_mappings, messages=ret_msgs) ret_msgs = [] @@ -1130,6 +1205,10 @@ def handle_update_processing(processing, agent_attributes, max_updates_per_round logger.debug(log_prefix + "handle_update_processing: update %s missing contents" % (len(content_updates_missing))) core_processings.update_processing_contents(update_processing=None, update_contents=content_updates_missing, + request_id=request_id, + # transform_id=transform_id, + # use_bulk_update_mappings=use_bulk_update_mappings, + use_bulk_update_mappings=False, messages=msgs) if updated_contents_full: @@ -1138,6 +1217,7 @@ def handle_update_processing(processing, agent_attributes, max_updates_per_round msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', files=updated_contents_full_chunk, relation_type='output') core_processings.update_processing_contents(update_processing=None, + request_id=request_id, messages=msgs) if new_contents_ext: @@ -1145,12 +1225,14 @@ def 
handle_update_processing(processing, agent_attributes, max_updates_per_round for new_contents_ext_chunk in new_contents_ext_chunks: logger.debug(log_prefix + "handle_update_processing: add %s ext contents" % (len(new_contents_ext_chunk))) core_processings.update_processing_contents(update_processing=None, + request_id=request_id, new_contents_ext=new_contents_ext_chunk) if update_contents_ext: update_contents_ext_chunks = get_list_chunks(update_contents_ext, bulk_size=max_updates_per_round) for update_contents_ext_chunk in update_contents_ext_chunks: logger.debug(log_prefix + "handle_update_processing: update %s ext contents" % (len(update_contents_ext_chunk))) core_processings.update_processing_contents(update_processing=None, + request_id=request_id, update_contents_ext=update_contents_ext_chunk) if content_updates: @@ -1158,6 +1240,9 @@ def handle_update_processing(processing, agent_attributes, max_updates_per_round for content_updates_chunk in content_updates_chunks: logger.debug(log_prefix + "handle_update_processing: update %s contents" % (len(content_updates_chunk))) core_processings.update_processing_contents(update_processing=None, + request_id=request_id, + # transform_id=transform_id, + use_bulk_update_mappings=use_bulk_update_mappings, update_contents=content_updates_chunk) # return process_status, new_contents, new_input_dependency_contents, ret_msgs, content_updates + content_updates_missing, parameters, new_contents_ext, update_contents_ext @@ -1222,17 +1307,21 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates= new_contents_update_list = [] # contents_id_list = [] for con in contents_update_list: - con_dict = {'content_id': con['content_id'], - 'substatus': con['substatus']} - if 'content_metadata' in con and con['content_metadata']: - con_dict['content_metadata'] = con['content_metadata'] - new_contents_update_list.append(con_dict) - # contents_id_list.append(con['content_id']) + has_updates = True + if not work.es or con['substatus'] in [ContentStatus.Available]: + con_dict = {'content_id': con['content_id'], + 'request_id': con['request_id'], + 'substatus': con['substatus']} + if 'content_metadata' in con and con['content_metadata']: + con_dict['content_metadata'] = con['content_metadata'] + new_contents_update_list.append(con_dict) + # contents_id_list.append(con['content_id']) new_contents_update_list_chunks = [new_contents_update_list[i:i + max_updates_per_round] for i in range(0, len(new_contents_update_list), max_updates_per_round)] for chunk in new_contents_update_list_chunks: has_updates = True logger.debug(log_prefix + "new_contents_update chunk[:3](total: %s): %s" % (len(chunk), str(chunk[:3]))) - core_catalog.update_contents(chunk) + # core_catalog.update_contents(chunk, request_id=request_id, transform_id=transform_id, use_bulk_update_mappings=False) + core_catalog.update_contents(chunk, request_id=request_id, transform_id=transform_id, use_bulk_update_mappings=True) # core_catalog.delete_contents_update(contents=contents_id_list) core_catalog.delete_contents_update(request_id=request_id, transform_id=transform_id, fetch=True) logger.debug(log_prefix + "sync contents_update to contents done") @@ -1244,7 +1333,7 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates= for chunk in to_triggered_contents_chunks: has_updates = True logger.debug(log_prefix + "update_contents_from_others_by_dep_id chunk[:3](total: %s): %s" % (len(chunk), str(chunk[:3]))) - core_catalog.update_contents(chunk) + 
core_catalog.update_contents(chunk, request_id=request_id, transform_id=transform_id, use_bulk_update_mappings=False) logger.debug(log_prefix + "update_contents_from_others_by_dep_id done") input_output_maps = get_input_output_maps(transform_id, work) @@ -1285,7 +1374,10 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates= core_processings.update_processing_contents(update_processing=None, update_contents=updated_contents, # new_update_contents=new_update_contents, - messages=ret_msgs) + messages=ret_msgs, + request_id=request_id, + # transform_id=transform_id, + use_bulk_update_mappings=False) updated_contents = [] new_update_contents = [] ret_msgs = [] @@ -1444,6 +1536,10 @@ def get_input_name_content_id_map(request_id, workload_id, transform_id): if content['name'] not in input_name_content_id_map: input_name_content_id_map[content['name']] = [] input_name_content_id_map[content['name']].append(content['content_id']) + if content['path']: + if content['path'] not in input_name_content_id_map: + input_name_content_id_map[content['path']] = [] + input_name_content_id_map[content['path']].append(content['content_id']) cache.set(input_name_content_id_map_key, input_name_content_id_map) @@ -1920,6 +2016,7 @@ def reactive_contents(request_id, transform_id, workload_id, work, input_output_ ContentStatus.Available.value, ContentStatus.Mapped.value, ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: u_content = {'content_id': content['content_id'], + 'request_id': content['request_id'], 'substatus': ContentStatus.New, 'status': ContentStatus.New} updated_contents.append(u_content) diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index d7104a3c..e64cf45b 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -133,7 +133,9 @@ def is_ok_to_run_more_requests(self): def show_queue_size(self): if self.show_queue_size_time is None or time.time() - self.show_queue_size_time >= 600: self.show_queue_size_time = time.time() - q_str = "number of requests: %s, max number of requests: %s" % (self.number_workers, self.max_number_workers) + q_str = "min request_id: %s, number of requests: %s, max number of requests: %s" % (BaseAgent.min_request_id, + self.number_workers, + self.max_number_workers) self.logger.debug(q_str) def get_new_requests(self): @@ -152,7 +154,7 @@ def get_new_requests(self): req_status = [RequestStatus.New, RequestStatus.Extend, RequestStatus.Built, RequestStatus.Throttling] reqs_new = core_requests.get_requests_by_status_type(status=req_status, locking=True, - not_lock=True, + not_lock=True, min_request_id=BaseAgent.min_request_id, new_poll=True, only_return_id=True, bulk_size=self.retrieve_bulk_size) @@ -162,6 +164,8 @@ def get_new_requests(self): events = [] for req_id in reqs_new: + if BaseAgent.min_request_id is None or BaseAgent.min_request_id > req_id: + BaseAgent.min_request_id = req_id event = NewRequestEvent(publisher_id=self.id, request_id=req_id) events.append(event) self.event_bus.send_bulk(events) @@ -193,6 +197,7 @@ def get_running_requests(self): RequestStatus.ToResume, RequestStatus.Resuming, RequestStatus.Building] reqs = core_requests.get_requests_by_status_type(status=req_status, time_period=None, + min_request_id=BaseAgent.min_request_id, locking=True, bulk_size=self.retrieve_bulk_size, not_lock=True, update_poll=True, only_return_id=True) @@ -202,6 +207,8 @@ def get_running_requests(self): events = [] for req_id in reqs: + if 
BaseAgent.min_request_id is None or BaseAgent.min_request_id > req_id: + BaseAgent.min_request_id = req_id event = UpdateRequestEvent(publisher_id=self.id, request_id=req_id) events.append(event) self.event_bus.send_bulk(events) @@ -248,6 +255,9 @@ def get_operation_requests(self): 'cmd_id': cmd['cmd_id'], 'cmd_content': cmd_content} + if BaseAgent.min_request_id is None or BaseAgent.min_request_id > request_id: + BaseAgent.min_request_id = request_id + event = None if cmd_status in [CommandStatus.New, CommandStatus.Processing]: if cmd_type in [CommandType.AbortRequest]: diff --git a/main/lib/idds/agents/common/baseagent.py b/main/lib/idds/agents/common/baseagent.py index 0f8730f3..aa6a9ae3 100644 --- a/main/lib/idds/agents/common/baseagent.py +++ b/main/lib/idds/agents/common/baseagent.py @@ -36,6 +36,8 @@ class BaseAgent(TimerScheduler, PluginBase): The base IDDS agent class """ + min_request_id = None + def __init__(self, num_threads=1, name=None, logger=None, **kwargs): super(BaseAgent, self).__init__(num_threads, name=name) self.name = self.__class__.__name__ @@ -370,11 +372,11 @@ def add_health_message_task(self): def get_request_message(self, request_id, bulk_size=1): return core_messages.retrieve_request_messages(request_id, bulk_size=bulk_size) - def get_transform_message(self, transform_id, bulk_size=1): - return core_messages.retrieve_transform_messages(transform_id, bulk_size=bulk_size) + def get_transform_message(self, request_id, transform_id, bulk_size=1): + return core_messages.retrieve_transform_messages(request_id=request_id, transform_id=transform_id, bulk_size=bulk_size) - def get_processing_message(self, processing_id, bulk_size=1): - return core_messages.retrieve_processing_messages(processing_id, bulk_size=bulk_size) + def get_processing_message(self, request_id, processing_id, bulk_size=1): + return core_messages.retrieve_processing_messages(request_id=request_id, processing_id=processing_id, bulk_size=bulk_size) if __name__ == '__main__': diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index ed5d8e1b..4764fc5b 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -108,8 +108,12 @@ def get_messages(self): """ Get messages """ + if BaseAgent.min_request_id is None: + return [] + destination = [MessageDestination.Outside, MessageDestination.ContentExt] messages = core_messages.retrieve_messages(status=MessageStatus.New, + min_request_id=BaseAgent.min_request_id, bulk_size=self.retrieve_bulk_size, destination=destination) @@ -125,6 +129,7 @@ def get_messages(self): retry_messages = [] messages_d = core_messages.retrieve_messages(status=MessageStatus.Delivered, + min_request_id=BaseAgent.min_request_id, use_poll_period=True, bulk_size=self.retrieve_bulk_size, destination=destination) # msg_type=msg_type) @@ -149,10 +154,11 @@ def clean_messages(self, msgs, confirm=False): else: delay = self.max_retry_delay to_updates.append({'msg_id': msg['msg_id'], + 'request_id': msg['request_id'], 'retries': msg['retries'] + 1, 'poll_period': datetime.timedelta(seconds=delay), 'status': msg_status}) - core_messages.update_messages(to_updates) + core_messages.update_messages(to_updates, min_request_id=BaseAgent.min_request_id) def start_notifier(self): if 'notifier' not in self.plugins: @@ -287,6 +293,7 @@ def run(self): self.clean_messages(output_messages) except IDDSException as error: self.logger.error("Main thread IDDSException: %s" % str(error)) + 
self.logger.error(traceback.format_exc()) except Exception as error: self.logger.critical("Main thread exception: %s\n%s" % (str(error), traceback.format_exc())) # time.sleep(random.randint(5, self.random_delay)) diff --git a/main/lib/idds/agents/transformer/transformer.py b/main/lib/idds/agents/transformer/transformer.py index 681422c5..ab0d1863 100644 --- a/main/lib/idds/agents/transformer/transformer.py +++ b/main/lib/idds/agents/transformer/transformer.py @@ -112,11 +112,15 @@ def get_new_transforms(self): self.show_queue_size() + if BaseAgent.min_request_id is None: + return [] + transform_status = [TransformStatus.New, TransformStatus.Ready, TransformStatus.Extend] # next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_period) transforms_new = core_transforms.get_transforms_by_status(status=transform_status, locking=True, not_lock=True, new_poll=True, only_return_id=True, + min_request_id=BaseAgent.min_request_id, bulk_size=self.retrieve_bulk_size) # self.logger.debug("Main thread get %s New+Ready+Extend transforms to process" % len(transforms_new)) @@ -149,6 +153,9 @@ def get_running_transforms(self): self.show_queue_size() + if BaseAgent.min_request_id is None: + return [] + transform_status = [TransformStatus.Transforming, TransformStatus.ToCancel, TransformStatus.Cancelling, TransformStatus.ToSuspend, TransformStatus.Suspending, @@ -159,6 +166,7 @@ def get_running_transforms(self): period=None, locking=True, not_lock=True, + min_request_id=BaseAgent.min_request_id, update_poll=True, only_return_id=True, bulk_size=self.retrieve_bulk_size) diff --git a/main/lib/idds/core/messages.py b/main/lib/idds/core/messages.py index 2cb1767c..2ac79cc3 100644 --- a/main/lib/idds/core/messages.py +++ b/main/lib/idds/core/messages.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2023 +# - Wen Guan, , 2019 - 2024 """ @@ -49,7 +49,7 @@ def add_messages(messages, bulk_size=1000, session=None): def retrieve_messages(bulk_size=None, msg_type=None, status=None, destination=None, source=None, request_id=None, workload_id=None, transform_id=None, processing_id=None, use_poll_period=False, retries=None, delay=None, - fetching_id=None, session=None): + min_request_id=None, fetching_id=None, session=None): """ Retrieve up to $bulk messages. 
@@ -70,6 +70,7 @@ def retrieve_messages(bulk_size=None, msg_type=None, status=None, destination=No request_id=request_id, workload_id=workload_id, transform_id=transform_id, processing_id=processing_id, retries=retries, delay=delay, fetching_id=fetching_id, + min_request_id=min_request_id, use_poll_period=use_poll_period, session=session) @@ -84,8 +85,9 @@ def retrieve_request_messages(request_id, bulk_size=1, session=None): @read_session -def retrieve_transform_messages(transform_id, bulk_size=1, session=None): - return retrieve_messages(transform_id=transform_id, +def retrieve_transform_messages(request_id, transform_id, bulk_size=1, session=None): + return retrieve_messages(request_id=request_id, + transform_id=transform_id, msg_type=MessageType.IDDSCommunication, status=MessageStatus.New, bulk_size=bulk_size, @@ -94,8 +96,9 @@ def retrieve_transform_messages(transform_id, bulk_size=1, session=None): @read_session -def retrieve_processing_messages(processing_id, bulk_size=1, session=None): - return retrieve_messages(processing_id=processing_id, +def retrieve_processing_messages(request_id, processing_id, bulk_size=1, session=None): + return retrieve_messages(request_id=request_id, + processing_id=processing_id, msg_type=MessageType.IDDSCommunication, status=MessageStatus.New, bulk_size=bulk_size, diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index 87bae8b6..5c35c886 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2023 +# - Wen Guan, , 2019 - 2024 """ @@ -125,7 +125,7 @@ def get_processing_by_id_status(processing_id, status=None, locking=False, sessi @transactional_session def get_processings_by_status(status, time_period=None, locking=False, bulk_size=None, to_json=False, by_substatus=False, not_lock=False, next_poll_at=None, for_poller=False, only_return_id=False, - locking_for_update=False, new_poll=False, update_poll=False, session=None): + min_request_id=None, locking_for_update=False, new_poll=False, update_poll=False, session=None): """ Get processing or raise a NoObject exception. 
@@ -147,10 +147,12 @@ def get_processings_by_status(status, time_period=None, locking=False, bulk_size bulk_size=bulk_size * 2, to_json=False, locking_for_update=False, by_substatus=by_substatus, only_return_id=True, for_poller=for_poller, new_poll=new_poll, + min_request_id=min_request_id, update_poll=update_poll, session=session) if proc_ids: processing2s = orm_processings.get_processings_by_status(status=status, period=time_period, locking=locking, processing_ids=proc_ids, + min_request_id=min_request_id, bulk_size=None, to_json=to_json, locking_for_update=locking_for_update, by_substatus=by_substatus, only_return_id=only_return_id, @@ -178,6 +180,7 @@ def get_processings_by_status(status, time_period=None, locking=False, bulk_size locking_for_update=locking_for_update, new_poll=new_poll, update_poll=update_poll, only_return_id=only_return_id, + min_request_id=min_request_id, by_substatus=by_substatus, for_poller=for_poller, session=session) parameters = {} @@ -197,6 +200,7 @@ def get_processings_by_status(status, time_period=None, locking=False, bulk_size bulk_size=bulk_size, to_json=to_json, new_poll=new_poll, update_poll=update_poll, only_return_id=only_return_id, + min_request_id=min_request_id, by_substatus=by_substatus, for_poller=for_poller, session=session) return processings @@ -285,12 +289,12 @@ def update_processing_with_collection_contents(updated_processing, new_processin session=session) -def resolve_input_dependency_id(new_input_dependency_contents, session=None): +def resolve_input_dependency_id(new_input_dependency_contents, request_id=None, session=None): coll_ids = [] for content in new_input_dependency_contents: if content['coll_id'] not in coll_ids: coll_ids.append(content['coll_id']) - contents = orm_contents.get_contents(coll_id=coll_ids, relation_type=ContentRelationType.Output, session=session) + contents = orm_contents.get_contents(coll_id=coll_ids, request_id=request_id, relation_type=ContentRelationType.Output, session=session) content_name_id_map = {} for content in contents: if content['coll_id'] not in content_name_id_map: @@ -299,16 +303,17 @@ def resolve_input_dependency_id(new_input_dependency_contents, session=None): content_name_id_map[content['coll_id']][content['name']] = {} # if content['map_id'] not in content_name_id_map[content['coll_id']][content['name']]: # content_name_id_map[content['coll_id']][content['name']][content['map_id']] = {} - content_name_id_map[content['coll_id']][content['name']][content['sub_map_id']] = content['content_id'] - # content_name_id_map[content['coll_id']][content['name']] = content['content_id'] + # content_name_id_map[content['coll_id']][content['name']][content['sub_map_id']] = content['content_id'] + content_name_id_map[content['coll_id']][content['name']] = content['content_id'] for content in new_input_dependency_contents: if 'sub_map_id' not in content or content['sub_map_id'] is None: content['sub_map_id'] = 0 - dep_sub_map_id = content.get("dep_sub_map_id", 0) - if dep_sub_map_id is None: - dep_sub_map_id = 0 - content_dep_id = content_name_id_map[content['coll_id']][content['name']][dep_sub_map_id] + # dep_sub_map_id = content.get("dep_sub_map_id", 0) + # if dep_sub_map_id is None: + # dep_sub_map_id = 0 + # content_dep_id = content_name_id_map[content['coll_id']][content['name']][dep_sub_map_id] + content_dep_id = content_name_id_map[content['coll_id']][content['name']] content['content_dep_id'] = content_dep_id return new_input_dependency_contents @@ -355,7 +360,7 @@ def 
update_processing_contents(update_processing, update_contents=None, update_m orm_contents.update_contents_ext(chunk, request_id=request_id, transform_id=transform_id, use_bulk_update_mappings=use_bulk_update_mappings, session=session) if new_input_dependency_contents: - new_input_dependency_contents = resolve_input_dependency_id(new_input_dependency_contents, session=session) + new_input_dependency_contents = resolve_input_dependency_id(new_input_dependency_contents, request_id=request_id, session=session) chunks = get_list_chunks(new_input_dependency_contents) for chunk in chunks: orm_contents.add_contents(chunk, session=session) diff --git a/main/lib/idds/core/requests.py b/main/lib/idds/core/requests.py index b3990f7c..83e388a6 100644 --- a/main/lib/idds/core/requests.py +++ b/main/lib/idds/core/requests.py @@ -368,7 +368,7 @@ def get_operation_request_msgs(locking=False, bulk_size=None, session=None): @transactional_session def get_requests_by_status_type(status, request_type=None, time_period=None, locking=False, bulk_size=None, to_json=False, by_substatus=False, not_lock=False, next_poll_at=None, new_poll=False, update_poll=False, - only_return_id=False, session=None): + min_request_id=None, only_return_id=False, session=None): """ Get requests by status and type @@ -381,6 +381,9 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, loc :returns: list of Request. """ + if min_request_id is None: + min_request_id = orm_requests.get_min_request_id(session=session) + if locking: if not only_return_id and bulk_size: # order by cannot work together with locking. So first select 2 * bulk_size without locking with order by. @@ -388,11 +391,13 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, loc req_ids = orm_requests.get_requests_by_status_type(status, request_type, time_period, locking=locking, bulk_size=bulk_size * 2, locking_for_update=False, to_json=False, by_substatus=by_substatus, new_poll=new_poll, update_poll=update_poll, + min_request_id=min_request_id, only_return_id=True, session=session) if req_ids: req2s = orm_requests.get_requests_by_status_type(status, request_type, time_period, request_ids=req_ids, locking=locking, locking_for_update=False, bulk_size=None, to_json=to_json, + min_request_id=min_request_id, new_poll=new_poll, update_poll=update_poll, by_substatus=by_substatus, session=session) if req2s: @@ -414,6 +419,7 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, loc else: reqs = orm_requests.get_requests_by_status_type(status, request_type, time_period, locking=locking, locking_for_update=False, bulk_size=bulk_size, + min_request_id=min_request_id, new_poll=new_poll, update_poll=update_poll, only_return_id=only_return_id, to_json=to_json, by_substatus=by_substatus, session=session) @@ -432,6 +438,7 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, loc else: reqs = orm_requests.get_requests_by_status_type(status, request_type, time_period, locking=locking, bulk_size=bulk_size, new_poll=new_poll, update_poll=update_poll, only_return_id=only_return_id, + min_request_id=min_request_id, to_json=to_json, by_substatus=by_substatus, session=session) return reqs diff --git a/main/lib/idds/core/transforms.py b/main/lib/idds/core/transforms.py index f6db3fd8..5a029c69 100644 --- a/main/lib/idds/core/transforms.py +++ b/main/lib/idds/core/transforms.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2022 +# - Wen 
Guan, , 2019 - 2024 """ @@ -151,7 +151,7 @@ def get_transforms(request_id=None, workload_id=None, transform_id=None, to_json @transactional_session def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, to_json=False, by_substatus=False, - new_poll=False, update_poll=False, only_return_id=False, + new_poll=False, update_poll=False, only_return_id=False, min_request_id=None, not_lock=False, next_poll_at=None, session=None): """ Get transforms or raise a NoObject exception. @@ -172,6 +172,7 @@ def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, tf_ids = orm_transforms.get_transforms_by_status(status=status, period=period, locking=locking, bulk_size=bulk_size * 2, locking_for_update=False, to_json=False, only_return_id=True, + min_request_id=min_request_id, new_poll=new_poll, update_poll=update_poll, by_substatus=by_substatus, session=session) if tf_ids: @@ -179,6 +180,7 @@ def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, bulk_size=None, locking_for_update=False, to_json=to_json, transform_ids=tf_ids, new_poll=new_poll, update_poll=update_poll, + min_request_id=min_request_id, by_substatus=by_substatus, session=session) if transform2s: # reqs = req2s[:bulk_size] @@ -202,6 +204,7 @@ def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, bulk_size=bulk_size, to_json=to_json, new_poll=new_poll, update_poll=update_poll, only_return_id=only_return_id, + min_request_id=min_request_id, by_substatus=by_substatus, session=session) parameters = {} @@ -221,6 +224,7 @@ def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, bulk_size=bulk_size, to_json=to_json, new_poll=new_poll, update_poll=update_poll, only_return_id=only_return_id, + min_request_id=min_request_id, by_substatus=by_substatus, session=session) return transforms @@ -381,7 +385,7 @@ def clean_next_poll_at(status, session=None): @read_session -def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_ids, log_coll_ids=[], with_sub_map_id=False, session=None): +def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_ids, log_coll_ids=[], with_sub_map_id=False, is_es=False, session=None): """ Get transform input output maps. 
@@ -391,8 +395,15 @@ def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_id ret = {} for content in contents: map_id = content['map_id'] + sub_map_id = content['sub_map_id'] if not with_sub_map_id: - if map_id not in ret: + if is_es: + sub_map_id = content['sub_map_id'] + path = content['path'] + if map_id not in ret: + ret[map_id] = {'inputs_dependency': [], 'inputs': [], 'outputs': [], 'logs': [], 'others': [], + 'es_name': path, 'sub_maps': {}} + elif map_id not in ret: ret[map_id] = {'inputs_dependency': [], 'inputs': [], 'outputs': [], 'logs': [], 'others': []} else: sub_map_id = content['sub_map_id'] @@ -417,6 +428,12 @@ def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_id ret[map_id]['inputs_dependency'].append(content) elif content['content_relation_type'] == ContentRelationType.Output: ret[map_id]['outputs'].append(content) + + if is_es: + sub_map_id = content['sub_map_id'] + if sub_map_id not in ret[map_id]['sub_maps']: + ret[map_id]['sub_maps'][sub_map_id] = [] + ret[map_id]['sub_maps'][sub_map_id].append(content) elif content['content_relation_type'] == ContentRelationType.Log: ret[map_id]['logs'].append(content) else: diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index 70b67471..4bd9ad4b 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -1050,6 +1050,13 @@ def drop_proc_to_update_contents(): event.listen(Content.__table__, "before_drop", func.execute_if(dialect="postgresql")) +def get_request_sequence(): + seq = Sequence('REQUEST_ID_SEQ', schema=DEFAULT_SCHEMA_NAME, metadata=Request.metadata) + # return seq.next_value().scalar() + # return seq.next_value() + return seq + + def register_models(engine): """ Creates database tables for all models with the given engine diff --git a/main/lib/idds/orm/base/session.py b/main/lib/idds/orm/base/session.py index 11d40aa6..925d698e 100644 --- a/main/lib/idds/orm/base/session.py +++ b/main/lib/idds/orm/base/session.py @@ -228,6 +228,12 @@ def retry_if_db_connection_error(exception): for err_code in conn_err_codes: if exception.args[0].find(err_code) != -1: return True + if isinstance(exception, DatabaseException): + conn_err_codes = ('server closed the connection unexpectedly', + 'closed the connection',) + for err_code in conn_err_codes: + if exception.args[0].find(err_code) != -1: + return True return False diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index a06720db..eec653dd 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -279,7 +279,7 @@ def get_match_contents(coll_id, scope, name, content_type=None, min_id=None, max @read_session -def get_contents(scope=None, name=None, transform_id=None, coll_id=None, status=None, +def get_contents(scope=None, name=None, request_id=None, transform_id=None, coll_id=None, status=None, relation_type=None, to_json=False, session=None): """ Get content or raise a NoObject exception. 
@@ -310,6 +310,8 @@ def get_contents(scope=None, name=None, transform_id=None, coll_id=None, status= query = session.query(models.Content) + if request_id: + query = query.filter(models.Content.request_id == request_id) if transform_id: query = query.filter(models.Content.transform_id == transform_id) if coll_id: diff --git a/main/lib/idds/orm/messages.py b/main/lib/idds/orm/messages.py index e3cbe478..51697220 100644 --- a/main/lib/idds/orm/messages.py +++ b/main/lib/idds/orm/messages.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2023 +# - Wen Guan, , 2019 - 2024 """ @@ -102,7 +102,7 @@ def add_messages(messages, bulk_size=1000, session=None): @transactional_session -def update_messages(messages, bulk_size=1000, use_bulk_update_mappings=False, request_id=None, transform_id=None, session=None): +def update_messages(messages, bulk_size=1000, use_bulk_update_mappings=False, request_id=None, transform_id=None, min_request_id=None, session=None): try: if use_bulk_update_mappings: session.bulk_update_mappings(models.Message, messages) @@ -115,6 +115,9 @@ def update_messages(messages, bulk_size=1000, use_bulk_update_mappings=False, re query = session.query(models.Message) if request_id: query = query.filter(models.Message.request_id == request_id) + else: + if min_request_id: + query = query.filter(models.Message.request_id >= min_request_id) if transform_id: query = query.filter(models.Message.transform_id == transform_id) query = query.filter(models.Message.msg_id.in_(keys))\ @@ -133,7 +136,8 @@ def update_messages(messages, bulk_size=1000, use_bulk_update_mappings=False, re def retrieve_messages(bulk_size=1000, msg_type=None, status=None, source=None, destination=None, request_id=None, workload_id=None, transform_id=None, processing_id=None, fetching_id=None, - use_poll_period=False, retries=None, delay=None, session=None): + min_request_id=None, use_poll_period=False, retries=None, + delay=None, session=None): """ Retrieve up to $bulk messages. @@ -170,6 +174,9 @@ def retrieve_messages(bulk_size=1000, msg_type=None, status=None, source=None, query = query.filter(models.Message.destination.in_(destination)) if request_id is not None: query = query.filter_by(request_id=request_id) + else: + if min_request_id: + query = query.filter(models.Message.request_id >= min_request_id) if workload_id is not None: query = query.filter_by(workload_id=workload_id) if transform_id is not None: diff --git a/main/lib/idds/orm/processings.py b/main/lib/idds/orm/processings.py index 4223027f..f77e340a 100644 --- a/main/lib/idds/orm/processings.py +++ b/main/lib/idds/orm/processings.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2024 """ @@ -261,7 +261,7 @@ def get_processings_by_transform_id(transform_id=None, to_json=False, session=No @transactional_session def get_processings_by_status(status, period=None, processing_ids=[], locking=False, locking_for_update=False, bulk_size=None, submitter=None, to_json=False, by_substatus=False, only_return_id=False, - new_poll=False, update_poll=False, for_poller=False, session=None): + min_request_id=None, new_poll=False, update_poll=False, for_poller=False, session=None): """ Get processing or raise a NoObject exception. 
@@ -303,6 +303,8 @@ def get_processings_by_status(status, period=None, processing_ids=[], locking=Fa if processing_ids: query = query.filter(models.Processing.processing_id.in_(processing_ids)) + if min_request_id: + query = query.filter(models.Processing.request_id >= min_request_id) # if period: # query = query.filter(models.Processing.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=period)) if locking: diff --git a/main/lib/idds/orm/requests.py b/main/lib/idds/orm/requests.py index 5f97a9dd..b4a6208e 100644 --- a/main/lib/idds/orm/requests.py +++ b/main/lib/idds/orm/requests.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2022 +# - Wen Guan, , 2019 - 2024 """ @@ -782,7 +782,8 @@ def get_requests_by_requester(scope, name, requester, to_json=False, session=Non @transactional_session def get_requests_by_status_type(status, request_type=None, time_period=None, request_ids=[], locking=False, locking_for_update=False, bulk_size=None, to_json=False, by_substatus=False, - new_poll=False, update_poll=False, only_return_id=False, session=None): + min_request_id=None, new_poll=False, update_poll=False, only_return_id=False, + session=None): """ Get requests. @@ -823,6 +824,9 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, req query = query.filter(models.Request.request_type == request_type) if request_ids: query = query.filter(models.Request.request_id.in_(request_ids)) + else: + if min_request_id is not None: + query = query.filter(models.Request.request_id >= min_request_id) if locking: query = query.filter(models.Request.locking == RequestLocking.Idle) @@ -1018,3 +1022,26 @@ def get_active_requests(active_status=None, session=None): return tmp except Exception as error: raise error + + +@read_session +def get_min_request_id(difference=1000, session=None): + try: + seq = models.get_request_sequence() + row = session.query(seq.next_value()).one() + if row: + max_request_id = row[0] + return max_request_id - difference + else: + return 0 + except Exception: + try: + query = session.query(func.max(models.Request.request_id)) + row = query.one() + if row: + max_request_id = row[0] + return max_request_id - difference + else: + return 0 + except Exception as error: + raise error diff --git a/main/lib/idds/orm/transforms.py b/main/lib/idds/orm/transforms.py index e3ba3639..ca06e017 100644 --- a/main/lib/idds/orm/transforms.py +++ b/main/lib/idds/orm/transforms.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2022 +# - Wen Guan, , 2019 - 2024 """ @@ -347,7 +347,7 @@ def get_transforms(request_id=None, workload_id=None, transform_id=None, @transactional_session def get_transforms_by_status(status, period=None, transform_ids=[], locking=False, locking_for_update=False, bulk_size=None, to_json=False, by_substatus=False, only_return_id=False, - new_poll=False, update_poll=False, session=None): + min_request_id=None, new_poll=False, update_poll=False, session=None): """ Get transforms or raise a NoObject exception. 
@@ -386,6 +386,8 @@ def get_transforms_by_status(status, period=None, transform_ids=[], locking=Fals if transform_ids: query = query.filter(models.Transform.transform_id.in_(transform_ids)) + if min_request_id: + query = query.filter(models.Transform.request_id >= min_request_id) # if period: # query = query.filter(models.Transform.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=period)) if locking: diff --git a/main/lib/idds/tests/panda_test.py b/main/lib/idds/tests/panda_test.py index 15edea09..f778b05b 100644 --- a/main/lib/idds/tests/panda_test.py +++ b/main/lib/idds/tests/panda_test.py @@ -10,6 +10,9 @@ # os.environ['PANDA_URL'] = 'http://rubin-panda-server-dev.slac.stanford.edu:80/server/panda' # os.environ['PANDA_URL_SSL'] = 'https://rubin-panda-server-dev.slac.stanford.edu:8443/server/panda' +# os.environ['PANDA_URL'] = 'https://usdf-panda-server.slac.stanford.edu:8443/server/panda' +# os.environ['PANDA_URL_SSL'] = 'https://usdf-panda-server.slac.stanford.edu:8443/server/panda' + from pandaclient import Client # noqa E402 @@ -36,12 +39,23 @@ task_ids = [282, 322, 323, 324, 325] task_ids = [i for i in range(165243, 165277)] task_ids = [165277] +task_ids = [i for i in range(5838, 5912)] +task_ids = [165290, 165295, 165299, 165728] +task_ids = [] +task_ids = [i for i in range(166636, 166778)] +task_ids = [166253, 166254] for task_id in task_ids: print("Killing %s" % task_id) ret = Client.killTask(task_id, verbose=True) print(ret) -sys.exit(0) +# sys.exit(0) + +jediTaskID = 166303 +ret = Client.getJediTaskDetails({'jediTaskID': jediTaskID}, True, True, verbose=False) +print(ret) + +# sys.exit(0) # jobids = [52690679] jobids = [9] @@ -65,20 +79,45 @@ print(f.type) """ -# ret = Client.getFullJobStatus(ids=jobids, verbose=False) -# print(ret) +jobids = [66573292] +jobids = [67228019] +job_ids = [67228019] +ret = Client.getFullJobStatus(ids=jobids, verbose=False) +print(ret) + +jobs_list = ret[1] +# print(jobs_list) +for job_info in jobs_list: + print(job_info) + print(job_info.eventService) + print(job_info.jobStatus) + print(job_info.jobSubStatus) + print(job_info.jobsetID) + print(job_info.taskID) + print(job_info.jediTaskID) + print(job_info.Files) + for job_file in job_info.Files: + print(job_file.type) + print(job_file.lfn) +# sys.exit(0) -jediTaskID = 156668 +jediTaskID = 166303 ret = Client.get_files_in_datasets(jediTaskID, verbose=False) print(ret) print("get events") panda_ids = [{'task_id': 157016, 'panda_id': 53943290}] -panda_ids = [{'task_id': 157076, 'panda_id': 53943504}] +panda_ids = [{'task_id': 166303, 'panda_id': 66573292}] +panda_ids = [{'task_id': 166643, 'panda_id': 66988434}] +panda_ids = [{'task_id': 166943, 'panda_id': 67228019}] ret = Client.get_events_status(panda_ids, verbose=True) print(ret) -# sys.exit(0) +panda_ids = [{'task_id': 166943, 'panda_id': 67228018}] +ret = Client.get_events_status(panda_ids, verbose=True) +print(ret) + +sys.exit(0) """ jediTaskID = 10517 # 10607 diff --git a/main/lib/idds/tests/subprocess_test.py b/main/lib/idds/tests/subprocess_test.py new file mode 100644 index 00000000..7b3b4909 --- /dev/null +++ b/main/lib/idds/tests/subprocess_test.py @@ -0,0 +1,45 @@ +import subprocess + +process = subprocess.Popen(['/usr/bin/ps', '-ef'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8') + +# Read the output and error streams asynchronously +output_reader = process.stdout +error_reader = process.stderr + +output = "" +error = "" + +while True: + # Read from the output stream + output_chunk = 
output_reader.read() + if output_chunk: + output += output_chunk + else: + break + +while True: + # Read from the error stream + error_chunk = error_reader.read() + if error_chunk: + error += error_chunk + else: + break + +stdout, stderr = process.communicate() +print("stdout, stderr") +print(stdout) +print(stderr) + +output = output + stdout +error = error + stderr +# Wait for the process to finish and get the return code +return_code = process.wait() + +# Decode the output and error streams +# output = output.decode('utf-8') +# error = error.decode('utf-8') + +# Print the output, error, and return code +print("Output:", output) +print("Error:", error) +print("Return Code:", return_code) diff --git a/main/lib/idds/tests/test_class_attr.py b/main/lib/idds/tests/test_class_attr.py new file mode 100644 index 00000000..cb1cfaa9 --- /dev/null +++ b/main/lib/idds/tests/test_class_attr.py @@ -0,0 +1,39 @@ +class ExampleClass(object): + class_attr = 0 + + def __init__(self, instance_attr): + self.instance_attr = instance_attr + + +if __name__ == '__main__': + + foo = ExampleClass(1) + bar = ExampleClass(2) + + print(foo.instance_attr) + print(foo.class_attr) + + print(bar.instance_attr) + print(bar.class_attr) + + print(ExampleClass.class_attr) + + foo.class_attr = 100 + + print(foo.instance_attr) + print(foo.class_attr) + + print(bar.instance_attr) + print(bar.class_attr) + + print(ExampleClass.class_attr) + + ExampleClass.class_attr = 100 + + print(foo.instance_attr) + print(foo.class_attr) + + print(bar.instance_attr) + print(bar.class_attr) + + print(ExampleClass.class_attr) diff --git a/main/lib/idds/tests/test_datacarousel.py b/main/lib/idds/tests/test_datacarousel.py index 32ee936b..c56a4727 100644 --- a/main/lib/idds/tests/test_datacarousel.py +++ b/main/lib/idds/tests/test_datacarousel.py @@ -91,10 +91,16 @@ def get_rule_id(scope, name, src_rse, dest_rse): def get_workflow(): - scope = 'data16_13TeV' - name = 'data16_13TeV.00298862.physics_Main.daq.RAW' + # scope = 'data16_13TeV' + # name = 'data16_13TeV.00298862.physics_Main.daq.RAW' src_rse = 'NDGF-T1_DATATAPE' dest_rse = 'NDGF-T1_DATADISK' + + scope = 'mc16_13TeV' + name = 'mc16_13TeV.411332.PhHerwig7EG_ttbar_hdamp258p75_713_dil_BBFilt.merge.EVNT.e7800_e5984_tid19396149_00' + + src_rse = None + dest_rse = 'UKI-SCOTGRID-GLASGOW-CEPH_DATADISK' rule_id = get_rule_id(scope, name, src_rse, dest_rse) work = ATLASStageinWork(executable=None, arguments=None, parameters=None, setup=None, exec_type='local', sandbox=None, work_id=None, diff --git a/main/lib/idds/tests/test_domapanda.py b/main/lib/idds/tests/test_domapanda.py index 4beebdf3..004cd3d2 100644 --- a/main/lib/idds/tests/test_domapanda.py +++ b/main/lib/idds/tests/test_domapanda.py @@ -74,6 +74,7 @@ # task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' # task_queue = 'SLAC_Rubin_Merge' # task_queue = 'SLAC_TEST' + # task_queue4 = task_queue3 = task_queue2 = task_queue1 = task_queue # task_cloud = None @@ -95,6 +96,7 @@ def setup_workflow(): taskN1.name = site + "_" + taskN1.step + "_" + randStr() taskN1.dependencies = [ {"name": "00000" + str(k), + "order_id": k, "dependencies": [], "submitted": False} for k in range(6) ] @@ -105,18 +107,21 @@ def setup_workflow(): taskN2.dependencies = [ { "name": "000010", + "order_id": 0, "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, {"task": taskN1.name, "inputname": "000002", "available": False}], "submitted": False }, { "name": "000011", + "order_id": 1, "dependencies": [{"task": taskN1.name, "inputname": "000001", 
"available": False}, {"task": taskN1.name, "inputname": "000002", "available": False}], "submitted": False }, { "name": "000012", + "order_id": 2, "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, {"task": taskN1.name, "inputname": "000002", "available": False}], "submitted": False @@ -129,28 +134,33 @@ def setup_workflow(): taskN3.dependencies = [ { "name": "000020", + "order_id": 0, "dependencies": [], "submitted": False }, { "name": "000021", + "order_id": 1, "dependencies": [{"task": taskN2.name, "inputname": "000010", "available": False}, {"task": taskN2.name, "inputname": "000011", "available": False}], "submitted": False }, { "name": "000022", + "order_id": 2, "dependencies": [{"task": taskN2.name, "inputname": "000011", "available": False}, {"task": taskN2.name, "inputname": "000012", "available": False}], "submitted": False }, { "name": "000023", + "order_id": 3, "dependencies": [], "submitted": False }, { "name": "000024", + "order_id": 4, "dependencies": [{"task": taskN3.name, "inputname": "000021", "available": False}, {"task": taskN3.name, "inputname": "000023", "available": False}], "submitted": False @@ -162,6 +172,7 @@ def setup_workflow(): taskN4.name = site + "_" + taskN4.step + "_" + randStr() taskN4.dependencies = [ {"name": "00004" + str(k), + "order_id": k, "dependencies": [], "submitted": False} for k in range(6) ] @@ -171,6 +182,7 @@ def setup_workflow(): taskN5.name = site + "_" + taskN5.step + "_" + randStr() taskN5.dependencies = [ {"name": "00005" + str(k), + "order_id": k, "dependencies": [], "submitted": False} for k in range(6) ] diff --git a/main/lib/idds/tests/test_domapanda_es.py b/main/lib/idds/tests/test_domapanda_es.py new file mode 100644 index 00000000..3aca6623 --- /dev/null +++ b/main/lib/idds/tests/test_domapanda_es.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Sergey Padolski, , 2021 +# - Wen Guan, , 2023 + + +""" +Test client. 
+""" + +import json # noqa F401 +import sys +import string +import random +import time + +# import traceback + +# from rucio.client.client import Client as Rucio_Client +# from rucio.common.exception import CannotAuthenticate + +# from idds.client.client import Client +from idds.client.clientmanager import ClientManager +# from idds.common.constants import RequestType, RequestStatus +from idds.common.utils import get_rest_host +# from idds.tests.common import get_example_real_tape_stagein_request +# from idds.tests.common import get_example_prodsys2_tape_stagein_request + +# from idds.workflowv2.work import Work, Parameter, WorkStatus +# from idds.workflowv2.workflow import Condition, Workflow +from idds.workflowv2.workflow import Workflow +# from idds.atlas.workflowv2.atlasstageinwork import ATLASStageinWork +from idds.doma.workflowv2.domapandawork import DomaPanDAWork + + +if len(sys.argv) > 1 and sys.argv[1] == "in2p3": + site = 'in2p3' + task_cloud = 'EU' + # task_queue = 'CC-IN2P3_TEST' + task_queue = 'CC-IN2P3_Rubin' + task_queue1 = 'CC-IN2P3_Rubin_Medium' + task_queue2 = 'CC-IN2P3_Rubin_Himem' + task_queue3 = 'CC-IN2P3_Rubin_Extra_Himem' + task_queue4 = 'CC-IN2P3_Rubin_Merge' +elif len(sys.argv) > 1 and sys.argv[1] == "lancs": + site = 'lancs' + task_cloud = 'EU' + # task_queue = 'LANCS_TEST' + task_queue = 'LANCS_Rubin' + task_queue1 = 'LANCS_Rubin_Medium' + task_queue2 = 'LANCS_Rubin_Himem' + task_queue3 = 'LANCS_Rubin_Extra_Himem' + task_queue3 = 'LANCS_Rubin_Himem' + task_queue4 = 'LANCS_Rubin_Merge' +else: + site = 'slac' + # task_cloud = 'LSST' + task_cloud = 'US' + + task_queue = 'DOMA_LSST_GOOGLE_TEST' + # task_queue = 'DOMA_LSST_GOOGLE_MERGE' + # task_queue = 'SLAC_TEST' + # task_queue = 'DOMA_LSST_SLAC_TEST' + task_queue = 'SLAC_Rubin' + task_queue1 = 'SLAC_Rubin_Medium' + task_queue2 = 'SLAC_Rubin_Himem' + task_queue3 = 'SLAC_Rubin_Extra_Himem' + task_queue4 = 'SLAC_Rubin_Merge' + # task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' + # task_queue = 'SLAC_Rubin_Merge' + task_queue = 'SLAC_TEST' + task_queue4 = task_queue3 = task_queue2 = task_queue1 = task_queue + +# task_cloud = None + + +def randStr(chars=string.ascii_lowercase + string.digits, N=10): + return ''.join(random.choice(chars) for _ in range(N)) + + +class PanDATask(object): + name = None + step = None + dependencies = [] + + +def setup_workflow(): + + es_map = {} + taskN1 = PanDATask() + taskN1.step = "step1" + taskN1.name = site + "_" + taskN1.step + "_" + randStr() + taskN1.dependencies = [ + {"name": "00000" + str(k), + "order_id": k, + "dependencies": [], + "submitted": False} for k in range(6) + ] + + es_map[taskN1.step] = {str(item["order_id"]): item["name"] for item in taskN1.dependencies} + + taskN2 = PanDATask() + taskN2.step = "step2" + taskN2.name = site + "_" + taskN2.step + "_" + randStr() + taskN2.dependencies = [ + { + "name": "000010", + "order_id": 0, + "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, + {"task": taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + }, + { + "name": "000011", + "order_id": 1, + "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, + {"task": taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + }, + { + "name": "000012", + "order_id": 2, + "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, + {"task": taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + } + ] + + es_map[taskN2.step] = 
{str(item["order_id"]): item["name"] for item in taskN2.dependencies} + + taskN3 = PanDATask() + taskN3.step = "step3" + taskN3.name = site + "_" + taskN3.step + "_" + randStr() + taskN3.dependencies = [ + { + "name": "000020", + "order_id": 0, + "dependencies": [], + "submitted": False + }, + { + "name": "000021", + "order_id": 1, + "dependencies": [{"task": taskN2.name, "inputname": "000010", "available": False}, + {"task": taskN2.name, "inputname": "000011", "available": False}], + "submitted": False + }, + { + "name": "000022", + "order_id": 2, + "dependencies": [{"task": taskN2.name, "inputname": "000011", "available": False}, + {"task": taskN2.name, "inputname": "000012", "available": False}], + "submitted": False + }, + { + "name": "000023", + "order_id": 3, + "dependencies": [], + "submitted": False + }, + { + "name": "000024", + "order_id": 4, + "groups": taskN3.name, + "dependencies": [{"task": taskN3.name, "inputname": "000021", "available": False}, + {"task": taskN3.name, "inputname": "000023", "available": False}], + "submitted": False + }, + ] + + es_map[taskN3.step] = {str(item["order_id"]): item["name"] for item in taskN3.dependencies} + + taskN4 = PanDATask() + taskN4.step = "step4" + taskN4.name = site + "_" + taskN4.step + "_" + randStr() + taskN4.dependencies = [ + {"name": "00004" + str(k), + "order_id": k, + "dependencies": [], + "submitted": False} for k in range(6) + ] + + es_map[taskN4.step] = {str(item["order_id"]): item["name"] for item in taskN4.dependencies} + + taskN5 = PanDATask() + taskN5.step = "step5" + taskN5.name = site + "_" + taskN5.step + "_" + randStr() + taskN5.dependencies = [ + {"name": "00005" + str(k), + "order_id": k, + "dependencies": [], + "submitted": False} for k in range(6) + ] + + es_map[taskN5.step] = {str(item["order_id"]): item["name"] for item in taskN5.dependencies} + + # print(json.dumps(es_map)) + # raise + # executable = "wget https://wguan-wisc.web.cern.ch/wguan-wisc/doma_es_executor.py; chmod +x doma_es_executor.py; ./doma_es_executor.py echo ${IN/L}" + # executable = "export RUBIN_ES_CORES=4; echo; RUBIN_ES_MAP=%s; echo ${IN/L}" % json.dumps(es_map) + + es_map_file = "/sdf/data/rubin/panda_jobs/panda_env_pilot/test_rubin_es_map.json" + executable = "export RUBIN_ES_CORES=4; echo; RUBIN_ES_MAP_FILE=%s; echo ${IN/L}" % es_map_file + + work1 = DomaPanDAWork(executable=executable, + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], dependency_map=taskN1.dependencies, + task_name=taskN1.name, task_queue=task_queue, + encode_command_line=True, + task_priority=981, + es=True, + es_label=taskN1.step, + max_events_per_job=100, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + work2 = DomaPanDAWork(executable=executable, + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#2'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#2'}], + log_collections=[], dependency_map=taskN2.dependencies, + task_name=taskN2.name, task_queue=task_queue1, + encode_command_line=True, + task_priority=881, + es=True, + es_label=taskN2.step, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": 
"local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + work3 = DomaPanDAWork(executable=executable, + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#3'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#3'}], + log_collections=[], dependency_map=taskN3.dependencies, + task_name=taskN3.name, task_queue=task_queue2, + encode_command_line=True, + task_priority=781, + es=True, + es_label=taskN3.step, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + + work4 = DomaPanDAWork(executable=executable, + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], dependency_map=taskN4.dependencies, + task_name=taskN4.name, task_queue=task_queue3, + encode_command_line=True, + task_priority=981, + es=True, + es_label=taskN4.step, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + + work5 = DomaPanDAWork(executable=executable, + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], dependency_map=taskN5.dependencies, + task_name=taskN5.name, task_queue=task_queue4, + encode_command_line=True, + task_priority=981, + es=True, + es_label=taskN5.step, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + + pending_time = 12 + # pending_time = None + workflow = Workflow(pending_time=pending_time) + workflow.add_work(work1) + workflow.add_work(work2) + workflow.add_work(work3) + workflow.add_work(work4) + workflow.add_work(work5) + workflow.name = site + "_" + 'test_workflow.idds.%s.test' % time.time() + return workflow + + +if __name__ == '__main__': + host = get_rest_host() + workflow = setup_workflow() + + wm = ClientManager(host=host) + # wm.set_original_user(user_name="wguandev") + request_id = wm.submit(workflow, use_dataset_name=False) + print(request_id) diff --git a/main/lib/idds/tests/test_domapanda_small_mem.py b/main/lib/idds/tests/test_domapanda_small_mem.py new file mode 100644 index 00000000..7505a010 --- /dev/null +++ b/main/lib/idds/tests/test_domapanda_small_mem.py @@ -0,0 +1,279 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + + +""" +Test client. 
+""" + +import sys +import string +import random +import time + +# import traceback + +# from rucio.client.client import Client as Rucio_Client +# from rucio.common.exception import CannotAuthenticate + +# from idds.client.client import Client +from idds.client.clientmanager import ClientManager +# from idds.common.constants import RequestType, RequestStatus +from idds.common.utils import get_rest_host +# from idds.tests.common import get_example_real_tape_stagein_request +# from idds.tests.common import get_example_prodsys2_tape_stagein_request + +# from idds.workflowv2.work import Work, Parameter, WorkStatus +# from idds.workflowv2.workflow import Condition, Workflow +from idds.workflowv2.workflow import Workflow +# from idds.atlas.workflowv2.atlasstageinwork import ATLASStageinWork +from idds.doma.workflowv2.domapandawork import DomaPanDAWork + + +if len(sys.argv) > 1 and sys.argv[1] == "in2p3": + site = 'in2p3' + task_cloud = 'EU' + # task_queue = 'CC-IN2P3_TEST' + task_queue = 'CC-IN2P3_Rubin' + task_queue1 = 'CC-IN2P3_Rubin_Medium' + task_queue2 = 'CC-IN2P3_Rubin_Himem' + task_queue3 = 'CC-IN2P3_Rubin_Extra_Himem' + task_queue4 = 'CC-IN2P3_Rubin_Merge' +elif len(sys.argv) > 1 and sys.argv[1] == "lancs": + site = 'lancs' + task_cloud = 'EU' + # task_queue = 'LANCS_TEST' + task_queue = 'LANCS_Rubin' + task_queue1 = 'LANCS_Rubin_Medium' + task_queue2 = 'LANCS_Rubin_Himem' + task_queue3 = 'LANCS_Rubin_Extra_Himem' + task_queue3 = 'LANCS_Rubin_Himem' + task_queue4 = 'LANCS_Rubin_Merge' +else: + site = 'slac' + # task_cloud = 'LSST' + task_cloud = 'US' + + task_queue = 'DOMA_LSST_GOOGLE_TEST' + # task_queue = 'DOMA_LSST_GOOGLE_MERGE' + # task_queue = 'SLAC_TEST' + # task_queue = 'DOMA_LSST_SLAC_TEST' + task_queue = 'SLAC_Rubin' + task_queue1 = 'SLAC_Rubin_Medium' + task_queue2 = 'SLAC_Rubin_Himem' + task_queue3 = 'SLAC_Rubin_Extra_Himem' + task_queue4 = 'SLAC_Rubin_Merge' + # task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' + # task_queue = 'SLAC_Rubin_Merge' + # task_queue = 'SLAC_TEST' + +# task_cloud = None + + +def randStr(chars=string.ascii_lowercase + string.digits, N=10): + return ''.join(random.choice(chars) for _ in range(N)) + + +class PanDATask(object): + name = None + step = None + dependencies = [] + + +def setup_workflow(): + + taskN1 = PanDATask() + taskN1.step = "step1" + taskN1.name = site + "_" + taskN1.step + "_" + randStr() + taskN1.dependencies = [ + {"name": "00000" + str(k), + "dependencies": [], + "submitted": False} for k in range(6) + ] + + taskN2 = PanDATask() + taskN2.step = "step2" + taskN2.name = site + "_" + taskN2.step + "_" + randStr() + taskN2.dependencies = [ + { + "name": "000010", + "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, + {"task": taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + }, + { + "name": "000011", + "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, + {"task": taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + }, + { + "name": "000012", + "dependencies": [{"task": taskN1.name, "inputname": "000001", "available": False}, + {"task": taskN1.name, "inputname": "000002", "available": False}], + "submitted": False + } + ] + + taskN3 = PanDATask() + taskN3.step = "step3" + taskN3.name = site + "_" + taskN3.step + "_" + randStr() + taskN3.dependencies = [ + { + "name": "000020", + "dependencies": [], + "submitted": False + }, + { + "name": "000021", + "dependencies": [{"task": taskN2.name, "inputname": "000010", 
"available": False}, + {"task": taskN2.name, "inputname": "000011", "available": False}], + "submitted": False + }, + { + "name": "000022", + "dependencies": [{"task": taskN2.name, "inputname": "000011", "available": False}, + {"task": taskN2.name, "inputname": "000012", "available": False}], + "submitted": False + }, + { + "name": "000023", + "dependencies": [], + "submitted": False + }, + { + "name": "000024", + "dependencies": [{"task": taskN3.name, "inputname": "000021", "available": False}, + {"task": taskN3.name, "inputname": "000023", "available": False}], + "submitted": False + }, + ] + + taskN4 = PanDATask() + taskN4.step = "step4" + taskN4.name = site + "_" + taskN4.step + "_" + randStr() + taskN4.dependencies = [ + {"name": "00004" + str(k), + "dependencies": [], + "submitted": False} for k in range(6) + ] + + taskN5 = PanDATask() + taskN5.step = "step5" + taskN5.name = site + "_" + taskN5.step + "_" + randStr() + taskN5.dependencies = [ + {"name": "00005" + str(k), + "dependencies": [], + "submitted": False} for k in range(6) + ] + + work1 = DomaPanDAWork(executable='echo', + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], dependency_map=taskN1.dependencies, + task_name=taskN1.name, task_queue=task_queue, + encode_command_line=True, + task_priority=981, + task_rss=3, task_rss_retry_offset=3, task_rss_retry_step=1000, task_rss_max=8000, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + work2 = DomaPanDAWork(executable='echo', + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#2'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#2'}], + log_collections=[], dependency_map=taskN2.dependencies, + task_name=taskN2.name, task_queue=task_queue1, + encode_command_line=True, + task_priority=881, + task_rss=3, task_rss_retry_offset=3, task_rss_retry_step=1000, task_rss_max=8000, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + work3 = DomaPanDAWork(executable='echo', + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#3'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#3'}], + log_collections=[], dependency_map=taskN3.dependencies, + task_name=taskN3.name, task_queue=task_queue2, + encode_command_line=True, + task_priority=781, + task_rss=3, task_rss_retry_offset=3, task_rss_retry_step=1000, task_rss_max=8000, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + + work4 = DomaPanDAWork(executable='echo', + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], dependency_map=taskN4.dependencies, + task_name=taskN4.name, task_queue=task_queue3, + encode_command_line=True, + task_priority=981, + task_rss=3, task_rss_retry_offset=3, task_rss_retry_step=1000, 
task_rss_max=8000, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + + work5 = DomaPanDAWork(executable='echo', + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], dependency_map=taskN5.dependencies, + task_name=taskN5.name, task_queue=task_queue4, + encode_command_line=True, + task_priority=981, + task_rss=3, task_rss_retry_offset=3, task_rss_retry_step=1000, task_rss_max=8000, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + + pending_time = 12 + # pending_time = None + workflow = Workflow(pending_time=pending_time) + workflow.add_work(work1) + workflow.add_work(work2) + workflow.add_work(work3) + workflow.add_work(work4) + workflow.add_work(work5) + workflow.name = site + "_" + 'test_workflow.idds.%s.test' % time.time() + return workflow + + +if __name__ == '__main__': + host = get_rest_host() + workflow = setup_workflow() + + wm = ClientManager(host=host) + # wm.set_original_user(user_name="wguandev") + request_id = wm.submit(workflow, use_dataset_name=False) + print(request_id) diff --git a/main/lib/idds/tests/test_migrate_requests.py b/main/lib/idds/tests/test_migrate_requests.py index cb74006d..57efd7ff 100644 --- a/main/lib/idds/tests/test_migrate_requests.py +++ b/main/lib/idds/tests/test_migrate_requests.py @@ -46,6 +46,7 @@ def migrate(): # cm1 = ClientManager(host=doma_host) cm1 = ClientManager(host=atlas_host) # cm1 = ClientManager(host=slac_k8s_dev_host) + cm1 = ClientManager(host=slac_k8s_prod_host) # reqs = cm1.get_requests(request_id=290) # old_request_id = 298163 # old_request_id = 350723 @@ -76,13 +77,14 @@ def migrate(): # for old_request_id in [152]: # for old_request_id in [60]: # noqa E115 # for old_request_id in [200]: # noqa E115 + old_request_ids = [635] for old_request_id in old_request_ids: # noqa E115 # doma 183 reqs = cm1.get_requests(request_id=old_request_id, with_metadata=True) cm2 = ClientManager(host=dev_host) # cm2 = ClientManager(host=doma_host) # cm2 = ClientManager(host=atlas_host) - # cm2 = ClientManager(host=slac_k8s_dev_host) + cm2 = ClientManager(host=slac_k8s_dev_host) # cm2 = ClientManager(host=slac_k8s_prod_host) # cm2 = ClientManager(host=cern_k8s_dev_host) # print(reqs) diff --git a/main/lib/idds/tests/test_sequence.py b/main/lib/idds/tests/test_sequence.py new file mode 100644 index 00000000..71b33afa --- /dev/null +++ b/main/lib/idds/tests/test_sequence.py @@ -0,0 +1,4 @@ +from idds.orm import requests + +ret = requests.get_min_request_id() +print(ret) diff --git a/main/lib/idds/version.py b/main/lib/idds/version.py index 7c2e8610..777f1776 100644 --- a/main/lib/idds/version.py +++ b/main/lib/idds/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2019 - 2021 -release_version = "0.11.5" +release_version = "2.0.9" diff --git a/main/tools/env/environment.yml b/main/tools/env/environment.yml index 9602d5db..c27e9ad0 100644 --- a/main/tools/env/environment.yml +++ b/main/tools/env/environment.yml @@ -1,6 +1,6 @@ name: iDDS dependencies: -- python==3.9.7 +- python==3.9.7 - pip - pip: - argcomplete @@ -31,7 +31,3 @@ dependencies: - deepdiff - pyzmq - oic - - lsst-ctrl-bps - - 
idds-common==0.11.5 - - idds-workflow==0.11.5 - - idds-client==0.11.5 diff --git a/main/tools/env/install_idds_example.sh b/main/tools/env/install_idds_example.sh new file mode 100644 index 00000000..d3b2d807 --- /dev/null +++ b/main/tools/env/install_idds_example.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# as root +yum install -y httpd.x86_64 conda gridsite mod_ssl.x86_64 httpd-devel.x86_64 gcc.x86_64 supervisor.noarch +# yum install -y gfal2-plugin-gridftp gfal2-plugin-file.x86_64 gfal2-plugin-http.x86_64 gfal2-plugin-xrootd.x86_64 gfal2-python.x86_64 gfal2-python3.x86_64 gfal2-all.x86_64 +# conda install -c conda-forge python-gfal2 +# pip install requests SQLAlchemy urllib3 retrying mod_wsgi flask futures stomp.py cx-Oracle unittest2 pep8 flake8 pytest nose sphinx recommonmark sphinx-rtd-theme nevergrad + +mkdir /opt/idds +mkdir /opt/idds_source +mkdir /opt/idds +mkdir /var/log/idds +mkdir /var/log/idds/wsgisocks +mkdir /tmp/idds/wsgisocks +chown atlpilo1 -R /opt/idds +chown atlpilo1 -R /opt/idds_source +chown atlpilo1 /var/log/idds +chown apache -R /var/log/idds/wsgisocks +chown apache -R /tmp/idds/wsgisocks + +cd /opt/idds_source +# rm -fr *; cp -r /afs/cern.ch/user/w/wguan/workdisk/iDDS/* .;python setup.py install --old-and-unmanageable +# git clone @github_idds@ /opt/idds_source + +wget https://raw.githubusercontent.com/HSF/iDDS/master/main/tools/env/environment.yml +source /etc/profile.d/conda.sh +conda env create --prefix=/opt/idds -f environment.yml +conda activate /opt/idds +# conda install -c conda-forge python-gfal2 + +pip install rucio-clients-atlas rucio-clients panda-client-light panda-client +# root ca.crt to /opt/idds/etc/ca.crt + +pip install requests SQLAlchemy urllib3 retrying mod_wsgi flask futures stomp.py cx-Oracle unittest2 pep8 flake8 pytest nose sphinx recommonmark sphinx-rtd-theme nevergrad + pip install psycopg2-binary + +# add "auth_type = x509_proxy" to /opt/idds/etc/rucio.cfg + +# python setup.py install --old-and-unmanageable +# cp /opt/idds/etc/idds/rest/httpd-idds-443-py36-cc7.conf.install_template /etc/httpd/conf.d/httpd-idds-443-py36-cc7.conf + +# scp wguan@aipanda102:/opt/idds/etc/rucio.cfg /opt/idds/etc/rucio.cfg +# scp wguan@aipanda102:/etc/httpd/conf.d/httpd-idds-443-py310-al.conf /etc/httpd/conf.d/httpd-idds-443-py310-al.conf +# scp wguan@aipanda102:/etc/httpd/conf.d/httpd-idds-443-py39-cc7.conf /etc/httpd/conf.d/httpd-idds-443-py39-cc7.conf +# mv /etc/httpd/conf.d/httpd-idds-443-py310-al.conf /etc/httpd/conf.d/httpd-idds-443-py310-al.conf.bac +# scp wguan@aipanda102:/etc/supervisord.d/idds.ini /etc/supervisord.d/idds.ini +# scp wguan@aipanda102:/opt/idds/etc/idds/idds.cfg /opt/idds/etc/idds/idds.cfg +# scp wguan@aipanda102:/opt/idds/etc/idds/rest/gacl /opt/idds/etc/idds/rest/gacl +# scp wguan@aipanda102:/opt/idds/etc/idds/auth/auth.cfg /opt/idds/etc/idds/auth/auth.cfg + +chown atlpilo1 -R /opt/idds +chown atlpilo1 -R /opt/idds_source + +cp /opt/idds/etc/idds/idds.cfg.template /opt/idds/etc/idds/idds.cfg + +# comment /etc/httpd/conf.d/ssl.conf "Listen 443 https" +systemctl restart httpd.service +systemctl enable httpd.service + + +cp /opt/idds/etc/idds/supervisord.d/idds.ini /etc/supervisord.d/idds.ini +cp /opt/idds_source/main/etc/idds/supervisord.d/idds.ini /etc/supervisord.d/idds.ini + +systemctl start supervisord +systemctl status supervisord +systemctl enable supervisord +#supervisorctl status +#supervisorctl start all +#supervisorctl stop all + + +#condor +yum install 
https://research.cs.wisc.edu/htcondor/repo/current/htcondor-release-current.el9.noarch.rpm +# cp /etc/yum.repos.d/htcondor* /etc/yum-puppet.repos.d/ +# yum install -y condor.x86_64 condor-python.x86_64 +yum install -y condor.x86_64 python3-condor.x86_64 +#firewall-cmd --zone=public --add-port=9618/tcp --permanent +firewall-cmd --zone=public --add-port=9618/udp --permanent +firewall-cmd --zone=public --add-port=9600-9700/tcp --permanent +firewall-cmd --reload +cp /opt/idds_source/main/etc/condor/submitter/00personal_condor.config /etc/condor/config.d/ +systemctl enable condor +systemctl start condor +systemctl status condor + + +#docker https://docs.docker.com/engine/install/linux-postinstall/ +groupadd docker +yum install docker +systemctl start docker +systemctl status docker +systemctl enable docker +usermod -aG docker $(whoami) +usermod -aG docker nobody # for condor jobs which are running in this account + +# shpinx https://sphinx-rtd-tutorial.readthedocs.io/en/latest/sphinx-config.html +#[wguan@lxplus723 docs]$ make html +pip install --upgrade sphinx +pip install --upgrade sphinx-rtd-theme +sphinx-quickstart +make clean +make html +sphinx-apidoc -f -o ./source/codes/main/ ../main/lib/idds +sphinx-apidoc -f -o ./source/codes/common/ ../common/lib/idds +sphinx-apidoc -f -o ./source/codes/client/ ../client/lib/idds +sphinx-apidoc -f -o ./source/codes/workflow/ ../workflow/lib/idds +sphinx-apidoc -f -o ./source/codes/atlas/ ../atlas/lib/idds +sphinx-apidoc -f -o ./source/codes/doma/ ../doma/lib/idds + + +yum install fetch-crl.noarch +yum install lcg-CA + + +yum install redis +systemctl start redis +systemctl enable redis diff --git a/main/tools/env/install_idds_full.sh b/main/tools/env/install_idds_full.sh index cec5ea3f..224a69b9 100644 --- a/main/tools/env/install_idds_full.sh +++ b/main/tools/env/install_idds_full.sh @@ -26,7 +26,7 @@ conda env create --prefix=/opt/idds -f main/tools/env/environment.yml conda activate /opt/idds conda install -c conda-forge python-gfal2 -pip install rucio-clients-atlas rucio-clients panda-client-light +pip install rucio-clients-atlas rucio-clients panda-client-light panda-client # root ca.crt to /opt/idds/etc/ca.crt pip install requests SQLAlchemy urllib3 retrying mod_wsgi flask futures stomp.py cx-Oracle unittest2 pep8 flake8 pytest nose sphinx recommonmark sphinx-rtd-theme nevergrad @@ -37,6 +37,15 @@ pip install requests SQLAlchemy urllib3 retrying mod_wsgi flask futures stomp.py # python setup.py install --old-and-unmanageable # cp /opt/idds/etc/idds/rest/httpd-idds-443-py36-cc7.conf.install_template /etc/httpd/conf.d/httpd-idds-443-py36-cc7.conf +# scp wguan@aipanda102:/opt/idds/etc/rucio.cfg /opt/idds/etc/rucio.cfg +# scp wguan@aipanda102:/etc/httpd/conf.d/httpd-idds-443-py310-al.conf /etc/httpd/conf.d/httpd-idds-443-py310-al.conf +# scp wguan@aipanda102:/etc/httpd/conf.d/httpd-idds-443-py39-cc7.conf /etc/httpd/conf.d/httpd-idds-443-py39-cc7.conf +# mv /etc/httpd/conf.d/httpd-idds-443-py310-al.conf /etc/httpd/conf.d/httpd-idds-443-py310-al.conf.bac +# scp wguan@aipanda102:/etc/supervisord.d/idds.ini /etc/supervisord.d/idds.ini +# scp wguan@aipanda102:/opt/idds/etc/idds/idds.cfg /opt/idds/etc/idds/idds.cfg +# scp wguan@aipanda102:/opt/idds/etc/idds/rest/gacl /opt/idds/etc/idds/rest/gacl +# scp wguan@aipanda102:/opt/idds/etc/idds/auth/auth.cfg /opt/idds/etc/idds/auth/auth.cfg + chown atlpilo1 -R /opt/idds chown atlpilo1 -R /opt/idds_source @@ -59,7 +68,10 @@ systemctl enable supervisord #condor -yum install -y condor.x86_64 condor-python.x86_64 +yum 
install https://research.cs.wisc.edu/htcondor/repo/current/htcondor-release-current.el9.noarch.rpm +# cp /etc/yum.repos.d/htcondor* /etc/yum-puppet.repos.d/ +# yum install -y condor.x86_64 condor-python.x86_64 +yum install -y condor.x86_64 python3-condor.x86_64 #firewall-cmd --zone=public --add-port=9618/tcp --permanent firewall-cmd --zone=public --add-port=9618/udp --permanent firewall-cmd --zone=public --add-port=9600-9700/tcp --permanent diff --git a/monitor/data/conf.js b/monitor/data/conf.js index 2b82bd85..190646a6 100644 --- a/monitor/data/conf.js +++ b/monitor/data/conf.js @@ -1,9 +1,9 @@ var appConfig = { - 'iddsAPI_request': "https://lxplus812.cern.ch:443/idds/monitor_request/null/null", - 'iddsAPI_transform': "https://lxplus812.cern.ch:443/idds/monitor_transform/null/null", - 'iddsAPI_processing': "https://lxplus812.cern.ch:443/idds/monitor_processing/null/null", - 'iddsAPI_request_detail': "https://lxplus812.cern.ch:443/idds/monitor/null/null/true/false/false", - 'iddsAPI_transform_detail': "https://lxplus812.cern.ch:443/idds/monitor/null/null/false/true/false", - 'iddsAPI_processing_detail': "https://lxplus812.cern.ch:443/idds/monitor/null/null/false/false/true" + 'iddsAPI_request': "https://lxplus811.cern.ch:443/idds/monitor_request/null/null", + 'iddsAPI_transform': "https://lxplus811.cern.ch:443/idds/monitor_transform/null/null", + 'iddsAPI_processing': "https://lxplus811.cern.ch:443/idds/monitor_processing/null/null", + 'iddsAPI_request_detail': "https://lxplus811.cern.ch:443/idds/monitor/null/null/true/false/false", + 'iddsAPI_transform_detail': "https://lxplus811.cern.ch:443/idds/monitor/null/null/false/true/false", + 'iddsAPI_processing_detail': "https://lxplus811.cern.ch:443/idds/monitor/null/null/false/false/true" } diff --git a/monitor/lib/idds/monitor/version.py b/monitor/lib/idds/monitor/version.py index 7c2e8610..777f1776 100644 --- a/monitor/lib/idds/monitor/version.py +++ b/monitor/lib/idds/monitor/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2019 - 2021 -release_version = "0.11.5" +release_version = "2.0.9" diff --git a/website/lib/idds/website/version.py b/website/lib/idds/website/version.py index 7c2e8610..777f1776 100644 --- a/website/lib/idds/website/version.py +++ b/website/lib/idds/website/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2019 - 2021 -release_version = "0.11.5" +release_version = "2.0.9" diff --git a/workflow/lib/idds/workflow/version.py b/workflow/lib/idds/workflow/version.py index a09efc7f..777f1776 100644 --- a/workflow/lib/idds/workflow/version.py +++ b/workflow/lib/idds/workflow/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2019 - 2021 -release_version = "0.7.7" +release_version = "2.0.9" diff --git a/workflow/lib/idds/workflowv2/tree.py b/workflow/lib/idds/workflowv2/tree.py index 004d4871..0a52d099 100644 --- a/workflow/lib/idds/workflowv2/tree.py +++ b/workflow/lib/idds/workflowv2/tree.py @@ -37,7 +37,9 @@ def __init__(self, name, children=None, parents=None, index=None, level=None, gr self.children = children self.parents = parents self.groups = [] + self.order_groups = [] self.group_id = None + self.order_id = None self.disable_grouping = disable_grouping self.setup_logger() @@ -101,6 +103,37 @@ def group_id(self, group_id): if not parent.group_id: parent.add_group(group_id) + def add_order_group(self, group): + if group is not None and group not in self.order_groups: + self.order_groups.append(group) + for child in self.children: + if child.order_id is None: + child.add_order_group(group) + for parent in self.parents: + if parent.order_id 
is None:
+                parent.add_order_group(group)
+
+    def get_potential_order_id(self):
+        if not self.order_groups:
+            return 0
+        return "_".join([str(i) for i in sorted(self.order_groups)])
+
+    @property
+    def order_id(self):
+        return self.__order_id
+
+    @order_id.setter
+    def order_id(self, value):
+        self.__order_id = value
+        if value is not None:
+            self.__order_id = int(value)
+        for child in self.children:
+            if child.order_id is None:
+                child.add_order_group(value)
+        for parent in self.parents:
+            if parent.order_id is None:
+                parent.add_order_group(value)
+
     def get_node_name(self):
         return get_node_name(self.index, self.name)
diff --git a/workflow/lib/idds/workflowv2/version.py b/workflow/lib/idds/workflowv2/version.py
index a09efc7f..777f1776 100644
--- a/workflow/lib/idds/workflowv2/version.py
+++ b/workflow/lib/idds/workflowv2/version.py
@@ -9,4 +9,4 @@
 # - Wen Guan, , 2019 - 2021


-release_version = "0.7.7"
+release_version = "2.0.9"
diff --git a/workflow/tools/env/environment.yml b/workflow/tools/env/environment.yml
index 581ae0ab..78877365 100644
--- a/workflow/tools/env/environment.yml
+++ b/workflow/tools/env/environment.yml
@@ -5,4 +5,4 @@ dependencies:
 - pip:
   - anytree
   - networkx
-  - idds-common==0.11.5
+  - idds-common==2.0.9
\ No newline at end of file
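The new test_domapanda_es.py above attaches an order_id to every dependency entry and collapses each task's dependency_map into a per-step {order_id: pseudo-input-name} dictionary, which the EventService payload then reads through RUBIN_ES_MAP_FILE (with es=True, es_label and max_events_per_job set on the DomaPanDAWork). A small sketch of that mapping step follows; build_es_map and the output filename are illustrative names, only the dictionary shape is taken from the test.

```python
# Illustrative sketch of the per-step order_id -> pseudo-input mapping that
# test_domapanda_es.py dumps for the EventService payload; the helper and
# file name here are made up.
import json


def build_es_map(steps):
    """steps: {step_name: dependency_map}; each entry must carry a unique order_id."""
    return {step: {str(item["order_id"]): item["name"] for item in deps}
            for step, deps in steps.items()}


if __name__ == '__main__':
    deps = [{"name": "00000%d" % k, "order_id": k, "dependencies": [], "submitted": False}
            for k in range(6)]
    es_map = build_es_map({"step1": deps})
    # The payload only needs the file location, e.g.
    #   export RUBIN_ES_MAP_FILE=/path/to/test_rubin_es_map.json
    with open("test_rubin_es_map.json", "w") as fp:
        json.dump(es_map, fp)
    print(es_map)
```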