From a11097a909e3f9f3cf6916c24f10166f645cbb1f Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 14:39:06 -0500 Subject: [PATCH 01/23] added some null value condition checking on pr files for error: ```bash Traceback (most recent call last): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 451, in trace_task R = retval = fun(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ return self.run(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/tasks.py", line 18, in process_pull_request_files pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/core.py", line 68, in pull_request_files_model pr_file_rows += [{ ^^ File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/core.py", line 68, in pr_file_rows += [{ ^^ File "/home/ubuntu/github/augur/augur/tasks/github/util/gh_graphql_entities.py", line 344, in __iter__ coreData['totalCount'] ~~~~~~~~^^^^^^^^^^^^^^ TypeError: 'NoneType' object is not subscriptable ``` --- augur/tasks/github/util/gh_graphql_entities.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index cb5df455b7..bba45b0d07 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,7 +341,11 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - coreData['totalCount'] + if coreData['totalCount']: + self.logger.info(f"pr file core data obtained") + else: + self.logginer.info(f"Helen, the ghost in our machine, did not get a numerical result for pr file core data (value): {data} \n Zero value assigned.") + coreData['totalCount'] = 0 except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error( From 793e1007353bad9e9c735701cdf0e82d919435ba Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 14:49:12 -0500 Subject: [PATCH 02/23] small edit --- augur/tasks/github/util/gh_graphql_entities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index bba45b0d07..5e55df9b16 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -342,9 +342,9 @@ def __iter__(self): #Check to make sure we have data if coreData['totalCount']: - self.logger.info(f"pr file core data obtained") + self.logger.info(f"... core data obtained") else: - self.logginer.info(f"Helen, the ghost in our machine, did not get a numerical result for pr file core data (value): {data} \n Zero value assigned.") + self.logginer.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") From 0cc6a5438591f9bbc28c64e1007fa3b527fdb708 Mon Sep 17 00:00:00 2001 From: "Sean P. 
Goggins" Date: Fri, 10 May 2024 17:46:43 -0500 Subject: [PATCH 03/23] collection rate for secondary bump --- augur/application/cli/backend.py | 2 +- augur/application/cli/collection.py | 2 +- augur/application/cli/tasks.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index a0480adab4..d7a8ad745d 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -172,7 +172,7 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.25, 45) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index 63c433a79e..7d65cad978 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -132,7 +132,7 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.25, 45) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) diff --git a/augur/application/cli/tasks.py b/augur/application/cli/tasks.py index b4bec994eb..7b74d208e0 100644 --- a/augur/application/cli/tasks.py +++ b/augur/application/cli/tasks.py @@ -37,7 +37,7 @@ def start(): scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=25 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=90 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) core_worker_process = subprocess.Popen(core_worker.split(" ")) From 4a18742eda9fe953fd8d93c94a483e424c4b494d Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 18:11:54 -0500 Subject: [PATCH 04/23] typo fix --- augur/tasks/github/util/gh_graphql_entities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 5e55df9b16..c5714ce60f 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -344,7 +344,7 @@ def __iter__(self): if coreData['totalCount']: self.logger.info(f"... 
core data obtained") else: - self.logginer.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") + self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") From b230d73e4052433ce51e5c8050bc6e0534e86c6d Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 18:34:53 -0500 Subject: [PATCH 05/23] `is not None` instead. --- augur/tasks/github/util/gh_graphql_entities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index c5714ce60f..0e8939cf6c 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,7 +341,7 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - if coreData['totalCount']: + if coreData['totalCount'] is not None: self.logger.info(f"... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") From f296a07ad0fd83258fc5b183dc3bba97a9502f71 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:07:08 -0500 Subject: [PATCH 06/23] updated is not None logic: ```python if coreData is not None: self.logger.info(f"... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) self.logger.info(f"Graphql paramters: {params}") return ``` --- augur/tasks/github/util/gh_graphql_entities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 0e8939cf6c..ca5dec2f6e 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,7 +341,7 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - if coreData['totalCount'] is not None: + if coreData is not None: self.logger.info(f"... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") From 8663655ea1355dcaf24ed8586697078d633c9fbf Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:12:00 -0500 Subject: [PATCH 07/23] better update: ```python try: if coreData is not None: if coreData.get('totalCount') is not None: self.logger.info("... 
core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 else: self.logger.error("Core data is None, cannot proceed with operations on it.") except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) ``` --- .../tasks/github/util/gh_graphql_entities.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index ca5dec2f6e..503f440a3f 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,18 +341,18 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - if coreData is not None: - self.logger.info(f"... core data obtained") - else: - self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") - coreData['totalCount'] = 0 + try: + if coreData is not None: + if coreData.get('totalCount') is not None: + self.logger.info("... core data obtained") + else: + self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") + coreData['totalCount'] = 0 + else: + self.logger.error("Core data is None, cannot proceed with operations on it.") except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") - self.logger.error( - ''.join(traceback.format_exception(None, e, e.__traceback__))) - - self.logger.info(f"Graphql paramters: {params}") - return + self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) if int(coreData['totalCount']) == 0: From ca6cb7958916a095404d2d360193e237acbfaaec Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:12:48 -0500 Subject: [PATCH 08/23] woops. one error. --- augur/tasks/github/util/gh_graphql_entities.py | 1 - 1 file changed, 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 503f440a3f..a3bf02ff14 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -341,7 +341,6 @@ def __iter__(self): coreData = self.extract_paginate_result(data) #Check to make sure we have data - try: if coreData is not None: if coreData.get('totalCount') is not None: self.logger.info("... core data obtained") From 0de7fdd2ce96a2d5e0854ab37eb2d8d694267b75 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:15:12 -0500 Subject: [PATCH 09/23] one more. 
--- augur/tasks/github/util/gh_graphql_entities.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index a3bf02ff14..1ad7ba5ee1 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -348,7 +348,9 @@ def __iter__(self): self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 else: - self.logger.error("Core data is None, cannot proceed with operations on it.") + self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") + coreData['totalCount'] = 0 + except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) From 85bd70f83e778c49e9881dc7312c0dc5c26b46db Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:16:43 -0500 Subject: [PATCH 10/23] another tweak. --- augur/tasks/github/util/gh_graphql_entities.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 1ad7ba5ee1..9a4b041f4d 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -346,10 +346,12 @@ def __iter__(self): self.logger.info("... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") - coreData['totalCount'] = 0 + yield None + return else: self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") - coreData['totalCount'] = 0 + yield None + return except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") From e9b83c6550b3add3c2248f3ea501c02028f03001 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 21:17:57 -0500 Subject: [PATCH 11/23] and another --- augur/tasks/github/util/gh_graphql_entities.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 9a4b041f4d..c3ace4ee0c 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -346,8 +346,7 @@ def __iter__(self): self.logger.info("... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") - yield None - return + coreData['totalCount'] = 0 else: self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") yield None From b244c608da025421a98d7be06f902fb092ae6791 Mon Sep 17 00:00:00 2001 From: "Sean P. 
Goggins" Date: Fri, 10 May 2024 22:14:26 -0500 Subject: [PATCH 12/23] another tweak, as the error just moved up: ```python coreData = self.extract_paginate_result(data) ``` ```python Traceback (most recent call last): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 451, in trace_task R = retval = fun(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ return self.run(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/tasks.py", line 18, in process_pull_request_files pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/core.py", line 68, in pull_request_files_model pr_file_rows += [{ ^^ File "/home/ubuntu/github/augur/augur/tasks/github/pull_requests/files_model/core.py", line 68, in pr_file_rows += [{ ^^ File "/home/ubuntu/github/augur/augur/tasks/github/util/gh_graphql_entities.py", line 341, in __iter__ coreData = self.extract_paginate_result(data) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/github/util/gh_graphql_entities.py", line 253, in extract_paginate_result raise TimeoutError("No data received from endpoint.") TimeoutError: No data received from endpoint. ``` --- augur/tasks/github/util/gh_graphql_entities.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index c3ace4ee0c..574adbbaf0 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -338,9 +338,8 @@ def __iter__(self): #self.logger.info(f"{params}") data = self.request_graphql_dict(variables=params) try: - coreData = self.extract_paginate_result(data) - #Check to make sure we have data + coreData = self.extract_paginate_result(data) if coreData is not None: if coreData.get('totalCount') is not None: self.logger.info("... core data obtained") @@ -351,7 +350,6 @@ def __iter__(self): self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") yield None return - except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) From 478048d4f55bb8fc959d8ef036210fdae96b6c3d Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Sat, 11 May 2024 10:31:01 -0500 Subject: [PATCH 13/23] Issue with extremely large repository pull_requests fixed by adding incremental inserts. Error Details: - OS Level OOM Error Log - Augur error log - Database state showing that the killed collection process leaves all subsequent core tasks hanging after the OOM. 
```bash May 11 08:40:26 ip-172-31-43-26 kernel: [196984.540841] [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name May 11 08:40:26 ip-172-31-43-26 kernel: [196984.543107] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=user.slice,mems_allowed=0,global_oom,task_memcg=/user.slice/user-1000.slice/session-330.scope,task=celery,pid=2787413,uid=1000 May 11 08:40:26 ip-172-31-43-26 kernel: [196984.543138] Out of memory: Killed process 2787413 (celery) total-vm:14657984kB, anon-rss:8728064kB, file-rss:5632kB, shmem-rss:0kB, UID:1000 pgtables:17844kB oom_score_adj:0 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.229829] Softwar~cThread invoked oom-killer: gfp_mask=0x140cca(GFP_HIGHUSER_MOVABLE|__GFP_COMP), order=0, oom_score_adj=0 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.229874] oom_kill_process+0x10c/0x1b0 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.229884] __alloc_pages_may_oom+0x114/0x1e0 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.230235] [ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name May 11 11:33:17 ip-172-31-43-26 kernel: [207355.232557] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=user.slice,mems_allowed=0,global_oom,task_memcg=/user.slice/user-1000.slice/session-330.scope,task=celery,pid=2787363,uid=1000 May 11 11:33:17 ip-172-31-43-26 kernel: [207355.232580] Out of memory: Killed process 2787363 (celery) total-vm:14415148kB, anon-rss:8500952kB, file-rss:9216kB, shmem-rss:0kB, UID:1000 pgtables:17364kB oom_score_adj:0 augur.tasks.github.pull_requests.tasks.collect_pull_requests cf2337d6-6adb-4b0b-a47d-cfd74c01d86a Name augur.tasks.github.pull_requests.tasks.collect_pull_requests UUID cf2337d6-6adb-4b0b-a47d-cfd74c01d86a State FAILURE args ('https://github.com/kubernetes/kubernetes',) kwargs {} Result None Received 2024-05-11 02:19:36.542103 UTC Started 2024-05-11 02:19:36.543418 UTC Failed 2024-05-11 05:03:50.075814 UTC Retries 0 Worker core:c1d207996f614c648319b1c800672fce@ip-172-31-43-26 Exception WorkerLostError('Worker exited prematurely: signal 9 (SIGKILL) Job: 35.') Timestamp 2024-05-11 05:03:50.075814 UTC Traceback Traceback (most recent call last): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/billiard/pool.py", line 1265, in mark_as_worker_lost raise WorkerLostError( billiard.exceptions.WorkerLostError: Worker exited prematurely: signal 9 (SIGKILL) Job: 35. 
Clock 56697 Root Root id f93b7451-cfb2-42fc-a7b8-4ea8fe59d647 Parent Parent id 41327ca8-c735-44c5-a536-38acd8968e42 Children Augur thinks we are still collecting, so it will never get to messages: repo_id core_data_last_collected core_status core_task_id secondary_data_last_collected secondary_status secondary_task_id event_last_collected facade_status facade_data_last_collected facade_task_id core_weight facade_weight secondary_weight issue_pr_sum commit_sum ml_status ml_data_last_collected ml_task_id ml_weight 123948 2024-02-09 04:43:29 Collecting 9bef29e6-9519-4acb-8469-24c6857c8d92 2022-12-20 00:00:00 Success Success 2024-04-10 06:07:05 -1249186522 121948 -52204734437 203819 121948 Pending -12149126357``` --- augur/tasks/github/pull_requests/tasks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 69e40f6818..04c7ef7a0d 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -68,6 +68,15 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") all_data += page_data + + if len(all_data) >= 200: + with GithubTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + repo_id = augur_db.session.query(Repo).filter( + Repo.repo_git == repo_git).one().repo_id + owner, repo = get_owner_repo(repo_git) + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + all_data.clear() return all_data From e1f99501f62a55d9f6b2c7d73218f4baefc8a050 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Sat, 11 May 2024 11:14:05 -0500 Subject: [PATCH 14/23] Added error logging to dependency task for this error, which I think arises from not having any files in our annointed programming languages, which are all the most common ones. 
```bash Traceback (most recent call last): File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 451, in trace_task R = retval = fun(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/virtualenvs/hosted/lib/python3.11/site-packages/celery/app/trace.py", line 734, in __protected_call__ return self.run(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/git/dependency_tasks/tasks.py", line 47, in process_ossf_dependency_metrics generate_scorecard(session, repo.repo_id, repo_git) File "/home/ubuntu/github/augur/augur/tasks/git/dependency_tasks/core.py", line 75, in generate_scorecard required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ubuntu/github/augur/augur/tasks/util/worker_util.py", line 141, in parse_json_from_subprocess_call raise e File "/home/ubuntu/github/augur/augur/tasks/util/worker_util.py", line 138, in parse_json_from_subprocess_call required_output = json.loads(output) ^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/json/__init__.py", line 346, in loads return _default_decoder.decode(s) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/json/decoder.py", line 337, in decode obj, end = self.raw_decode(s, idx=_w(s, 0).end()) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/json/decoder.py", line 355, in raw_decode raise JSONDecodeError("Expecting value", s, err.value) from None json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) ``` --- augur/tasks/git/dependency_tasks/core.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 296e69075e..e4c6273479 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -72,7 +72,11 @@ def generate_scorecard(session,repo_id,path): key_handler = GithubApiKeyHandler(session, session.logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() - required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + try: + required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + except Exception as e: + session.logger.error(f"Could not parse required output! Error: {e}") + raise e session.logger.info('adding to database...') session.logger.debug(f"output: {required_output}") From f50d1240314e854da791f7f8b7fe70f649f10e5e Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Mon, 13 May 2024 17:48:39 -0500 Subject: [PATCH 15/23] changing frequency Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/pull_requests/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 04c7ef7a0d..36a7aab1b3 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -69,7 +69,7 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: all_data += page_data - if len(all_data) >= 200: + if len(all_data) >= 2000: with GithubTaskManifest(logger) as manifest: augur_db = manifest.augur_db repo_id = augur_db.session.query(Repo).filter( From f55ed92bd91d4c4d999219f071793c49cc997d11 Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Mon, 13 May 2024 18:00:48 -0500 Subject: [PATCH 16/23] Make retrieve_all_pr_data a generator --- augur/tasks/github/pull_requests/tasks.py | 36 ++++++++++------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 36a7aab1b3..d2bbe55c90 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -12,6 +12,8 @@ from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors +from typing import Generator, List, Dict + platform_id = 1 @@ -29,12 +31,18 @@ def collect_pull_requests(repo_git: str) -> int: Repo.repo_git == repo_git).one().repo_id owner, repo = get_owner_repo(repo_git) - pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth) - if pr_data: - process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count = 0 + all_data = [] + for page in retrieve_all_pr_data(repo_git, logger, manifest.key_auth): + all_data += page + + if len(all_data) >= 1000: + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count += len(all_data) + all_data.clear() - return len(pr_data) + return total_count else: logger.info(f"{owner}/{repo} has no pull requests") return 0 @@ -42,7 +50,7 @@ def collect_pull_requests(repo_git: str) -> int: # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table -def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: +def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> Generator[List[Dict]]: owner, repo = get_owner_repo(repo_git) @@ -52,33 +60,21 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: # returns an iterable of all prs at this url (this essentially means you can treat the prs variable as a list of the prs) prs = GithubPaginator(url, key_auth, logger) - all_data = [] num_pages = prs.get_num_pages() for page_data, page in prs.iter_pages(): if page_data is None: - return all_data + return if len(page_data) == 0: logger.debug( f"{owner}/{repo} Prs Page {page} contains no data...returning") logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - return all_data + return logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - - all_data += page_data - if len(all_data) >= 2000: - with GithubTaskManifest(logger) as manifest: - augur_db = manifest.augur_db - repo_id = augur_db.session.query(Repo).filter( - Repo.repo_git == repo_git).one().repo_id - owner, repo = get_owner_repo(repo_git) - process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) - all_data.clear() - - return all_data + yield page_data def 
process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): From d4e276202600432b04870ce3020b5d6ed83c5a47 Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Mon, 13 May 2024 18:10:27 -0500 Subject: [PATCH 17/23] Add un-pushed changes --- augur/tasks/github/pull_requests/tasks.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index d2bbe55c90..f739105f49 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -42,11 +42,17 @@ def collect_pull_requests(repo_git: str) -> int: total_count += len(all_data) all_data.clear() + if len(all_data): + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count += len(all_data) + + if total_count > 0: return total_count else: logger.info(f"{owner}/{repo} has no pull requests") return 0 + # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table From 6e98367c33b72671a638c022bcbaacfa4e1b0be7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 13 May 2024 19:52:35 -0500 Subject: [PATCH 18/23] Simplify collection Signed-off-by: Andrew Brain --- augur/tasks/util/collection_util.py | 162 +++++++++------------------- 1 file changed, 50 insertions(+), 112 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 3561b19b40..b8e4cda2fd 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -132,58 +132,73 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab def get_active_repo_count(self,session): return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - #Get repo urls based on passed in info. + def get_valid_repos(self,session): - #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook - #Get the count of repos that are currently running this collection hook - #status_column = f"{hook}_status" + active_repo_count = self.get_active_repo_count(session) + limit = self.max_repo-active_repo_count - #Will always disallow errored repos and repos that are already collecting + if limit <= 0: + return - #The maximum amount of repos to schedule is affected by the existing repos running tasks - limit = self.max_repo-active_repo_count + collection_list = get_newly_added_repos(session, limit, hook=self.name) + self.repo_list.extend(collection_list) + limit -= len(collection_list) - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_random_users_list(session,f"{self.name}_status",self.new_status) + #Now start recollecting other repos if there is space to do so. 
+ if limit <= 0: + return - session.logger.info(f"User_list: {split_user_list}") + collection_list = get_repos_for_recollection(session, limit, hook=self.name, days_until_collect_again=self.days_until_collect_again) - #Iterate through each fourth of the users fetched - for quarter_list in split_user_list: - if limit <= 0: - return + self.repo_list.extend(collection_list) - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) - self.repo_list.extend(collection_list) - #Update limit with amount of repos started - limit -= len(collection_list) +def get_newly_added_repos(session, limit, hook): - #Now start old repos if there is space to do so. - if limit <= 0: - return + if hook in ["core", "secondary", "ml"]: + condition_string = f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" + + elif hook == "facade": + condition_string = f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + + + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, augur_data.repo y + where x.repo_id=y.repo_id + and {condition_string} + order by repo_added + limit :limit_num + """).bindparams(limit_num=limit) + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] - user_list = get_list_of_all_users(session) - random.shuffle(user_list) + return valid_repo_git_list - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) +def get_repos_for_recollection(session, limit, hook, days_until_collect_again): - for quarter_list in split_user_list: + if hook in ["core", "secondary", "ml"]: + condition_string = f"""{hook}_status='{str(CollectionState.SUCCESS.value)}'""" - #Break out if limit has been reached - if limit <= 0: - return + elif hook == "facade": + condition_string = f"""facade_status='{str(CollectionState.SUCCESS.value)}'""" - #only start repos older than the specified amount of days - #Query a set of valid repositories sorted by weight, also making sure that the repos aren't new or errored - #Order by the relevant weight for the collection hook - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),allow_old_repos=True,hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, repo y + where x.repo_id = y.repo_id + and {condition_string} + and {hook}_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + order by {hook}_data_last_collected + limit :limit_num + """).bindparams(limit_num=limit) - self.repo_list.extend(collection_list) - limit -= len(collection_list) + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] + + return valid_repo_git_list def get_enabled_phase_names_from_config(): @@ -610,80 +625,3 @@ def send_messages(self): #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id, col_hook.name - -#def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): -# -# logger.info(f"Starting collection on {len(repo_git_identifiers)} {repos_type} {hook} repos") -# if len(repo_git_identifiers) == 0: -# return 0 -# -# logger.info(f"Collection starting for {hook}: 
{tuple(repo_git_identifiers)}") -# -# routine = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=phases,collection_hook=hook) -# -# routine.start_data_collection() -# -# return len(repo_git_identifiers) - -def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook="core",days_to_wait_until_next_collection = 1): - - condition_string = "1" - - if hook == "core": - condition_string = get_required_conditions_for_core_repos(allow_collected_before=allow_old_repos,days_until_collect_again= days_to_wait_until_next_collection) - elif hook == "secondary": - condition_string = get_required_conditions_for_secondary_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "facade": - condition_string = get_required_conditions_for_facade_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "ml": - condition_string = get_required_conditions_for_ml_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - - #Query a set of valid repositories sorted by weight, also making sure that the repos are new - #Order by the relevant weight for the collection hook - repo_query = s.sql.text(f""" - SELECT DISTINCT repo.repo_id, repo.repo_git, collection_status.{hook}_weight - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE user_id IN :list_of_user_ids AND {condition_string} - ORDER BY augur_operations.collection_status.{hook}_weight - LIMIT :limit_num - """).bindparams(list_of_user_ids=users,limit_num=limit) - - #Get a list of valid repo ids, limit set to 2 times the usual - valid_repos = session.execute_sql(repo_query).fetchall() - valid_repo_git_list = [repo[1] for repo in valid_repos] - - session.logger.info(f"valid repo git list: {tuple(valid_repo_git_list)}") - - #start repos for new primary collection hook - #collection_size = start_block_of_repos( - # session.logger, session, - # valid_repo_git_list, - # phases, repos_type=repos_type, hook=hook - #) - - return valid_repo_git_list - -def split_random_users_list(session,status_col, status_new): - #Split all users that have new repos into four lists and randomize order - query = s.sql.text(f""" - SELECT - user_id - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE {status_col}='{str(status_new)}' - GROUP BY user_id - """) - - user_list = session.execute_sql(query).fetchall() - random.shuffle(user_list) - - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) - - return split_user_list - From 8f596cc614615225ab4d20251a0c1ece3bcc3311 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 13 May 2024 20:00:12 -0500 Subject: [PATCH 19/23] Fix annotation Signed-off-by: Andrew Brain --- 
augur/tasks/github/pull_requests/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index f739105f49..73ea1b025a 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -56,7 +56,7 @@ def collect_pull_requests(repo_git: str) -> int: # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table -def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> Generator[List[Dict]]: +def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Dict]]: owner, repo = get_owner_repo(repo_git) From f6d5680ddb42166f5dad8b810126bea6e9630df6 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 14 May 2024 11:40:26 -0500 Subject: [PATCH 20/23] small dial back of concurrency which didn't make sense. --- augur/application/cli/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/cli/tasks.py b/augur/application/cli/tasks.py index 7b74d208e0..c64dce5b88 100644 --- a/augur/application/cli/tasks.py +++ b/augur/application/cli/tasks.py @@ -37,7 +37,7 @@ def start(): scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h" - secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=90 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n secondary:{uuid.uuid4().hex}@%h -Q secondary" scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" ")) core_worker_process = subprocess.Popen(core_worker.split(" ")) From 03fc1fe6df59ae94495cc7c65521a5066bf25d6e Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 15 May 2024 11:52:20 -0500 Subject: [PATCH 21/23] updated versions Signed-off-by: Sean P. Goggins --- README.md | 4 ++-- docker/backend/Dockerfile | 2 +- docker/database/Dockerfile | 2 +- docker/rabbitmq/Dockerfile | 2 +- metadata.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3f83946f5d..02ec125fb6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.63.3 +# Augur NEW Release v0.70.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.63.3 +Augur is now releasing a dramatically improved new version to the main branch. 
It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 1ec7871c9b..6e158d199b 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.10-bookworm LABEL maintainer="outdoors@acm.org" -LABEL version="0.63.3" +LABEL version="0.70.0" ENV DEBIAN_FRONTEND=noninteractive diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index effe34d3b2..1421e1f76c 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -2,7 +2,7 @@ FROM postgres:14 LABEL maintainer="outdoors@acm.org" -LABEL version="0.63.3" +LABEL version="0.70.0" ENV POSTGRES_DB "test" ENV POSTGRES_USER "augur" diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index 80542c788b..079c73dc99 100644 --- a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -1,7 +1,7 @@ FROM rabbitmq:3.12-management-alpine LABEL maintainer="574/augur@simplelogin.com" -LABEL version="0.63.3" +LABEL version="0.70.0" ARG RABBIT_MQ_DEFAULT_USER ARG RABBIT_MQ_DEFAULT_PASSWORD diff --git a/metadata.py b/metadata.py index 841521b515..b914869d58 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.63.3" -__release__ = "v0.63.3 (Supply Chain Gang)" +__version__ = "0.70.0" +__release__ = "v0.70.0 (Windows 95 Man!)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024" From 2241f3383d58688028fd2478bc6ac6d501a192a7 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Thu, 16 May 2024 21:54:02 -0500 Subject: [PATCH 22/23] Fix issue where secondary tries to collect before core is collected Signed-off-by: Andrew Brain --- augur/tasks/util/collection_util.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index b8e4cda2fd..9776258626 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -156,12 +156,15 @@ def get_valid_repos(self,session): def get_newly_added_repos(session, limit, hook): + condition_string = "" if hook in ["core", "secondary", "ml"]: - condition_string = f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" + condition_string += f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" elif hook == "facade": - condition_string = f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + condition_string += f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + if hook == "secondary": + condition_string += f""" and core_status='{str(CollectionState.SUCCESS.value)}'""" repo_query = s.sql.text(f""" select repo_git From 16f6d6b3e4d96514542a39b4be3f70b55f20d4a7 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 21 May 2024 13:33:21 +0300 Subject: [PATCH 23/23] commented out the rebuilding of the dm_ tables. This should be rebuilt using a materialized view. Signed-off-by: Sean P. 
Goggins --- .../facade_worker/rebuildcache.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index 5668739767..e4697dbc19 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -396,7 +396,8 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_weekly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_weekly) + +# session.execute_sql(clear_dm_repo_group_weekly) clear_dm_repo_group_monthly = s.sql.text(""" DELETE @@ -410,7 +411,8 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_monthly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_monthly) + +# session.execute_sql(clear_dm_repo_group_monthly) clear_dm_repo_group_annual = s.sql.text(""" DELETE @@ -424,7 +426,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_annual c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_annual) +# session.execute_sql(clear_dm_repo_group_annual) clear_dm_repo_weekly = s.sql.text(""" DELETE @@ -441,7 +443,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_weekly) +# session.execute_sql(clear_dm_repo_weekly) clear_dm_repo_monthly = s.sql.text(""" DELETE @@ -458,7 +460,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_monthly) +# session.execute_sql(clear_dm_repo_monthly) clear_dm_repo_annual = s.sql.text(""" DELETE @@ -475,7 +477,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_annual) +# session.execute_sql(clear_dm_repo_annual) clear_unknown_cache = s.sql.text(""" DELETE @@ -573,7 +575,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "r.repo_group_id, info.a, info.b, info.c") ).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_projects_by_week) +# session.execute_sql(cache_projects_by_week) cache_projects_by_month = s.sql.text( ("INSERT INTO dm_repo_group_monthly (repo_group_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -609,7 +611,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "r.repo_group_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_projects_by_month) +# session.execute_sql(cache_projects_by_month) cache_projects_by_year = s.sql.text(( "INSERT INTO dm_repo_group_annual (repo_group_id, email, affiliation, year, added, 
removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -649,7 +651,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): - session.execute_sql(cache_projects_by_year) + # session.execute_sql(cache_projects_by_year) # Start caching by repo session.log_activity('Verbose','Caching repos') @@ -689,7 +691,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_week) +# session.execute_sql(cache_repos_by_week) cache_repos_by_month = s.sql.text(( "INSERT INTO dm_repo_monthly (repo_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -725,7 +727,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_month) +# session.execute_sql(cache_repos_by_month) cache_repos_by_year = s.sql.text(( "INSERT INTO dm_repo_annual (repo_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -759,7 +761,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_year) +# session.execute_sql(cache_repos_by_year) # Reset cache flags
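
Taken together, patches 13 and 15-17 replace the collect-everything-then-insert approach with a streaming one: `retrieve_all_pr_data` becomes a generator that yields one page of pull requests at a time, and `collect_pull_requests` flushes the accumulated pages to the database once a batch threshold is reached, so memory stays bounded even for repositories the size of kubernetes/kubernetes. A minimal, self-contained sketch of that pattern follows; `fetch_pages` and `save_batch` are hypothetical stand-ins for `GithubPaginator` and `process_pull_requests`, the unused `repo_git` parameter is kept only to mirror the real signature, and the fake page data exists solely to make the example runnable.

```python
from typing import Dict, Generator, List

# Hypothetical stand-in data so the sketch runs on its own:
# 25 "pages" of 100 pull-request records each.
FAKE_PAGES = [[{"number": i} for i in range(n, n + 100)] for n in range(0, 2500, 100)]

def fetch_pages(repo_git: str) -> Generator[List[Dict], None, None]:
    """Yield one page of PR records at a time instead of returning one big list."""
    for page_data in FAKE_PAGES:
        if not page_data:
            return
        yield page_data

def save_batch(batch: List[Dict]) -> None:
    """Pretend to bulk-insert a batch (process_pull_requests in the real code)."""
    print(f"inserting {len(batch)} PRs")

def collect_pull_requests(repo_git: str, batch_size: int = 1000) -> int:
    total = 0
    batch: List[Dict] = []
    for page_data in fetch_pages(repo_git):
        batch.extend(page_data)
        # Flush once the batch is large enough, so the worker never holds the
        # full PR history of a huge repository in memory at once (the OOM
        # described in patch 13).
        if len(batch) >= batch_size:
            save_batch(batch)
            total += len(batch)
            batch.clear()
    # Flush whatever remains after the last page (the leftover handling added
    # in patch 17).
    if batch:
        save_batch(batch)
        total += len(batch)
    return total

print(collect_pull_requests("https://github.com/example/repo"))
```

The same bounded-memory idea is what the 2000-record threshold in patch 15 approximated inside the old list-building function; moving the flush into the caller and yielding pages keeps the fetching code free of database concerns.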